Total coverage: 378917 (19%)of 2040666
69 69 69 69 69 27 28 7 22 19 19 19 18 2 19 19 19 19 19 19 15 15 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 // SPDX-License-Identifier: GPL-2.0 /* * drivers/usb/core/file.c * * (C) Copyright Linus Torvalds 1999 * (C) Copyright Johannes Erdfelt 1999-2001 * (C) Copyright Andreas Gal 1999 * (C) Copyright Gregory P. Smith 1999 * (C) Copyright Deti Fliegl 1999 (new USB architecture) * (C) Copyright Randy Dunlap 2000 * (C) Copyright David Brownell 2000-2001 (kernel hotplug, usb_device_id, * more docs, etc) * (C) Copyright Yggdrasil Computing, Inc. 2000 * (usb_device_id matching changes by Adam J. Richter) * (C) Copyright Greg Kroah-Hartman 2002-2003 * * Released under the GPLv2 only. */ #include <linux/module.h> #include <linux/errno.h> #include <linux/rwsem.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/usb.h> #include "usb.h" #define MAX_USB_MINORS 256 static const struct file_operations *usb_minors[MAX_USB_MINORS]; static DECLARE_RWSEM(minor_rwsem); static int usb_open(struct inode *inode, struct file *file) { int err = -ENODEV; const struct file_operations *new_fops; down_read(&minor_rwsem); new_fops = fops_get(usb_minors[iminor(inode)]); if (!new_fops) goto done; replace_fops(file, new_fops); /* Curiouser and curiouser... NULL ->open() as "no device" ? */ if (file->f_op->open) err = file->f_op->open(inode, file); done: up_read(&minor_rwsem); return err; } static const struct file_operations usb_fops = { .owner = THIS_MODULE, .open = usb_open, .llseek = noop_llseek, }; static char *usb_devnode(const struct device *dev, umode_t *mode) { struct usb_class_driver *drv; drv = dev_get_drvdata(dev); if (!drv || !drv->devnode) return NULL; return drv->devnode(dev, mode); } const struct class usbmisc_class = { .name = "usbmisc", .devnode = usb_devnode, }; int usb_major_init(void) { int error; error = register_chrdev(USB_MAJOR, "usb", &usb_fops); if (error) printk(KERN_ERR "Unable to get major %d for usb devices\n", USB_MAJOR); return error; } void usb_major_cleanup(void) { unregister_chrdev(USB_MAJOR, "usb"); } /** * usb_register_dev - register a USB device, and ask for a minor number * @intf: pointer to the usb_interface that is being registered * @class_driver: pointer to the usb_class_driver for this device * * This should be called by all USB drivers that use the USB major number. * If CONFIG_USB_DYNAMIC_MINORS is enabled, the minor number will be * dynamically allocated out of the list of available ones. If it is not * enabled, the minor number will be based on the next available free minor, * starting at the class_driver->minor_base. * * This function also creates a usb class device in the sysfs tree. * * usb_deregister_dev() must be called when the driver is done with * the minor numbers given out by this function. * * Return: -EINVAL if something bad happens with trying to register a * device, and 0 on success. */ int usb_register_dev(struct usb_interface *intf, struct usb_class_driver *class_driver) { int retval = 0; int minor_base = class_driver->minor_base; int minor; char name[20]; #ifdef CONFIG_USB_DYNAMIC_MINORS /* * We don't care what the device tries to start at, we want to start * at zero to pack the devices into the smallest available space with * no holes in the minor range. */ minor_base = 0; #endif if (class_driver->fops == NULL) return -EINVAL; if (intf->minor >= 0) return -EADDRINUSE; dev_dbg(&intf->dev, "looking for a minor, starting at %d\n", minor_base); down_write(&minor_rwsem); for (minor = minor_base; minor < MAX_USB_MINORS; ++minor) { if (usb_minors[minor]) continue; usb_minors[minor] = class_driver->fops; intf->minor = minor; break; } if (intf->minor < 0) { up_write(&minor_rwsem); return -EXFULL; } /* create a usb class device for this usb interface */ snprintf(name, sizeof(name), class_driver->name, minor - minor_base); intf->usb_dev = device_create(&usbmisc_class, &intf->dev, MKDEV(USB_MAJOR, minor), class_driver, "%s", kbasename(name)); if (IS_ERR(intf->usb_dev)) { usb_minors[minor] = NULL; intf->minor = -1; retval = PTR_ERR(intf->usb_dev); } up_write(&minor_rwsem); return retval; } EXPORT_SYMBOL_GPL(usb_register_dev); /** * usb_deregister_dev - deregister a USB device's dynamic minor. * @intf: pointer to the usb_interface that is being deregistered * @class_driver: pointer to the usb_class_driver for this device * * Used in conjunction with usb_register_dev(). This function is called * when the USB driver is finished with the minor numbers gotten from a * call to usb_register_dev() (usually when the device is disconnected * from the system.) * * This function also removes the usb class device from the sysfs tree. * * This should be called by all drivers that use the USB major number. */ void usb_deregister_dev(struct usb_interface *intf, struct usb_class_driver *class_driver) { if (intf->minor == -1) return; dev_dbg(&intf->dev, "removing %d minor\n", intf->minor); device_destroy(&usbmisc_class, MKDEV(USB_MAJOR, intf->minor)); down_write(&minor_rwsem); usb_minors[intf->minor] = NULL; up_write(&minor_rwsem); intf->usb_dev = NULL; intf->minor = -1; } EXPORT_SYMBOL_GPL(usb_deregister_dev);
3 49 48 42 49 23 22 22 1 1 37 4 3 4 4 3 4 6 5 6 3 2 3 4 2 2 2 2 1 1 2 2 4 3 2 2 2 2 1 1 2 2 3 5 4 3 4 3 4 1 5 3 2 2 2 1 8 5 5 8 7 8 5 5 5 7 15 12 9 4 4 8 2 2 4 4 6 5 3 5 5 5 5 6 13 69 45 45 64 5 42 40 2 2 4 33 33 28 4 4 8 8 30 31 45 45 11 45 10 44 44 42 42 42 39 33 2 2 2 31 33 33 6 33 39 30 42 32 7 7 42 20 20 39 15 34 10 70 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 // SPDX-License-Identifier: GPL-2.0-or-later /* * Support for AES-NI and VAES instructions. This file contains glue code. * The real AES implementations are in aesni-intel_asm.S and other .S files. * * Copyright (C) 2008, Intel Corp. * Author: Huang Ying <ying.huang@intel.com> * * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD * interface for 64-bit kernels. * Authors: Adrian Hoban <adrian.hoban@intel.com> * Gabriele Paoloni <gabriele.paoloni@intel.com> * Tadeusz Struk (tadeusz.struk@intel.com) * Aidan O'Mahony (aidan.o.mahony@intel.com) * Copyright (c) 2010, Intel Corporation. * * Copyright 2024 Google LLC */ #include <linux/hardirq.h> #include <linux/types.h> #include <linux/module.h> #include <linux/err.h> #include <crypto/algapi.h> #include <crypto/aes.h> #include <crypto/b128ops.h> #include <crypto/gcm.h> #include <crypto/xts.h> #include <asm/cpu_device_id.h> #include <asm/simd.h> #include <crypto/scatterwalk.h> #include <crypto/internal/aead.h> #include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <linux/jump_label.h> #include <linux/workqueue.h> #include <linux/spinlock.h> #include <linux/static_call.h> #define AESNI_ALIGN 16 #define AESNI_ALIGN_ATTR __attribute__ ((__aligned__(AESNI_ALIGN))) #define AES_BLOCK_MASK (~(AES_BLOCK_SIZE - 1)) #define AESNI_ALIGN_EXTRA ((AESNI_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1)) #define CRYPTO_AES_CTX_SIZE (sizeof(struct crypto_aes_ctx) + AESNI_ALIGN_EXTRA) #define XTS_AES_CTX_SIZE (sizeof(struct aesni_xts_ctx) + AESNI_ALIGN_EXTRA) struct aesni_xts_ctx { struct crypto_aes_ctx tweak_ctx AESNI_ALIGN_ATTR; struct crypto_aes_ctx crypt_ctx AESNI_ALIGN_ATTR; }; static inline void *aes_align_addr(void *addr) { if (crypto_tfm_ctx_alignment() >= AESNI_ALIGN) return addr; return PTR_ALIGN(addr, AESNI_ALIGN); } asmlinkage void aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, unsigned int key_len); asmlinkage void aesni_enc(const void *ctx, u8 *out, const u8 *in); asmlinkage void aesni_dec(const void *ctx, u8 *out, const u8 *in); asmlinkage void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len); asmlinkage void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len); asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); asmlinkage void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); asmlinkage void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); asmlinkage void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); asmlinkage void aesni_xts_dec(const struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); #ifdef CONFIG_X86_64 asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); #endif static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) { return aes_align_addr(raw_ctx); } static inline struct aesni_xts_ctx *aes_xts_ctx(struct crypto_skcipher *tfm) { return aes_align_addr(crypto_skcipher_ctx(tfm)); } static int aes_set_key_common(struct crypto_aes_ctx *ctx, const u8 *in_key, unsigned int key_len) { int err; if (!crypto_simd_usable()) return aes_expandkey(ctx, in_key, key_len); err = aes_check_keylen(key_len); if (err) return err; kernel_fpu_begin(); aesni_set_key(ctx, in_key, key_len); kernel_fpu_end(); return 0; } static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, unsigned int key_len) { return aes_set_key_common(aes_ctx(crypto_tfm_ctx(tfm)), in_key, key_len); } static void aesni_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); if (!crypto_simd_usable()) { aes_encrypt(ctx, dst, src); } else { kernel_fpu_begin(); aesni_enc(ctx, dst, src); kernel_fpu_end(); } } static void aesni_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); if (!crypto_simd_usable()) { aes_decrypt(ctx, dst, src); } else { kernel_fpu_begin(); aesni_dec(ctx, dst, src); kernel_fpu_end(); } } static int aesni_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int len) { return aes_set_key_common(aes_ctx(crypto_skcipher_ctx(tfm)), key, len); } static int ecb_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); struct skcipher_walk walk; unsigned int nbytes; int err; err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes)) { kernel_fpu_begin(); aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK); kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = skcipher_walk_done(&walk, nbytes); } return err; } static int ecb_decrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); struct skcipher_walk walk; unsigned int nbytes; int err; err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes)) { kernel_fpu_begin(); aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK); kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = skcipher_walk_done(&walk, nbytes); } return err; } static int cbc_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); struct skcipher_walk walk; unsigned int nbytes; int err; err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes)) { kernel_fpu_begin(); aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK, walk.iv); kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = skcipher_walk_done(&walk, nbytes); } return err; } static int cbc_decrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); struct skcipher_walk walk; unsigned int nbytes; int err; err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes)) { kernel_fpu_begin(); aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK, walk.iv); kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = skcipher_walk_done(&walk, nbytes); } return err; } static int cts_cbc_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); int cbc_blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2; struct scatterlist *src = req->src, *dst = req->dst; struct scatterlist sg_src[2], sg_dst[2]; struct skcipher_request subreq; struct skcipher_walk walk; int err; skcipher_request_set_tfm(&subreq, tfm); skcipher_request_set_callback(&subreq, skcipher_request_flags(req), NULL, NULL); if (req->cryptlen <= AES_BLOCK_SIZE) { if (req->cryptlen < AES_BLOCK_SIZE) return -EINVAL; cbc_blocks = 1; } if (cbc_blocks > 0) { skcipher_request_set_crypt(&subreq, req->src, req->dst, cbc_blocks * AES_BLOCK_SIZE, req->iv); err = cbc_encrypt(&subreq); if (err) return err; if (req->cryptlen == AES_BLOCK_SIZE) return 0; dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen); if (req->dst != req->src) dst = scatterwalk_ffwd(sg_dst, req->dst, subreq.cryptlen); } /* handle ciphertext stealing */ skcipher_request_set_crypt(&subreq, src, dst, req->cryptlen - cbc_blocks * AES_BLOCK_SIZE, req->iv); err = skcipher_walk_virt(&walk, &subreq, false); if (err) return err; kernel_fpu_begin(); aesni_cts_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, walk.nbytes, walk.iv); kernel_fpu_end(); return skcipher_walk_done(&walk, 0); } static int cts_cbc_decrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); int cbc_blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2; struct scatterlist *src = req->src, *dst = req->dst; struct scatterlist sg_src[2], sg_dst[2]; struct skcipher_request subreq; struct skcipher_walk walk; int err; skcipher_request_set_tfm(&subreq, tfm); skcipher_request_set_callback(&subreq, skcipher_request_flags(req), NULL, NULL); if (req->cryptlen <= AES_BLOCK_SIZE) { if (req->cryptlen < AES_BLOCK_SIZE) return -EINVAL; cbc_blocks = 1; } if (cbc_blocks > 0) { skcipher_request_set_crypt(&subreq, req->src, req->dst, cbc_blocks * AES_BLOCK_SIZE, req->iv); err = cbc_decrypt(&subreq); if (err) return err; if (req->cryptlen == AES_BLOCK_SIZE) return 0; dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen); if (req->dst != req->src) dst = scatterwalk_ffwd(sg_dst, req->dst, subreq.cryptlen); } /* handle ciphertext stealing */ skcipher_request_set_crypt(&subreq, src, dst, req->cryptlen - cbc_blocks * AES_BLOCK_SIZE, req->iv); err = skcipher_walk_virt(&walk, &subreq, false); if (err) return err; kernel_fpu_begin(); aesni_cts_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr, walk.nbytes, walk.iv); kernel_fpu_end(); return skcipher_walk_done(&walk, 0); } #ifdef CONFIG_X86_64 /* This is the non-AVX version. */ static int ctr_crypt_aesni(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); u8 keystream[AES_BLOCK_SIZE]; struct skcipher_walk walk; unsigned int nbytes; int err; err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes) > 0) { kernel_fpu_begin(); if (nbytes & AES_BLOCK_MASK) aesni_ctr_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK, walk.iv); nbytes &= ~AES_BLOCK_MASK; if (walk.nbytes == walk.total && nbytes > 0) { aesni_enc(ctx, keystream, walk.iv); crypto_xor_cpy(walk.dst.virt.addr + walk.nbytes - nbytes, walk.src.virt.addr + walk.nbytes - nbytes, keystream, nbytes); crypto_inc(walk.iv, AES_BLOCK_SIZE); nbytes = 0; } kernel_fpu_end(); err = skcipher_walk_done(&walk, nbytes); } return err; } #endif static int xts_setkey_aesni(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm); int err; err = xts_verify_key(tfm, key, keylen); if (err) return err; keylen /= 2; /* first half of xts-key is for crypt */ err = aes_set_key_common(&ctx->crypt_ctx, key, keylen); if (err) return err; /* second half of xts-key is for tweak */ return aes_set_key_common(&ctx->tweak_ctx, key + keylen, keylen); } typedef void (*xts_encrypt_iv_func)(const struct crypto_aes_ctx *tweak_key, u8 iv[AES_BLOCK_SIZE]); typedef void (*xts_crypt_func)(const struct crypto_aes_ctx *key, const u8 *src, u8 *dst, int len, u8 tweak[AES_BLOCK_SIZE]); /* This handles cases where the source and/or destination span pages. */ static noinline int xts_crypt_slowpath(struct skcipher_request *req, xts_crypt_func crypt_func) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); const struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm); int tail = req->cryptlen % AES_BLOCK_SIZE; struct scatterlist sg_src[2], sg_dst[2]; struct skcipher_request subreq; struct skcipher_walk walk; struct scatterlist *src, *dst; int err; /* * If the message length isn't divisible by the AES block size, then * separate off the last full block and the partial block. This ensures * that they are processed in the same call to the assembly function, * which is required for ciphertext stealing. */ if (tail) { skcipher_request_set_tfm(&subreq, tfm); skcipher_request_set_callback(&subreq, skcipher_request_flags(req), NULL, NULL); skcipher_request_set_crypt(&subreq, req->src, req->dst, req->cryptlen - tail - AES_BLOCK_SIZE, req->iv); req = &subreq; } err = skcipher_walk_virt(&walk, req, false); while (walk.nbytes) { kernel_fpu_begin(); (*crypt_func)(&ctx->crypt_ctx, walk.src.virt.addr, walk.dst.virt.addr, walk.nbytes & ~(AES_BLOCK_SIZE - 1), req->iv); kernel_fpu_end(); err = skcipher_walk_done(&walk, walk.nbytes & (AES_BLOCK_SIZE - 1)); } if (err || !tail) return err; /* Do ciphertext stealing with the last full block and partial block. */ dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen); if (req->dst != req->src) dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen); skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail, req->iv); err = skcipher_walk_virt(&walk, req, false); if (err) return err; kernel_fpu_begin(); (*crypt_func)(&ctx->crypt_ctx, walk.src.virt.addr, walk.dst.virt.addr, walk.nbytes, req->iv); kernel_fpu_end(); return skcipher_walk_done(&walk, 0); } /* __always_inline to avoid indirect call in fastpath */ static __always_inline int xts_crypt(struct skcipher_request *req, xts_encrypt_iv_func encrypt_iv, xts_crypt_func crypt_func) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); const struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm); if (unlikely(req->cryptlen < AES_BLOCK_SIZE)) return -EINVAL; kernel_fpu_begin(); (*encrypt_iv)(&ctx->tweak_ctx, req->iv); /* * In practice, virtually all XTS plaintexts and ciphertexts are either * 512 or 4096 bytes and do not use multiple scatterlist elements. To * optimize the performance of these cases, the below fast-path handles * single-scatterlist-element messages as efficiently as possible. The * code is 64-bit specific, as it assumes no page mapping is needed. */ if (IS_ENABLED(CONFIG_X86_64) && likely(req->src->length >= req->cryptlen && req->dst->length >= req->cryptlen)) { (*crypt_func)(&ctx->crypt_ctx, sg_virt(req->src), sg_virt(req->dst), req->cryptlen, req->iv); kernel_fpu_end(); return 0; } kernel_fpu_end(); return xts_crypt_slowpath(req, crypt_func); } static void aesni_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key, u8 iv[AES_BLOCK_SIZE]) { aesni_enc(tweak_key, iv, iv); } static void aesni_xts_encrypt(const struct crypto_aes_ctx *key, const u8 *src, u8 *dst, int len, u8 tweak[AES_BLOCK_SIZE]) { aesni_xts_enc(key, dst, src, len, tweak); } static void aesni_xts_decrypt(const struct crypto_aes_ctx *key, const u8 *src, u8 *dst, int len, u8 tweak[AES_BLOCK_SIZE]) { aesni_xts_dec(key, dst, src, len, tweak); } static int xts_encrypt_aesni(struct skcipher_request *req) { return xts_crypt(req, aesni_xts_encrypt_iv, aesni_xts_encrypt); } static int xts_decrypt_aesni(struct skcipher_request *req) { return xts_crypt(req, aesni_xts_encrypt_iv, aesni_xts_decrypt); } static struct crypto_alg aesni_cipher_alg = { .cra_name = "aes", .cra_driver_name = "aes-aesni", .cra_priority = 300, .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = CRYPTO_AES_CTX_SIZE, .cra_module = THIS_MODULE, .cra_u = { .cipher = { .cia_min_keysize = AES_MIN_KEY_SIZE, .cia_max_keysize = AES_MAX_KEY_SIZE, .cia_setkey = aes_set_key, .cia_encrypt = aesni_encrypt, .cia_decrypt = aesni_decrypt } } }; static struct skcipher_alg aesni_skciphers[] = { { .base = { .cra_name = "ecb(aes)", .cra_driver_name = "ecb-aes-aesni", .cra_priority = 400, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = CRYPTO_AES_CTX_SIZE, .cra_module = THIS_MODULE, }, .min_keysize = AES_MIN_KEY_SIZE, .max_keysize = AES_MAX_KEY_SIZE, .setkey = aesni_skcipher_setkey, .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { .base = { .cra_name = "cbc(aes)", .cra_driver_name = "cbc-aes-aesni", .cra_priority = 400, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = CRYPTO_AES_CTX_SIZE, .cra_module = THIS_MODULE, }, .min_keysize = AES_MIN_KEY_SIZE, .max_keysize = AES_MAX_KEY_SIZE, .ivsize = AES_BLOCK_SIZE, .setkey = aesni_skcipher_setkey, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, }, { .base = { .cra_name = "cts(cbc(aes))", .cra_driver_name = "cts-cbc-aes-aesni", .cra_priority = 400, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = CRYPTO_AES_CTX_SIZE, .cra_module = THIS_MODULE, }, .min_keysize = AES_MIN_KEY_SIZE, .max_keysize = AES_MAX_KEY_SIZE, .ivsize = AES_BLOCK_SIZE, .walksize = 2 * AES_BLOCK_SIZE, .setkey = aesni_skcipher_setkey, .encrypt = cts_cbc_encrypt, .decrypt = cts_cbc_decrypt, #ifdef CONFIG_X86_64 }, { .base = { .cra_name = "ctr(aes)", .cra_driver_name = "ctr-aes-aesni", .cra_priority = 400, .cra_blocksize = 1, .cra_ctxsize = CRYPTO_AES_CTX_SIZE, .cra_module = THIS_MODULE, }, .min_keysize = AES_MIN_KEY_SIZE, .max_keysize = AES_MAX_KEY_SIZE, .ivsize = AES_BLOCK_SIZE, .chunksize = AES_BLOCK_SIZE, .setkey = aesni_skcipher_setkey, .encrypt = ctr_crypt_aesni, .decrypt = ctr_crypt_aesni, #endif }, { .base = { .cra_name = "xts(aes)", .cra_driver_name = "xts-aes-aesni", .cra_priority = 401, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = XTS_AES_CTX_SIZE, .cra_module = THIS_MODULE, }, .min_keysize = 2 * AES_MIN_KEY_SIZE, .max_keysize = 2 * AES_MAX_KEY_SIZE, .ivsize = AES_BLOCK_SIZE, .walksize = 2 * AES_BLOCK_SIZE, .setkey = xts_setkey_aesni, .encrypt = xts_encrypt_aesni, .decrypt = xts_decrypt_aesni, } }; #ifdef CONFIG_X86_64 asmlinkage void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key, u8 iv[AES_BLOCK_SIZE]); /* __always_inline to avoid indirect call */ static __always_inline int ctr_crypt(struct skcipher_request *req, void (*ctr64_func)(const struct crypto_aes_ctx *key, const u8 *src, u8 *dst, int len, const u64 le_ctr[2])) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); const struct crypto_aes_ctx *key = aes_ctx(crypto_skcipher_ctx(tfm)); unsigned int nbytes, p1_nbytes, nblocks; struct skcipher_walk walk; u64 le_ctr[2]; u64 ctr64; int err; ctr64 = le_ctr[0] = get_unaligned_be64(&req->iv[8]); le_ctr[1] = get_unaligned_be64(&req->iv[0]); err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes) != 0) { if (nbytes < walk.total) { /* Not the end yet, so keep the length block-aligned. */ nbytes = round_down(nbytes, AES_BLOCK_SIZE); nblocks = nbytes / AES_BLOCK_SIZE; } else { /* It's the end, so include any final partial block. */ nblocks = DIV_ROUND_UP(nbytes, AES_BLOCK_SIZE); } ctr64 += nblocks; kernel_fpu_begin(); if (likely(ctr64 >= nblocks)) { /* The low 64 bits of the counter won't overflow. */ (*ctr64_func)(key, walk.src.virt.addr, walk.dst.virt.addr, nbytes, le_ctr); } else { /* * The low 64 bits of the counter will overflow. The * assembly doesn't handle this case, so split the * operation into two at the point where the overflow * will occur. After the first part, add the carry bit. */ p1_nbytes = min(nbytes, (nblocks - ctr64) * AES_BLOCK_SIZE); (*ctr64_func)(key, walk.src.virt.addr, walk.dst.virt.addr, p1_nbytes, le_ctr); le_ctr[0] = 0; le_ctr[1]++; (*ctr64_func)(key, walk.src.virt.addr + p1_nbytes, walk.dst.virt.addr + p1_nbytes, nbytes - p1_nbytes, le_ctr); } kernel_fpu_end(); le_ctr[0] = ctr64; err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } put_unaligned_be64(ctr64, &req->iv[8]); put_unaligned_be64(le_ctr[1], &req->iv[0]); return err; } /* __always_inline to avoid indirect call */ static __always_inline int xctr_crypt(struct skcipher_request *req, void (*xctr_func)(const struct crypto_aes_ctx *key, const u8 *src, u8 *dst, int len, const u8 iv[AES_BLOCK_SIZE], u64 ctr)) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); const struct crypto_aes_ctx *key = aes_ctx(crypto_skcipher_ctx(tfm)); struct skcipher_walk walk; unsigned int nbytes; u64 ctr = 1; int err; err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes) != 0) { if (nbytes < walk.total) nbytes = round_down(nbytes, AES_BLOCK_SIZE); kernel_fpu_begin(); (*xctr_func)(key, walk.src.virt.addr, walk.dst.virt.addr, nbytes, req->iv, ctr); kernel_fpu_end(); ctr += DIV_ROUND_UP(nbytes, AES_BLOCK_SIZE); err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } return err; } #define DEFINE_AVX_SKCIPHER_ALGS(suffix, driver_name_suffix, priority) \ \ asmlinkage void \ aes_xts_encrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \ u8 *dst, int len, u8 tweak[AES_BLOCK_SIZE]); \ asmlinkage void \ aes_xts_decrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \ u8 *dst, int len, u8 tweak[AES_BLOCK_SIZE]); \ \ static int xts_encrypt_##suffix(struct skcipher_request *req) \ { \ return xts_crypt(req, aes_xts_encrypt_iv, aes_xts_encrypt_##suffix); \ } \ \ static int xts_decrypt_##suffix(struct skcipher_request *req) \ { \ return xts_crypt(req, aes_xts_encrypt_iv, aes_xts_decrypt_##suffix); \ } \ \ asmlinkage void \ aes_ctr64_crypt_##suffix(const struct crypto_aes_ctx *key, \ const u8 *src, u8 *dst, int len, const u64 le_ctr[2]);\ \ static int ctr_crypt_##suffix(struct skcipher_request *req) \ { \ return ctr_crypt(req, aes_ctr64_crypt_##suffix); \ } \ \ asmlinkage void \ aes_xctr_crypt_##suffix(const struct crypto_aes_ctx *key, \ const u8 *src, u8 *dst, int len, \ const u8 iv[AES_BLOCK_SIZE], u64 ctr); \ \ static int xctr_crypt_##suffix(struct skcipher_request *req) \ { \ return xctr_crypt(req, aes_xctr_crypt_##suffix); \ } \ \ static struct skcipher_alg skcipher_algs_##suffix[] = {{ \ .base.cra_name = "xts(aes)", \ .base.cra_driver_name = "xts-aes-" driver_name_suffix, \ .base.cra_priority = priority, \ .base.cra_blocksize = AES_BLOCK_SIZE, \ .base.cra_ctxsize = XTS_AES_CTX_SIZE, \ .base.cra_module = THIS_MODULE, \ .min_keysize = 2 * AES_MIN_KEY_SIZE, \ .max_keysize = 2 * AES_MAX_KEY_SIZE, \ .ivsize = AES_BLOCK_SIZE, \ .walksize = 2 * AES_BLOCK_SIZE, \ .setkey = xts_setkey_aesni, \ .encrypt = xts_encrypt_##suffix, \ .decrypt = xts_decrypt_##suffix, \ }, { \ .base.cra_name = "ctr(aes)", \ .base.cra_driver_name = "ctr-aes-" driver_name_suffix, \ .base.cra_priority = priority, \ .base.cra_blocksize = 1, \ .base.cra_ctxsize = CRYPTO_AES_CTX_SIZE, \ .base.cra_module = THIS_MODULE, \ .min_keysize = AES_MIN_KEY_SIZE, \ .max_keysize = AES_MAX_KEY_SIZE, \ .ivsize = AES_BLOCK_SIZE, \ .chunksize = AES_BLOCK_SIZE, \ .setkey = aesni_skcipher_setkey, \ .encrypt = ctr_crypt_##suffix, \ .decrypt = ctr_crypt_##suffix, \ }, { \ .base.cra_name = "xctr(aes)", \ .base.cra_driver_name = "xctr-aes-" driver_name_suffix, \ .base.cra_priority = priority, \ .base.cra_blocksize = 1, \ .base.cra_ctxsize = CRYPTO_AES_CTX_SIZE, \ .base.cra_module = THIS_MODULE, \ .min_keysize = AES_MIN_KEY_SIZE, \ .max_keysize = AES_MAX_KEY_SIZE, \ .ivsize = AES_BLOCK_SIZE, \ .chunksize = AES_BLOCK_SIZE, \ .setkey = aesni_skcipher_setkey, \ .encrypt = xctr_crypt_##suffix, \ .decrypt = xctr_crypt_##suffix, \ }} DEFINE_AVX_SKCIPHER_ALGS(aesni_avx, "aesni-avx", 500); DEFINE_AVX_SKCIPHER_ALGS(vaes_avx2, "vaes-avx2", 600); DEFINE_AVX_SKCIPHER_ALGS(vaes_avx512, "vaes-avx512", 800); /* The common part of the x86_64 AES-GCM key struct */ struct aes_gcm_key { /* Expanded AES key and the AES key length in bytes */ struct crypto_aes_ctx aes_key; /* RFC4106 nonce (used only by the rfc4106 algorithms) */ u32 rfc4106_nonce; }; /* Key struct used by the AES-NI implementations of AES-GCM */ struct aes_gcm_key_aesni { /* * Common part of the key. The assembly code requires 16-byte alignment * for the round keys; we get this by them being located at the start of * the struct and the whole struct being 16-byte aligned. */ struct aes_gcm_key base; /* * Powers of the hash key H^8 through H^1. These are 128-bit values. * They all have an extra factor of x^-1 and are byte-reversed. 16-byte * alignment is required by the assembly code. */ u64 h_powers[8][2] __aligned(16); /* * h_powers_xored[i] contains the two 64-bit halves of h_powers[i] XOR'd * together. It's used for Karatsuba multiplication. 16-byte alignment * is required by the assembly code. */ u64 h_powers_xored[8] __aligned(16); /* * H^1 times x^64 (and also the usual extra factor of x^-1). 16-byte * alignment is required by the assembly code. */ u64 h_times_x64[2] __aligned(16); }; #define AES_GCM_KEY_AESNI(key) \ container_of((key), struct aes_gcm_key_aesni, base) #define AES_GCM_KEY_AESNI_SIZE \ (sizeof(struct aes_gcm_key_aesni) + (15 & ~(CRYPTO_MINALIGN - 1))) /* Key struct used by the VAES + AVX2 implementation of AES-GCM */ struct aes_gcm_key_vaes_avx2 { /* * Common part of the key. The assembly code prefers 16-byte alignment * for the round keys; we get this by them being located at the start of * the struct and the whole struct being 32-byte aligned. */ struct aes_gcm_key base; /* * Powers of the hash key H^8 through H^1. These are 128-bit values. * They all have an extra factor of x^-1 and are byte-reversed. * The assembly code prefers 32-byte alignment for this. */ u64 h_powers[8][2] __aligned(32); /* * Each entry in this array contains the two halves of an entry of * h_powers XOR'd together, in the following order: * H^8,H^6,H^7,H^5,H^4,H^2,H^3,H^1 i.e. indices 0,2,1,3,4,6,5,7. * This is used for Karatsuba multiplication. */ u64 h_powers_xored[8]; }; #define AES_GCM_KEY_VAES_AVX2(key) \ container_of((key), struct aes_gcm_key_vaes_avx2, base) #define AES_GCM_KEY_VAES_AVX2_SIZE \ (sizeof(struct aes_gcm_key_vaes_avx2) + (31 & ~(CRYPTO_MINALIGN - 1))) /* Key struct used by the VAES + AVX512 implementation of AES-GCM */ struct aes_gcm_key_vaes_avx512 { /* * Common part of the key. The assembly code prefers 16-byte alignment * for the round keys; we get this by them being located at the start of * the struct and the whole struct being 64-byte aligned. */ struct aes_gcm_key base; /* * Powers of the hash key H^16 through H^1. These are 128-bit values. * They all have an extra factor of x^-1 and are byte-reversed. This * array is aligned to a 64-byte boundary to make it naturally aligned * for 512-bit loads, which can improve performance. (The assembly code * doesn't *need* the alignment; this is just an optimization.) */ u64 h_powers[16][2] __aligned(64); /* Three padding blocks required by the assembly code */ u64 padding[3][2]; }; #define AES_GCM_KEY_VAES_AVX512(key) \ container_of((key), struct aes_gcm_key_vaes_avx512, base) #define AES_GCM_KEY_VAES_AVX512_SIZE \ (sizeof(struct aes_gcm_key_vaes_avx512) + (63 & ~(CRYPTO_MINALIGN - 1))) /* * These flags are passed to the AES-GCM helper functions to specify the * specific version of AES-GCM (RFC4106 or not), whether it's encryption or * decryption, and which assembly functions should be called. Assembly * functions are selected using flags instead of function pointers to avoid * indirect calls (which are very expensive on x86) regardless of inlining. */ #define FLAG_RFC4106 BIT(0) #define FLAG_ENC BIT(1) #define FLAG_AVX BIT(2) #define FLAG_VAES_AVX2 BIT(3) #define FLAG_VAES_AVX512 BIT(4) static inline struct aes_gcm_key * aes_gcm_key_get(struct crypto_aead *tfm, int flags) { if (flags & FLAG_VAES_AVX512) return PTR_ALIGN(crypto_aead_ctx(tfm), 64); else if (flags & FLAG_VAES_AVX2) return PTR_ALIGN(crypto_aead_ctx(tfm), 32); else return PTR_ALIGN(crypto_aead_ctx(tfm), 16); } asmlinkage void aes_gcm_precompute_aesni(struct aes_gcm_key_aesni *key); asmlinkage void aes_gcm_precompute_aesni_avx(struct aes_gcm_key_aesni *key); asmlinkage void aes_gcm_precompute_vaes_avx2(struct aes_gcm_key_vaes_avx2 *key); asmlinkage void aes_gcm_precompute_vaes_avx512(struct aes_gcm_key_vaes_avx512 *key); static void aes_gcm_precompute(struct aes_gcm_key *key, int flags) { if (flags & FLAG_VAES_AVX512) aes_gcm_precompute_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key)); else if (flags & FLAG_VAES_AVX2) aes_gcm_precompute_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key)); else if (flags & FLAG_AVX) aes_gcm_precompute_aesni_avx(AES_GCM_KEY_AESNI(key)); else aes_gcm_precompute_aesni(AES_GCM_KEY_AESNI(key)); } asmlinkage void aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key, u8 ghash_acc[16], const u8 *aad, int aadlen); asmlinkage void aes_gcm_aad_update_aesni_avx(const struct aes_gcm_key_aesni *key, u8 ghash_acc[16], const u8 *aad, int aadlen); asmlinkage void aes_gcm_aad_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, u8 ghash_acc[16], const u8 *aad, int aadlen); asmlinkage void aes_gcm_aad_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, u8 ghash_acc[16], const u8 *aad, int aadlen); static void aes_gcm_aad_update(const struct aes_gcm_key *key, u8 ghash_acc[16], const u8 *aad, int aadlen, int flags) { if (flags & FLAG_VAES_AVX512) aes_gcm_aad_update_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key), ghash_acc, aad, aadlen); else if (flags & FLAG_VAES_AVX2) aes_gcm_aad_update_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key), ghash_acc, aad, aadlen); else if (flags & FLAG_AVX) aes_gcm_aad_update_aesni_avx(AES_GCM_KEY_AESNI(key), ghash_acc, aad, aadlen); else aes_gcm_aad_update_aesni(AES_GCM_KEY_AESNI(key), ghash_acc, aad, aadlen); } asmlinkage void aes_gcm_enc_update_aesni(const struct aes_gcm_key_aesni *key, const u32 le_ctr[4], u8 ghash_acc[16], const u8 *src, u8 *dst, int datalen); asmlinkage void aes_gcm_enc_update_aesni_avx(const struct aes_gcm_key_aesni *key, const u32 le_ctr[4], u8 ghash_acc[16], const u8 *src, u8 *dst, int datalen); asmlinkage void aes_gcm_enc_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, const u32 le_ctr[4], u8 ghash_acc[16], const u8 *src, u8 *dst, int datalen); asmlinkage void aes_gcm_enc_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, const u32 le_ctr[4], u8 ghash_acc[16], const u8 *src, u8 *dst, int datalen); asmlinkage void aes_gcm_dec_update_aesni(const struct aes_gcm_key_aesni *key, const u32 le_ctr[4], u8 ghash_acc[16], const u8 *src, u8 *dst, int datalen); asmlinkage void aes_gcm_dec_update_aesni_avx(const struct aes_gcm_key_aesni *key, const u32 le_ctr[4], u8 ghash_acc[16], const u8 *src, u8 *dst, int datalen); asmlinkage void aes_gcm_dec_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, const u32 le_ctr[4], u8 ghash_acc[16], const u8 *src, u8 *dst, int datalen); asmlinkage void aes_gcm_dec_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, const u32 le_ctr[4], u8 ghash_acc[16], const u8 *src, u8 *dst, int datalen); /* __always_inline to optimize out the branches based on @flags */ static __always_inline void aes_gcm_update(const struct aes_gcm_key *key, const u32 le_ctr[4], u8 ghash_acc[16], const u8 *src, u8 *dst, int datalen, int flags) { if (flags & FLAG_ENC) { if (flags & FLAG_VAES_AVX512) aes_gcm_enc_update_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key), le_ctr, ghash_acc, src, dst, datalen); else if (flags & FLAG_VAES_AVX2) aes_gcm_enc_update_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key), le_ctr, ghash_acc, src, dst, datalen); else if (flags & FLAG_AVX) aes_gcm_enc_update_aesni_avx(AES_GCM_KEY_AESNI(key), le_ctr, ghash_acc, src, dst, datalen); else aes_gcm_enc_update_aesni(AES_GCM_KEY_AESNI(key), le_ctr, ghash_acc, src, dst, datalen); } else { if (flags & FLAG_VAES_AVX512) aes_gcm_dec_update_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key), le_ctr, ghash_acc, src, dst, datalen); else if (flags & FLAG_VAES_AVX2) aes_gcm_dec_update_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key), le_ctr, ghash_acc, src, dst, datalen); else if (flags & FLAG_AVX) aes_gcm_dec_update_aesni_avx(AES_GCM_KEY_AESNI(key), le_ctr, ghash_acc, src, dst, datalen); else aes_gcm_dec_update_aesni(AES_GCM_KEY_AESNI(key), le_ctr, ghash_acc, src, dst, datalen); } } asmlinkage void aes_gcm_enc_final_aesni(const struct aes_gcm_key_aesni *key, const u32 le_ctr[4], u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen); asmlinkage void aes_gcm_enc_final_aesni_avx(const struct aes_gcm_key_aesni *key, const u32 le_ctr[4], u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen); asmlinkage void aes_gcm_enc_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, const u32 le_ctr[4], u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen); asmlinkage void aes_gcm_enc_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, const u32 le_ctr[4], u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen); /* __always_inline to optimize out the branches based on @flags */ static __always_inline void aes_gcm_enc_final(const struct aes_gcm_key *key, const u32 le_ctr[4], u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen, int flags) { if (flags & FLAG_VAES_AVX512) aes_gcm_enc_final_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key), le_ctr, ghash_acc, total_aadlen, total_datalen); else if (flags & FLAG_VAES_AVX2) aes_gcm_enc_final_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key), le_ctr, ghash_acc, total_aadlen, total_datalen); else if (flags & FLAG_AVX) aes_gcm_enc_final_aesni_avx(AES_GCM_KEY_AESNI(key), le_ctr, ghash_acc, total_aadlen, total_datalen); else aes_gcm_enc_final_aesni(AES_GCM_KEY_AESNI(key), le_ctr, ghash_acc, total_aadlen, total_datalen); } asmlinkage bool __must_check aes_gcm_dec_final_aesni(const struct aes_gcm_key_aesni *key, const u32 le_ctr[4], const u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen, const u8 tag[16], int taglen); asmlinkage bool __must_check aes_gcm_dec_final_aesni_avx(const struct aes_gcm_key_aesni *key, const u32 le_ctr[4], const u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen, const u8 tag[16], int taglen); asmlinkage bool __must_check aes_gcm_dec_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, const u32 le_ctr[4], const u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen, const u8 tag[16], int taglen); asmlinkage bool __must_check aes_gcm_dec_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, const u32 le_ctr[4], const u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen, const u8 tag[16], int taglen); /* __always_inline to optimize out the branches based on @flags */ static __always_inline bool __must_check aes_gcm_dec_final(const struct aes_gcm_key *key, const u32 le_ctr[4], u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen, u8 tag[16], int taglen, int flags) { if (flags & FLAG_VAES_AVX512) return aes_gcm_dec_final_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key), le_ctr, ghash_acc, total_aadlen, total_datalen, tag, taglen); else if (flags & FLAG_VAES_AVX2) return aes_gcm_dec_final_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key), le_ctr, ghash_acc, total_aadlen, total_datalen, tag, taglen); else if (flags & FLAG_AVX) return aes_gcm_dec_final_aesni_avx(AES_GCM_KEY_AESNI(key), le_ctr, ghash_acc, total_aadlen, total_datalen, tag, taglen); else return aes_gcm_dec_final_aesni(AES_GCM_KEY_AESNI(key), le_ctr, ghash_acc, total_aadlen, total_datalen, tag, taglen); } /* * This is the Integrity Check Value (aka the authentication tag) length and can * be 8, 12 or 16 bytes long. */ static int common_rfc4106_set_authsize(struct crypto_aead *aead, unsigned int authsize) { switch (authsize) { case 8: case 12: case 16: break; default: return -EINVAL; } return 0; } static int generic_gcmaes_set_authsize(struct crypto_aead *tfm, unsigned int authsize) { switch (authsize) { case 4: case 8: case 12: case 13: case 14: case 15: case 16: break; default: return -EINVAL; } return 0; } /* * This is the setkey function for the x86_64 implementations of AES-GCM. It * saves the RFC4106 nonce if applicable, expands the AES key, and precomputes * powers of the hash key. * * To comply with the crypto_aead API, this has to be usable in no-SIMD context. * For that reason, this function includes a portable C implementation of the * needed logic. However, the portable C implementation is very slow, taking * about the same time as encrypting 37 KB of data. To be ready for users that * may set a key even somewhat frequently, we therefore also include a SIMD * assembly implementation, expanding the AES key using AES-NI and precomputing * the hash key powers using PCLMULQDQ or VPCLMULQDQ. */ static int gcm_setkey(struct crypto_aead *tfm, const u8 *raw_key, unsigned int keylen, int flags) { struct aes_gcm_key *key = aes_gcm_key_get(tfm, flags); int err; if (flags & FLAG_RFC4106) { if (keylen < 4) return -EINVAL; keylen -= 4; key->rfc4106_nonce = get_unaligned_be32(raw_key + keylen); } /* The assembly code assumes the following offsets. */ BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, base.aes_key.key_enc) != 0); BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, base.aes_key.key_length) != 480); BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers) != 496); BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers_xored) != 624); BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_times_x64) != 688); BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, base.aes_key.key_enc) != 0); BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, base.aes_key.key_length) != 480); BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, h_powers) != 512); BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, h_powers_xored) != 640); BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, base.aes_key.key_enc) != 0); BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, base.aes_key.key_length) != 480); BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, h_powers) != 512); BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, padding) != 768); if (likely(crypto_simd_usable())) { err = aes_check_keylen(keylen); if (err) return err; kernel_fpu_begin(); aesni_set_key(&key->aes_key, raw_key, keylen); aes_gcm_precompute(key, flags); kernel_fpu_end(); } else { static const u8 x_to_the_minus1[16] __aligned(__alignof__(be128)) = { [0] = 0xc2, [15] = 1 }; static const u8 x_to_the_63[16] __aligned(__alignof__(be128)) = { [7] = 1, }; be128 h1 = {}; be128 h; int i; err = aes_expandkey(&key->aes_key, raw_key, keylen); if (err) return err; /* Encrypt the all-zeroes block to get the hash key H^1 */ aes_encrypt(&key->aes_key, (u8 *)&h1, (u8 *)&h1); /* Compute H^1 * x^-1 */ h = h1; gf128mul_lle(&h, (const be128 *)x_to_the_minus1); /* Compute the needed key powers */ if (flags & FLAG_VAES_AVX512) { struct aes_gcm_key_vaes_avx512 *k = AES_GCM_KEY_VAES_AVX512(key); for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) { k->h_powers[i][0] = be64_to_cpu(h.b); k->h_powers[i][1] = be64_to_cpu(h.a); gf128mul_lle(&h, &h1); } memset(k->padding, 0, sizeof(k->padding)); } else if (flags & FLAG_VAES_AVX2) { struct aes_gcm_key_vaes_avx2 *k = AES_GCM_KEY_VAES_AVX2(key); static const u8 indices[8] = { 0, 2, 1, 3, 4, 6, 5, 7 }; for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) { k->h_powers[i][0] = be64_to_cpu(h.b); k->h_powers[i][1] = be64_to_cpu(h.a); gf128mul_lle(&h, &h1); } for (i = 0; i < ARRAY_SIZE(k->h_powers_xored); i++) { int j = indices[i]; k->h_powers_xored[i] = k->h_powers[j][0] ^ k->h_powers[j][1]; } } else { struct aes_gcm_key_aesni *k = AES_GCM_KEY_AESNI(key); for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) { k->h_powers[i][0] = be64_to_cpu(h.b); k->h_powers[i][1] = be64_to_cpu(h.a); k->h_powers_xored[i] = k->h_powers[i][0] ^ k->h_powers[i][1]; gf128mul_lle(&h, &h1); } gf128mul_lle(&h1, (const be128 *)x_to_the_63); k->h_times_x64[0] = be64_to_cpu(h1.b); k->h_times_x64[1] = be64_to_cpu(h1.a); } } return 0; } /* * Initialize @ghash_acc, then pass all @assoclen bytes of associated data * (a.k.a. additional authenticated data) from @sg_src through the GHASH update * assembly function. kernel_fpu_begin() must have already been called. */ static void gcm_process_assoc(const struct aes_gcm_key *key, u8 ghash_acc[16], struct scatterlist *sg_src, unsigned int assoclen, int flags) { struct scatter_walk walk; /* * The assembly function requires that the length of any non-last * segment of associated data be a multiple of 16 bytes, so this * function does the buffering needed to achieve that. */ unsigned int pos = 0; u8 buf[16]; memset(ghash_acc, 0, 16); scatterwalk_start(&walk, sg_src); while (assoclen) { unsigned int orig_len_this_step = scatterwalk_next( &walk, assoclen); unsigned int len_this_step = orig_len_this_step; unsigned int len; const u8 *src = walk.addr; if (unlikely(pos)) { len = min(len_this_step, 16 - pos); memcpy(&buf[pos], src, len); pos += len; src += len; len_this_step -= len; if (pos < 16) goto next; aes_gcm_aad_update(key, ghash_acc, buf, 16, flags); pos = 0; } len = len_this_step; if (unlikely(assoclen)) /* Not the last segment yet? */ len = round_down(len, 16); aes_gcm_aad_update(key, ghash_acc, src, len, flags); src += len; len_this_step -= len; if (unlikely(len_this_step)) { memcpy(buf, src, len_this_step); pos = len_this_step; } next: scatterwalk_done_src(&walk, orig_len_this_step); if (need_resched()) { kernel_fpu_end(); kernel_fpu_begin(); } assoclen -= orig_len_this_step; } if (unlikely(pos)) aes_gcm_aad_update(key, ghash_acc, buf, pos, flags); } /* __always_inline to optimize out the branches based on @flags */ static __always_inline int gcm_crypt(struct aead_request *req, int flags) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); const struct aes_gcm_key *key = aes_gcm_key_get(tfm, flags); unsigned int assoclen = req->assoclen; struct skcipher_walk walk; unsigned int nbytes; u8 ghash_acc[16]; /* GHASH accumulator */ u32 le_ctr[4]; /* Counter in little-endian format */ int taglen; int err; /* Initialize the counter and determine the associated data length. */ le_ctr[0] = 2; if (flags & FLAG_RFC4106) { if (unlikely(assoclen != 16 && assoclen != 20)) return -EINVAL; assoclen -= 8; le_ctr[1] = get_unaligned_be32(req->iv + 4); le_ctr[2] = get_unaligned_be32(req->iv + 0); le_ctr[3] = key->rfc4106_nonce; /* already byte-swapped */ } else { le_ctr[1] = get_unaligned_be32(req->iv + 8); le_ctr[2] = get_unaligned_be32(req->iv + 4); le_ctr[3] = get_unaligned_be32(req->iv + 0); } /* Begin walking through the plaintext or ciphertext. */ if (flags & FLAG_ENC) err = skcipher_walk_aead_encrypt(&walk, req, false); else err = skcipher_walk_aead_decrypt(&walk, req, false); if (err) return err; /* * Since the AES-GCM assembly code requires that at least three assembly * functions be called to process any message (this is needed to support * incremental updates cleanly), to reduce overhead we try to do all * three calls in the same kernel FPU section if possible. We close the * section and start a new one if there are multiple data segments or if * rescheduling is needed while processing the associated data. */ kernel_fpu_begin(); /* Pass the associated data through GHASH. */ gcm_process_assoc(key, ghash_acc, req->src, assoclen, flags); /* En/decrypt the data and pass the ciphertext through GHASH. */ while (unlikely((nbytes = walk.nbytes) < walk.total)) { /* * Non-last segment. In this case, the assembly function * requires that the length be a multiple of 16 (AES_BLOCK_SIZE) * bytes. The needed buffering of up to 16 bytes is handled by * the skcipher_walk. Here we just need to round down to a * multiple of 16. */ nbytes = round_down(nbytes, AES_BLOCK_SIZE); aes_gcm_update(key, le_ctr, ghash_acc, walk.src.virt.addr, walk.dst.virt.addr, nbytes, flags); le_ctr[0] += nbytes / AES_BLOCK_SIZE; kernel_fpu_end(); err = skcipher_walk_done(&walk, walk.nbytes - nbytes); if (err) return err; kernel_fpu_begin(); } /* Last segment: process all remaining data. */ aes_gcm_update(key, le_ctr, ghash_acc, walk.src.virt.addr, walk.dst.virt.addr, nbytes, flags); /* * The low word of the counter isn't used by the finalize, so there's no * need to increment it here. */ /* Finalize */ taglen = crypto_aead_authsize(tfm); if (flags & FLAG_ENC) { /* Finish computing the auth tag. */ aes_gcm_enc_final(key, le_ctr, ghash_acc, assoclen, req->cryptlen, flags); /* Store the computed auth tag in the dst scatterlist. */ scatterwalk_map_and_copy(ghash_acc, req->dst, req->assoclen + req->cryptlen, taglen, 1); } else { unsigned int datalen = req->cryptlen - taglen; u8 tag[16]; /* Get the transmitted auth tag from the src scatterlist. */ scatterwalk_map_and_copy(tag, req->src, req->assoclen + datalen, taglen, 0); /* * Finish computing the auth tag and compare it to the * transmitted one. The assembly function does the actual tag * comparison. Here, just check the boolean result. */ if (!aes_gcm_dec_final(key, le_ctr, ghash_acc, assoclen, datalen, tag, taglen, flags)) err = -EBADMSG; } kernel_fpu_end(); if (nbytes) skcipher_walk_done(&walk, 0); return err; } #define DEFINE_GCM_ALGS(suffix, flags, generic_driver_name, rfc_driver_name, \ ctxsize, priority) \ \ static int gcm_setkey_##suffix(struct crypto_aead *tfm, const u8 *raw_key, \ unsigned int keylen) \ { \ return gcm_setkey(tfm, raw_key, keylen, (flags)); \ } \ \ static int gcm_encrypt_##suffix(struct aead_request *req) \ { \ return gcm_crypt(req, (flags) | FLAG_ENC); \ } \ \ static int gcm_decrypt_##suffix(struct aead_request *req) \ { \ return gcm_crypt(req, (flags)); \ } \ \ static int rfc4106_setkey_##suffix(struct crypto_aead *tfm, const u8 *raw_key, \ unsigned int keylen) \ { \ return gcm_setkey(tfm, raw_key, keylen, (flags) | FLAG_RFC4106); \ } \ \ static int rfc4106_encrypt_##suffix(struct aead_request *req) \ { \ return gcm_crypt(req, (flags) | FLAG_RFC4106 | FLAG_ENC); \ } \ \ static int rfc4106_decrypt_##suffix(struct aead_request *req) \ { \ return gcm_crypt(req, (flags) | FLAG_RFC4106); \ } \ \ static struct aead_alg aes_gcm_algs_##suffix[] = { { \ .setkey = gcm_setkey_##suffix, \ .setauthsize = generic_gcmaes_set_authsize, \ .encrypt = gcm_encrypt_##suffix, \ .decrypt = gcm_decrypt_##suffix, \ .ivsize = GCM_AES_IV_SIZE, \ .chunksize = AES_BLOCK_SIZE, \ .maxauthsize = 16, \ .base = { \ .cra_name = "gcm(aes)", \ .cra_driver_name = generic_driver_name, \ .cra_priority = (priority), \ .cra_blocksize = 1, \ .cra_ctxsize = (ctxsize), \ .cra_module = THIS_MODULE, \ }, \ }, { \ .setkey = rfc4106_setkey_##suffix, \ .setauthsize = common_rfc4106_set_authsize, \ .encrypt = rfc4106_encrypt_##suffix, \ .decrypt = rfc4106_decrypt_##suffix, \ .ivsize = GCM_RFC4106_IV_SIZE, \ .chunksize = AES_BLOCK_SIZE, \ .maxauthsize = 16, \ .base = { \ .cra_name = "rfc4106(gcm(aes))", \ .cra_driver_name = rfc_driver_name, \ .cra_priority = (priority), \ .cra_blocksize = 1, \ .cra_ctxsize = (ctxsize), \ .cra_module = THIS_MODULE, \ }, \ } } /* aes_gcm_algs_aesni */ DEFINE_GCM_ALGS(aesni, /* no flags */ 0, "generic-gcm-aesni", "rfc4106-gcm-aesni", AES_GCM_KEY_AESNI_SIZE, 400); /* aes_gcm_algs_aesni_avx */ DEFINE_GCM_ALGS(aesni_avx, FLAG_AVX, "generic-gcm-aesni-avx", "rfc4106-gcm-aesni-avx", AES_GCM_KEY_AESNI_SIZE, 500); /* aes_gcm_algs_vaes_avx2 */ DEFINE_GCM_ALGS(vaes_avx2, FLAG_VAES_AVX2, "generic-gcm-vaes-avx2", "rfc4106-gcm-vaes-avx2", AES_GCM_KEY_VAES_AVX2_SIZE, 600); /* aes_gcm_algs_vaes_avx512 */ DEFINE_GCM_ALGS(vaes_avx512, FLAG_VAES_AVX512, "generic-gcm-vaes-avx512", "rfc4106-gcm-vaes-avx512", AES_GCM_KEY_VAES_AVX512_SIZE, 800); static int __init register_avx_algs(void) { int err; if (!boot_cpu_has(X86_FEATURE_AVX)) return 0; err = crypto_register_skciphers(skcipher_algs_aesni_avx, ARRAY_SIZE(skcipher_algs_aesni_avx)); if (err) return err; err = crypto_register_aeads(aes_gcm_algs_aesni_avx, ARRAY_SIZE(aes_gcm_algs_aesni_avx)); if (err) return err; /* * Note: not all the algorithms registered below actually require * VPCLMULQDQ. But in practice every CPU with VAES also has VPCLMULQDQ. * Similarly, the assembler support was added at about the same time. * For simplicity, just always check for VAES and VPCLMULQDQ together. */ if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_VAES) || !boot_cpu_has(X86_FEATURE_VPCLMULQDQ) || !boot_cpu_has(X86_FEATURE_PCLMULQDQ) || !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) return 0; err = crypto_register_skciphers(skcipher_algs_vaes_avx2, ARRAY_SIZE(skcipher_algs_vaes_avx2)); if (err) return err; err = crypto_register_aeads(aes_gcm_algs_vaes_avx2, ARRAY_SIZE(aes_gcm_algs_vaes_avx2)); if (err) return err; if (!boot_cpu_has(X86_FEATURE_AVX512BW) || !boot_cpu_has(X86_FEATURE_AVX512VL) || !boot_cpu_has(X86_FEATURE_BMI2) || !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL)) return 0; if (boot_cpu_has(X86_FEATURE_PREFER_YMM)) { int i; for (i = 0; i < ARRAY_SIZE(skcipher_algs_vaes_avx512); i++) skcipher_algs_vaes_avx512[i].base.cra_priority = 1; for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx512); i++) aes_gcm_algs_vaes_avx512[i].base.cra_priority = 1; } err = crypto_register_skciphers(skcipher_algs_vaes_avx512, ARRAY_SIZE(skcipher_algs_vaes_avx512)); if (err) return err; err = crypto_register_aeads(aes_gcm_algs_vaes_avx512, ARRAY_SIZE(aes_gcm_algs_vaes_avx512)); if (err) return err; return 0; } #define unregister_skciphers(A) \ if (refcount_read(&(A)[0].base.cra_refcnt) != 0) \ crypto_unregister_skciphers((A), ARRAY_SIZE(A)) #define unregister_aeads(A) \ if (refcount_read(&(A)[0].base.cra_refcnt) != 0) \ crypto_unregister_aeads((A), ARRAY_SIZE(A)) static void unregister_avx_algs(void) { unregister_skciphers(skcipher_algs_aesni_avx); unregister_aeads(aes_gcm_algs_aesni_avx); unregister_skciphers(skcipher_algs_vaes_avx2); unregister_skciphers(skcipher_algs_vaes_avx512); unregister_aeads(aes_gcm_algs_vaes_avx2); unregister_aeads(aes_gcm_algs_vaes_avx512); } #else /* CONFIG_X86_64 */ static struct aead_alg aes_gcm_algs_aesni[0]; static int __init register_avx_algs(void) { return 0; } static void unregister_avx_algs(void) { } #endif /* !CONFIG_X86_64 */ static const struct x86_cpu_id aesni_cpu_id[] = { X86_MATCH_FEATURE(X86_FEATURE_AES, NULL), {} }; MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id); static int __init aesni_init(void) { int err; if (!x86_match_cpu(aesni_cpu_id)) return -ENODEV; err = crypto_register_alg(&aesni_cipher_alg); if (err) return err; err = crypto_register_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers)); if (err) goto unregister_cipher; err = crypto_register_aeads(aes_gcm_algs_aesni, ARRAY_SIZE(aes_gcm_algs_aesni)); if (err) goto unregister_skciphers; err = register_avx_algs(); if (err) goto unregister_avx; return 0; unregister_avx: unregister_avx_algs(); crypto_unregister_aeads(aes_gcm_algs_aesni, ARRAY_SIZE(aes_gcm_algs_aesni)); unregister_skciphers: crypto_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers)); unregister_cipher: crypto_unregister_alg(&aesni_cipher_alg); return err; } static void __exit aesni_exit(void) { crypto_unregister_aeads(aes_gcm_algs_aesni, ARRAY_SIZE(aes_gcm_algs_aesni)); crypto_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers)); crypto_unregister_alg(&aesni_cipher_alg); unregister_avx_algs(); } module_init(aesni_init); module_exit(aesni_exit); MODULE_DESCRIPTION("AES cipher and modes, optimized with AES-NI or VAES instructions"); MODULE_LICENSE("GPL"); MODULE_ALIAS_CRYPTO("aes");
62 16 16 173 53 2 82 18 101 11 20 101 77 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * AEAD: Authenticated Encryption with Associated Data * * Copyright (c) 2007-2015 Herbert Xu <herbert@gondor.apana.org.au> */ #ifndef _CRYPTO_AEAD_H #define _CRYPTO_AEAD_H #include <linux/atomic.h> #include <linux/container_of.h> #include <linux/crypto.h> #include <linux/slab.h> #include <linux/types.h> /** * DOC: Authenticated Encryption With Associated Data (AEAD) Cipher API * * The AEAD cipher API is used with the ciphers of type CRYPTO_ALG_TYPE_AEAD * (listed as type "aead" in /proc/crypto) * * The most prominent examples for this type of encryption is GCM and CCM. * However, the kernel supports other types of AEAD ciphers which are defined * with the following cipher string: * * authenc(keyed message digest, block cipher) * * For example: authenc(hmac(sha256), cbc(aes)) * * The example code provided for the symmetric key cipher operation applies * here as well. Naturally all *skcipher* symbols must be exchanged the *aead* * pendants discussed in the following. In addition, for the AEAD operation, * the aead_request_set_ad function must be used to set the pointer to the * associated data memory location before performing the encryption or * decryption operation. Another deviation from the asynchronous block cipher * operation is that the caller should explicitly check for -EBADMSG of the * crypto_aead_decrypt. That error indicates an authentication error, i.e. * a breach in the integrity of the message. In essence, that -EBADMSG error * code is the key bonus an AEAD cipher has over "standard" block chaining * modes. * * Memory Structure: * * The source scatterlist must contain the concatenation of * associated data || plaintext or ciphertext. * * The destination scatterlist has the same layout, except that the plaintext * (resp. ciphertext) will grow (resp. shrink) by the authentication tag size * during encryption (resp. decryption). The authentication tag is generated * during the encryption operation and appended to the ciphertext. During * decryption, the authentication tag is consumed along with the ciphertext and * used to verify the integrity of the plaintext and the associated data. * * In-place encryption/decryption is enabled by using the same scatterlist * pointer for both the source and destination. * * Even in the out-of-place case, space must be reserved in the destination for * the associated data, even though it won't be written to. This makes the * in-place and out-of-place cases more consistent. It is permissible for the * "destination" associated data to alias the "source" associated data. * * As with the other scatterlist crypto APIs, zero-length scatterlist elements * are not allowed in the used part of the scatterlist. Thus, if there is no * associated data, the first element must point to the plaintext/ciphertext. * * To meet the needs of IPsec, a special quirk applies to rfc4106, rfc4309, * rfc4543, and rfc7539esp ciphers. For these ciphers, the final 'ivsize' bytes * of the associated data buffer must contain a second copy of the IV. This is * in addition to the copy passed to aead_request_set_crypt(). These two IV * copies must not differ; different implementations of the same algorithm may * behave differently in that case. Note that the algorithm might not actually * treat the IV as associated data; nevertheless the length passed to * aead_request_set_ad() must include it. */ struct crypto_aead; struct scatterlist; /** * struct aead_request - AEAD request * @base: Common attributes for async crypto requests * @assoclen: Length in bytes of associated data for authentication * @cryptlen: Length of data to be encrypted or decrypted * @iv: Initialisation vector * @src: Source data * @dst: Destination data * @__ctx: Start of private context data */ struct aead_request { struct crypto_async_request base; unsigned int assoclen; unsigned int cryptlen; u8 *iv; struct scatterlist *src; struct scatterlist *dst; void *__ctx[] CRYPTO_MINALIGN_ATTR; }; /** * struct aead_alg - AEAD cipher definition * @maxauthsize: Set the maximum authentication tag size supported by the * transformation. A transformation may support smaller tag sizes. * As the authentication tag is a message digest to ensure the * integrity of the encrypted data, a consumer typically wants the * largest authentication tag possible as defined by this * variable. * @setauthsize: Set authentication size for the AEAD transformation. This * function is used to specify the consumer requested size of the * authentication tag to be either generated by the transformation * during encryption or the size of the authentication tag to be * supplied during the decryption operation. This function is also * responsible for checking the authentication tag size for * validity. * @setkey: see struct skcipher_alg * @encrypt: see struct skcipher_alg * @decrypt: see struct skcipher_alg * @ivsize: see struct skcipher_alg * @chunksize: see struct skcipher_alg * @init: Initialize the cryptographic transformation object. This function * is used to initialize the cryptographic transformation object. * This function is called only once at the instantiation time, right * after the transformation context was allocated. In case the * cryptographic hardware has some special requirements which need to * be handled by software, this function shall check for the precise * requirement of the transformation and put any software fallbacks * in place. * @exit: Deinitialize the cryptographic transformation object. This is a * counterpart to @init, used to remove various changes set in * @init. * @base: Definition of a generic crypto cipher algorithm. * * All fields except @ivsize is mandatory and must be filled. */ struct aead_alg { int (*setkey)(struct crypto_aead *tfm, const u8 *key, unsigned int keylen); int (*setauthsize)(struct crypto_aead *tfm, unsigned int authsize); int (*encrypt)(struct aead_request *req); int (*decrypt)(struct aead_request *req); int (*init)(struct crypto_aead *tfm); void (*exit)(struct crypto_aead *tfm); unsigned int ivsize; unsigned int maxauthsize; unsigned int chunksize; struct crypto_alg base; }; struct crypto_aead { unsigned int authsize; unsigned int reqsize; struct crypto_tfm base; }; struct crypto_sync_aead { struct crypto_aead base; }; #define MAX_SYNC_AEAD_REQSIZE 384 #define SYNC_AEAD_REQUEST_ON_STACK(name, _tfm) \ char __##name##_desc[sizeof(struct aead_request) + \ MAX_SYNC_AEAD_REQSIZE \ ] CRYPTO_MINALIGN_ATTR; \ struct aead_request *name = \ (((struct aead_request *)__##name##_desc)->base.tfm = \ crypto_sync_aead_tfm((_tfm)), \ (void *)__##name##_desc) static inline struct crypto_aead *__crypto_aead_cast(struct crypto_tfm *tfm) { return container_of(tfm, struct crypto_aead, base); } /** * crypto_alloc_aead() - allocate AEAD cipher handle * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * AEAD cipher * @type: specifies the type of the cipher * @mask: specifies the mask for the cipher * * Allocate a cipher handle for an AEAD. The returned struct * crypto_aead is the cipher handle that is required for any subsequent * API invocation for that AEAD. * * Return: allocated cipher handle in case of success; IS_ERR() is true in case * of an error, PTR_ERR() returns the error code. */ struct crypto_aead *crypto_alloc_aead(const char *alg_name, u32 type, u32 mask); struct crypto_sync_aead *crypto_alloc_sync_aead(const char *alg_name, u32 type, u32 mask); static inline struct crypto_tfm *crypto_aead_tfm(struct crypto_aead *tfm) { return &tfm->base; } static inline struct crypto_tfm *crypto_sync_aead_tfm(struct crypto_sync_aead *tfm) { return crypto_aead_tfm(&tfm->base); } /** * crypto_free_aead() - zeroize and free aead handle * @tfm: cipher handle to be freed * * If @tfm is a NULL or error pointer, this function does nothing. */ static inline void crypto_free_aead(struct crypto_aead *tfm) { crypto_destroy_tfm(tfm, crypto_aead_tfm(tfm)); } static inline void crypto_free_sync_aead(struct crypto_sync_aead *tfm) { crypto_free_aead(&tfm->base); } /** * crypto_has_aead() - Search for the availability of an aead. * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * aead * @type: specifies the type of the aead * @mask: specifies the mask for the aead * * Return: true when the aead is known to the kernel crypto API; false * otherwise */ int crypto_has_aead(const char *alg_name, u32 type, u32 mask); static inline const char *crypto_aead_driver_name(struct crypto_aead *tfm) { return crypto_tfm_alg_driver_name(crypto_aead_tfm(tfm)); } static inline struct aead_alg *crypto_aead_alg(struct crypto_aead *tfm) { return container_of(crypto_aead_tfm(tfm)->__crt_alg, struct aead_alg, base); } static inline unsigned int crypto_aead_alg_ivsize(struct aead_alg *alg) { return alg->ivsize; } /** * crypto_aead_ivsize() - obtain IV size * @tfm: cipher handle * * The size of the IV for the aead referenced by the cipher handle is * returned. This IV size may be zero if the cipher does not need an IV. * * Return: IV size in bytes */ static inline unsigned int crypto_aead_ivsize(struct crypto_aead *tfm) { return crypto_aead_alg_ivsize(crypto_aead_alg(tfm)); } static inline unsigned int crypto_sync_aead_ivsize(struct crypto_sync_aead *tfm) { return crypto_aead_ivsize(&tfm->base); } /** * crypto_aead_authsize() - obtain maximum authentication data size * @tfm: cipher handle * * The maximum size of the authentication data for the AEAD cipher referenced * by the AEAD cipher handle is returned. The authentication data size may be * zero if the cipher implements a hard-coded maximum. * * The authentication data may also be known as "tag value". * * Return: authentication data size / tag size in bytes */ static inline unsigned int crypto_aead_authsize(struct crypto_aead *tfm) { return tfm->authsize; } static inline unsigned int crypto_sync_aead_authsize(struct crypto_sync_aead *tfm) { return crypto_aead_authsize(&tfm->base); } static inline unsigned int crypto_aead_alg_maxauthsize(struct aead_alg *alg) { return alg->maxauthsize; } static inline unsigned int crypto_aead_maxauthsize(struct crypto_aead *aead) { return crypto_aead_alg_maxauthsize(crypto_aead_alg(aead)); } static inline unsigned int crypto_sync_aead_maxauthsize(struct crypto_sync_aead *tfm) { return crypto_aead_maxauthsize(&tfm->base); } /** * crypto_aead_blocksize() - obtain block size of cipher * @tfm: cipher handle * * The block size for the AEAD referenced with the cipher handle is returned. * The caller may use that information to allocate appropriate memory for the * data returned by the encryption or decryption operation * * Return: block size of cipher */ static inline unsigned int crypto_aead_blocksize(struct crypto_aead *tfm) { return crypto_tfm_alg_blocksize(crypto_aead_tfm(tfm)); } static inline unsigned int crypto_sync_aead_blocksize(struct crypto_sync_aead *tfm) { return crypto_aead_blocksize(&tfm->base); } static inline unsigned int crypto_aead_alignmask(struct crypto_aead *tfm) { return crypto_tfm_alg_alignmask(crypto_aead_tfm(tfm)); } static inline u32 crypto_aead_get_flags(struct crypto_aead *tfm) { return crypto_tfm_get_flags(crypto_aead_tfm(tfm)); } static inline void crypto_aead_set_flags(struct crypto_aead *tfm, u32 flags) { crypto_tfm_set_flags(crypto_aead_tfm(tfm), flags); } static inline void crypto_aead_clear_flags(struct crypto_aead *tfm, u32 flags) { crypto_tfm_clear_flags(crypto_aead_tfm(tfm), flags); } static inline u32 crypto_sync_aead_get_flags(struct crypto_sync_aead *tfm) { return crypto_aead_get_flags(&tfm->base); } static inline void crypto_sync_aead_set_flags(struct crypto_sync_aead *tfm, u32 flags) { crypto_aead_set_flags(&tfm->base, flags); } static inline void crypto_sync_aead_clear_flags(struct crypto_sync_aead *tfm, u32 flags) { crypto_aead_clear_flags(&tfm->base, flags); } /** * crypto_aead_setkey() - set key for cipher * @tfm: cipher handle * @key: buffer holding the key * @keylen: length of the key in bytes * * The caller provided key is set for the AEAD referenced by the cipher * handle. * * Note, the key length determines the cipher type. Many block ciphers implement * different cipher modes depending on the key size, such as AES-128 vs AES-192 * vs. AES-256. When providing a 16 byte key for an AES cipher handle, AES-128 * is performed. * * Return: 0 if the setting of the key was successful; < 0 if an error occurred */ int crypto_aead_setkey(struct crypto_aead *tfm, const u8 *key, unsigned int keylen); static inline int crypto_sync_aead_setkey(struct crypto_sync_aead *tfm, const u8 *key, unsigned int keylen) { return crypto_aead_setkey(&tfm->base, key, keylen); } /** * crypto_aead_setauthsize() - set authentication data size * @tfm: cipher handle * @authsize: size of the authentication data / tag in bytes * * Set the authentication data size / tag size. AEAD requires an authentication * tag (or MAC) in addition to the associated data. * * Return: 0 if the setting of the key was successful; < 0 if an error occurred */ int crypto_aead_setauthsize(struct crypto_aead *tfm, unsigned int authsize); static inline int crypto_sync_aead_setauthsize(struct crypto_sync_aead *tfm, unsigned int authsize) { return crypto_aead_setauthsize(&tfm->base, authsize); } static inline struct crypto_aead *crypto_aead_reqtfm(struct aead_request *req) { return __crypto_aead_cast(req->base.tfm); } static inline struct crypto_sync_aead *crypto_sync_aead_reqtfm(struct aead_request *req) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); return container_of(tfm, struct crypto_sync_aead, base); } /** * crypto_aead_encrypt() - encrypt plaintext * @req: reference to the aead_request handle that holds all information * needed to perform the cipher operation * * Encrypt plaintext data using the aead_request handle. That data structure * and how it is filled with data is discussed with the aead_request_* * functions. * * IMPORTANT NOTE The encryption operation creates the authentication data / * tag. That data is concatenated with the created ciphertext. * The ciphertext memory size is therefore the given number of * block cipher blocks + the size defined by the * crypto_aead_setauthsize invocation. The caller must ensure * that sufficient memory is available for the ciphertext and * the authentication tag. * * Return: 0 if the cipher operation was successful; < 0 if an error occurred */ int crypto_aead_encrypt(struct aead_request *req); /** * crypto_aead_decrypt() - decrypt ciphertext * @req: reference to the aead_request handle that holds all information * needed to perform the cipher operation * * Decrypt ciphertext data using the aead_request handle. That data structure * and how it is filled with data is discussed with the aead_request_* * functions. * * IMPORTANT NOTE The caller must concatenate the ciphertext followed by the * authentication data / tag. That authentication data / tag * must have the size defined by the crypto_aead_setauthsize * invocation. * * * Return: 0 if the cipher operation was successful; -EBADMSG: The AEAD * cipher operation performs the authentication of the data during the * decryption operation. Therefore, the function returns this error if * the authentication of the ciphertext was unsuccessful (i.e. the * integrity of the ciphertext or the associated data was violated); * < 0 if an error occurred. */ int crypto_aead_decrypt(struct aead_request *req); /** * DOC: Asynchronous AEAD Request Handle * * The aead_request data structure contains all pointers to data required for * the AEAD cipher operation. This includes the cipher handle (which can be * used by multiple aead_request instances), pointer to plaintext and * ciphertext, asynchronous callback function, etc. It acts as a handle to the * aead_request_* API calls in a similar way as AEAD handle to the * crypto_aead_* API calls. */ /** * crypto_aead_reqsize() - obtain size of the request data structure * @tfm: cipher handle * * Return: number of bytes */ static inline unsigned int crypto_aead_reqsize(struct crypto_aead *tfm) { return tfm->reqsize; } /** * aead_request_set_tfm() - update cipher handle reference in request * @req: request handle to be modified * @tfm: cipher handle that shall be added to the request handle * * Allow the caller to replace the existing aead handle in the request * data structure with a different one. */ static inline void aead_request_set_tfm(struct aead_request *req, struct crypto_aead *tfm) { req->base.tfm = crypto_aead_tfm(tfm); } static inline void aead_request_set_sync_tfm(struct aead_request *req, struct crypto_sync_aead *tfm) { aead_request_set_tfm(req, &tfm->base); } /** * aead_request_alloc() - allocate request data structure * @tfm: cipher handle to be registered with the request * @gfp: memory allocation flag that is handed to kmalloc by the API call. * * Allocate the request data structure that must be used with the AEAD * encrypt and decrypt API calls. During the allocation, the provided aead * handle is registered in the request data structure. * * Return: allocated request handle in case of success, or NULL if out of memory */ static inline struct aead_request *aead_request_alloc(struct crypto_aead *tfm, gfp_t gfp) { struct aead_request *req; req = kmalloc(sizeof(*req) + crypto_aead_reqsize(tfm), gfp); if (likely(req)) aead_request_set_tfm(req, tfm); return req; } /** * aead_request_free() - zeroize and free request data structure * @req: request data structure cipher handle to be freed */ static inline void aead_request_free(struct aead_request *req) { kfree_sensitive(req); } /** * aead_request_set_callback() - set asynchronous callback function * @req: request handle * @flags: specify zero or an ORing of the flags * CRYPTO_TFM_REQ_MAY_BACKLOG the request queue may back log and * increase the wait queue beyond the initial maximum size; * CRYPTO_TFM_REQ_MAY_SLEEP the request processing may sleep * @compl: callback function pointer to be registered with the request handle * @data: The data pointer refers to memory that is not used by the kernel * crypto API, but provided to the callback function for it to use. Here, * the caller can provide a reference to memory the callback function can * operate on. As the callback function is invoked asynchronously to the * related functionality, it may need to access data structures of the * related functionality which can be referenced using this pointer. The * callback function can access the memory via the "data" field in the * crypto_async_request data structure provided to the callback function. * * Setting the callback function that is triggered once the cipher operation * completes * * The callback function is registered with the aead_request handle and * must comply with the following template:: * * void callback_function(struct crypto_async_request *req, int error) */ static inline void aead_request_set_callback(struct aead_request *req, u32 flags, crypto_completion_t compl, void *data) { req->base.complete = compl; req->base.data = data; req->base.flags = flags; } /** * aead_request_set_crypt - set data buffers * @req: request handle * @src: source scatter / gather list * @dst: destination scatter / gather list * @cryptlen: number of bytes to process from @src * @iv: IV for the cipher operation which must comply with the IV size defined * by crypto_aead_ivsize() * * Setting the source data and destination data scatter / gather lists which * hold the associated data concatenated with the plaintext or ciphertext. See * below for the authentication tag. * * For encryption, the source is treated as the plaintext and the * destination is the ciphertext. For a decryption operation, the use is * reversed - the source is the ciphertext and the destination is the plaintext. * * The memory structure for cipher operation has the following structure: * * - AEAD encryption input: assoc data || plaintext * - AEAD encryption output: assoc data || ciphertext || auth tag * - AEAD decryption input: assoc data || ciphertext || auth tag * - AEAD decryption output: assoc data || plaintext * * Albeit the kernel requires the presence of the AAD buffer, however, * the kernel does not fill the AAD buffer in the output case. If the * caller wants to have that data buffer filled, the caller must either * use an in-place cipher operation (i.e. same memory location for * input/output memory location). */ static inline void aead_request_set_crypt(struct aead_request *req, struct scatterlist *src, struct scatterlist *dst, unsigned int cryptlen, u8 *iv) { req->src = src; req->dst = dst; req->cryptlen = cryptlen; req->iv = iv; } /** * aead_request_set_ad - set associated data information * @req: request handle * @assoclen: number of bytes in associated data * * Setting the AD information. This function sets the length of * the associated data. */ static inline void aead_request_set_ad(struct aead_request *req, unsigned int assoclen) { req->assoclen = assoclen; } #endif /* _CRYPTO_AEAD_H */
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 /* * linux/fs/hfs/hfs_fs.h * * Copyright (C) 1995-1997 Paul H. Hargrove * (C) 2003 Ardis Technologies <roman@ardistech.com> * This file may be distributed under the terms of the GNU General Public License. */ #ifndef _LINUX_HFS_FS_H #define _LINUX_HFS_FS_H #include <linux/slab.h> #include <linux/types.h> #include <linux/mutex.h> #include <linux/buffer_head.h> #include <linux/fs.h> #include <linux/workqueue.h> #include <asm/byteorder.h> #include <linux/uaccess.h> #include "hfs.h" /* * struct hfs_inode_info * * The HFS-specific part of a Linux (struct inode) */ struct hfs_inode_info { atomic_t opencnt; unsigned int flags; /* to deal with localtime ugliness */ int tz_secondswest; struct hfs_cat_key cat_key; struct list_head open_dir_list; spinlock_t open_dir_lock; struct inode *rsrc_inode; struct mutex extents_lock; u16 alloc_blocks, clump_blocks; sector_t fs_blocks; /* Allocation extents from catlog record or volume header */ hfs_extent_rec first_extents; u16 first_blocks; hfs_extent_rec cached_extents; u16 cached_start, cached_blocks; loff_t phys_size; struct inode vfs_inode; }; #define HFS_FLG_RSRC 0x0001 #define HFS_FLG_EXT_DIRTY 0x0002 #define HFS_FLG_EXT_NEW 0x0004 #define HFS_IS_RSRC(inode) (HFS_I(inode)->flags & HFS_FLG_RSRC) /* * struct hfs_sb_info * * The HFS-specific part of a Linux (struct super_block) */ struct hfs_sb_info { struct buffer_head *mdb_bh; /* The hfs_buffer holding the real superblock (aka VIB or MDB) */ struct hfs_mdb *mdb; struct buffer_head *alt_mdb_bh; /* The hfs_buffer holding the alternate superblock */ struct hfs_mdb *alt_mdb; __be32 *bitmap; /* The page holding the allocation bitmap */ struct hfs_btree *ext_tree; /* Information about the extents b-tree */ struct hfs_btree *cat_tree; /* Information about the catalog b-tree */ atomic64_t file_count; /* The number of regular files in the filesystem */ atomic64_t folder_count; /* The number of directories in the filesystem */ atomic64_t next_id; /* The next available file id number */ u32 clumpablks; /* The number of allocation blocks to try to add when extending a file */ u32 fs_start; /* The first 512-byte block represented in the bitmap */ u32 part_start; u16 root_files; /* The number of regular (non-directory) files in the root directory */ u16 root_dirs; /* The number of directories in the root directory */ u16 fs_ablocks; /* The number of allocation blocks in the filesystem */ u16 free_ablocks; /* the number of unused allocation blocks in the filesystem */ u32 alloc_blksz; /* The size of an "allocation block" */ int s_quiet; /* Silent failure when changing owner or mode? */ __be32 s_type; /* Type for new files */ __be32 s_creator; /* Creator for new files */ umode_t s_file_umask; /* The umask applied to the permissions on all files */ umode_t s_dir_umask; /* The umask applied to the permissions on all dirs */ kuid_t s_uid; /* The uid of all files */ kgid_t s_gid; /* The gid of all files */ int session, part; struct nls_table *nls_io, *nls_disk; struct mutex bitmap_lock; unsigned long flags; u16 blockoffset; int fs_div; struct super_block *sb; int work_queued; /* non-zero delayed work is queued */ struct delayed_work mdb_work; /* MDB flush delayed work */ spinlock_t work_lock; /* protects mdb_work and work_queued */ }; #define HFS_FLG_BITMAP_DIRTY 0 #define HFS_FLG_MDB_DIRTY 1 #define HFS_FLG_ALT_MDB_DIRTY 2 /* bitmap.c */ extern u32 hfs_vbm_search_free(struct super_block *sb, u32 goal, u32 *num_bits); extern int hfs_clear_vbm_bits(struct super_block *sb, u16 start, u16 count); /* catalog.c */ extern int hfs_cat_keycmp(const btree_key *key1, const btree_key *key2); struct hfs_find_data; extern int hfs_cat_find_brec(struct super_block *sb, u32 cnid, struct hfs_find_data *fd); extern int hfs_cat_create(u32 cnid, struct inode *dir, const struct qstr *str, struct inode *inode); extern int hfs_cat_delete(u32 cnid, struct inode *dir, const struct qstr *str); extern int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name, struct inode *dst_dir, const struct qstr *dst_name); extern void hfs_cat_build_key(struct super_block *sb, btree_key *key, u32 parent, const struct qstr *name); /* dir.c */ extern const struct file_operations hfs_dir_operations; extern const struct inode_operations hfs_dir_inode_operations; /* extent.c */ extern int hfs_ext_keycmp(const btree_key *key1, const btree_key *key2); extern u16 hfs_ext_find_block(struct hfs_extent *ext, u16 off); extern int hfs_free_fork(struct super_block *sb, struct hfs_cat_file *file, int type); extern int hfs_ext_write_extent(struct inode *inode); extern int hfs_extend_file(struct inode *inode); extern void hfs_file_truncate(struct inode *inode); extern int hfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_result, int create); /* inode.c */ extern const struct address_space_operations hfs_aops; extern const struct address_space_operations hfs_btree_aops; int hfs_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned int len, struct folio **foliop, void **fsdata); extern struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t mode); extern void hfs_inode_write_fork(struct inode *inode, struct hfs_extent *ext, __be32 *log_size, __be32 *phys_size); extern int hfs_write_inode(struct inode *inode, struct writeback_control *wbc); extern int hfs_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext, __be32 __log_size, __be32 phys_size, u32 clump_size); extern struct inode *hfs_iget(struct super_block *sb, struct hfs_cat_key *key, hfs_cat_rec *rec); extern void hfs_evict_inode(struct inode *inode); extern void hfs_delete_inode(struct inode *inode); /* attr.c */ extern const struct xattr_handler * const hfs_xattr_handlers[]; /* mdb.c */ extern int hfs_mdb_get(struct super_block *sb); extern void hfs_mdb_commit(struct super_block *sb); extern void hfs_mdb_close(struct super_block *sb); extern void hfs_mdb_put(struct super_block *sb); /* part_tbl.c */ extern int hfs_part_find(struct super_block *sb, sector_t *part_start, sector_t *part_size); /* string.c */ extern const struct dentry_operations hfs_dentry_operations; extern int hfs_hash_dentry(const struct dentry *dentry, struct qstr *this); extern int hfs_strcmp(const unsigned char *s1, unsigned int len1, const unsigned char *s2, unsigned int len2); extern int hfs_compare_dentry(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); /* trans.c */ extern void hfs_asc2mac(struct super_block *sb, struct hfs_name *out, const struct qstr *in); extern int hfs_mac2asc(struct super_block *sb, char *out, const struct hfs_name *in); /* super.c */ extern void hfs_mark_mdb_dirty(struct super_block *sb); /* * There are two time systems. Both are based on seconds since * a particular time/date. * Unix: signed little-endian since 00:00 GMT, Jan. 1, 1970 * mac: unsigned big-endian since 00:00 GMT, Jan. 1, 1904 * * HFS implementations are highly inconsistent, this one matches the * traditional behavior of 64-bit Linux, giving the most useful * time range between 1970 and 2106, by treating any on-disk timestamp * under HFS_UTC_OFFSET (Jan 1 1970) as a time between 2040 and 2106. */ #define HFS_UTC_OFFSET 2082844800U static inline time64_t __hfs_m_to_utime(__be32 mt) { time64_t ut = (u32)(be32_to_cpu(mt) - HFS_UTC_OFFSET); return ut + sys_tz.tz_minuteswest * 60; } static inline __be32 __hfs_u_to_mtime(time64_t ut) { ut -= sys_tz.tz_minuteswest * 60; return cpu_to_be32(lower_32_bits(ut) + HFS_UTC_OFFSET); } #define HFS_I(inode) (container_of(inode, struct hfs_inode_info, vfs_inode)) #define HFS_SB(sb) ((struct hfs_sb_info *)(sb)->s_fs_info) #define hfs_m_to_utime(time) (struct timespec64){ .tv_sec = __hfs_m_to_utime(time) } #define hfs_u_to_mtime(time) __hfs_u_to_mtime((time).tv_sec) #define hfs_mtime() __hfs_u_to_mtime(ktime_get_real_seconds()) static inline const char *hfs_mdb_name(struct super_block *sb) { return sb->s_id; } static inline void hfs_bitmap_dirty(struct super_block *sb) { set_bit(HFS_FLG_BITMAP_DIRTY, &HFS_SB(sb)->flags); hfs_mark_mdb_dirty(sb); } #define sb_bread512(sb, sec, data) ({ \ struct buffer_head *__bh; \ sector_t __block; \ loff_t __start; \ int __offset; \ \ __start = (loff_t)(sec) << HFS_SECTOR_SIZE_BITS;\ __block = __start >> (sb)->s_blocksize_bits; \ __offset = __start & ((sb)->s_blocksize - 1); \ __bh = sb_bread((sb), __block); \ if (likely(__bh != NULL)) \ data = (void *)(__bh->b_data + __offset);\ else \ data = NULL; \ __bh; \ }) #endif
3 2 3 1 3 2 3 1 1 3 2 3 1 3 1 19 17 19 21 20 20 21 3 567 401 517 54 35 566 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 /* * \file drm_ioc32.c * * 32-bit ioctl compatibility routines for the DRM. * * \author Paul Mackerras <paulus@samba.org> * * Copyright (C) Paul Mackerras 2005. * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. */ #include <linux/compat.h> #include <linux/ratelimit.h> #include <linux/export.h> #include <drm/drm_device.h> #include <drm/drm_file.h> #include <drm/drm_print.h> #include "drm_crtc_internal.h" #include "drm_internal.h" #define DRM_IOCTL_VERSION32 DRM_IOWR(0x00, drm_version32_t) #define DRM_IOCTL_GET_UNIQUE32 DRM_IOWR(0x01, drm_unique32_t) #define DRM_IOCTL_GET_MAP32 DRM_IOWR(0x04, drm_map32_t) #define DRM_IOCTL_GET_CLIENT32 DRM_IOWR(0x05, drm_client32_t) #define DRM_IOCTL_GET_STATS32 DRM_IOR( 0x06, drm_stats32_t) #define DRM_IOCTL_SET_UNIQUE32 DRM_IOW( 0x10, drm_unique32_t) #define DRM_IOCTL_ADD_MAP32 DRM_IOWR(0x15, drm_map32_t) #define DRM_IOCTL_ADD_BUFS32 DRM_IOWR(0x16, drm_buf_desc32_t) #define DRM_IOCTL_MARK_BUFS32 DRM_IOW( 0x17, drm_buf_desc32_t) #define DRM_IOCTL_INFO_BUFS32 DRM_IOWR(0x18, drm_buf_info32_t) #define DRM_IOCTL_MAP_BUFS32 DRM_IOWR(0x19, drm_buf_map32_t) #define DRM_IOCTL_FREE_BUFS32 DRM_IOW( 0x1a, drm_buf_free32_t) #define DRM_IOCTL_RM_MAP32 DRM_IOW( 0x1b, drm_map32_t) #define DRM_IOCTL_SET_SAREA_CTX32 DRM_IOW( 0x1c, drm_ctx_priv_map32_t) #define DRM_IOCTL_GET_SAREA_CTX32 DRM_IOWR(0x1d, drm_ctx_priv_map32_t) #define DRM_IOCTL_RES_CTX32 DRM_IOWR(0x26, drm_ctx_res32_t) #define DRM_IOCTL_DMA32 DRM_IOWR(0x29, drm_dma32_t) #define DRM_IOCTL_AGP_ENABLE32 DRM_IOW( 0x32, drm_agp_mode32_t) #define DRM_IOCTL_AGP_INFO32 DRM_IOR( 0x33, drm_agp_info32_t) #define DRM_IOCTL_AGP_ALLOC32 DRM_IOWR(0x34, drm_agp_buffer32_t) #define DRM_IOCTL_AGP_FREE32 DRM_IOW( 0x35, drm_agp_buffer32_t) #define DRM_IOCTL_AGP_BIND32 DRM_IOW( 0x36, drm_agp_binding32_t) #define DRM_IOCTL_AGP_UNBIND32 DRM_IOW( 0x37, drm_agp_binding32_t) #define DRM_IOCTL_SG_ALLOC32 DRM_IOW( 0x38, drm_scatter_gather32_t) #define DRM_IOCTL_SG_FREE32 DRM_IOW( 0x39, drm_scatter_gather32_t) #define DRM_IOCTL_UPDATE_DRAW32 DRM_IOW( 0x3f, drm_update_draw32_t) #define DRM_IOCTL_WAIT_VBLANK32 DRM_IOWR(0x3a, drm_wait_vblank32_t) #define DRM_IOCTL_MODE_ADDFB232 DRM_IOWR(0xb8, drm_mode_fb_cmd232_t) typedef struct drm_version_32 { int version_major; /* Major version */ int version_minor; /* Minor version */ int version_patchlevel; /* Patch level */ u32 name_len; /* Length of name buffer */ u32 name; /* Name of driver */ u32 date_len; /* Length of date buffer */ u32 date; /* User-space buffer to hold date */ u32 desc_len; /* Length of desc buffer */ u32 desc; /* User-space buffer to hold desc */ } drm_version32_t; static int compat_drm_version(struct file *file, unsigned int cmd, unsigned long arg) { drm_version32_t v32; struct drm_version v; int err; if (copy_from_user(&v32, (void __user *)arg, sizeof(v32))) return -EFAULT; memset(&v, 0, sizeof(v)); v = (struct drm_version) { .name_len = v32.name_len, .name = compat_ptr(v32.name), .date_len = v32.date_len, .date = compat_ptr(v32.date), .desc_len = v32.desc_len, .desc = compat_ptr(v32.desc), }; err = drm_ioctl_kernel(file, drm_version, &v, DRM_RENDER_ALLOW); if (err) return err; v32.version_major = v.version_major; v32.version_minor = v.version_minor; v32.version_patchlevel = v.version_patchlevel; v32.name_len = v.name_len; v32.date_len = v.date_len; v32.desc_len = v.desc_len; if (copy_to_user((void __user *)arg, &v32, sizeof(v32))) return -EFAULT; return 0; } typedef struct drm_unique32 { u32 unique_len; /* Length of unique */ u32 unique; /* Unique name for driver instantiation */ } drm_unique32_t; static int compat_drm_getunique(struct file *file, unsigned int cmd, unsigned long arg) { drm_unique32_t uq32; struct drm_unique uq; int err; if (copy_from_user(&uq32, (void __user *)arg, sizeof(uq32))) return -EFAULT; memset(&uq, 0, sizeof(uq)); uq = (struct drm_unique){ .unique_len = uq32.unique_len, .unique = compat_ptr(uq32.unique), }; err = drm_ioctl_kernel(file, drm_getunique, &uq, 0); if (err) return err; uq32.unique_len = uq.unique_len; if (copy_to_user((void __user *)arg, &uq32, sizeof(uq32))) return -EFAULT; return 0; } static int compat_drm_setunique(struct file *file, unsigned int cmd, unsigned long arg) { /* it's dead */ return -EINVAL; } typedef struct drm_client32 { int idx; /* Which client desired? */ int auth; /* Is client authenticated? */ u32 pid; /* Process ID */ u32 uid; /* User ID */ u32 magic; /* Magic */ u32 iocs; /* Ioctl count */ } drm_client32_t; static int compat_drm_getclient(struct file *file, unsigned int cmd, unsigned long arg) { drm_client32_t c32; drm_client32_t __user *argp = (void __user *)arg; struct drm_client client; int err; if (copy_from_user(&c32, argp, sizeof(c32))) return -EFAULT; memset(&client, 0, sizeof(client)); client.idx = c32.idx; err = drm_ioctl_kernel(file, drm_getclient, &client, 0); if (err) return err; c32.idx = client.idx; c32.auth = client.auth; c32.pid = client.pid; c32.uid = client.uid; c32.magic = client.magic; c32.iocs = client.iocs; if (copy_to_user(argp, &c32, sizeof(c32))) return -EFAULT; return 0; } typedef struct drm_stats32 { u32 count; struct { u32 value; enum drm_stat_type type; } data[15]; } drm_stats32_t; static int compat_drm_getstats(struct file *file, unsigned int cmd, unsigned long arg) { drm_stats32_t __user *argp = (void __user *)arg; /* getstats is defunct, just clear */ if (clear_user(argp, sizeof(drm_stats32_t))) return -EFAULT; return 0; } #if defined(CONFIG_X86) typedef struct drm_update_draw32 { drm_drawable_t handle; unsigned int type; unsigned int num; /* 64-bit version has a 32-bit pad here */ u64 data; /**< Pointer */ } __packed drm_update_draw32_t; static int compat_drm_update_draw(struct file *file, unsigned int cmd, unsigned long arg) { /* update_draw is defunct */ return 0; } #endif struct drm_wait_vblank_request32 { enum drm_vblank_seq_type type; unsigned int sequence; u32 signal; }; struct drm_wait_vblank_reply32 { enum drm_vblank_seq_type type; unsigned int sequence; s32 tval_sec; s32 tval_usec; }; typedef union drm_wait_vblank32 { struct drm_wait_vblank_request32 request; struct drm_wait_vblank_reply32 reply; } drm_wait_vblank32_t; static int compat_drm_wait_vblank(struct file *file, unsigned int cmd, unsigned long arg) { drm_wait_vblank32_t __user *argp = (void __user *)arg; drm_wait_vblank32_t req32; union drm_wait_vblank req; int err; if (copy_from_user(&req32, argp, sizeof(req32))) return -EFAULT; memset(&req, 0, sizeof(req)); req.request.type = req32.request.type; req.request.sequence = req32.request.sequence; req.request.signal = req32.request.signal; err = drm_ioctl_kernel(file, drm_wait_vblank_ioctl, &req, 0); req32.reply.type = req.reply.type; req32.reply.sequence = req.reply.sequence; req32.reply.tval_sec = req.reply.tval_sec; req32.reply.tval_usec = req.reply.tval_usec; if (copy_to_user(argp, &req32, sizeof(req32))) return -EFAULT; return err; } #if defined(CONFIG_X86) typedef struct drm_mode_fb_cmd232 { u32 fb_id; u32 width; u32 height; u32 pixel_format; u32 flags; u32 handles[4]; u32 pitches[4]; u32 offsets[4]; u64 modifier[4]; } __packed drm_mode_fb_cmd232_t; static int compat_drm_mode_addfb2(struct file *file, unsigned int cmd, unsigned long arg) { struct drm_mode_fb_cmd232 __user *argp = (void __user *)arg; struct drm_mode_fb_cmd2 req64; int err; memset(&req64, 0, sizeof(req64)); if (copy_from_user(&req64, argp, offsetof(drm_mode_fb_cmd232_t, modifier))) return -EFAULT; if (copy_from_user(&req64.modifier, &argp->modifier, sizeof(req64.modifier))) return -EFAULT; err = drm_ioctl_kernel(file, drm_mode_addfb2, &req64, 0); if (err) return err; if (put_user(req64.fb_id, &argp->fb_id)) return -EFAULT; return 0; } #endif static struct { drm_ioctl_compat_t *fn; char *name; } drm_compat_ioctls[] = { #define DRM_IOCTL32_DEF(n, f) [DRM_IOCTL_NR(n##32)] = {.fn = f, .name = #n} DRM_IOCTL32_DEF(DRM_IOCTL_VERSION, compat_drm_version), DRM_IOCTL32_DEF(DRM_IOCTL_GET_UNIQUE, compat_drm_getunique), DRM_IOCTL32_DEF(DRM_IOCTL_GET_CLIENT, compat_drm_getclient), DRM_IOCTL32_DEF(DRM_IOCTL_GET_STATS, compat_drm_getstats), DRM_IOCTL32_DEF(DRM_IOCTL_SET_UNIQUE, compat_drm_setunique), #if defined(CONFIG_X86) DRM_IOCTL32_DEF(DRM_IOCTL_UPDATE_DRAW, compat_drm_update_draw), #endif DRM_IOCTL32_DEF(DRM_IOCTL_WAIT_VBLANK, compat_drm_wait_vblank), #if defined(CONFIG_X86) DRM_IOCTL32_DEF(DRM_IOCTL_MODE_ADDFB2, compat_drm_mode_addfb2), #endif }; /** * drm_compat_ioctl - 32bit IOCTL compatibility handler for DRM drivers * @filp: file this ioctl is called on * @cmd: ioctl cmd number * @arg: user argument * * Compatibility handler for 32 bit userspace running on 64 kernels. All actual * IOCTL handling is forwarded to drm_ioctl(), while marshalling structures as * appropriate. Note that this only handles DRM core IOCTLs, if the driver has * botched IOCTL itself, it must handle those by wrapping this function. * * Returns: * Zero on success, negative error code on failure. */ long drm_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { unsigned int nr = DRM_IOCTL_NR(cmd); struct drm_file *file_priv = filp->private_data; struct drm_device *dev = file_priv->minor->dev; drm_ioctl_compat_t *fn; int ret; /* Assume that ioctls without an explicit compat routine will just * work. This may not always be a good assumption, but it's better * than always failing. */ if (nr >= ARRAY_SIZE(drm_compat_ioctls)) return drm_ioctl(filp, cmd, arg); fn = drm_compat_ioctls[nr].fn; if (!fn) return drm_ioctl(filp, cmd, arg); drm_dbg_core(dev, "comm=\"%s\", pid=%d, dev=0x%lx, auth=%d, %s\n", current->comm, task_pid_nr(current), (long)old_encode_dev(file_priv->minor->kdev->devt), file_priv->authenticated, drm_compat_ioctls[nr].name); ret = (*fn)(filp, cmd, arg); if (ret) drm_dbg_core(dev, "ret = %d\n", ret); return ret; } EXPORT_SYMBOL(drm_compat_ioctl);
11896 11937 10028 10075 10084 10123 537 536 1061 1056 1058 1055 33 33 33 33 33 33 33 33 264 223 224 223 224 224 224 9 264 264 263 264 264 264 26 264 264 264 264 264 264 262 264 264 264 36 264 264 32 264 264 33 264 264 264 264 558 558 552 555 553 37 553 539 538 538 540 537 538 536 538 18 18 18 539 535 540 540 179 12 167 179 178 538 529 533 9 539 532 539 173 175 175 120 121 175 539 1 1 1 1 537 40 40 40 40 540 404 937 140 1036 644 1029 67 1034 186 185 1029 1032 1032 485 429 1033 1031 1034 663 25 25 537 538 535 156 156 156 25 156 25 24 25 538 537 133 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 // SPDX-License-Identifier: GPL-2.0-only /* * Resizable, Scalable, Concurrent Hash Table * * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au> * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch> * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net> * * Code partially derived from nft_hash * Rewritten with rehash code from br_multicast plus single list * pointer as suggested by Josh Triplett */ #include <linux/atomic.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/log2.h> #include <linux/sched.h> #include <linux/rculist.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/mm.h> #include <linux/jhash.h> #include <linux/random.h> #include <linux/rhashtable.h> #include <linux/err.h> #include <linux/export.h> #define HASH_DEFAULT_SIZE 64UL #define HASH_MIN_SIZE 4U union nested_table { union nested_table __rcu *table; struct rhash_lock_head __rcu *bucket; }; static u32 head_hashfn(struct rhashtable *ht, const struct bucket_table *tbl, const struct rhash_head *he) { return rht_head_hashfn(ht, tbl, he, ht->p); } #ifdef CONFIG_PROVE_LOCKING #define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT)) int lockdep_rht_mutex_is_held(struct rhashtable *ht) { return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1; } EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held); int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash) { if (!debug_locks) return 1; if (unlikely(tbl->nest)) return 1; return bit_spin_is_locked(0, (unsigned long *)&tbl->buckets[hash]); } EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held); #else #define ASSERT_RHT_MUTEX(HT) #endif static inline union nested_table *nested_table_top( const struct bucket_table *tbl) { /* The top-level bucket entry does not need RCU protection * because it's set at the same time as tbl->nest. */ return (void *)rcu_dereference_protected(tbl->buckets[0], 1); } static void nested_table_free(union nested_table *ntbl, unsigned int size) { const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); const unsigned int len = 1 << shift; unsigned int i; ntbl = rcu_dereference_protected(ntbl->table, 1); if (!ntbl) return; if (size > len) { size >>= shift; for (i = 0; i < len; i++) nested_table_free(ntbl + i, size); } kfree(ntbl); } static void nested_bucket_table_free(const struct bucket_table *tbl) { unsigned int size = tbl->size >> tbl->nest; unsigned int len = 1 << tbl->nest; union nested_table *ntbl; unsigned int i; ntbl = nested_table_top(tbl); for (i = 0; i < len; i++) nested_table_free(ntbl + i, size); kfree(ntbl); } static void bucket_table_free(const struct bucket_table *tbl) { if (tbl->nest) nested_bucket_table_free(tbl); kvfree(tbl); } static void bucket_table_free_rcu(struct rcu_head *head) { bucket_table_free(container_of(head, struct bucket_table, rcu)); } static union nested_table *nested_table_alloc(struct rhashtable *ht, union nested_table __rcu **prev, bool leaf) { union nested_table *ntbl; int i; ntbl = rcu_dereference(*prev); if (ntbl) return ntbl; ntbl = alloc_hooks_tag(ht->alloc_tag, kmalloc_noprof(PAGE_SIZE, GFP_ATOMIC|__GFP_ZERO)); if (ntbl && leaf) { for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0]); i++) INIT_RHT_NULLS_HEAD(ntbl[i].bucket); } if (cmpxchg((union nested_table **)prev, NULL, ntbl) == NULL) return ntbl; /* Raced with another thread. */ kfree(ntbl); return rcu_dereference(*prev); } static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht, size_t nbuckets, gfp_t gfp) { const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); struct bucket_table *tbl; size_t size; if (nbuckets < (1 << (shift + 1))) return NULL; size = sizeof(*tbl) + sizeof(tbl->buckets[0]); tbl = alloc_hooks_tag(ht->alloc_tag, kmalloc_noprof(size, gfp|__GFP_ZERO)); if (!tbl) return NULL; if (!nested_table_alloc(ht, (union nested_table __rcu **)tbl->buckets, false)) { kfree(tbl); return NULL; } tbl->nest = (ilog2(nbuckets) - 1) % shift + 1; return tbl; } static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, size_t nbuckets, gfp_t gfp) { struct bucket_table *tbl = NULL; size_t size; int i; static struct lock_class_key __key; tbl = alloc_hooks_tag(ht->alloc_tag, kvmalloc_node_align_noprof(struct_size(tbl, buckets, nbuckets), 1, gfp|__GFP_ZERO, NUMA_NO_NODE)); size = nbuckets; if (tbl == NULL && !gfpflags_allow_blocking(gfp)) { tbl = nested_bucket_table_alloc(ht, nbuckets, gfp); nbuckets = 0; } if (tbl == NULL) return NULL; lockdep_init_map(&tbl->dep_map, "rhashtable_bucket", &__key, 0); tbl->size = size; rcu_head_init(&tbl->rcu); INIT_LIST_HEAD(&tbl->walkers); tbl->hash_rnd = get_random_u32(); for (i = 0; i < nbuckets; i++) INIT_RHT_NULLS_HEAD(tbl->buckets[i]); return tbl; } static struct bucket_table *rhashtable_last_table(struct rhashtable *ht, struct bucket_table *tbl) { struct bucket_table *new_tbl; do { new_tbl = tbl; tbl = rht_dereference_rcu(tbl->future_tbl, ht); } while (tbl); return new_tbl; } static int rhashtable_rehash_one(struct rhashtable *ht, struct rhash_lock_head __rcu **bkt, unsigned int old_hash) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); struct bucket_table *new_tbl = rhashtable_last_table(ht, old_tbl); int err = -EAGAIN; struct rhash_head *head, *next, *entry; struct rhash_head __rcu **pprev = NULL; unsigned int new_hash; unsigned long flags; if (new_tbl->nest) goto out; err = -ENOENT; rht_for_each_from(entry, rht_ptr(bkt, old_tbl, old_hash), old_tbl, old_hash) { err = 0; next = rht_dereference_bucket(entry->next, old_tbl, old_hash); if (rht_is_a_nulls(next)) break; pprev = &entry->next; } if (err) goto out; new_hash = head_hashfn(ht, new_tbl, entry); flags = rht_lock_nested(new_tbl, &new_tbl->buckets[new_hash], SINGLE_DEPTH_NESTING); head = rht_ptr(new_tbl->buckets + new_hash, new_tbl, new_hash); RCU_INIT_POINTER(entry->next, head); rht_assign_unlock(new_tbl, &new_tbl->buckets[new_hash], entry, flags); if (pprev) rcu_assign_pointer(*pprev, next); else /* Need to preserved the bit lock. */ rht_assign_locked(bkt, next); out: return err; } static int rhashtable_rehash_chain(struct rhashtable *ht, unsigned int old_hash) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); struct rhash_lock_head __rcu **bkt = rht_bucket_var(old_tbl, old_hash); unsigned long flags; int err; if (!bkt) return 0; flags = rht_lock(old_tbl, bkt); while (!(err = rhashtable_rehash_one(ht, bkt, old_hash))) ; if (err == -ENOENT) err = 0; rht_unlock(old_tbl, bkt, flags); return err; } static int rhashtable_rehash_attach(struct rhashtable *ht, struct bucket_table *old_tbl, struct bucket_table *new_tbl) { /* Make insertions go into the new, empty table right away. Deletions * and lookups will be attempted in both tables until we synchronize. * As cmpxchg() provides strong barriers, we do not need * rcu_assign_pointer(). */ if (cmpxchg((struct bucket_table **)&old_tbl->future_tbl, NULL, new_tbl) != NULL) return -EEXIST; return 0; } static int rhashtable_rehash_table(struct rhashtable *ht) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); struct bucket_table *new_tbl; struct rhashtable_walker *walker; unsigned int old_hash; int err; new_tbl = rht_dereference(old_tbl->future_tbl, ht); if (!new_tbl) return 0; for (old_hash = 0; old_hash < old_tbl->size; old_hash++) { err = rhashtable_rehash_chain(ht, old_hash); if (err) return err; cond_resched(); } /* Publish the new table pointer. */ rcu_assign_pointer(ht->tbl, new_tbl); spin_lock(&ht->lock); list_for_each_entry(walker, &old_tbl->walkers, list) walker->tbl = NULL; /* Wait for readers. All new readers will see the new * table, and thus no references to the old table will * remain. * We do this inside the locked region so that * rhashtable_walk_stop() can use rcu_head_after_call_rcu() * to check if it should not re-link the table. */ call_rcu(&old_tbl->rcu, bucket_table_free_rcu); spin_unlock(&ht->lock); return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0; } static int rhashtable_rehash_alloc(struct rhashtable *ht, struct bucket_table *old_tbl, unsigned int size) { struct bucket_table *new_tbl; int err; ASSERT_RHT_MUTEX(ht); new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL); if (new_tbl == NULL) return -ENOMEM; err = rhashtable_rehash_attach(ht, old_tbl, new_tbl); if (err) bucket_table_free(new_tbl); return err; } /** * rhashtable_shrink - Shrink hash table while allowing concurrent lookups * @ht: the hash table to shrink * * This function shrinks the hash table to fit, i.e., the smallest * size would not cause it to expand right away automatically. * * The caller must ensure that no concurrent resizing occurs by holding * ht->mutex. * * The caller must ensure that no concurrent table mutations take place. * It is however valid to have concurrent lookups if they are RCU protected. * * It is valid to have concurrent insertions and deletions protected by per * bucket locks or concurrent RCU protected lookups and traversals. */ static int rhashtable_shrink(struct rhashtable *ht) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); unsigned int nelems = atomic_read(&ht->nelems); unsigned int size = 0; if (nelems) size = roundup_pow_of_two(nelems * 3 / 2); if (size < ht->p.min_size) size = ht->p.min_size; if (old_tbl->size <= size) return 0; if (rht_dereference(old_tbl->future_tbl, ht)) return -EEXIST; return rhashtable_rehash_alloc(ht, old_tbl, size); } static void rht_deferred_worker(struct work_struct *work) { struct rhashtable *ht; struct bucket_table *tbl; int err = 0; ht = container_of(work, struct rhashtable, run_work); mutex_lock(&ht->mutex); tbl = rht_dereference(ht->tbl, ht); tbl = rhashtable_last_table(ht, tbl); if (rht_grow_above_75(ht, tbl)) err = rhashtable_rehash_alloc(ht, tbl, tbl->size * 2); else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl)) err = rhashtable_shrink(ht); else if (tbl->nest) err = rhashtable_rehash_alloc(ht, tbl, tbl->size); if (!err || err == -EEXIST) { int nerr; nerr = rhashtable_rehash_table(ht); err = err ?: nerr; } mutex_unlock(&ht->mutex); if (err) schedule_work(&ht->run_work); } static int rhashtable_insert_rehash(struct rhashtable *ht, struct bucket_table *tbl) { struct bucket_table *old_tbl; struct bucket_table *new_tbl; unsigned int size; int err; old_tbl = rht_dereference_rcu(ht->tbl, ht); size = tbl->size; err = -EBUSY; if (rht_grow_above_75(ht, tbl)) size *= 2; /* Do not schedule more than one rehash */ else if (old_tbl != tbl) goto fail; err = -ENOMEM; new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC | __GFP_NOWARN); if (new_tbl == NULL) goto fail; err = rhashtable_rehash_attach(ht, tbl, new_tbl); if (err) { bucket_table_free(new_tbl); if (err == -EEXIST) err = 0; } else schedule_work(&ht->run_work); return err; fail: /* Do not fail the insert if someone else did a rehash. */ if (likely(rcu_access_pointer(tbl->future_tbl))) return 0; /* Schedule async rehash to retry allocation in process context. */ if (err == -ENOMEM) schedule_work(&ht->run_work); return err; } static void *rhashtable_lookup_one(struct rhashtable *ht, struct rhash_lock_head __rcu **bkt, struct bucket_table *tbl, unsigned int hash, const void *key, struct rhash_head *obj) { struct rhashtable_compare_arg arg = { .ht = ht, .key = key, }; struct rhash_head __rcu **pprev = NULL; struct rhash_head *head; int elasticity; elasticity = RHT_ELASTICITY; rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) { struct rhlist_head *list; struct rhlist_head *plist; elasticity--; if (!key || (ht->p.obj_cmpfn ? ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) : rhashtable_compare(&arg, rht_obj(ht, head)))) { pprev = &head->next; continue; } if (!ht->rhlist) return rht_obj(ht, head); list = container_of(obj, struct rhlist_head, rhead); plist = container_of(head, struct rhlist_head, rhead); RCU_INIT_POINTER(list->next, plist); head = rht_dereference_bucket(head->next, tbl, hash); RCU_INIT_POINTER(list->rhead.next, head); if (pprev) rcu_assign_pointer(*pprev, obj); else /* Need to preserve the bit lock */ rht_assign_locked(bkt, obj); return NULL; } if (elasticity <= 0) return ERR_PTR(-EAGAIN); return ERR_PTR(-ENOENT); } static struct bucket_table *rhashtable_insert_one( struct rhashtable *ht, struct rhash_lock_head __rcu **bkt, struct bucket_table *tbl, unsigned int hash, struct rhash_head *obj, void *data) { struct bucket_table *new_tbl; struct rhash_head *head; if (!IS_ERR_OR_NULL(data)) return ERR_PTR(-EEXIST); if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT) return ERR_CAST(data); new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); if (new_tbl) return new_tbl; if (PTR_ERR(data) != -ENOENT) return ERR_CAST(data); if (unlikely(rht_grow_above_max(ht, tbl))) return ERR_PTR(-E2BIG); if (unlikely(rht_grow_above_100(ht, tbl))) return ERR_PTR(-EAGAIN); head = rht_ptr(bkt, tbl, hash); RCU_INIT_POINTER(obj->next, head); if (ht->rhlist) { struct rhlist_head *list; list = container_of(obj, struct rhlist_head, rhead); RCU_INIT_POINTER(list->next, NULL); } /* bkt is always the head of the list, so it holds * the lock, which we need to preserve */ rht_assign_locked(bkt, obj); return NULL; } static void *rhashtable_try_insert(struct rhashtable *ht, const void *key, struct rhash_head *obj) { struct bucket_table *new_tbl; struct bucket_table *tbl; struct rhash_lock_head __rcu **bkt; unsigned long flags; unsigned int hash; void *data; new_tbl = rcu_dereference(ht->tbl); do { tbl = new_tbl; hash = rht_head_hashfn(ht, tbl, obj, ht->p); if (rcu_access_pointer(tbl->future_tbl)) /* Failure is OK */ bkt = rht_bucket_var(tbl, hash); else bkt = rht_bucket_insert(ht, tbl, hash); if (bkt == NULL) { new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); data = ERR_PTR(-EAGAIN); } else { bool inserted; flags = rht_lock(tbl, bkt); data = rhashtable_lookup_one(ht, bkt, tbl, hash, key, obj); new_tbl = rhashtable_insert_one(ht, bkt, tbl, hash, obj, data); inserted = data && !new_tbl; if (inserted) atomic_inc(&ht->nelems); if (PTR_ERR(new_tbl) != -EEXIST) data = ERR_CAST(new_tbl); rht_unlock(tbl, bkt, flags); if (inserted && rht_grow_above_75(ht, tbl)) schedule_work(&ht->run_work); } } while (!IS_ERR_OR_NULL(new_tbl)); if (PTR_ERR(data) == -EAGAIN) data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?: -EAGAIN); return data; } void *rhashtable_insert_slow(struct rhashtable *ht, const void *key, struct rhash_head *obj) { void *data; do { rcu_read_lock(); data = rhashtable_try_insert(ht, key, obj); rcu_read_unlock(); } while (PTR_ERR(data) == -EAGAIN); return data; } EXPORT_SYMBOL_GPL(rhashtable_insert_slow); /** * rhashtable_walk_enter - Initialise an iterator * @ht: Table to walk over * @iter: Hash table Iterator * * This function prepares a hash table walk. * * Note that if you restart a walk after rhashtable_walk_stop you * may see the same object twice. Also, you may miss objects if * there are removals in between rhashtable_walk_stop and the next * call to rhashtable_walk_start. * * For a completely stable walk you should construct your own data * structure outside the hash table. * * This function may be called from any process context, including * non-preemptible context, but cannot be called from softirq or * hardirq context. * * You must call rhashtable_walk_exit after this function returns. */ void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter) { iter->ht = ht; iter->p = NULL; iter->slot = 0; iter->skip = 0; iter->end_of_table = 0; spin_lock(&ht->lock); iter->walker.tbl = rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock)); list_add(&iter->walker.list, &iter->walker.tbl->walkers); spin_unlock(&ht->lock); } EXPORT_SYMBOL_GPL(rhashtable_walk_enter); /** * rhashtable_walk_exit - Free an iterator * @iter: Hash table Iterator * * This function frees resources allocated by rhashtable_walk_enter. */ void rhashtable_walk_exit(struct rhashtable_iter *iter) { spin_lock(&iter->ht->lock); if (iter->walker.tbl) list_del(&iter->walker.list); spin_unlock(&iter->ht->lock); } EXPORT_SYMBOL_GPL(rhashtable_walk_exit); /** * rhashtable_walk_start_check - Start a hash table walk * @iter: Hash table iterator * * Start a hash table walk at the current iterator position. Note that we take * the RCU lock in all cases including when we return an error. So you must * always call rhashtable_walk_stop to clean up. * * Returns zero if successful. * * Returns -EAGAIN if resize event occurred. Note that the iterator * will rewind back to the beginning and you may use it immediately * by calling rhashtable_walk_next. * * rhashtable_walk_start is defined as an inline variant that returns * void. This is preferred in cases where the caller would ignore * resize events and always continue. */ int rhashtable_walk_start_check(struct rhashtable_iter *iter) __acquires(RCU) { struct rhashtable *ht = iter->ht; bool rhlist = ht->rhlist; rcu_read_lock(); spin_lock(&ht->lock); if (iter->walker.tbl) list_del(&iter->walker.list); spin_unlock(&ht->lock); if (iter->end_of_table) return 0; if (!iter->walker.tbl) { iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht); iter->slot = 0; iter->skip = 0; return -EAGAIN; } if (iter->p && !rhlist) { /* * We need to validate that 'p' is still in the table, and * if so, update 'skip' */ struct rhash_head *p; int skip = 0; rht_for_each_rcu(p, iter->walker.tbl, iter->slot) { skip++; if (p == iter->p) { iter->skip = skip; goto found; } } iter->p = NULL; } else if (iter->p && rhlist) { /* Need to validate that 'list' is still in the table, and * if so, update 'skip' and 'p'. */ struct rhash_head *p; struct rhlist_head *list; int skip = 0; rht_for_each_rcu(p, iter->walker.tbl, iter->slot) { for (list = container_of(p, struct rhlist_head, rhead); list; list = rcu_dereference(list->next)) { skip++; if (list == iter->list) { iter->p = p; iter->skip = skip; goto found; } } } iter->p = NULL; } found: return 0; } EXPORT_SYMBOL_GPL(rhashtable_walk_start_check); /** * __rhashtable_walk_find_next - Find the next element in a table (or the first * one in case of a new walk). * * @iter: Hash table iterator * * Returns the found object or NULL when the end of the table is reached. * * Returns -EAGAIN if resize event occurred. */ static void *__rhashtable_walk_find_next(struct rhashtable_iter *iter) { struct bucket_table *tbl = iter->walker.tbl; struct rhlist_head *list = iter->list; struct rhashtable *ht = iter->ht; struct rhash_head *p = iter->p; bool rhlist = ht->rhlist; if (!tbl) return NULL; for (; iter->slot < tbl->size; iter->slot++) { int skip = iter->skip; rht_for_each_rcu(p, tbl, iter->slot) { if (rhlist) { list = container_of(p, struct rhlist_head, rhead); do { if (!skip) goto next; skip--; list = rcu_dereference(list->next); } while (list); continue; } if (!skip) break; skip--; } next: if (!rht_is_a_nulls(p)) { iter->skip++; iter->p = p; iter->list = list; return rht_obj(ht, rhlist ? &list->rhead : p); } iter->skip = 0; } iter->p = NULL; /* Ensure we see any new tables. */ smp_rmb(); iter->walker.tbl = rht_dereference_rcu(tbl->future_tbl, ht); if (iter->walker.tbl) { iter->slot = 0; iter->skip = 0; return ERR_PTR(-EAGAIN); } else { iter->end_of_table = true; } return NULL; } /** * rhashtable_walk_next - Return the next object and advance the iterator * @iter: Hash table iterator * * Note that you must call rhashtable_walk_stop when you are finished * with the walk. * * Returns the next object or NULL when the end of the table is reached. * * Returns -EAGAIN if resize event occurred. Note that the iterator * will rewind back to the beginning and you may continue to use it. */ void *rhashtable_walk_next(struct rhashtable_iter *iter) { struct rhlist_head *list = iter->list; struct rhashtable *ht = iter->ht; struct rhash_head *p = iter->p; bool rhlist = ht->rhlist; if (p) { if (!rhlist || !(list = rcu_dereference(list->next))) { p = rcu_dereference(p->next); list = container_of(p, struct rhlist_head, rhead); } if (!rht_is_a_nulls(p)) { iter->skip++; iter->p = p; iter->list = list; return rht_obj(ht, rhlist ? &list->rhead : p); } /* At the end of this slot, switch to next one and then find * next entry from that point. */ iter->skip = 0; iter->slot++; } return __rhashtable_walk_find_next(iter); } EXPORT_SYMBOL_GPL(rhashtable_walk_next); /** * rhashtable_walk_peek - Return the next object but don't advance the iterator * @iter: Hash table iterator * * Returns the next object or NULL when the end of the table is reached. * * Returns -EAGAIN if resize event occurred. Note that the iterator * will rewind back to the beginning and you may continue to use it. */ void *rhashtable_walk_peek(struct rhashtable_iter *iter) { struct rhlist_head *list = iter->list; struct rhashtable *ht = iter->ht; struct rhash_head *p = iter->p; if (p) return rht_obj(ht, ht->rhlist ? &list->rhead : p); /* No object found in current iter, find next one in the table. */ if (iter->skip) { /* A nonzero skip value points to the next entry in the table * beyond that last one that was found. Decrement skip so * we find the current value. __rhashtable_walk_find_next * will restore the original value of skip assuming that * the table hasn't changed. */ iter->skip--; } return __rhashtable_walk_find_next(iter); } EXPORT_SYMBOL_GPL(rhashtable_walk_peek); /** * rhashtable_walk_stop - Finish a hash table walk * @iter: Hash table iterator * * Finish a hash table walk. Does not reset the iterator to the start of the * hash table. */ void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU) { struct rhashtable *ht; struct bucket_table *tbl = iter->walker.tbl; if (!tbl) goto out; ht = iter->ht; spin_lock(&ht->lock); if (rcu_head_after_call_rcu(&tbl->rcu, bucket_table_free_rcu)) /* This bucket table is being freed, don't re-link it. */ iter->walker.tbl = NULL; else list_add(&iter->walker.list, &tbl->walkers); spin_unlock(&ht->lock); out: rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rhashtable_walk_stop); static size_t rounded_hashtable_size(const struct rhashtable_params *params) { size_t retsize; if (params->nelem_hint) retsize = max(roundup_pow_of_two(params->nelem_hint * 4 / 3), (unsigned long)params->min_size); else retsize = max(HASH_DEFAULT_SIZE, (unsigned long)params->min_size); return retsize; } static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed) { return jhash2(key, length, seed); } /** * rhashtable_init - initialize a new hash table * @ht: hash table to be initialized * @params: configuration parameters * * Initializes a new hash table based on the provided configuration * parameters. A table can be configured either with a variable or * fixed length key: * * Configuration Example 1: Fixed length keys * struct test_obj { * int key; * void * my_member; * struct rhash_head node; * }; * * struct rhashtable_params params = { * .head_offset = offsetof(struct test_obj, node), * .key_offset = offsetof(struct test_obj, key), * .key_len = sizeof(int), * .hashfn = jhash, * }; * * Configuration Example 2: Variable length keys * struct test_obj { * [...] * struct rhash_head node; * }; * * u32 my_hash_fn(const void *data, u32 len, u32 seed) * { * struct test_obj *obj = data; * * return [... hash ...]; * } * * struct rhashtable_params params = { * .head_offset = offsetof(struct test_obj, node), * .hashfn = jhash, * .obj_hashfn = my_hash_fn, * }; */ int rhashtable_init_noprof(struct rhashtable *ht, const struct rhashtable_params *params) { struct bucket_table *tbl; size_t size; if ((!params->key_len && !params->obj_hashfn) || (params->obj_hashfn && !params->obj_cmpfn)) return -EINVAL; memset(ht, 0, sizeof(*ht)); mutex_init(&ht->mutex); spin_lock_init(&ht->lock); memcpy(&ht->p, params, sizeof(*params)); alloc_tag_record(ht->alloc_tag); if (params->min_size) ht->p.min_size = roundup_pow_of_two(params->min_size); /* Cap total entries at 2^31 to avoid nelems overflow. */ ht->max_elems = 1u << 31; if (params->max_size) { ht->p.max_size = rounddown_pow_of_two(params->max_size); if (ht->p.max_size < ht->max_elems / 2) ht->max_elems = ht->p.max_size * 2; } ht->p.min_size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE); size = rounded_hashtable_size(&ht->p); ht->key_len = ht->p.key_len; if (!params->hashfn) { ht->p.hashfn = jhash; if (!(ht->key_len & (sizeof(u32) - 1))) { ht->key_len /= sizeof(u32); ht->p.hashfn = rhashtable_jhash2; } } /* * This is api initialization and thus we need to guarantee the * initial rhashtable allocation. Upon failure, retry with the * smallest possible size with __GFP_NOFAIL semantics. */ tbl = bucket_table_alloc(ht, size, GFP_KERNEL); if (unlikely(tbl == NULL)) { size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE); tbl = bucket_table_alloc(ht, size, GFP_KERNEL | __GFP_NOFAIL); } atomic_set(&ht->nelems, 0); RCU_INIT_POINTER(ht->tbl, tbl); INIT_WORK(&ht->run_work, rht_deferred_worker); return 0; } EXPORT_SYMBOL_GPL(rhashtable_init_noprof); /** * rhltable_init - initialize a new hash list table * @hlt: hash list table to be initialized * @params: configuration parameters * * Initializes a new hash list table. * * See documentation for rhashtable_init. */ int rhltable_init_noprof(struct rhltable *hlt, const struct rhashtable_params *params) { int err; err = rhashtable_init_noprof(&hlt->ht, params); hlt->ht.rhlist = true; return err; } EXPORT_SYMBOL_GPL(rhltable_init_noprof); static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj, void (*free_fn)(void *ptr, void *arg), void *arg) { struct rhlist_head *list; if (!ht->rhlist) { free_fn(rht_obj(ht, obj), arg); return; } list = container_of(obj, struct rhlist_head, rhead); do { obj = &list->rhead; list = rht_dereference(list->next, ht); free_fn(rht_obj(ht, obj), arg); } while (list); } /** * rhashtable_free_and_destroy - free elements and destroy hash table * @ht: the hash table to destroy * @free_fn: callback to release resources of element * @arg: pointer passed to free_fn * * Stops an eventual async resize. If defined, invokes free_fn for each * element to releasal resources. Please note that RCU protected * readers may still be accessing the elements. Releasing of resources * must occur in a compatible manner. Then frees the bucket array. * * This function will eventually sleep to wait for an async resize * to complete. The caller is responsible that no further write operations * occurs in parallel. */ void rhashtable_free_and_destroy(struct rhashtable *ht, void (*free_fn)(void *ptr, void *arg), void *arg) { struct bucket_table *tbl, *next_tbl; unsigned int i; cancel_work_sync(&ht->run_work); mutex_lock(&ht->mutex); tbl = rht_dereference(ht->tbl, ht); restart: if (free_fn) { for (i = 0; i < tbl->size; i++) { struct rhash_head *pos, *next; cond_resched(); for (pos = rht_ptr_exclusive(rht_bucket(tbl, i)), next = !rht_is_a_nulls(pos) ? rht_dereference(pos->next, ht) : NULL; !rht_is_a_nulls(pos); pos = next, next = !rht_is_a_nulls(pos) ? rht_dereference(pos->next, ht) : NULL) rhashtable_free_one(ht, pos, free_fn, arg); } } next_tbl = rht_dereference(tbl->future_tbl, ht); bucket_table_free(tbl); if (next_tbl) { tbl = next_tbl; goto restart; } mutex_unlock(&ht->mutex); } EXPORT_SYMBOL_GPL(rhashtable_free_and_destroy); void rhashtable_destroy(struct rhashtable *ht) { return rhashtable_free_and_destroy(ht, NULL, NULL); } EXPORT_SYMBOL_GPL(rhashtable_destroy); struct rhash_lock_head __rcu **__rht_bucket_nested( const struct bucket_table *tbl, unsigned int hash) { const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); unsigned int index = hash & ((1 << tbl->nest) - 1); unsigned int size = tbl->size >> tbl->nest; unsigned int subhash = hash; union nested_table *ntbl; ntbl = nested_table_top(tbl); ntbl = rht_dereference_bucket_rcu(ntbl[index].table, tbl, hash); subhash >>= tbl->nest; while (ntbl && size > (1 << shift)) { index = subhash & ((1 << shift) - 1); ntbl = rht_dereference_bucket_rcu(ntbl[index].table, tbl, hash); size >>= shift; subhash >>= shift; } if (!ntbl) return NULL; return &ntbl[subhash].bucket; } EXPORT_SYMBOL_GPL(__rht_bucket_nested); struct rhash_lock_head __rcu **rht_bucket_nested( const struct bucket_table *tbl, unsigned int hash) { static struct rhash_lock_head __rcu *rhnull; if (!rhnull) INIT_RHT_NULLS_HEAD(rhnull); return __rht_bucket_nested(tbl, hash) ?: &rhnull; } EXPORT_SYMBOL_GPL(rht_bucket_nested); struct rhash_lock_head __rcu **rht_bucket_nested_insert( struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash) { const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); unsigned int index = hash & ((1 << tbl->nest) - 1); unsigned int size = tbl->size >> tbl->nest; union nested_table *ntbl; ntbl = nested_table_top(tbl); hash >>= tbl->nest; ntbl = nested_table_alloc(ht, &ntbl[index].table, size <= (1 << shift)); while (ntbl && size > (1 << shift)) { index = hash & ((1 << shift) - 1); size >>= shift; hash >>= shift; ntbl = nested_table_alloc(ht, &ntbl[index].table, size <= (1 << shift)); } if (!ntbl) return NULL; return &ntbl[hash].bucket; } EXPORT_SYMBOL_GPL(rht_bucket_nested_insert);
424 6 731 6 6 4 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 /* SPDX-License-Identifier: GPL-2.0 */ /* linux/net/inet/arp.h */ #ifndef _ARP_H #define _ARP_H #include <linux/if_arp.h> #include <linux/hash.h> #include <net/neighbour.h> extern struct neigh_table arp_tbl; static inline u32 arp_hashfn(const void *pkey, const struct net_device *dev, u32 *hash_rnd) { u32 key = *(const u32 *)pkey; u32 val = key ^ hash32_ptr(dev); return val * hash_rnd[0]; } #ifdef CONFIG_INET static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key) { if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) key = INADDR_ANY; return ___neigh_lookup_noref(&arp_tbl, neigh_key_eq32, arp_hashfn, &key, dev); } #else static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key) { return NULL; } #endif static inline struct neighbour *__ipv4_neigh_lookup(struct net_device *dev, u32 key) { struct neighbour *n; rcu_read_lock(); n = __ipv4_neigh_lookup_noref(dev, key); if (n && !refcount_inc_not_zero(&n->refcnt)) n = NULL; rcu_read_unlock(); return n; } static inline void __ipv4_confirm_neigh(struct net_device *dev, u32 key) { struct neighbour *n; rcu_read_lock(); n = __ipv4_neigh_lookup_noref(dev, key); neigh_confirm(n); rcu_read_unlock(); } void arp_init(void); int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg); void arp_send(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *th); int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir); void arp_ifdown(struct net_device *dev); int arp_invalidate(struct net_device *dev, __be32 ip, bool force); struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw); void arp_xmit(struct sk_buff *skb); #endif /* _ARP_H */
100 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 /* SPDX-License-Identifier: GPL-2.0 */ /* * ioport.h Definitions of routines for detecting, reserving and * allocating system resources. * * Authors: Linus Torvalds */ #ifndef _LINUX_IOPORT_H #define _LINUX_IOPORT_H #ifndef __ASSEMBLY__ #include <linux/bits.h> #include <linux/compiler.h> #include <linux/minmax.h> #include <linux/types.h> /* * Resources are tree-like, allowing * nesting etc.. */ struct resource { resource_size_t start; resource_size_t end; const char *name; unsigned long flags; unsigned long desc; struct resource *parent, *sibling, *child; }; /* * IO resources have these defined flags. * * PCI devices expose these flags to userspace in the "resource" sysfs file, * so don't move them. */ #define IORESOURCE_BITS 0x000000ff /* Bus-specific bits */ #define IORESOURCE_TYPE_BITS 0x00001f00 /* Resource type */ #define IORESOURCE_IO 0x00000100 /* PCI/ISA I/O ports */ #define IORESOURCE_MEM 0x00000200 #define IORESOURCE_REG 0x00000300 /* Register offsets */ #define IORESOURCE_IRQ 0x00000400 #define IORESOURCE_DMA 0x00000800 #define IORESOURCE_BUS 0x00001000 #define IORESOURCE_PREFETCH 0x00002000 /* No side effects */ #define IORESOURCE_READONLY 0x00004000 #define IORESOURCE_CACHEABLE 0x00008000 #define IORESOURCE_RANGELENGTH 0x00010000 #define IORESOURCE_SHADOWABLE 0x00020000 #define IORESOURCE_SIZEALIGN 0x00040000 /* size indicates alignment */ #define IORESOURCE_STARTALIGN 0x00080000 /* start field is alignment */ #define IORESOURCE_MEM_64 0x00100000 #define IORESOURCE_WINDOW 0x00200000 /* forwarded by bridge */ #define IORESOURCE_MUXED 0x00400000 /* Resource is software muxed */ #define IORESOURCE_EXT_TYPE_BITS 0x01000000 /* Resource extended types */ #define IORESOURCE_SYSRAM 0x01000000 /* System RAM (modifier) */ /* IORESOURCE_SYSRAM specific bits. */ #define IORESOURCE_SYSRAM_DRIVER_MANAGED 0x02000000 /* Always detected via a driver. */ #define IORESOURCE_SYSRAM_MERGEABLE 0x04000000 /* Resource can be merged. */ #define IORESOURCE_EXCLUSIVE 0x08000000 /* Userland may not map this resource */ #define IORESOURCE_DISABLED 0x10000000 #define IORESOURCE_UNSET 0x20000000 /* No address assigned yet */ #define IORESOURCE_AUTO 0x40000000 #define IORESOURCE_BUSY 0x80000000 /* Driver has marked this resource busy */ /* I/O resource extended types */ #define IORESOURCE_SYSTEM_RAM (IORESOURCE_MEM|IORESOURCE_SYSRAM) /* PnP IRQ specific bits (IORESOURCE_BITS) */ #define IORESOURCE_IRQ_HIGHEDGE (1<<0) #define IORESOURCE_IRQ_LOWEDGE (1<<1) #define IORESOURCE_IRQ_HIGHLEVEL (1<<2) #define IORESOURCE_IRQ_LOWLEVEL (1<<3) #define IORESOURCE_IRQ_SHAREABLE (1<<4) #define IORESOURCE_IRQ_OPTIONAL (1<<5) #define IORESOURCE_IRQ_WAKECAPABLE (1<<6) /* PnP DMA specific bits (IORESOURCE_BITS) */ #define IORESOURCE_DMA_TYPE_MASK (3<<0) #define IORESOURCE_DMA_8BIT (0<<0) #define IORESOURCE_DMA_8AND16BIT (1<<0) #define IORESOURCE_DMA_16BIT (2<<0) #define IORESOURCE_DMA_MASTER (1<<2) #define IORESOURCE_DMA_BYTE (1<<3) #define IORESOURCE_DMA_WORD (1<<4) #define IORESOURCE_DMA_SPEED_MASK (3<<6) #define IORESOURCE_DMA_COMPATIBLE (0<<6) #define IORESOURCE_DMA_TYPEA (1<<6) #define IORESOURCE_DMA_TYPEB (2<<6) #define IORESOURCE_DMA_TYPEF (3<<6) /* PnP memory I/O specific bits (IORESOURCE_BITS) */ #define IORESOURCE_MEM_WRITEABLE (1<<0) /* dup: IORESOURCE_READONLY */ #define IORESOURCE_MEM_CACHEABLE (1<<1) /* dup: IORESOURCE_CACHEABLE */ #define IORESOURCE_MEM_RANGELENGTH (1<<2) /* dup: IORESOURCE_RANGELENGTH */ #define IORESOURCE_MEM_TYPE_MASK (3<<3) #define IORESOURCE_MEM_8BIT (0<<3) #define IORESOURCE_MEM_16BIT (1<<3) #define IORESOURCE_MEM_8AND16BIT (2<<3) #define IORESOURCE_MEM_32BIT (3<<3) #define IORESOURCE_MEM_SHADOWABLE (1<<5) /* dup: IORESOURCE_SHADOWABLE */ #define IORESOURCE_MEM_EXPANSIONROM (1<<6) #define IORESOURCE_MEM_NONPOSTED (1<<7) /* PnP I/O specific bits (IORESOURCE_BITS) */ #define IORESOURCE_IO_16BIT_ADDR (1<<0) #define IORESOURCE_IO_FIXED (1<<1) #define IORESOURCE_IO_SPARSE (1<<2) /* PCI ROM control bits (IORESOURCE_BITS) */ #define IORESOURCE_ROM_ENABLE (1<<0) /* ROM is enabled, same as PCI_ROM_ADDRESS_ENABLE */ #define IORESOURCE_ROM_SHADOW (1<<1) /* Use RAM image, not ROM BAR */ /* PCI control bits. Shares IORESOURCE_BITS with above PCI ROM. */ #define IORESOURCE_PCI_FIXED (1<<4) /* Do not move resource */ #define IORESOURCE_PCI_EA_BEI (1<<5) /* BAR Equivalent Indicator */ /* * I/O Resource Descriptors * * Descriptors are used by walk_iomem_res_desc() and region_intersects() * for searching a specific resource range in the iomem table. Assign * a new descriptor when a resource range supports the search interfaces. * Otherwise, resource.desc must be set to IORES_DESC_NONE (0). */ enum { IORES_DESC_NONE = 0, IORES_DESC_CRASH_KERNEL = 1, IORES_DESC_ACPI_TABLES = 2, IORES_DESC_ACPI_NV_STORAGE = 3, IORES_DESC_PERSISTENT_MEMORY = 4, IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5, IORES_DESC_DEVICE_PRIVATE_MEMORY = 6, IORES_DESC_RESERVED = 7, IORES_DESC_SOFT_RESERVED = 8, IORES_DESC_CXL = 9, }; /* * Flags controlling ioremap() behavior. */ enum { IORES_MAP_SYSTEM_RAM = BIT(0), IORES_MAP_ENCRYPTED = BIT(1), }; /* helpers to define resources */ #define DEFINE_RES_NAMED_DESC(_start, _size, _name, _flags, _desc) \ (struct resource) { \ .start = (_start), \ .end = (_start) + (_size) - 1, \ .name = (_name), \ .flags = (_flags), \ .desc = (_desc), \ } #define DEFINE_RES_NAMED(_start, _size, _name, _flags) \ DEFINE_RES_NAMED_DESC(_start, _size, _name, _flags, IORES_DESC_NONE) #define DEFINE_RES(_start, _size, _flags) \ DEFINE_RES_NAMED(_start, _size, NULL, _flags) #define DEFINE_RES_IO_NAMED(_start, _size, _name) \ DEFINE_RES_NAMED((_start), (_size), (_name), IORESOURCE_IO) #define DEFINE_RES_IO(_start, _size) \ DEFINE_RES_IO_NAMED((_start), (_size), NULL) #define DEFINE_RES_MEM_NAMED(_start, _size, _name) \ DEFINE_RES_NAMED((_start), (_size), (_name), IORESOURCE_MEM) #define DEFINE_RES_MEM(_start, _size) \ DEFINE_RES_MEM_NAMED((_start), (_size), NULL) #define DEFINE_RES_REG_NAMED(_start, _size, _name) \ DEFINE_RES_NAMED((_start), (_size), (_name), IORESOURCE_REG) #define DEFINE_RES_REG(_start, _size) \ DEFINE_RES_REG_NAMED((_start), (_size), NULL) #define DEFINE_RES_IRQ_NAMED(_irq, _name) \ DEFINE_RES_NAMED((_irq), 1, (_name), IORESOURCE_IRQ) #define DEFINE_RES_IRQ(_irq) \ DEFINE_RES_IRQ_NAMED((_irq), NULL) #define DEFINE_RES_DMA_NAMED(_dma, _name) \ DEFINE_RES_NAMED((_dma), 1, (_name), IORESOURCE_DMA) #define DEFINE_RES_DMA(_dma) \ DEFINE_RES_DMA_NAMED((_dma), NULL) /** * typedef resource_alignf - Resource alignment callback * @data: Private data used by the callback * @res: Resource candidate range (an empty resource space) * @size: The minimum size of the empty space * @align: Alignment from the constraints * * Callback allows calculating resource placement and alignment beyond min, * max, and align fields in the struct resource_constraint. * * Return: Start address for the resource. */ typedef resource_size_t (*resource_alignf)(void *data, const struct resource *res, resource_size_t size, resource_size_t align); /** * struct resource_constraint - constraints to be met while searching empty * resource space * @min: The minimum address for the memory range * @max: The maximum address for the memory range * @align: Alignment for the start address of the empty space * @alignf: Additional alignment constraints callback * @alignf_data: Data provided for @alignf callback * * Contains the range and alignment constraints that have to be met during * find_resource_space(). @alignf can be NULL indicating no alignment beyond * @align is necessary. */ struct resource_constraint { resource_size_t min, max, align; resource_alignf alignf; void *alignf_data; }; /* PC/ISA/whatever - the normal PC address spaces: IO and memory */ extern struct resource ioport_resource; extern struct resource iomem_resource; extern struct resource *request_resource_conflict(struct resource *root, struct resource *new); extern int request_resource(struct resource *root, struct resource *new); extern int release_resource(struct resource *new); void release_child_resources(struct resource *new); extern void reserve_region_with_split(struct resource *root, resource_size_t start, resource_size_t end, const char *name); extern struct resource *insert_resource_conflict(struct resource *parent, struct resource *new); extern int insert_resource(struct resource *parent, struct resource *new); extern void insert_resource_expand_to_fit(struct resource *root, struct resource *new); extern int remove_resource(struct resource *old); extern void arch_remove_reservations(struct resource *avail); extern int allocate_resource(struct resource *root, struct resource *new, resource_size_t size, resource_size_t min, resource_size_t max, resource_size_t align, resource_alignf alignf, void *alignf_data); struct resource *lookup_resource(struct resource *root, resource_size_t start); int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size); resource_size_t resource_alignment(struct resource *res); /** * resource_set_size - Calculate resource end address from size and start * @res: Resource descriptor * @size: Size of the resource * * Calculate the end address for @res based on @size. * * Note: The start address of @res must be set when calling this function. * Prefer resource_set_range() if setting both the start address and @size. */ static inline void resource_set_size(struct resource *res, resource_size_t size) { res->end = res->start + size - 1; } /** * resource_set_range - Set resource start and end addresses * @res: Resource descriptor * @start: Start address for the resource * @size: Size of the resource * * Set @res start address and calculate the end address based on @size. */ static inline void resource_set_range(struct resource *res, resource_size_t start, resource_size_t size) { res->start = start; resource_set_size(res, size); } static inline resource_size_t resource_size(const struct resource *res) { return res->end - res->start + 1; } static inline unsigned long resource_type(const struct resource *res) { return res->flags & IORESOURCE_TYPE_BITS; } static inline unsigned long resource_ext_type(const struct resource *res) { return res->flags & IORESOURCE_EXT_TYPE_BITS; } /* True iff r1 completely contains r2 */ static inline bool resource_contains(const struct resource *r1, const struct resource *r2) { if (resource_type(r1) != resource_type(r2)) return false; if (r1->flags & IORESOURCE_UNSET || r2->flags & IORESOURCE_UNSET) return false; return r1->start <= r2->start && r1->end >= r2->end; } /* True if any part of r1 overlaps r2 */ static inline bool resource_overlaps(const struct resource *r1, const struct resource *r2) { return r1->start <= r2->end && r1->end >= r2->start; } static inline bool resource_intersection(const struct resource *r1, const struct resource *r2, struct resource *r) { if (!resource_overlaps(r1, r2)) return false; r->start = max(r1->start, r2->start); r->end = min(r1->end, r2->end); return true; } static inline bool resource_union(const struct resource *r1, const struct resource *r2, struct resource *r) { if (!resource_overlaps(r1, r2)) return false; r->start = min(r1->start, r2->start); r->end = max(r1->end, r2->end); return true; } /* * Check if this resource is added to a resource tree or detached. Caller is * responsible for not racing assignment. */ static inline bool resource_assigned(struct resource *res) { return res->parent; } int find_resource_space(struct resource *root, struct resource *new, resource_size_t size, struct resource_constraint *constraint); /* Convenience shorthand with allocation */ #define request_region(start,n,name) __request_region(&ioport_resource, (start), (n), (name), 0) #define request_muxed_region(start,n,name) __request_region(&ioport_resource, (start), (n), (name), IORESOURCE_MUXED) #define __request_mem_region(start,n,name, excl) __request_region(&iomem_resource, (start), (n), (name), excl) #define request_mem_region(start,n,name) __request_region(&iomem_resource, (start), (n), (name), 0) #define request_mem_region_muxed(start, n, name) \ __request_region(&iomem_resource, (start), (n), (name), IORESOURCE_MUXED) #define request_mem_region_exclusive(start,n,name) \ __request_region(&iomem_resource, (start), (n), (name), IORESOURCE_EXCLUSIVE) #define rename_region(region, newname) do { (region)->name = (newname); } while (0) extern struct resource * __request_region(struct resource *, resource_size_t start, resource_size_t n, const char *name, int flags); /* Compatibility cruft */ #define release_region(start,n) __release_region(&ioport_resource, (start), (n)) #define release_mem_region(start,n) __release_region(&iomem_resource, (start), (n)) extern void __release_region(struct resource *, resource_size_t, resource_size_t); #ifdef CONFIG_MEMORY_HOTREMOVE extern void release_mem_region_adjustable(resource_size_t, resource_size_t); #endif #ifdef CONFIG_MEMORY_HOTPLUG extern void merge_system_ram_resource(struct resource *res); #endif /* Wrappers for managed devices */ struct device; extern int devm_request_resource(struct device *dev, struct resource *root, struct resource *new); extern void devm_release_resource(struct device *dev, struct resource *new); #define devm_request_region(dev,start,n,name) \ __devm_request_region(dev, &ioport_resource, (start), (n), (name)) #define devm_request_mem_region(dev,start,n,name) \ __devm_request_region(dev, &iomem_resource, (start), (n), (name)) extern struct resource * __devm_request_region(struct device *dev, struct resource *parent, resource_size_t start, resource_size_t n, const char *name); #define devm_release_region(dev, start, n) \ __devm_release_region(dev, &ioport_resource, (start), (n)) #define devm_release_mem_region(dev, start, n) \ __devm_release_region(dev, &iomem_resource, (start), (n)) extern void __devm_release_region(struct device *dev, struct resource *parent, resource_size_t start, resource_size_t n); extern int iomem_map_sanity_check(resource_size_t addr, unsigned long size); extern bool iomem_is_exclusive(u64 addr); extern bool resource_is_exclusive(struct resource *resource, u64 addr, resource_size_t size); extern int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, void *arg, int (*func)(unsigned long, unsigned long, void *)); extern int walk_mem_res(u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)); extern int walk_system_ram_res(u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)); extern int walk_system_ram_res_rev(u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)); extern int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)); struct resource *devm_request_free_mem_region(struct device *dev, struct resource *base, unsigned long size); struct resource *request_free_mem_region(struct resource *base, unsigned long size, const char *name); struct resource *alloc_free_mem_region(struct resource *base, unsigned long size, unsigned long align, const char *name); static inline void irqresource_disabled(struct resource *res, u32 irq) { res->start = irq; res->end = irq; res->flags |= IORESOURCE_IRQ | IORESOURCE_DISABLED | IORESOURCE_UNSET; } extern struct address_space *iomem_get_mapping(void); #endif /* __ASSEMBLY__ */ #endif /* _LINUX_IOPORT_H */
34 34 34 5 18 18 5 5 5 5 5 4 4 1 4 4 4 4 4 4 4 2 2 7 20 2 18 20 1 19 3 16 1 16 30 30 22 20 1 1 27 17 1 17 17 16 16 16 16 16 16 16 16 16 16 17 17 17 29 12 2 12 29 27 27 27 27 21 21 5 21 18 3 3 21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved * Copyright 2005-2006 Ian Kent <raven@themaw.net> */ #include <linux/seq_file.h> #include <linux/pagemap.h> #include "autofs_i.h" struct autofs_info *autofs_new_ino(struct autofs_sb_info *sbi) { struct autofs_info *ino; ino = kzalloc(sizeof(*ino), GFP_KERNEL); if (ino) { INIT_LIST_HEAD(&ino->active); INIT_LIST_HEAD(&ino->expiring); ino->last_used = jiffies; ino->sbi = sbi; ino->exp_timeout = -1; ino->count = 1; } return ino; } void autofs_clean_ino(struct autofs_info *ino) { ino->uid = GLOBAL_ROOT_UID; ino->gid = GLOBAL_ROOT_GID; ino->exp_timeout = -1; ino->last_used = jiffies; } void autofs_free_ino(struct autofs_info *ino) { kfree_rcu(ino, rcu); } void autofs_kill_sb(struct super_block *sb) { struct autofs_sb_info *sbi = autofs_sbi(sb); /* * In the event of a failure in get_sb_nodev the superblock * info is not present so nothing else has been setup, so * just call kill_anon_super when we are called from * deactivate_super. */ if (sbi) { /* Free wait queues, close pipe */ autofs_catatonic_mode(sbi); put_pid(sbi->oz_pgrp); } pr_debug("shutting down\n"); kill_anon_super(sb); if (sbi) kfree_rcu(sbi, rcu); } static int autofs_show_options(struct seq_file *m, struct dentry *root) { struct autofs_sb_info *sbi = autofs_sbi(root->d_sb); struct inode *root_inode = d_inode(root->d_sb->s_root); if (!sbi) return 0; seq_printf(m, ",fd=%d", sbi->pipefd); if (!uid_eq(root_inode->i_uid, GLOBAL_ROOT_UID)) seq_printf(m, ",uid=%u", from_kuid_munged(&init_user_ns, root_inode->i_uid)); if (!gid_eq(root_inode->i_gid, GLOBAL_ROOT_GID)) seq_printf(m, ",gid=%u", from_kgid_munged(&init_user_ns, root_inode->i_gid)); seq_printf(m, ",pgrp=%d", pid_vnr(sbi->oz_pgrp)); seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ); seq_printf(m, ",minproto=%d", sbi->min_proto); seq_printf(m, ",maxproto=%d", sbi->max_proto); if (autofs_type_offset(sbi->type)) seq_puts(m, ",offset"); else if (autofs_type_direct(sbi->type)) seq_puts(m, ",direct"); else seq_puts(m, ",indirect"); if (sbi->flags & AUTOFS_SBI_STRICTEXPIRE) seq_puts(m, ",strictexpire"); if (sbi->flags & AUTOFS_SBI_IGNORE) seq_puts(m, ",ignore"); #ifdef CONFIG_CHECKPOINT_RESTORE if (sbi->pipe) seq_printf(m, ",pipe_ino=%ld", file_inode(sbi->pipe)->i_ino); else seq_puts(m, ",pipe_ino=-1"); #endif return 0; } static void autofs_evict_inode(struct inode *inode) { clear_inode(inode); kfree(inode->i_private); } static const struct super_operations autofs_sops = { .statfs = simple_statfs, .show_options = autofs_show_options, .evict_inode = autofs_evict_inode, }; enum { Opt_direct, Opt_fd, Opt_gid, Opt_ignore, Opt_indirect, Opt_maxproto, Opt_minproto, Opt_offset, Opt_pgrp, Opt_strictexpire, Opt_uid, }; const struct fs_parameter_spec autofs_param_specs[] = { fsparam_flag ("direct", Opt_direct), fsparam_fd ("fd", Opt_fd), fsparam_gid ("gid", Opt_gid), fsparam_flag ("ignore", Opt_ignore), fsparam_flag ("indirect", Opt_indirect), fsparam_u32 ("maxproto", Opt_maxproto), fsparam_u32 ("minproto", Opt_minproto), fsparam_flag ("offset", Opt_offset), fsparam_u32 ("pgrp", Opt_pgrp), fsparam_flag ("strictexpire", Opt_strictexpire), fsparam_uid ("uid", Opt_uid), {} }; struct autofs_fs_context { kuid_t uid; kgid_t gid; int pgrp; bool pgrp_set; }; /* * Open the fd. We do it here rather than in get_tree so that it's done in the * context of the system call that passed the data and not the one that * triggered the superblock creation, lest the fd gets reassigned. */ static int autofs_parse_fd(struct fs_context *fc, struct autofs_sb_info *sbi, struct fs_parameter *param, struct fs_parse_result *result) { struct file *pipe; int ret; if (param->type == fs_value_is_file) { /* came through the new api */ pipe = param->file; param->file = NULL; } else { pipe = fget(result->uint_32); } if (!pipe) { errorf(fc, "could not open pipe file descriptor"); return -EBADF; } ret = autofs_check_pipe(pipe); if (ret < 0) { errorf(fc, "Invalid/unusable pipe"); fput(pipe); return -EBADF; } autofs_set_packet_pipe_flags(pipe); if (sbi->pipe) fput(sbi->pipe); sbi->pipefd = result->uint_32; sbi->pipe = pipe; return 0; } static int autofs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct autofs_fs_context *ctx = fc->fs_private; struct autofs_sb_info *sbi = fc->s_fs_info; struct fs_parse_result result; int opt; opt = fs_parse(fc, autofs_param_specs, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_fd: return autofs_parse_fd(fc, sbi, param, &result); case Opt_uid: ctx->uid = result.uid; break; case Opt_gid: ctx->gid = result.gid; break; case Opt_pgrp: ctx->pgrp = result.uint_32; ctx->pgrp_set = true; break; case Opt_minproto: sbi->min_proto = result.uint_32; break; case Opt_maxproto: sbi->max_proto = result.uint_32; break; case Opt_indirect: set_autofs_type_indirect(&sbi->type); break; case Opt_direct: set_autofs_type_direct(&sbi->type); break; case Opt_offset: set_autofs_type_offset(&sbi->type); break; case Opt_strictexpire: sbi->flags |= AUTOFS_SBI_STRICTEXPIRE; break; case Opt_ignore: sbi->flags |= AUTOFS_SBI_IGNORE; } return 0; } static struct autofs_sb_info *autofs_alloc_sbi(void) { struct autofs_sb_info *sbi; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return NULL; sbi->magic = AUTOFS_SBI_MAGIC; sbi->flags = AUTOFS_SBI_CATATONIC; sbi->min_proto = AUTOFS_MIN_PROTO_VERSION; sbi->max_proto = AUTOFS_MAX_PROTO_VERSION; sbi->pipefd = -1; sbi->mnt_ns_id = to_ns_common(current->nsproxy->mnt_ns)->ns_id; set_autofs_type_indirect(&sbi->type); mutex_init(&sbi->wq_mutex); mutex_init(&sbi->pipe_mutex); spin_lock_init(&sbi->fs_lock); spin_lock_init(&sbi->lookup_lock); INIT_LIST_HEAD(&sbi->active_list); INIT_LIST_HEAD(&sbi->expiring_list); return sbi; } static int autofs_validate_protocol(struct fs_context *fc) { struct autofs_sb_info *sbi = fc->s_fs_info; /* Test versions first */ if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION || sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) { errorf(fc, "kernel does not match daemon version " "daemon (%d, %d) kernel (%d, %d)\n", sbi->min_proto, sbi->max_proto, AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION); return -EINVAL; } /* Establish highest kernel protocol version */ if (sbi->max_proto > AUTOFS_MAX_PROTO_VERSION) sbi->version = AUTOFS_MAX_PROTO_VERSION; else sbi->version = sbi->max_proto; switch (sbi->version) { case 4: sbi->sub_version = 7; break; case 5: sbi->sub_version = AUTOFS_PROTO_SUBVERSION; break; default: sbi->sub_version = 0; } return 0; } static int autofs_fill_super(struct super_block *s, struct fs_context *fc) { struct autofs_fs_context *ctx = fc->fs_private; struct autofs_sb_info *sbi = s->s_fs_info; struct inode *root_inode; struct autofs_info *ino; pr_debug("starting up, sbi = %p\n", sbi); sbi->sb = s; s->s_blocksize = 1024; s->s_blocksize_bits = 10; s->s_magic = AUTOFS_SUPER_MAGIC; s->s_op = &autofs_sops; set_default_d_op(s, &autofs_dentry_operations); s->s_time_gran = 1; /* * Get the root inode and dentry, but defer checking for errors. */ ino = autofs_new_ino(sbi); if (!ino) return -ENOMEM; root_inode = autofs_get_inode(s, S_IFDIR | 0755); if (!root_inode) return -ENOMEM; root_inode->i_uid = ctx->uid; root_inode->i_gid = ctx->gid; root_inode->i_fop = &autofs_root_operations; root_inode->i_op = &autofs_dir_inode_operations; s->s_root = d_make_root(root_inode); if (unlikely(!s->s_root)) { autofs_free_ino(ino); return -ENOMEM; } s->s_root->d_fsdata = ino; if (ctx->pgrp_set) { sbi->oz_pgrp = find_get_pid(ctx->pgrp); if (!sbi->oz_pgrp) return invalf(fc, "Could not find process group %d", ctx->pgrp); } else sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID); if (autofs_type_trigger(sbi->type)) /* s->s_root won't be contended so there's little to * be gained by not taking the d_lock when setting * d_flags, even when a lot mounts are being done. */ managed_dentry_set_managed(s->s_root); pr_debug("pipe fd = %d, pgrp = %u\n", sbi->pipefd, pid_nr(sbi->oz_pgrp)); sbi->flags &= ~AUTOFS_SBI_CATATONIC; return 0; } /* * Validate the parameters and then request a superblock. */ static int autofs_get_tree(struct fs_context *fc) { struct autofs_sb_info *sbi = fc->s_fs_info; int ret; ret = autofs_validate_protocol(fc); if (ret) return ret; if (sbi->pipefd < 0) return invalf(fc, "No control pipe specified"); return get_tree_nodev(fc, autofs_fill_super); } static void autofs_free_fc(struct fs_context *fc) { struct autofs_fs_context *ctx = fc->fs_private; struct autofs_sb_info *sbi = fc->s_fs_info; if (sbi) { if (sbi->pipe) fput(sbi->pipe); kfree(sbi); } kfree(ctx); } static const struct fs_context_operations autofs_context_ops = { .free = autofs_free_fc, .parse_param = autofs_parse_param, .get_tree = autofs_get_tree, }; /* * Set up the filesystem mount context. */ int autofs_init_fs_context(struct fs_context *fc) { struct autofs_fs_context *ctx; struct autofs_sb_info *sbi; ctx = kzalloc(sizeof(struct autofs_fs_context), GFP_KERNEL); if (!ctx) goto nomem; ctx->uid = current_uid(); ctx->gid = current_gid(); sbi = autofs_alloc_sbi(); if (!sbi) goto nomem_ctx; fc->fs_private = ctx; fc->s_fs_info = sbi; fc->ops = &autofs_context_ops; return 0; nomem_ctx: kfree(ctx); nomem: return -ENOMEM; } struct inode *autofs_get_inode(struct super_block *sb, umode_t mode) { struct inode *inode = new_inode(sb); if (inode == NULL) return NULL; inode->i_mode = mode; if (sb->s_root) { inode->i_uid = d_inode(sb->s_root)->i_uid; inode->i_gid = d_inode(sb->s_root)->i_gid; } simple_inode_init_ts(inode); inode->i_ino = get_next_ino(); if (S_ISDIR(mode)) { set_nlink(inode, 2); inode->i_op = &autofs_dir_inode_operations; inode->i_fop = &autofs_dir_operations; } else if (S_ISLNK(mode)) { inode->i_op = &autofs_symlink_inode_operations; } else WARN_ON(1); return inode; }
8 1 1 1 2 2 2 2 1 5 1 1 5 1 1 1 5 5 5 5 5 5 5 2 2 3 3 3 5 5 2 5 1 1 1 5 5 5 2 3 3 5 5 5 1 10 10 10 5 2 1 2 5 5 4 2 5 4 10 5 6 9 9 6 6 1 1 6 10 9 2 9 1 9 7 7 2 7 1 7 1 7 1 7 1 7 7 7 10 11 10 8 7 8 8 8 8 8 8 8 8 11 2 2 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 // SPDX-License-Identifier: GPL-2.0-only /* net/sched/sch_hhf.c Heavy-Hitter Filter (HHF) * * Copyright (C) 2013 Terry Lam <vtlam@google.com> * Copyright (C) 2013 Nandita Dukkipati <nanditad@google.com> */ #include <linux/jiffies.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> #include <linux/siphash.h> #include <net/pkt_sched.h> #include <net/sock.h> /* Heavy-Hitter Filter (HHF) * * Principles : * Flows are classified into two buckets: non-heavy-hitter and heavy-hitter * buckets. Initially, a new flow starts as non-heavy-hitter. Once classified * as heavy-hitter, it is immediately switched to the heavy-hitter bucket. * The buckets are dequeued by a Weighted Deficit Round Robin (WDRR) scheduler, * in which the heavy-hitter bucket is served with less weight. * In other words, non-heavy-hitters (e.g., short bursts of critical traffic) * are isolated from heavy-hitters (e.g., persistent bulk traffic) and also have * higher share of bandwidth. * * To capture heavy-hitters, we use the "multi-stage filter" algorithm in the * following paper: * [EV02] C. Estan and G. Varghese, "New Directions in Traffic Measurement and * Accounting", in ACM SIGCOMM, 2002. * * Conceptually, a multi-stage filter comprises k independent hash functions * and k counter arrays. Packets are indexed into k counter arrays by k hash * functions, respectively. The counters are then increased by the packet sizes. * Therefore, * - For a heavy-hitter flow: *all* of its k array counters must be large. * - For a non-heavy-hitter flow: some of its k array counters can be large * due to hash collision with other small flows; however, with high * probability, not *all* k counters are large. * * By the design of the multi-stage filter algorithm, the false negative rate * (heavy-hitters getting away uncaptured) is zero. However, the algorithm is * susceptible to false positives (non-heavy-hitters mistakenly classified as * heavy-hitters). * Therefore, we also implement the following optimizations to reduce false * positives by avoiding unnecessary increment of the counter values: * - Optimization O1: once a heavy-hitter is identified, its bytes are not * accounted in the array counters. This technique is called "shielding" * in Section 3.3.1 of [EV02]. * - Optimization O2: conservative update of counters * (Section 3.3.2 of [EV02]), * New counter value = max {old counter value, * smallest counter value + packet bytes} * * Finally, we refresh the counters periodically since otherwise the counter * values will keep accumulating. * * Once a flow is classified as heavy-hitter, we also save its per-flow state * in an exact-matching flow table so that its subsequent packets can be * dispatched to the heavy-hitter bucket accordingly. * * * At a high level, this qdisc works as follows: * Given a packet p: * - If the flow-id of p (e.g., TCP 5-tuple) is already in the exact-matching * heavy-hitter flow table, denoted table T, then send p to the heavy-hitter * bucket. * - Otherwise, forward p to the multi-stage filter, denoted filter F * + If F decides that p belongs to a non-heavy-hitter flow, then send p * to the non-heavy-hitter bucket. * + Otherwise, if F decides that p belongs to a new heavy-hitter flow, * then set up a new flow entry for the flow-id of p in the table T and * send p to the heavy-hitter bucket. * * In this implementation: * - T is a fixed-size hash-table with 1024 entries. Hash collision is * resolved by linked-list chaining. * - F has four counter arrays, each array containing 1024 32-bit counters. * That means 4 * 1024 * 32 bits = 16KB of memory. * - Since each array in F contains 1024 counters, 10 bits are sufficient to * index into each array. * Hence, instead of having four hash functions, we chop the 32-bit * skb-hash into three 10-bit chunks, and the remaining 10-bit chunk is * computed as XOR sum of those three chunks. * - We need to clear the counter arrays periodically; however, directly * memsetting 16KB of memory can lead to cache eviction and unwanted delay. * So by representing each counter by a valid bit, we only need to reset * 4K of 1 bit (i.e. 512 bytes) instead of 16KB of memory. * - The Deficit Round Robin engine is taken from fq_codel implementation * (net/sched/sch_fq_codel.c). Note that wdrr_bucket corresponds to * fq_codel_flow in fq_codel implementation. * */ /* Non-configurable parameters */ #define HH_FLOWS_CNT 1024 /* number of entries in exact-matching table T */ #define HHF_ARRAYS_CNT 4 /* number of arrays in multi-stage filter F */ #define HHF_ARRAYS_LEN 1024 /* number of counters in each array of F */ #define HHF_BIT_MASK_LEN 10 /* masking 10 bits */ #define HHF_BIT_MASK 0x3FF /* bitmask of 10 bits */ #define WDRR_BUCKET_CNT 2 /* two buckets for Weighted DRR */ enum wdrr_bucket_idx { WDRR_BUCKET_FOR_HH = 0, /* bucket id for heavy-hitters */ WDRR_BUCKET_FOR_NON_HH = 1 /* bucket id for non-heavy-hitters */ }; #define hhf_time_before(a, b) \ (typecheck(u32, a) && typecheck(u32, b) && ((s32)((a) - (b)) < 0)) /* Heavy-hitter per-flow state */ struct hh_flow_state { u32 hash_id; /* hash of flow-id (e.g. TCP 5-tuple) */ u32 hit_timestamp; /* last time heavy-hitter was seen */ struct list_head flowchain; /* chaining under hash collision */ }; /* Weighted Deficit Round Robin (WDRR) scheduler */ struct wdrr_bucket { struct sk_buff *head; struct sk_buff *tail; struct list_head bucketchain; int deficit; }; struct hhf_sched_data { struct wdrr_bucket buckets[WDRR_BUCKET_CNT]; siphash_key_t perturbation; /* hash perturbation */ u32 quantum; /* psched_mtu(qdisc_dev(sch)); */ u32 drop_overlimit; /* number of times max qdisc packet * limit was hit */ struct list_head *hh_flows; /* table T (currently active HHs) */ u32 hh_flows_limit; /* max active HH allocs */ u32 hh_flows_overlimit; /* num of disallowed HH allocs */ u32 hh_flows_total_cnt; /* total admitted HHs */ u32 hh_flows_current_cnt; /* total current HHs */ u32 *hhf_arrays[HHF_ARRAYS_CNT]; /* HH filter F */ u32 hhf_arrays_reset_timestamp; /* last time hhf_arrays * was reset */ unsigned long *hhf_valid_bits[HHF_ARRAYS_CNT]; /* shadow valid bits * of hhf_arrays */ /* Similar to the "new_flows" vs. "old_flows" concept in fq_codel DRR */ struct list_head new_buckets; /* list of new buckets */ struct list_head old_buckets; /* list of old buckets */ /* Configurable HHF parameters */ u32 hhf_reset_timeout; /* interval to reset counter * arrays in filter F * (default 40ms) */ u32 hhf_admit_bytes; /* counter thresh to classify as * HH (default 128KB). * With these default values, * 128KB / 40ms = 25 Mbps * i.e., we expect to capture HHs * sending > 25 Mbps. */ u32 hhf_evict_timeout; /* aging threshold to evict idle * HHs out of table T. This should * be large enough to avoid * reordering during HH eviction. * (default 1s) */ u32 hhf_non_hh_weight; /* WDRR weight for non-HHs * (default 2, * i.e., non-HH : HH = 2 : 1) */ }; static u32 hhf_time_stamp(void) { return jiffies; } /* Looks up a heavy-hitter flow in a chaining list of table T. */ static struct hh_flow_state *seek_list(const u32 hash, struct list_head *head, struct hhf_sched_data *q) { struct hh_flow_state *flow, *next; u32 now = hhf_time_stamp(); if (list_empty(head)) return NULL; list_for_each_entry_safe(flow, next, head, flowchain) { u32 prev = flow->hit_timestamp + q->hhf_evict_timeout; if (hhf_time_before(prev, now)) { /* Delete expired heavy-hitters, but preserve one entry * to avoid kzalloc() when next time this slot is hit. */ if (list_is_last(&flow->flowchain, head)) return NULL; list_del(&flow->flowchain); kfree(flow); q->hh_flows_current_cnt--; } else if (flow->hash_id == hash) { return flow; } } return NULL; } /* Returns a flow state entry for a new heavy-hitter. Either reuses an expired * entry or dynamically alloc a new entry. */ static struct hh_flow_state *alloc_new_hh(struct list_head *head, struct hhf_sched_data *q) { struct hh_flow_state *flow; u32 now = hhf_time_stamp(); if (!list_empty(head)) { /* Find an expired heavy-hitter flow entry. */ list_for_each_entry(flow, head, flowchain) { u32 prev = flow->hit_timestamp + q->hhf_evict_timeout; if (hhf_time_before(prev, now)) return flow; } } if (q->hh_flows_current_cnt >= q->hh_flows_limit) { q->hh_flows_overlimit++; return NULL; } /* Create new entry. */ flow = kzalloc(sizeof(struct hh_flow_state), GFP_ATOMIC); if (!flow) return NULL; q->hh_flows_current_cnt++; INIT_LIST_HEAD(&flow->flowchain); list_add_tail(&flow->flowchain, head); return flow; } /* Assigns packets to WDRR buckets. Implements a multi-stage filter to * classify heavy-hitters. */ static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch) { struct hhf_sched_data *q = qdisc_priv(sch); u32 tmp_hash, hash; u32 xorsum, filter_pos[HHF_ARRAYS_CNT], flow_pos; struct hh_flow_state *flow; u32 pkt_len, min_hhf_val; int i; u32 prev; u32 now = hhf_time_stamp(); /* Reset the HHF counter arrays if this is the right time. */ prev = q->hhf_arrays_reset_timestamp + q->hhf_reset_timeout; if (hhf_time_before(prev, now)) { for (i = 0; i < HHF_ARRAYS_CNT; i++) bitmap_zero(q->hhf_valid_bits[i], HHF_ARRAYS_LEN); q->hhf_arrays_reset_timestamp = now; } /* Get hashed flow-id of the skb. */ hash = skb_get_hash_perturb(skb, &q->perturbation); /* Check if this packet belongs to an already established HH flow. */ flow_pos = hash & HHF_BIT_MASK; flow = seek_list(hash, &q->hh_flows[flow_pos], q); if (flow) { /* found its HH flow */ flow->hit_timestamp = now; return WDRR_BUCKET_FOR_HH; } /* Now pass the packet through the multi-stage filter. */ tmp_hash = hash; xorsum = 0; for (i = 0; i < HHF_ARRAYS_CNT - 1; i++) { /* Split the skb_hash into three 10-bit chunks. */ filter_pos[i] = tmp_hash & HHF_BIT_MASK; xorsum ^= filter_pos[i]; tmp_hash >>= HHF_BIT_MASK_LEN; } /* The last chunk is computed as XOR sum of other chunks. */ filter_pos[HHF_ARRAYS_CNT - 1] = xorsum ^ tmp_hash; pkt_len = qdisc_pkt_len(skb); min_hhf_val = ~0U; for (i = 0; i < HHF_ARRAYS_CNT; i++) { u32 val; if (!test_bit(filter_pos[i], q->hhf_valid_bits[i])) { q->hhf_arrays[i][filter_pos[i]] = 0; __set_bit(filter_pos[i], q->hhf_valid_bits[i]); } val = q->hhf_arrays[i][filter_pos[i]] + pkt_len; if (min_hhf_val > val) min_hhf_val = val; } /* Found a new HH iff all counter values > HH admit threshold. */ if (min_hhf_val > q->hhf_admit_bytes) { /* Just captured a new heavy-hitter. */ flow = alloc_new_hh(&q->hh_flows[flow_pos], q); if (!flow) /* memory alloc problem */ return WDRR_BUCKET_FOR_NON_HH; flow->hash_id = hash; flow->hit_timestamp = now; q->hh_flows_total_cnt++; /* By returning without updating counters in q->hhf_arrays, * we implicitly implement "shielding" (see Optimization O1). */ return WDRR_BUCKET_FOR_HH; } /* Conservative update of HHF arrays (see Optimization O2). */ for (i = 0; i < HHF_ARRAYS_CNT; i++) { if (q->hhf_arrays[i][filter_pos[i]] < min_hhf_val) q->hhf_arrays[i][filter_pos[i]] = min_hhf_val; } return WDRR_BUCKET_FOR_NON_HH; } /* Removes one skb from head of bucket. */ static struct sk_buff *dequeue_head(struct wdrr_bucket *bucket) { struct sk_buff *skb = bucket->head; bucket->head = skb->next; skb_mark_not_on_list(skb); return skb; } /* Tail-adds skb to bucket. */ static void bucket_add(struct wdrr_bucket *bucket, struct sk_buff *skb) { if (bucket->head == NULL) bucket->head = skb; else bucket->tail->next = skb; bucket->tail = skb; skb->next = NULL; } static unsigned int hhf_drop(struct Qdisc *sch, struct sk_buff **to_free) { struct hhf_sched_data *q = qdisc_priv(sch); struct wdrr_bucket *bucket; /* Always try to drop from heavy-hitters first. */ bucket = &q->buckets[WDRR_BUCKET_FOR_HH]; if (!bucket->head) bucket = &q->buckets[WDRR_BUCKET_FOR_NON_HH]; if (bucket->head) { struct sk_buff *skb = dequeue_head(bucket); sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); qdisc_drop(skb, sch, to_free); } /* Return id of the bucket from which the packet was dropped. */ return bucket - q->buckets; } static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct hhf_sched_data *q = qdisc_priv(sch); enum wdrr_bucket_idx idx; struct wdrr_bucket *bucket; unsigned int prev_backlog; idx = hhf_classify(skb, sch); bucket = &q->buckets[idx]; bucket_add(bucket, skb); qdisc_qstats_backlog_inc(sch, skb); if (list_empty(&bucket->bucketchain)) { unsigned int weight; /* The logic of new_buckets vs. old_buckets is the same as * new_flows vs. old_flows in the implementation of fq_codel, * i.e., short bursts of non-HHs should have strict priority. */ if (idx == WDRR_BUCKET_FOR_HH) { /* Always move heavy-hitters to old bucket. */ weight = 1; list_add_tail(&bucket->bucketchain, &q->old_buckets); } else { weight = q->hhf_non_hh_weight; list_add_tail(&bucket->bucketchain, &q->new_buckets); } bucket->deficit = weight * q->quantum; } if (++sch->q.qlen <= sch->limit) return NET_XMIT_SUCCESS; prev_backlog = sch->qstats.backlog; q->drop_overlimit++; /* Return Congestion Notification only if we dropped a packet from this * bucket. */ if (hhf_drop(sch, to_free) == idx) return NET_XMIT_CN; /* As we dropped a packet, better let upper stack know this. */ qdisc_tree_reduce_backlog(sch, 1, prev_backlog - sch->qstats.backlog); return NET_XMIT_SUCCESS; } static struct sk_buff *hhf_dequeue(struct Qdisc *sch) { struct hhf_sched_data *q = qdisc_priv(sch); struct sk_buff *skb = NULL; struct wdrr_bucket *bucket; struct list_head *head; begin: head = &q->new_buckets; if (list_empty(head)) { head = &q->old_buckets; if (list_empty(head)) return NULL; } bucket = list_first_entry(head, struct wdrr_bucket, bucketchain); if (bucket->deficit <= 0) { int weight = (bucket - q->buckets == WDRR_BUCKET_FOR_HH) ? 1 : q->hhf_non_hh_weight; bucket->deficit += weight * q->quantum; list_move_tail(&bucket->bucketchain, &q->old_buckets); goto begin; } if (bucket->head) { skb = dequeue_head(bucket); sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); } if (!skb) { /* Force a pass through old_buckets to prevent starvation. */ if ((head == &q->new_buckets) && !list_empty(&q->old_buckets)) list_move_tail(&bucket->bucketchain, &q->old_buckets); else list_del_init(&bucket->bucketchain); goto begin; } qdisc_bstats_update(sch, skb); bucket->deficit -= qdisc_pkt_len(skb); return skb; } static void hhf_reset(struct Qdisc *sch) { struct sk_buff *skb; while ((skb = hhf_dequeue(sch)) != NULL) rtnl_kfree_skbs(skb, skb); } static void hhf_destroy(struct Qdisc *sch) { int i; struct hhf_sched_data *q = qdisc_priv(sch); for (i = 0; i < HHF_ARRAYS_CNT; i++) { kvfree(q->hhf_arrays[i]); kvfree(q->hhf_valid_bits[i]); } if (!q->hh_flows) return; for (i = 0; i < HH_FLOWS_CNT; i++) { struct hh_flow_state *flow, *next; struct list_head *head = &q->hh_flows[i]; if (list_empty(head)) continue; list_for_each_entry_safe(flow, next, head, flowchain) { list_del(&flow->flowchain); kfree(flow); } } kvfree(q->hh_flows); } static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = { [TCA_HHF_BACKLOG_LIMIT] = { .type = NLA_U32 }, [TCA_HHF_QUANTUM] = { .type = NLA_U32 }, [TCA_HHF_HH_FLOWS_LIMIT] = { .type = NLA_U32 }, [TCA_HHF_RESET_TIMEOUT] = { .type = NLA_U32 }, [TCA_HHF_ADMIT_BYTES] = { .type = NLA_U32 }, [TCA_HHF_EVICT_TIMEOUT] = { .type = NLA_U32 }, [TCA_HHF_NON_HH_WEIGHT] = { .type = NLA_U32 }, }; static int hhf_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { unsigned int dropped_pkts = 0, dropped_bytes = 0; struct hhf_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_HHF_MAX + 1]; int err; u64 non_hh_quantum; u32 new_quantum = q->quantum; u32 new_hhf_non_hh_weight = q->hhf_non_hh_weight; err = nla_parse_nested_deprecated(tb, TCA_HHF_MAX, opt, hhf_policy, NULL); if (err < 0) return err; if (tb[TCA_HHF_QUANTUM]) new_quantum = nla_get_u32(tb[TCA_HHF_QUANTUM]); if (tb[TCA_HHF_NON_HH_WEIGHT]) new_hhf_non_hh_weight = nla_get_u32(tb[TCA_HHF_NON_HH_WEIGHT]); non_hh_quantum = (u64)new_quantum * new_hhf_non_hh_weight; if (non_hh_quantum == 0 || non_hh_quantum > INT_MAX) return -EINVAL; sch_tree_lock(sch); if (tb[TCA_HHF_BACKLOG_LIMIT]) WRITE_ONCE(sch->limit, nla_get_u32(tb[TCA_HHF_BACKLOG_LIMIT])); WRITE_ONCE(q->quantum, new_quantum); WRITE_ONCE(q->hhf_non_hh_weight, new_hhf_non_hh_weight); if (tb[TCA_HHF_HH_FLOWS_LIMIT]) WRITE_ONCE(q->hh_flows_limit, nla_get_u32(tb[TCA_HHF_HH_FLOWS_LIMIT])); if (tb[TCA_HHF_RESET_TIMEOUT]) { u32 us = nla_get_u32(tb[TCA_HHF_RESET_TIMEOUT]); WRITE_ONCE(q->hhf_reset_timeout, usecs_to_jiffies(us)); } if (tb[TCA_HHF_ADMIT_BYTES]) WRITE_ONCE(q->hhf_admit_bytes, nla_get_u32(tb[TCA_HHF_ADMIT_BYTES])); if (tb[TCA_HHF_EVICT_TIMEOUT]) { u32 us = nla_get_u32(tb[TCA_HHF_EVICT_TIMEOUT]); WRITE_ONCE(q->hhf_evict_timeout, usecs_to_jiffies(us)); } while (sch->q.qlen > sch->limit) { struct sk_buff *skb = qdisc_dequeue_internal(sch, false); if (!skb) break; dropped_pkts++; dropped_bytes += qdisc_pkt_len(skb); rtnl_kfree_skbs(skb, skb); } qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); sch_tree_unlock(sch); return 0; } static int hhf_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct hhf_sched_data *q = qdisc_priv(sch); int i; sch->limit = 1000; q->quantum = psched_mtu(qdisc_dev(sch)); get_random_bytes(&q->perturbation, sizeof(q->perturbation)); INIT_LIST_HEAD(&q->new_buckets); INIT_LIST_HEAD(&q->old_buckets); /* Configurable HHF parameters */ q->hhf_reset_timeout = HZ / 25; /* 40 ms */ q->hhf_admit_bytes = 131072; /* 128 KB */ q->hhf_evict_timeout = HZ; /* 1 sec */ q->hhf_non_hh_weight = 2; if (opt) { int err = hhf_change(sch, opt, extack); if (err) return err; } if (!q->hh_flows) { /* Initialize heavy-hitter flow table. */ q->hh_flows = kvcalloc(HH_FLOWS_CNT, sizeof(struct list_head), GFP_KERNEL); if (!q->hh_flows) return -ENOMEM; for (i = 0; i < HH_FLOWS_CNT; i++) INIT_LIST_HEAD(&q->hh_flows[i]); /* Cap max active HHs at twice len of hh_flows table. */ q->hh_flows_limit = 2 * HH_FLOWS_CNT; q->hh_flows_overlimit = 0; q->hh_flows_total_cnt = 0; q->hh_flows_current_cnt = 0; /* Initialize heavy-hitter filter arrays. */ for (i = 0; i < HHF_ARRAYS_CNT; i++) { q->hhf_arrays[i] = kvcalloc(HHF_ARRAYS_LEN, sizeof(u32), GFP_KERNEL); if (!q->hhf_arrays[i]) { /* Note: hhf_destroy() will be called * by our caller. */ return -ENOMEM; } } q->hhf_arrays_reset_timestamp = hhf_time_stamp(); /* Initialize valid bits of heavy-hitter filter arrays. */ for (i = 0; i < HHF_ARRAYS_CNT; i++) { q->hhf_valid_bits[i] = kvzalloc(HHF_ARRAYS_LEN / BITS_PER_BYTE, GFP_KERNEL); if (!q->hhf_valid_bits[i]) { /* Note: hhf_destroy() will be called * by our caller. */ return -ENOMEM; } } /* Initialize Weighted DRR buckets. */ for (i = 0; i < WDRR_BUCKET_CNT; i++) { struct wdrr_bucket *bucket = q->buckets + i; INIT_LIST_HEAD(&bucket->bucketchain); } } return 0; } static int hhf_dump(struct Qdisc *sch, struct sk_buff *skb) { struct hhf_sched_data *q = qdisc_priv(sch); struct nlattr *opts; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; if (nla_put_u32(skb, TCA_HHF_BACKLOG_LIMIT, READ_ONCE(sch->limit)) || nla_put_u32(skb, TCA_HHF_QUANTUM, READ_ONCE(q->quantum)) || nla_put_u32(skb, TCA_HHF_HH_FLOWS_LIMIT, READ_ONCE(q->hh_flows_limit)) || nla_put_u32(skb, TCA_HHF_RESET_TIMEOUT, jiffies_to_usecs(READ_ONCE(q->hhf_reset_timeout))) || nla_put_u32(skb, TCA_HHF_ADMIT_BYTES, READ_ONCE(q->hhf_admit_bytes)) || nla_put_u32(skb, TCA_HHF_EVICT_TIMEOUT, jiffies_to_usecs(READ_ONCE(q->hhf_evict_timeout))) || nla_put_u32(skb, TCA_HHF_NON_HH_WEIGHT, READ_ONCE(q->hhf_non_hh_weight))) goto nla_put_failure; return nla_nest_end(skb, opts); nla_put_failure: return -1; } static int hhf_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct hhf_sched_data *q = qdisc_priv(sch); struct tc_hhf_xstats st = { .drop_overlimit = q->drop_overlimit, .hh_overlimit = q->hh_flows_overlimit, .hh_tot_count = q->hh_flows_total_cnt, .hh_cur_count = q->hh_flows_current_cnt, }; return gnet_stats_copy_app(d, &st, sizeof(st)); } static struct Qdisc_ops hhf_qdisc_ops __read_mostly = { .id = "hhf", .priv_size = sizeof(struct hhf_sched_data), .enqueue = hhf_enqueue, .dequeue = hhf_dequeue, .peek = qdisc_peek_dequeued, .init = hhf_init, .reset = hhf_reset, .destroy = hhf_destroy, .change = hhf_change, .dump = hhf_dump, .dump_stats = hhf_dump_stats, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("hhf"); static int __init hhf_module_init(void) { return register_qdisc(&hhf_qdisc_ops); } static void __exit hhf_module_exit(void) { unregister_qdisc(&hhf_qdisc_ops); } module_init(hhf_module_init) module_exit(hhf_module_exit) MODULE_AUTHOR("Terry Lam"); MODULE_AUTHOR("Nandita Dukkipati"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Heavy-Hitter Filter (HHF)");
62 63 63 63 49 49 49 49 49 38 63 63 63 62 33 18 63 60 60 60 60 59 32 7 60 60 60 60 60 60 60 60 5 5 5 5 32 59 60 60 63 62 1 9 3 6 9 9 9 5 5 5 3 2 5 3 3 2 1 5 9 9 6 3 3 2 3 1 4 4 4 4 2 4 4 3 4 4 2 2 1 2 4 1 1 1 1 1 1 1 1 1 1 1 1 2 2 50 50 50 49 48 50 48 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 /* * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/kernel.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/export.h> #include <net/ipv6.h> #include <net/inet6_hashtables.h> #include <net/addrconf.h> #include "rds.h" #include "loop.h" #define RDS_CONNECTION_HASH_BITS 12 #define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS) #define RDS_CONNECTION_HASH_MASK (RDS_CONNECTION_HASH_ENTRIES - 1) /* converting this to RCU is a chore for another day.. */ static DEFINE_SPINLOCK(rds_conn_lock); static unsigned long rds_conn_count; static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES]; static struct kmem_cache *rds_conn_slab; static struct hlist_head *rds_conn_bucket(const struct in6_addr *laddr, const struct in6_addr *faddr) { static u32 rds6_hash_secret __read_mostly; static u32 rds_hash_secret __read_mostly; __be32 lhash, fhash; u32 hash; net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret)); net_get_random_once(&rds6_hash_secret, sizeof(rds6_hash_secret)); lhash = laddr->s6_addr32[3]; #if IS_ENABLED(CONFIG_IPV6) fhash = (__force __be32)__ipv6_addr_jhash(faddr, rds6_hash_secret); #else fhash = faddr->s6_addr32[3]; #endif hash = __inet_ehashfn(lhash, 0, fhash, 0, rds_hash_secret); return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK]; } #define rds_conn_info_set(var, test, suffix) do { \ if (test) \ var |= RDS_INFO_CONNECTION_FLAG_##suffix; \ } while (0) /* rcu read lock must be held or the connection spinlock */ static struct rds_connection *rds_conn_lookup(struct net *net, struct hlist_head *head, const struct in6_addr *laddr, const struct in6_addr *faddr, struct rds_transport *trans, u8 tos, int dev_if) { struct rds_connection *conn, *ret = NULL; hlist_for_each_entry_rcu(conn, head, c_hash_node) { if (ipv6_addr_equal(&conn->c_faddr, faddr) && ipv6_addr_equal(&conn->c_laddr, laddr) && conn->c_trans == trans && conn->c_tos == tos && net == rds_conn_net(conn) && conn->c_dev_if == dev_if) { ret = conn; break; } } rdsdebug("returning conn %p for %pI6c -> %pI6c\n", ret, laddr, faddr); return ret; } /* * This is called by transports as they're bringing down a connection. * It clears partial message state so that the transport can start sending * and receiving over this connection again in the future. It is up to * the transport to have serialized this call with its send and recv. */ static void rds_conn_path_reset(struct rds_conn_path *cp) { struct rds_connection *conn = cp->cp_conn; rdsdebug("connection %pI6c to %pI6c reset\n", &conn->c_laddr, &conn->c_faddr); rds_stats_inc(s_conn_reset); rds_send_path_reset(cp); cp->cp_flags = 0; /* Do not clear next_rx_seq here, else we cannot distinguish * retransmitted packets from new packets, and will hand all * of them to the application. That is not consistent with the * reliability guarantees of RDS. */ } static void __rds_conn_path_init(struct rds_connection *conn, struct rds_conn_path *cp, bool is_outgoing) { spin_lock_init(&cp->cp_lock); cp->cp_next_tx_seq = 1; init_waitqueue_head(&cp->cp_waitq); INIT_LIST_HEAD(&cp->cp_send_queue); INIT_LIST_HEAD(&cp->cp_retrans); cp->cp_conn = conn; atomic_set(&cp->cp_state, RDS_CONN_DOWN); cp->cp_send_gen = 0; cp->cp_reconnect_jiffies = 0; cp->cp_conn->c_proposed_version = RDS_PROTOCOL_VERSION; INIT_DELAYED_WORK(&cp->cp_send_w, rds_send_worker); INIT_DELAYED_WORK(&cp->cp_recv_w, rds_recv_worker); INIT_DELAYED_WORK(&cp->cp_conn_w, rds_connect_worker); INIT_WORK(&cp->cp_down_w, rds_shutdown_worker); mutex_init(&cp->cp_cm_lock); cp->cp_flags = 0; } /* * There is only every one 'conn' for a given pair of addresses in the * system at a time. They contain messages to be retransmitted and so * span the lifetime of the actual underlying transport connections. * * For now they are not garbage collected once they're created. They * are torn down as the module is removed, if ever. */ static struct rds_connection *__rds_conn_create(struct net *net, const struct in6_addr *laddr, const struct in6_addr *faddr, struct rds_transport *trans, gfp_t gfp, u8 tos, int is_outgoing, int dev_if) { struct rds_connection *conn, *parent = NULL; struct hlist_head *head = rds_conn_bucket(laddr, faddr); struct rds_transport *loop_trans; unsigned long flags; int ret, i; int npaths = (trans->t_mp_capable ? RDS_MPATH_WORKERS : 1); rcu_read_lock(); conn = rds_conn_lookup(net, head, laddr, faddr, trans, tos, dev_if); if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && ipv6_addr_equal(laddr, faddr) && !is_outgoing) { /* This is a looped back IB connection, and we're * called by the code handling the incoming connect. * We need a second connection object into which we * can stick the other QP. */ parent = conn; conn = parent->c_passive; } rcu_read_unlock(); if (conn) goto out; conn = kmem_cache_zalloc(rds_conn_slab, gfp); if (!conn) { conn = ERR_PTR(-ENOMEM); goto out; } conn->c_path = kcalloc(npaths, sizeof(struct rds_conn_path), gfp); if (!conn->c_path) { kmem_cache_free(rds_conn_slab, conn); conn = ERR_PTR(-ENOMEM); goto out; } INIT_HLIST_NODE(&conn->c_hash_node); conn->c_laddr = *laddr; conn->c_isv6 = !ipv6_addr_v4mapped(laddr); conn->c_faddr = *faddr; conn->c_dev_if = dev_if; conn->c_tos = tos; #if IS_ENABLED(CONFIG_IPV6) /* If the local address is link local, set c_bound_if to be the * index used for this connection. Otherwise, set it to 0 as * the socket is not bound to an interface. c_bound_if is used * to look up a socket when a packet is received */ if (ipv6_addr_type(laddr) & IPV6_ADDR_LINKLOCAL) conn->c_bound_if = dev_if; else #endif conn->c_bound_if = 0; rds_conn_net_set(conn, net); ret = rds_cong_get_maps(conn); if (ret) { kfree(conn->c_path); kmem_cache_free(rds_conn_slab, conn); conn = ERR_PTR(ret); goto out; } /* * This is where a connection becomes loopback. If *any* RDS sockets * can bind to the destination address then we'd rather the messages * flow through loopback rather than either transport. */ loop_trans = rds_trans_get_preferred(net, faddr, conn->c_dev_if); if (loop_trans) { rds_trans_put(loop_trans); conn->c_loopback = 1; if (trans->t_prefer_loopback) { if (likely(is_outgoing)) { /* "outgoing" connection to local address. * Protocol says it wants the connection * handled by the loopback transport. * This is what TCP does. */ trans = &rds_loop_transport; } else { /* No transport currently in use * should end up here, but if it * does, reset/destroy the connection. */ kfree(conn->c_path); kmem_cache_free(rds_conn_slab, conn); conn = ERR_PTR(-EOPNOTSUPP); goto out; } } } conn->c_trans = trans; init_waitqueue_head(&conn->c_hs_waitq); for (i = 0; i < npaths; i++) { __rds_conn_path_init(conn, &conn->c_path[i], is_outgoing); conn->c_path[i].cp_index = i; } rcu_read_lock(); if (rds_destroy_pending(conn)) ret = -ENETDOWN; else ret = trans->conn_alloc(conn, GFP_ATOMIC); if (ret) { rcu_read_unlock(); kfree(conn->c_path); kmem_cache_free(rds_conn_slab, conn); conn = ERR_PTR(ret); goto out; } rdsdebug("allocated conn %p for %pI6c -> %pI6c over %s %s\n", conn, laddr, faddr, strnlen(trans->t_name, sizeof(trans->t_name)) ? trans->t_name : "[unknown]", is_outgoing ? "(outgoing)" : ""); /* * Since we ran without holding the conn lock, someone could * have created the same conn (either normal or passive) in the * interim. We check while holding the lock. If we won, we complete * init and return our conn. If we lost, we rollback and return the * other one. */ spin_lock_irqsave(&rds_conn_lock, flags); if (parent) { /* Creating passive conn */ if (parent->c_passive) { trans->conn_free(conn->c_path[0].cp_transport_data); kfree(conn->c_path); kmem_cache_free(rds_conn_slab, conn); conn = parent->c_passive; } else { parent->c_passive = conn; rds_cong_add_conn(conn); rds_conn_count++; } } else { /* Creating normal conn */ struct rds_connection *found; found = rds_conn_lookup(net, head, laddr, faddr, trans, tos, dev_if); if (found) { struct rds_conn_path *cp; int i; for (i = 0; i < npaths; i++) { cp = &conn->c_path[i]; /* The ->conn_alloc invocation may have * allocated resource for all paths, so all * of them may have to be freed here. */ if (cp->cp_transport_data) trans->conn_free(cp->cp_transport_data); } kfree(conn->c_path); kmem_cache_free(rds_conn_slab, conn); conn = found; } else { conn->c_my_gen_num = rds_gen_num; conn->c_peer_gen_num = 0; hlist_add_head_rcu(&conn->c_hash_node, head); rds_cong_add_conn(conn); rds_conn_count++; } } spin_unlock_irqrestore(&rds_conn_lock, flags); rcu_read_unlock(); out: return conn; } struct rds_connection *rds_conn_create(struct net *net, const struct in6_addr *laddr, const struct in6_addr *faddr, struct rds_transport *trans, u8 tos, gfp_t gfp, int dev_if) { return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 0, dev_if); } EXPORT_SYMBOL_GPL(rds_conn_create); struct rds_connection *rds_conn_create_outgoing(struct net *net, const struct in6_addr *laddr, const struct in6_addr *faddr, struct rds_transport *trans, u8 tos, gfp_t gfp, int dev_if) { return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 1, dev_if); } EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); void rds_conn_shutdown(struct rds_conn_path *cp) { struct rds_connection *conn = cp->cp_conn; /* shut it down unless it's down already */ if (!rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_DOWN)) { /* * Quiesce the connection mgmt handlers before we start tearing * things down. We don't hold the mutex for the entire * duration of the shutdown operation, else we may be * deadlocking with the CM handler. Instead, the CM event * handler is supposed to check for state DISCONNECTING */ mutex_lock(&cp->cp_cm_lock); if (!rds_conn_path_transition(cp, RDS_CONN_UP, RDS_CONN_DISCONNECTING) && !rds_conn_path_transition(cp, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) { rds_conn_path_error(cp, "shutdown called in state %d\n", atomic_read(&cp->cp_state)); mutex_unlock(&cp->cp_cm_lock); return; } mutex_unlock(&cp->cp_cm_lock); wait_event(cp->cp_waitq, !test_bit(RDS_IN_XMIT, &cp->cp_flags)); wait_event(cp->cp_waitq, !test_bit(RDS_RECV_REFILL, &cp->cp_flags)); conn->c_trans->conn_path_shutdown(cp); rds_conn_path_reset(cp); if (!rds_conn_path_transition(cp, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN) && !rds_conn_path_transition(cp, RDS_CONN_ERROR, RDS_CONN_DOWN)) { /* This can happen - eg when we're in the middle of tearing * down the connection, and someone unloads the rds module. * Quite reproducible with loopback connections. * Mostly harmless. * * Note that this also happens with rds-tcp because * we could have triggered rds_conn_path_drop in irq * mode from rds_tcp_state change on the receipt of * a FIN, thus we need to recheck for RDS_CONN_ERROR * here. */ rds_conn_path_error(cp, "%s: failed to transition " "to state DOWN, current state " "is %d\n", __func__, atomic_read(&cp->cp_state)); return; } } /* Then reconnect if it's still live. * The passive side of an IB loopback connection is never added * to the conn hash, so we never trigger a reconnect on this * conn - the reconnect is always triggered by the active peer. */ cancel_delayed_work_sync(&cp->cp_conn_w); rcu_read_lock(); if (!hlist_unhashed(&conn->c_hash_node)) { rcu_read_unlock(); rds_queue_reconnect(cp); } else { rcu_read_unlock(); } } /* destroy a single rds_conn_path. rds_conn_destroy() iterates over * all paths using rds_conn_path_destroy() */ static void rds_conn_path_destroy(struct rds_conn_path *cp) { struct rds_message *rm, *rtmp; if (!cp->cp_transport_data) return; /* make sure lingering queued work won't try to ref the conn */ cancel_delayed_work_sync(&cp->cp_send_w); cancel_delayed_work_sync(&cp->cp_recv_w); rds_conn_path_drop(cp, true); flush_work(&cp->cp_down_w); /* tear down queued messages */ list_for_each_entry_safe(rm, rtmp, &cp->cp_send_queue, m_conn_item) { list_del_init(&rm->m_conn_item); BUG_ON(!list_empty(&rm->m_sock_item)); rds_message_put(rm); } if (cp->cp_xmit_rm) rds_message_put(cp->cp_xmit_rm); WARN_ON(delayed_work_pending(&cp->cp_send_w)); WARN_ON(delayed_work_pending(&cp->cp_recv_w)); WARN_ON(delayed_work_pending(&cp->cp_conn_w)); WARN_ON(work_pending(&cp->cp_down_w)); cp->cp_conn->c_trans->conn_free(cp->cp_transport_data); } /* * Stop and free a connection. * * This can only be used in very limited circumstances. It assumes that once * the conn has been shutdown that no one else is referencing the connection. * We can only ensure this in the rmmod path in the current code. */ void rds_conn_destroy(struct rds_connection *conn) { unsigned long flags; int i; struct rds_conn_path *cp; int npaths = (conn->c_trans->t_mp_capable ? RDS_MPATH_WORKERS : 1); rdsdebug("freeing conn %p for %pI4 -> " "%pI4\n", conn, &conn->c_laddr, &conn->c_faddr); /* Ensure conn will not be scheduled for reconnect */ spin_lock_irq(&rds_conn_lock); hlist_del_init_rcu(&conn->c_hash_node); spin_unlock_irq(&rds_conn_lock); synchronize_rcu(); /* shut the connection down */ for (i = 0; i < npaths; i++) { cp = &conn->c_path[i]; rds_conn_path_destroy(cp); BUG_ON(!list_empty(&cp->cp_retrans)); } /* * The congestion maps aren't freed up here. They're * freed by rds_cong_exit() after all the connections * have been freed. */ rds_cong_remove_conn(conn); kfree(conn->c_path); kmem_cache_free(rds_conn_slab, conn); spin_lock_irqsave(&rds_conn_lock, flags); rds_conn_count--; spin_unlock_irqrestore(&rds_conn_lock, flags); } EXPORT_SYMBOL_GPL(rds_conn_destroy); static void __rds_inc_msg_cp(struct rds_incoming *inc, struct rds_info_iterator *iter, void *saddr, void *daddr, int flip, bool isv6) { #if IS_ENABLED(CONFIG_IPV6) if (isv6) rds6_inc_info_copy(inc, iter, saddr, daddr, flip); else #endif rds_inc_info_copy(inc, iter, *(__be32 *)saddr, *(__be32 *)daddr, flip); } static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, int want_send, bool isv6) { struct hlist_head *head; struct list_head *list; struct rds_connection *conn; struct rds_message *rm; unsigned int total = 0; unsigned long flags; size_t i; int j; if (isv6) len /= sizeof(struct rds6_info_message); else len /= sizeof(struct rds_info_message); rcu_read_lock(); for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); i++, head++) { hlist_for_each_entry_rcu(conn, head, c_hash_node) { struct rds_conn_path *cp; int npaths; if (!isv6 && conn->c_isv6) continue; npaths = (conn->c_trans->t_mp_capable ? RDS_MPATH_WORKERS : 1); for (j = 0; j < npaths; j++) { cp = &conn->c_path[j]; if (want_send) list = &cp->cp_send_queue; else list = &cp->cp_retrans; spin_lock_irqsave(&cp->cp_lock, flags); /* XXX too lazy to maintain counts.. */ list_for_each_entry(rm, list, m_conn_item) { total++; if (total <= len) __rds_inc_msg_cp(&rm->m_inc, iter, &conn->c_laddr, &conn->c_faddr, 0, isv6); } spin_unlock_irqrestore(&cp->cp_lock, flags); } } } rcu_read_unlock(); lens->nr = total; if (isv6) lens->each = sizeof(struct rds6_info_message); else lens->each = sizeof(struct rds_info_message); } static void rds_conn_message_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, int want_send) { rds_conn_message_info_cmn(sock, len, iter, lens, want_send, false); } #if IS_ENABLED(CONFIG_IPV6) static void rds6_conn_message_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, int want_send) { rds_conn_message_info_cmn(sock, len, iter, lens, want_send, true); } #endif static void rds_conn_message_info_send(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { rds_conn_message_info(sock, len, iter, lens, 1); } #if IS_ENABLED(CONFIG_IPV6) static void rds6_conn_message_info_send(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { rds6_conn_message_info(sock, len, iter, lens, 1); } #endif static void rds_conn_message_info_retrans(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { rds_conn_message_info(sock, len, iter, lens, 0); } #if IS_ENABLED(CONFIG_IPV6) static void rds6_conn_message_info_retrans(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { rds6_conn_message_info(sock, len, iter, lens, 0); } #endif void rds_for_each_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, int (*visitor)(struct rds_connection *, void *), u64 *buffer, size_t item_len) { struct hlist_head *head; struct rds_connection *conn; size_t i; rcu_read_lock(); lens->nr = 0; lens->each = item_len; for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); i++, head++) { hlist_for_each_entry_rcu(conn, head, c_hash_node) { /* XXX no c_lock usage.. */ if (!visitor(conn, buffer)) continue; /* We copy as much as we can fit in the buffer, * but we count all items so that the caller * can resize the buffer. */ if (len >= item_len) { rds_info_copy(iter, buffer, item_len); len -= item_len; } lens->nr++; } } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rds_for_each_conn_info); static void rds_walk_conn_path_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, int (*visitor)(struct rds_conn_path *, void *), u64 *buffer, size_t item_len) { struct hlist_head *head; struct rds_connection *conn; size_t i; rcu_read_lock(); lens->nr = 0; lens->each = item_len; for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); i++, head++) { hlist_for_each_entry_rcu(conn, head, c_hash_node) { struct rds_conn_path *cp; /* XXX We only copy the information from the first * path for now. The problem is that if there are * more than one underlying paths, we cannot report * information of all of them using the existing * API. For example, there is only one next_tx_seq, * which path's next_tx_seq should we report? It is * a bug in the design of MPRDS. */ cp = conn->c_path; /* XXX no cp_lock usage.. */ if (!visitor(cp, buffer)) continue; /* We copy as much as we can fit in the buffer, * but we count all items so that the caller * can resize the buffer. */ if (len >= item_len) { rds_info_copy(iter, buffer, item_len); len -= item_len; } lens->nr++; } } rcu_read_unlock(); } static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer) { struct rds_info_connection *cinfo = buffer; struct rds_connection *conn = cp->cp_conn; if (conn->c_isv6) return 0; cinfo->next_tx_seq = cp->cp_next_tx_seq; cinfo->next_rx_seq = cp->cp_next_rx_seq; cinfo->laddr = conn->c_laddr.s6_addr32[3]; cinfo->faddr = conn->c_faddr.s6_addr32[3]; cinfo->tos = conn->c_tos; strscpy_pad(cinfo->transport, conn->c_trans->t_name); cinfo->flags = 0; rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags), SENDING); /* XXX Future: return the state rather than these funky bits */ rds_conn_info_set(cinfo->flags, atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING, CONNECTING); rds_conn_info_set(cinfo->flags, atomic_read(&cp->cp_state) == RDS_CONN_UP, CONNECTED); return 1; } #if IS_ENABLED(CONFIG_IPV6) static int rds6_conn_info_visitor(struct rds_conn_path *cp, void *buffer) { struct rds6_info_connection *cinfo6 = buffer; struct rds_connection *conn = cp->cp_conn; cinfo6->next_tx_seq = cp->cp_next_tx_seq; cinfo6->next_rx_seq = cp->cp_next_rx_seq; cinfo6->laddr = conn->c_laddr; cinfo6->faddr = conn->c_faddr; strscpy_pad(cinfo6->transport, conn->c_trans->t_name); cinfo6->flags = 0; rds_conn_info_set(cinfo6->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags), SENDING); /* XXX Future: return the state rather than these funky bits */ rds_conn_info_set(cinfo6->flags, atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING, CONNECTING); rds_conn_info_set(cinfo6->flags, atomic_read(&cp->cp_state) == RDS_CONN_UP, CONNECTED); /* Just return 1 as there is no error case. This is a helper function * for rds_walk_conn_path_info() and it wants a return value. */ return 1; } #endif static void rds_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { u64 buffer[(sizeof(struct rds_info_connection) + 7) / 8]; rds_walk_conn_path_info(sock, len, iter, lens, rds_conn_info_visitor, buffer, sizeof(struct rds_info_connection)); } #if IS_ENABLED(CONFIG_IPV6) static void rds6_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { u64 buffer[(sizeof(struct rds6_info_connection) + 7) / 8]; rds_walk_conn_path_info(sock, len, iter, lens, rds6_conn_info_visitor, buffer, sizeof(struct rds6_info_connection)); } #endif int rds_conn_init(void) { int ret; ret = rds_loop_net_init(); /* register pernet callback */ if (ret) return ret; rds_conn_slab = KMEM_CACHE(rds_connection, 0); if (!rds_conn_slab) { rds_loop_net_exit(); return -ENOMEM; } rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info); rds_info_register_func(RDS_INFO_SEND_MESSAGES, rds_conn_message_info_send); rds_info_register_func(RDS_INFO_RETRANS_MESSAGES, rds_conn_message_info_retrans); #if IS_ENABLED(CONFIG_IPV6) rds_info_register_func(RDS6_INFO_CONNECTIONS, rds6_conn_info); rds_info_register_func(RDS6_INFO_SEND_MESSAGES, rds6_conn_message_info_send); rds_info_register_func(RDS6_INFO_RETRANS_MESSAGES, rds6_conn_message_info_retrans); #endif return 0; } void rds_conn_exit(void) { rds_loop_net_exit(); /* unregister pernet callback */ rds_loop_exit(); WARN_ON(!hlist_empty(rds_conn_hash)); kmem_cache_destroy(rds_conn_slab); rds_info_deregister_func(RDS_INFO_CONNECTIONS, rds_conn_info); rds_info_deregister_func(RDS_INFO_SEND_MESSAGES, rds_conn_message_info_send); rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES, rds_conn_message_info_retrans); #if IS_ENABLED(CONFIG_IPV6) rds_info_deregister_func(RDS6_INFO_CONNECTIONS, rds6_conn_info); rds_info_deregister_func(RDS6_INFO_SEND_MESSAGES, rds6_conn_message_info_send); rds_info_deregister_func(RDS6_INFO_RETRANS_MESSAGES, rds6_conn_message_info_retrans); #endif } /* * Force a disconnect */ void rds_conn_path_drop(struct rds_conn_path *cp, bool destroy) { atomic_set(&cp->cp_state, RDS_CONN_ERROR); rcu_read_lock(); if (!destroy && rds_destroy_pending(cp->cp_conn)) { rcu_read_unlock(); return; } queue_work(rds_wq, &cp->cp_down_w); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rds_conn_path_drop); void rds_conn_drop(struct rds_connection *conn) { WARN_ON(conn->c_trans->t_mp_capable); rds_conn_path_drop(&conn->c_path[0], false); } EXPORT_SYMBOL_GPL(rds_conn_drop); /* * If the connection is down, trigger a connect. We may have scheduled a * delayed reconnect however - in this case we should not interfere. */ void rds_conn_path_connect_if_down(struct rds_conn_path *cp) { rcu_read_lock(); if (rds_destroy_pending(cp->cp_conn)) { rcu_read_unlock(); return; } if (rds_conn_path_state(cp) == RDS_CONN_DOWN && !test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags)) queue_delayed_work(rds_wq, &cp->cp_conn_w, 0); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rds_conn_path_connect_if_down); /* Check connectivity of all paths */ void rds_check_all_paths(struct rds_connection *conn) { int i = 0; do { rds_conn_path_connect_if_down(&conn->c_path[i]); } while (++i < conn->c_npaths); } void rds_conn_connect_if_down(struct rds_connection *conn) { WARN_ON(conn->c_trans->t_mp_capable); rds_conn_path_connect_if_down(&conn->c_path[0]); } EXPORT_SYMBOL_GPL(rds_conn_connect_if_down); void __rds_conn_path_error(struct rds_conn_path *cp, const char *fmt, ...) { va_list ap; va_start(ap, fmt); vprintk(fmt, ap); va_end(ap); rds_conn_path_drop(cp, false); }
9 8 7 6 5 9 4 3 3 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> * * Development of this code funded by Astaro AG (http://www.astaro.com/) */ #include <linux/unaligned.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> struct nft_byteorder { u8 sreg; u8 dreg; enum nft_byteorder_ops op:8; u8 len; u8 size; }; void nft_byteorder_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_byteorder *priv = nft_expr_priv(expr); u32 *src = &regs->data[priv->sreg]; u32 *dst = &regs->data[priv->dreg]; u16 *s16, *d16; unsigned int i; s16 = (void *)src; d16 = (void *)dst; switch (priv->size) { case 8: { u64 *dst64 = (void *)dst; u64 src64; switch (priv->op) { case NFT_BYTEORDER_NTOH: for (i = 0; i < priv->len / 8; i++) { src64 = nft_reg_load64(&src[i]); nft_reg_store64(&dst64[i], be64_to_cpu((__force __be64)src64)); } break; case NFT_BYTEORDER_HTON: for (i = 0; i < priv->len / 8; i++) { src64 = (__force __u64) cpu_to_be64(nft_reg_load64(&src[i])); nft_reg_store64(&dst64[i], src64); } break; } break; } case 4: switch (priv->op) { case NFT_BYTEORDER_NTOH: for (i = 0; i < priv->len / 4; i++) dst[i] = ntohl((__force __be32)src[i]); break; case NFT_BYTEORDER_HTON: for (i = 0; i < priv->len / 4; i++) dst[i] = (__force __u32)htonl(src[i]); break; } break; case 2: switch (priv->op) { case NFT_BYTEORDER_NTOH: for (i = 0; i < priv->len / 2; i++) d16[i] = ntohs((__force __be16)s16[i]); break; case NFT_BYTEORDER_HTON: for (i = 0; i < priv->len / 2; i++) d16[i] = (__force __u16)htons(s16[i]); break; } break; } } static const struct nla_policy nft_byteorder_policy[NFTA_BYTEORDER_MAX + 1] = { [NFTA_BYTEORDER_SREG] = { .type = NLA_U32 }, [NFTA_BYTEORDER_DREG] = { .type = NLA_U32 }, [NFTA_BYTEORDER_OP] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_BYTEORDER_LEN] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_BYTEORDER_SIZE] = NLA_POLICY_MAX(NLA_BE32, 255), }; static int nft_byteorder_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_byteorder *priv = nft_expr_priv(expr); u32 size, len; int err; if (tb[NFTA_BYTEORDER_SREG] == NULL || tb[NFTA_BYTEORDER_DREG] == NULL || tb[NFTA_BYTEORDER_LEN] == NULL || tb[NFTA_BYTEORDER_SIZE] == NULL || tb[NFTA_BYTEORDER_OP] == NULL) return -EINVAL; priv->op = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_OP])); switch (priv->op) { case NFT_BYTEORDER_NTOH: case NFT_BYTEORDER_HTON: break; default: return -EINVAL; } err = nft_parse_u32_check(tb[NFTA_BYTEORDER_SIZE], U8_MAX, &size); if (err < 0) return err; priv->size = size; switch (priv->size) { case 2: case 4: case 8: break; default: return -EINVAL; } err = nft_parse_u32_check(tb[NFTA_BYTEORDER_LEN], U8_MAX, &len); if (err < 0) return err; priv->len = len; err = nft_parse_register_load(ctx, tb[NFTA_BYTEORDER_SREG], &priv->sreg, priv->len); if (err < 0) return err; return nft_parse_register_store(ctx, tb[NFTA_BYTEORDER_DREG], &priv->dreg, NULL, NFT_DATA_VALUE, priv->len); } static int nft_byteorder_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_byteorder *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_BYTEORDER_SREG, priv->sreg)) goto nla_put_failure; if (nft_dump_register(skb, NFTA_BYTEORDER_DREG, priv->dreg)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_BYTEORDER_OP, htonl(priv->op))) goto nla_put_failure; if (nla_put_be32(skb, NFTA_BYTEORDER_LEN, htonl(priv->len))) goto nla_put_failure; if (nla_put_be32(skb, NFTA_BYTEORDER_SIZE, htonl(priv->size))) goto nla_put_failure; return 0; nla_put_failure: return -1; } static bool nft_byteorder_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { struct nft_byteorder *priv = nft_expr_priv(expr); nft_reg_track_cancel(track, priv->dreg, priv->len); return false; } static const struct nft_expr_ops nft_byteorder_ops = { .type = &nft_byteorder_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_byteorder)), .eval = nft_byteorder_eval, .init = nft_byteorder_init, .dump = nft_byteorder_dump, .reduce = nft_byteorder_reduce, }; struct nft_expr_type nft_byteorder_type __read_mostly = { .name = "byteorder", .ops = &nft_byteorder_ops, .policy = nft_byteorder_policy, .maxattr = NFTA_BYTEORDER_MAX, .owner = THIS_MODULE, };
5892 5082 1304 1305 5579 32 4 215 554 358 1 539 206 360 360 357 357 525 186 198 198 13 12 12 13 19 19 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 // SPDX-License-Identifier: GPL-2.0 /* * Kernel internal schedule timeout and sleeping functions */ #include <linux/delay.h> #include <linux/jiffies.h> #include <linux/timer.h> #include <linux/sched/signal.h> #include <linux/sched/debug.h> #include "tick-internal.h" /* * Since schedule_timeout()'s timer is defined on the stack, it must store * the target task on the stack as well. */ struct process_timer { struct timer_list timer; struct task_struct *task; }; static void process_timeout(struct timer_list *t) { struct process_timer *timeout = timer_container_of(timeout, t, timer); wake_up_process(timeout->task); } /** * schedule_timeout - sleep until timeout * @timeout: timeout value in jiffies * * Make the current task sleep until @timeout jiffies have elapsed. * The function behavior depends on the current task state * (see also set_current_state() description): * * %TASK_RUNNING - the scheduler is called, but the task does not sleep * at all. That happens because sched_submit_work() does nothing for * tasks in %TASK_RUNNING state. * * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to * pass before the routine returns unless the current task is explicitly * woken up, (e.g. by wake_up_process()). * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is * delivered to the current task or the current task is explicitly woken * up. * * The current task state is guaranteed to be %TASK_RUNNING when this * routine returns. * * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule * the CPU away without a bound on the timeout. In this case the return * value will be %MAX_SCHEDULE_TIMEOUT. * * Returns: 0 when the timer has expired otherwise the remaining time in * jiffies will be returned. In all cases the return value is guaranteed * to be non-negative. */ signed long __sched schedule_timeout(signed long timeout) { struct process_timer timer; unsigned long expire; switch (timeout) { case MAX_SCHEDULE_TIMEOUT: /* * These two special cases are useful to be comfortable * in the caller. Nothing more. We could take * MAX_SCHEDULE_TIMEOUT from one of the negative value * but I' d like to return a valid offset (>=0) to allow * the caller to do everything it want with the retval. */ schedule(); goto out; default: /* * Another bit of PARANOID. Note that the retval will be * 0 since no piece of kernel is supposed to do a check * for a negative retval of schedule_timeout() (since it * should never happens anyway). You just have the printk() * that will tell you if something is gone wrong and where. */ if (timeout < 0) { pr_err("%s: wrong timeout value %lx\n", __func__, timeout); dump_stack(); __set_current_state(TASK_RUNNING); goto out; } } expire = timeout + jiffies; timer.task = current; timer_setup_on_stack(&timer.timer, process_timeout, 0); timer.timer.expires = expire; add_timer(&timer.timer); schedule(); timer_delete_sync(&timer.timer); /* Remove the timer from the object tracker */ timer_destroy_on_stack(&timer.timer); timeout = expire - jiffies; out: return timeout < 0 ? 0 : timeout; } EXPORT_SYMBOL(schedule_timeout); /* * __set_current_state() can be used in schedule_timeout_*() functions, because * schedule_timeout() calls schedule() unconditionally. */ /** * schedule_timeout_interruptible - sleep until timeout (interruptible) * @timeout: timeout value in jiffies * * See schedule_timeout() for details. * * Task state is set to TASK_INTERRUPTIBLE before starting the timeout. */ signed long __sched schedule_timeout_interruptible(signed long timeout) { __set_current_state(TASK_INTERRUPTIBLE); return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_interruptible); /** * schedule_timeout_killable - sleep until timeout (killable) * @timeout: timeout value in jiffies * * See schedule_timeout() for details. * * Task state is set to TASK_KILLABLE before starting the timeout. */ signed long __sched schedule_timeout_killable(signed long timeout) { __set_current_state(TASK_KILLABLE); return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_killable); /** * schedule_timeout_uninterruptible - sleep until timeout (uninterruptible) * @timeout: timeout value in jiffies * * See schedule_timeout() for details. * * Task state is set to TASK_UNINTERRUPTIBLE before starting the timeout. */ signed long __sched schedule_timeout_uninterruptible(signed long timeout) { __set_current_state(TASK_UNINTERRUPTIBLE); return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_uninterruptible); /** * schedule_timeout_idle - sleep until timeout (idle) * @timeout: timeout value in jiffies * * See schedule_timeout() for details. * * Task state is set to TASK_IDLE before starting the timeout. It is similar to * schedule_timeout_uninterruptible(), except this task will not contribute to * load average. */ signed long __sched schedule_timeout_idle(signed long timeout) { __set_current_state(TASK_IDLE); return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_idle); /** * schedule_hrtimeout_range_clock - sleep until timeout * @expires: timeout value (ktime_t) * @delta: slack in expires timeout (ktime_t) * @mode: timer mode * @clock_id: timer clock to be used * * Details are explained in schedule_hrtimeout_range() function description as * this function is commonly used. */ int __sched schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, const enum hrtimer_mode mode, clockid_t clock_id) { struct hrtimer_sleeper t; /* * Optimize when a zero timeout value is given. It does not * matter whether this is an absolute or a relative time. */ if (expires && *expires == 0) { __set_current_state(TASK_RUNNING); return 0; } /* * A NULL parameter means "infinite" */ if (!expires) { schedule(); return -EINTR; } hrtimer_setup_sleeper_on_stack(&t, clock_id, mode); hrtimer_set_expires_range_ns(&t.timer, *expires, delta); hrtimer_sleeper_start_expires(&t, mode); if (likely(t.task)) schedule(); hrtimer_cancel(&t.timer); destroy_hrtimer_on_stack(&t.timer); __set_current_state(TASK_RUNNING); return !t.task ? 0 : -EINTR; } EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock); /** * schedule_hrtimeout_range - sleep until timeout * @expires: timeout value (ktime_t) * @delta: slack in expires timeout (ktime_t) * @mode: timer mode * * Make the current task sleep until the given expiry time has * elapsed. The routine will return immediately unless * the current task state has been set (see set_current_state()). * * The @delta argument gives the kernel the freedom to schedule the * actual wakeup to a time that is both power and performance friendly * for regular (non RT/DL) tasks. * The kernel give the normal best effort behavior for "@expires+@delta", * but may decide to fire the timer earlier, but no earlier than @expires. * * You can set the task state as follows - * * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to * pass before the routine returns unless the current task is explicitly * woken up, (e.g. by wake_up_process()). * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is * delivered to the current task or the current task is explicitly woken * up. * * The current task state is guaranteed to be TASK_RUNNING when this * routine returns. * * Returns: 0 when the timer has expired. If the task was woken before the * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or * by an explicit wakeup, it returns -EINTR. */ int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode) { return schedule_hrtimeout_range_clock(expires, delta, mode, CLOCK_MONOTONIC); } EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); /** * schedule_hrtimeout - sleep until timeout * @expires: timeout value (ktime_t) * @mode: timer mode * * See schedule_hrtimeout_range() for details. @delta argument of * schedule_hrtimeout_range() is set to 0 and has therefore no impact. */ int __sched schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode) { return schedule_hrtimeout_range(expires, 0, mode); } EXPORT_SYMBOL_GPL(schedule_hrtimeout); /** * msleep - sleep safely even with waitqueue interruptions * @msecs: Requested sleep duration in milliseconds * * msleep() uses jiffy based timeouts for the sleep duration. Because of the * design of the timer wheel, the maximum additional percentage delay (slack) is * 12.5%. This is only valid for timers which will end up in level 1 or a higher * level of the timer wheel. For explanation of those 12.5% please check the * detailed description about the basics of the timer wheel. * * The slack of timers which will end up in level 0 depends on sleep duration * (msecs) and HZ configuration and can be calculated in the following way (with * the timer wheel design restriction that the slack is not less than 12.5%): * * ``slack = MSECS_PER_TICK / msecs`` * * When the allowed slack of the callsite is known, the calculation could be * turned around to find the minimal allowed sleep duration to meet the * constraints. For example: * * * ``HZ=1000`` with ``slack=25%``: ``MSECS_PER_TICK / slack = 1 / (1/4) = 4``: * all sleep durations greater or equal 4ms will meet the constraints. * * ``HZ=1000`` with ``slack=12.5%``: ``MSECS_PER_TICK / slack = 1 / (1/8) = 8``: * all sleep durations greater or equal 8ms will meet the constraints. * * ``HZ=250`` with ``slack=25%``: ``MSECS_PER_TICK / slack = 4 / (1/4) = 16``: * all sleep durations greater or equal 16ms will meet the constraints. * * ``HZ=250`` with ``slack=12.5%``: ``MSECS_PER_TICK / slack = 4 / (1/8) = 32``: * all sleep durations greater or equal 32ms will meet the constraints. * * See also the signal aware variant msleep_interruptible(). */ void msleep(unsigned int msecs) { unsigned long timeout = msecs_to_jiffies(msecs); while (timeout) timeout = schedule_timeout_uninterruptible(timeout); } EXPORT_SYMBOL(msleep); /** * msleep_interruptible - sleep waiting for signals * @msecs: Requested sleep duration in milliseconds * * See msleep() for some basic information. * * The difference between msleep() and msleep_interruptible() is that the sleep * could be interrupted by a signal delivery and then returns early. * * Returns: The remaining time of the sleep duration transformed to msecs (see * schedule_timeout() for details). */ unsigned long msleep_interruptible(unsigned int msecs) { unsigned long timeout = msecs_to_jiffies(msecs); while (timeout && !signal_pending(current)) timeout = schedule_timeout_interruptible(timeout); return jiffies_to_msecs(timeout); } EXPORT_SYMBOL(msleep_interruptible); /** * usleep_range_state - Sleep for an approximate time in a given state * @min: Minimum time in usecs to sleep * @max: Maximum time in usecs to sleep * @state: State of the current task that will be while sleeping * * usleep_range_state() sleeps at least for the minimum specified time but not * longer than the maximum specified amount of time. The range might reduce * power usage by allowing hrtimers to coalesce an already scheduled interrupt * with this hrtimer. In the worst case, an interrupt is scheduled for the upper * bound. * * The sleeping task is set to the specified state before starting the sleep. * * In non-atomic context where the exact wakeup time is flexible, use * usleep_range() or its variants instead of udelay(). The sleep improves * responsiveness by avoiding the CPU-hogging busy-wait of udelay(). */ void __sched usleep_range_state(unsigned long min, unsigned long max, unsigned int state) { ktime_t exp = ktime_add_us(ktime_get(), min); u64 delta = (u64)(max - min) * NSEC_PER_USEC; if (WARN_ON_ONCE(max < min)) delta = 0; for (;;) { __set_current_state(state); /* Do not return before the requested sleep time has elapsed */ if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS)) break; } } EXPORT_SYMBOL(usleep_range_state);
9849 9846 9832 596 9845 206 150 28 80 442 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 // SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause) /* Copyright (C) 2016-2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. * * SipHash: a fast short-input PRF * https://131002.net/siphash/ * * This implementation is specifically for SipHash2-4 for a secure PRF * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for * hashtables. */ #include <linux/siphash.h> #include <linux/unaligned.h> #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 #include <linux/dcache.h> #include <asm/word-at-a-time.h> #endif #define SIPROUND SIPHASH_PERMUTATION(v0, v1, v2, v3) #define PREAMBLE(len) \ u64 v0 = SIPHASH_CONST_0; \ u64 v1 = SIPHASH_CONST_1; \ u64 v2 = SIPHASH_CONST_2; \ u64 v3 = SIPHASH_CONST_3; \ u64 b = ((u64)(len)) << 56; \ v3 ^= key->key[1]; \ v2 ^= key->key[0]; \ v1 ^= key->key[1]; \ v0 ^= key->key[0]; #define POSTAMBLE \ v3 ^= b; \ SIPROUND; \ SIPROUND; \ v0 ^= b; \ v2 ^= 0xff; \ SIPROUND; \ SIPROUND; \ SIPROUND; \ SIPROUND; \ return (v0 ^ v1) ^ (v2 ^ v3); #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); const u8 left = len & (sizeof(u64) - 1); u64 m; PREAMBLE(len) for (; data != end; data += sizeof(u64)) { m = le64_to_cpup(data); v3 ^= m; SIPROUND; SIPROUND; v0 ^= m; } #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 if (left) b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & bytemask_from_count(left))); #else switch (left) { case 7: b |= ((u64)end[6]) << 48; fallthrough; case 6: b |= ((u64)end[5]) << 40; fallthrough; case 5: b |= ((u64)end[4]) << 32; fallthrough; case 4: b |= le32_to_cpup(data); break; case 3: b |= ((u64)end[2]) << 16; fallthrough; case 2: b |= le16_to_cpup(data); break; case 1: b |= end[0]; } #endif POSTAMBLE } EXPORT_SYMBOL(__siphash_aligned); #endif u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); const u8 left = len & (sizeof(u64) - 1); u64 m; PREAMBLE(len) for (; data != end; data += sizeof(u64)) { m = get_unaligned_le64(data); v3 ^= m; SIPROUND; SIPROUND; v0 ^= m; } #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 if (left) b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & bytemask_from_count(left))); #else switch (left) { case 7: b |= ((u64)end[6]) << 48; fallthrough; case 6: b |= ((u64)end[5]) << 40; fallthrough; case 5: b |= ((u64)end[4]) << 32; fallthrough; case 4: b |= get_unaligned_le32(end); break; case 3: b |= ((u64)end[2]) << 16; fallthrough; case 2: b |= get_unaligned_le16(end); break; case 1: b |= end[0]; } #endif POSTAMBLE } EXPORT_SYMBOL(__siphash_unaligned); /** * siphash_1u64 - compute 64-bit siphash PRF value of a u64 * @first: first u64 * @key: the siphash key */ u64 siphash_1u64(const u64 first, const siphash_key_t *key) { PREAMBLE(8) v3 ^= first; SIPROUND; SIPROUND; v0 ^= first; POSTAMBLE } EXPORT_SYMBOL(siphash_1u64); /** * siphash_2u64 - compute 64-bit siphash PRF value of 2 u64 * @first: first u64 * @second: second u64 * @key: the siphash key */ u64 siphash_2u64(const u64 first, const u64 second, const siphash_key_t *key) { PREAMBLE(16) v3 ^= first; SIPROUND; SIPROUND; v0 ^= first; v3 ^= second; SIPROUND; SIPROUND; v0 ^= second; POSTAMBLE } EXPORT_SYMBOL(siphash_2u64); /** * siphash_3u64 - compute 64-bit siphash PRF value of 3 u64 * @first: first u64 * @second: second u64 * @third: third u64 * @key: the siphash key */ u64 siphash_3u64(const u64 first, const u64 second, const u64 third, const siphash_key_t *key) { PREAMBLE(24) v3 ^= first; SIPROUND; SIPROUND; v0 ^= first; v3 ^= second; SIPROUND; SIPROUND; v0 ^= second; v3 ^= third; SIPROUND; SIPROUND; v0 ^= third; POSTAMBLE } EXPORT_SYMBOL(siphash_3u64); /** * siphash_4u64 - compute 64-bit siphash PRF value of 4 u64 * @first: first u64 * @second: second u64 * @third: third u64 * @forth: forth u64 * @key: the siphash key */ u64 siphash_4u64(const u64 first, const u64 second, const u64 third, const u64 forth, const siphash_key_t *key) { PREAMBLE(32) v3 ^= first; SIPROUND; SIPROUND; v0 ^= first; v3 ^= second; SIPROUND; SIPROUND; v0 ^= second; v3 ^= third; SIPROUND; SIPROUND; v0 ^= third; v3 ^= forth; SIPROUND; SIPROUND; v0 ^= forth; POSTAMBLE } EXPORT_SYMBOL(siphash_4u64); u64 siphash_1u32(const u32 first, const siphash_key_t *key) { PREAMBLE(4) b |= first; POSTAMBLE } EXPORT_SYMBOL(siphash_1u32); u64 siphash_3u32(const u32 first, const u32 second, const u32 third, const siphash_key_t *key) { u64 combined = (u64)second << 32 | first; PREAMBLE(12) v3 ^= combined; SIPROUND; SIPROUND; v0 ^= combined; b |= third; POSTAMBLE } EXPORT_SYMBOL(siphash_3u32); #if BITS_PER_LONG == 64 /* Note that on 64-bit, we make HalfSipHash1-3 actually be SipHash1-3, for * performance reasons. On 32-bit, below, we actually implement HalfSipHash1-3. */ #define HSIPROUND SIPROUND #define HPREAMBLE(len) PREAMBLE(len) #define HPOSTAMBLE \ v3 ^= b; \ HSIPROUND; \ v0 ^= b; \ v2 ^= 0xff; \ HSIPROUND; \ HSIPROUND; \ HSIPROUND; \ return (v0 ^ v1) ^ (v2 ^ v3); #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); const u8 left = len & (sizeof(u64) - 1); u64 m; HPREAMBLE(len) for (; data != end; data += sizeof(u64)) { m = le64_to_cpup(data); v3 ^= m; HSIPROUND; v0 ^= m; } #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 if (left) b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & bytemask_from_count(left))); #else switch (left) { case 7: b |= ((u64)end[6]) << 48; fallthrough; case 6: b |= ((u64)end[5]) << 40; fallthrough; case 5: b |= ((u64)end[4]) << 32; fallthrough; case 4: b |= le32_to_cpup(data); break; case 3: b |= ((u64)end[2]) << 16; fallthrough; case 2: b |= le16_to_cpup(data); break; case 1: b |= end[0]; } #endif HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_aligned); #endif u32 __hsiphash_unaligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); const u8 left = len & (sizeof(u64) - 1); u64 m; HPREAMBLE(len) for (; data != end; data += sizeof(u64)) { m = get_unaligned_le64(data); v3 ^= m; HSIPROUND; v0 ^= m; } #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 if (left) b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & bytemask_from_count(left))); #else switch (left) { case 7: b |= ((u64)end[6]) << 48; fallthrough; case 6: b |= ((u64)end[5]) << 40; fallthrough; case 5: b |= ((u64)end[4]) << 32; fallthrough; case 4: b |= get_unaligned_le32(end); break; case 3: b |= ((u64)end[2]) << 16; fallthrough; case 2: b |= get_unaligned_le16(end); break; case 1: b |= end[0]; } #endif HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_unaligned); /** * hsiphash_1u32 - compute 64-bit hsiphash PRF value of a u32 * @first: first u32 * @key: the hsiphash key */ u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key) { HPREAMBLE(4) b |= first; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_1u32); /** * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32 * @first: first u32 * @second: second u32 * @key: the hsiphash key */ u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key) { u64 combined = (u64)second << 32 | first; HPREAMBLE(8) v3 ^= combined; HSIPROUND; v0 ^= combined; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_2u32); /** * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32 * @first: first u32 * @second: second u32 * @third: third u32 * @key: the hsiphash key */ u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third, const hsiphash_key_t *key) { u64 combined = (u64)second << 32 | first; HPREAMBLE(12) v3 ^= combined; HSIPROUND; v0 ^= combined; b |= third; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_3u32); /** * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32 * @first: first u32 * @second: second u32 * @third: third u32 * @forth: forth u32 * @key: the hsiphash key */ u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third, const u32 forth, const hsiphash_key_t *key) { u64 combined = (u64)second << 32 | first; HPREAMBLE(16) v3 ^= combined; HSIPROUND; v0 ^= combined; combined = (u64)forth << 32 | third; v3 ^= combined; HSIPROUND; v0 ^= combined; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_4u32); #else #define HSIPROUND HSIPHASH_PERMUTATION(v0, v1, v2, v3) #define HPREAMBLE(len) \ u32 v0 = HSIPHASH_CONST_0; \ u32 v1 = HSIPHASH_CONST_1; \ u32 v2 = HSIPHASH_CONST_2; \ u32 v3 = HSIPHASH_CONST_3; \ u32 b = ((u32)(len)) << 24; \ v3 ^= key->key[1]; \ v2 ^= key->key[0]; \ v1 ^= key->key[1]; \ v0 ^= key->key[0]; #define HPOSTAMBLE \ v3 ^= b; \ HSIPROUND; \ v0 ^= b; \ v2 ^= 0xff; \ HSIPROUND; \ HSIPROUND; \ HSIPROUND; \ return v1 ^ v3; #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u32)); const u8 left = len & (sizeof(u32) - 1); u32 m; HPREAMBLE(len) for (; data != end; data += sizeof(u32)) { m = le32_to_cpup(data); v3 ^= m; HSIPROUND; v0 ^= m; } switch (left) { case 3: b |= ((u32)end[2]) << 16; fallthrough; case 2: b |= le16_to_cpup(data); break; case 1: b |= end[0]; } HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_aligned); #endif u32 __hsiphash_unaligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u32)); const u8 left = len & (sizeof(u32) - 1); u32 m; HPREAMBLE(len) for (; data != end; data += sizeof(u32)) { m = get_unaligned_le32(data); v3 ^= m; HSIPROUND; v0 ^= m; } switch (left) { case 3: b |= ((u32)end[2]) << 16; fallthrough; case 2: b |= get_unaligned_le16(end); break; case 1: b |= end[0]; } HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_unaligned); /** * hsiphash_1u32 - compute 32-bit hsiphash PRF value of a u32 * @first: first u32 * @key: the hsiphash key */ u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key) { HPREAMBLE(4) v3 ^= first; HSIPROUND; v0 ^= first; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_1u32); /** * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32 * @first: first u32 * @second: second u32 * @key: the hsiphash key */ u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key) { HPREAMBLE(8) v3 ^= first; HSIPROUND; v0 ^= first; v3 ^= second; HSIPROUND; v0 ^= second; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_2u32); /** * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32 * @first: first u32 * @second: second u32 * @third: third u32 * @key: the hsiphash key */ u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third, const hsiphash_key_t *key) { HPREAMBLE(12) v3 ^= first; HSIPROUND; v0 ^= first; v3 ^= second; HSIPROUND; v0 ^= second; v3 ^= third; HSIPROUND; v0 ^= third; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_3u32); /** * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32 * @first: first u32 * @second: second u32 * @third: third u32 * @forth: forth u32 * @key: the hsiphash key */ u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third, const u32 forth, const hsiphash_key_t *key) { HPREAMBLE(16) v3 ^= first; HSIPROUND; v0 ^= first; v3 ^= second; HSIPROUND; v0 ^= second; v3 ^= third; HSIPROUND; v0 ^= third; v3 ^= forth; HSIPROUND; v0 ^= forth; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_4u32); #endif
116 116 116 42 42 41 95 95 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 // SPDX-License-Identifier: GPL-2.0 #include <linux/proc_fs.h> #include <linux/ethtool.h> #include <linux/export.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/bonding.h> #include "bonding_priv.h" static void *bond_info_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct bonding *bond = pde_data(file_inode(seq->file)); struct list_head *iter; struct slave *slave; loff_t off = 0; rcu_read_lock(); if (*pos == 0) return SEQ_START_TOKEN; bond_for_each_slave_rcu(bond, slave, iter) if (++off == *pos) return slave; return NULL; } static void *bond_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct bonding *bond = pde_data(file_inode(seq->file)); struct list_head *iter; struct slave *slave; bool found = false; ++*pos; if (v == SEQ_START_TOKEN) return bond_first_slave_rcu(bond); bond_for_each_slave_rcu(bond, slave, iter) { if (found) return slave; if (slave == v) found = true; } return NULL; } static void bond_info_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static void bond_info_show_master(struct seq_file *seq) { struct bonding *bond = pde_data(file_inode(seq->file)); const struct bond_opt_value *optval; struct slave *curr, *primary; int i; curr = rcu_dereference(bond->curr_active_slave); seq_printf(seq, "Bonding Mode: %s", bond_mode_name(BOND_MODE(bond))); if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP && bond->params.fail_over_mac) { optval = bond_opt_get_val(BOND_OPT_FAIL_OVER_MAC, bond->params.fail_over_mac); seq_printf(seq, " (fail_over_mac %s)", optval->string); } seq_printf(seq, "\n"); if (bond_mode_uses_xmit_hash(bond)) { optval = bond_opt_get_val(BOND_OPT_XMIT_HASH, bond->params.xmit_policy); seq_printf(seq, "Transmit Hash Policy: %s (%d)\n", optval->string, bond->params.xmit_policy); } if (bond_uses_primary(bond)) { primary = rcu_dereference(bond->primary_slave); seq_printf(seq, "Primary Slave: %s", primary ? primary->dev->name : "None"); if (primary) { optval = bond_opt_get_val(BOND_OPT_PRIMARY_RESELECT, bond->params.primary_reselect); seq_printf(seq, " (primary_reselect %s)", optval->string); } seq_printf(seq, "\nCurrently Active Slave: %s\n", (curr) ? curr->dev->name : "None"); } seq_printf(seq, "MII Status: %s\n", netif_carrier_ok(bond->dev) ? "up" : "down"); seq_printf(seq, "MII Polling Interval (ms): %d\n", bond->params.miimon); seq_printf(seq, "Up Delay (ms): %d\n", bond->params.updelay * bond->params.miimon); seq_printf(seq, "Down Delay (ms): %d\n", bond->params.downdelay * bond->params.miimon); seq_printf(seq, "Peer Notification Delay (ms): %d\n", bond->params.peer_notif_delay * bond->params.miimon); /* ARP information */ if (bond->params.arp_interval > 0) { int printed = 0; seq_printf(seq, "ARP Polling Interval (ms): %d\n", bond->params.arp_interval); seq_printf(seq, "ARP Missed Max: %u\n", bond->params.missed_max); seq_printf(seq, "ARP IP target/s (n.n.n.n form):"); for (i = 0; (i < BOND_MAX_ARP_TARGETS); i++) { if (!bond->params.arp_targets[i]) break; if (printed) seq_printf(seq, ","); seq_printf(seq, " %pI4", &bond->params.arp_targets[i]); printed = 1; } seq_printf(seq, "\n"); #if IS_ENABLED(CONFIG_IPV6) printed = 0; seq_printf(seq, "NS IPv6 target/s (xx::xx form):"); for (i = 0; (i < BOND_MAX_NS_TARGETS); i++) { if (ipv6_addr_any(&bond->params.ns_targets[i])) break; if (printed) seq_printf(seq, ","); seq_printf(seq, " %pI6c", &bond->params.ns_targets[i]); printed = 1; } seq_printf(seq, "\n"); #endif } if (BOND_MODE(bond) == BOND_MODE_8023AD) { struct ad_info ad_info; seq_puts(seq, "\n802.3ad info\n"); seq_printf(seq, "LACP active: %s\n", (bond->params.lacp_active) ? "on" : "off"); seq_printf(seq, "LACP rate: %s\n", (bond->params.lacp_fast) ? "fast" : "slow"); seq_printf(seq, "Min links: %d\n", bond->params.min_links); optval = bond_opt_get_val(BOND_OPT_AD_SELECT, bond->params.ad_select); seq_printf(seq, "Aggregator selection policy (ad_select): %s\n", optval->string); if (capable(CAP_NET_ADMIN)) { seq_printf(seq, "System priority: %d\n", BOND_AD_INFO(bond).system.sys_priority); seq_printf(seq, "System MAC address: %pM\n", &BOND_AD_INFO(bond).system.sys_mac_addr); if (__bond_3ad_get_active_agg_info(bond, &ad_info)) { seq_printf(seq, "bond %s has no active aggregator\n", bond->dev->name); } else { seq_printf(seq, "Active Aggregator Info:\n"); seq_printf(seq, "\tAggregator ID: %d\n", ad_info.aggregator_id); seq_printf(seq, "\tNumber of ports: %d\n", ad_info.ports); seq_printf(seq, "\tActor Key: %d\n", ad_info.actor_key); seq_printf(seq, "\tPartner Key: %d\n", ad_info.partner_key); seq_printf(seq, "\tPartner Mac Address: %pM\n", ad_info.partner_system); } } } } static void bond_info_show_slave(struct seq_file *seq, const struct slave *slave) { struct bonding *bond = pde_data(file_inode(seq->file)); seq_printf(seq, "\nSlave Interface: %s\n", slave->dev->name); seq_printf(seq, "MII Status: %s\n", bond_slave_link_status(slave->link)); if (slave->speed == SPEED_UNKNOWN) seq_printf(seq, "Speed: %s\n", "Unknown"); else seq_printf(seq, "Speed: %d Mbps\n", slave->speed); if (slave->duplex == DUPLEX_UNKNOWN) seq_printf(seq, "Duplex: %s\n", "Unknown"); else seq_printf(seq, "Duplex: %s\n", slave->duplex ? "full" : "half"); seq_printf(seq, "Link Failure Count: %u\n", slave->link_failure_count); seq_printf(seq, "Permanent HW addr: %*phC\n", slave->dev->addr_len, slave->perm_hwaddr); seq_printf(seq, "Slave queue ID: %d\n", READ_ONCE(slave->queue_id)); if (BOND_MODE(bond) == BOND_MODE_8023AD) { const struct port *port = &SLAVE_AD_INFO(slave)->port; const struct aggregator *agg = port->aggregator; if (agg) { seq_printf(seq, "Aggregator ID: %d\n", agg->aggregator_identifier); seq_printf(seq, "Actor Churn State: %s\n", bond_3ad_churn_desc(port->sm_churn_actor_state)); seq_printf(seq, "Partner Churn State: %s\n", bond_3ad_churn_desc(port->sm_churn_partner_state)); seq_printf(seq, "Actor Churned Count: %d\n", port->churn_actor_count); seq_printf(seq, "Partner Churned Count: %d\n", port->churn_partner_count); if (capable(CAP_NET_ADMIN)) { seq_puts(seq, "details actor lacp pdu:\n"); seq_printf(seq, " system priority: %d\n", port->actor_system_priority); seq_printf(seq, " system mac address: %pM\n", &port->actor_system); seq_printf(seq, " port key: %d\n", port->actor_oper_port_key); seq_printf(seq, " port priority: %d\n", port->actor_port_priority); seq_printf(seq, " port number: %d\n", port->actor_port_number); seq_printf(seq, " port state: %d\n", port->actor_oper_port_state); seq_puts(seq, "details partner lacp pdu:\n"); seq_printf(seq, " system priority: %d\n", port->partner_oper.system_priority); seq_printf(seq, " system mac address: %pM\n", &port->partner_oper.system); seq_printf(seq, " oper key: %d\n", port->partner_oper.key); seq_printf(seq, " port priority: %d\n", port->partner_oper.port_priority); seq_printf(seq, " port number: %d\n", port->partner_oper.port_number); seq_printf(seq, " port state: %d\n", port->partner_oper.port_state); } } else { seq_puts(seq, "Aggregator ID: N/A\n"); } } } static int bond_info_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_printf(seq, "%s\n", bond_version); bond_info_show_master(seq); } else bond_info_show_slave(seq, v); return 0; } static const struct seq_operations bond_info_seq_ops = { .start = bond_info_seq_start, .next = bond_info_seq_next, .stop = bond_info_seq_stop, .show = bond_info_seq_show, }; void bond_create_proc_entry(struct bonding *bond) { struct net_device *bond_dev = bond->dev; struct bond_net *bn = net_generic(dev_net(bond_dev), bond_net_id); if (bn->proc_dir) { bond->proc_entry = proc_create_seq_data(bond_dev->name, 0444, bn->proc_dir, &bond_info_seq_ops, bond); if (bond->proc_entry == NULL) netdev_warn(bond_dev, "Cannot create /proc/net/%s/%s\n", DRV_NAME, bond_dev->name); else memcpy(bond->proc_file_name, bond_dev->name, IFNAMSIZ); } } void bond_remove_proc_entry(struct bonding *bond) { struct net_device *bond_dev = bond->dev; struct bond_net *bn = net_generic(dev_net(bond_dev), bond_net_id); if (bn->proc_dir && bond->proc_entry) { remove_proc_entry(bond->proc_file_name, bn->proc_dir); memset(bond->proc_file_name, 0, IFNAMSIZ); bond->proc_entry = NULL; } } /* Create the bonding directory under /proc/net, if doesn't exist yet. * Caller must hold rtnl_lock. */ void __net_init bond_create_proc_dir(struct bond_net *bn) { if (!bn->proc_dir) { bn->proc_dir = proc_mkdir(DRV_NAME, bn->net->proc_net); if (!bn->proc_dir) pr_warn("Warning: Cannot create /proc/net/%s\n", DRV_NAME); } } /* Destroy the bonding directory under /proc/net, if empty. */ void __net_exit bond_destroy_proc_dir(struct bond_net *bn) { if (bn->proc_dir) { remove_proc_entry(DRV_NAME, bn->net->proc_net); bn->proc_dir = NULL; } }
13 3 3 3 3 56 10 10 10 10 58 3 3 3 3 3 3 3 3 7 7 7 7 2 3 3 3 3 3 3 2 3 3 3 3 3 2 3 3 3 3 3 3 3 1 1 3 3 3 3 3 3 2 3 3 3 3 3 2 3 2 3 3 3 3 3 2 3 3 3 3 2 3 3 3 3 3 2 2 3 3 2 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 2 3 3 3 7 7 7 7 7 7 7 7 2 7 3 3 3 3 3 3 3 3 2 3 3 2 3 3 3 2 3 3 3 3 2 3 2 3 3 2 3 3 3 3 3 3 3 3 3 3 3 52 52 52 52 52 52 52 52 52 52 52 52 52 52 52 52 40 8 55 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2010-2013 Felix Fietkau <nbd@openwrt.org> * Copyright (C) 2019-2022 Intel Corporation */ #include <linux/netdevice.h> #include <linux/types.h> #include <linux/skbuff.h> #include <linux/debugfs.h> #include <linux/random.h> #include <linux/moduleparam.h> #include <linux/ieee80211.h> #include <linux/minmax.h> #include <net/mac80211.h> #include "rate.h" #include "sta_info.h" #include "rc80211_minstrel_ht.h" #define AVG_AMPDU_SIZE 16 #define AVG_PKT_SIZE 1200 /* Number of bits for an average sized packet */ #define MCS_NBITS ((AVG_PKT_SIZE * AVG_AMPDU_SIZE) << 3) /* Number of symbols for a packet with (bps) bits per symbol */ #define MCS_NSYMS(bps) DIV_ROUND_UP(MCS_NBITS, (bps)) /* Transmission time (nanoseconds) for a packet containing (syms) symbols */ #define MCS_SYMBOL_TIME(sgi, syms) \ (sgi ? \ ((syms) * 18000 + 4000) / 5 : /* syms * 3.6 us */ \ ((syms) * 1000) << 2 /* syms * 4 us */ \ ) /* Transmit duration for the raw data part of an average sized packet */ #define MCS_DURATION(streams, sgi, bps) \ (MCS_SYMBOL_TIME(sgi, MCS_NSYMS((streams) * (bps))) / AVG_AMPDU_SIZE) #define BW_20 0 #define BW_40 1 #define BW_80 2 /* * Define group sort order: HT40 -> SGI -> #streams */ #define GROUP_IDX(_streams, _sgi, _ht40) \ MINSTREL_HT_GROUP_0 + \ MINSTREL_MAX_STREAMS * 2 * _ht40 + \ MINSTREL_MAX_STREAMS * _sgi + \ _streams - 1 #define _MAX(a, b) (((a)>(b))?(a):(b)) #define GROUP_SHIFT(duration) \ _MAX(0, 16 - __builtin_clz(duration)) /* MCS rate information for an MCS group */ #define __MCS_GROUP(_streams, _sgi, _ht40, _s) \ [GROUP_IDX(_streams, _sgi, _ht40)] = { \ .streams = _streams, \ .shift = _s, \ .bw = _ht40, \ .flags = \ IEEE80211_TX_RC_MCS | \ (_sgi ? IEEE80211_TX_RC_SHORT_GI : 0) | \ (_ht40 ? IEEE80211_TX_RC_40_MHZ_WIDTH : 0), \ .duration = { \ MCS_DURATION(_streams, _sgi, _ht40 ? 54 : 26) >> _s, \ MCS_DURATION(_streams, _sgi, _ht40 ? 108 : 52) >> _s, \ MCS_DURATION(_streams, _sgi, _ht40 ? 162 : 78) >> _s, \ MCS_DURATION(_streams, _sgi, _ht40 ? 216 : 104) >> _s, \ MCS_DURATION(_streams, _sgi, _ht40 ? 324 : 156) >> _s, \ MCS_DURATION(_streams, _sgi, _ht40 ? 432 : 208) >> _s, \ MCS_DURATION(_streams, _sgi, _ht40 ? 486 : 234) >> _s, \ MCS_DURATION(_streams, _sgi, _ht40 ? 540 : 260) >> _s \ } \ } #define MCS_GROUP_SHIFT(_streams, _sgi, _ht40) \ GROUP_SHIFT(MCS_DURATION(_streams, _sgi, _ht40 ? 54 : 26)) #define MCS_GROUP(_streams, _sgi, _ht40) \ __MCS_GROUP(_streams, _sgi, _ht40, \ MCS_GROUP_SHIFT(_streams, _sgi, _ht40)) #define VHT_GROUP_IDX(_streams, _sgi, _bw) \ (MINSTREL_VHT_GROUP_0 + \ MINSTREL_MAX_STREAMS * 2 * (_bw) + \ MINSTREL_MAX_STREAMS * (_sgi) + \ (_streams) - 1) #define BW2VBPS(_bw, r3, r2, r1) \ (_bw == BW_80 ? r3 : _bw == BW_40 ? r2 : r1) #define __VHT_GROUP(_streams, _sgi, _bw, _s) \ [VHT_GROUP_IDX(_streams, _sgi, _bw)] = { \ .streams = _streams, \ .shift = _s, \ .bw = _bw, \ .flags = \ IEEE80211_TX_RC_VHT_MCS | \ (_sgi ? IEEE80211_TX_RC_SHORT_GI : 0) | \ (_bw == BW_80 ? IEEE80211_TX_RC_80_MHZ_WIDTH : \ _bw == BW_40 ? IEEE80211_TX_RC_40_MHZ_WIDTH : 0), \ .duration = { \ MCS_DURATION(_streams, _sgi, \ BW2VBPS(_bw, 117, 54, 26)) >> _s, \ MCS_DURATION(_streams, _sgi, \ BW2VBPS(_bw, 234, 108, 52)) >> _s, \ MCS_DURATION(_streams, _sgi, \ BW2VBPS(_bw, 351, 162, 78)) >> _s, \ MCS_DURATION(_streams, _sgi, \ BW2VBPS(_bw, 468, 216, 104)) >> _s, \ MCS_DURATION(_streams, _sgi, \ BW2VBPS(_bw, 702, 324, 156)) >> _s, \ MCS_DURATION(_streams, _sgi, \ BW2VBPS(_bw, 936, 432, 208)) >> _s, \ MCS_DURATION(_streams, _sgi, \ BW2VBPS(_bw, 1053, 486, 234)) >> _s, \ MCS_DURATION(_streams, _sgi, \ BW2VBPS(_bw, 1170, 540, 260)) >> _s, \ MCS_DURATION(_streams, _sgi, \ BW2VBPS(_bw, 1404, 648, 312)) >> _s, \ MCS_DURATION(_streams, _sgi, \ BW2VBPS(_bw, 1560, 720, 346)) >> _s \ } \ } #define VHT_GROUP_SHIFT(_streams, _sgi, _bw) \ GROUP_SHIFT(MCS_DURATION(_streams, _sgi, \ BW2VBPS(_bw, 117, 54, 26))) #define VHT_GROUP(_streams, _sgi, _bw) \ __VHT_GROUP(_streams, _sgi, _bw, \ VHT_GROUP_SHIFT(_streams, _sgi, _bw)) #define CCK_DURATION(_bitrate, _short) \ (1000 * (10 /* SIFS */ + \ (_short ? 72 + 24 : 144 + 48) + \ (8 * (AVG_PKT_SIZE + 4) * 10) / (_bitrate))) #define CCK_DURATION_LIST(_short, _s) \ CCK_DURATION(10, _short) >> _s, \ CCK_DURATION(20, _short) >> _s, \ CCK_DURATION(55, _short) >> _s, \ CCK_DURATION(110, _short) >> _s #define __CCK_GROUP(_s) \ [MINSTREL_CCK_GROUP] = { \ .streams = 1, \ .flags = 0, \ .shift = _s, \ .duration = { \ CCK_DURATION_LIST(false, _s), \ CCK_DURATION_LIST(true, _s) \ } \ } #define CCK_GROUP_SHIFT \ GROUP_SHIFT(CCK_DURATION(10, false)) #define CCK_GROUP __CCK_GROUP(CCK_GROUP_SHIFT) #define OFDM_DURATION(_bitrate) \ (1000 * (16 /* SIFS + signal ext */ + \ 16 /* T_PREAMBLE */ + \ 4 /* T_SIGNAL */ + \ 4 * (((16 + 80 * (AVG_PKT_SIZE + 4) + 6) / \ ((_bitrate) * 4))))) #define OFDM_DURATION_LIST(_s) \ OFDM_DURATION(60) >> _s, \ OFDM_DURATION(90) >> _s, \ OFDM_DURATION(120) >> _s, \ OFDM_DURATION(180) >> _s, \ OFDM_DURATION(240) >> _s, \ OFDM_DURATION(360) >> _s, \ OFDM_DURATION(480) >> _s, \ OFDM_DURATION(540) >> _s #define __OFDM_GROUP(_s) \ [MINSTREL_OFDM_GROUP] = { \ .streams = 1, \ .flags = 0, \ .shift = _s, \ .duration = { \ OFDM_DURATION_LIST(_s), \ } \ } #define OFDM_GROUP_SHIFT \ GROUP_SHIFT(OFDM_DURATION(60)) #define OFDM_GROUP __OFDM_GROUP(OFDM_GROUP_SHIFT) static bool minstrel_vht_only = true; module_param(minstrel_vht_only, bool, 0644); MODULE_PARM_DESC(minstrel_vht_only, "Use only VHT rates when VHT is supported by sta."); /* * To enable sufficiently targeted rate sampling, MCS rates are divided into * groups, based on the number of streams and flags (HT40, SGI) that they * use. * * Sortorder has to be fixed for GROUP_IDX macro to be applicable: * BW -> SGI -> #streams */ const struct mcs_group minstrel_mcs_groups[] = { MCS_GROUP(1, 0, BW_20), MCS_GROUP(2, 0, BW_20), MCS_GROUP(3, 0, BW_20), MCS_GROUP(4, 0, BW_20), MCS_GROUP(1, 1, BW_20), MCS_GROUP(2, 1, BW_20), MCS_GROUP(3, 1, BW_20), MCS_GROUP(4, 1, BW_20), MCS_GROUP(1, 0, BW_40), MCS_GROUP(2, 0, BW_40), MCS_GROUP(3, 0, BW_40), MCS_GROUP(4, 0, BW_40), MCS_GROUP(1, 1, BW_40), MCS_GROUP(2, 1, BW_40), MCS_GROUP(3, 1, BW_40), MCS_GROUP(4, 1, BW_40), CCK_GROUP, OFDM_GROUP, VHT_GROUP(1, 0, BW_20), VHT_GROUP(2, 0, BW_20), VHT_GROUP(3, 0, BW_20), VHT_GROUP(4, 0, BW_20), VHT_GROUP(1, 1, BW_20), VHT_GROUP(2, 1, BW_20), VHT_GROUP(3, 1, BW_20), VHT_GROUP(4, 1, BW_20), VHT_GROUP(1, 0, BW_40), VHT_GROUP(2, 0, BW_40), VHT_GROUP(3, 0, BW_40), VHT_GROUP(4, 0, BW_40), VHT_GROUP(1, 1, BW_40), VHT_GROUP(2, 1, BW_40), VHT_GROUP(3, 1, BW_40), VHT_GROUP(4, 1, BW_40), VHT_GROUP(1, 0, BW_80), VHT_GROUP(2, 0, BW_80), VHT_GROUP(3, 0, BW_80), VHT_GROUP(4, 0, BW_80), VHT_GROUP(1, 1, BW_80), VHT_GROUP(2, 1, BW_80), VHT_GROUP(3, 1, BW_80), VHT_GROUP(4, 1, BW_80), }; const s16 minstrel_cck_bitrates[4] = { 10, 20, 55, 110 }; const s16 minstrel_ofdm_bitrates[8] = { 60, 90, 120, 180, 240, 360, 480, 540 }; static u8 sample_table[SAMPLE_COLUMNS][MCS_GROUP_RATES] __read_mostly; static const u8 minstrel_sample_seq[] = { MINSTREL_SAMPLE_TYPE_INC, MINSTREL_SAMPLE_TYPE_JUMP, MINSTREL_SAMPLE_TYPE_INC, MINSTREL_SAMPLE_TYPE_JUMP, MINSTREL_SAMPLE_TYPE_INC, MINSTREL_SAMPLE_TYPE_SLOW, }; static void minstrel_ht_update_rates(struct minstrel_priv *mp, struct minstrel_ht_sta *mi); /* * Some VHT MCSes are invalid (when Ndbps / Nes is not an integer) * e.g for MCS9@20MHzx1Nss: Ndbps=8x52*(5/6) Nes=1 * * Returns the valid mcs map for struct minstrel_mcs_group_data.supported */ static u16 minstrel_get_valid_vht_rates(int bw, int nss, __le16 mcs_map) { u16 mask = 0; if (bw == BW_20) { if (nss != 3 && nss != 6) mask = BIT(9); } else if (bw == BW_80) { if (nss == 3 || nss == 7) mask = BIT(6); else if (nss == 6) mask = BIT(9); } else { WARN_ON(bw != BW_40); } switch ((le16_to_cpu(mcs_map) >> (2 * (nss - 1))) & 3) { case IEEE80211_VHT_MCS_SUPPORT_0_7: mask |= 0x300; break; case IEEE80211_VHT_MCS_SUPPORT_0_8: mask |= 0x200; break; case IEEE80211_VHT_MCS_SUPPORT_0_9: break; default: mask = 0x3ff; } return 0x3ff & ~mask; } static bool minstrel_ht_is_legacy_group(int group) { return group == MINSTREL_CCK_GROUP || group == MINSTREL_OFDM_GROUP; } /* * Look up an MCS group index based on mac80211 rate information */ static int minstrel_ht_get_group_idx(struct ieee80211_tx_rate *rate) { return GROUP_IDX((rate->idx / 8) + 1, !!(rate->flags & IEEE80211_TX_RC_SHORT_GI), !!(rate->flags & IEEE80211_TX_RC_40_MHZ_WIDTH)); } /* * Look up an MCS group index based on new cfg80211 rate_info. */ static int minstrel_ht_ri_get_group_idx(struct rate_info *rate) { return GROUP_IDX((rate->mcs / 8) + 1, !!(rate->flags & RATE_INFO_FLAGS_SHORT_GI), !!(rate->bw & RATE_INFO_BW_40)); } static int minstrel_vht_get_group_idx(struct ieee80211_tx_rate *rate) { return VHT_GROUP_IDX(ieee80211_rate_get_vht_nss(rate), !!(rate->flags & IEEE80211_TX_RC_SHORT_GI), !!(rate->flags & IEEE80211_TX_RC_40_MHZ_WIDTH) + 2*!!(rate->flags & IEEE80211_TX_RC_80_MHZ_WIDTH)); } /* * Look up an MCS group index based on new cfg80211 rate_info. */ static int minstrel_vht_ri_get_group_idx(struct rate_info *rate) { return VHT_GROUP_IDX(rate->nss, !!(rate->flags & RATE_INFO_FLAGS_SHORT_GI), !!(rate->bw & RATE_INFO_BW_40) + 2*!!(rate->bw & RATE_INFO_BW_80)); } static struct minstrel_rate_stats * minstrel_ht_get_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, struct ieee80211_tx_rate *rate) { int group, idx; if (rate->flags & IEEE80211_TX_RC_MCS) { group = minstrel_ht_get_group_idx(rate); idx = rate->idx % 8; goto out; } if (rate->flags & IEEE80211_TX_RC_VHT_MCS) { group = minstrel_vht_get_group_idx(rate); idx = ieee80211_rate_get_vht_mcs(rate); goto out; } group = MINSTREL_CCK_GROUP; for (idx = 0; idx < ARRAY_SIZE(mp->cck_rates); idx++) { if (!(mi->supported[group] & BIT(idx))) continue; if (rate->idx != mp->cck_rates[idx]) continue; /* short preamble */ if ((mi->supported[group] & BIT(idx + 4)) && (rate->flags & IEEE80211_TX_RC_USE_SHORT_PREAMBLE)) idx += 4; goto out; } group = MINSTREL_OFDM_GROUP; for (idx = 0; idx < ARRAY_SIZE(mp->ofdm_rates[0]); idx++) if (rate->idx == mp->ofdm_rates[mi->band][idx]) goto out; idx = 0; out: return &mi->groups[group].rates[idx]; } /* * Get the minstrel rate statistics for specified STA and rate info. */ static struct minstrel_rate_stats * minstrel_ht_ri_get_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, struct ieee80211_rate_status *rate_status) { int group, idx; struct rate_info *rate = &rate_status->rate_idx; if (rate->flags & RATE_INFO_FLAGS_MCS) { group = minstrel_ht_ri_get_group_idx(rate); idx = rate->mcs % 8; goto out; } if (rate->flags & RATE_INFO_FLAGS_VHT_MCS) { group = minstrel_vht_ri_get_group_idx(rate); idx = rate->mcs; goto out; } group = MINSTREL_CCK_GROUP; for (idx = 0; idx < ARRAY_SIZE(mp->cck_rates); idx++) { if (rate->legacy != minstrel_cck_bitrates[ mp->cck_rates[idx] ]) continue; /* short preamble */ if ((mi->supported[group] & BIT(idx + 4)) && mi->use_short_preamble) idx += 4; goto out; } group = MINSTREL_OFDM_GROUP; for (idx = 0; idx < ARRAY_SIZE(mp->ofdm_rates[0]); idx++) if (rate->legacy == minstrel_ofdm_bitrates[ mp->ofdm_rates[mi->band][idx] ]) goto out; idx = 0; out: return &mi->groups[group].rates[idx]; } static inline struct minstrel_rate_stats * minstrel_get_ratestats(struct minstrel_ht_sta *mi, int index) { return &mi->groups[MI_RATE_GROUP(index)].rates[MI_RATE_IDX(index)]; } static inline int minstrel_get_duration(int index) { const struct mcs_group *group = &minstrel_mcs_groups[MI_RATE_GROUP(index)]; unsigned int duration = group->duration[MI_RATE_IDX(index)]; return duration << group->shift; } static unsigned int minstrel_ht_avg_ampdu_len(struct minstrel_ht_sta *mi) { int duration; if (mi->avg_ampdu_len) return MINSTREL_TRUNC(mi->avg_ampdu_len); if (minstrel_ht_is_legacy_group(MI_RATE_GROUP(mi->max_tp_rate[0]))) return 1; duration = minstrel_get_duration(mi->max_tp_rate[0]); if (duration > 400 * 1000) return 2; if (duration > 250 * 1000) return 4; if (duration > 150 * 1000) return 8; return 16; } /* * Return current throughput based on the average A-MPDU length, taking into * account the expected number of retransmissions and their expected length */ int minstrel_ht_get_tp_avg(struct minstrel_ht_sta *mi, int group, int rate, int prob_avg) { unsigned int nsecs = 0, overhead = mi->overhead; unsigned int ampdu_len = 1; /* do not account throughput if success prob is below 10% */ if (prob_avg < MINSTREL_FRAC(10, 100)) return 0; if (minstrel_ht_is_legacy_group(group)) overhead = mi->overhead_legacy; else ampdu_len = minstrel_ht_avg_ampdu_len(mi); nsecs = 1000 * overhead / ampdu_len; nsecs += minstrel_mcs_groups[group].duration[rate] << minstrel_mcs_groups[group].shift; /* * For the throughput calculation, limit the probability value to 90% to * account for collision related packet error rate fluctuation * (prob is scaled - see MINSTREL_FRAC above) */ if (prob_avg > MINSTREL_FRAC(90, 100)) prob_avg = MINSTREL_FRAC(90, 100); return MINSTREL_TRUNC(100 * ((prob_avg * 1000000) / nsecs)); } /* * Find & sort topmost throughput rates * * If multiple rates provide equal throughput the sorting is based on their * current success probability. Higher success probability is preferred among * MCS groups, CCK rates do not provide aggregation and are therefore at last. */ static void minstrel_ht_sort_best_tp_rates(struct minstrel_ht_sta *mi, u16 index, u16 *tp_list) { int cur_group, cur_idx, cur_tp_avg, cur_prob; int tmp_group, tmp_idx, tmp_tp_avg, tmp_prob; int j = MAX_THR_RATES; cur_group = MI_RATE_GROUP(index); cur_idx = MI_RATE_IDX(index); cur_prob = mi->groups[cur_group].rates[cur_idx].prob_avg; cur_tp_avg = minstrel_ht_get_tp_avg(mi, cur_group, cur_idx, cur_prob); do { tmp_group = MI_RATE_GROUP(tp_list[j - 1]); tmp_idx = MI_RATE_IDX(tp_list[j - 1]); tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg; tmp_tp_avg = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); if (cur_tp_avg < tmp_tp_avg || (cur_tp_avg == tmp_tp_avg && cur_prob <= tmp_prob)) break; j--; } while (j > 0); if (j < MAX_THR_RATES - 1) { memmove(&tp_list[j + 1], &tp_list[j], (sizeof(*tp_list) * (MAX_THR_RATES - (j + 1)))); } if (j < MAX_THR_RATES) tp_list[j] = index; } /* * Find and set the topmost probability rate per sta and per group */ static void minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 *dest, u16 index) { struct minstrel_mcs_group_data *mg; struct minstrel_rate_stats *mrs; int tmp_group, tmp_idx, tmp_tp_avg, tmp_prob; int max_tp_group, max_tp_idx, max_tp_prob; int cur_tp_avg, cur_group, cur_idx; int max_gpr_group, max_gpr_idx; int max_gpr_tp_avg, max_gpr_prob; cur_group = MI_RATE_GROUP(index); cur_idx = MI_RATE_IDX(index); mg = &mi->groups[cur_group]; mrs = &mg->rates[cur_idx]; tmp_group = MI_RATE_GROUP(*dest); tmp_idx = MI_RATE_IDX(*dest); tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg; tmp_tp_avg = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); /* if max_tp_rate[0] is from MCS_GROUP max_prob_rate get selected from * MCS_GROUP as well as CCK_GROUP rates do not allow aggregation */ max_tp_group = MI_RATE_GROUP(mi->max_tp_rate[0]); max_tp_idx = MI_RATE_IDX(mi->max_tp_rate[0]); max_tp_prob = mi->groups[max_tp_group].rates[max_tp_idx].prob_avg; if (minstrel_ht_is_legacy_group(MI_RATE_GROUP(index)) && !minstrel_ht_is_legacy_group(max_tp_group)) return; /* skip rates faster than max tp rate with lower prob */ if (minstrel_get_duration(mi->max_tp_rate[0]) > minstrel_get_duration(index) && mrs->prob_avg < max_tp_prob) return; max_gpr_group = MI_RATE_GROUP(mg->max_group_prob_rate); max_gpr_idx = MI_RATE_IDX(mg->max_group_prob_rate); max_gpr_prob = mi->groups[max_gpr_group].rates[max_gpr_idx].prob_avg; if (mrs->prob_avg > MINSTREL_FRAC(75, 100)) { cur_tp_avg = minstrel_ht_get_tp_avg(mi, cur_group, cur_idx, mrs->prob_avg); if (cur_tp_avg > tmp_tp_avg) *dest = index; max_gpr_tp_avg = minstrel_ht_get_tp_avg(mi, max_gpr_group, max_gpr_idx, max_gpr_prob); if (cur_tp_avg > max_gpr_tp_avg) mg->max_group_prob_rate = index; } else { if (mrs->prob_avg > tmp_prob) *dest = index; if (mrs->prob_avg > max_gpr_prob) mg->max_group_prob_rate = index; } } /* * Assign new rate set per sta and use CCK rates only if the fastest * rate (max_tp_rate[0]) is from CCK group. This prohibits such sorted * rate sets where MCS and CCK rates are mixed, because CCK rates can * not use aggregation. */ static void minstrel_ht_assign_best_tp_rates(struct minstrel_ht_sta *mi, u16 tmp_mcs_tp_rate[MAX_THR_RATES], u16 tmp_legacy_tp_rate[MAX_THR_RATES]) { unsigned int tmp_group, tmp_idx, tmp_cck_tp, tmp_mcs_tp, tmp_prob; int i; tmp_group = MI_RATE_GROUP(tmp_legacy_tp_rate[0]); tmp_idx = MI_RATE_IDX(tmp_legacy_tp_rate[0]); tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg; tmp_cck_tp = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); tmp_group = MI_RATE_GROUP(tmp_mcs_tp_rate[0]); tmp_idx = MI_RATE_IDX(tmp_mcs_tp_rate[0]); tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg; tmp_mcs_tp = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); if (tmp_cck_tp > tmp_mcs_tp) { for(i = 0; i < MAX_THR_RATES; i++) { minstrel_ht_sort_best_tp_rates(mi, tmp_legacy_tp_rate[i], tmp_mcs_tp_rate); } } } /* * Try to increase robustness of max_prob rate by decrease number of * streams if possible. */ static inline void minstrel_ht_prob_rate_reduce_streams(struct minstrel_ht_sta *mi) { struct minstrel_mcs_group_data *mg; int tmp_max_streams, group, tmp_idx, tmp_prob; int tmp_tp = 0; if (!mi->sta->deflink.ht_cap.ht_supported) return; group = MI_RATE_GROUP(mi->max_tp_rate[0]); tmp_max_streams = minstrel_mcs_groups[group].streams; for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) { mg = &mi->groups[group]; if (!mi->supported[group] || group == MINSTREL_CCK_GROUP) continue; tmp_idx = MI_RATE_IDX(mg->max_group_prob_rate); tmp_prob = mi->groups[group].rates[tmp_idx].prob_avg; if (tmp_tp < minstrel_ht_get_tp_avg(mi, group, tmp_idx, tmp_prob) && (minstrel_mcs_groups[group].streams < tmp_max_streams)) { mi->max_prob_rate = mg->max_group_prob_rate; tmp_tp = minstrel_ht_get_tp_avg(mi, group, tmp_idx, tmp_prob); } } } static u16 __minstrel_ht_get_sample_rate(struct minstrel_ht_sta *mi, enum minstrel_sample_type type) { u16 *rates = mi->sample[type].sample_rates; u16 cur; int i; for (i = 0; i < MINSTREL_SAMPLE_RATES; i++) { if (!rates[i]) continue; cur = rates[i]; rates[i] = 0; return cur; } return 0; } static inline int minstrel_ewma(int old, int new, int weight) { int diff, incr; diff = new - old; incr = (EWMA_DIV - weight) * diff / EWMA_DIV; return old + incr; } static inline int minstrel_filter_avg_add(u16 *prev_1, u16 *prev_2, s32 in) { s32 out_1 = *prev_1; s32 out_2 = *prev_2; s32 val; if (!in) in += 1; if (!out_1) { val = out_1 = in; goto out; } val = MINSTREL_AVG_COEFF1 * in; val += MINSTREL_AVG_COEFF2 * out_1; val += MINSTREL_AVG_COEFF3 * out_2; val >>= MINSTREL_SCALE; if (val > 1 << MINSTREL_SCALE) val = 1 << MINSTREL_SCALE; if (val < 0) val = 1; out: *prev_2 = out_1; *prev_1 = val; return val; } /* * Recalculate statistics and counters of a given rate */ static void minstrel_ht_calc_rate_stats(struct minstrel_priv *mp, struct minstrel_rate_stats *mrs) { unsigned int cur_prob; if (unlikely(mrs->attempts > 0)) { cur_prob = MINSTREL_FRAC(mrs->success, mrs->attempts); minstrel_filter_avg_add(&mrs->prob_avg, &mrs->prob_avg_1, cur_prob); mrs->att_hist += mrs->attempts; mrs->succ_hist += mrs->success; } mrs->last_success = mrs->success; mrs->last_attempts = mrs->attempts; mrs->success = 0; mrs->attempts = 0; } static bool minstrel_ht_find_sample_rate(struct minstrel_ht_sta *mi, int type, int idx) { int i; for (i = 0; i < MINSTREL_SAMPLE_RATES; i++) { u16 cur = mi->sample[type].sample_rates[i]; if (cur == idx) return true; if (!cur) break; } return false; } static int minstrel_ht_move_sample_rates(struct minstrel_ht_sta *mi, int type, u32 fast_rate_dur, u32 slow_rate_dur) { u16 *rates = mi->sample[type].sample_rates; int i, j; for (i = 0, j = 0; i < MINSTREL_SAMPLE_RATES; i++) { u32 duration; bool valid = false; u16 cur; cur = rates[i]; if (!cur) continue; duration = minstrel_get_duration(cur); switch (type) { case MINSTREL_SAMPLE_TYPE_SLOW: valid = duration > fast_rate_dur && duration < slow_rate_dur; break; case MINSTREL_SAMPLE_TYPE_INC: case MINSTREL_SAMPLE_TYPE_JUMP: valid = duration < fast_rate_dur; break; default: valid = false; break; } if (!valid) { rates[i] = 0; continue; } if (i == j) continue; rates[j++] = cur; rates[i] = 0; } return j; } static int minstrel_ht_group_min_rate_offset(struct minstrel_ht_sta *mi, int group, u32 max_duration) { u16 supported = mi->supported[group]; int i; for (i = 0; i < MCS_GROUP_RATES && supported; i++, supported >>= 1) { if (!(supported & BIT(0))) continue; if (minstrel_get_duration(MI_RATE(group, i)) >= max_duration) continue; return i; } return -1; } /* * Incremental update rates: * Flip through groups and pick the first group rate that is faster than the * highest currently selected rate */ static u16 minstrel_ht_next_inc_rate(struct minstrel_ht_sta *mi, u32 fast_rate_dur) { u8 type = MINSTREL_SAMPLE_TYPE_INC; int i, index = 0; u8 group; group = mi->sample[type].sample_group; for (i = 0; i < ARRAY_SIZE(minstrel_mcs_groups); i++) { group = (group + 1) % ARRAY_SIZE(minstrel_mcs_groups); index = minstrel_ht_group_min_rate_offset(mi, group, fast_rate_dur); if (index < 0) continue; index = MI_RATE(group, index & 0xf); if (!minstrel_ht_find_sample_rate(mi, type, index)) goto out; } index = 0; out: mi->sample[type].sample_group = group; return index; } static int minstrel_ht_next_group_sample_rate(struct minstrel_ht_sta *mi, int group, u16 supported, int offset) { struct minstrel_mcs_group_data *mg = &mi->groups[group]; u16 idx; int i; for (i = 0; i < MCS_GROUP_RATES; i++) { idx = sample_table[mg->column][mg->index]; if (++mg->index >= MCS_GROUP_RATES) { mg->index = 0; if (++mg->column >= ARRAY_SIZE(sample_table)) mg->column = 0; } if (idx < offset) continue; if (!(supported & BIT(idx))) continue; return MI_RATE(group, idx); } return -1; } /* * Jump rates: * Sample random rates, use those that are faster than the highest * currently selected rate. Rates between the fastest and the slowest * get sorted into the slow sample bucket, but only if it has room */ static u16 minstrel_ht_next_jump_rate(struct minstrel_ht_sta *mi, u32 fast_rate_dur, u32 slow_rate_dur, int *slow_rate_ofs) { struct minstrel_rate_stats *mrs; u32 max_duration = slow_rate_dur; int i, index, offset; u16 *slow_rates; u16 supported; u32 duration; u8 group; if (*slow_rate_ofs >= MINSTREL_SAMPLE_RATES) max_duration = fast_rate_dur; slow_rates = mi->sample[MINSTREL_SAMPLE_TYPE_SLOW].sample_rates; group = mi->sample[MINSTREL_SAMPLE_TYPE_JUMP].sample_group; for (i = 0; i < ARRAY_SIZE(minstrel_mcs_groups); i++) { u8 type; group = (group + 1) % ARRAY_SIZE(minstrel_mcs_groups); supported = mi->supported[group]; if (!supported) continue; offset = minstrel_ht_group_min_rate_offset(mi, group, max_duration); if (offset < 0) continue; index = minstrel_ht_next_group_sample_rate(mi, group, supported, offset); if (index < 0) continue; duration = minstrel_get_duration(index); if (duration < fast_rate_dur) type = MINSTREL_SAMPLE_TYPE_JUMP; else type = MINSTREL_SAMPLE_TYPE_SLOW; if (minstrel_ht_find_sample_rate(mi, type, index)) continue; if (type == MINSTREL_SAMPLE_TYPE_JUMP) goto found; if (*slow_rate_ofs >= MINSTREL_SAMPLE_RATES) continue; if (duration >= slow_rate_dur) continue; /* skip slow rates with high success probability */ mrs = minstrel_get_ratestats(mi, index); if (mrs->prob_avg > MINSTREL_FRAC(95, 100)) continue; slow_rates[(*slow_rate_ofs)++] = index; if (*slow_rate_ofs >= MINSTREL_SAMPLE_RATES) max_duration = fast_rate_dur; } index = 0; found: mi->sample[MINSTREL_SAMPLE_TYPE_JUMP].sample_group = group; return index; } static void minstrel_ht_refill_sample_rates(struct minstrel_ht_sta *mi) { u32 prob_dur = minstrel_get_duration(mi->max_prob_rate); u32 tp_dur = minstrel_get_duration(mi->max_tp_rate[0]); u32 tp2_dur = minstrel_get_duration(mi->max_tp_rate[1]); u32 fast_rate_dur = min(min(tp_dur, tp2_dur), prob_dur); u32 slow_rate_dur = max(max(tp_dur, tp2_dur), prob_dur); u16 *rates; int i, j; rates = mi->sample[MINSTREL_SAMPLE_TYPE_INC].sample_rates; i = minstrel_ht_move_sample_rates(mi, MINSTREL_SAMPLE_TYPE_INC, fast_rate_dur, slow_rate_dur); while (i < MINSTREL_SAMPLE_RATES) { rates[i] = minstrel_ht_next_inc_rate(mi, tp_dur); if (!rates[i]) break; i++; } rates = mi->sample[MINSTREL_SAMPLE_TYPE_JUMP].sample_rates; i = minstrel_ht_move_sample_rates(mi, MINSTREL_SAMPLE_TYPE_JUMP, fast_rate_dur, slow_rate_dur); j = minstrel_ht_move_sample_rates(mi, MINSTREL_SAMPLE_TYPE_SLOW, fast_rate_dur, slow_rate_dur); while (i < MINSTREL_SAMPLE_RATES) { rates[i] = minstrel_ht_next_jump_rate(mi, fast_rate_dur, slow_rate_dur, &j); if (!rates[i]) break; i++; } for (i = 0; i < ARRAY_SIZE(mi->sample); i++) memcpy(mi->sample[i].cur_sample_rates, mi->sample[i].sample_rates, sizeof(mi->sample[i].cur_sample_rates)); } /* * Update rate statistics and select new primary rates * * Rules for rate selection: * - max_prob_rate must use only one stream, as a tradeoff between delivery * probability and throughput during strong fluctuations * - as long as the max prob rate has a probability of more than 75%, pick * higher throughput rates, even if the probability is a bit lower */ static void minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) { struct minstrel_mcs_group_data *mg; struct minstrel_rate_stats *mrs; int group, i, j, cur_prob; u16 tmp_mcs_tp_rate[MAX_THR_RATES], tmp_group_tp_rate[MAX_THR_RATES]; u16 tmp_legacy_tp_rate[MAX_THR_RATES], tmp_max_prob_rate; u16 index; bool ht_supported = mi->sta->deflink.ht_cap.ht_supported; if (mi->ampdu_packets > 0) { if (!ieee80211_hw_check(mp->hw, TX_STATUS_NO_AMPDU_LEN)) mi->avg_ampdu_len = minstrel_ewma(mi->avg_ampdu_len, MINSTREL_FRAC(mi->ampdu_len, mi->ampdu_packets), EWMA_LEVEL); else mi->avg_ampdu_len = 0; mi->ampdu_len = 0; mi->ampdu_packets = 0; } if (mi->supported[MINSTREL_CCK_GROUP]) group = MINSTREL_CCK_GROUP; else if (mi->supported[MINSTREL_OFDM_GROUP]) group = MINSTREL_OFDM_GROUP; else group = 0; index = MI_RATE(group, 0); for (j = 0; j < ARRAY_SIZE(tmp_legacy_tp_rate); j++) tmp_legacy_tp_rate[j] = index; if (mi->supported[MINSTREL_VHT_GROUP_0]) group = MINSTREL_VHT_GROUP_0; else if (ht_supported) group = MINSTREL_HT_GROUP_0; else if (mi->supported[MINSTREL_CCK_GROUP]) group = MINSTREL_CCK_GROUP; else group = MINSTREL_OFDM_GROUP; index = MI_RATE(group, 0); tmp_max_prob_rate = index; for (j = 0; j < ARRAY_SIZE(tmp_mcs_tp_rate); j++) tmp_mcs_tp_rate[j] = index; /* Find best rate sets within all MCS groups*/ for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) { u16 *tp_rate = tmp_mcs_tp_rate; u16 last_prob = 0; mg = &mi->groups[group]; if (!mi->supported[group]) continue; /* (re)Initialize group rate indexes */ for(j = 0; j < MAX_THR_RATES; j++) tmp_group_tp_rate[j] = MI_RATE(group, 0); if (group == MINSTREL_CCK_GROUP && ht_supported) tp_rate = tmp_legacy_tp_rate; for (i = MCS_GROUP_RATES - 1; i >= 0; i--) { if (!(mi->supported[group] & BIT(i))) continue; index = MI_RATE(group, i); mrs = &mg->rates[i]; mrs->retry_updated = false; minstrel_ht_calc_rate_stats(mp, mrs); if (mrs->att_hist) last_prob = max(last_prob, mrs->prob_avg); else mrs->prob_avg = max(last_prob, mrs->prob_avg); cur_prob = mrs->prob_avg; if (minstrel_ht_get_tp_avg(mi, group, i, cur_prob) == 0) continue; /* Find max throughput rate set */ minstrel_ht_sort_best_tp_rates(mi, index, tp_rate); /* Find max throughput rate set within a group */ minstrel_ht_sort_best_tp_rates(mi, index, tmp_group_tp_rate); } memcpy(mg->max_group_tp_rate, tmp_group_tp_rate, sizeof(mg->max_group_tp_rate)); } /* Assign new rate set per sta */ minstrel_ht_assign_best_tp_rates(mi, tmp_mcs_tp_rate, tmp_legacy_tp_rate); memcpy(mi->max_tp_rate, tmp_mcs_tp_rate, sizeof(mi->max_tp_rate)); for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) { if (!mi->supported[group]) continue; mg = &mi->groups[group]; mg->max_group_prob_rate = MI_RATE(group, 0); for (i = 0; i < MCS_GROUP_RATES; i++) { if (!(mi->supported[group] & BIT(i))) continue; index = MI_RATE(group, i); /* Find max probability rate per group and global */ minstrel_ht_set_best_prob_rate(mi, &tmp_max_prob_rate, index); } } mi->max_prob_rate = tmp_max_prob_rate; /* Try to increase robustness of max_prob_rate*/ minstrel_ht_prob_rate_reduce_streams(mi); minstrel_ht_refill_sample_rates(mi); #ifdef CONFIG_MAC80211_DEBUGFS /* use fixed index if set */ if (mp->fixed_rate_idx != -1) { for (i = 0; i < 4; i++) mi->max_tp_rate[i] = mp->fixed_rate_idx; mi->max_prob_rate = mp->fixed_rate_idx; } #endif /* Reset update timer */ mi->last_stats_update = jiffies; mi->sample_time = jiffies; } static bool minstrel_ht_txstat_valid(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, struct ieee80211_tx_rate *rate) { int i; if (rate->idx < 0) return false; if (!rate->count) return false; if (rate->flags & IEEE80211_TX_RC_MCS || rate->flags & IEEE80211_TX_RC_VHT_MCS) return true; for (i = 0; i < ARRAY_SIZE(mp->cck_rates); i++) if (rate->idx == mp->cck_rates[i]) return true; for (i = 0; i < ARRAY_SIZE(mp->ofdm_rates[0]); i++) if (rate->idx == mp->ofdm_rates[mi->band][i]) return true; return false; } /* * Check whether rate_status contains valid information. */ static bool minstrel_ht_ri_txstat_valid(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, struct ieee80211_rate_status *rate_status) { int i; if (!rate_status) return false; if (!rate_status->try_count) return false; if (rate_status->rate_idx.flags & RATE_INFO_FLAGS_MCS || rate_status->rate_idx.flags & RATE_INFO_FLAGS_VHT_MCS) return true; for (i = 0; i < ARRAY_SIZE(mp->cck_rates); i++) { if (rate_status->rate_idx.legacy == minstrel_cck_bitrates[ mp->cck_rates[i] ]) return true; } for (i = 0; i < ARRAY_SIZE(mp->ofdm_rates); i++) { if (rate_status->rate_idx.legacy == minstrel_ofdm_bitrates[ mp->ofdm_rates[mi->band][i] ]) return true; } return false; } static void minstrel_downgrade_rate(struct minstrel_ht_sta *mi, u16 *idx, bool primary) { int group, orig_group; orig_group = group = MI_RATE_GROUP(*idx); while (group > 0) { group--; if (!mi->supported[group]) continue; if (minstrel_mcs_groups[group].streams > minstrel_mcs_groups[orig_group].streams) continue; if (primary) *idx = mi->groups[group].max_group_tp_rate[0]; else *idx = mi->groups[group].max_group_tp_rate[1]; break; } } static void minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband, void *priv_sta, struct ieee80211_tx_status *st) { struct ieee80211_tx_info *info = st->info; struct minstrel_ht_sta *mi = priv_sta; struct ieee80211_tx_rate *ar = info->status.rates; struct minstrel_rate_stats *rate, *rate2; struct minstrel_priv *mp = priv; u32 update_interval = mp->update_interval; bool last, update = false; int i; /* Ignore packet that was sent with noAck flag */ if (info->flags & IEEE80211_TX_CTL_NO_ACK) return; /* This packet was aggregated but doesn't carry status info */ if ((info->flags & IEEE80211_TX_CTL_AMPDU) && !(info->flags & IEEE80211_TX_STAT_AMPDU)) return; if (!(info->flags & IEEE80211_TX_STAT_AMPDU)) { info->status.ampdu_ack_len = (info->flags & IEEE80211_TX_STAT_ACK ? 1 : 0); info->status.ampdu_len = 1; } /* wraparound */ if (mi->total_packets >= ~0 - info->status.ampdu_len) { mi->total_packets = 0; mi->sample_packets = 0; } mi->total_packets += info->status.ampdu_len; if (info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE) mi->sample_packets += info->status.ampdu_len; mi->ampdu_packets++; mi->ampdu_len += info->status.ampdu_len; if (st->rates && st->n_rates) { last = !minstrel_ht_ri_txstat_valid(mp, mi, &(st->rates[0])); for (i = 0; !last; i++) { last = (i == st->n_rates - 1) || !minstrel_ht_ri_txstat_valid(mp, mi, &(st->rates[i + 1])); rate = minstrel_ht_ri_get_stats(mp, mi, &(st->rates[i])); if (last) rate->success += info->status.ampdu_ack_len; rate->attempts += st->rates[i].try_count * info->status.ampdu_len; } } else { last = !minstrel_ht_txstat_valid(mp, mi, &ar[0]); for (i = 0; !last; i++) { last = (i == IEEE80211_TX_MAX_RATES - 1) || !minstrel_ht_txstat_valid(mp, mi, &ar[i + 1]); rate = minstrel_ht_get_stats(mp, mi, &ar[i]); if (last) rate->success += info->status.ampdu_ack_len; rate->attempts += ar[i].count * info->status.ampdu_len; } } if (mp->hw->max_rates > 1) { /* * check for sudden death of spatial multiplexing, * downgrade to a lower number of streams if necessary. */ rate = minstrel_get_ratestats(mi, mi->max_tp_rate[0]); if (rate->attempts > 30 && rate->success < rate->attempts / 4) { minstrel_downgrade_rate(mi, &mi->max_tp_rate[0], true); update = true; } rate2 = minstrel_get_ratestats(mi, mi->max_tp_rate[1]); if (rate2->attempts > 30 && rate2->success < rate2->attempts / 4) { minstrel_downgrade_rate(mi, &mi->max_tp_rate[1], false); update = true; } } if (time_after(jiffies, mi->last_stats_update + update_interval)) { update = true; minstrel_ht_update_stats(mp, mi); } if (update) minstrel_ht_update_rates(mp, mi); } static void minstrel_calc_retransmit(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, int index) { struct minstrel_rate_stats *mrs; unsigned int tx_time, tx_time_rtscts, tx_time_data; unsigned int cw = mp->cw_min; unsigned int ctime = 0; unsigned int t_slot = 9; /* FIXME */ unsigned int ampdu_len = minstrel_ht_avg_ampdu_len(mi); unsigned int overhead = 0, overhead_rtscts = 0; mrs = minstrel_get_ratestats(mi, index); if (mrs->prob_avg < MINSTREL_FRAC(1, 10)) { mrs->retry_count = 1; mrs->retry_count_rtscts = 1; return; } mrs->retry_count = 2; mrs->retry_count_rtscts = 2; mrs->retry_updated = true; tx_time_data = minstrel_get_duration(index) * ampdu_len / 1000; /* Contention time for first 2 tries */ ctime = (t_slot * cw) >> 1; cw = min((cw << 1) | 1, mp->cw_max); ctime += (t_slot * cw) >> 1; cw = min((cw << 1) | 1, mp->cw_max); if (minstrel_ht_is_legacy_group(MI_RATE_GROUP(index))) { overhead = mi->overhead_legacy; overhead_rtscts = mi->overhead_legacy_rtscts; } else { overhead = mi->overhead; overhead_rtscts = mi->overhead_rtscts; } /* Total TX time for data and Contention after first 2 tries */ tx_time = ctime + 2 * (overhead + tx_time_data); tx_time_rtscts = ctime + 2 * (overhead_rtscts + tx_time_data); /* See how many more tries we can fit inside segment size */ do { /* Contention time for this try */ ctime = (t_slot * cw) >> 1; cw = min((cw << 1) | 1, mp->cw_max); /* Total TX time after this try */ tx_time += ctime + overhead + tx_time_data; tx_time_rtscts += ctime + overhead_rtscts + tx_time_data; if (tx_time_rtscts < mp->segment_size) mrs->retry_count_rtscts++; } while ((tx_time < mp->segment_size) && (++mrs->retry_count < mp->max_retry)); } static void minstrel_ht_set_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, struct ieee80211_sta_rates *ratetbl, int offset, int index) { int group_idx = MI_RATE_GROUP(index); const struct mcs_group *group = &minstrel_mcs_groups[group_idx]; struct minstrel_rate_stats *mrs; u8 idx; u16 flags = group->flags; mrs = minstrel_get_ratestats(mi, index); if (!mrs->retry_updated) minstrel_calc_retransmit(mp, mi, index); if (mrs->prob_avg < MINSTREL_FRAC(20, 100) || !mrs->retry_count) { ratetbl->rate[offset].count = 2; ratetbl->rate[offset].count_rts = 2; ratetbl->rate[offset].count_cts = 2; } else { ratetbl->rate[offset].count = mrs->retry_count; ratetbl->rate[offset].count_cts = mrs->retry_count; ratetbl->rate[offset].count_rts = mrs->retry_count_rtscts; } index = MI_RATE_IDX(index); if (group_idx == MINSTREL_CCK_GROUP) idx = mp->cck_rates[index % ARRAY_SIZE(mp->cck_rates)]; else if (group_idx == MINSTREL_OFDM_GROUP) idx = mp->ofdm_rates[mi->band][index % ARRAY_SIZE(mp->ofdm_rates[0])]; else if (flags & IEEE80211_TX_RC_VHT_MCS) idx = ((group->streams - 1) << 4) | (index & 0xF); else idx = index + (group->streams - 1) * 8; /* enable RTS/CTS if needed: * - if station is in dynamic SMPS (and streams > 1) * - for fallback rates, to increase chances of getting through */ if (offset > 0 || (mi->sta->deflink.smps_mode == IEEE80211_SMPS_DYNAMIC && group->streams > 1)) { ratetbl->rate[offset].count = ratetbl->rate[offset].count_rts; flags |= IEEE80211_TX_RC_USE_RTS_CTS; } ratetbl->rate[offset].idx = idx; ratetbl->rate[offset].flags = flags; } static inline int minstrel_ht_get_prob_avg(struct minstrel_ht_sta *mi, int rate) { int group = MI_RATE_GROUP(rate); rate = MI_RATE_IDX(rate); return mi->groups[group].rates[rate].prob_avg; } static int minstrel_ht_get_max_amsdu_len(struct minstrel_ht_sta *mi) { int group = MI_RATE_GROUP(mi->max_prob_rate); const struct mcs_group *g = &minstrel_mcs_groups[group]; int rate = MI_RATE_IDX(mi->max_prob_rate); unsigned int duration; /* Disable A-MSDU if max_prob_rate is bad */ if (mi->groups[group].rates[rate].prob_avg < MINSTREL_FRAC(50, 100)) return 1; duration = g->duration[rate]; duration <<= g->shift; /* If the rate is slower than single-stream MCS1, make A-MSDU limit small */ if (duration > MCS_DURATION(1, 0, 52)) return 500; /* * If the rate is slower than single-stream MCS4, limit A-MSDU to usual * data packet size */ if (duration > MCS_DURATION(1, 0, 104)) return 1600; /* * If the rate is slower than single-stream MCS7, or if the max throughput * rate success probability is less than 75%, limit A-MSDU to twice the usual * data packet size */ if (duration > MCS_DURATION(1, 0, 260) || (minstrel_ht_get_prob_avg(mi, mi->max_tp_rate[0]) < MINSTREL_FRAC(75, 100))) return 3200; /* * HT A-MPDU limits maximum MPDU size under BA agreement to 4095 bytes. * Since aggregation sessions are started/stopped without txq flush, use * the limit here to avoid the complexity of having to de-aggregate * packets in the queue. */ if (!mi->sta->deflink.vht_cap.vht_supported) return IEEE80211_MAX_MPDU_LEN_HT_BA; /* unlimited */ return 0; } static void minstrel_ht_update_rates(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) { struct ieee80211_sta_rates *rates; int i = 0; int max_rates = min_t(int, mp->hw->max_rates, IEEE80211_TX_RATE_TABLE_SIZE); rates = kzalloc(sizeof(*rates), GFP_ATOMIC); if (!rates) return; /* Start with max_tp_rate[0] */ minstrel_ht_set_rate(mp, mi, rates, i++, mi->max_tp_rate[0]); /* Fill up remaining, keep one entry for max_probe_rate */ for (; i < (max_rates - 1); i++) minstrel_ht_set_rate(mp, mi, rates, i, mi->max_tp_rate[i]); if (i < max_rates) minstrel_ht_set_rate(mp, mi, rates, i++, mi->max_prob_rate); if (i < IEEE80211_TX_RATE_TABLE_SIZE) rates->rate[i].idx = -1; mi->sta->deflink.agg.max_rc_amsdu_len = minstrel_ht_get_max_amsdu_len(mi); ieee80211_sta_recalc_aggregates(mi->sta); rate_control_set_rates(mp->hw, mi->sta, rates); } static u16 minstrel_ht_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) { u8 seq; if (mp->hw->max_rates > 1) { seq = mi->sample_seq; mi->sample_seq = (seq + 1) % ARRAY_SIZE(minstrel_sample_seq); seq = minstrel_sample_seq[seq]; } else { seq = MINSTREL_SAMPLE_TYPE_INC; } return __minstrel_ht_get_sample_rate(mi, seq); } static void minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta, struct ieee80211_tx_rate_control *txrc) { const struct mcs_group *sample_group; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(txrc->skb); struct ieee80211_tx_rate *rate = &info->status.rates[0]; struct minstrel_ht_sta *mi = priv_sta; struct minstrel_priv *mp = priv; u16 sample_idx; info->flags |= mi->tx_flags; #ifdef CONFIG_MAC80211_DEBUGFS if (mp->fixed_rate_idx != -1) return; #endif /* Don't use EAPOL frames for sampling on non-mrr hw */ if (mp->hw->max_rates == 1 && (info->control.flags & IEEE80211_TX_CTRL_PORT_CTRL_PROTO)) return; if (time_is_after_jiffies(mi->sample_time)) return; mi->sample_time = jiffies + MINSTREL_SAMPLE_INTERVAL; sample_idx = minstrel_ht_get_sample_rate(mp, mi); if (!sample_idx) return; sample_group = &minstrel_mcs_groups[MI_RATE_GROUP(sample_idx)]; sample_idx = MI_RATE_IDX(sample_idx); if (sample_group == &minstrel_mcs_groups[MINSTREL_CCK_GROUP] && (sample_idx >= 4) != txrc->short_preamble) return; info->flags |= IEEE80211_TX_CTL_RATE_CTRL_PROBE; rate->count = 1; if (sample_group == &minstrel_mcs_groups[MINSTREL_CCK_GROUP]) { int idx = sample_idx % ARRAY_SIZE(mp->cck_rates); rate->idx = mp->cck_rates[idx]; } else if (sample_group == &minstrel_mcs_groups[MINSTREL_OFDM_GROUP]) { int idx = sample_idx % ARRAY_SIZE(mp->ofdm_rates[0]); rate->idx = mp->ofdm_rates[mi->band][idx]; } else if (sample_group->flags & IEEE80211_TX_RC_VHT_MCS) { ieee80211_rate_set_vht(rate, MI_RATE_IDX(sample_idx), sample_group->streams); } else { rate->idx = sample_idx + (sample_group->streams - 1) * 8; } rate->flags = sample_group->flags; } static void minstrel_ht_update_cck(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, struct ieee80211_supported_band *sband, struct ieee80211_sta *sta) { int i; if (sband->band != NL80211_BAND_2GHZ) return; if (sta->deflink.ht_cap.ht_supported && !ieee80211_hw_check(mp->hw, SUPPORTS_HT_CCK_RATES)) return; for (i = 0; i < 4; i++) { if (mp->cck_rates[i] == 0xff || !rate_supported(sta, sband->band, mp->cck_rates[i])) continue; mi->supported[MINSTREL_CCK_GROUP] |= BIT(i); if (sband->bitrates[i].flags & IEEE80211_RATE_SHORT_PREAMBLE) mi->supported[MINSTREL_CCK_GROUP] |= BIT(i + 4); } } static void minstrel_ht_update_ofdm(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, struct ieee80211_supported_band *sband, struct ieee80211_sta *sta) { const u8 *rates; int i; if (sta->deflink.ht_cap.ht_supported) return; rates = mp->ofdm_rates[sband->band]; for (i = 0; i < ARRAY_SIZE(mp->ofdm_rates[0]); i++) { if (rates[i] == 0xff || !rate_supported(sta, sband->band, rates[i])) continue; mi->supported[MINSTREL_OFDM_GROUP] |= BIT(i); } } static void minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband, struct cfg80211_chan_def *chandef, struct ieee80211_sta *sta, void *priv_sta) { struct minstrel_priv *mp = priv; struct minstrel_ht_sta *mi = priv_sta; struct ieee80211_mcs_info *mcs = &sta->deflink.ht_cap.mcs; u16 ht_cap = sta->deflink.ht_cap.cap; struct ieee80211_sta_vht_cap *vht_cap = &sta->deflink.vht_cap; const struct ieee80211_rate *ctl_rate; struct sta_info *sta_info; bool ldpc, erp; int use_vht; int ack_dur; int stbc; int i; BUILD_BUG_ON(ARRAY_SIZE(minstrel_mcs_groups) != MINSTREL_GROUPS_NB); if (vht_cap->vht_supported) use_vht = vht_cap->vht_mcs.tx_mcs_map != cpu_to_le16(~0); else use_vht = 0; memset(mi, 0, sizeof(*mi)); mi->sta = sta; mi->band = sband->band; mi->last_stats_update = jiffies; ack_dur = ieee80211_frame_duration(sband->band, 10, 60, 1, 1); mi->overhead = ieee80211_frame_duration(sband->band, 0, 60, 1, 1); mi->overhead += ack_dur; mi->overhead_rtscts = mi->overhead + 2 * ack_dur; ctl_rate = &sband->bitrates[rate_lowest_index(sband, sta)]; erp = ctl_rate->flags & IEEE80211_RATE_ERP_G; ack_dur = ieee80211_frame_duration(sband->band, 10, ctl_rate->bitrate, erp, 1); mi->overhead_legacy = ack_dur; mi->overhead_legacy_rtscts = mi->overhead_legacy + 2 * ack_dur; mi->avg_ampdu_len = MINSTREL_FRAC(1, 1); if (!use_vht) { stbc = (ht_cap & IEEE80211_HT_CAP_RX_STBC) >> IEEE80211_HT_CAP_RX_STBC_SHIFT; ldpc = ht_cap & IEEE80211_HT_CAP_LDPC_CODING; } else { stbc = (vht_cap->cap & IEEE80211_VHT_CAP_RXSTBC_MASK) >> IEEE80211_VHT_CAP_RXSTBC_SHIFT; ldpc = vht_cap->cap & IEEE80211_VHT_CAP_RXLDPC; } mi->tx_flags |= stbc << IEEE80211_TX_CTL_STBC_SHIFT; if (ldpc) mi->tx_flags |= IEEE80211_TX_CTL_LDPC; for (i = 0; i < ARRAY_SIZE(mi->groups); i++) { u32 gflags = minstrel_mcs_groups[i].flags; int bw, nss; mi->supported[i] = 0; if (minstrel_ht_is_legacy_group(i)) continue; if (gflags & IEEE80211_TX_RC_SHORT_GI) { if (gflags & IEEE80211_TX_RC_40_MHZ_WIDTH) { if (!(ht_cap & IEEE80211_HT_CAP_SGI_40)) continue; } else { if (!(ht_cap & IEEE80211_HT_CAP_SGI_20)) continue; } } if (gflags & IEEE80211_TX_RC_40_MHZ_WIDTH && sta->deflink.bandwidth < IEEE80211_STA_RX_BW_40) continue; nss = minstrel_mcs_groups[i].streams; /* Mark MCS > 7 as unsupported if STA is in static SMPS mode */ if (sta->deflink.smps_mode == IEEE80211_SMPS_STATIC && nss > 1) continue; /* HT rate */ if (gflags & IEEE80211_TX_RC_MCS) { if (use_vht && minstrel_vht_only) continue; mi->supported[i] = mcs->rx_mask[nss - 1]; continue; } /* VHT rate */ if (!vht_cap->vht_supported || WARN_ON(!(gflags & IEEE80211_TX_RC_VHT_MCS)) || WARN_ON(gflags & IEEE80211_TX_RC_160_MHZ_WIDTH)) continue; if (gflags & IEEE80211_TX_RC_80_MHZ_WIDTH) { if (sta->deflink.bandwidth < IEEE80211_STA_RX_BW_80 || ((gflags & IEEE80211_TX_RC_SHORT_GI) && !(vht_cap->cap & IEEE80211_VHT_CAP_SHORT_GI_80))) { continue; } } if (gflags & IEEE80211_TX_RC_40_MHZ_WIDTH) bw = BW_40; else if (gflags & IEEE80211_TX_RC_80_MHZ_WIDTH) bw = BW_80; else bw = BW_20; mi->supported[i] = minstrel_get_valid_vht_rates(bw, nss, vht_cap->vht_mcs.tx_mcs_map); } sta_info = container_of(sta, struct sta_info, sta); mi->use_short_preamble = test_sta_flag(sta_info, WLAN_STA_SHORT_PREAMBLE) && sta_info->sdata->vif.bss_conf.use_short_preamble; minstrel_ht_update_cck(mp, mi, sband, sta); minstrel_ht_update_ofdm(mp, mi, sband, sta); /* create an initial rate table with the lowest supported rates */ minstrel_ht_update_stats(mp, mi); minstrel_ht_update_rates(mp, mi); } static void minstrel_ht_rate_init(void *priv, struct ieee80211_supported_band *sband, struct cfg80211_chan_def *chandef, struct ieee80211_sta *sta, void *priv_sta) { minstrel_ht_update_caps(priv, sband, chandef, sta, priv_sta); } static void minstrel_ht_rate_update(void *priv, struct ieee80211_supported_band *sband, struct cfg80211_chan_def *chandef, struct ieee80211_sta *sta, void *priv_sta, u32 changed) { minstrel_ht_update_caps(priv, sband, chandef, sta, priv_sta); } static void * minstrel_ht_alloc_sta(void *priv, struct ieee80211_sta *sta, gfp_t gfp) { struct ieee80211_supported_band *sband; struct minstrel_ht_sta *mi; struct minstrel_priv *mp = priv; struct ieee80211_hw *hw = mp->hw; int max_rates = 0; int i; for (i = 0; i < NUM_NL80211_BANDS; i++) { sband = hw->wiphy->bands[i]; if (sband && sband->n_bitrates > max_rates) max_rates = sband->n_bitrates; } return kzalloc(sizeof(*mi), gfp); } static void minstrel_ht_free_sta(void *priv, struct ieee80211_sta *sta, void *priv_sta) { kfree(priv_sta); } static void minstrel_ht_fill_rate_array(u8 *dest, struct ieee80211_supported_band *sband, const s16 *bitrates, int n_rates) { int i, j; for (i = 0; i < sband->n_bitrates; i++) { struct ieee80211_rate *rate = &sband->bitrates[i]; for (j = 0; j < n_rates; j++) { if (rate->bitrate != bitrates[j]) continue; dest[j] = i; break; } } } static void minstrel_ht_init_cck_rates(struct minstrel_priv *mp) { static const s16 bitrates[4] = { 10, 20, 55, 110 }; struct ieee80211_supported_band *sband; memset(mp->cck_rates, 0xff, sizeof(mp->cck_rates)); sband = mp->hw->wiphy->bands[NL80211_BAND_2GHZ]; if (!sband) return; BUILD_BUG_ON(ARRAY_SIZE(mp->cck_rates) != ARRAY_SIZE(bitrates)); minstrel_ht_fill_rate_array(mp->cck_rates, sband, minstrel_cck_bitrates, ARRAY_SIZE(minstrel_cck_bitrates)); } static void minstrel_ht_init_ofdm_rates(struct minstrel_priv *mp, enum nl80211_band band) { static const s16 bitrates[8] = { 60, 90, 120, 180, 240, 360, 480, 540 }; struct ieee80211_supported_band *sband; memset(mp->ofdm_rates[band], 0xff, sizeof(mp->ofdm_rates[band])); sband = mp->hw->wiphy->bands[band]; if (!sband) return; BUILD_BUG_ON(ARRAY_SIZE(mp->ofdm_rates[band]) != ARRAY_SIZE(bitrates)); minstrel_ht_fill_rate_array(mp->ofdm_rates[band], sband, minstrel_ofdm_bitrates, ARRAY_SIZE(minstrel_ofdm_bitrates)); } static void * minstrel_ht_alloc(struct ieee80211_hw *hw) { struct minstrel_priv *mp; int i; mp = kzalloc(sizeof(struct minstrel_priv), GFP_ATOMIC); if (!mp) return NULL; /* contention window settings * Just an approximation. Using the per-queue values would complicate * the calculations and is probably unnecessary */ mp->cw_min = 15; mp->cw_max = 1023; /* maximum time that the hw is allowed to stay in one MRR segment */ mp->segment_size = 6000; if (hw->max_rate_tries > 0) mp->max_retry = hw->max_rate_tries; else /* safe default, does not necessarily have to match hw properties */ mp->max_retry = 7; mp->hw = hw; mp->update_interval = HZ / 20; minstrel_ht_init_cck_rates(mp); for (i = 0; i < ARRAY_SIZE(mp->hw->wiphy->bands); i++) minstrel_ht_init_ofdm_rates(mp, i); return mp; } #ifdef CONFIG_MAC80211_DEBUGFS static void minstrel_ht_add_debugfs(struct ieee80211_hw *hw, void *priv, struct dentry *debugfsdir) { struct minstrel_priv *mp = priv; mp->fixed_rate_idx = (u32) -1; debugfs_create_u32("fixed_rate_idx", S_IRUGO | S_IWUGO, debugfsdir, &mp->fixed_rate_idx); } #endif static void minstrel_ht_free(void *priv) { kfree(priv); } static u32 minstrel_ht_get_expected_throughput(void *priv_sta) { struct minstrel_ht_sta *mi = priv_sta; int i, j, prob, tp_avg; i = MI_RATE_GROUP(mi->max_tp_rate[0]); j = MI_RATE_IDX(mi->max_tp_rate[0]); prob = mi->groups[i].rates[j].prob_avg; /* convert tp_avg from pkt per second in kbps */ tp_avg = minstrel_ht_get_tp_avg(mi, i, j, prob) * 10; tp_avg = tp_avg * AVG_PKT_SIZE * 8 / 1024; return tp_avg; } static const struct rate_control_ops mac80211_minstrel_ht = { .name = "minstrel_ht", .capa = RATE_CTRL_CAPA_AMPDU_TRIGGER, .tx_status_ext = minstrel_ht_tx_status, .get_rate = minstrel_ht_get_rate, .rate_init = minstrel_ht_rate_init, .rate_update = minstrel_ht_rate_update, .alloc_sta = minstrel_ht_alloc_sta, .free_sta = minstrel_ht_free_sta, .alloc = minstrel_ht_alloc, .free = minstrel_ht_free, #ifdef CONFIG_MAC80211_DEBUGFS .add_debugfs = minstrel_ht_add_debugfs, .add_sta_debugfs = minstrel_ht_add_sta_debugfs, #endif .get_expected_throughput = minstrel_ht_get_expected_throughput, }; static void __init init_sample_table(void) { int col, i, new_idx; u8 rnd[MCS_GROUP_RATES]; memset(sample_table, 0xff, sizeof(sample_table)); for (col = 0; col < SAMPLE_COLUMNS; col++) { get_random_bytes(rnd, sizeof(rnd)); for (i = 0; i < MCS_GROUP_RATES; i++) { new_idx = (i + rnd[i]) % MCS_GROUP_RATES; while (sample_table[col][new_idx] != 0xff) new_idx = (new_idx + 1) % MCS_GROUP_RATES; sample_table[col][new_idx] = i; } } } int __init rc80211_minstrel_init(void) { init_sample_table(); return ieee80211_rate_control_register(&mac80211_minstrel_ht); } void rc80211_minstrel_exit(void) { ieee80211_rate_control_unregister(&mac80211_minstrel_ht); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 /* SPDX-License-Identifier: GPL-2.0 */ /* * bitmap.h: Copyright (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003 * * additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. */ #ifndef BITMAP_H #define BITMAP_H 1 #define BITMAP_MAGIC 0x6d746962 /* * version 3 is host-endian order, this is deprecated and not used for new * array */ #define BITMAP_MAJOR_LO 3 #define BITMAP_MAJOR_HOSTENDIAN 3 /* version 4 is little-endian order, the default value */ #define BITMAP_MAJOR_HI 4 /* version 5 is only used for cluster */ #define BITMAP_MAJOR_CLUSTERED 5 /* version 6 is only used for lockless bitmap */ #define BITMAP_MAJOR_LOCKLESS 6 /* use these for bitmap->flags and bitmap->sb->state bit-fields */ enum bitmap_state { BITMAP_STALE = 1, /* the bitmap file is out of date or had -EIO */ BITMAP_WRITE_ERROR = 2, /* A write error has occurred */ BITMAP_FIRST_USE = 3, /* llbitmap is just created */ BITMAP_CLEAN = 4, /* llbitmap is created with assume_clean */ BITMAP_DAEMON_BUSY = 5, /* llbitmap daemon is not finished after daemon_sleep */ BITMAP_HOSTENDIAN =15, }; /* the superblock at the front of the bitmap file -- little endian */ typedef struct bitmap_super_s { __le32 magic; /* 0 BITMAP_MAGIC */ __le32 version; /* 4 the bitmap major for now, could change... */ __u8 uuid[16]; /* 8 128 bit uuid - must match md device uuid */ __le64 events; /* 24 event counter for the bitmap (1)*/ __le64 events_cleared;/*32 event counter when last bit cleared (2) */ __le64 sync_size; /* 40 the size of the md device's sync range(3) */ __le32 state; /* 48 bitmap state information */ __le32 chunksize; /* 52 the bitmap chunk size in bytes */ __le32 daemon_sleep; /* 56 seconds between disk flushes */ __le32 write_behind; /* 60 number of outstanding write-behind writes */ __le32 sectors_reserved; /* 64 number of 512-byte sectors that are * reserved for the bitmap. */ __le32 nodes; /* 68 the maximum number of nodes in cluster. */ __u8 cluster_name[64]; /* 72 cluster name to which this md belongs */ __u8 pad[256 - 136]; /* set to zero */ } bitmap_super_t; /* notes: * (1) This event counter is updated before the eventcounter in the md superblock * When a bitmap is loaded, it is only accepted if this event counter is equal * to, or one greater than, the event counter in the superblock. * (2) This event counter is updated when the other one is *if*and*only*if* the * array is not degraded. As bits are not cleared when the array is degraded, * this represents the last time that any bits were cleared. * If a device is being added that has an event count with this value or * higher, it is accepted as conforming to the bitmap. * (3)This is the number of sectors represented by the bitmap, and is the range that * resync happens across. For raid1 and raid5/6 it is the size of individual * devices. For raid10 it is the size of the array. */ struct md_bitmap_stats { u64 events_cleared; int behind_writes; bool behind_wait; unsigned long missing_pages; unsigned long file_pages; unsigned long sync_size; unsigned long pages; struct file *file; }; typedef void (md_bitmap_fn)(struct mddev *mddev, sector_t offset, unsigned long sectors); struct bitmap_operations { struct md_submodule_head head; bool (*enabled)(void *data, bool flush); int (*create)(struct mddev *mddev); int (*resize)(struct mddev *mddev, sector_t blocks, int chunksize); int (*load)(struct mddev *mddev); void (*destroy)(struct mddev *mddev); void (*flush)(struct mddev *mddev); void (*write_all)(struct mddev *mddev); void (*dirty_bits)(struct mddev *mddev, unsigned long s, unsigned long e); void (*unplug)(struct mddev *mddev, bool sync); void (*daemon_work)(struct mddev *mddev); void (*start_behind_write)(struct mddev *mddev); void (*end_behind_write)(struct mddev *mddev); void (*wait_behind_writes)(struct mddev *mddev); md_bitmap_fn *start_write; md_bitmap_fn *end_write; md_bitmap_fn *start_discard; md_bitmap_fn *end_discard; sector_t (*skip_sync_blocks)(struct mddev *mddev, sector_t offset); bool (*blocks_synced)(struct mddev *mddev, sector_t offset); bool (*start_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks, bool degraded); void (*end_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks); void (*cond_end_sync)(struct mddev *mddev, sector_t sector, bool force); void (*close_sync)(struct mddev *mddev); void (*update_sb)(void *data); int (*get_stats)(void *data, struct md_bitmap_stats *stats); void (*sync_with_cluster)(struct mddev *mddev, sector_t old_lo, sector_t old_hi, sector_t new_lo, sector_t new_hi); void *(*get_from_slot)(struct mddev *mddev, int slot); int (*copy_from_slot)(struct mddev *mddev, int slot, sector_t *lo, sector_t *hi, bool clear_bits); void (*set_pages)(void *data, unsigned long pages); void (*free)(void *data); struct attribute_group *group; }; /* the bitmap API */ static inline bool md_bitmap_registered(struct mddev *mddev) { return mddev->bitmap_ops != NULL; } static inline bool md_bitmap_enabled(struct mddev *mddev, bool flush) { /* bitmap_ops must be registered before creating bitmap. */ if (!md_bitmap_registered(mddev)) return false; if (!mddev->bitmap) return false; return mddev->bitmap_ops->enabled(mddev->bitmap, flush); } static inline bool md_bitmap_start_sync(struct mddev *mddev, sector_t offset, sector_t *blocks, bool degraded) { /* always resync if no bitmap */ if (!md_bitmap_enabled(mddev, false)) { *blocks = 1024; return true; } return mddev->bitmap_ops->start_sync(mddev, offset, blocks, degraded); } static inline void md_bitmap_end_sync(struct mddev *mddev, sector_t offset, sector_t *blocks) { if (!md_bitmap_enabled(mddev, false)) { *blocks = 1024; return; } mddev->bitmap_ops->end_sync(mddev, offset, blocks); } #ifdef CONFIG_MD_BITMAP int md_bitmap_init(void); void md_bitmap_exit(void); #else static inline int md_bitmap_init(void) { return 0; } static inline void md_bitmap_exit(void) { } #endif #ifdef CONFIG_MD_LLBITMAP int md_llbitmap_init(void); void md_llbitmap_exit(void); #else static inline int md_llbitmap_init(void) { return 0; } static inline void md_llbitmap_exit(void) { } #endif #endif
3 3 3 3 3 3 3 6 6 5 2 5 2 5 2 2 4 2 2 4 4 23 7 7 17 17 23 18 16 16 3 14 14 14 13 13 13 4 14 14 16 16 12 12 2 28 12 1 28 28 33 32 33 32 33 33 33 31 31 31 28 23 33 33 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 // SPDX-License-Identifier: GPL-2.0 /* * linux/mm/mincore.c * * Copyright (C) 1994-2006 Linus Torvalds */ /* * The mincore() system call. */ #include <linux/pagemap.h> #include <linux/gfp.h> #include <linux/pagewalk.h> #include <linux/mman.h> #include <linux/syscalls.h> #include <linux/swap.h> #include <linux/leafops.h> #include <linux/shmem_fs.h> #include <linux/hugetlb.h> #include <linux/pgtable.h> #include <linux/uaccess.h> #include "swap.h" #include "internal.h" static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { #ifdef CONFIG_HUGETLB_PAGE unsigned char present; unsigned char *vec = walk->private; spinlock_t *ptl; ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); /* * Hugepages under user process are always in RAM and never * swapped out, but theoretically it needs to be checked. */ if (!pte) { present = 0; } else { const pte_t ptep = huge_ptep_get(walk->mm, addr, pte); if (huge_pte_none(ptep) || pte_is_marker(ptep)) present = 0; else present = 1; } for (; addr != end; vec++, addr += PAGE_SIZE) *vec = present; walk->private = vec; spin_unlock(ptl); #else BUG(); #endif return 0; } static unsigned char mincore_swap(swp_entry_t entry, bool shmem) { struct swap_info_struct *si; struct folio *folio = NULL; unsigned char present = 0; if (!IS_ENABLED(CONFIG_SWAP)) { WARN_ON(1); return 0; } /* * Shmem mapping may contain swapin error entries, which are * absent. Page table may contain migration or hwpoison * entries which are always uptodate. */ if (!softleaf_is_swap(entry)) return !shmem; /* * Shmem mapping lookup is lockless, so we need to grab the swap * device. mincore page table walk locks the PTL, and the swap * device is stable, avoid touching the si for better performance. */ if (shmem) { si = get_swap_device(entry); if (!si) return 0; } folio = swap_cache_get_folio(entry); if (shmem) put_swap_device(si); /* The swap cache space contains either folio, shadow or NULL */ if (folio && !xa_is_value(folio)) { present = folio_test_uptodate(folio); folio_put(folio); } return present; } /* * Later we can get more picky about what "in core" means precisely. * For now, simply check to see if the page is in the page cache, * and is up to date; i.e. that no page-in operation would be required * at this time if an application were to map and access this page. */ static unsigned char mincore_page(struct address_space *mapping, pgoff_t index) { unsigned char present = 0; struct folio *folio; /* * When tmpfs swaps out a page from a file, any process mapping that * file will not get a swp_entry_t in its pte, but rather it is like * any other file mapping (ie. marked !present and faulted in with * tmpfs's .fault). So swapped out tmpfs mappings are tested here. */ folio = filemap_get_entry(mapping, index); if (folio) { if (xa_is_value(folio)) { if (shmem_mapping(mapping)) return mincore_swap(radix_to_swp_entry(folio), true); else return 0; } present = folio_test_uptodate(folio); folio_put(folio); } return present; } static int __mincore_unmapped_range(unsigned long addr, unsigned long end, struct vm_area_struct *vma, unsigned char *vec) { unsigned long nr = (end - addr) >> PAGE_SHIFT; int i; if (vma->vm_file) { pgoff_t pgoff; pgoff = linear_page_index(vma, addr); for (i = 0; i < nr; i++, pgoff++) vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff); } else { for (i = 0; i < nr; i++) vec[i] = 0; } return nr; } static int mincore_unmapped_range(unsigned long addr, unsigned long end, __always_unused int depth, struct mm_walk *walk) { walk->private += __mincore_unmapped_range(addr, end, walk->vma, walk->private); return 0; } static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { spinlock_t *ptl; struct vm_area_struct *vma = walk->vma; pte_t *ptep; unsigned char *vec = walk->private; int nr = (end - addr) >> PAGE_SHIFT; int step, i; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { memset(vec, 1, nr); spin_unlock(ptl); goto out; } ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); if (!ptep) { walk->action = ACTION_AGAIN; return 0; } for (; addr != end; ptep += step, addr += step * PAGE_SIZE) { pte_t pte = ptep_get(ptep); step = 1; /* We need to do cache lookup too for markers */ if (pte_none(pte) || pte_is_marker(pte)) __mincore_unmapped_range(addr, addr + PAGE_SIZE, vma, vec); else if (pte_present(pte)) { unsigned int batch = pte_batch_hint(ptep, pte); if (batch > 1) { unsigned int max_nr = (end - addr) >> PAGE_SHIFT; step = min_t(unsigned int, batch, max_nr); } for (i = 0; i < step; i++) vec[i] = 1; } else { /* pte is a swap entry */ const softleaf_t entry = softleaf_from_pte(pte); *vec = mincore_swap(entry, false); } vec += step; } pte_unmap_unlock(ptep - 1, ptl); out: walk->private += nr; cond_resched(); return 0; } static inline bool can_do_mincore(struct vm_area_struct *vma) { if (vma_is_anonymous(vma)) return true; if (!vma->vm_file) return false; /* * Reveal pagecache information only for non-anonymous mappings that * correspond to the files the calling process could (if tried) open * for writing; otherwise we'd be including shared non-exclusive * mappings, which opens a side channel. */ return inode_owner_or_capable(&nop_mnt_idmap, file_inode(vma->vm_file)) || file_permission(vma->vm_file, MAY_WRITE) == 0; } static const struct mm_walk_ops mincore_walk_ops = { .pmd_entry = mincore_pte_range, .pte_hole = mincore_unmapped_range, .hugetlb_entry = mincore_hugetlb, .walk_lock = PGWALK_RDLOCK, }; /* * Do a chunk of "sys_mincore()". We've already checked * all the arguments, we hold the mmap semaphore: we should * just return the amount of info we're asked for. */ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *vec) { struct vm_area_struct *vma; unsigned long end; int err; vma = vma_lookup(current->mm, addr); if (!vma) return -ENOMEM; end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); if (!can_do_mincore(vma)) { unsigned long pages = DIV_ROUND_UP(end - addr, PAGE_SIZE); memset(vec, 1, pages); return pages; } err = walk_page_range(vma->vm_mm, addr, end, &mincore_walk_ops, vec); if (err < 0) return err; return (end - addr) >> PAGE_SHIFT; } /* * The mincore(2) system call. * * mincore() returns the memory residency status of the pages in the * current process's address space specified by [addr, addr + len). * The status is returned in a vector of bytes. The least significant * bit of each byte is 1 if the referenced page is in memory, otherwise * it is zero. * * Because the status of a page can change after mincore() checks it * but before it returns to the application, the returned vector may * contain stale information. Only locked pages are guaranteed to * remain in memory. * * return values: * zero - success * -EFAULT - vec points to an illegal address * -EINVAL - addr is not a multiple of PAGE_SIZE * -ENOMEM - Addresses in the range [addr, addr + len] are * invalid for the address space of this process, or * specify one or more pages which are not currently * mapped * -EAGAIN - A kernel resource was temporarily unavailable. */ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len, unsigned char __user *, vec) { long retval; unsigned long pages; unsigned char *tmp; start = untagged_addr(start); /* Check the start address: needs to be page-aligned.. */ if (unlikely(start & ~PAGE_MASK)) return -EINVAL; /* ..and we need to be passed a valid user-space range */ if (!access_ok((void __user *) start, len)) return -ENOMEM; /* This also avoids any overflows on PAGE_ALIGN */ pages = len >> PAGE_SHIFT; pages += (offset_in_page(len)) != 0; if (!access_ok(vec, pages)) return -EFAULT; tmp = (void *) __get_free_page(GFP_USER); if (!tmp) return -EAGAIN; retval = 0; while (pages) { /* * Do at most PAGE_SIZE entries per iteration, due to * the temporary buffer size. */ mmap_read_lock(current->mm); retval = do_mincore(start, min(pages, PAGE_SIZE), tmp); mmap_read_unlock(current->mm); if (retval <= 0) break; if (copy_to_user(vec, tmp, retval)) { retval = -EFAULT; break; } pages -= retval; vec += retval; start += retval << PAGE_SHIFT; retval = 0; } free_page((unsigned long) tmp); return retval; }
39 38 6 4 4 4 4 4 3 3 3 3 40 40 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 // SPDX-License-Identifier: GPL-2.0-only /* * lib/parser.c - simple parser for mount, etc. options. */ #include <linux/ctype.h> #include <linux/types.h> #include <linux/export.h> #include <linux/kstrtox.h> #include <linux/parser.h> #include <linux/slab.h> #include <linux/string.h> /* * max size needed by different bases to express U64 * HEX: "0xFFFFFFFFFFFFFFFF" --> 18 * DEC: "18446744073709551615" --> 20 * OCT: "01777777777777777777777" --> 23 * pick the max one to define NUMBER_BUF_LEN */ #define NUMBER_BUF_LEN 24 /** * match_one - Determines if a string matches a simple pattern * @s: the string to examine for presence of the pattern * @p: the string containing the pattern * @args: array of %MAX_OPT_ARGS &substring_t elements. Used to return match * locations. * * Description: Determines if the pattern @p is present in string @s. Can only * match extremely simple token=arg style patterns. If the pattern is found, * the location(s) of the arguments will be returned in the @args array. */ static int match_one(char *s, const char *p, substring_t args[]) { char *meta; int argc = 0; if (!p) return 1; while(1) { int len = -1; meta = strchr(p, '%'); if (!meta) return strcmp(p, s) == 0; if (strncmp(p, s, meta-p)) return 0; s += meta - p; p = meta + 1; if (isdigit(*p)) len = simple_strtoul(p, (char **) &p, 10); else if (*p == '%') { if (*s++ != '%') return 0; p++; continue; } if (argc >= MAX_OPT_ARGS) return 0; args[argc].from = s; switch (*p++) { case 's': { size_t str_len = strlen(s); if (str_len == 0) return 0; if (len == -1 || len > str_len) len = str_len; args[argc].to = s + len; break; } case 'd': simple_strtol(s, &args[argc].to, 0); goto num; case 'u': simple_strtoul(s, &args[argc].to, 0); goto num; case 'o': simple_strtoul(s, &args[argc].to, 8); goto num; case 'x': simple_strtoul(s, &args[argc].to, 16); num: if (args[argc].to == args[argc].from) return 0; break; default: return 0; } s = args[argc].to; argc++; } } /** * match_token - Find a token (and optional args) in a string * @s: the string to examine for token/argument pairs * @table: match_table_t describing the set of allowed option tokens and the * arguments that may be associated with them. Must be terminated with a * &struct match_token whose pattern is set to the NULL pointer. * @args: array of %MAX_OPT_ARGS &substring_t elements. Used to return match * locations. * * Description: Detects which if any of a set of token strings has been passed * to it. Tokens can include up to %MAX_OPT_ARGS instances of basic c-style * format identifiers which will be taken into account when matching the * tokens, and whose locations will be returned in the @args array. */ int match_token(char *s, const match_table_t table, substring_t args[]) { const struct match_token *p; for (p = table; !match_one(s, p->pattern, args) ; p++) ; return p->token; } EXPORT_SYMBOL(match_token); /** * match_number - scan a number in the given base from a substring_t * @s: substring to be scanned * @result: resulting integer on success * @base: base to use when converting string * * Description: Given a &substring_t and a base, attempts to parse the substring * as a number in that base. * * Return: On success, sets @result to the integer represented by the * string and returns 0. Returns -EINVAL or -ERANGE on failure. */ static int match_number(substring_t *s, int *result, int base) { char *endp; char buf[NUMBER_BUF_LEN]; int ret; long val; if (match_strlcpy(buf, s, NUMBER_BUF_LEN) >= NUMBER_BUF_LEN) return -ERANGE; ret = 0; val = simple_strtol(buf, &endp, base); if (endp == buf) ret = -EINVAL; else if (val < (long)INT_MIN || val > (long)INT_MAX) ret = -ERANGE; else *result = (int) val; return ret; } /** * match_u64int - scan a number in the given base from a substring_t * @s: substring to be scanned * @result: resulting u64 on success * @base: base to use when converting string * * Description: Given a &substring_t and a base, attempts to parse the substring * as a number in that base. * * Return: On success, sets @result to the integer represented by the * string and returns 0. Returns -EINVAL or -ERANGE on failure. */ static int match_u64int(substring_t *s, u64 *result, int base) { char buf[NUMBER_BUF_LEN]; int ret; u64 val; if (match_strlcpy(buf, s, NUMBER_BUF_LEN) >= NUMBER_BUF_LEN) return -ERANGE; ret = kstrtoull(buf, base, &val); if (!ret) *result = val; return ret; } /** * match_int - scan a decimal representation of an integer from a substring_t * @s: substring_t to be scanned * @result: resulting integer on success * * Description: Attempts to parse the &substring_t @s as a decimal integer. * * Return: On success, sets @result to the integer represented by the string * and returns 0. Returns -EINVAL or -ERANGE on failure. */ int match_int(substring_t *s, int *result) { return match_number(s, result, 0); } EXPORT_SYMBOL(match_int); /** * match_uint - scan a decimal representation of an integer from a substring_t * @s: substring_t to be scanned * @result: resulting integer on success * * Description: Attempts to parse the &substring_t @s as a decimal integer. * * Return: On success, sets @result to the integer represented by the string * and returns 0. Returns -EINVAL or -ERANGE on failure. */ int match_uint(substring_t *s, unsigned int *result) { char buf[NUMBER_BUF_LEN]; if (match_strlcpy(buf, s, NUMBER_BUF_LEN) >= NUMBER_BUF_LEN) return -ERANGE; return kstrtouint(buf, 10, result); } EXPORT_SYMBOL(match_uint); /** * match_u64 - scan a decimal representation of a u64 from * a substring_t * @s: substring_t to be scanned * @result: resulting unsigned long long on success * * Description: Attempts to parse the &substring_t @s as a long decimal * integer. * * Return: On success, sets @result to the integer represented by the string * and returns 0. Returns -EINVAL or -ERANGE on failure. */ int match_u64(substring_t *s, u64 *result) { return match_u64int(s, result, 0); } EXPORT_SYMBOL(match_u64); /** * match_octal - scan an octal representation of an integer from a substring_t * @s: substring_t to be scanned * @result: resulting integer on success * * Description: Attempts to parse the &substring_t @s as an octal integer. * * Return: On success, sets @result to the integer represented by the string * and returns 0. Returns -EINVAL or -ERANGE on failure. */ int match_octal(substring_t *s, int *result) { return match_number(s, result, 8); } EXPORT_SYMBOL(match_octal); /** * match_hex - scan a hex representation of an integer from a substring_t * @s: substring_t to be scanned * @result: resulting integer on success * * Description: Attempts to parse the &substring_t @s as a hexadecimal integer. * * Return: On success, sets @result to the integer represented by the string * and returns 0. Returns -EINVAL or -ERANGE on failure. */ int match_hex(substring_t *s, int *result) { return match_number(s, result, 16); } EXPORT_SYMBOL(match_hex); /** * match_wildcard - parse if a string matches given wildcard pattern * @pattern: wildcard pattern * @str: the string to be parsed * * Description: Parse the string @str to check if matches wildcard * pattern @pattern. The pattern may contain two types of wildcards: * * * '*' - matches zero or more characters * * '?' - matches one character * * Return: If the @str matches the @pattern, return true, else return false. */ bool match_wildcard(const char *pattern, const char *str) { const char *s = str; const char *p = pattern; bool star = false; while (*s) { switch (*p) { case '?': s++; p++; break; case '*': star = true; str = s; if (!*++p) return true; pattern = p; break; default: if (*s == *p) { s++; p++; } else { if (!star) return false; str++; s = str; p = pattern; } break; } } if (*p == '*') ++p; return !*p; } EXPORT_SYMBOL(match_wildcard); /** * match_strlcpy - Copy the characters from a substring_t to a sized buffer * @dest: where to copy to * @src: &substring_t to copy * @size: size of destination buffer * * Description: Copy the characters in &substring_t @src to the * c-style string @dest. Copy no more than @size - 1 characters, plus * the terminating NUL. * * Return: length of @src. */ size_t match_strlcpy(char *dest, const substring_t *src, size_t size) { size_t ret = src->to - src->from; if (size) { size_t len = ret >= size ? size - 1 : ret; memcpy(dest, src->from, len); dest[len] = '\0'; } return ret; } EXPORT_SYMBOL(match_strlcpy); /** * match_strdup - allocate a new string with the contents of a substring_t * @s: &substring_t to copy * * Description: Allocates and returns a string filled with the contents of * the &substring_t @s. The caller is responsible for freeing the returned * string with kfree(). * * Return: the address of the newly allocated NUL-terminated string or * %NULL on error. */ char *match_strdup(const substring_t *s) { return kmemdup_nul(s->from, s->to - s->from, GFP_KERNEL); } EXPORT_SYMBOL(match_strdup);
3 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 // SPDX-License-Identifier: GPL-2.0-or-later /* Socket buffer accounting * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/net.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include "ar-internal.h" #define select_skb_count(skb) (&rxrpc_n_rx_skbs) /* * Note the allocation or reception of a socket buffer. */ void rxrpc_new_skb(struct sk_buff *skb, enum rxrpc_skb_trace why) { int n = atomic_inc_return(select_skb_count(skb)); trace_rxrpc_skb(skb, refcount_read(&skb->users), n, why); } /* * Note the re-emergence of a socket buffer from a queue or buffer. */ void rxrpc_see_skb(struct sk_buff *skb, enum rxrpc_skb_trace why) { if (skb) { int n = atomic_read(select_skb_count(skb)); trace_rxrpc_skb(skb, refcount_read(&skb->users), n, why); } } /* * Note the addition of a ref on a socket buffer. */ void rxrpc_get_skb(struct sk_buff *skb, enum rxrpc_skb_trace why) { int n = atomic_inc_return(select_skb_count(skb)); trace_rxrpc_skb(skb, refcount_read(&skb->users), n, why); skb_get(skb); } /* * Note the dropping of a ref on a socket buffer by the core. */ void rxrpc_eaten_skb(struct sk_buff *skb, enum rxrpc_skb_trace why) { int n = atomic_inc_return(&rxrpc_n_rx_skbs); trace_rxrpc_skb(skb, 0, n, why); } /* * Note the destruction of a socket buffer. */ void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace why) { if (skb) { int n = atomic_dec_return(select_skb_count(skb)); trace_rxrpc_skb(skb, refcount_read(&skb->users), n, why); consume_skb(skb); } } /* * Clear a queue of socket buffers. */ void rxrpc_purge_queue(struct sk_buff_head *list) { struct sk_buff *skb; while ((skb = skb_dequeue((list))) != NULL) { int n = atomic_dec_return(select_skb_count(skb)); trace_rxrpc_skb(skb, refcount_read(&skb->users), n, rxrpc_skb_put_purge); consume_skb(skb); } }
4 4 4 4 4 4 4 4 16 16 1 1 1 16 16 26 26 16 16 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs * * Pentium III FXSR, SSE support * Gareth Hughes <gareth@valinux.com>, May 2000 */ /* * Handle hardware traps and faults. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/context_tracking.h> #include <linux/interrupt.h> #include <linux/kallsyms.h> #include <linux/kmsan.h> #include <linux/spinlock.h> #include <linux/kprobes.h> #include <linux/uaccess.h> #include <linux/kdebug.h> #include <linux/kgdb.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/ptrace.h> #include <linux/uprobes.h> #include <linux/string.h> #include <linux/delay.h> #include <linux/errno.h> #include <linux/kexec.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <linux/static_call.h> #include <linux/timer.h> #include <linux/init.h> #include <linux/bug.h> #include <linux/nmi.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/cpu.h> #include <linux/io.h> #include <linux/hardirq.h> #include <linux/atomic.h> #include <linux/iommu.h> #include <linux/ubsan.h> #include <asm/stacktrace.h> #include <asm/processor.h> #include <asm/debugreg.h> #include <asm/realmode.h> #include <asm/text-patching.h> #include <asm/ftrace.h> #include <asm/traps.h> #include <asm/desc.h> #include <asm/fred.h> #include <asm/fpu/api.h> #include <asm/cpu.h> #include <asm/cpu_entry_area.h> #include <asm/mce.h> #include <asm/fixmap.h> #include <asm/mach_traps.h> #include <asm/alternative.h> #include <asm/fpu/xstate.h> #include <asm/vm86.h> #include <asm/umip.h> #include <asm/insn.h> #include <asm/insn-eval.h> #include <asm/vdso.h> #include <asm/tdx.h> #include <asm/cfi.h> #include <asm/msr.h> #ifdef CONFIG_X86_64 #include <asm/x86_init.h> #else #include <asm/processor-flags.h> #include <asm/setup.h> #endif #include <asm/proto.h> DECLARE_BITMAP(system_vectors, NR_VECTORS); __always_inline int is_valid_bugaddr(unsigned long addr) { if (addr < TASK_SIZE_MAX) return 0; /* * We got #UD, if the text isn't readable we'd have gotten * a different exception. */ return *(unsigned short *)addr == INSN_UD2; } /* * Check for UD1 or UD2, accounting for Address Size Override Prefixes. * If it's a UD1, further decode to determine its use: * * FineIBT: d6 udb * FineIBT: f0 75 f9 lock jne . - 6 * UBSan{0}: 67 0f b9 00 ud1 (%eax),%eax * UBSan{10}: 67 0f b9 40 10 ud1 0x10(%eax),%eax * static_call: 0f b9 cc ud1 %esp,%ecx * __WARN_trap: 67 48 0f b9 3a ud1 (%edx),%reg * * Notable, since __WARN_trap can use all registers, the distinction between * UD1 users is through R/M. */ __always_inline int decode_bug(unsigned long addr, s32 *imm, int *len) { unsigned long start = addr; u8 v, reg, rm, rex = 0; int type = BUG_UD1; bool lock = false; if (addr < TASK_SIZE_MAX) return BUG_NONE; for (;;) { v = *(u8 *)(addr++); if (v == INSN_ASOP) continue; if (v == INSN_LOCK) { lock = true; continue; } if ((v & 0xf0) == 0x40) { rex = v; continue; } break; } switch (v) { case 0x70 ... 0x7f: /* Jcc.d8 */ addr += 1; /* d8 */ *len = addr - start; WARN_ON_ONCE(!lock); return BUG_LOCK; case 0xd6: *len = addr - start; return BUG_UDB; case OPCODE_ESCAPE: break; default: return BUG_NONE; } v = *(u8 *)(addr++); if (v == SECOND_BYTE_OPCODE_UD2) { *len = addr - start; return BUG_UD2; } if (v != SECOND_BYTE_OPCODE_UD1) return BUG_NONE; *imm = 0; v = *(u8 *)(addr++); /* ModRM */ if (X86_MODRM_MOD(v) != 3 && X86_MODRM_RM(v) == 4) addr++; /* SIB */ reg = X86_MODRM_REG(v) + 8*!!X86_REX_R(rex); rm = X86_MODRM_RM(v) + 8*!!X86_REX_B(rex); /* Decode immediate, if present */ switch (X86_MODRM_MOD(v)) { case 0: if (X86_MODRM_RM(v) == 5) addr += 4; /* RIP + disp32 */ if (rm == 0) /* (%eax) */ type = BUG_UD1_UBSAN; if (rm == 2) { /* (%edx) */ *imm = reg; type = BUG_UD1_WARN; } break; case 1: *imm = *(s8 *)addr; addr += 1; if (rm == 0) /* (%eax) */ type = BUG_UD1_UBSAN; break; case 2: *imm = *(s32 *)addr; addr += 4; if (rm == 0) /* (%eax) */ type = BUG_UD1_UBSAN; break; case 3: break; } /* record instruction length */ *len = addr - start; return type; } static inline unsigned long pt_regs_val(struct pt_regs *regs, int nr) { int offset = pt_regs_offset(regs, nr); if (WARN_ON_ONCE(offset < -0)) return 0; return *((unsigned long *)((void *)regs + offset)); } #ifdef HAVE_ARCH_BUG_FORMAT_ARGS DEFINE_STATIC_CALL(WARN_trap, __WARN_trap); EXPORT_STATIC_CALL_TRAMP(WARN_trap); /* * Create a va_list from an exception context. */ void *__warn_args(struct arch_va_list *args, struct pt_regs *regs) { /* * Register save area; populate with function call argument registers */ args->regs[0] = regs->di; args->regs[1] = regs->si; args->regs[2] = regs->dx; args->regs[3] = regs->cx; args->regs[4] = regs->r8; args->regs[5] = regs->r9; /* * From the ABI document: * * @gp_offset - the element holds the offset in bytes from * reg_save_area to the place where the next available general purpose * argument register is saved. In case all argument registers have * been exhausted, it is set to the value 48 (6*8). * * @fp_offset - the element holds the offset in bytes from * reg_save_area to the place where the next available floating point * argument is saved. In case all argument registers have been * exhausted, it is set to the value 176 (6*8 + 8*16) * * @overflow_arg_area - this pointer is used to fetch arguments passed * on the stack. It is initialized with the address of the first * argument passed on the stack, if any, and then always updated to * point to the start of the next argument on the stack. * * @reg_save_area - the element points to the start of the register * save area. * * Notably the vararg starts with the second argument and there are no * floating point arguments in the kernel. */ args->args.gp_offset = 1*8; args->args.fp_offset = 6*8 + 8*16; args->args.reg_save_area = &args->regs; args->args.overflow_arg_area = (void *)regs->sp; /* * If the exception came from __WARN_trap, there is a return * address on the stack, skip that. This is why any __WARN_trap() * caller must inhibit tail-call optimization. */ if ((void *)regs->ip == &__WARN_trap) args->args.overflow_arg_area += 8; return &args->args; } #endif /* HAVE_ARCH_BUG_FORMAT */ static nokprobe_inline int do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str, struct pt_regs *regs, long error_code) { if (v8086_mode(regs)) { /* * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86. * On nmi (interrupt 2), do_trap should not be called. */ if (trapnr < X86_TRAP_UD) { if (!handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr)) return 0; } } else if (!user_mode(regs)) { if (fixup_exception(regs, trapnr, error_code, 0)) return 0; tsk->thread.error_code = error_code; tsk->thread.trap_nr = trapnr; die(str, regs, error_code); } else { if (fixup_vdso_exception(regs, trapnr, error_code, 0)) return 0; } /* * We want error_code and trap_nr set for userspace faults and * kernelspace faults which result in die(), but not * kernelspace faults which are fixed up. die() gives the * process no chance to handle the signal and notice the * kernel fault information, so that won't result in polluting * the information about previously queued, but not yet * delivered, faults. See also exc_general_protection below. */ tsk->thread.error_code = error_code; tsk->thread.trap_nr = trapnr; return -1; } static void show_signal(struct task_struct *tsk, int signr, const char *type, const char *desc, struct pt_regs *regs, long error_code) { if (show_unhandled_signals && unhandled_signal(tsk, signr) && printk_ratelimit()) { pr_info("%s[%d] %s%s ip:%lx sp:%lx error:%lx", tsk->comm, task_pid_nr(tsk), type, desc, regs->ip, regs->sp, error_code); print_vma_addr(KERN_CONT " in ", regs->ip); pr_cont("\n"); } } static void do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, long error_code, int sicode, void __user *addr) { struct task_struct *tsk = current; if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code)) return; show_signal(tsk, signr, "trap ", str, regs, error_code); if (!sicode) force_sig(signr); else force_sig_fault(signr, sicode, addr); } NOKPROBE_SYMBOL(do_trap); static void do_error_trap(struct pt_regs *regs, long error_code, char *str, unsigned long trapnr, int signr, int sicode, void __user *addr) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != NOTIFY_STOP) { cond_local_irq_enable(regs); do_trap(trapnr, signr, str, regs, error_code, sicode, addr); cond_local_irq_disable(regs); } } /* * Posix requires to provide the address of the faulting instruction for * SIGILL (#UD) and SIGFPE (#DE) in the si_addr member of siginfo_t. * * This address is usually regs->ip, but when an uprobe moved the code out * of line then regs->ip points to the XOL code which would confuse * anything which analyzes the fault address vs. the unmodified binary. If * a trap happened in XOL code then uprobe maps regs->ip back to the * original instruction address. */ static __always_inline void __user *error_get_trap_addr(struct pt_regs *regs) { return (void __user *)uprobe_get_trap_addr(regs); } DEFINE_IDTENTRY(exc_divide_error) { do_error_trap(regs, 0, "divide error", X86_TRAP_DE, SIGFPE, FPE_INTDIV, error_get_trap_addr(regs)); } DEFINE_IDTENTRY(exc_overflow) { do_error_trap(regs, 0, "overflow", X86_TRAP_OF, SIGSEGV, 0, NULL); } #ifdef CONFIG_X86_F00F_BUG void handle_invalid_op(struct pt_regs *regs) #else static inline void handle_invalid_op(struct pt_regs *regs) #endif { do_error_trap(regs, 0, "invalid opcode", X86_TRAP_UD, SIGILL, ILL_ILLOPN, error_get_trap_addr(regs)); } static noinstr bool handle_bug(struct pt_regs *regs) { unsigned long addr = regs->ip; bool handled = false; int ud_type, ud_len; s32 ud_imm; ud_type = decode_bug(addr, &ud_imm, &ud_len); if (ud_type == BUG_NONE) return handled; /* * All lies, just get the WARN/BUG out. */ instrumentation_begin(); /* * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug() * is a rare case that uses @regs without passing them to * irqentry_enter(). */ kmsan_unpoison_entry_regs(regs); /* * Since we're emulating a CALL with exceptions, restore the interrupt * state to what it was at the exception site. */ if (regs->flags & X86_EFLAGS_IF) raw_local_irq_enable(); switch (ud_type) { case BUG_UD1_WARN: if (report_bug_entry((void *)pt_regs_val(regs, ud_imm), regs) == BUG_TRAP_TYPE_WARN) handled = true; break; case BUG_UD2: if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) { handled = true; break; } fallthrough; case BUG_UDB: case BUG_LOCK: if (handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) { handled = true; break; } break; case BUG_UD1_UBSAN: if (IS_ENABLED(CONFIG_UBSAN_TRAP)) { pr_crit("%s at %pS\n", report_ubsan_failure(ud_imm), (void *)regs->ip); } break; default: break; } /* * When continuing, and regs->ip hasn't changed, move it to the next * instruction. When not continuing execution, restore the instruction * pointer. */ if (handled) { if (regs->ip == addr) regs->ip += ud_len; } else { regs->ip = addr; } if (regs->flags & X86_EFLAGS_IF) raw_local_irq_disable(); instrumentation_end(); return handled; } DEFINE_IDTENTRY_RAW(exc_invalid_op) { irqentry_state_t state; /* * We use UD2 as a short encoding for 'CALL __WARN', as such * handle it before exception entry to avoid recursive WARN * in case exception entry is the one triggering WARNs. */ if (!user_mode(regs) && handle_bug(regs)) return; state = irqentry_enter(regs); instrumentation_begin(); handle_invalid_op(regs); instrumentation_end(); irqentry_exit(regs, state); } DEFINE_IDTENTRY(exc_coproc_segment_overrun) { do_error_trap(regs, 0, "coprocessor segment overrun", X86_TRAP_OLD_MF, SIGFPE, 0, NULL); } DEFINE_IDTENTRY_ERRORCODE(exc_invalid_tss) { do_error_trap(regs, error_code, "invalid TSS", X86_TRAP_TS, SIGSEGV, 0, NULL); } DEFINE_IDTENTRY_ERRORCODE(exc_segment_not_present) { do_error_trap(regs, error_code, "segment not present", X86_TRAP_NP, SIGBUS, 0, NULL); } DEFINE_IDTENTRY_ERRORCODE(exc_stack_segment) { do_error_trap(regs, error_code, "stack segment", X86_TRAP_SS, SIGBUS, 0, NULL); } DEFINE_IDTENTRY_ERRORCODE(exc_alignment_check) { char *str = "alignment check"; if (notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_AC, SIGBUS) == NOTIFY_STOP) return; if (!user_mode(regs)) die("Split lock detected\n", regs, error_code); local_irq_enable(); if (handle_user_split_lock(regs, error_code)) goto out; do_trap(X86_TRAP_AC, SIGBUS, "alignment check", regs, error_code, BUS_ADRALN, NULL); out: local_irq_disable(); } #ifdef CONFIG_VMAP_STACK __visible void __noreturn handle_stack_overflow(struct pt_regs *regs, unsigned long fault_address, struct stack_info *info) { const char *name = stack_type_name(info->type); printk(KERN_EMERG "BUG: %s stack guard page was hit at %p (stack is %p..%p)\n", name, (void *)fault_address, info->begin, info->end); die("stack guard page", regs, 0); /* Be absolutely certain we don't return. */ panic("%s stack guard hit", name); } #endif /* * Prevent the compiler and/or objtool from marking the !CONFIG_X86_ESPFIX64 * version of exc_double_fault() as noreturn. Otherwise the noreturn mismatch * between configs triggers objtool warnings. * * This is a temporary hack until we have compiler or plugin support for * annotating noreturns. */ #ifdef CONFIG_X86_ESPFIX64 #define always_true() true #else bool always_true(void); bool __weak always_true(void) { return true; } #endif /* * Runs on an IST stack for x86_64 and on a special task stack for x86_32. * * On x86_64, this is more or less a normal kernel entry. Notwithstanding the * SDM's warnings about double faults being unrecoverable, returning works as * expected. Presumably what the SDM actually means is that the CPU may get * the register state wrong on entry, so returning could be a bad idea. * * Various CPU engineers have promised that double faults due to an IRET fault * while the stack is read-only are, in fact, recoverable. * * On x86_32, this is entered through a task gate, and regs are synthesized * from the TSS. Returning is, in principle, okay, but changes to regs will * be lost. If, for some reason, we need to return to a context with modified * regs, the shim code could be adjusted to synchronize the registers. * * The 32bit #DF shim provides CR2 already as an argument. On 64bit it needs * to be read before doing anything else. */ DEFINE_IDTENTRY_DF(exc_double_fault) { static const char str[] = "double fault"; struct task_struct *tsk = current; #ifdef CONFIG_VMAP_STACK unsigned long address = read_cr2(); struct stack_info info; #endif #ifdef CONFIG_X86_ESPFIX64 extern unsigned char native_irq_return_iret[]; /* * If IRET takes a non-IST fault on the espfix64 stack, then we * end up promoting it to a doublefault. In that case, take * advantage of the fact that we're not using the normal (TSS.sp0) * stack right now. We can write a fake #GP(0) frame at TSS.sp0 * and then modify our own IRET frame so that, when we return, * we land directly at the #GP(0) vector with the stack already * set up according to its expectations. * * The net result is that our #GP handler will think that we * entered from usermode with the bad user context. * * No need for nmi_enter() here because we don't use RCU. */ if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY && regs->cs == __KERNEL_CS && regs->ip == (unsigned long)native_irq_return_iret) { struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; unsigned long *p = (unsigned long *)regs->sp; /* * regs->sp points to the failing IRET frame on the * ESPFIX64 stack. Copy it to the entry stack. This fills * in gpregs->ss through gpregs->ip. * */ gpregs->ip = p[0]; gpregs->cs = p[1]; gpregs->flags = p[2]; gpregs->sp = p[3]; gpregs->ss = p[4]; gpregs->orig_ax = 0; /* Missing (lost) #GP error code */ /* * Adjust our frame so that we return straight to the #GP * vector with the expected RSP value. This is safe because * we won't enable interrupts or schedule before we invoke * general_protection, so nothing will clobber the stack * frame we just set up. * * We will enter general_protection with kernel GSBASE, * which is what the stub expects, given that the faulting * RIP will be the IRET instruction. */ regs->ip = (unsigned long)asm_exc_general_protection; regs->sp = (unsigned long)&gpregs->orig_ax; return; } #endif irqentry_nmi_enter(regs); instrumentation_begin(); notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_DF; #ifdef CONFIG_VMAP_STACK /* * If we overflow the stack into a guard page, the CPU will fail * to deliver #PF and will send #DF instead. Similarly, if we * take any non-IST exception while too close to the bottom of * the stack, the processor will get a page fault while * delivering the exception and will generate a double fault. * * According to the SDM (footnote in 6.15 under "Interrupt 14 - * Page-Fault Exception (#PF): * * Processors update CR2 whenever a page fault is detected. If a * second page fault occurs while an earlier page fault is being * delivered, the faulting linear address of the second fault will * overwrite the contents of CR2 (replacing the previous * address). These updates to CR2 occur even if the page fault * results in a double fault or occurs during the delivery of a * double fault. * * The logic below has a small possibility of incorrectly diagnosing * some errors as stack overflows. For example, if the IDT or GDT * gets corrupted such that #GP delivery fails due to a bad descriptor * causing #GP and we hit this condition while CR2 coincidentally * points to the stack guard page, we'll think we overflowed the * stack. Given that we're going to panic one way or another * if this happens, this isn't necessarily worth fixing. * * If necessary, we could improve the test by only diagnosing * a stack overflow if the saved RSP points within 47 bytes of * the bottom of the stack: if RSP == tsk_stack + 48 and we * take an exception, the stack is already aligned and there * will be enough room SS, RSP, RFLAGS, CS, RIP, and a * possible error code, so a stack overflow would *not* double * fault. With any less space left, exception delivery could * fail, and, as a practical matter, we've overflowed the * stack even if the actual trigger for the double fault was * something else. */ if (get_stack_guard_info((void *)address, &info)) handle_stack_overflow(regs, address, &info); #endif pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code); die("double fault", regs, error_code); if (always_true()) panic("Machine halted."); instrumentation_end(); } DEFINE_IDTENTRY(exc_bounds) { if (notify_die(DIE_TRAP, "bounds", regs, 0, X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP) return; cond_local_irq_enable(regs); if (!user_mode(regs)) die("bounds", regs, 0); do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, 0, 0, NULL); cond_local_irq_disable(regs); } enum kernel_gp_hint { GP_NO_HINT, GP_NON_CANONICAL, GP_CANONICAL, GP_LASS_VIOLATION, GP_NULL_POINTER, }; static const char * const kernel_gp_hint_help[] = { [GP_NON_CANONICAL] = "probably for non-canonical address", [GP_CANONICAL] = "maybe for address", [GP_LASS_VIOLATION] = "probably LASS violation for address", [GP_NULL_POINTER] = "kernel NULL pointer dereference", }; /* * When an uncaught #GP occurs, try to determine the memory address accessed by * the instruction and return that address to the caller. Also, try to figure * out whether any part of the access to that address was non-canonical or * across privilege levels. */ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs, unsigned long *addr) { u8 insn_buf[MAX_INSN_SIZE]; struct insn insn; int ret; if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip, MAX_INSN_SIZE)) return GP_NO_HINT; ret = insn_decode_kernel(&insn, insn_buf); if (ret < 0) return GP_NO_HINT; *addr = (unsigned long)insn_get_addr_ref(&insn, regs); if (*addr == -1UL) return GP_NO_HINT; #ifdef CONFIG_X86_64 /* Operand is in the kernel half */ if (*addr >= ~__VIRTUAL_MASK) return GP_CANONICAL; /* The last byte of the operand is not in the user canonical half */ if (*addr + insn.opnd_bytes - 1 > __VIRTUAL_MASK) return GP_NON_CANONICAL; /* * A NULL pointer dereference usually causes a #PF. However, it * can result in a #GP when LASS is active. Provide the same * hint in the rare case that the condition is hit without LASS. */ if (*addr < PAGE_SIZE) return GP_NULL_POINTER; /* * Assume that LASS caused the exception, because the address is * canonical and in the user half. */ if (cpu_feature_enabled(X86_FEATURE_LASS)) return GP_LASS_VIOLATION; #endif return GP_CANONICAL; } #define GPFSTR "general protection fault" static bool fixup_iopl_exception(struct pt_regs *regs) { struct thread_struct *t = &current->thread; unsigned char byte; unsigned long ip; if (!IS_ENABLED(CONFIG_X86_IOPL_IOPERM) || t->iopl_emul != 3) return false; if (insn_get_effective_ip(regs, &ip)) return false; if (get_user(byte, (const char __user *)ip)) return false; if (byte != 0xfa && byte != 0xfb) return false; if (!t->iopl_warn && printk_ratelimit()) { pr_err("%s[%d] attempts to use CLI/STI, pretending it's a NOP, ip:%lx", current->comm, task_pid_nr(current), ip); print_vma_addr(KERN_CONT " in ", ip); pr_cont("\n"); t->iopl_warn = 1; } regs->ip += 1; return true; } /* * The unprivileged ENQCMD instruction generates #GPs if the * IA32_PASID MSR has not been populated. If possible, populate * the MSR from a PASID previously allocated to the mm. */ static bool try_fixup_enqcmd_gp(void) { #ifdef CONFIG_ARCH_HAS_CPU_PASID u32 pasid; /* * MSR_IA32_PASID is managed using XSAVE. Directly * writing to the MSR is only possible when fpregs * are valid and the fpstate is not. This is * guaranteed when handling a userspace exception * in *before* interrupts are re-enabled. */ lockdep_assert_irqs_disabled(); /* * Hardware without ENQCMD will not generate * #GPs that can be fixed up here. */ if (!cpu_feature_enabled(X86_FEATURE_ENQCMD)) return false; /* * If the mm has not been allocated a * PASID, the #GP can not be fixed up. */ if (!mm_valid_pasid(current->mm)) return false; pasid = mm_get_enqcmd_pasid(current->mm); /* * Did this thread already have its PASID activated? * If so, the #GP must be from something else. */ if (current->pasid_activated) return false; wrmsrq(MSR_IA32_PASID, pasid | MSR_IA32_PASID_VALID); current->pasid_activated = 1; return true; #else return false; #endif } static bool gp_try_fixup_and_notify(struct pt_regs *regs, int trapnr, unsigned long error_code, const char *str, unsigned long address) { if (fixup_exception(regs, trapnr, error_code, address)) return true; current->thread.error_code = error_code; current->thread.trap_nr = trapnr; /* * To be potentially processing a kprobe fault and to trust the result * from kprobe_running(), we have to be non-preemptible. */ if (!preemptible() && kprobe_running() && kprobe_fault_handler(regs, trapnr)) return true; return notify_die(DIE_GPF, str, regs, error_code, trapnr, SIGSEGV) == NOTIFY_STOP; } static void gp_user_force_sig_segv(struct pt_regs *regs, int trapnr, unsigned long error_code, const char *str) { current->thread.error_code = error_code; current->thread.trap_nr = trapnr; show_signal(current, SIGSEGV, "", str, regs, error_code); force_sig(SIGSEGV); } DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) { char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR; enum kernel_gp_hint hint = GP_NO_HINT; unsigned long gp_addr; if (user_mode(regs) && try_fixup_enqcmd_gp()) return; cond_local_irq_enable(regs); if (static_cpu_has(X86_FEATURE_UMIP)) { if (user_mode(regs) && fixup_umip_exception(regs)) goto exit; } if (v8086_mode(regs)) { local_irq_enable(); handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); local_irq_disable(); return; } if (user_mode(regs)) { if (fixup_iopl_exception(regs)) goto exit; if (fixup_vdso_exception(regs, X86_TRAP_GP, error_code, 0)) goto exit; gp_user_force_sig_segv(regs, X86_TRAP_GP, error_code, desc); goto exit; } if (gp_try_fixup_and_notify(regs, X86_TRAP_GP, error_code, desc, 0)) goto exit; if (error_code) snprintf(desc, sizeof(desc), "segment-related " GPFSTR); else hint = get_kernel_gp_address(regs, &gp_addr); if (hint != GP_NO_HINT) snprintf(desc, sizeof(desc), GPFSTR ", %s 0x%lx", kernel_gp_hint_help[hint], gp_addr); /* * KASAN is interested only in the non-canonical case, clear it * otherwise. */ if (hint != GP_NON_CANONICAL) gp_addr = 0; die_addr(desc, regs, error_code, gp_addr); exit: cond_local_irq_disable(regs); } static bool do_int3(struct pt_regs *regs) { int res; #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP if (kgdb_ll_trap(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, SIGTRAP) == NOTIFY_STOP) return true; #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ #ifdef CONFIG_KPROBES if (kprobe_int3_handler(regs)) return true; #endif res = notify_die(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, SIGTRAP); return res == NOTIFY_STOP; } NOKPROBE_SYMBOL(do_int3); static void do_int3_user(struct pt_regs *regs) { if (do_int3(regs)) return; cond_local_irq_enable(regs); do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, 0, 0, NULL); cond_local_irq_disable(regs); } DEFINE_IDTENTRY_RAW(exc_int3) { /* * smp_text_poke_int3_handler() is completely self contained code; it does (and * must) *NOT* call out to anything, lest it hits upon yet another * INT3. */ if (smp_text_poke_int3_handler(regs)) return; /* * irqentry_enter_from_user_mode() uses static_branch_{,un}likely() * and therefore can trigger INT3, hence smp_text_poke_int3_handler() must * be done before. If the entry came from kernel mode, then use * nmi_enter() because the INT3 could have been hit in any context * including NMI. */ if (user_mode(regs)) { irqentry_enter_from_user_mode(regs); instrumentation_begin(); do_int3_user(regs); instrumentation_end(); irqentry_exit_to_user_mode(regs); } else { irqentry_state_t irq_state = irqentry_nmi_enter(regs); instrumentation_begin(); if (!do_int3(regs)) die("int3", regs, 0); instrumentation_end(); irqentry_nmi_exit(regs, irq_state); } } #ifdef CONFIG_X86_64 /* * Help handler running on a per-cpu (IST or entry trampoline) stack * to switch to the normal thread stack if the interrupted code was in * user mode. The actual stack switch is done in entry_64.S */ asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs) { struct pt_regs *regs = (struct pt_regs *)current_top_of_stack() - 1; if (regs != eregs) *regs = *eregs; return regs; } #ifdef CONFIG_AMD_MEM_ENCRYPT asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *regs) { unsigned long sp, *stack; struct stack_info info; struct pt_regs *regs_ret; /* * In the SYSCALL entry path the RSP value comes from user-space - don't * trust it and switch to the current kernel stack */ if (ip_within_syscall_gap(regs)) { sp = current_top_of_stack(); goto sync; } /* * From here on the RSP value is trusted. Now check whether entry * happened from a safe stack. Not safe are the entry or unknown stacks, * use the fall-back stack instead in this case. */ sp = regs->sp; stack = (unsigned long *)sp; if (!get_stack_info_noinstr(stack, current, &info) || info.type == STACK_TYPE_ENTRY || info.type > STACK_TYPE_EXCEPTION_LAST) sp = __this_cpu_ist_top_va(VC2); sync: /* * Found a safe stack - switch to it as if the entry didn't happen via * IST stack. The code below only copies pt_regs, the real switch happens * in assembly code. */ sp = ALIGN_DOWN(sp, 8) - sizeof(*regs_ret); regs_ret = (struct pt_regs *)sp; *regs_ret = *regs; return regs_ret; } #endif asmlinkage __visible noinstr struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs) { struct pt_regs tmp, *new_stack; /* * This is called from entry_64.S early in handling a fault * caused by a bad iret to user mode. To handle the fault * correctly, we want to move our stack frame to where it would * be had we entered directly on the entry stack (rather than * just below the IRET frame) and we want to pretend that the * exception came from the IRET target. */ new_stack = (struct pt_regs *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; /* Copy the IRET target to the temporary storage. */ __memcpy(&tmp.ip, (void *)bad_regs->sp, 5*8); /* Copy the remainder of the stack from the current stack. */ __memcpy(&tmp, bad_regs, offsetof(struct pt_regs, ip)); /* Update the entry stack */ __memcpy(new_stack, &tmp, sizeof(tmp)); BUG_ON(!user_mode(new_stack)); return new_stack; } #endif static bool is_sysenter_singlestep(struct pt_regs *regs) { /* * We don't try for precision here. If we're anywhere in the region of * code that can be single-stepped in the SYSENTER entry path, then * assume that this is a useless single-step trap due to SYSENTER * being invoked with TF set. (We don't know in advance exactly * which instructions will be hit because BTF could plausibly * be set.) */ #ifdef CONFIG_X86_32 return (regs->ip - (unsigned long)__begin_SYSENTER_singlestep_region) < (unsigned long)__end_SYSENTER_singlestep_region - (unsigned long)__begin_SYSENTER_singlestep_region; #elif defined(CONFIG_IA32_EMULATION) return (regs->ip - (unsigned long)entry_SYSENTER_compat) < (unsigned long)__end_entry_SYSENTER_compat - (unsigned long)entry_SYSENTER_compat; #else return false; #endif } static __always_inline unsigned long debug_read_reset_dr6(void) { unsigned long dr6; get_debugreg(dr6, 6); dr6 ^= DR6_RESERVED; /* Flip to positive polarity */ /* * The Intel SDM says: * * Certain debug exceptions may clear bits 0-3 of DR6. * * BLD induced #DB clears DR6.BLD and any other debug * exception doesn't modify DR6.BLD. * * RTM induced #DB clears DR6.RTM and any other debug * exception sets DR6.RTM. * * To avoid confusion in identifying debug exceptions, * debug handlers should set DR6.BLD and DR6.RTM, and * clear other DR6 bits before returning. * * Keep it simple: write DR6 with its architectural reset * value 0xFFFF0FF0, defined as DR6_RESERVED, immediately. */ set_debugreg(DR6_RESERVED, 6); return dr6; } /* * Our handling of the processor debug registers is non-trivial. * We do not clear them on entry and exit from the kernel. Therefore * it is possible to get a watchpoint trap here from inside the kernel. * However, the code in ./ptrace.c has ensured that the user can * only set watchpoints on userspace addresses. Therefore the in-kernel * watchpoint trap can only occur in code which is reading/writing * from user space. Such code must not hold kernel locks (since it * can equally take a page fault), therefore it is safe to call * force_sig_info even though that claims and releases locks. * * Code in ./signal.c ensures that the debug control register * is restored before we deliver any signal, and therefore that * user code runs with the correct debug control register even though * we clear it here. * * Being careful here means that we don't have to be as careful in a * lot of more complicated places (task switching can be a bit lazy * about restoring all the debug state, and ptrace doesn't have to * find every occurrence of the TF bit that could be saved away even * by user code) * * May run on IST stack. */ static bool notify_debug(struct pt_regs *regs, unsigned long *dr6) { /* * Notifiers will clear bits in @dr6 to indicate the event has been * consumed - hw_breakpoint_handler(), single_stop_cont(). * * Notifiers will set bits in @virtual_dr6 to indicate the desire * for signals - ptrace_triggered(), kgdb_hw_overflow_handler(). */ if (notify_die(DIE_DEBUG, "debug", regs, (long)dr6, 0, SIGTRAP) == NOTIFY_STOP) return true; return false; } static noinstr void exc_debug_kernel(struct pt_regs *regs, unsigned long dr6) { /* * Disable breakpoints during exception handling; recursive exceptions * are exceedingly 'fun'. * * Since this function is NOKPROBE, and that also applies to * HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a * HW_BREAKPOINT_W on our stack) * * Entry text is excluded for HW_BP_X and cpu_entry_area, which * includes the entry stack is excluded for everything. * * For FRED, nested #DB should just work fine. But when a watchpoint or * breakpoint is set in the code path which is executed by #DB handler, * it results in an endless recursion and stack overflow. Thus we stay * with the IDT approach, i.e., save DR7 and disable #DB. */ unsigned long dr7 = local_db_save(); irqentry_state_t irq_state = irqentry_nmi_enter(regs); instrumentation_begin(); /* * If something gets miswired and we end up here for a user mode * #DB, we will malfunction. */ WARN_ON_ONCE(user_mode(regs)); if (test_thread_flag(TIF_BLOCKSTEP)) { /* * The SDM says "The processor clears the BTF flag when it * generates a debug exception." but PTRACE_BLOCKSTEP requested * it for userspace, but we just took a kernel #DB, so re-set * BTF. */ unsigned long debugctl; rdmsrq(MSR_IA32_DEBUGCTLMSR, debugctl); debugctl |= DEBUGCTLMSR_BTF; wrmsrq(MSR_IA32_DEBUGCTLMSR, debugctl); } /* * Catch SYSENTER with TF set and clear DR_STEP. If this hit a * watchpoint at the same time then that will still be handled. */ if (!cpu_feature_enabled(X86_FEATURE_FRED) && (dr6 & DR_STEP) && is_sysenter_singlestep(regs)) dr6 &= ~DR_STEP; /* * The kernel doesn't use INT1 */ if (!dr6) goto out; if (notify_debug(regs, &dr6)) goto out; /* * The kernel doesn't use TF single-step outside of: * * - Kprobes, consumed through kprobe_debug_handler() * - KGDB, consumed through notify_debug() * * So if we get here with DR_STEP set, something is wonky. * * A known way to trigger this is through QEMU's GDB stub, * which leaks #DB into the guest and causes IST recursion. */ if (WARN_ON_ONCE(dr6 & DR_STEP)) regs->flags &= ~X86_EFLAGS_TF; out: instrumentation_end(); irqentry_nmi_exit(regs, irq_state); local_db_restore(dr7); } static noinstr void exc_debug_user(struct pt_regs *regs, unsigned long dr6) { bool icebp; /* * If something gets miswired and we end up here for a kernel mode * #DB, we will malfunction. */ WARN_ON_ONCE(!user_mode(regs)); /* * NB: We can't easily clear DR7 here because * irqentry_exit_to_usermode() can invoke ptrace, schedule, access * user memory, etc. This means that a recursive #DB is possible. If * this happens, that #DB will hit exc_debug_kernel() and clear DR7. * Since we're not on the IST stack right now, everything will be * fine. */ irqentry_enter_from_user_mode(regs); instrumentation_begin(); /* * Start the virtual/ptrace DR6 value with just the DR_STEP mask * of the real DR6. ptrace_triggered() will set the DR_TRAPn bits. * * Userspace expects DR_STEP to be visible in ptrace_get_debugreg(6) * even if it is not the result of PTRACE_SINGLESTEP. */ current->thread.virtual_dr6 = (dr6 & DR_STEP); /* * The SDM says "The processor clears the BTF flag when it * generates a debug exception." Clear TIF_BLOCKSTEP to keep * TIF_BLOCKSTEP in sync with the hardware BTF flag. */ clear_thread_flag(TIF_BLOCKSTEP); /* * If dr6 has no reason to give us about the origin of this trap, * then it's very likely the result of an icebp/int01 trap. * User wants a sigtrap for that. */ icebp = !dr6; if (notify_debug(regs, &dr6)) goto out; /* It's safe to allow irq's after DR6 has been saved */ local_irq_enable(); if (v8086_mode(regs)) { handle_vm86_trap((struct kernel_vm86_regs *)regs, 0, X86_TRAP_DB); goto out_irq; } /* #DB for bus lock can only be triggered from userspace. */ if (dr6 & DR_BUS_LOCK) handle_bus_lock(regs); /* Add the virtual_dr6 bits for signals. */ dr6 |= current->thread.virtual_dr6; if (dr6 & (DR_STEP | DR_TRAP_BITS) || icebp) send_sigtrap(regs, 0, get_si_code(dr6)); out_irq: local_irq_disable(); out: instrumentation_end(); irqentry_exit_to_user_mode(regs); } #ifdef CONFIG_X86_64 /* IST stack entry */ DEFINE_IDTENTRY_DEBUG(exc_debug) { exc_debug_kernel(regs, debug_read_reset_dr6()); } /* User entry, runs on regular task stack */ DEFINE_IDTENTRY_DEBUG_USER(exc_debug) { exc_debug_user(regs, debug_read_reset_dr6()); } #ifdef CONFIG_X86_FRED /* * When occurred on different ring level, i.e., from user or kernel * context, #DB needs to be handled on different stack: User #DB on * current task stack, while kernel #DB on a dedicated stack. * * This is exactly how FRED event delivery invokes an exception * handler: ring 3 event on level 0 stack, i.e., current task stack; * ring 0 event on the #DB dedicated stack specified in the * IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED debug exception * entry stub doesn't do stack switch. */ DEFINE_FREDENTRY_DEBUG(exc_debug) { /* * FRED #DB stores DR6 on the stack in the format which * debug_read_reset_dr6() returns for the IDT entry points. */ unsigned long dr6 = fred_event_data(regs); if (user_mode(regs)) exc_debug_user(regs, dr6); else exc_debug_kernel(regs, dr6); } #endif /* CONFIG_X86_FRED */ #else /* 32 bit does not have separate entry points. */ DEFINE_IDTENTRY_RAW(exc_debug) { unsigned long dr6 = debug_read_reset_dr6(); if (user_mode(regs)) exc_debug_user(regs, dr6); else exc_debug_kernel(regs, dr6); } #endif /* * Note that we play around with the 'TS' bit in an attempt to get * the correct behaviour even in the presence of the asynchronous * IRQ13 behaviour */ static void math_error(struct pt_regs *regs, int trapnr) { struct task_struct *task = current; struct fpu *fpu = x86_task_fpu(task); int si_code; char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" : "simd exception"; cond_local_irq_enable(regs); if (!user_mode(regs)) { if (fixup_exception(regs, trapnr, 0, 0)) goto exit; task->thread.error_code = 0; task->thread.trap_nr = trapnr; if (notify_die(DIE_TRAP, str, regs, 0, trapnr, SIGFPE) != NOTIFY_STOP) die(str, regs, 0); goto exit; } /* * Synchronize the FPU register state to the memory register state * if necessary. This allows the exception handler to inspect it. */ fpu_sync_fpstate(fpu); task->thread.trap_nr = trapnr; task->thread.error_code = 0; si_code = fpu__exception_code(fpu, trapnr); /* Retry when we get spurious exceptions: */ if (!si_code) goto exit; if (fixup_vdso_exception(regs, trapnr, 0, 0)) goto exit; force_sig_fault(SIGFPE, si_code, (void __user *)uprobe_get_trap_addr(regs)); exit: cond_local_irq_disable(regs); } DEFINE_IDTENTRY(exc_coprocessor_error) { math_error(regs, X86_TRAP_MF); } DEFINE_IDTENTRY(exc_simd_coprocessor_error) { if (IS_ENABLED(CONFIG_X86_INVD_BUG)) { /* AMD 486 bug: INVD in CPL 0 raises #XF instead of #GP */ if (!static_cpu_has(X86_FEATURE_XMM)) { __exc_general_protection(regs, 0); return; } } math_error(regs, X86_TRAP_XF); } DEFINE_IDTENTRY(exc_spurious_interrupt_bug) { /* * This addresses a Pentium Pro Erratum: * * PROBLEM: If the APIC subsystem is configured in mixed mode with * Virtual Wire mode implemented through the local APIC, an * interrupt vector of 0Fh (Intel reserved encoding) may be * generated by the local APIC (Int 15). This vector may be * generated upon receipt of a spurious interrupt (an interrupt * which is removed before the system receives the INTA sequence) * instead of the programmed 8259 spurious interrupt vector. * * IMPLICATION: The spurious interrupt vector programmed in the * 8259 is normally handled by an operating system's spurious * interrupt handler. However, a vector of 0Fh is unknown to some * operating systems, which would crash if this erratum occurred. * * In theory this could be limited to 32bit, but the handler is not * hurting and who knows which other CPUs suffer from this. */ } static bool handle_xfd_event(struct pt_regs *regs) { u64 xfd_err; int err; if (!IS_ENABLED(CONFIG_X86_64) || !cpu_feature_enabled(X86_FEATURE_XFD)) return false; rdmsrq(MSR_IA32_XFD_ERR, xfd_err); if (!xfd_err) return false; wrmsrq(MSR_IA32_XFD_ERR, 0); /* Die if that happens in kernel space */ if (WARN_ON(!user_mode(regs))) return false; local_irq_enable(); err = xfd_enable_feature(xfd_err); switch (err) { case -EPERM: force_sig_fault(SIGILL, ILL_ILLOPC, error_get_trap_addr(regs)); break; case -EFAULT: force_sig(SIGSEGV); break; } local_irq_disable(); return true; } DEFINE_IDTENTRY(exc_device_not_available) { unsigned long cr0 = read_cr0(); if (handle_xfd_event(regs)) return; #ifdef CONFIG_MATH_EMULATION if (!boot_cpu_has(X86_FEATURE_FPU) && (cr0 & X86_CR0_EM)) { struct math_emu_info info = { }; cond_local_irq_enable(regs); info.regs = regs; math_emulate(&info); cond_local_irq_disable(regs); return; } #endif /* This should not happen. */ if (WARN(cr0 & X86_CR0_TS, "CR0.TS was set")) { /* Try to fix it up and carry on. */ write_cr0(cr0 & ~X86_CR0_TS); } else { /* * Something terrible happened, and we're better off trying * to kill the task than getting stuck in a never-ending * loop of #NM faults. */ die("unexpected #NM exception", regs, 0); } } #ifdef CONFIG_INTEL_TDX_GUEST #define VE_FAULT_STR "VE fault" static void ve_raise_fault(struct pt_regs *regs, long error_code, unsigned long address) { if (user_mode(regs)) { gp_user_force_sig_segv(regs, X86_TRAP_VE, error_code, VE_FAULT_STR); return; } if (gp_try_fixup_and_notify(regs, X86_TRAP_VE, error_code, VE_FAULT_STR, address)) { return; } die_addr(VE_FAULT_STR, regs, error_code, address); } /* * Virtualization Exceptions (#VE) are delivered to TDX guests due to * specific guest actions which may happen in either user space or the * kernel: * * * Specific instructions (WBINVD, for example) * * Specific MSR accesses * * Specific CPUID leaf accesses * * Access to specific guest physical addresses * * In the settings that Linux will run in, virtualization exceptions are * never generated on accesses to normal, TD-private memory that has been * accepted (by BIOS or with tdx_enc_status_changed()). * * Syscall entry code has a critical window where the kernel stack is not * yet set up. Any exception in this window leads to hard to debug issues * and can be exploited for privilege escalation. Exceptions in the NMI * entry code also cause issues. Returning from the exception handler with * IRET will re-enable NMIs and nested NMI will corrupt the NMI stack. * * For these reasons, the kernel avoids #VEs during the syscall gap and * the NMI entry code. Entry code paths do not access TD-shared memory, * MMIO regions, use #VE triggering MSRs, instructions, or CPUID leaves * that might generate #VE. VMM can remove memory from TD at any point, * but access to unaccepted (or missing) private memory leads to VM * termination, not to #VE. * * Similarly to page faults and breakpoints, #VEs are allowed in NMI * handlers once the kernel is ready to deal with nested NMIs. * * During #VE delivery, all interrupts, including NMIs, are blocked until * TDGETVEINFO is called. It prevents #VE nesting until the kernel reads * the VE info. * * If a guest kernel action which would normally cause a #VE occurs in * the interrupt-disabled region before TDGETVEINFO, a #DF (fault * exception) is delivered to the guest which will result in an oops. * * The entry code has been audited carefully for following these expectations. * Changes in the entry code have to be audited for correctness vs. this * aspect. Similarly to #PF, #VE in these places will expose kernel to * privilege escalation or may lead to random crashes. */ DEFINE_IDTENTRY(exc_virtualization_exception) { struct ve_info ve; /* * NMIs/Machine-checks/Interrupts will be in a disabled state * till TDGETVEINFO TDCALL is executed. This ensures that VE * info cannot be overwritten by a nested #VE. */ tdx_get_ve_info(&ve); cond_local_irq_enable(regs); /* * If tdx_handle_virt_exception() could not process * it successfully, treat it as #GP(0) and handle it. */ if (!tdx_handle_virt_exception(regs, &ve)) ve_raise_fault(regs, 0, ve.gla); cond_local_irq_disable(regs); } #endif #ifdef CONFIG_X86_32 DEFINE_IDTENTRY_SW(iret_error) { local_irq_enable(); if (notify_die(DIE_TRAP, "iret exception", regs, 0, X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) { do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, 0, ILL_BADSTK, (void __user *)NULL); } local_irq_disable(); } #endif void __init trap_init(void) { /* Init cpu_entry_area before IST entries are set up */ setup_cpu_entry_areas(); /* Init GHCB memory pages when running as an SEV-ES guest */ sev_es_init_vc_handling(); /* Initialize TSS before setting up traps so ISTs work */ cpu_init_exception_handling(true); /* Setup traps as cpu_init() might #GP */ if (!cpu_feature_enabled(X86_FEATURE_FRED)) idt_setup_traps(); cpu_init(); }
64 66 66 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ADFS_FS_H #define _ADFS_FS_H #include <uapi/linux/adfs_fs.h> /* * Calculate the boot block checksum on an ADFS drive. Note that this will * appear to be correct if the sector contains all zeros, so also check that * the disk size is non-zero!!! */ static inline int adfs_checkbblk(unsigned char *ptr) { unsigned int result = 0; unsigned char *p = ptr + 511; do { result = (result & 0xff) + (result >> 8); result = result + *--p; } while (p != ptr); return (result & 0xff) != ptr[511]; } #endif
3 214 99 1 405 1 409 408 83 1 83 406 408 405 407 408 406 402 405 83 273 273 273 215 214 212 4 4 6 13 11 9 9 4 8 3 3 2 3 3 2 3 6 3 5 6 2 4 2 2 2 6 6 13 2 1 1 1 1 1 1 1 1 1 2 6 5 5 5 4 3 3 2 3 3 3 6 10 10 10 10 10 8 3 3 3 3 3 3 2 2 3 3 3 3 4 4 4 4 4 4 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 10 10 10 10 4 4 4 4 7 4 4 1 1 1 1 1 3 2 2 2 2 2 2 10 10 10 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001 Intel Corp. * * This file is part of the SCTP kernel implementation * * This file contains sctp stream maniuplation primitives and helpers. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Xin Long <lucien.xin@gmail.com> */ #include <linux/list.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/stream_sched.h> static void sctp_stream_shrink_out(struct sctp_stream *stream, __u16 outcnt) { struct sctp_association *asoc; struct sctp_chunk *ch, *temp; struct sctp_outq *outq; asoc = container_of(stream, struct sctp_association, stream); outq = &asoc->outqueue; list_for_each_entry_safe(ch, temp, &outq->out_chunk_list, list) { __u16 sid = sctp_chunk_stream_no(ch); if (sid < outcnt) continue; sctp_sched_dequeue_common(outq, ch); /* No need to call dequeue_done here because * the chunks are not scheduled by now. */ /* Mark as failed send. */ sctp_chunk_fail(ch, (__force __u32)SCTP_ERROR_INV_STRM); if (asoc->peer.prsctp_capable && SCTP_PR_PRIO_ENABLED(ch->sinfo.sinfo_flags)) asoc->sent_cnt_removable--; sctp_chunk_free(ch); } } static void sctp_stream_free_ext(struct sctp_stream *stream, __u16 sid) { const struct sctp_sched_ops *sched; if (!SCTP_SO(stream, sid)->ext) return; sched = sctp_sched_ops_from_stream(stream); sched->free_sid(stream, sid); kfree(SCTP_SO(stream, sid)->ext); SCTP_SO(stream, sid)->ext = NULL; } /* Migrates chunks from stream queues to new stream queues if needed, * but not across associations. Also, removes those chunks to streams * higher than the new max. */ static void sctp_stream_outq_migrate(struct sctp_stream *stream, struct sctp_stream *new, __u16 outcnt) { int i; if (stream->outcnt > outcnt) sctp_stream_shrink_out(stream, outcnt); if (new) { /* Here we actually move the old ext stuff into the new * buffer, because we want to keep it. Then * sctp_stream_update will swap ->out pointers. */ for (i = 0; i < outcnt; i++) { sctp_stream_free_ext(new, i); SCTP_SO(new, i)->ext = SCTP_SO(stream, i)->ext; SCTP_SO(stream, i)->ext = NULL; } } for (i = outcnt; i < stream->outcnt; i++) sctp_stream_free_ext(stream, i); } static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt, gfp_t gfp) { int ret; if (outcnt <= stream->outcnt) goto out; ret = genradix_prealloc(&stream->out, outcnt, gfp); if (ret) return ret; out: stream->outcnt = outcnt; return 0; } static int sctp_stream_alloc_in(struct sctp_stream *stream, __u16 incnt, gfp_t gfp) { int ret; if (incnt <= stream->incnt) goto out; ret = genradix_prealloc(&stream->in, incnt, gfp); if (ret) return ret; out: stream->incnt = incnt; return 0; } int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, gfp_t gfp) { const struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); int i, ret = 0; gfp |= __GFP_NOWARN; /* Initial stream->out size may be very big, so free it and alloc * a new one with new outcnt to save memory if needed. */ if (outcnt == stream->outcnt) goto handle_in; /* Filter out chunks queued on streams that won't exist anymore */ sched->unsched_all(stream); sctp_stream_outq_migrate(stream, NULL, outcnt); sched->sched_all(stream); ret = sctp_stream_alloc_out(stream, outcnt, gfp); if (ret) return ret; for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; handle_in: sctp_stream_interleave_init(stream); if (!incnt) return 0; return sctp_stream_alloc_in(stream, incnt, gfp); } int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid) { struct sctp_stream_out_ext *soute; int ret; soute = kzalloc(sizeof(*soute), GFP_KERNEL); if (!soute) return -ENOMEM; SCTP_SO(stream, sid)->ext = soute; ret = sctp_sched_init_sid(stream, sid, GFP_KERNEL); if (ret) { kfree(SCTP_SO(stream, sid)->ext); SCTP_SO(stream, sid)->ext = NULL; } return ret; } void sctp_stream_free(struct sctp_stream *stream) { const struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); int i; sched->unsched_all(stream); for (i = 0; i < stream->outcnt; i++) sctp_stream_free_ext(stream, i); genradix_free(&stream->out); genradix_free(&stream->in); } void sctp_stream_clear(struct sctp_stream *stream) { int i; for (i = 0; i < stream->outcnt; i++) { SCTP_SO(stream, i)->mid = 0; SCTP_SO(stream, i)->mid_uo = 0; } for (i = 0; i < stream->incnt; i++) SCTP_SI(stream, i)->mid = 0; } void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new) { const struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); sched->unsched_all(stream); sctp_stream_outq_migrate(stream, new, new->outcnt); sctp_stream_free(stream); stream->out = new->out; stream->in = new->in; stream->outcnt = new->outcnt; stream->incnt = new->incnt; sched->sched_all(stream); new->out.tree.root = NULL; new->in.tree.root = NULL; new->outcnt = 0; new->incnt = 0; } static int sctp_send_reconf(struct sctp_association *asoc, struct sctp_chunk *chunk) { int retval = 0; retval = sctp_primitive_RECONF(asoc->base.net, asoc, chunk); if (retval) sctp_chunk_free(chunk); return retval; } static bool sctp_stream_outq_is_empty(struct sctp_stream *stream, __u16 str_nums, __be16 *str_list) { struct sctp_association *asoc; __u16 i; asoc = container_of(stream, struct sctp_association, stream); if (!asoc->outqueue.out_qlen) return true; if (!str_nums) return false; for (i = 0; i < str_nums; i++) { __u16 sid = ntohs(str_list[i]); if (SCTP_SO(stream, sid)->ext && !list_empty(&SCTP_SO(stream, sid)->ext->outq)) return false; } return true; } int sctp_send_reset_streams(struct sctp_association *asoc, struct sctp_reset_streams *params) { struct sctp_stream *stream = &asoc->stream; __u16 i, str_nums, *str_list; struct sctp_chunk *chunk; int retval = -EINVAL; __be16 *nstr_list; bool out, in; if (!asoc->peer.reconf_capable || !(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ)) { retval = -ENOPROTOOPT; goto out; } if (asoc->strreset_outstanding) { retval = -EINPROGRESS; goto out; } out = params->srs_flags & SCTP_STREAM_RESET_OUTGOING; in = params->srs_flags & SCTP_STREAM_RESET_INCOMING; if (!out && !in) goto out; str_nums = params->srs_number_streams; str_list = params->srs_stream_list; if (str_nums) { int param_len = 0; if (out) { for (i = 0; i < str_nums; i++) if (str_list[i] >= stream->outcnt) goto out; param_len = str_nums * sizeof(__u16) + sizeof(struct sctp_strreset_outreq); } if (in) { for (i = 0; i < str_nums; i++) if (str_list[i] >= stream->incnt) goto out; param_len += str_nums * sizeof(__u16) + sizeof(struct sctp_strreset_inreq); } if (param_len > SCTP_MAX_CHUNK_LEN - sizeof(struct sctp_reconf_chunk)) goto out; } nstr_list = kcalloc(str_nums, sizeof(__be16), GFP_KERNEL); if (!nstr_list) { retval = -ENOMEM; goto out; } for (i = 0; i < str_nums; i++) nstr_list[i] = htons(str_list[i]); if (out && !sctp_stream_outq_is_empty(stream, str_nums, nstr_list)) { kfree(nstr_list); retval = -EAGAIN; goto out; } chunk = sctp_make_strreset_req(asoc, str_nums, nstr_list, out, in); kfree(nstr_list); if (!chunk) { retval = -ENOMEM; goto out; } if (out) { if (str_nums) for (i = 0; i < str_nums; i++) SCTP_SO(stream, str_list[i])->state = SCTP_STREAM_CLOSED; else for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_CLOSED; } asoc->strreset_chunk = chunk; sctp_chunk_hold(asoc->strreset_chunk); retval = sctp_send_reconf(asoc, chunk); if (retval) { sctp_chunk_put(asoc->strreset_chunk); asoc->strreset_chunk = NULL; if (!out) goto out; if (str_nums) for (i = 0; i < str_nums; i++) SCTP_SO(stream, str_list[i])->state = SCTP_STREAM_OPEN; else for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; goto out; } asoc->strreset_outstanding = out + in; out: return retval; } int sctp_send_reset_assoc(struct sctp_association *asoc) { struct sctp_stream *stream = &asoc->stream; struct sctp_chunk *chunk = NULL; int retval; __u16 i; if (!asoc->peer.reconf_capable || !(asoc->strreset_enable & SCTP_ENABLE_RESET_ASSOC_REQ)) return -ENOPROTOOPT; if (asoc->strreset_outstanding) return -EINPROGRESS; if (!sctp_outq_is_empty(&asoc->outqueue)) return -EAGAIN; chunk = sctp_make_strreset_tsnreq(asoc); if (!chunk) return -ENOMEM; /* Block further xmit of data until this request is completed */ for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_CLOSED; asoc->strreset_chunk = chunk; sctp_chunk_hold(asoc->strreset_chunk); retval = sctp_send_reconf(asoc, chunk); if (retval) { sctp_chunk_put(asoc->strreset_chunk); asoc->strreset_chunk = NULL; for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; return retval; } asoc->strreset_outstanding = 1; return 0; } int sctp_send_add_streams(struct sctp_association *asoc, struct sctp_add_streams *params) { struct sctp_stream *stream = &asoc->stream; struct sctp_chunk *chunk = NULL; int retval; __u32 outcnt, incnt; __u16 out, in; if (!asoc->peer.reconf_capable || !(asoc->strreset_enable & SCTP_ENABLE_CHANGE_ASSOC_REQ)) { retval = -ENOPROTOOPT; goto out; } if (asoc->strreset_outstanding) { retval = -EINPROGRESS; goto out; } out = params->sas_outstrms; in = params->sas_instrms; outcnt = stream->outcnt + out; incnt = stream->incnt + in; if (outcnt > SCTP_MAX_STREAM || incnt > SCTP_MAX_STREAM || (!out && !in)) { retval = -EINVAL; goto out; } if (out) { retval = sctp_stream_alloc_out(stream, outcnt, GFP_KERNEL); if (retval) goto out; } chunk = sctp_make_strreset_addstrm(asoc, out, in); if (!chunk) { retval = -ENOMEM; goto out; } asoc->strreset_chunk = chunk; sctp_chunk_hold(asoc->strreset_chunk); retval = sctp_send_reconf(asoc, chunk); if (retval) { sctp_chunk_put(asoc->strreset_chunk); asoc->strreset_chunk = NULL; goto out; } asoc->strreset_outstanding = !!out + !!in; out: return retval; } static struct sctp_paramhdr *sctp_chunk_lookup_strreset_param( struct sctp_association *asoc, __be32 resp_seq, __be16 type) { struct sctp_chunk *chunk = asoc->strreset_chunk; struct sctp_reconf_chunk *hdr; union sctp_params param; if (!chunk) return NULL; hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr; sctp_walk_params(param, hdr) { /* sctp_strreset_tsnreq is actually the basic structure * of all stream reconf params, so it's safe to use it * to access request_seq. */ struct sctp_strreset_tsnreq *req = param.v; if ((!resp_seq || req->request_seq == resp_seq) && (!type || type == req->param_hdr.type)) return param.v; } return NULL; } static void sctp_update_strreset_result(struct sctp_association *asoc, __u32 result) { asoc->strreset_result[1] = asoc->strreset_result[0]; asoc->strreset_result[0] = result; } struct sctp_chunk *sctp_process_strreset_outreq( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp) { struct sctp_strreset_outreq *outreq = param.v; struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; __be16 *str_p = NULL; __u32 request_seq; __u16 i, nums; request_seq = ntohl(outreq->request_seq); if (ntohl(outreq->send_reset_at_tsn) > sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map)) { result = SCTP_STRRESET_IN_PROGRESS; goto err; } if (TSN_lt(asoc->strreset_inseq, request_seq) || TSN_lt(request_seq, asoc->strreset_inseq - 2)) { result = SCTP_STRRESET_ERR_BAD_SEQNO; goto err; } else if (TSN_lt(request_seq, asoc->strreset_inseq)) { i = asoc->strreset_inseq - request_seq - 1; result = asoc->strreset_result[i]; goto err; } asoc->strreset_inseq++; /* Check strreset_enable after inseq inc, as sender cannot tell * the peer doesn't enable strreset after receiving response with * result denied, as well as to keep consistent with bsd. */ if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ)) goto out; nums = (ntohs(param.p->length) - sizeof(*outreq)) / sizeof(__u16); str_p = outreq->list_of_streams; for (i = 0; i < nums; i++) { if (ntohs(str_p[i]) >= stream->incnt) { result = SCTP_STRRESET_ERR_WRONG_SSN; goto out; } } if (asoc->strreset_chunk) { if (!sctp_chunk_lookup_strreset_param( asoc, outreq->response_seq, SCTP_PARAM_RESET_IN_REQUEST)) { /* same process with outstanding isn't 0 */ result = SCTP_STRRESET_ERR_IN_PROGRESS; goto out; } asoc->strreset_outstanding--; asoc->strreset_outseq++; if (!asoc->strreset_outstanding) { struct sctp_transport *t; t = asoc->strreset_chunk->transport; if (timer_delete(&t->reconf_timer)) sctp_transport_put(t); sctp_chunk_put(asoc->strreset_chunk); asoc->strreset_chunk = NULL; } } if (nums) for (i = 0; i < nums; i++) SCTP_SI(stream, ntohs(str_p[i]))->mid = 0; else for (i = 0; i < stream->incnt; i++) SCTP_SI(stream, i)->mid = 0; result = SCTP_STRRESET_PERFORMED; *evp = sctp_ulpevent_make_stream_reset_event(asoc, SCTP_STREAM_RESET_INCOMING_SSN, nums, str_p, GFP_ATOMIC); out: sctp_update_strreset_result(asoc, result); err: return sctp_make_strreset_resp(asoc, result, request_seq); } struct sctp_chunk *sctp_process_strreset_inreq( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp) { struct sctp_strreset_inreq *inreq = param.v; struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; struct sctp_chunk *chunk = NULL; __u32 request_seq; __u16 i, nums; __be16 *str_p; request_seq = ntohl(inreq->request_seq); if (TSN_lt(asoc->strreset_inseq, request_seq) || TSN_lt(request_seq, asoc->strreset_inseq - 2)) { result = SCTP_STRRESET_ERR_BAD_SEQNO; goto err; } else if (TSN_lt(request_seq, asoc->strreset_inseq)) { i = asoc->strreset_inseq - request_seq - 1; result = asoc->strreset_result[i]; if (result == SCTP_STRRESET_PERFORMED) return NULL; goto err; } asoc->strreset_inseq++; if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ)) goto out; if (asoc->strreset_outstanding) { result = SCTP_STRRESET_ERR_IN_PROGRESS; goto out; } nums = (ntohs(param.p->length) - sizeof(*inreq)) / sizeof(__u16); str_p = inreq->list_of_streams; for (i = 0; i < nums; i++) { if (ntohs(str_p[i]) >= stream->outcnt) { result = SCTP_STRRESET_ERR_WRONG_SSN; goto out; } } if (!sctp_stream_outq_is_empty(stream, nums, str_p)) { result = SCTP_STRRESET_IN_PROGRESS; asoc->strreset_inseq--; goto err; } chunk = sctp_make_strreset_req(asoc, nums, str_p, 1, 0); if (!chunk) goto out; if (nums) for (i = 0; i < nums; i++) SCTP_SO(stream, ntohs(str_p[i]))->state = SCTP_STREAM_CLOSED; else for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_CLOSED; asoc->strreset_chunk = chunk; asoc->strreset_outstanding = 1; sctp_chunk_hold(asoc->strreset_chunk); result = SCTP_STRRESET_PERFORMED; out: sctp_update_strreset_result(asoc, result); err: if (!chunk) chunk = sctp_make_strreset_resp(asoc, result, request_seq); return chunk; } struct sctp_chunk *sctp_process_strreset_tsnreq( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp) { __u32 init_tsn = 0, next_tsn = 0, max_tsn_seen; struct sctp_strreset_tsnreq *tsnreq = param.v; struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; __u32 request_seq; __u16 i; request_seq = ntohl(tsnreq->request_seq); if (TSN_lt(asoc->strreset_inseq, request_seq) || TSN_lt(request_seq, asoc->strreset_inseq - 2)) { result = SCTP_STRRESET_ERR_BAD_SEQNO; goto err; } else if (TSN_lt(request_seq, asoc->strreset_inseq)) { i = asoc->strreset_inseq - request_seq - 1; result = asoc->strreset_result[i]; if (result == SCTP_STRRESET_PERFORMED) { next_tsn = asoc->ctsn_ack_point + 1; init_tsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map) + 1; } goto err; } if (!sctp_outq_is_empty(&asoc->outqueue)) { result = SCTP_STRRESET_IN_PROGRESS; goto err; } asoc->strreset_inseq++; if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_ASSOC_REQ)) goto out; if (asoc->strreset_outstanding) { result = SCTP_STRRESET_ERR_IN_PROGRESS; goto out; } /* G4: The same processing as though a FWD-TSN chunk (as defined in * [RFC3758]) with all streams affected and a new cumulative TSN * ACK of the Receiver's Next TSN minus 1 were received MUST be * performed. */ max_tsn_seen = sctp_tsnmap_get_max_tsn_seen(&asoc->peer.tsn_map); asoc->stream.si->report_ftsn(&asoc->ulpq, max_tsn_seen); /* G1: Compute an appropriate value for the Receiver's Next TSN -- the * TSN that the peer should use to send the next DATA chunk. The * value SHOULD be the smallest TSN not acknowledged by the * receiver of the request plus 2^31. */ init_tsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map) + (1U << 31); sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL, init_tsn, GFP_ATOMIC); /* G3: The same processing as though a SACK chunk with no gap report * and a cumulative TSN ACK of the Sender's Next TSN minus 1 were * received MUST be performed. */ sctp_outq_free(&asoc->outqueue); /* G2: Compute an appropriate value for the local endpoint's next TSN, * i.e., the next TSN assigned by the receiver of the SSN/TSN reset * chunk. The value SHOULD be the highest TSN sent by the receiver * of the request plus 1. */ next_tsn = asoc->next_tsn; asoc->ctsn_ack_point = next_tsn - 1; asoc->adv_peer_ack_point = asoc->ctsn_ack_point; /* G5: The next expected and outgoing SSNs MUST be reset to 0 for all * incoming and outgoing streams. */ for (i = 0; i < stream->outcnt; i++) { SCTP_SO(stream, i)->mid = 0; SCTP_SO(stream, i)->mid_uo = 0; } for (i = 0; i < stream->incnt; i++) SCTP_SI(stream, i)->mid = 0; result = SCTP_STRRESET_PERFORMED; *evp = sctp_ulpevent_make_assoc_reset_event(asoc, 0, init_tsn, next_tsn, GFP_ATOMIC); out: sctp_update_strreset_result(asoc, result); err: return sctp_make_strreset_tsnresp(asoc, result, request_seq, next_tsn, init_tsn); } struct sctp_chunk *sctp_process_strreset_addstrm_out( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp) { struct sctp_strreset_addstrm *addstrm = param.v; struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; __u32 request_seq, incnt; __u16 in, i; request_seq = ntohl(addstrm->request_seq); if (TSN_lt(asoc->strreset_inseq, request_seq) || TSN_lt(request_seq, asoc->strreset_inseq - 2)) { result = SCTP_STRRESET_ERR_BAD_SEQNO; goto err; } else if (TSN_lt(request_seq, asoc->strreset_inseq)) { i = asoc->strreset_inseq - request_seq - 1; result = asoc->strreset_result[i]; goto err; } asoc->strreset_inseq++; if (!(asoc->strreset_enable & SCTP_ENABLE_CHANGE_ASSOC_REQ)) goto out; in = ntohs(addstrm->number_of_streams); incnt = stream->incnt + in; if (!in || incnt > SCTP_MAX_STREAM) goto out; if (sctp_stream_alloc_in(stream, incnt, GFP_ATOMIC)) goto out; if (asoc->strreset_chunk) { if (!sctp_chunk_lookup_strreset_param( asoc, 0, SCTP_PARAM_RESET_ADD_IN_STREAMS)) { /* same process with outstanding isn't 0 */ result = SCTP_STRRESET_ERR_IN_PROGRESS; goto out; } asoc->strreset_outstanding--; asoc->strreset_outseq++; if (!asoc->strreset_outstanding) { struct sctp_transport *t; t = asoc->strreset_chunk->transport; if (timer_delete(&t->reconf_timer)) sctp_transport_put(t); sctp_chunk_put(asoc->strreset_chunk); asoc->strreset_chunk = NULL; } } stream->incnt = incnt; result = SCTP_STRRESET_PERFORMED; *evp = sctp_ulpevent_make_stream_change_event(asoc, 0, ntohs(addstrm->number_of_streams), 0, GFP_ATOMIC); out: sctp_update_strreset_result(asoc, result); err: return sctp_make_strreset_resp(asoc, result, request_seq); } struct sctp_chunk *sctp_process_strreset_addstrm_in( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp) { struct sctp_strreset_addstrm *addstrm = param.v; struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; struct sctp_chunk *chunk = NULL; __u32 request_seq, outcnt; __u16 out, i; int ret; request_seq = ntohl(addstrm->request_seq); if (TSN_lt(asoc->strreset_inseq, request_seq) || TSN_lt(request_seq, asoc->strreset_inseq - 2)) { result = SCTP_STRRESET_ERR_BAD_SEQNO; goto err; } else if (TSN_lt(request_seq, asoc->strreset_inseq)) { i = asoc->strreset_inseq - request_seq - 1; result = asoc->strreset_result[i]; if (result == SCTP_STRRESET_PERFORMED) return NULL; goto err; } asoc->strreset_inseq++; if (!(asoc->strreset_enable & SCTP_ENABLE_CHANGE_ASSOC_REQ)) goto out; if (asoc->strreset_outstanding) { result = SCTP_STRRESET_ERR_IN_PROGRESS; goto out; } out = ntohs(addstrm->number_of_streams); outcnt = stream->outcnt + out; if (!out || outcnt > SCTP_MAX_STREAM) goto out; ret = sctp_stream_alloc_out(stream, outcnt, GFP_ATOMIC); if (ret) goto out; chunk = sctp_make_strreset_addstrm(asoc, out, 0); if (!chunk) goto out; asoc->strreset_chunk = chunk; asoc->strreset_outstanding = 1; sctp_chunk_hold(asoc->strreset_chunk); stream->outcnt = outcnt; result = SCTP_STRRESET_PERFORMED; out: sctp_update_strreset_result(asoc, result); err: if (!chunk) chunk = sctp_make_strreset_resp(asoc, result, request_seq); return chunk; } struct sctp_chunk *sctp_process_strreset_resp( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp) { struct sctp_stream *stream = &asoc->stream; struct sctp_strreset_resp *resp = param.v; struct sctp_transport *t; __u16 i, nums, flags = 0; struct sctp_paramhdr *req; __u32 result; req = sctp_chunk_lookup_strreset_param(asoc, resp->response_seq, 0); if (!req) return NULL; result = ntohl(resp->result); if (result != SCTP_STRRESET_PERFORMED) { /* if in progress, do nothing but retransmit */ if (result == SCTP_STRRESET_IN_PROGRESS) return NULL; else if (result == SCTP_STRRESET_DENIED) flags = SCTP_STREAM_RESET_DENIED; else flags = SCTP_STREAM_RESET_FAILED; } if (req->type == SCTP_PARAM_RESET_OUT_REQUEST) { struct sctp_strreset_outreq *outreq; __be16 *str_p; outreq = (struct sctp_strreset_outreq *)req; str_p = outreq->list_of_streams; nums = (ntohs(outreq->param_hdr.length) - sizeof(*outreq)) / sizeof(__u16); if (result == SCTP_STRRESET_PERFORMED) { struct sctp_stream_out *sout; if (nums) { for (i = 0; i < nums; i++) { sout = SCTP_SO(stream, ntohs(str_p[i])); sout->mid = 0; sout->mid_uo = 0; } } else { for (i = 0; i < stream->outcnt; i++) { sout = SCTP_SO(stream, i); sout->mid = 0; sout->mid_uo = 0; } } } flags |= SCTP_STREAM_RESET_OUTGOING_SSN; for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; *evp = sctp_ulpevent_make_stream_reset_event(asoc, flags, nums, str_p, GFP_ATOMIC); } else if (req->type == SCTP_PARAM_RESET_IN_REQUEST) { struct sctp_strreset_inreq *inreq; __be16 *str_p; /* if the result is performed, it's impossible for inreq */ if (result == SCTP_STRRESET_PERFORMED) return NULL; inreq = (struct sctp_strreset_inreq *)req; str_p = inreq->list_of_streams; nums = (ntohs(inreq->param_hdr.length) - sizeof(*inreq)) / sizeof(__u16); flags |= SCTP_STREAM_RESET_INCOMING_SSN; *evp = sctp_ulpevent_make_stream_reset_event(asoc, flags, nums, str_p, GFP_ATOMIC); } else if (req->type == SCTP_PARAM_RESET_TSN_REQUEST) { struct sctp_strreset_resptsn *resptsn; __u32 stsn, rtsn; /* check for resptsn, as sctp_verify_reconf didn't do it*/ if (ntohs(param.p->length) != sizeof(*resptsn)) return NULL; resptsn = (struct sctp_strreset_resptsn *)resp; stsn = ntohl(resptsn->senders_next_tsn); rtsn = ntohl(resptsn->receivers_next_tsn); if (result == SCTP_STRRESET_PERFORMED) { __u32 mtsn = sctp_tsnmap_get_max_tsn_seen( &asoc->peer.tsn_map); LIST_HEAD(temp); asoc->stream.si->report_ftsn(&asoc->ulpq, mtsn); sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL, stsn, GFP_ATOMIC); /* Clean up sacked and abandoned queues only. As the * out_chunk_list may not be empty, splice it to temp, * then get it back after sctp_outq_free is done. */ list_splice_init(&asoc->outqueue.out_chunk_list, &temp); sctp_outq_free(&asoc->outqueue); list_splice_init(&temp, &asoc->outqueue.out_chunk_list); asoc->next_tsn = rtsn; asoc->ctsn_ack_point = asoc->next_tsn - 1; asoc->adv_peer_ack_point = asoc->ctsn_ack_point; for (i = 0; i < stream->outcnt; i++) { SCTP_SO(stream, i)->mid = 0; SCTP_SO(stream, i)->mid_uo = 0; } for (i = 0; i < stream->incnt; i++) SCTP_SI(stream, i)->mid = 0; } for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; *evp = sctp_ulpevent_make_assoc_reset_event(asoc, flags, stsn, rtsn, GFP_ATOMIC); } else if (req->type == SCTP_PARAM_RESET_ADD_OUT_STREAMS) { struct sctp_strreset_addstrm *addstrm; __u16 number; addstrm = (struct sctp_strreset_addstrm *)req; nums = ntohs(addstrm->number_of_streams); number = stream->outcnt - nums; if (result == SCTP_STRRESET_PERFORMED) { for (i = number; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; } else { sctp_stream_shrink_out(stream, number); stream->outcnt = number; } *evp = sctp_ulpevent_make_stream_change_event(asoc, flags, 0, nums, GFP_ATOMIC); } else if (req->type == SCTP_PARAM_RESET_ADD_IN_STREAMS) { struct sctp_strreset_addstrm *addstrm; /* if the result is performed, it's impossible for addstrm in * request. */ if (result == SCTP_STRRESET_PERFORMED) return NULL; addstrm = (struct sctp_strreset_addstrm *)req; nums = ntohs(addstrm->number_of_streams); *evp = sctp_ulpevent_make_stream_change_event(asoc, flags, nums, 0, GFP_ATOMIC); } asoc->strreset_outstanding--; asoc->strreset_outseq++; /* remove everything for this reconf request */ if (!asoc->strreset_outstanding) { t = asoc->strreset_chunk->transport; if (timer_delete(&t->reconf_timer)) sctp_transport_put(t); sctp_chunk_put(asoc->strreset_chunk); asoc->strreset_chunk = NULL; } return NULL; }
39 9 11 39 39 38 39 38 39 39 39 273 272 273 622 622 623 39 39 827 828 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 // SPDX-License-Identifier: GPL-2.0-or-later /* * Asynchronous Compression operations * * Copyright (c) 2016, Intel Corporation * Authors: Weigang Li <weigang.li@intel.com> * Giovanni Cabiddu <giovanni.cabiddu@intel.com> */ #include <crypto/internal/acompress.h> #include <crypto/scatterwalk.h> #include <linux/cryptouser.h> #include <linux/cpumask.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/percpu.h> #include <linux/scatterlist.h> #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/smp.h> #include <linux/spinlock.h> #include <linux/string.h> #include <linux/workqueue.h> #include <net/netlink.h> #include "compress.h" struct crypto_scomp; enum { ACOMP_WALK_SLEEP = 1 << 0, ACOMP_WALK_SRC_LINEAR = 1 << 1, ACOMP_WALK_DST_LINEAR = 1 << 2, }; static const struct crypto_type crypto_acomp_type; static void acomp_reqchain_done(void *data, int err); static inline struct acomp_alg *__crypto_acomp_alg(struct crypto_alg *alg) { return container_of(alg, struct acomp_alg, calg.base); } static inline struct acomp_alg *crypto_acomp_alg(struct crypto_acomp *tfm) { return __crypto_acomp_alg(crypto_acomp_tfm(tfm)->__crt_alg); } static int __maybe_unused crypto_acomp_report( struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_report_acomp racomp; memset(&racomp, 0, sizeof(racomp)); strscpy(racomp.type, "acomp", sizeof(racomp.type)); return nla_put(skb, CRYPTOCFGA_REPORT_ACOMP, sizeof(racomp), &racomp); } static void crypto_acomp_show(struct seq_file *m, struct crypto_alg *alg) __maybe_unused; static void crypto_acomp_show(struct seq_file *m, struct crypto_alg *alg) { seq_puts(m, "type : acomp\n"); } static void crypto_acomp_exit_tfm(struct crypto_tfm *tfm) { struct crypto_acomp *acomp = __crypto_acomp_tfm(tfm); struct acomp_alg *alg = crypto_acomp_alg(acomp); if (alg->exit) alg->exit(acomp); if (acomp_is_async(acomp)) crypto_free_acomp(crypto_acomp_fb(acomp)); } static int crypto_acomp_init_tfm(struct crypto_tfm *tfm) { struct crypto_acomp *acomp = __crypto_acomp_tfm(tfm); struct acomp_alg *alg = crypto_acomp_alg(acomp); struct crypto_acomp *fb = NULL; int err; if (tfm->__crt_alg->cra_type != &crypto_acomp_type) return crypto_init_scomp_ops_async(tfm); if (acomp_is_async(acomp)) { fb = crypto_alloc_acomp(crypto_acomp_alg_name(acomp), 0, CRYPTO_ALG_ASYNC); if (IS_ERR(fb)) return PTR_ERR(fb); err = -EINVAL; if (crypto_acomp_reqsize(fb) > MAX_SYNC_COMP_REQSIZE) goto out_free_fb; tfm->fb = crypto_acomp_tfm(fb); } acomp->compress = alg->compress; acomp->decompress = alg->decompress; acomp->reqsize = alg->base.cra_reqsize; acomp->base.exit = crypto_acomp_exit_tfm; if (!alg->init) return 0; err = alg->init(acomp); if (err) goto out_free_fb; return 0; out_free_fb: crypto_free_acomp(fb); return err; } static unsigned int crypto_acomp_extsize(struct crypto_alg *alg) { int extsize = crypto_alg_extsize(alg); if (alg->cra_type != &crypto_acomp_type) extsize += sizeof(struct crypto_scomp *); return extsize; } static const struct crypto_type crypto_acomp_type = { .extsize = crypto_acomp_extsize, .init_tfm = crypto_acomp_init_tfm, #ifdef CONFIG_PROC_FS .show = crypto_acomp_show, #endif #if IS_ENABLED(CONFIG_CRYPTO_USER) .report = crypto_acomp_report, #endif .maskclear = ~CRYPTO_ALG_TYPE_MASK, .maskset = CRYPTO_ALG_TYPE_ACOMPRESS_MASK, .type = CRYPTO_ALG_TYPE_ACOMPRESS, .tfmsize = offsetof(struct crypto_acomp, base), .algsize = offsetof(struct acomp_alg, base), }; struct crypto_acomp *crypto_alloc_acomp(const char *alg_name, u32 type, u32 mask) { return crypto_alloc_tfm(alg_name, &crypto_acomp_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_alloc_acomp); struct crypto_acomp *crypto_alloc_acomp_node(const char *alg_name, u32 type, u32 mask, int node) { return crypto_alloc_tfm_node(alg_name, &crypto_acomp_type, type, mask, node); } EXPORT_SYMBOL_GPL(crypto_alloc_acomp_node); static void acomp_save_req(struct acomp_req *req, crypto_completion_t cplt) { struct acomp_req_chain *state = &req->chain; state->compl = req->base.complete; state->data = req->base.data; req->base.complete = cplt; req->base.data = state; } static void acomp_restore_req(struct acomp_req *req) { struct acomp_req_chain *state = req->base.data; req->base.complete = state->compl; req->base.data = state->data; } static void acomp_reqchain_virt(struct acomp_req *req) { struct acomp_req_chain *state = &req->chain; unsigned int slen = req->slen; unsigned int dlen = req->dlen; if (state->flags & CRYPTO_ACOMP_REQ_SRC_VIRT) acomp_request_set_src_dma(req, state->src, slen); if (state->flags & CRYPTO_ACOMP_REQ_DST_VIRT) acomp_request_set_dst_dma(req, state->dst, dlen); } static void acomp_virt_to_sg(struct acomp_req *req) { struct acomp_req_chain *state = &req->chain; state->flags = req->base.flags & (CRYPTO_ACOMP_REQ_SRC_VIRT | CRYPTO_ACOMP_REQ_DST_VIRT); if (acomp_request_src_isvirt(req)) { unsigned int slen = req->slen; const u8 *svirt = req->svirt; state->src = svirt; sg_init_one(&state->ssg, svirt, slen); acomp_request_set_src_sg(req, &state->ssg, slen); } if (acomp_request_dst_isvirt(req)) { unsigned int dlen = req->dlen; u8 *dvirt = req->dvirt; state->dst = dvirt; sg_init_one(&state->dsg, dvirt, dlen); acomp_request_set_dst_sg(req, &state->dsg, dlen); } } static int acomp_do_nondma(struct acomp_req *req, bool comp) { ACOMP_FBREQ_ON_STACK(fbreq, req); int err; if (comp) err = crypto_acomp_compress(fbreq); else err = crypto_acomp_decompress(fbreq); req->dlen = fbreq->dlen; return err; } static int acomp_do_one_req(struct acomp_req *req, bool comp) { if (acomp_request_isnondma(req)) return acomp_do_nondma(req, comp); acomp_virt_to_sg(req); return comp ? crypto_acomp_reqtfm(req)->compress(req) : crypto_acomp_reqtfm(req)->decompress(req); } static int acomp_reqchain_finish(struct acomp_req *req, int err) { acomp_reqchain_virt(req); acomp_restore_req(req); return err; } static void acomp_reqchain_done(void *data, int err) { struct acomp_req *req = data; crypto_completion_t compl; compl = req->chain.compl; data = req->chain.data; if (err == -EINPROGRESS) goto notify; err = acomp_reqchain_finish(req, err); notify: compl(data, err); } static int acomp_do_req_chain(struct acomp_req *req, bool comp) { int err; acomp_save_req(req, acomp_reqchain_done); err = acomp_do_one_req(req, comp); if (err == -EBUSY || err == -EINPROGRESS) return err; return acomp_reqchain_finish(req, err); } int crypto_acomp_compress(struct acomp_req *req) { struct crypto_acomp *tfm = crypto_acomp_reqtfm(req); if (acomp_req_on_stack(req) && acomp_is_async(tfm)) return -EAGAIN; if (crypto_acomp_req_virt(tfm) || acomp_request_issg(req)) return crypto_acomp_reqtfm(req)->compress(req); return acomp_do_req_chain(req, true); } EXPORT_SYMBOL_GPL(crypto_acomp_compress); int crypto_acomp_decompress(struct acomp_req *req) { struct crypto_acomp *tfm = crypto_acomp_reqtfm(req); if (acomp_req_on_stack(req) && acomp_is_async(tfm)) return -EAGAIN; if (crypto_acomp_req_virt(tfm) || acomp_request_issg(req)) return crypto_acomp_reqtfm(req)->decompress(req); return acomp_do_req_chain(req, false); } EXPORT_SYMBOL_GPL(crypto_acomp_decompress); void comp_prepare_alg(struct comp_alg_common *alg) { struct crypto_alg *base = &alg->base; base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK; } int crypto_register_acomp(struct acomp_alg *alg) { struct crypto_alg *base = &alg->calg.base; comp_prepare_alg(&alg->calg); base->cra_type = &crypto_acomp_type; base->cra_flags |= CRYPTO_ALG_TYPE_ACOMPRESS; return crypto_register_alg(base); } EXPORT_SYMBOL_GPL(crypto_register_acomp); void crypto_unregister_acomp(struct acomp_alg *alg) { crypto_unregister_alg(&alg->base); } EXPORT_SYMBOL_GPL(crypto_unregister_acomp); int crypto_register_acomps(struct acomp_alg *algs, int count) { int i, ret; for (i = 0; i < count; i++) { ret = crypto_register_acomp(&algs[i]); if (ret) goto err; } return 0; err: for (--i; i >= 0; --i) crypto_unregister_acomp(&algs[i]); return ret; } EXPORT_SYMBOL_GPL(crypto_register_acomps); void crypto_unregister_acomps(struct acomp_alg *algs, int count) { int i; for (i = count - 1; i >= 0; --i) crypto_unregister_acomp(&algs[i]); } EXPORT_SYMBOL_GPL(crypto_unregister_acomps); static void acomp_stream_workfn(struct work_struct *work) { struct crypto_acomp_streams *s = container_of(work, struct crypto_acomp_streams, stream_work); struct crypto_acomp_stream __percpu *streams = s->streams; int cpu; for_each_cpu(cpu, &s->stream_want) { struct crypto_acomp_stream *ps; void *ctx; ps = per_cpu_ptr(streams, cpu); if (ps->ctx) continue; ctx = s->alloc_ctx(); if (IS_ERR(ctx)) break; spin_lock_bh(&ps->lock); ps->ctx = ctx; spin_unlock_bh(&ps->lock); cpumask_clear_cpu(cpu, &s->stream_want); } } void crypto_acomp_free_streams(struct crypto_acomp_streams *s) { struct crypto_acomp_stream __percpu *streams = s->streams; void (*free_ctx)(void *); int i; s->streams = NULL; if (!streams) return; cancel_work_sync(&s->stream_work); free_ctx = s->free_ctx; for_each_possible_cpu(i) { struct crypto_acomp_stream *ps = per_cpu_ptr(streams, i); if (!ps->ctx) continue; free_ctx(ps->ctx); } free_percpu(streams); } EXPORT_SYMBOL_GPL(crypto_acomp_free_streams); int crypto_acomp_alloc_streams(struct crypto_acomp_streams *s) { struct crypto_acomp_stream __percpu *streams; struct crypto_acomp_stream *ps; unsigned int i; void *ctx; if (s->streams) return 0; streams = alloc_percpu(struct crypto_acomp_stream); if (!streams) return -ENOMEM; ctx = s->alloc_ctx(); if (IS_ERR(ctx)) { free_percpu(streams); return PTR_ERR(ctx); } i = cpumask_first(cpu_possible_mask); ps = per_cpu_ptr(streams, i); ps->ctx = ctx; for_each_possible_cpu(i) { ps = per_cpu_ptr(streams, i); spin_lock_init(&ps->lock); } s->streams = streams; INIT_WORK(&s->stream_work, acomp_stream_workfn); return 0; } EXPORT_SYMBOL_GPL(crypto_acomp_alloc_streams); struct crypto_acomp_stream *crypto_acomp_lock_stream_bh( struct crypto_acomp_streams *s) __acquires(stream) { struct crypto_acomp_stream __percpu *streams = s->streams; int cpu = raw_smp_processor_id(); struct crypto_acomp_stream *ps; ps = per_cpu_ptr(streams, cpu); spin_lock_bh(&ps->lock); if (likely(ps->ctx)) return ps; spin_unlock(&ps->lock); cpumask_set_cpu(cpu, &s->stream_want); schedule_work(&s->stream_work); ps = per_cpu_ptr(streams, cpumask_first(cpu_possible_mask)); spin_lock(&ps->lock); return ps; } EXPORT_SYMBOL_GPL(crypto_acomp_lock_stream_bh); void acomp_walk_done_src(struct acomp_walk *walk, int used) { walk->slen -= used; if ((walk->flags & ACOMP_WALK_SRC_LINEAR)) scatterwalk_advance(&walk->in, used); else scatterwalk_done_src(&walk->in, used); if ((walk->flags & ACOMP_WALK_SLEEP)) cond_resched(); } EXPORT_SYMBOL_GPL(acomp_walk_done_src); void acomp_walk_done_dst(struct acomp_walk *walk, int used) { walk->dlen -= used; if ((walk->flags & ACOMP_WALK_DST_LINEAR)) scatterwalk_advance(&walk->out, used); else scatterwalk_done_dst(&walk->out, used); if ((walk->flags & ACOMP_WALK_SLEEP)) cond_resched(); } EXPORT_SYMBOL_GPL(acomp_walk_done_dst); int acomp_walk_next_src(struct acomp_walk *walk) { unsigned int slen = walk->slen; unsigned int max = UINT_MAX; if (!preempt_model_preemptible() && (walk->flags & ACOMP_WALK_SLEEP)) max = PAGE_SIZE; if ((walk->flags & ACOMP_WALK_SRC_LINEAR)) { walk->in.__addr = (void *)(((u8 *)walk->in.sg) + walk->in.offset); return min(slen, max); } return slen ? scatterwalk_next(&walk->in, slen) : 0; } EXPORT_SYMBOL_GPL(acomp_walk_next_src); int acomp_walk_next_dst(struct acomp_walk *walk) { unsigned int dlen = walk->dlen; unsigned int max = UINT_MAX; if (!preempt_model_preemptible() && (walk->flags & ACOMP_WALK_SLEEP)) max = PAGE_SIZE; if ((walk->flags & ACOMP_WALK_DST_LINEAR)) { walk->out.__addr = (void *)(((u8 *)walk->out.sg) + walk->out.offset); return min(dlen, max); } return dlen ? scatterwalk_next(&walk->out, dlen) : 0; } EXPORT_SYMBOL_GPL(acomp_walk_next_dst); int acomp_walk_virt(struct acomp_walk *__restrict walk, struct acomp_req *__restrict req, bool atomic) { struct scatterlist *src = req->src; struct scatterlist *dst = req->dst; walk->slen = req->slen; walk->dlen = req->dlen; if (!walk->slen || !walk->dlen) return -EINVAL; walk->flags = 0; if ((req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) && !atomic) walk->flags |= ACOMP_WALK_SLEEP; if ((req->base.flags & CRYPTO_ACOMP_REQ_SRC_VIRT)) walk->flags |= ACOMP_WALK_SRC_LINEAR; if ((req->base.flags & CRYPTO_ACOMP_REQ_DST_VIRT)) walk->flags |= ACOMP_WALK_DST_LINEAR; if ((walk->flags & ACOMP_WALK_SRC_LINEAR)) { walk->in.sg = (void *)req->svirt; walk->in.offset = 0; } else scatterwalk_start(&walk->in, src); if ((walk->flags & ACOMP_WALK_DST_LINEAR)) { walk->out.sg = (void *)req->dvirt; walk->out.offset = 0; } else scatterwalk_start(&walk->out, dst); return 0; } EXPORT_SYMBOL_GPL(acomp_walk_virt); struct acomp_req *acomp_request_clone(struct acomp_req *req, size_t total, gfp_t gfp) { struct acomp_req *nreq; nreq = container_of(crypto_request_clone(&req->base, total, gfp), struct acomp_req, base); if (nreq == req) return req; if (req->src == &req->chain.ssg) nreq->src = &nreq->chain.ssg; if (req->dst == &req->chain.dsg) nreq->dst = &nreq->chain.dsg; return nreq; } EXPORT_SYMBOL_GPL(acomp_request_clone); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Asynchronous compression type");
2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 // SPDX-License-Identifier: GPL-2.0-or-later /* * userdlm.c * * Code which implements the kernel side of a minimal userspace * interface to our DLM. * * Many of the functions here are pared down versions of dlmglue.c * functions. * * Copyright (C) 2003, 2004 Oracle. All rights reserved. */ #include <linux/signal.h> #include <linux/sched/signal.h> #include <linux/module.h> #include <linux/fs.h> #include <linux/types.h> #include <linux/crc32.h> #include "../ocfs2_lockingver.h" #include "../stackglue.h" #include "userdlm.h" #define MLOG_MASK_PREFIX ML_DLMFS #include "../cluster/masklog.h" static inline struct user_lock_res *user_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb) { return container_of(lksb, struct user_lock_res, l_lksb); } static inline int user_check_wait_flag(struct user_lock_res *lockres, int flag) { int ret; spin_lock(&lockres->l_lock); ret = lockres->l_flags & flag; spin_unlock(&lockres->l_lock); return ret; } static inline void user_wait_on_busy_lock(struct user_lock_res *lockres) { wait_event(lockres->l_event, !user_check_wait_flag(lockres, USER_LOCK_BUSY)); } static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres) { wait_event(lockres->l_event, !user_check_wait_flag(lockres, USER_LOCK_BLOCKED)); } /* I heart container_of... */ static inline struct ocfs2_cluster_connection * cluster_connection_from_user_lockres(struct user_lock_res *lockres) { struct dlmfs_inode_private *ip; ip = container_of(lockres, struct dlmfs_inode_private, ip_lockres); return ip->ip_conn; } static struct inode * user_dlm_inode_from_user_lockres(struct user_lock_res *lockres) { struct dlmfs_inode_private *ip; ip = container_of(lockres, struct dlmfs_inode_private, ip_lockres); return &ip->ip_vfs_inode; } static inline void user_recover_from_dlm_error(struct user_lock_res *lockres) { spin_lock(&lockres->l_lock); lockres->l_flags &= ~USER_LOCK_BUSY; spin_unlock(&lockres->l_lock); } #define user_log_dlm_error(_func, _stat, _lockres) do { \ mlog(ML_ERROR, "Dlm error %d while calling %s on " \ "resource %.*s\n", _stat, _func, \ _lockres->l_namelen, _lockres->l_name); \ } while (0) /* WARNING: This function lives in a world where the only three lock * levels are EX, PR, and NL. It *will* have to be adjusted when more * lock types are added. */ static inline int user_highest_compat_lock_level(int level) { int new_level = DLM_LOCK_EX; if (level == DLM_LOCK_EX) new_level = DLM_LOCK_NL; else if (level == DLM_LOCK_PR) new_level = DLM_LOCK_PR; return new_level; } static void user_ast(struct ocfs2_dlm_lksb *lksb) { struct user_lock_res *lockres = user_lksb_to_lock_res(lksb); int status; mlog(ML_BASTS, "AST fired for lockres %.*s, level %d => %d\n", lockres->l_namelen, lockres->l_name, lockres->l_level, lockres->l_requested); spin_lock(&lockres->l_lock); status = ocfs2_dlm_lock_status(&lockres->l_lksb); if (status) { mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n", status, lockres->l_namelen, lockres->l_name); spin_unlock(&lockres->l_lock); return; } mlog_bug_on_msg(lockres->l_requested == DLM_LOCK_IV, "Lockres %.*s, requested ivmode. flags 0x%x\n", lockres->l_namelen, lockres->l_name, lockres->l_flags); /* we're downconverting. */ if (lockres->l_requested < lockres->l_level) { if (lockres->l_requested <= user_highest_compat_lock_level(lockres->l_blocking)) { lockres->l_blocking = DLM_LOCK_NL; lockres->l_flags &= ~USER_LOCK_BLOCKED; } } lockres->l_level = lockres->l_requested; lockres->l_requested = DLM_LOCK_IV; lockres->l_flags |= USER_LOCK_ATTACHED; lockres->l_flags &= ~USER_LOCK_BUSY; spin_unlock(&lockres->l_lock); wake_up(&lockres->l_event); } static inline void user_dlm_grab_inode_ref(struct user_lock_res *lockres) { struct inode *inode; inode = user_dlm_inode_from_user_lockres(lockres); if (!igrab(inode)) BUG(); } static void user_dlm_unblock_lock(struct work_struct *work); static void __user_dlm_queue_lockres(struct user_lock_res *lockres) { if (!(lockres->l_flags & USER_LOCK_QUEUED)) { user_dlm_grab_inode_ref(lockres); INIT_WORK(&lockres->l_work, user_dlm_unblock_lock); queue_work(user_dlm_worker, &lockres->l_work); lockres->l_flags |= USER_LOCK_QUEUED; } } static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres) { int queue = 0; if (!(lockres->l_flags & USER_LOCK_BLOCKED)) return; switch (lockres->l_blocking) { case DLM_LOCK_EX: if (!lockres->l_ex_holders && !lockres->l_ro_holders) queue = 1; break; case DLM_LOCK_PR: if (!lockres->l_ex_holders) queue = 1; break; default: BUG(); } if (queue) __user_dlm_queue_lockres(lockres); } static void user_bast(struct ocfs2_dlm_lksb *lksb, int level) { struct user_lock_res *lockres = user_lksb_to_lock_res(lksb); mlog(ML_BASTS, "BAST fired for lockres %.*s, blocking %d, level %d\n", lockres->l_namelen, lockres->l_name, level, lockres->l_level); spin_lock(&lockres->l_lock); lockres->l_flags |= USER_LOCK_BLOCKED; if (level > lockres->l_blocking) lockres->l_blocking = level; __user_dlm_queue_lockres(lockres); spin_unlock(&lockres->l_lock); wake_up(&lockres->l_event); } static void user_unlock_ast(struct ocfs2_dlm_lksb *lksb, int status) { struct user_lock_res *lockres = user_lksb_to_lock_res(lksb); mlog(ML_BASTS, "UNLOCK AST fired for lockres %.*s, flags 0x%x\n", lockres->l_namelen, lockres->l_name, lockres->l_flags); if (status) mlog(ML_ERROR, "dlm returns status %d\n", status); spin_lock(&lockres->l_lock); /* The teardown flag gets set early during the unlock process, * so test the cancel flag to make sure that this ast isn't * for a concurrent cancel. */ if (lockres->l_flags & USER_LOCK_IN_TEARDOWN && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) { lockres->l_level = DLM_LOCK_IV; } else if (status == DLM_CANCELGRANT) { /* We tried to cancel a convert request, but it was * already granted. Don't clear the busy flag - the * ast should've done this already. */ BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); lockres->l_flags &= ~USER_LOCK_IN_CANCEL; goto out_noclear; } else { BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); /* Cancel succeeded, we want to re-queue */ lockres->l_requested = DLM_LOCK_IV; /* cancel an * upconvert * request. */ lockres->l_flags &= ~USER_LOCK_IN_CANCEL; /* we want the unblock thread to look at it again * now. */ if (lockres->l_flags & USER_LOCK_BLOCKED) __user_dlm_queue_lockres(lockres); } lockres->l_flags &= ~USER_LOCK_BUSY; out_noclear: spin_unlock(&lockres->l_lock); wake_up(&lockres->l_event); } /* * This is the userdlmfs locking protocol version. * * See fs/ocfs2/dlmglue.c for more details on locking versions. */ static struct ocfs2_locking_protocol user_dlm_lproto = { .lp_max_version = { .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR, .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR, }, .lp_lock_ast = user_ast, .lp_blocking_ast = user_bast, .lp_unlock_ast = user_unlock_ast, }; static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres) { struct inode *inode; inode = user_dlm_inode_from_user_lockres(lockres); iput(inode); } static void user_dlm_unblock_lock(struct work_struct *work) { int new_level, status; struct user_lock_res *lockres = container_of(work, struct user_lock_res, l_work); struct ocfs2_cluster_connection *conn = cluster_connection_from_user_lockres(lockres); mlog(0, "lockres %.*s\n", lockres->l_namelen, lockres->l_name); spin_lock(&lockres->l_lock); mlog_bug_on_msg(!(lockres->l_flags & USER_LOCK_QUEUED), "Lockres %.*s, flags 0x%x\n", lockres->l_namelen, lockres->l_name, lockres->l_flags); /* notice that we don't clear USER_LOCK_BLOCKED here. If it's * set, we want user_ast clear it. */ lockres->l_flags &= ~USER_LOCK_QUEUED; /* It's valid to get here and no longer be blocked - if we get * several basts in a row, we might be queued by the first * one, the unblock thread might run and clear the queued * flag, and finally we might get another bast which re-queues * us before our ast for the downconvert is called. */ if (!(lockres->l_flags & USER_LOCK_BLOCKED)) { mlog(ML_BASTS, "lockres %.*s USER_LOCK_BLOCKED\n", lockres->l_namelen, lockres->l_name); spin_unlock(&lockres->l_lock); goto drop_ref; } if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_TEARDOWN\n", lockres->l_namelen, lockres->l_name); spin_unlock(&lockres->l_lock); goto drop_ref; } if (lockres->l_flags & USER_LOCK_BUSY) { if (lockres->l_flags & USER_LOCK_IN_CANCEL) { mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_CANCEL\n", lockres->l_namelen, lockres->l_name); spin_unlock(&lockres->l_lock); goto drop_ref; } lockres->l_flags |= USER_LOCK_IN_CANCEL; spin_unlock(&lockres->l_lock); status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, DLM_LKF_CANCEL); if (status) user_log_dlm_error("ocfs2_dlm_unlock", status, lockres); goto drop_ref; } /* If there are still incompat holders, we can exit safely * without worrying about re-queueing this lock as that will * happen on the last call to user_cluster_unlock. */ if ((lockres->l_blocking == DLM_LOCK_EX) && (lockres->l_ex_holders || lockres->l_ro_holders)) { spin_unlock(&lockres->l_lock); mlog(ML_BASTS, "lockres %.*s, EX/PR Holders %u,%u\n", lockres->l_namelen, lockres->l_name, lockres->l_ex_holders, lockres->l_ro_holders); goto drop_ref; } if ((lockres->l_blocking == DLM_LOCK_PR) && lockres->l_ex_holders) { spin_unlock(&lockres->l_lock); mlog(ML_BASTS, "lockres %.*s, EX Holders %u\n", lockres->l_namelen, lockres->l_name, lockres->l_ex_holders); goto drop_ref; } /* yay, we can downconvert now. */ new_level = user_highest_compat_lock_level(lockres->l_blocking); lockres->l_requested = new_level; lockres->l_flags |= USER_LOCK_BUSY; mlog(ML_BASTS, "lockres %.*s, downconvert %d => %d\n", lockres->l_namelen, lockres->l_name, lockres->l_level, new_level); spin_unlock(&lockres->l_lock); /* need lock downconvert request now... */ status = ocfs2_dlm_lock(conn, new_level, &lockres->l_lksb, DLM_LKF_CONVERT|DLM_LKF_VALBLK, lockres->l_name, lockres->l_namelen); if (status) { user_log_dlm_error("ocfs2_dlm_lock", status, lockres); user_recover_from_dlm_error(lockres); } drop_ref: user_dlm_drop_inode_ref(lockres); } static inline void user_dlm_inc_holders(struct user_lock_res *lockres, int level) { switch(level) { case DLM_LOCK_EX: lockres->l_ex_holders++; break; case DLM_LOCK_PR: lockres->l_ro_holders++; break; default: BUG(); } } /* predict what lock level we'll be dropping down to on behalf * of another node, and return true if the currently wanted * level will be compatible with it. */ static inline int user_may_continue_on_blocked_lock(struct user_lock_res *lockres, int wanted) { BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED)); return wanted <= user_highest_compat_lock_level(lockres->l_blocking); } int user_dlm_cluster_lock(struct user_lock_res *lockres, int level, int lkm_flags) { int status, local_flags; struct ocfs2_cluster_connection *conn = cluster_connection_from_user_lockres(lockres); if (level != DLM_LOCK_EX && level != DLM_LOCK_PR) { mlog(ML_ERROR, "lockres %.*s: invalid request!\n", lockres->l_namelen, lockres->l_name); status = -EINVAL; goto bail; } mlog(ML_BASTS, "lockres %.*s, level %d, flags = 0x%x\n", lockres->l_namelen, lockres->l_name, level, lkm_flags); again: if (signal_pending(current)) { status = -ERESTARTSYS; goto bail; } spin_lock(&lockres->l_lock); if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { spin_unlock(&lockres->l_lock); status = -EAGAIN; goto bail; } /* We only compare against the currently granted level * here. If the lock is blocked waiting on a downconvert, * we'll get caught below. */ if ((lockres->l_flags & USER_LOCK_BUSY) && (level > lockres->l_level)) { /* is someone sitting in dlm_lock? If so, wait on * them. */ spin_unlock(&lockres->l_lock); user_wait_on_busy_lock(lockres); goto again; } if ((lockres->l_flags & USER_LOCK_BLOCKED) && (!user_may_continue_on_blocked_lock(lockres, level))) { /* is the lock is currently blocked on behalf of * another node */ spin_unlock(&lockres->l_lock); user_wait_on_blocked_lock(lockres); goto again; } if (level > lockres->l_level) { local_flags = lkm_flags | DLM_LKF_VALBLK; if (lockres->l_level != DLM_LOCK_IV) local_flags |= DLM_LKF_CONVERT; lockres->l_requested = level; lockres->l_flags |= USER_LOCK_BUSY; spin_unlock(&lockres->l_lock); BUG_ON(level == DLM_LOCK_IV); BUG_ON(level == DLM_LOCK_NL); /* call dlm_lock to upgrade lock now */ status = ocfs2_dlm_lock(conn, level, &lockres->l_lksb, local_flags, lockres->l_name, lockres->l_namelen); if (status) { if ((lkm_flags & DLM_LKF_NOQUEUE) && (status != -EAGAIN)) user_log_dlm_error("ocfs2_dlm_lock", status, lockres); user_recover_from_dlm_error(lockres); goto bail; } user_wait_on_busy_lock(lockres); goto again; } user_dlm_inc_holders(lockres, level); spin_unlock(&lockres->l_lock); status = 0; bail: return status; } static inline void user_dlm_dec_holders(struct user_lock_res *lockres, int level) { switch(level) { case DLM_LOCK_EX: BUG_ON(!lockres->l_ex_holders); lockres->l_ex_holders--; break; case DLM_LOCK_PR: BUG_ON(!lockres->l_ro_holders); lockres->l_ro_holders--; break; default: BUG(); } } void user_dlm_cluster_unlock(struct user_lock_res *lockres, int level) { if (level != DLM_LOCK_EX && level != DLM_LOCK_PR) { mlog(ML_ERROR, "lockres %.*s: invalid request!\n", lockres->l_namelen, lockres->l_name); return; } spin_lock(&lockres->l_lock); user_dlm_dec_holders(lockres, level); __user_dlm_cond_queue_lockres(lockres); spin_unlock(&lockres->l_lock); } void user_dlm_write_lvb(struct inode *inode, const char *val, unsigned int len) { struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; char *lvb; BUG_ON(len > DLM_LVB_LEN); spin_lock(&lockres->l_lock); BUG_ON(lockres->l_level < DLM_LOCK_EX); lvb = ocfs2_dlm_lvb(&lockres->l_lksb); memcpy(lvb, val, len); spin_unlock(&lockres->l_lock); } bool user_dlm_read_lvb(struct inode *inode, char *val) { struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; char *lvb; bool ret = true; spin_lock(&lockres->l_lock); BUG_ON(lockres->l_level < DLM_LOCK_PR); if (ocfs2_dlm_lvb_valid(&lockres->l_lksb)) { lvb = ocfs2_dlm_lvb(&lockres->l_lksb); memcpy(val, lvb, DLM_LVB_LEN); } else ret = false; spin_unlock(&lockres->l_lock); return ret; } void user_dlm_lock_res_init(struct user_lock_res *lockres, struct dentry *dentry) { memset(lockres, 0, sizeof(*lockres)); spin_lock_init(&lockres->l_lock); init_waitqueue_head(&lockres->l_event); lockres->l_level = DLM_LOCK_IV; lockres->l_requested = DLM_LOCK_IV; lockres->l_blocking = DLM_LOCK_IV; /* should have been checked before getting here. */ BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN); memcpy(lockres->l_name, dentry->d_name.name, dentry->d_name.len); lockres->l_namelen = dentry->d_name.len; } int user_dlm_destroy_lock(struct user_lock_res *lockres) { int status = -EBUSY; struct ocfs2_cluster_connection *conn = cluster_connection_from_user_lockres(lockres); mlog(ML_BASTS, "lockres %.*s\n", lockres->l_namelen, lockres->l_name); spin_lock(&lockres->l_lock); if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { spin_unlock(&lockres->l_lock); goto bail; } lockres->l_flags |= USER_LOCK_IN_TEARDOWN; while (lockres->l_flags & USER_LOCK_BUSY) { spin_unlock(&lockres->l_lock); user_wait_on_busy_lock(lockres); spin_lock(&lockres->l_lock); } if (lockres->l_ro_holders || lockres->l_ex_holders) { lockres->l_flags &= ~USER_LOCK_IN_TEARDOWN; spin_unlock(&lockres->l_lock); goto bail; } status = 0; if (!(lockres->l_flags & USER_LOCK_ATTACHED)) { /* * lock is never requested, leave USER_LOCK_IN_TEARDOWN set * to avoid new lock request coming in. */ spin_unlock(&lockres->l_lock); goto bail; } lockres->l_flags |= USER_LOCK_BUSY; spin_unlock(&lockres->l_lock); status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, DLM_LKF_VALBLK); if (status) { spin_lock(&lockres->l_lock); lockres->l_flags &= ~USER_LOCK_IN_TEARDOWN; lockres->l_flags &= ~USER_LOCK_BUSY; spin_unlock(&lockres->l_lock); user_log_dlm_error("ocfs2_dlm_unlock", status, lockres); goto bail; } user_wait_on_busy_lock(lockres); status = 0; bail: return status; } static void user_dlm_recovery_handler_noop(int node_num, void *recovery_data) { /* We ignore recovery events */ return; } void user_dlm_set_locking_protocol(void) { ocfs2_stack_glue_set_max_proto_version(&user_dlm_lproto.lp_max_version); } struct ocfs2_cluster_connection *user_dlm_register(const struct qstr *name) { int rc; struct ocfs2_cluster_connection *conn; rc = ocfs2_cluster_connect_agnostic(name->name, name->len, &user_dlm_lproto, user_dlm_recovery_handler_noop, NULL, &conn); if (rc) mlog_errno(rc); return rc ? ERR_PTR(rc) : conn; } void user_dlm_unregister(struct ocfs2_cluster_connection *conn) { ocfs2_cluster_disconnect(conn, 0); }
6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Common values for SM3 algorithm * * Copyright (C) 2017 ARM Limited or its affiliates. * Copyright (C) 2017 Gilad Ben-Yossef <gilad@benyossef.com> * Copyright (C) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com> */ #ifndef _CRYPTO_SM3_H #define _CRYPTO_SM3_H #include <linux/types.h> #define SM3_DIGEST_SIZE 32 #define SM3_BLOCK_SIZE 64 #define SM3_STATE_SIZE 40 #define SM3_T1 0x79CC4519 #define SM3_T2 0x7A879D8A #define SM3_IVA 0x7380166f #define SM3_IVB 0x4914b2b9 #define SM3_IVC 0x172442d7 #define SM3_IVD 0xda8a0600 #define SM3_IVE 0xa96f30bc #define SM3_IVF 0x163138aa #define SM3_IVG 0xe38dee4d #define SM3_IVH 0xb0fb0e4e extern const u8 sm3_zero_message_hash[SM3_DIGEST_SIZE]; struct sm3_state { u32 state[SM3_DIGEST_SIZE / 4]; u64 count; u8 buffer[SM3_BLOCK_SIZE]; }; /* * Stand-alone implementation of the SM3 algorithm. It is designed to * have as little dependencies as possible so it can be used in the * kexec_file purgatory. In other cases you should generally use the * hash APIs from include/crypto/hash.h. Especially when hashing large * amounts of data as those APIs may be hw-accelerated. * * For details see lib/crypto/sm3.c */ static inline void sm3_init(struct sm3_state *sctx) { sctx->state[0] = SM3_IVA; sctx->state[1] = SM3_IVB; sctx->state[2] = SM3_IVC; sctx->state[3] = SM3_IVD; sctx->state[4] = SM3_IVE; sctx->state[5] = SM3_IVF; sctx->state[6] = SM3_IVG; sctx->state[7] = SM3_IVH; sctx->count = 0; } void sm3_block_generic(struct sm3_state *sctx, u8 const *data, int blocks); #endif
14539 8021 47 15549 11759 11473 15843 14813 15901 900 12663 125 125 125 900 9720 11859 1344 2782 1181 9 24 24 4 13 7854 11858 36 730 29 29 29 25 16 29 26 548 1036 15 23 77 6 4 12 363 5 5 5385 5437 5440 2664 4725 4588 4187 4100 215 215 215 215 128 9 9 9 7 9 8 128 128 128 3 4130 2860 7908 7907 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PGTABLE_H #define _LINUX_PGTABLE_H #include <linux/pfn.h> #include <asm/pgtable.h> #define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT) #define PUD_ORDER (PUD_SHIFT - PAGE_SHIFT) #ifndef __ASSEMBLY__ #ifdef CONFIG_MMU #include <linux/mm_types.h> #include <linux/bug.h> #include <linux/errno.h> #include <asm-generic/pgtable_uffd.h> #include <linux/page_table_check.h> #if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) - \ defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS #error CONFIG_PGTABLE_LEVELS is not consistent with __PAGETABLE_{P4D,PUD,PMD}_FOLDED #endif /* * On almost all architectures and configurations, 0 can be used as the * upper ceiling to free_pgtables(): on many architectures it has the same * effect as using TASK_SIZE. However, there is one configuration which * must impose a more careful limit, to avoid freeing kernel pgtables. */ #ifndef USER_PGTABLES_CEILING #define USER_PGTABLES_CEILING 0UL #endif /* * This defines the first usable user address. Platforms * can override its value with custom FIRST_USER_ADDRESS * defined in their respective <asm/pgtable.h>. */ #ifndef FIRST_USER_ADDRESS #define FIRST_USER_ADDRESS 0UL #endif /* * This defines the generic helper for accessing PMD page * table page. Although platforms can still override this * via their respective <asm/pgtable.h>. */ #ifndef pmd_pgtable #define pmd_pgtable(pmd) pmd_page(pmd) #endif #define pmd_folio(pmd) page_folio(pmd_page(pmd)) /* * A page table page can be thought of an array like this: pXd_t[PTRS_PER_PxD] * * The pXx_index() functions return the index of the entry in the page * table page which would control the given virtual address * * As these functions may be used by the same code for different levels of * the page table folding, they are always available, regardless of * CONFIG_PGTABLE_LEVELS value. For the folded levels they simply return 0 * because in such cases PTRS_PER_PxD equals 1. */ static inline unsigned long pte_index(unsigned long address) { return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); } #ifndef pmd_index static inline unsigned long pmd_index(unsigned long address) { return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1); } #define pmd_index pmd_index #endif #ifndef pud_index static inline unsigned long pud_index(unsigned long address) { return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); } #define pud_index pud_index #endif #ifndef pgd_index /* Must be a compile-time constant, so implement it as a macro */ #define pgd_index(a) (((a) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)) #endif #ifndef kernel_pte_init static inline void kernel_pte_init(void *addr) { } #define kernel_pte_init kernel_pte_init #endif #ifndef pmd_init static inline void pmd_init(void *addr) { } #define pmd_init pmd_init #endif #ifndef pud_init static inline void pud_init(void *addr) { } #define pud_init pud_init #endif #ifndef pte_offset_kernel static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) { return (pte_t *)pmd_page_vaddr(*pmd) + pte_index(address); } #define pte_offset_kernel pte_offset_kernel #endif #ifdef CONFIG_HIGHPTE #define __pte_map(pmd, address) \ ((pte_t *)kmap_local_page(pmd_page(*(pmd))) + pte_index((address))) #define pte_unmap(pte) do { \ kunmap_local((pte)); \ rcu_read_unlock(); \ } while (0) #else static inline pte_t *__pte_map(pmd_t *pmd, unsigned long address) { return pte_offset_kernel(pmd, address); } static inline void pte_unmap(pte_t *pte) { rcu_read_unlock(); } #endif void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable); /* Find an entry in the second-level page table.. */ #ifndef pmd_offset static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) { return pud_pgtable(*pud) + pmd_index(address); } #define pmd_offset pmd_offset #endif #ifndef pud_offset static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) { return p4d_pgtable(*p4d) + pud_index(address); } #define pud_offset pud_offset #endif static inline pgd_t *pgd_offset_pgd(pgd_t *pgd, unsigned long address) { return (pgd + pgd_index(address)); }; /* * a shortcut to get a pgd_t in a given mm */ #ifndef pgd_offset #define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address)) #endif /* * a shortcut which implies the use of the kernel's pgd, instead * of a process's */ #define pgd_offset_k(address) pgd_offset(&init_mm, (address)) /* * In many cases it is known that a virtual address is mapped at PMD or PTE * level, so instead of traversing all the page table levels, we can get a * pointer to the PMD entry in user or kernel page table or translate a virtual * address to the pointer in the PTE in the kernel page tables with simple * helpers. */ static inline pmd_t *pmd_off(struct mm_struct *mm, unsigned long va) { return pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, va), va), va), va); } static inline pmd_t *pmd_off_k(unsigned long va) { return pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va); } static inline pte_t *virt_to_kpte(unsigned long vaddr) { pmd_t *pmd = pmd_off_k(vaddr); return pmd_none(*pmd) ? NULL : pte_offset_kernel(pmd, vaddr); } #ifndef pmd_young static inline int pmd_young(pmd_t pmd) { return 0; } #endif #ifndef pmd_dirty static inline int pmd_dirty(pmd_t pmd) { return 0; } #endif /* * A facility to provide lazy MMU batching. This allows PTE updates and * page invalidations to be delayed until a call to leave lazy MMU mode * is issued. Some architectures may benefit from doing this, and it is * beneficial for both shadow and direct mode hypervisors, which may batch * the PTE updates which happen during this window. Note that using this * interface requires that read hazards be removed from the code. A read * hazard could result in the direct mode hypervisor case, since the actual * write to the page tables may not yet have taken place, so reads though * a raw PTE pointer after it has been modified are not guaranteed to be * up to date. * * In the general case, no lock is guaranteed to be held between entry and exit * of the lazy mode. So the implementation must assume preemption may be enabled * and cpu migration is possible; it must take steps to be robust against this. * (In practice, for user PTE updates, the appropriate page table lock(s) are * held, but for kernel PTE updates, no lock is held). Nesting is not permitted * and the mode cannot be used in interrupt context. */ #ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE static inline void arch_enter_lazy_mmu_mode(void) {} static inline void arch_leave_lazy_mmu_mode(void) {} static inline void arch_flush_lazy_mmu_mode(void) {} #endif #ifndef pte_batch_hint /** * pte_batch_hint - Number of pages that can be added to batch without scanning. * @ptep: Page table pointer for the entry. * @pte: Page table entry. * * Some architectures know that a set of contiguous ptes all map the same * contiguous memory with the same permissions. In this case, it can provide a * hint to aid pte batching without the core code needing to scan every pte. * * An architecture implementation may ignore the PTE accessed state. Further, * the dirty state must apply atomically to all the PTEs described by the hint. * * May be overridden by the architecture, else pte_batch_hint is always 1. */ static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte) { return 1; } #endif #ifndef pte_advance_pfn static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) { return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT)); } #endif #define pte_next_pfn(pte) pte_advance_pfn(pte, 1) #ifndef set_ptes /** * set_ptes - Map consecutive pages to a contiguous range of addresses. * @mm: Address space to map the pages into. * @addr: Address to map the first page at. * @ptep: Page table pointer for the first entry. * @pte: Page table entry for the first page. * @nr: Number of pages to map. * * When nr==1, initial state of pte may be present or not present, and new state * may be present or not present. When nr>1, initial state of all ptes must be * not present, and new state must be present. * * May be overridden by the architecture, or the architecture can define * set_pte() and PFN_PTE_SHIFT. * * Context: The caller holds the page table lock. The pages all belong * to the same folio. The PTEs are all in the same PMD. */ static inline void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr) { page_table_check_ptes_set(mm, ptep, pte, nr); for (;;) { set_pte(ptep, pte); if (--nr == 0) break; ptep++; pte = pte_next_pfn(pte); } } #endif #define set_pte_at(mm, addr, ptep, pte) set_ptes(mm, addr, ptep, pte, 1) #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty); #endif #ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty); extern int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t entry, int dirty); #else static inline int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty) { BUILD_BUG(); return 0; } static inline int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t entry, int dirty) { BUILD_BUG(); return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef ptep_get static inline pte_t ptep_get(pte_t *ptep) { return READ_ONCE(*ptep); } #endif #ifndef pmdp_get static inline pmd_t pmdp_get(pmd_t *pmdp) { return READ_ONCE(*pmdp); } #endif #ifndef pudp_get static inline pud_t pudp_get(pud_t *pudp) { return READ_ONCE(*pudp); } #endif #ifndef p4dp_get static inline p4d_t p4dp_get(p4d_t *p4dp) { return READ_ONCE(*p4dp); } #endif #ifndef pgdp_get static inline pgd_t pgdp_get(pgd_t *pgdp) { return READ_ONCE(*pgdp); } #endif #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { pte_t pte = ptep_get(ptep); int r = 1; if (!pte_young(pte)) r = 0; else set_pte_at(vma->vm_mm, address, ptep, pte_mkold(pte)); return r; } #endif #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { pmd_t pmd = *pmdp; int r = 1; if (!pmd_young(pmd)) r = 0; else set_pmd_at(vma->vm_mm, address, pmdp, pmd_mkold(pmd)); return r; } #else static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */ #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); #endif #ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #else /* * Despite relevant to THP only, this API is called from generic rmap code * under PageTransHuge(), hence needs a dummy implementation for !THP */ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef arch_has_hw_nonleaf_pmd_young /* * Return whether the accessed bit in non-leaf PMD entries is supported on the * local CPU. */ static inline bool arch_has_hw_nonleaf_pmd_young(void) { return IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG); } #endif #ifndef arch_has_hw_pte_young /* * Return whether the accessed bit is supported on the local CPU. * * This stub assumes accessing through an old PTE triggers a page fault. * Architectures that automatically set the access bit should overwrite it. */ static inline bool arch_has_hw_pte_young(void) { return IS_ENABLED(CONFIG_ARCH_HAS_HW_PTE_YOUNG); } #endif #ifndef exec_folio_order /* * Returns preferred minimum folio order for executable file-backed memory. Must * be in range [0, PMD_ORDER). Default to order-0. */ static inline unsigned int exec_folio_order(void) { return 0; } #endif #ifndef arch_check_zapped_pte static inline void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte) { } #endif #ifndef arch_check_zapped_pmd static inline void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd) { } #endif #ifndef arch_check_zapped_pud static inline void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud) { } #endif #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep) { pte_t pte = ptep_get(ptep); pte_clear(mm, address, ptep); page_table_check_pte_clear(mm, pte); return pte; } #endif #ifndef clear_young_dirty_ptes /** * clear_young_dirty_ptes - Mark PTEs that map consecutive pages of the * same folio as old/clean. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to mark old/clean. * @flags: Flags to modify the PTE batch semantics. * * May be overridden by the architecture; otherwise, implemented by * get_and_clear/modify/set for each pte in the range. * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ static inline void clear_young_dirty_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr, cydp_t flags) { pte_t pte; for (;;) { if (flags == CYDP_CLEAR_YOUNG) ptep_test_and_clear_young(vma, addr, ptep); else { pte = ptep_get_and_clear(vma->vm_mm, addr, ptep); if (flags & CYDP_CLEAR_YOUNG) pte = pte_mkold(pte); if (flags & CYDP_CLEAR_DIRTY) pte = pte_mkclean(pte); set_pte_at(vma->vm_mm, addr, ptep, pte); } if (--nr == 0) break; ptep++; addr += PAGE_SIZE; } } #endif static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t pte = ptep_get(ptep); pte_clear(mm, addr, ptep); /* * No need for ptep_get_and_clear(): page table check doesn't care about * any bits that could have been set by HW concurrently. */ page_table_check_pte_clear(mm, pte); } #ifdef CONFIG_GUP_GET_PXX_LOW_HIGH /* * For walking the pagetables without holding any locks. Some architectures * (eg x86-32 PAE) cannot load the entries atomically without using expensive * instructions. We are guaranteed that a PTE will only either go from not * present to present, or present to not present -- it will not switch to a * completely different present page without a TLB flush inbetween; which we * are blocking by holding interrupts off. * * Setting ptes from not present to present goes: * * ptep->pte_high = h; * smp_wmb(); * ptep->pte_low = l; * * And present to not present goes: * * ptep->pte_low = 0; * smp_wmb(); * ptep->pte_high = 0; * * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'. * We load pte_high *after* loading pte_low, which ensures we don't see an older * value of pte_high. *Then* we recheck pte_low, which ensures that we haven't * picked up a changed pte high. We might have gotten rubbish values from * pte_low and pte_high, but we are guaranteed that pte_low will not have the * present bit set *unless* it is 'l'. Because get_user_pages_fast() only * operates on present ptes we're safe. */ static inline pte_t ptep_get_lockless(pte_t *ptep) { pte_t pte; do { pte.pte_low = ptep->pte_low; smp_rmb(); pte.pte_high = ptep->pte_high; smp_rmb(); } while (unlikely(pte.pte_low != ptep->pte_low)); return pte; } #define ptep_get_lockless ptep_get_lockless #if CONFIG_PGTABLE_LEVELS > 2 static inline pmd_t pmdp_get_lockless(pmd_t *pmdp) { pmd_t pmd; do { pmd.pmd_low = pmdp->pmd_low; smp_rmb(); pmd.pmd_high = pmdp->pmd_high; smp_rmb(); } while (unlikely(pmd.pmd_low != pmdp->pmd_low)); return pmd; } #define pmdp_get_lockless pmdp_get_lockless #define pmdp_get_lockless_sync() tlb_remove_table_sync_one() #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #endif /* CONFIG_GUP_GET_PXX_LOW_HIGH */ /* * We require that the PTE can be read atomically. */ #ifndef ptep_get_lockless static inline pte_t ptep_get_lockless(pte_t *ptep) { return ptep_get(ptep); } #endif #ifndef pmdp_get_lockless static inline pmd_t pmdp_get_lockless(pmd_t *pmdp) { return pmdp_get(pmdp); } static inline void pmdp_get_lockless_sync(void) { } #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE #ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { pmd_t pmd = *pmdp; pmd_clear(pmdp); page_table_check_pmd_clear(mm, pmd); return pmd; } #endif /* __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR */ #ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, unsigned long address, pud_t *pudp) { pud_t pud = *pudp; pud_clear(pudp); page_table_check_pud_clear(mm, pud); return pud; } #endif /* __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE #ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, int full) { return pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); } #endif #ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL static inline pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, int full) { return pudp_huge_get_and_clear(vma->vm_mm, address, pudp); } #endif #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long address, pte_t *ptep, int full) { return ptep_get_and_clear(mm, address, ptep); } #endif #ifndef get_and_clear_full_ptes /** * get_and_clear_full_ptes - Clear present PTEs that map consecutive pages of * the same folio, collecting dirty/accessed bits. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear. * @full: Whether we are clearing a full mm. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over ptep_get_and_clear_full(), merging dirty/accessed bits into the * returned PTE. * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full) { pte_t pte, tmp_pte; pte = ptep_get_and_clear_full(mm, addr, ptep, full); while (--nr) { ptep++; addr += PAGE_SIZE; tmp_pte = ptep_get_and_clear_full(mm, addr, ptep, full); if (pte_dirty(tmp_pte)) pte = pte_mkdirty(pte); if (pte_young(tmp_pte)) pte = pte_mkyoung(pte); } return pte; } #endif /** * get_and_clear_ptes - Clear present PTEs that map consecutive pages of * the same folio, collecting dirty/accessed bits. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear. * * Use this instead of get_and_clear_full_ptes() if it is known that we don't * need to clear the full mm, which is mostly the case. * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ static inline pte_t get_and_clear_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr) { return get_and_clear_full_ptes(mm, addr, ptep, nr, 0); } #ifndef clear_full_ptes /** * clear_full_ptes - Clear present PTEs that map consecutive pages of the same * folio. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear. * @full: Whether we are clearing a full mm. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over ptep_get_and_clear_full(). * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full) { for (;;) { ptep_get_and_clear_full(mm, addr, ptep, full); if (--nr == 0) break; ptep++; addr += PAGE_SIZE; } } #endif /** * clear_ptes - Clear present PTEs that map consecutive pages of the same folio. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear. * * Use this instead of clear_full_ptes() if it is known that we don't need to * clear the full mm, which is mostly the case. * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ static inline void clear_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr) { clear_full_ptes(mm, addr, ptep, nr, 0); } /* * If two threads concurrently fault at the same page, the thread that * won the race updates the PTE and its local TLB/Cache. The other thread * gives up, simply does nothing, and continues; on architectures where * software can update TLB, local TLB can be updated here to avoid next page * fault. This function updates TLB only, do nothing with cache or others. * It is the difference with function update_mmu_cache. */ #ifndef update_mmu_tlb_range static inline void update_mmu_tlb_range(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, unsigned int nr) { } #endif static inline void update_mmu_tlb(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { update_mmu_tlb_range(vma, address, ptep, 1); } /* * Some architectures may be able to avoid expensive synchronization * primitives when modifications are made to PTE's which are already * not present, or in the process of an address space destruction. */ #ifndef __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL static inline void pte_clear_not_present_full(struct mm_struct *mm, unsigned long address, pte_t *ptep, int full) { pte_clear(mm, address, ptep); } #endif #ifndef clear_not_present_full_ptes /** * clear_not_present_full_ptes - Clear multiple not present PTEs which are * consecutive in the pgtable. * @mm: Address space the ptes represent. * @addr: Address of the first pte. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to clear. * @full: Whether we are clearing a full mm. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over pte_clear_not_present_full(). * * Context: The caller holds the page table lock. The PTEs are all not present. * The PTEs are all in the same PMD. */ static inline void clear_not_present_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full) { for (;;) { pte_clear_not_present_full(mm, addr, ptep, full); if (--nr == 0) break; ptep++; addr += PAGE_SIZE; } } #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH extern pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); #endif #ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH extern pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); extern pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, pud_t *pudp); #endif #ifndef pte_mkwrite static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma) { return pte_mkwrite_novma(pte); } #endif #if defined(CONFIG_ARCH_WANT_PMD_MKWRITE) && !defined(pmd_mkwrite) static inline pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { return pmd_mkwrite_novma(pmd); } #endif #ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT struct mm_struct; static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) { pte_t old_pte = ptep_get(ptep); set_pte_at(mm, address, ptep, pte_wrprotect(old_pte)); } #endif #ifndef wrprotect_ptes /** * wrprotect_ptes - Write-protect PTEs that map consecutive pages of the same * folio. * @mm: Address space the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries to write-protect. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over ptep_set_wrprotect(). * * Note that PTE bits in the PTE range besides the PFN can differ. For example, * some PTEs might be write-protected. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. */ static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr) { for (;;) { ptep_set_wrprotect(mm, addr, ptep); if (--nr == 0) break; ptep++; addr += PAGE_SIZE; } } #endif /* * On some architectures hardware does not set page access bit when accessing * memory page, it is responsibility of software setting this bit. It brings * out extra page fault penalty to track page access bit. For optimization page * access bit can be set during all page fault flow on these arches. * To be differentiate with macro pte_mkyoung, this macro is used on platforms * where software maintains page access bit. */ #ifndef pte_sw_mkyoung static inline pte_t pte_sw_mkyoung(pte_t pte) { return pte; } #define pte_sw_mkyoung pte_sw_mkyoung #endif #ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { pmd_t old_pmd = *pmdp; set_pmd_at(mm, address, pmdp, pmd_wrprotect(old_pmd)); } #else static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PUDP_SET_WRPROTECT #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void pudp_set_wrprotect(struct mm_struct *mm, unsigned long address, pud_t *pudp) { pud_t old_pud = *pudp; set_pud_at(mm, address, pudp, pud_wrprotect(old_pud)); } #else static inline void pudp_set_wrprotect(struct mm_struct *mm, unsigned long address, pud_t *pudp) { BUILD_BUG(); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ #endif #ifndef pmdp_collapse_flush #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #else static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); return *pmdp; } #define pmdp_collapse_flush pmdp_collapse_flush #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable); #endif #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); #endif #ifndef arch_needs_pgtable_deposit #define arch_needs_pgtable_deposit() (false) #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * This is an implementation of pmdp_establish() that is only suitable for an * architecture that doesn't have hardware dirty/accessed bits. In this case we * can't race with CPU which sets these bits and non-atomic approach is fine. */ static inline pmd_t generic_pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { pmd_t old_pmd = *pmdp; set_pmd_at(vma->vm_mm, address, pmdp, pmd); return old_pmd; } #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD /* * pmdp_invalidate_ad() invalidates the PMD while changing a transparent * hugepage mapping in the page tables. This function is similar to * pmdp_invalidate(), but should only be used if the access and dirty bits would * not be cleared by the software in the new PMD value. The function ensures * that hardware changes of the access and dirty bits updates would not be lost. * * Doing so can allow in certain architectures to avoid a TLB flush in most * cases. Yet, another TLB flush might be necessary later if the PMD update * itself requires such flush (e.g., if protection was set to be stricter). Yet, * even when a TLB flush is needed because of the update, the caller may be able * to batch these TLB flushing operations, so fewer TLB flush operations are * needed. */ extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #endif #ifndef __HAVE_ARCH_PTE_SAME static inline int pte_same(pte_t pte_a, pte_t pte_b) { return pte_val(pte_a) == pte_val(pte_b); } #endif #ifndef __HAVE_ARCH_PTE_UNUSED /* * Some architectures provide facilities to virtualization guests * so that they can flag allocated pages as unused. This allows the * host to transparently reclaim unused pages. This function returns * whether the pte's page is unused. */ static inline int pte_unused(pte_t pte) { return 0; } #endif #ifndef pte_access_permitted #define pte_access_permitted(pte, write) \ (pte_present(pte) && (!(write) || pte_write(pte))) #endif #ifndef pmd_access_permitted #define pmd_access_permitted(pmd, write) \ (pmd_present(pmd) && (!(write) || pmd_write(pmd))) #endif #ifndef pud_access_permitted #define pud_access_permitted(pud, write) \ (pud_present(pud) && (!(write) || pud_write(pud))) #endif #ifndef p4d_access_permitted #define p4d_access_permitted(p4d, write) \ (p4d_present(p4d) && (!(write) || p4d_write(p4d))) #endif #ifndef pgd_access_permitted #define pgd_access_permitted(pgd, write) \ (pgd_present(pgd) && (!(write) || pgd_write(pgd))) #endif #ifndef __HAVE_ARCH_PMD_SAME static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) { return pmd_val(pmd_a) == pmd_val(pmd_b); } #endif #ifndef pud_same static inline int pud_same(pud_t pud_a, pud_t pud_b) { return pud_val(pud_a) == pud_val(pud_b); } #define pud_same pud_same #endif #ifndef __HAVE_ARCH_P4D_SAME static inline int p4d_same(p4d_t p4d_a, p4d_t p4d_b) { return p4d_val(p4d_a) == p4d_val(p4d_b); } #endif #ifndef __HAVE_ARCH_PGD_SAME static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b) { return pgd_val(pgd_a) == pgd_val(pgd_b); } #endif #ifndef __HAVE_ARCH_DO_SWAP_PAGE static inline void arch_do_swap_page_nr(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t pte, pte_t oldpte, int nr) { } #else /* * Some architectures support metadata associated with a page. When a * page is being swapped out, this metadata must be saved so it can be * restored when the page is swapped back in. SPARC M7 and newer * processors support an ADI (Application Data Integrity) tag for the * page as metadata for the page. arch_do_swap_page() can restore this * metadata when a page is swapped back in. */ static inline void arch_do_swap_page_nr(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t pte, pte_t oldpte, int nr) { for (int i = 0; i < nr; i++) { arch_do_swap_page(vma->vm_mm, vma, addr + i * PAGE_SIZE, pte_advance_pfn(pte, i), pte_advance_pfn(oldpte, i)); } } #endif #ifndef __HAVE_ARCH_UNMAP_ONE /* * Some architectures support metadata associated with a page. When a * page is being swapped out, this metadata must be saved so it can be * restored when the page is swapped back in. SPARC M7 and newer * processors support an ADI (Application Data Integrity) tag for the * page as metadata for the page. arch_unmap_one() can save this * metadata on a swap-out of a page. */ static inline int arch_unmap_one(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t orig_pte) { return 0; } #endif /* * Allow architectures to preserve additional metadata associated with * swapped-out pages. The corresponding __HAVE_ARCH_SWAP_* macros and function * prototypes must be defined in the arch-specific asm/pgtable.h file. */ #ifndef __HAVE_ARCH_PREPARE_TO_SWAP static inline int arch_prepare_to_swap(struct folio *folio) { return 0; } #endif #ifndef __HAVE_ARCH_SWAP_INVALIDATE static inline void arch_swap_invalidate_page(int type, pgoff_t offset) { } static inline void arch_swap_invalidate_area(int type) { } #endif #ifndef __HAVE_ARCH_SWAP_RESTORE static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) { } #endif #ifndef __HAVE_ARCH_MOVE_PTE #define move_pte(pte, old_addr, new_addr) (pte) #endif #ifndef pte_accessible # define pte_accessible(mm, pte) ((void)(pte), 1) #endif #ifndef flush_tlb_fix_spurious_fault #define flush_tlb_fix_spurious_fault(vma, address, ptep) flush_tlb_page(vma, address) #endif #ifndef flush_tlb_fix_spurious_fault_pmd #define flush_tlb_fix_spurious_fault_pmd(vma, address, pmdp) do { } while (0) #endif /* * When walking page tables, get the address of the next boundary, * or the end address of the range if that comes earlier. Although no * vma end wraps to 0, rounded up __boundary may wrap to 0 throughout. */ #define pgd_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #ifndef p4d_addr_end #define p4d_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + P4D_SIZE) & P4D_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #endif #ifndef pud_addr_end #define pud_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + PUD_SIZE) & PUD_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #endif #ifndef pmd_addr_end #define pmd_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + PMD_SIZE) & PMD_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #endif /* * When walking page tables, we usually want to skip any p?d_none entries; * and any p?d_bad entries - reporting the error before resetting to none. * Do the tests inline, but report and clear the bad entry in mm/memory.c. */ void pgd_clear_bad(pgd_t *); #ifndef __PAGETABLE_P4D_FOLDED void p4d_clear_bad(p4d_t *); #else #define p4d_clear_bad(p4d) do { } while (0) #endif #ifndef __PAGETABLE_PUD_FOLDED void pud_clear_bad(pud_t *); #else #define pud_clear_bad(p4d) do { } while (0) #endif void pmd_clear_bad(pmd_t *); static inline int pgd_none_or_clear_bad(pgd_t *pgd) { if (pgd_none(*pgd)) return 1; if (unlikely(pgd_bad(*pgd))) { pgd_clear_bad(pgd); return 1; } return 0; } static inline int p4d_none_or_clear_bad(p4d_t *p4d) { if (p4d_none(*p4d)) return 1; if (unlikely(p4d_bad(*p4d))) { p4d_clear_bad(p4d); return 1; } return 0; } static inline int pud_none_or_clear_bad(pud_t *pud) { if (pud_none(*pud)) return 1; if (unlikely(pud_bad(*pud))) { pud_clear_bad(pud); return 1; } return 0; } static inline int pmd_none_or_clear_bad(pmd_t *pmd) { if (pmd_none(*pmd)) return 1; if (unlikely(pmd_bad(*pmd))) { pmd_clear_bad(pmd); return 1; } return 0; } static inline pte_t __ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { /* * Get the current pte state, but zero it out to make it * non-present, preventing the hardware from asynchronously * updating it. */ return ptep_get_and_clear(vma->vm_mm, addr, ptep); } static inline void __ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte) { /* * The pte is non-present, so there's no hardware state to * preserve. */ set_pte_at(vma->vm_mm, addr, ptep, pte); } #ifndef __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION /* * Start a pte protection read-modify-write transaction, which * protects against asynchronous hardware modifications to the pte. * The intention is not to prevent the hardware from making pte * updates, but to prevent any updates it may make from being lost. * * This does not protect against other software modifications of the * pte; the appropriate pte lock must be held over the transaction. * * Note that this interface is intended to be batchable, meaning that * ptep_modify_prot_commit may not actually update the pte, but merely * queue the update to be done at some later time. The update must be * actually committed before the pte lock is released, however. */ static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { return __ptep_modify_prot_start(vma, addr, ptep); } /* * Commit an update to a pte, leaving any hardware-controlled bits in * the PTE unmodified. The pte returned from ptep_modify_prot_start() may * additionally have young and/or dirty bits set where previously they were not, * so the updated pte may have these additional changes. */ static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) { __ptep_modify_prot_commit(vma, addr, ptep, pte); } #endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */ /** * modify_prot_start_ptes - Start a pte protection read-modify-write transaction * over a batch of ptes, which protects against asynchronous hardware * modifications to the ptes. The intention is not to prevent the hardware from * making pte updates, but to prevent any updates it may make from being lost. * Please see the comment above ptep_modify_prot_start() for full description. * * @vma: The virtual memory area the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @nr: Number of entries. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over ptep_modify_prot_start(), collecting the a/d bits from each pte * in the batch. * * Note that PTE bits in the PTE batch besides the PFN can differ. * * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. All other PTE bits must be identical for * all PTEs in the batch except for young and dirty bits. The PTEs are all in * the same PMD. */ #ifndef modify_prot_start_ptes static inline pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr) { pte_t pte, tmp_pte; pte = ptep_modify_prot_start(vma, addr, ptep); while (--nr) { ptep++; addr += PAGE_SIZE; tmp_pte = ptep_modify_prot_start(vma, addr, ptep); if (pte_dirty(tmp_pte)) pte = pte_mkdirty(pte); if (pte_young(tmp_pte)) pte = pte_mkyoung(pte); } return pte; } #endif /** * modify_prot_commit_ptes - Commit an update to a batch of ptes, leaving any * hardware-controlled bits in the PTE unmodified. * * @vma: The virtual memory area the pages are mapped into. * @addr: Address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @old_pte: Old page table entry (for the first entry) which is now cleared. * @pte: New page table entry to be set. * @nr: Number of entries. * * May be overridden by the architecture; otherwise, implemented as a simple * loop over ptep_modify_prot_commit(). * * Context: The caller holds the page table lock. The PTEs are all in the same * PMD. On exit, the set ptes in the batch map the same folio. The ptes set by * ptep_modify_prot_start() may additionally have young and/or dirty bits set * where previously they were not, so the updated ptes may have these * additional changes. */ #ifndef modify_prot_commit_ptes static inline void modify_prot_commit_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte, unsigned int nr) { int i; for (i = 0; i < nr; ++i, ++ptep, addr += PAGE_SIZE) { ptep_modify_prot_commit(vma, addr, ptep, old_pte, pte); /* Advance PFN only, set same prot */ old_pte = pte_next_pfn(old_pte); pte = pte_next_pfn(pte); } } #endif /* * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values * and let generic vmalloc, ioremap and page table update code know when * arch_sync_kernel_mappings() needs to be called. */ #ifndef ARCH_PAGE_TABLE_SYNC_MASK #define ARCH_PAGE_TABLE_SYNC_MASK 0 #endif /* * There is no default implementation for arch_sync_kernel_mappings(). It is * relied upon the compiler to optimize calls out if ARCH_PAGE_TABLE_SYNC_MASK * is 0. */ void arch_sync_kernel_mappings(unsigned long start, unsigned long end); #endif /* CONFIG_MMU */ /* * No-op macros that just return the current protection value. Defined here * because these macros can be used even if CONFIG_MMU is not defined. */ #ifndef pgprot_nx #define pgprot_nx(prot) (prot) #endif #ifndef pgprot_noncached #define pgprot_noncached(prot) (prot) #endif #ifndef pgprot_writecombine #define pgprot_writecombine pgprot_noncached #endif #ifndef pgprot_writethrough #define pgprot_writethrough pgprot_noncached #endif #ifndef pgprot_device #define pgprot_device pgprot_noncached #endif #ifndef pgprot_mhp #define pgprot_mhp(prot) (prot) #endif #ifdef CONFIG_MMU #ifndef pgprot_modify #define pgprot_modify pgprot_modify static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) { if (pgprot_val(oldprot) == pgprot_val(pgprot_noncached(oldprot))) newprot = pgprot_noncached(newprot); if (pgprot_val(oldprot) == pgprot_val(pgprot_writecombine(oldprot))) newprot = pgprot_writecombine(newprot); if (pgprot_val(oldprot) == pgprot_val(pgprot_device(oldprot))) newprot = pgprot_device(newprot); return newprot; } #endif #endif /* CONFIG_MMU */ #ifndef pgprot_encrypted #define pgprot_encrypted(prot) (prot) #endif #ifndef pgprot_decrypted #define pgprot_decrypted(prot) (prot) #endif /* * A facility to provide batching of the reload of page tables and * other process state with the actual context switch code for * paravirtualized guests. By convention, only one of the batched * update (lazy) modes (CPU, MMU) should be active at any given time, * entry should never be nested, and entry and exits should always be * paired. This is for sanity of maintaining and reasoning about the * kernel code. In this case, the exit (end of the context switch) is * in architecture-specific code, and so doesn't need a generic * definition. */ #ifndef __HAVE_ARCH_START_CONTEXT_SWITCH #define arch_start_context_switch(prev) do {} while (0) #endif /* * Some platforms can customize the PTE soft-dirty bit making it unavailable * even if the architecture provides the resource. * Adding this API allows architectures to add their own checks for the * devices on which the kernel is running. * Note: When overriding it, please make sure the CONFIG_MEM_SOFT_DIRTY * is part of this macro. */ #ifndef pgtable_supports_soft_dirty #define pgtable_supports_soft_dirty() IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) #endif #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY #ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) { return pmd; } static inline int pmd_swp_soft_dirty(pmd_t pmd) { return 0; } static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) { return pmd; } #endif #else /* !CONFIG_HAVE_ARCH_SOFT_DIRTY */ static inline int pte_soft_dirty(pte_t pte) { return 0; } static inline int pmd_soft_dirty(pmd_t pmd) { return 0; } static inline pte_t pte_mksoft_dirty(pte_t pte) { return pte; } static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) { return pmd; } static inline pte_t pte_clear_soft_dirty(pte_t pte) { return pte; } static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd) { return pmd; } static inline pte_t pte_swp_mksoft_dirty(pte_t pte) { return pte; } static inline int pte_swp_soft_dirty(pte_t pte) { return 0; } static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) { return pte; } static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) { return pmd; } static inline int pmd_swp_soft_dirty(pmd_t pmd) { return 0; } static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) { return pmd; } #endif #ifndef __HAVE_PFNMAP_TRACKING /* * Interfaces that can be used by architecture code to keep track of * memory type of pfn mappings specified by the remap_pfn_range, * vmf_insert_pfn. */ static inline int pfnmap_setup_cachemode(unsigned long pfn, unsigned long size, pgprot_t *prot) { return 0; } static inline int pfnmap_track(unsigned long pfn, unsigned long size, pgprot_t *prot) { return 0; } static inline void pfnmap_untrack(unsigned long pfn, unsigned long size) { } #else /** * pfnmap_setup_cachemode - setup the cachemode in the pgprot for a pfn range * @pfn: the start of the pfn range * @size: the size of the pfn range in bytes * @prot: the pgprot to modify * * Lookup the cachemode for the pfn range starting at @pfn with the size * @size and store it in @prot, leaving other data in @prot unchanged. * * This allows for a hardware implementation to have fine-grained control of * memory cache behavior at page level granularity. Without a hardware * implementation, this function does nothing. * * Currently there is only one implementation for this - x86 Page Attribute * Table (PAT). See Documentation/arch/x86/pat.rst for more details. * * This function can fail if the pfn range spans pfns that require differing * cachemodes. If the pfn range was previously verified to have a single * cachemode, it is sufficient to query only a single pfn. The assumption is * that this is the case for drivers using the vmf_insert_pfn*() interface. * * Returns 0 on success and -EINVAL on error. */ int pfnmap_setup_cachemode(unsigned long pfn, unsigned long size, pgprot_t *prot); /** * pfnmap_track - track a pfn range * @pfn: the start of the pfn range * @size: the size of the pfn range in bytes * @prot: the pgprot to track * * Requested the pfn range to be 'tracked' by a hardware implementation and * setup the cachemode in @prot similar to pfnmap_setup_cachemode(). * * This allows for fine-grained control of memory cache behaviour at page * level granularity. Tracking memory this way is persisted across VMA splits * (VMA merging does not apply for VM_PFNMAP). * * Currently, there is only one implementation for this - x86 Page Attribute * Table (PAT). See Documentation/arch/x86/pat.rst for more details. * * Returns 0 on success and -EINVAL on error. */ int pfnmap_track(unsigned long pfn, unsigned long size, pgprot_t *prot); /** * pfnmap_untrack - untrack a pfn range * @pfn: the start of the pfn range * @size: the size of the pfn range in bytes * * Untrack a pfn range previously tracked through pfnmap_track(). */ void pfnmap_untrack(unsigned long pfn, unsigned long size); #endif /** * pfnmap_setup_cachemode_pfn - setup the cachemode in the pgprot for a pfn * @pfn: the pfn * @prot: the pgprot to modify * * Lookup the cachemode for @pfn and store it in @prot, leaving other * data in @prot unchanged. * * See pfnmap_setup_cachemode() for details. */ static inline void pfnmap_setup_cachemode_pfn(unsigned long pfn, pgprot_t *prot) { pfnmap_setup_cachemode(pfn, PAGE_SIZE, prot); } #ifdef CONFIG_MMU #ifdef __HAVE_COLOR_ZERO_PAGE static inline int is_zero_pfn(unsigned long pfn) { extern unsigned long zero_pfn; unsigned long offset_from_zero_pfn = pfn - zero_pfn; return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT); } #define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr)) #else static inline int is_zero_pfn(unsigned long pfn) { extern unsigned long zero_pfn; return pfn == zero_pfn; } static inline unsigned long my_zero_pfn(unsigned long addr) { extern unsigned long zero_pfn; return zero_pfn; } #endif #else static inline int is_zero_pfn(unsigned long pfn) { return 0; } static inline unsigned long my_zero_pfn(unsigned long addr) { return 0; } #endif /* CONFIG_MMU */ #ifdef CONFIG_MMU #ifndef CONFIG_TRANSPARENT_HUGEPAGE static inline int pmd_trans_huge(pmd_t pmd) { return 0; } #ifndef pmd_write static inline int pmd_write(pmd_t pmd) { BUG(); return 0; } #endif /* pmd_write */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifndef pud_write static inline int pud_write(pud_t pud) { BUG(); return 0; } #endif /* pud_write */ #if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \ !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) static inline int pud_trans_huge(pud_t pud) { return 0; } #endif static inline int pud_trans_unstable(pud_t *pud) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) pud_t pudval = READ_ONCE(*pud); if (pud_none(pudval) || pud_trans_huge(pudval)) return 1; if (unlikely(pud_bad(pudval))) { pud_clear_bad(pud); return 1; } #endif return 0; } #ifndef CONFIG_NUMA_BALANCING /* * In an inaccessible (PROT_NONE) VMA, pte_protnone() may indicate "yes". It is * perfectly valid to indicate "no" in that case, which is why our default * implementation defaults to "always no". * * In an accessible VMA, however, pte_protnone() reliably indicates PROT_NONE * page protection due to NUMA hinting. NUMA hinting faults only apply in * accessible VMAs. * * So, to reliably identify PROT_NONE PTEs that require a NUMA hinting fault, * looking at the VMA accessibility is sufficient. */ static inline int pte_protnone(pte_t pte) { return 0; } static inline int pmd_protnone(pmd_t pmd) { return 0; } #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_MMU */ #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP #ifndef __PAGETABLE_P4D_FOLDED int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot); void p4d_clear_huge(p4d_t *p4d); #else static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) { return 0; } static inline void p4d_clear_huge(p4d_t *p4d) { } #endif /* !__PAGETABLE_P4D_FOLDED */ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot); int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot); int pud_clear_huge(pud_t *pud); int pmd_clear_huge(pmd_t *pmd); int p4d_free_pud_page(p4d_t *p4d, unsigned long addr); int pud_free_pmd_page(pud_t *pud, unsigned long addr); int pmd_free_pte_page(pmd_t *pmd, unsigned long addr); #else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) { return 0; } static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) { return 0; } static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) { return 0; } static inline void p4d_clear_huge(p4d_t *p4d) { } static inline int pud_clear_huge(pud_t *pud) { return 0; } static inline int pmd_clear_huge(pmd_t *pmd) { return 0; } static inline int p4d_free_pud_page(p4d_t *p4d, unsigned long addr) { return 0; } static inline int pud_free_pmd_page(pud_t *pud, unsigned long addr) { return 0; } static inline int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) { return 0; } #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ #ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * ARCHes with special requirements for evicting THP backing TLB entries can * implement this. Otherwise also, it can help optimize normal TLB flush in * THP regime. Stock flush_tlb_range() typically has optimization to nuke the * entire TLB if flush span is greater than a threshold, which will * likely be true for a single huge page. Thus a single THP flush will * invalidate the entire TLB which is not desirable. * e.g. see arch/arc: flush_pmd_tlb_range */ #define flush_pmd_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) #define flush_pud_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) #else #define flush_pmd_tlb_range(vma, addr, end) BUILD_BUG() #define flush_pud_tlb_range(vma, addr, end) BUILD_BUG() #endif #endif struct file; int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, unsigned long size, pgprot_t *vma_prot); #ifndef CONFIG_X86_ESPFIX64 static inline void init_espfix_bsp(void) { } #endif extern void __init pgtable_cache_init(void); #ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) { return true; } static inline bool arch_has_pfn_modify_check(void) { return false; } #endif /* !_HAVE_ARCH_PFN_MODIFY_ALLOWED */ /* * Architecture PAGE_KERNEL_* fallbacks * * Some architectures don't define certain PAGE_KERNEL_* flags. This is either * because they really don't support them, or the port needs to be updated to * reflect the required functionality. Below are a set of relatively safe * fallbacks, as best effort, which we can count on in lieu of the architectures * not defining them on their own yet. */ #ifndef PAGE_KERNEL_RO # define PAGE_KERNEL_RO PAGE_KERNEL #endif #ifndef PAGE_KERNEL_EXEC # define PAGE_KERNEL_EXEC PAGE_KERNEL #endif /* * Page Table Modification bits for pgtbl_mod_mask. * * These are used by the p?d_alloc_track*() and p*d_populate_kernel() * functions in the generic vmalloc, ioremap and page table update code * to track at which page-table levels entries have been modified. * Based on that the code can better decide when page table changes need * to be synchronized to other page-tables in the system. */ #define __PGTBL_PGD_MODIFIED 0 #define __PGTBL_P4D_MODIFIED 1 #define __PGTBL_PUD_MODIFIED 2 #define __PGTBL_PMD_MODIFIED 3 #define __PGTBL_PTE_MODIFIED 4 #define PGTBL_PGD_MODIFIED BIT(__PGTBL_PGD_MODIFIED) #define PGTBL_P4D_MODIFIED BIT(__PGTBL_P4D_MODIFIED) #define PGTBL_PUD_MODIFIED BIT(__PGTBL_PUD_MODIFIED) #define PGTBL_PMD_MODIFIED BIT(__PGTBL_PMD_MODIFIED) #define PGTBL_PTE_MODIFIED BIT(__PGTBL_PTE_MODIFIED) /* Page-Table Modification Mask */ typedef unsigned int pgtbl_mod_mask; enum pgtable_level { PGTABLE_LEVEL_PTE = 0, PGTABLE_LEVEL_PMD, PGTABLE_LEVEL_PUD, PGTABLE_LEVEL_P4D, PGTABLE_LEVEL_PGD, }; static inline const char *pgtable_level_to_str(enum pgtable_level level) { switch (level) { case PGTABLE_LEVEL_PTE: return "pte"; case PGTABLE_LEVEL_PMD: return "pmd"; case PGTABLE_LEVEL_PUD: return "pud"; case PGTABLE_LEVEL_P4D: return "p4d"; case PGTABLE_LEVEL_PGD: return "pgd"; default: return "unknown"; } } #endif /* !__ASSEMBLY__ */ #if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT) #ifdef CONFIG_PHYS_ADDR_T_64BIT /* * ZSMALLOC needs to know the highest PFN on 32-bit architectures * with physical address space extension, but falls back to * BITS_PER_LONG otherwise. */ #error Missing MAX_POSSIBLE_PHYSMEM_BITS definition #else #define MAX_POSSIBLE_PHYSMEM_BITS 32 #endif #endif #ifndef has_transparent_hugepage #define has_transparent_hugepage() IS_BUILTIN(CONFIG_TRANSPARENT_HUGEPAGE) #endif #ifndef has_transparent_pud_hugepage #define has_transparent_pud_hugepage() IS_BUILTIN(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) #endif /* * On some architectures it depends on the mm if the p4d/pud or pmd * layer of the page table hierarchy is folded or not. */ #ifndef mm_p4d_folded #define mm_p4d_folded(mm) __is_defined(__PAGETABLE_P4D_FOLDED) #endif #ifndef mm_pud_folded #define mm_pud_folded(mm) __is_defined(__PAGETABLE_PUD_FOLDED) #endif #ifndef mm_pmd_folded #define mm_pmd_folded(mm) __is_defined(__PAGETABLE_PMD_FOLDED) #endif #ifndef p4d_offset_lockless #define p4d_offset_lockless(pgdp, pgd, address) p4d_offset(&(pgd), address) #endif #ifndef pud_offset_lockless #define pud_offset_lockless(p4dp, p4d, address) pud_offset(&(p4d), address) #endif #ifndef pmd_offset_lockless #define pmd_offset_lockless(pudp, pud, address) pmd_offset(&(pud), address) #endif /* * pXd_leaf() is the API to check whether a pgtable entry is a huge page * mapping. It should work globally across all archs, without any * dependency on CONFIG_* options. For architectures that do not support * huge mappings on specific levels, below fallbacks will be used. * * A leaf pgtable entry should always imply the following: * * - It is a "present" entry. IOW, before using this API, please check it * with pXd_present() first. NOTE: it may not always mean the "present * bit" is set. For example, PROT_NONE entries are always "present". * * - It should _never_ be a swap entry of any type. Above "present" check * should have guarded this, but let's be crystal clear on this. * * - It should contain a huge PFN, which points to a huge page larger than * PAGE_SIZE of the platform. The PFN format isn't important here. * * - It should cover all kinds of huge mappings (i.e. pXd_trans_huge() * or hugetlb mappings). */ #ifndef pgd_leaf #define pgd_leaf(x) false #endif #ifndef p4d_leaf #define p4d_leaf(x) false #endif #ifndef pud_leaf #define pud_leaf(x) false #endif #ifndef pmd_leaf #define pmd_leaf(x) false #endif #ifndef pgd_leaf_size #define pgd_leaf_size(x) (1ULL << PGDIR_SHIFT) #endif #ifndef p4d_leaf_size #define p4d_leaf_size(x) P4D_SIZE #endif #ifndef pud_leaf_size #define pud_leaf_size(x) PUD_SIZE #endif #ifndef pmd_leaf_size #define pmd_leaf_size(x) PMD_SIZE #endif #ifndef __pte_leaf_size #ifndef pte_leaf_size #define pte_leaf_size(x) PAGE_SIZE #endif #define __pte_leaf_size(x,y) pte_leaf_size(y) #endif /* * We always define pmd_pfn for all archs as it's used in lots of generic * code. Now it happens too for pud_pfn (and can happen for larger * mappings too in the future; we're not there yet). Instead of defining * it for all archs (like pmd_pfn), provide a fallback. * * Note that returning 0 here means any arch that didn't define this can * get severely wrong when it hits a real pud leaf. It's arch's * responsibility to properly define it when a huge pud is possible. */ #ifndef pud_pfn #define pud_pfn(x) 0 #endif /* * Some architectures have MMUs that are configurable or selectable at boot * time. These lead to variable PTRS_PER_x. For statically allocated arrays it * helps to have a static maximum value. */ #ifndef MAX_PTRS_PER_PTE #define MAX_PTRS_PER_PTE PTRS_PER_PTE #endif #ifndef MAX_PTRS_PER_PMD #define MAX_PTRS_PER_PMD PTRS_PER_PMD #endif #ifndef MAX_PTRS_PER_PUD #define MAX_PTRS_PER_PUD PTRS_PER_PUD #endif #ifndef MAX_PTRS_PER_P4D #define MAX_PTRS_PER_P4D PTRS_PER_P4D #endif #ifndef pte_pgprot #define pte_pgprot(x) ((pgprot_t) {0}) #endif #ifndef pmd_pgprot #define pmd_pgprot(x) ((pgprot_t) {0}) #endif #ifndef pud_pgprot #define pud_pgprot(x) ((pgprot_t) {0}) #endif /* description of effects of mapping type and prot in current implementation. * this is due to the limited x86 page protection hardware. The expected * behavior is in parens: * * map_type prot * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes * w: (no) no w: (no) no w: (yes) yes w: (no) no * x: (no) no x: (no) yes x: (no) yes x: (yes) yes * * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes * w: (no) no w: (no) no w: (copy) copy w: (no) no * x: (no) no x: (no) yes x: (no) yes x: (yes) yes * * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and * MAP_PRIVATE (with Enhanced PAN supported): * r: (no) no * w: (no) no * x: (yes) yes */ #define DECLARE_VM_GET_PAGE_PROT \ pgprot_t vm_get_page_prot(vm_flags_t vm_flags) \ { \ return protection_map[vm_flags & \ (VM_READ | VM_WRITE | VM_EXEC | VM_SHARED)]; \ } \ EXPORT_SYMBOL(vm_get_page_prot); #endif /* _LINUX_PGTABLE_H */
116 8 5 1 1 1 1 1 1 1 9 110 2 3 3 3 19 6 14 5 3 3 3 5 14 14 8 4 6 2 4 104 8 101 5 101 105 103 9 8 4 103 8 105 104 105 105 104 9 9 61 4 3 61 4 61 2 2 1 12 7 12 10 7 7 4 7 3 12 11 11 7 7 7 3 2 2 3 12 1 12 12 12 7 12 9 7 9 9 7 9 12 12 9 7 12 1 12 41 5 5 2 2 42 41 41 37 37 4 4 3 3 3 59 59 7 7 6 59 7 7 7 7 7 7 59 59 59 59 9 9 8 2 8 7 9 59 58 4 2 2 1 7 7 7 4 7 4 6 7 6 6 1 7 7 7 7 6 4 4 4 59 59 59 7 59 7 7 59 7 6 4 59 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/pnode.c * * (C) Copyright IBM Corporation 2005. * Author : Ram Pai (linuxram@us.ibm.com) */ #include <linux/mnt_namespace.h> #include <linux/mount.h> #include <linux/fs.h> #include <linux/nsproxy.h> #include <uapi/linux/mount.h> #include "internal.h" #include "pnode.h" /* return the next shared peer mount of @p */ static inline struct mount *next_peer(struct mount *p) { return list_entry(p->mnt_share.next, struct mount, mnt_share); } static inline struct mount *first_slave(struct mount *p) { return hlist_entry(p->mnt_slave_list.first, struct mount, mnt_slave); } static inline struct mount *next_slave(struct mount *p) { return hlist_entry(p->mnt_slave.next, struct mount, mnt_slave); } /* locks: namespace_shared && is_mounted(mnt) */ static struct mount *get_peer_under_root(struct mount *mnt, struct mnt_namespace *ns, const struct path *root) { struct mount *m = mnt; do { /* Check the namespace first for optimization */ if (m->mnt_ns == ns && is_path_reachable(m, m->mnt.mnt_root, root)) return m; m = next_peer(m); } while (m != mnt); return NULL; } /* * Get ID of closest dominating peer group having a representative * under the given root. * * locks: namespace_shared */ int get_dominating_id(struct mount *mnt, const struct path *root) { struct mount *m; for (m = mnt->mnt_master; m != NULL; m = m->mnt_master) { struct mount *d = get_peer_under_root(m, mnt->mnt_ns, root); if (d) return d->mnt_group_id; } return 0; } static inline bool will_be_unmounted(struct mount *m) { return m->mnt.mnt_flags & MNT_UMOUNT; } static void transfer_propagation(struct mount *mnt, struct mount *to) { struct hlist_node *p = NULL, *n; struct mount *m; hlist_for_each_entry_safe(m, n, &mnt->mnt_slave_list, mnt_slave) { m->mnt_master = to; if (!to) hlist_del_init(&m->mnt_slave); else p = &m->mnt_slave; } if (p) hlist_splice_init(&mnt->mnt_slave_list, p, &to->mnt_slave_list); } /* * EXCL[namespace_sem] */ void change_mnt_propagation(struct mount *mnt, int type) { struct mount *m = mnt->mnt_master; if (type == MS_SHARED) { set_mnt_shared(mnt); return; } if (IS_MNT_SHARED(mnt)) { if (list_empty(&mnt->mnt_share)) { mnt_release_group_id(mnt); } else { m = next_peer(mnt); list_del_init(&mnt->mnt_share); mnt->mnt_group_id = 0; } CLEAR_MNT_SHARED(mnt); transfer_propagation(mnt, m); } hlist_del_init(&mnt->mnt_slave); if (type == MS_SLAVE) { mnt->mnt_master = m; if (m) hlist_add_head(&mnt->mnt_slave, &m->mnt_slave_list); } else { mnt->mnt_master = NULL; if (type == MS_UNBINDABLE) mnt->mnt_t_flags |= T_UNBINDABLE; else mnt->mnt_t_flags &= ~T_UNBINDABLE; } } static struct mount *trace_transfers(struct mount *m) { while (1) { struct mount *next = next_peer(m); if (next != m) { list_del_init(&m->mnt_share); m->mnt_group_id = 0; m->mnt_master = next; } else { if (IS_MNT_SHARED(m)) mnt_release_group_id(m); next = m->mnt_master; } hlist_del_init(&m->mnt_slave); CLEAR_MNT_SHARED(m); SET_MNT_MARK(m); if (!next || !will_be_unmounted(next)) return next; if (IS_MNT_MARKED(next)) return next->mnt_master; m = next; } } static void set_destinations(struct mount *m, struct mount *master) { struct mount *next; while ((next = m->mnt_master) != master) { m->mnt_master = master; m = next; } } void bulk_make_private(struct list_head *set) { struct mount *m; list_for_each_entry(m, set, mnt_list) if (!IS_MNT_MARKED(m)) set_destinations(m, trace_transfers(m)); list_for_each_entry(m, set, mnt_list) { transfer_propagation(m, m->mnt_master); m->mnt_master = NULL; CLEAR_MNT_MARK(m); } } static struct mount *__propagation_next(struct mount *m, struct mount *origin) { while (1) { struct mount *master = m->mnt_master; if (master == origin->mnt_master) { struct mount *next = next_peer(m); return (next == origin) ? NULL : next; } else if (m->mnt_slave.next) return next_slave(m); /* back at master */ m = master; } } /* * get the next mount in the propagation tree. * @m: the mount seen last * @origin: the original mount from where the tree walk initiated * * Note that peer groups form contiguous segments of slave lists. * We rely on that in get_source() to be able to find out if * vfsmount found while iterating with propagation_next() is * a peer of one we'd found earlier. */ static struct mount *propagation_next(struct mount *m, struct mount *origin) { /* are there any slaves of this mount? */ if (!IS_MNT_NEW(m) && !hlist_empty(&m->mnt_slave_list)) return first_slave(m); return __propagation_next(m, origin); } static struct mount *skip_propagation_subtree(struct mount *m, struct mount *origin) { /* * Advance m past everything that gets propagation from it. */ struct mount *p = __propagation_next(m, origin); while (p && peers(m, p)) p = __propagation_next(p, origin); return p; } static struct mount *next_group(struct mount *m, struct mount *origin) { while (1) { while (1) { struct mount *next; if (!IS_MNT_NEW(m) && !hlist_empty(&m->mnt_slave_list)) return first_slave(m); next = next_peer(m); if (m->mnt_group_id == origin->mnt_group_id) { if (next == origin) return NULL; } else if (m->mnt_slave.next != &next->mnt_slave) break; m = next; } /* m is the last peer */ while (1) { struct mount *master = m->mnt_master; if (m->mnt_slave.next) return next_slave(m); m = next_peer(master); if (master->mnt_group_id == origin->mnt_group_id) break; if (master->mnt_slave.next == &m->mnt_slave) break; m = master; } if (m == origin) return NULL; } } static bool need_secondary(struct mount *m, struct mountpoint *dest_mp) { /* skip ones added by this propagate_mnt() */ if (IS_MNT_NEW(m)) return false; /* skip if mountpoint isn't visible in m */ if (!is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) return false; /* skip if m is in the anon_ns */ if (is_anon_ns(m->mnt_ns)) return false; return true; } static struct mount *find_master(struct mount *m, struct mount *last_copy, struct mount *original) { struct mount *p; // ascend until there's a copy for something with the same master for (;;) { p = m->mnt_master; if (!p || IS_MNT_MARKED(p)) break; m = p; } while (!peers(last_copy, original)) { struct mount *parent = last_copy->mnt_parent; if (parent->mnt_master == p) { if (!peers(parent, m)) last_copy = last_copy->mnt_master; break; } last_copy = last_copy->mnt_master; } return last_copy; } /** * propagate_mnt() - create secondary copies for tree attachment * @dest_mnt: destination mount. * @dest_mp: destination mountpoint. * @source_mnt: source mount. * @tree_list: list of secondaries to be attached. * * Create secondary copies for attaching a tree with root @source_mnt * at mount @dest_mnt with mountpoint @dest_mp. Link all new mounts * into a propagation graph. Set mountpoints for all secondaries, * link their roots into @tree_list via ->mnt_hash. */ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, struct mount *source_mnt, struct hlist_head *tree_list) { struct mount *m, *n, *copy, *this; int err = 0, type; if (dest_mnt->mnt_master) SET_MNT_MARK(dest_mnt->mnt_master); /* iterate over peer groups, depth first */ for (m = dest_mnt; m && !err; m = next_group(m, dest_mnt)) { if (m == dest_mnt) { // have one for dest_mnt itself copy = source_mnt; type = CL_MAKE_SHARED; n = next_peer(m); if (n == m) continue; } else { type = CL_SLAVE; /* beginning of peer group among the slaves? */ if (IS_MNT_SHARED(m)) type |= CL_MAKE_SHARED; n = m; } do { if (!need_secondary(n, dest_mp)) continue; if (type & CL_SLAVE) // first in this peer group copy = find_master(n, copy, source_mnt); this = copy_tree(copy, copy->mnt.mnt_root, type); if (IS_ERR(this)) { err = PTR_ERR(this); break; } scoped_guard(mount_locked_reader) mnt_set_mountpoint(n, dest_mp, this); if (n->mnt_master) SET_MNT_MARK(n->mnt_master); copy = this; hlist_add_head(&this->mnt_hash, tree_list); err = count_mounts(n->mnt_ns, this); if (err) break; type = CL_MAKE_SHARED; } while ((n = next_peer(n)) != m); } hlist_for_each_entry(n, tree_list, mnt_hash) { m = n->mnt_parent; if (m->mnt_master) CLEAR_MNT_MARK(m->mnt_master); } if (dest_mnt->mnt_master) CLEAR_MNT_MARK(dest_mnt->mnt_master); return err; } /* * return true if the refcount is greater than count */ static inline int do_refcount_check(struct mount *mnt, int count) { return mnt_get_count(mnt) > count; } /** * propagation_would_overmount - check whether propagation from @from * would overmount @to * @from: shared mount * @to: mount to check * @mp: future mountpoint of @to on @from * * If @from propagates mounts to @to, @from and @to must either be peers * or one of the masters in the hierarchy of masters of @to must be a * peer of @from. * * If the root of the @to mount is equal to the future mountpoint @mp of * the @to mount on @from then @to will be overmounted by whatever is * propagated to it. * * Context: This function expects namespace_lock() to be held and that * @mp is stable. * Return: If @from overmounts @to, true is returned, false if not. */ bool propagation_would_overmount(const struct mount *from, const struct mount *to, const struct mountpoint *mp) { if (!IS_MNT_SHARED(from)) return false; if (to->mnt.mnt_root != mp->m_dentry) return false; for (const struct mount *m = to; m; m = m->mnt_master) { if (peers(from, m)) return true; } return false; } /* * check if the mount 'mnt' can be unmounted successfully. * @mnt: the mount to be checked for unmount * NOTE: unmounting 'mnt' would naturally propagate to all * other mounts its parent propagates to. * Check if any of these mounts that **do not have submounts** * have more references than 'refcnt'. If so return busy. * * vfsmount lock must be held for write */ int propagate_mount_busy(struct mount *mnt, int refcnt) { struct mount *parent = mnt->mnt_parent; /* * quickly check if the current mount can be unmounted. * If not, we don't have to go checking for all other * mounts */ if (!list_empty(&mnt->mnt_mounts) || do_refcount_check(mnt, refcnt)) return 1; if (mnt == parent) return 0; for (struct mount *m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { struct list_head *head; struct mount *child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); if (!child) continue; head = &child->mnt_mounts; if (!list_empty(head)) { /* * a mount that covers child completely wouldn't prevent * it being pulled out; any other would. */ if (!list_is_singular(head) || !child->overmount) continue; } if (do_refcount_check(child, 1)) return 1; } return 0; } /* * Clear MNT_LOCKED when it can be shown to be safe. * * mount_lock lock must be held for write */ void propagate_mount_unlock(struct mount *mnt) { struct mount *parent = mnt->mnt_parent; struct mount *m, *child; BUG_ON(parent == mnt); for (m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); if (child) child->mnt.mnt_flags &= ~MNT_LOCKED; } } static inline bool is_candidate(struct mount *m) { return m->mnt_t_flags & T_UMOUNT_CANDIDATE; } static void umount_one(struct mount *m, struct list_head *to_umount) { m->mnt.mnt_flags |= MNT_UMOUNT; list_del_init(&m->mnt_child); move_from_ns(m); list_add_tail(&m->mnt_list, to_umount); } static void remove_from_candidate_list(struct mount *m) { m->mnt_t_flags &= ~(T_MARKED | T_UMOUNT_CANDIDATE); list_del_init(&m->mnt_list); } static void gather_candidates(struct list_head *set, struct list_head *candidates) { struct mount *m, *p, *q; list_for_each_entry(m, set, mnt_list) { if (is_candidate(m)) continue; m->mnt_t_flags |= T_UMOUNT_CANDIDATE; p = m->mnt_parent; q = propagation_next(p, p); while (q) { struct mount *child = __lookup_mnt(&q->mnt, m->mnt_mountpoint); if (child) { /* * We might've already run into this one. That * must've happened on earlier iteration of the * outer loop; in that case we can skip those * parents that get propagation from q - there * will be nothing new on those as well. */ if (is_candidate(child)) { q = skip_propagation_subtree(q, p); continue; } child->mnt_t_flags |= T_UMOUNT_CANDIDATE; if (!will_be_unmounted(child)) list_add(&child->mnt_list, candidates); } q = propagation_next(q, p); } } list_for_each_entry(m, set, mnt_list) m->mnt_t_flags &= ~T_UMOUNT_CANDIDATE; } /* * We know that some child of @m can't be unmounted. In all places where the * chain of descent of @m has child not overmounting the root of parent, * the parent can't be unmounted either. */ static void trim_ancestors(struct mount *m) { struct mount *p; for (p = m->mnt_parent; is_candidate(p); m = p, p = p->mnt_parent) { if (IS_MNT_MARKED(m)) // all candidates beneath are overmounts return; SET_MNT_MARK(m); if (m != p->overmount) p->mnt_t_flags &= ~T_UMOUNT_CANDIDATE; } } /* * Find and exclude all umount candidates forbidden by @m * (see Documentation/filesystems/propagate_umount.txt) * If we can immediately tell that @m is OK to unmount (unlocked * and all children are already committed to unmounting) commit * to unmounting it. * Only @m itself might be taken from the candidates list; * anything found by trim_ancestors() is marked non-candidate * and left on the list. */ static void trim_one(struct mount *m, struct list_head *to_umount) { bool remove_this = false, found = false, umount_this = false; struct mount *n; if (!is_candidate(m)) { // trim_ancestors() left it on list remove_from_candidate_list(m); return; } list_for_each_entry(n, &m->mnt_mounts, mnt_child) { if (!is_candidate(n)) { found = true; if (n != m->overmount) { remove_this = true; break; } } } if (found) { trim_ancestors(m); } else if (!IS_MNT_LOCKED(m) && list_empty(&m->mnt_mounts)) { remove_this = true; umount_this = true; } if (remove_this) { remove_from_candidate_list(m); if (umount_this) umount_one(m, to_umount); } } static void handle_locked(struct mount *m, struct list_head *to_umount) { struct mount *cutoff = m, *p; if (!is_candidate(m)) { // trim_ancestors() left it on list remove_from_candidate_list(m); return; } for (p = m; is_candidate(p); p = p->mnt_parent) { remove_from_candidate_list(p); if (!IS_MNT_LOCKED(p)) cutoff = p->mnt_parent; } if (will_be_unmounted(p)) cutoff = p; while (m != cutoff) { umount_one(m, to_umount); m = m->mnt_parent; } } /* * @m is not to going away, and it overmounts the top of a stack of mounts * that are going away. We know that all of those are fully overmounted * by the one above (@m being the topmost of the chain), so @m can be slid * in place where the bottom of the stack is attached. * * NOTE: here we temporarily violate a constraint - two mounts end up with * the same parent and mountpoint; that will be remedied as soon as we * return from propagate_umount() - its caller (umount_tree()) will detach * the stack from the parent it (and now @m) is attached to. umount_tree() * might choose to keep unmounted pieces stuck to each other, but it always * detaches them from the mounts that remain in the tree. */ static void reparent(struct mount *m) { struct mount *p = m; struct mountpoint *mp; do { mp = p->mnt_mp; p = p->mnt_parent; } while (will_be_unmounted(p)); mnt_change_mountpoint(p, mp, m); mnt_notify_add(m); } /** * propagate_umount - apply propagation rules to the set of mounts for umount() * @set: the list of mounts to be unmounted. * * Collect all mounts that receive propagation from the mount in @set and have * no obstacles to being unmounted. Add these additional mounts to the set. * * See Documentation/filesystems/propagate_umount.txt if you do anything in * this area. * * Locks held: * mount_lock (write_seqlock), namespace_sem (exclusive). */ void propagate_umount(struct list_head *set) { struct mount *m, *p; LIST_HEAD(to_umount); // committed to unmounting LIST_HEAD(candidates); // undecided umount candidates // collect all candidates gather_candidates(set, &candidates); // reduce the set until it's non-shifting list_for_each_entry_safe(m, p, &candidates, mnt_list) trim_one(m, &to_umount); // ... and non-revealing while (!list_empty(&candidates)) { m = list_first_entry(&candidates,struct mount, mnt_list); handle_locked(m, &to_umount); } // now to_umount consists of all acceptable candidates // deal with reparenting of surviving overmounts on those list_for_each_entry(m, &to_umount, mnt_list) { struct mount *over = m->overmount; if (over && !will_be_unmounted(over)) reparent(over); } // and fold them into the set list_splice_tail_init(&to_umount, set); }
4 4 4 6 6 1 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 // SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) */ #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/kernel.h> #include <linux/interrupt.h> #include <linux/fs.h> #include <linux/types.h> #include <linux/sysctl.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/errno.h> #include <linux/fcntl.h> #include <linux/in.h> #include <linux/if_ether.h> #include <linux/slab.h> #include <asm/io.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <net/ip.h> #include <net/arp.h> #include <net/ax25.h> #include <net/rose.h> static int rose_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) { unsigned char *buff = skb_push(skb, ROSE_MIN_LEN + 2); if (daddr) memcpy(buff + 7, daddr, dev->addr_len); *buff++ = ROSE_GFI | ROSE_Q_BIT; *buff++ = 0x00; *buff++ = ROSE_DATA; *buff++ = 0x7F; *buff++ = AX25_P_IP; if (daddr != NULL) return 37; return -37; } static int rose_set_mac_address(struct net_device *dev, void *addr) { struct sockaddr *sa = addr; int err; if (!memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) return 0; if (dev->flags & IFF_UP) { err = rose_add_loopback_node((rose_address *)sa->sa_data); if (err) return err; rose_del_loopback_node((const rose_address *)dev->dev_addr); } dev_addr_set(dev, sa->sa_data); return 0; } static int rose_open(struct net_device *dev) { int err; err = rose_add_loopback_node((const rose_address *)dev->dev_addr); if (err) return err; netif_start_queue(dev); return 0; } static int rose_close(struct net_device *dev) { netif_stop_queue(dev); rose_del_loopback_node((const rose_address *)dev->dev_addr); return 0; } static netdev_tx_t rose_xmit(struct sk_buff *skb, struct net_device *dev) { struct net_device_stats *stats = &dev->stats; unsigned int len = skb->len; if (!netif_running(dev)) { printk(KERN_ERR "ROSE: rose_xmit - called when iface is down\n"); return NETDEV_TX_BUSY; } if (!rose_route_frame(skb, NULL)) { dev_kfree_skb(skb); stats->tx_errors++; return NETDEV_TX_OK; } stats->tx_packets++; stats->tx_bytes += len; return NETDEV_TX_OK; } static const struct header_ops rose_header_ops = { .create = rose_header, }; static const struct net_device_ops rose_netdev_ops = { .ndo_open = rose_open, .ndo_stop = rose_close, .ndo_start_xmit = rose_xmit, .ndo_set_mac_address = rose_set_mac_address, }; void rose_setup(struct net_device *dev) { dev->mtu = ROSE_MAX_PACKET_SIZE - 2; dev->netdev_ops = &rose_netdev_ops; dev->header_ops = &rose_header_ops; dev->hard_header_len = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN; dev->addr_len = ROSE_ADDR_LEN; dev->type = ARPHRD_ROSE; /* New-style flags. */ dev->flags = IFF_NOARP; }
2 3 3 1 1 1 1 3 3 3 27 27 27 27 27 23 23 23 23 23 8 8 8 8 8 4 4 3 4 4 4 3 3 3 3 3 3 3 3 3 2 2 3 2 1 1 1 3 4 4 3 5 1 1 1 1 10 10 9 8 8 2 1 6 8 7 2 7 10 3 2 1 1 3 6 5 4 3 2 6 4 3 2 1 4 6 6 5 4 1 4 3 2 6 3 2 1 3 1 1 1 1 1 3 2 1 1 1 3 8 7 6 2 4 5 2 4 3 3 2 2 3 2 3 1 3 3 8 4 3 2 1 1 3 4 3 2 1 4 5 4 3 2 5 5 4 3 2 5 2 2 2 2 2 2 2 2 2 2 4 4 3 2 1 1 4 5 4 3 2 1 1 5 408 527 408 408 408 527 23 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2011 Instituto Nokia de Tecnologia * * Authors: * Lauro Ramos Venancio <lauro.venancio@openbossa.org> * Aloisio Almeida Jr <aloisio.almeida@openbossa.org> * * Vendor commands implementation based on net/wireless/nl80211.c * which is: * * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH */ #define pr_fmt(fmt) KBUILD_MODNAME ": %s: " fmt, __func__ #include <net/genetlink.h> #include <linux/nfc.h> #include <linux/slab.h> #include "nfc.h" #include "llcp.h" static const struct genl_multicast_group nfc_genl_mcgrps[] = { { .name = NFC_GENL_MCAST_EVENT_NAME, }, }; static struct genl_family nfc_genl_family; static const struct nla_policy nfc_genl_policy[NFC_ATTR_MAX + 1] = { [NFC_ATTR_DEVICE_INDEX] = { .type = NLA_U32 }, [NFC_ATTR_DEVICE_NAME] = { .type = NLA_STRING, .len = NFC_DEVICE_NAME_MAXSIZE }, [NFC_ATTR_PROTOCOLS] = { .type = NLA_U32 }, [NFC_ATTR_TARGET_INDEX] = { .type = NLA_U32 }, [NFC_ATTR_COMM_MODE] = { .type = NLA_U8 }, [NFC_ATTR_RF_MODE] = { .type = NLA_U8 }, [NFC_ATTR_DEVICE_POWERED] = { .type = NLA_U8 }, [NFC_ATTR_IM_PROTOCOLS] = { .type = NLA_U32 }, [NFC_ATTR_TM_PROTOCOLS] = { .type = NLA_U32 }, [NFC_ATTR_LLC_PARAM_LTO] = { .type = NLA_U8 }, [NFC_ATTR_LLC_PARAM_RW] = { .type = NLA_U8 }, [NFC_ATTR_LLC_PARAM_MIUX] = { .type = NLA_U16 }, [NFC_ATTR_LLC_SDP] = { .type = NLA_NESTED }, [NFC_ATTR_FIRMWARE_NAME] = { .type = NLA_STRING, .len = NFC_FIRMWARE_NAME_MAXSIZE }, [NFC_ATTR_SE_INDEX] = { .type = NLA_U32 }, [NFC_ATTR_SE_APDU] = { .type = NLA_BINARY }, [NFC_ATTR_VENDOR_ID] = { .type = NLA_U32 }, [NFC_ATTR_VENDOR_SUBCMD] = { .type = NLA_U32 }, [NFC_ATTR_VENDOR_DATA] = { .type = NLA_BINARY }, }; static const struct nla_policy nfc_sdp_genl_policy[NFC_SDP_ATTR_MAX + 1] = { [NFC_SDP_ATTR_URI] = { .type = NLA_STRING, .len = U8_MAX - 4 }, [NFC_SDP_ATTR_SAP] = { .type = NLA_U8 }, }; static int nfc_genl_send_target(struct sk_buff *msg, struct nfc_target *target, struct netlink_callback *cb, int flags) { void *hdr; hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &nfc_genl_family, flags, NFC_CMD_GET_TARGET); if (!hdr) return -EMSGSIZE; genl_dump_check_consistent(cb, hdr); if (nla_put_u32(msg, NFC_ATTR_TARGET_INDEX, target->idx) || nla_put_u32(msg, NFC_ATTR_PROTOCOLS, target->supported_protocols) || nla_put_u16(msg, NFC_ATTR_TARGET_SENS_RES, target->sens_res) || nla_put_u8(msg, NFC_ATTR_TARGET_SEL_RES, target->sel_res)) goto nla_put_failure; if (target->nfcid1_len > 0 && nla_put(msg, NFC_ATTR_TARGET_NFCID1, target->nfcid1_len, target->nfcid1)) goto nla_put_failure; if (target->sensb_res_len > 0 && nla_put(msg, NFC_ATTR_TARGET_SENSB_RES, target->sensb_res_len, target->sensb_res)) goto nla_put_failure; if (target->sensf_res_len > 0 && nla_put(msg, NFC_ATTR_TARGET_SENSF_RES, target->sensf_res_len, target->sensf_res)) goto nla_put_failure; if (target->is_iso15693) { if (nla_put_u8(msg, NFC_ATTR_TARGET_ISO15693_DSFID, target->iso15693_dsfid) || nla_put(msg, NFC_ATTR_TARGET_ISO15693_UID, sizeof(target->iso15693_uid), target->iso15693_uid)) goto nla_put_failure; } if (target->ats_len > 0 && nla_put(msg, NFC_ATTR_TARGET_ATS, target->ats_len, target->ats)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static struct nfc_dev *__get_device_from_cb(struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct nfc_dev *dev; u32 idx; if (!info->info.attrs[NFC_ATTR_DEVICE_INDEX]) return ERR_PTR(-EINVAL); idx = nla_get_u32(info->info.attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return ERR_PTR(-ENODEV); return dev; } static int nfc_genl_dump_targets(struct sk_buff *skb, struct netlink_callback *cb) { int i = cb->args[0]; struct nfc_dev *dev = (struct nfc_dev *) cb->args[1]; int rc; if (!dev) { dev = __get_device_from_cb(cb); if (IS_ERR(dev)) return PTR_ERR(dev); cb->args[1] = (long) dev; } device_lock(&dev->dev); cb->seq = dev->targets_generation; while (i < dev->n_targets) { rc = nfc_genl_send_target(skb, &dev->targets[i], cb, NLM_F_MULTI); if (rc < 0) break; i++; } device_unlock(&dev->dev); cb->args[0] = i; return skb->len; } static int nfc_genl_dump_targets_done(struct netlink_callback *cb) { struct nfc_dev *dev = (struct nfc_dev *) cb->args[1]; if (dev) nfc_put_device(dev); return 0; } int nfc_genl_targets_found(struct nfc_dev *dev) { struct sk_buff *msg; void *hdr; dev->genl_data.poll_req_portid = 0; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_TARGETS_FOUND); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; genlmsg_end(msg, hdr); return genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC); nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_target_lost(struct nfc_dev *dev, u32 target_idx) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_TARGET_LOST); if (!hdr) goto free_msg; if (nla_put_string(msg, NFC_ATTR_DEVICE_NAME, nfc_device_name(dev)) || nla_put_u32(msg, NFC_ATTR_TARGET_INDEX, target_idx)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_tm_activated(struct nfc_dev *dev, u32 protocol) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_TM_ACTIVATED); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; if (nla_put_u32(msg, NFC_ATTR_TM_PROTOCOLS, protocol)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_tm_deactivated(struct nfc_dev *dev) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_TM_DEACTIVATED); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } static int nfc_genl_setup_device_added(struct nfc_dev *dev, struct sk_buff *msg) { if (nla_put_string(msg, NFC_ATTR_DEVICE_NAME, nfc_device_name(dev)) || nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_PROTOCOLS, dev->supported_protocols) || nla_put_u8(msg, NFC_ATTR_DEVICE_POWERED, dev->dev_up) || nla_put_u8(msg, NFC_ATTR_RF_MODE, dev->rf_mode)) return -1; return 0; } int nfc_genl_device_added(struct nfc_dev *dev) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_DEVICE_ADDED); if (!hdr) goto free_msg; if (nfc_genl_setup_device_added(dev, msg)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_device_removed(struct nfc_dev *dev) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_DEVICE_REMOVED); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_llc_send_sdres(struct nfc_dev *dev, struct hlist_head *sdres_list) { struct sk_buff *msg; struct nlattr *sdp_attr, *uri_attr; struct nfc_llcp_sdp_tlv *sdres; struct hlist_node *n; void *hdr; int rc = -EMSGSIZE; int i; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_LLC_SDRES); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; sdp_attr = nla_nest_start_noflag(msg, NFC_ATTR_LLC_SDP); if (sdp_attr == NULL) { rc = -ENOMEM; goto nla_put_failure; } i = 1; hlist_for_each_entry_safe(sdres, n, sdres_list, node) { pr_debug("uri: %s, sap: %d\n", sdres->uri, sdres->sap); uri_attr = nla_nest_start_noflag(msg, i++); if (uri_attr == NULL) { rc = -ENOMEM; goto nla_put_failure; } if (nla_put_u8(msg, NFC_SDP_ATTR_SAP, sdres->sap)) goto nla_put_failure; if (nla_put_string(msg, NFC_SDP_ATTR_URI, sdres->uri)) goto nla_put_failure; nla_nest_end(msg, uri_attr); hlist_del(&sdres->node); nfc_llcp_free_sdp_tlv(sdres); } nla_nest_end(msg, sdp_attr); genlmsg_end(msg, hdr); return genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC); nla_put_failure: free_msg: nlmsg_free(msg); nfc_llcp_free_sdp_tlv_list(sdres_list); return rc; } int nfc_genl_se_added(struct nfc_dev *dev, u32 se_idx, u16 type) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_SE_ADDED); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, se_idx) || nla_put_u8(msg, NFC_ATTR_SE_TYPE, type)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_se_removed(struct nfc_dev *dev, u32 se_idx) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_SE_REMOVED); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, se_idx)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_se_transaction(struct nfc_dev *dev, u8 se_idx, struct nfc_evt_transaction *evt_transaction) { struct nfc_se *se; struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_SE_TRANSACTION); if (!hdr) goto free_msg; se = nfc_find_se(dev, se_idx); if (!se) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, se_idx) || nla_put_u8(msg, NFC_ATTR_SE_TYPE, se->type) || nla_put(msg, NFC_ATTR_SE_AID, evt_transaction->aid_len, evt_transaction->aid) || nla_put(msg, NFC_ATTR_SE_PARAMS, evt_transaction->params_len, evt_transaction->params)) goto nla_put_failure; /* evt_transaction is no more used */ devm_kfree(&dev->dev, evt_transaction); genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: /* evt_transaction is no more used */ devm_kfree(&dev->dev, evt_transaction); nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_se_connectivity(struct nfc_dev *dev, u8 se_idx) { const struct nfc_se *se; struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_SE_CONNECTIVITY); if (!hdr) goto free_msg; se = nfc_find_se(dev, se_idx); if (!se) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, se_idx) || nla_put_u8(msg, NFC_ATTR_SE_TYPE, se->type)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } static int nfc_genl_send_device(struct sk_buff *msg, struct nfc_dev *dev, u32 portid, u32 seq, struct netlink_callback *cb, int flags) { void *hdr; hdr = genlmsg_put(msg, portid, seq, &nfc_genl_family, flags, NFC_CMD_GET_DEVICE); if (!hdr) return -EMSGSIZE; if (cb) genl_dump_check_consistent(cb, hdr); if (nfc_genl_setup_device_added(dev, msg)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int nfc_genl_dump_devices(struct sk_buff *skb, struct netlink_callback *cb) { struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0]; struct nfc_dev *dev = (struct nfc_dev *) cb->args[1]; bool first_call = false; if (!iter) { first_call = true; iter = kmalloc(sizeof(struct class_dev_iter), GFP_KERNEL); if (!iter) return -ENOMEM; cb->args[0] = (long) iter; } mutex_lock(&nfc_devlist_mutex); cb->seq = nfc_devlist_generation; if (first_call) { nfc_device_iter_init(iter); dev = nfc_device_iter_next(iter); } while (dev) { int rc; rc = nfc_genl_send_device(skb, dev, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb, NLM_F_MULTI); if (rc < 0) break; dev = nfc_device_iter_next(iter); } mutex_unlock(&nfc_devlist_mutex); cb->args[1] = (long) dev; return skb->len; } static int nfc_genl_dump_devices_done(struct netlink_callback *cb) { struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0]; if (iter) { nfc_device_iter_exit(iter); kfree(iter); } return 0; } int nfc_genl_dep_link_up_event(struct nfc_dev *dev, u32 target_idx, u8 comm_mode, u8 rf_mode) { struct sk_buff *msg; void *hdr; pr_debug("DEP link is up\n"); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_CMD_DEP_LINK_UP); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; if (rf_mode == NFC_RF_INITIATOR && nla_put_u32(msg, NFC_ATTR_TARGET_INDEX, target_idx)) goto nla_put_failure; if (nla_put_u8(msg, NFC_ATTR_COMM_MODE, comm_mode) || nla_put_u8(msg, NFC_ATTR_RF_MODE, rf_mode)) goto nla_put_failure; genlmsg_end(msg, hdr); dev->dep_link_up = true; genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_dep_link_down_event(struct nfc_dev *dev) { struct sk_buff *msg; void *hdr; pr_debug("DEP link is down\n"); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_CMD_DEP_LINK_DOWN); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } static int nfc_genl_get_device(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; struct nfc_dev *dev; u32 idx; int rc = -ENOBUFS; if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { rc = -ENOMEM; goto out_putdev; } rc = nfc_genl_send_device(msg, dev, info->snd_portid, info->snd_seq, NULL, 0); if (rc < 0) goto out_free; nfc_put_device(dev); return genlmsg_reply(msg, info); out_free: nlmsg_free(msg); out_putdev: nfc_put_device(dev); return rc; } static int nfc_genl_dev_up(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; rc = nfc_dev_up(dev); nfc_put_device(dev); return rc; } static int nfc_genl_dev_down(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; rc = nfc_dev_down(dev); nfc_put_device(dev); return rc; } static int nfc_genl_start_poll(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx; u32 im_protocols = 0, tm_protocols = 0; pr_debug("Poll start\n"); if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || ((!info->attrs[NFC_ATTR_IM_PROTOCOLS] && !info->attrs[NFC_ATTR_PROTOCOLS]) && !info->attrs[NFC_ATTR_TM_PROTOCOLS])) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); if (info->attrs[NFC_ATTR_TM_PROTOCOLS]) tm_protocols = nla_get_u32(info->attrs[NFC_ATTR_TM_PROTOCOLS]); if (info->attrs[NFC_ATTR_IM_PROTOCOLS]) im_protocols = nla_get_u32(info->attrs[NFC_ATTR_IM_PROTOCOLS]); else if (info->attrs[NFC_ATTR_PROTOCOLS]) im_protocols = nla_get_u32(info->attrs[NFC_ATTR_PROTOCOLS]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; mutex_lock(&dev->genl_data.genl_data_mutex); rc = nfc_start_poll(dev, im_protocols, tm_protocols); if (!rc) dev->genl_data.poll_req_portid = info->snd_portid; mutex_unlock(&dev->genl_data.genl_data_mutex); nfc_put_device(dev); return rc; } static int nfc_genl_stop_poll(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; device_lock(&dev->dev); if (!dev->polling) { device_unlock(&dev->dev); nfc_put_device(dev); return -EINVAL; } device_unlock(&dev->dev); mutex_lock(&dev->genl_data.genl_data_mutex); if (dev->genl_data.poll_req_portid != info->snd_portid) { rc = -EBUSY; goto out; } rc = nfc_stop_poll(dev); dev->genl_data.poll_req_portid = 0; out: mutex_unlock(&dev->genl_data.genl_data_mutex); nfc_put_device(dev); return rc; } static int nfc_genl_activate_target(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; u32 device_idx, target_idx, protocol; int rc; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_TARGET_INDEX] || !info->attrs[NFC_ATTR_PROTOCOLS]) return -EINVAL; device_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(device_idx); if (!dev) return -ENODEV; target_idx = nla_get_u32(info->attrs[NFC_ATTR_TARGET_INDEX]); protocol = nla_get_u32(info->attrs[NFC_ATTR_PROTOCOLS]); nfc_deactivate_target(dev, target_idx, NFC_TARGET_MODE_SLEEP); rc = nfc_activate_target(dev, target_idx, protocol); nfc_put_device(dev); return rc; } static int nfc_genl_deactivate_target(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; u32 device_idx, target_idx; int rc; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_TARGET_INDEX]) return -EINVAL; device_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(device_idx); if (!dev) return -ENODEV; target_idx = nla_get_u32(info->attrs[NFC_ATTR_TARGET_INDEX]); rc = nfc_deactivate_target(dev, target_idx, NFC_TARGET_MODE_SLEEP); nfc_put_device(dev); return rc; } static int nfc_genl_dep_link_up(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc, tgt_idx; u32 idx; u8 comm; pr_debug("DEP link up\n"); if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_COMM_MODE]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); if (!info->attrs[NFC_ATTR_TARGET_INDEX]) tgt_idx = NFC_TARGET_IDX_ANY; else tgt_idx = nla_get_u32(info->attrs[NFC_ATTR_TARGET_INDEX]); comm = nla_get_u8(info->attrs[NFC_ATTR_COMM_MODE]); if (comm != NFC_COMM_ACTIVE && comm != NFC_COMM_PASSIVE) return -EINVAL; dev = nfc_get_device(idx); if (!dev) return -ENODEV; rc = nfc_dep_link_up(dev, tgt_idx, comm); nfc_put_device(dev); return rc; } static int nfc_genl_dep_link_down(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; rc = nfc_dep_link_down(dev); nfc_put_device(dev); return rc; } static int nfc_genl_send_params(struct sk_buff *msg, struct nfc_llcp_local *local, u32 portid, u32 seq) { void *hdr; hdr = genlmsg_put(msg, portid, seq, &nfc_genl_family, 0, NFC_CMD_LLC_GET_PARAMS); if (!hdr) return -EMSGSIZE; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, local->dev->idx) || nla_put_u8(msg, NFC_ATTR_LLC_PARAM_LTO, local->lto) || nla_put_u8(msg, NFC_ATTR_LLC_PARAM_RW, local->rw) || nla_put_u16(msg, NFC_ATTR_LLC_PARAM_MIUX, be16_to_cpu(local->miux))) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int nfc_genl_llc_get_params(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; struct nfc_llcp_local *local; int rc = 0; struct sk_buff *msg = NULL; u32 idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; device_lock(&dev->dev); local = nfc_llcp_find_local(dev); if (!local) { rc = -ENODEV; goto exit; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { rc = -ENOMEM; goto put_local; } rc = nfc_genl_send_params(msg, local, info->snd_portid, info->snd_seq); put_local: nfc_llcp_local_put(local); exit: device_unlock(&dev->dev); nfc_put_device(dev); if (rc < 0) { if (msg) nlmsg_free(msg); return rc; } return genlmsg_reply(msg, info); } static int nfc_genl_llc_set_params(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; struct nfc_llcp_local *local; u8 rw = 0; u16 miux = 0; u32 idx; int rc = 0; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || (!info->attrs[NFC_ATTR_LLC_PARAM_LTO] && !info->attrs[NFC_ATTR_LLC_PARAM_RW] && !info->attrs[NFC_ATTR_LLC_PARAM_MIUX])) return -EINVAL; if (info->attrs[NFC_ATTR_LLC_PARAM_RW]) { rw = nla_get_u8(info->attrs[NFC_ATTR_LLC_PARAM_RW]); if (rw > LLCP_MAX_RW) return -EINVAL; } if (info->attrs[NFC_ATTR_LLC_PARAM_MIUX]) { miux = nla_get_u16(info->attrs[NFC_ATTR_LLC_PARAM_MIUX]); if (miux > LLCP_MAX_MIUX) return -EINVAL; } idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; device_lock(&dev->dev); local = nfc_llcp_find_local(dev); if (!local) { rc = -ENODEV; goto exit; } if (info->attrs[NFC_ATTR_LLC_PARAM_LTO]) { if (dev->dep_link_up) { rc = -EINPROGRESS; goto put_local; } local->lto = nla_get_u8(info->attrs[NFC_ATTR_LLC_PARAM_LTO]); } if (info->attrs[NFC_ATTR_LLC_PARAM_RW]) local->rw = rw; if (info->attrs[NFC_ATTR_LLC_PARAM_MIUX]) local->miux = cpu_to_be16(miux); put_local: nfc_llcp_local_put(local); exit: device_unlock(&dev->dev); nfc_put_device(dev); return rc; } static int nfc_genl_llc_sdreq(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; struct nfc_llcp_local *local; struct nlattr *attr, *sdp_attrs[NFC_SDP_ATTR_MAX+1]; u32 idx; u8 tid; char *uri; int rc = 0, rem; size_t uri_len, tlvs_len; struct hlist_head sdreq_list; struct nfc_llcp_sdp_tlv *sdreq; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_LLC_SDP]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; device_lock(&dev->dev); if (dev->dep_link_up == false) { rc = -ENOLINK; goto exit; } local = nfc_llcp_find_local(dev); if (!local) { rc = -ENODEV; goto exit; } INIT_HLIST_HEAD(&sdreq_list); tlvs_len = 0; nla_for_each_nested(attr, info->attrs[NFC_ATTR_LLC_SDP], rem) { rc = nla_parse_nested_deprecated(sdp_attrs, NFC_SDP_ATTR_MAX, attr, nfc_sdp_genl_policy, info->extack); if (rc != 0) { rc = -EINVAL; goto put_local; } if (!sdp_attrs[NFC_SDP_ATTR_URI]) continue; uri_len = nla_len(sdp_attrs[NFC_SDP_ATTR_URI]); if (uri_len == 0) continue; uri = nla_data(sdp_attrs[NFC_SDP_ATTR_URI]); if (*uri == 0) continue; tid = local->sdreq_next_tid++; sdreq = nfc_llcp_build_sdreq_tlv(tid, uri, uri_len); if (sdreq == NULL) { rc = -ENOMEM; goto put_local; } tlvs_len += sdreq->tlv_len; hlist_add_head(&sdreq->node, &sdreq_list); } if (hlist_empty(&sdreq_list)) { rc = -EINVAL; goto put_local; } rc = nfc_llcp_send_snl_sdreq(local, &sdreq_list, tlvs_len); put_local: nfc_llcp_local_put(local); exit: device_unlock(&dev->dev); nfc_put_device(dev); return rc; } static int nfc_genl_fw_download(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx; char firmware_name[NFC_FIRMWARE_NAME_MAXSIZE + 1]; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_FIRMWARE_NAME]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; nla_strscpy(firmware_name, info->attrs[NFC_ATTR_FIRMWARE_NAME], sizeof(firmware_name)); rc = nfc_fw_download(dev, firmware_name); nfc_put_device(dev); return rc; } int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name, u32 result) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_CMD_FW_DOWNLOAD); if (!hdr) goto free_msg; if (nla_put_string(msg, NFC_ATTR_FIRMWARE_NAME, firmware_name) || nla_put_u32(msg, NFC_ATTR_FIRMWARE_DOWNLOAD_STATUS, result) || nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } static int nfc_genl_enable_se(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx, se_idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_SE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); se_idx = nla_get_u32(info->attrs[NFC_ATTR_SE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; rc = nfc_enable_se(dev, se_idx); nfc_put_device(dev); return rc; } static int nfc_genl_disable_se(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx, se_idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_SE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); se_idx = nla_get_u32(info->attrs[NFC_ATTR_SE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; rc = nfc_disable_se(dev, se_idx); nfc_put_device(dev); return rc; } static int nfc_genl_send_se(struct sk_buff *msg, struct nfc_dev *dev, u32 portid, u32 seq, struct netlink_callback *cb, int flags) { void *hdr; struct nfc_se *se, *n; list_for_each_entry_safe(se, n, &dev->secure_elements, list) { hdr = genlmsg_put(msg, portid, seq, &nfc_genl_family, flags, NFC_CMD_GET_SE); if (!hdr) goto nla_put_failure; if (cb) genl_dump_check_consistent(cb, hdr); if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, se->idx) || nla_put_u8(msg, NFC_ATTR_SE_TYPE, se->type)) goto nla_put_failure; genlmsg_end(msg, hdr); } return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int nfc_genl_dump_ses(struct sk_buff *skb, struct netlink_callback *cb) { struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0]; struct nfc_dev *dev = (struct nfc_dev *) cb->args[1]; bool first_call = false; if (!iter) { first_call = true; iter = kmalloc(sizeof(struct class_dev_iter), GFP_KERNEL); if (!iter) return -ENOMEM; cb->args[0] = (long) iter; } mutex_lock(&nfc_devlist_mutex); cb->seq = nfc_devlist_generation; if (first_call) { nfc_device_iter_init(iter); dev = nfc_device_iter_next(iter); } while (dev) { int rc; rc = nfc_genl_send_se(skb, dev, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb, NLM_F_MULTI); if (rc < 0) break; dev = nfc_device_iter_next(iter); } mutex_unlock(&nfc_devlist_mutex); cb->args[1] = (long) dev; return skb->len; } static int nfc_genl_dump_ses_done(struct netlink_callback *cb) { struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0]; if (iter) { nfc_device_iter_exit(iter); kfree(iter); } return 0; } static int nfc_se_io(struct nfc_dev *dev, u32 se_idx, u8 *apdu, size_t apdu_length, se_io_cb_t cb, void *cb_context) { struct nfc_se *se; int rc; pr_debug("%s se index %d\n", dev_name(&dev->dev), se_idx); device_lock(&dev->dev); if (!device_is_registered(&dev->dev)) { rc = -ENODEV; goto error; } if (!dev->dev_up) { rc = -ENODEV; goto error; } if (!dev->ops->se_io) { rc = -EOPNOTSUPP; goto error; } se = nfc_find_se(dev, se_idx); if (!se) { rc = -EINVAL; goto error; } if (se->state != NFC_SE_ENABLED) { rc = -ENODEV; goto error; } rc = dev->ops->se_io(dev, se_idx, apdu, apdu_length, cb, cb_context); device_unlock(&dev->dev); return rc; error: device_unlock(&dev->dev); kfree(cb_context); return rc; } struct se_io_ctx { u32 dev_idx; u32 se_idx; }; static void se_io_cb(void *context, u8 *apdu, size_t apdu_len, int err) { struct se_io_ctx *ctx = context; struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { kfree(ctx); return; } hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_CMD_SE_IO); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, ctx->dev_idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, ctx->se_idx) || nla_put(msg, NFC_ATTR_SE_APDU, apdu_len, apdu)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); kfree(ctx); return; nla_put_failure: free_msg: nlmsg_free(msg); kfree(ctx); return; } static int nfc_genl_se_io(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; struct se_io_ctx *ctx; u32 dev_idx, se_idx; u8 *apdu; size_t apdu_len; int rc; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_SE_INDEX] || !info->attrs[NFC_ATTR_SE_APDU]) return -EINVAL; dev_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); se_idx = nla_get_u32(info->attrs[NFC_ATTR_SE_INDEX]); dev = nfc_get_device(dev_idx); if (!dev) return -ENODEV; if (!dev->ops || !dev->ops->se_io) { rc = -EOPNOTSUPP; goto put_dev; } apdu_len = nla_len(info->attrs[NFC_ATTR_SE_APDU]); if (apdu_len == 0) { rc = -EINVAL; goto put_dev; } apdu = nla_data(info->attrs[NFC_ATTR_SE_APDU]); ctx = kzalloc(sizeof(struct se_io_ctx), GFP_KERNEL); if (!ctx) { rc = -ENOMEM; goto put_dev; } ctx->dev_idx = dev_idx; ctx->se_idx = se_idx; rc = nfc_se_io(dev, se_idx, apdu, apdu_len, se_io_cb, ctx); put_dev: nfc_put_device(dev); return rc; } static int nfc_genl_vendor_cmd(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; const struct nfc_vendor_cmd *cmd; u32 dev_idx, vid, subcmd; u8 *data; size_t data_len; int i, err; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_VENDOR_ID] || !info->attrs[NFC_ATTR_VENDOR_SUBCMD]) return -EINVAL; dev_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); vid = nla_get_u32(info->attrs[NFC_ATTR_VENDOR_ID]); subcmd = nla_get_u32(info->attrs[NFC_ATTR_VENDOR_SUBCMD]); dev = nfc_get_device(dev_idx); if (!dev) return -ENODEV; if (!dev->vendor_cmds || !dev->n_vendor_cmds) { err = -ENODEV; goto put_dev; } if (info->attrs[NFC_ATTR_VENDOR_DATA]) { data = nla_data(info->attrs[NFC_ATTR_VENDOR_DATA]); data_len = nla_len(info->attrs[NFC_ATTR_VENDOR_DATA]); if (data_len == 0) { err = -EINVAL; goto put_dev; } } else { data = NULL; data_len = 0; } for (i = 0; i < dev->n_vendor_cmds; i++) { cmd = &dev->vendor_cmds[i]; if (cmd->vendor_id != vid || cmd->subcmd != subcmd) continue; dev->cur_cmd_info = info; err = cmd->doit(dev, data, data_len); dev->cur_cmd_info = NULL; goto put_dev; } err = -EOPNOTSUPP; put_dev: nfc_put_device(dev); return err; } /* message building helper */ static inline void *nfc_hdr_put(struct sk_buff *skb, u32 portid, u32 seq, int flags, u8 cmd) { /* since there is no private header just add the generic one */ return genlmsg_put(skb, portid, seq, &nfc_genl_family, flags, cmd); } static struct sk_buff * __nfc_alloc_vendor_cmd_skb(struct nfc_dev *dev, int approxlen, u32 portid, u32 seq, enum nfc_attrs attr, u32 oui, u32 subcmd, gfp_t gfp) { struct sk_buff *skb; void *hdr; skb = nlmsg_new(approxlen + 100, gfp); if (!skb) return NULL; hdr = nfc_hdr_put(skb, portid, seq, 0, NFC_CMD_VENDOR); if (!hdr) { kfree_skb(skb); return NULL; } if (nla_put_u32(skb, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; if (nla_put_u32(skb, NFC_ATTR_VENDOR_ID, oui)) goto nla_put_failure; if (nla_put_u32(skb, NFC_ATTR_VENDOR_SUBCMD, subcmd)) goto nla_put_failure; ((void **)skb->cb)[0] = dev; ((void **)skb->cb)[1] = hdr; return skb; nla_put_failure: kfree_skb(skb); return NULL; } struct sk_buff *__nfc_alloc_vendor_cmd_reply_skb(struct nfc_dev *dev, enum nfc_attrs attr, u32 oui, u32 subcmd, int approxlen) { if (WARN_ON(!dev->cur_cmd_info)) return NULL; return __nfc_alloc_vendor_cmd_skb(dev, approxlen, dev->cur_cmd_info->snd_portid, dev->cur_cmd_info->snd_seq, attr, oui, subcmd, GFP_KERNEL); } EXPORT_SYMBOL(__nfc_alloc_vendor_cmd_reply_skb); int nfc_vendor_cmd_reply(struct sk_buff *skb) { struct nfc_dev *dev = ((void **)skb->cb)[0]; void *hdr = ((void **)skb->cb)[1]; /* clear CB data for netlink core to own from now on */ memset(skb->cb, 0, sizeof(skb->cb)); if (WARN_ON(!dev->cur_cmd_info)) { kfree_skb(skb); return -EINVAL; } genlmsg_end(skb, hdr); return genlmsg_reply(skb, dev->cur_cmd_info); } EXPORT_SYMBOL(nfc_vendor_cmd_reply); static const struct genl_ops nfc_genl_ops[] = { { .cmd = NFC_CMD_GET_DEVICE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_get_device, .dumpit = nfc_genl_dump_devices, .done = nfc_genl_dump_devices_done, }, { .cmd = NFC_CMD_DEV_UP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dev_up, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEV_DOWN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dev_down, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_START_POLL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_start_poll, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_STOP_POLL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_stop_poll, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEP_LINK_UP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dep_link_up, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEP_LINK_DOWN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dep_link_down, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_GET_TARGET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = nfc_genl_dump_targets, .done = nfc_genl_dump_targets_done, }, { .cmd = NFC_CMD_LLC_GET_PARAMS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_llc_get_params, }, { .cmd = NFC_CMD_LLC_SET_PARAMS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_llc_set_params, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_LLC_SDREQ, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_llc_sdreq, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_FW_DOWNLOAD, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_fw_download, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_ENABLE_SE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_enable_se, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DISABLE_SE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_disable_se, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_GET_SE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .dumpit = nfc_genl_dump_ses, .done = nfc_genl_dump_ses_done, }, { .cmd = NFC_CMD_SE_IO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_se_io, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_ACTIVATE_TARGET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_activate_target, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_VENDOR, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_vendor_cmd, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEACTIVATE_TARGET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_deactivate_target, .flags = GENL_ADMIN_PERM, }, }; static struct genl_family nfc_genl_family __ro_after_init = { .hdrsize = 0, .name = NFC_GENL_NAME, .version = NFC_GENL_VERSION, .maxattr = NFC_ATTR_MAX, .policy = nfc_genl_policy, .module = THIS_MODULE, .ops = nfc_genl_ops, .n_ops = ARRAY_SIZE(nfc_genl_ops), .resv_start_op = NFC_CMD_DEACTIVATE_TARGET + 1, .mcgrps = nfc_genl_mcgrps, .n_mcgrps = ARRAY_SIZE(nfc_genl_mcgrps), }; struct urelease_work { struct work_struct w; u32 portid; }; static void nfc_urelease_event_work(struct work_struct *work) { struct urelease_work *w = container_of(work, struct urelease_work, w); struct class_dev_iter iter; struct nfc_dev *dev; pr_debug("portid %d\n", w->portid); mutex_lock(&nfc_devlist_mutex); nfc_device_iter_init(&iter); dev = nfc_device_iter_next(&iter); while (dev) { mutex_lock(&dev->genl_data.genl_data_mutex); if (dev->genl_data.poll_req_portid == w->portid) { nfc_stop_poll(dev); dev->genl_data.poll_req_portid = 0; } mutex_unlock(&dev->genl_data.genl_data_mutex); dev = nfc_device_iter_next(&iter); } nfc_device_iter_exit(&iter); mutex_unlock(&nfc_devlist_mutex); kfree(w); } static int nfc_genl_rcv_nl_event(struct notifier_block *this, unsigned long event, void *ptr) { struct netlink_notify *n = ptr; struct urelease_work *w; if (event != NETLINK_URELEASE || n->protocol != NETLINK_GENERIC) goto out; pr_debug("NETLINK_URELEASE event from id %d\n", n->portid); w = kmalloc(sizeof(*w), GFP_ATOMIC); if (w) { INIT_WORK(&w->w, nfc_urelease_event_work); w->portid = n->portid; schedule_work(&w->w); } out: return NOTIFY_DONE; } void nfc_genl_data_init(struct nfc_genl_data *genl_data) { genl_data->poll_req_portid = 0; mutex_init(&genl_data->genl_data_mutex); } void nfc_genl_data_exit(struct nfc_genl_data *genl_data) { mutex_destroy(&genl_data->genl_data_mutex); } static struct notifier_block nl_notifier = { .notifier_call = nfc_genl_rcv_nl_event, }; /** * nfc_genl_init() - Initialize netlink interface * * This initialization function registers the nfc netlink family. */ int __init nfc_genl_init(void) { int rc; rc = genl_register_family(&nfc_genl_family); if (rc) return rc; netlink_register_notifier(&nl_notifier); return 0; } /** * nfc_genl_exit() - Deinitialize netlink interface * * This exit function unregisters the nfc netlink family. */ void nfc_genl_exit(void) { netlink_unregister_notifier(&nl_notifier); genl_unregister_family(&nfc_genl_family); }
1 3 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 2 1 2 1 1 2 1 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 // SPDX-License-Identifier: GPL-2.0-only /* * TCP Illinois congestion control. * Home page: * http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html * * The algorithm is described in: * "TCP-Illinois: A Loss and Delay-Based Congestion Control Algorithm * for High-Speed Networks" * http://tamerbasar.csl.illinois.edu/LiuBasarSrikantPerfEvalArtJun2008.pdf * * Implemented from description in paper and ns-2 simulation. * Copyright (C) 2007 Stephen Hemminger <shemminger@linux-foundation.org> */ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/inet_diag.h> #include <asm/div64.h> #include <net/tcp.h> #define ALPHA_SHIFT 7 #define ALPHA_SCALE (1u<<ALPHA_SHIFT) #define ALPHA_MIN ((3*ALPHA_SCALE)/10) /* ~0.3 */ #define ALPHA_MAX (10*ALPHA_SCALE) /* 10.0 */ #define ALPHA_BASE ALPHA_SCALE /* 1.0 */ #define RTT_MAX (U32_MAX / ALPHA_MAX) /* 3.3 secs */ #define BETA_SHIFT 6 #define BETA_SCALE (1u<<BETA_SHIFT) #define BETA_MIN (BETA_SCALE/8) /* 0.125 */ #define BETA_MAX (BETA_SCALE/2) /* 0.5 */ #define BETA_BASE BETA_MAX static int win_thresh __read_mostly = 15; module_param(win_thresh, int, 0); MODULE_PARM_DESC(win_thresh, "Window threshold for starting adaptive sizing"); static int theta __read_mostly = 5; module_param(theta, int, 0); MODULE_PARM_DESC(theta, "# of fast RTT's before full growth"); /* TCP Illinois Parameters */ struct illinois { u64 sum_rtt; /* sum of rtt's measured within last rtt */ u16 cnt_rtt; /* # of rtts measured within last rtt */ u32 base_rtt; /* min of all rtt in usec */ u32 max_rtt; /* max of all rtt in usec */ u32 end_seq; /* right edge of current RTT */ u32 alpha; /* Additive increase */ u32 beta; /* Muliplicative decrease */ u16 acked; /* # packets acked by current ACK */ u8 rtt_above; /* average rtt has gone above threshold */ u8 rtt_low; /* # of rtts measurements below threshold */ }; static void rtt_reset(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct illinois *ca = inet_csk_ca(sk); ca->end_seq = tp->snd_nxt; ca->cnt_rtt = 0; ca->sum_rtt = 0; /* TODO: age max_rtt? */ } static void tcp_illinois_init(struct sock *sk) { struct illinois *ca = inet_csk_ca(sk); ca->alpha = ALPHA_MAX; ca->beta = BETA_BASE; ca->base_rtt = 0x7fffffff; ca->max_rtt = 0; ca->acked = 0; ca->rtt_low = 0; ca->rtt_above = 0; rtt_reset(sk); } /* Measure RTT for each ack. */ static void tcp_illinois_acked(struct sock *sk, const struct ack_sample *sample) { struct illinois *ca = inet_csk_ca(sk); s32 rtt_us = sample->rtt_us; ca->acked = sample->pkts_acked; /* dup ack, no rtt sample */ if (rtt_us < 0) return; /* ignore bogus values, this prevents wraparound in alpha math */ if (rtt_us > RTT_MAX) rtt_us = RTT_MAX; /* keep track of minimum RTT seen so far */ if (ca->base_rtt > rtt_us) ca->base_rtt = rtt_us; /* and max */ if (ca->max_rtt < rtt_us) ca->max_rtt = rtt_us; ++ca->cnt_rtt; ca->sum_rtt += rtt_us; } /* Maximum queuing delay */ static inline u32 max_delay(const struct illinois *ca) { return ca->max_rtt - ca->base_rtt; } /* Average queuing delay */ static inline u32 avg_delay(const struct illinois *ca) { u64 t = ca->sum_rtt; do_div(t, ca->cnt_rtt); return t - ca->base_rtt; } /* * Compute value of alpha used for additive increase. * If small window then use 1.0, equivalent to Reno. * * For larger windows, adjust based on average delay. * A. If average delay is at minimum (we are uncongested), * then use large alpha (10.0) to increase faster. * B. If average delay is at maximum (getting congested) * then use small alpha (0.3) * * The result is a convex window growth curve. */ static u32 alpha(struct illinois *ca, u32 da, u32 dm) { u32 d1 = dm / 100; /* Low threshold */ if (da <= d1) { /* If never got out of low delay zone, then use max */ if (!ca->rtt_above) return ALPHA_MAX; /* Wait for 5 good RTT's before allowing alpha to go alpha max. * This prevents one good RTT from causing sudden window increase. */ if (++ca->rtt_low < theta) return ca->alpha; ca->rtt_low = 0; ca->rtt_above = 0; return ALPHA_MAX; } ca->rtt_above = 1; /* * Based on: * * (dm - d1) amin amax * k1 = ------------------- * amax - amin * * (dm - d1) amin * k2 = ---------------- - d1 * amax - amin * * k1 * alpha = ---------- * k2 + da */ dm -= d1; da -= d1; return (dm * ALPHA_MAX) / (dm + (da * (ALPHA_MAX - ALPHA_MIN)) / ALPHA_MIN); } /* * Beta used for multiplicative decrease. * For small window sizes returns same value as Reno (0.5) * * If delay is small (10% of max) then beta = 1/8 * If delay is up to 80% of max then beta = 1/2 * In between is a linear function */ static u32 beta(u32 da, u32 dm) { u32 d2, d3; d2 = dm / 10; if (da <= d2) return BETA_MIN; d3 = (8 * dm) / 10; if (da >= d3 || d3 <= d2) return BETA_MAX; /* * Based on: * * bmin d3 - bmax d2 * k3 = ------------------- * d3 - d2 * * bmax - bmin * k4 = ------------- * d3 - d2 * * b = k3 + k4 da */ return (BETA_MIN * d3 - BETA_MAX * d2 + (BETA_MAX - BETA_MIN) * da) / (d3 - d2); } /* Update alpha and beta values once per RTT */ static void update_params(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct illinois *ca = inet_csk_ca(sk); if (tcp_snd_cwnd(tp) < win_thresh) { ca->alpha = ALPHA_BASE; ca->beta = BETA_BASE; } else if (ca->cnt_rtt > 0) { u32 dm = max_delay(ca); u32 da = avg_delay(ca); ca->alpha = alpha(ca, da, dm); ca->beta = beta(da, dm); } rtt_reset(sk); } /* * In case of loss, reset to default values */ static void tcp_illinois_state(struct sock *sk, u8 new_state) { struct illinois *ca = inet_csk_ca(sk); if (new_state == TCP_CA_Loss) { ca->alpha = ALPHA_BASE; ca->beta = BETA_BASE; ca->rtt_low = 0; ca->rtt_above = 0; rtt_reset(sk); } } /* * Increase window in response to successful acknowledgment. */ static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct illinois *ca = inet_csk_ca(sk); if (after(ack, ca->end_seq)) update_params(sk); /* RFC2861 only increase cwnd if fully utilized */ if (!tcp_is_cwnd_limited(sk)) return; /* In slow start */ if (tcp_in_slow_start(tp)) tcp_slow_start(tp, acked); else { u32 delta; /* snd_cwnd_cnt is # of packets since last cwnd increment */ tp->snd_cwnd_cnt += ca->acked; ca->acked = 1; /* This is close approximation of: * tp->snd_cwnd += alpha/tp->snd_cwnd */ delta = (tp->snd_cwnd_cnt * ca->alpha) >> ALPHA_SHIFT; if (delta >= tcp_snd_cwnd(tp)) { tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp) + delta / tcp_snd_cwnd(tp), (u32)tp->snd_cwnd_clamp)); tp->snd_cwnd_cnt = 0; } } } static u32 tcp_illinois_ssthresh(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct illinois *ca = inet_csk_ca(sk); u32 decr; /* Multiplicative decrease */ decr = (tcp_snd_cwnd(tp) * ca->beta) >> BETA_SHIFT; return max(tcp_snd_cwnd(tp) - decr, 2U); } /* Extract info for Tcp socket info provided via netlink. */ static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info) { const struct illinois *ca = inet_csk_ca(sk); if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { info->vegas.tcpv_enabled = 1; info->vegas.tcpv_rttcnt = ca->cnt_rtt; info->vegas.tcpv_minrtt = ca->base_rtt; info->vegas.tcpv_rtt = 0; if (info->vegas.tcpv_rttcnt > 0) { u64 t = ca->sum_rtt; do_div(t, info->vegas.tcpv_rttcnt); info->vegas.tcpv_rtt = t; } *attr = INET_DIAG_VEGASINFO; return sizeof(struct tcpvegas_info); } return 0; } static struct tcp_congestion_ops tcp_illinois __read_mostly = { .init = tcp_illinois_init, .ssthresh = tcp_illinois_ssthresh, .undo_cwnd = tcp_reno_undo_cwnd, .cong_avoid = tcp_illinois_cong_avoid, .set_state = tcp_illinois_state, .get_info = tcp_illinois_info, .pkts_acked = tcp_illinois_acked, .owner = THIS_MODULE, .name = "illinois", }; static int __init tcp_illinois_register(void) { BUILD_BUG_ON(sizeof(struct illinois) > ICSK_CA_PRIV_SIZE); return tcp_register_congestion_control(&tcp_illinois); } static void __exit tcp_illinois_unregister(void) { tcp_unregister_congestion_control(&tcp_illinois); } module_init(tcp_illinois_register); module_exit(tcp_illinois_unregister); MODULE_AUTHOR("Stephen Hemminger, Shao Liu"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("TCP Illinois"); MODULE_VERSION("1.0");
17 17 17 4 17 75 75 40 76 75 76 76 75 82 35 5 35 35 34 34 34 34 34 35 42 42 42 1 42 1 1 42 1 42 12 42 42 42 42 42 42 6 1 6 6 6 6 6 21 20 19 1 1 19 19 19 19 19 19 21 4 6 13 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 // SPDX-License-Identifier: GPL-2.0 /* * fs/proc_namespace.c - handling of /proc/<pid>/{mounts,mountinfo,mountstats} * * In fact, that's a piece of procfs; it's *almost* isolated from * the rest of fs/proc, but has rather close relationships with * fs/namespace.c, thus here instead of fs/proc * */ #include <linux/mnt_namespace.h> #include <linux/nsproxy.h> #include <linux/security.h> #include <linux/fs_struct.h> #include <linux/sched/task.h> #include "proc/internal.h" /* only for get_proc_task() in ->open() */ #include "pnode.h" #include "internal.h" static __poll_t mounts_poll(struct file *file, poll_table *wait) { struct seq_file *m = file->private_data; struct proc_mounts *p = m->private; struct mnt_namespace *ns = p->ns; __poll_t res = EPOLLIN | EPOLLRDNORM; int event; poll_wait(file, &p->ns->poll, wait); event = READ_ONCE(ns->event); if (m->poll_event != event) { m->poll_event = event; res |= EPOLLERR | EPOLLPRI; } return res; } struct proc_fs_opts { int flag; const char *str; }; static int show_sb_opts(struct seq_file *m, struct super_block *sb) { static const struct proc_fs_opts fs_opts[] = { { SB_SYNCHRONOUS, ",sync" }, { SB_DIRSYNC, ",dirsync" }, { SB_MANDLOCK, ",mand" }, { SB_LAZYTIME, ",lazytime" }, { 0, NULL } }; const struct proc_fs_opts *fs_infop; for (fs_infop = fs_opts; fs_infop->flag; fs_infop++) { if (sb->s_flags & fs_infop->flag) seq_puts(m, fs_infop->str); } return security_sb_show_options(m, sb); } static void show_vfsmnt_opts(struct seq_file *m, struct vfsmount *mnt) { static const struct proc_fs_opts mnt_opts[] = { { MNT_NOSUID, ",nosuid" }, { MNT_NODEV, ",nodev" }, { MNT_NOEXEC, ",noexec" }, { MNT_NOATIME, ",noatime" }, { MNT_NODIRATIME, ",nodiratime" }, { MNT_RELATIME, ",relatime" }, { MNT_NOSYMFOLLOW, ",nosymfollow" }, { 0, NULL } }; const struct proc_fs_opts *fs_infop; for (fs_infop = mnt_opts; fs_infop->flag; fs_infop++) { if (mnt->mnt_flags & fs_infop->flag) seq_puts(m, fs_infop->str); } if (is_idmapped_mnt(mnt)) seq_puts(m, ",idmapped"); } static inline void mangle(struct seq_file *m, const char *s) { seq_escape(m, s, " \t\n\\#"); } static void show_type(struct seq_file *m, struct super_block *sb) { mangle(m, sb->s_type->name); if (sb->s_subtype) { seq_putc(m, '.'); mangle(m, sb->s_subtype); } } static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt) { struct proc_mounts *p = m->private; struct mount *r = real_mount(mnt); struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; struct super_block *sb = mnt_path.dentry->d_sb; int err; if (sb->s_op->show_devname) { err = sb->s_op->show_devname(m, mnt_path.dentry); if (err) goto out; } else { mangle(m, r->mnt_devname); } seq_putc(m, ' '); /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ err = seq_path_root(m, &mnt_path, &p->root, " \t\n\\"); if (err) goto out; seq_putc(m, ' '); show_type(m, sb); seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw"); err = show_sb_opts(m, sb); if (err) goto out; show_vfsmnt_opts(m, mnt); if (sb->s_op->show_options) err = sb->s_op->show_options(m, mnt_path.dentry); seq_puts(m, " 0 0\n"); out: return err; } static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt) { struct proc_mounts *p = m->private; struct mount *r = real_mount(mnt); struct super_block *sb = mnt->mnt_sb; struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; int err; seq_printf(m, "%i %i %u:%u ", r->mnt_id, r->mnt_parent->mnt_id, MAJOR(sb->s_dev), MINOR(sb->s_dev)); err = show_path(m, mnt->mnt_root); if (err) goto out; seq_putc(m, ' '); /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ err = seq_path_root(m, &mnt_path, &p->root, " \t\n\\"); if (err) goto out; seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw"); show_vfsmnt_opts(m, mnt); /* Tagged fields ("foo:X" or "bar") */ if (IS_MNT_SHARED(r)) seq_printf(m, " shared:%i", r->mnt_group_id); if (IS_MNT_SLAVE(r)) { int master = r->mnt_master->mnt_group_id; int dom = get_dominating_id(r, &p->root); seq_printf(m, " master:%i", master); if (dom && dom != master) seq_printf(m, " propagate_from:%i", dom); } if (IS_MNT_UNBINDABLE(r)) seq_puts(m, " unbindable"); /* Filesystem specific data */ seq_puts(m, " - "); show_type(m, sb); seq_putc(m, ' '); if (sb->s_op->show_devname) { err = sb->s_op->show_devname(m, mnt->mnt_root); if (err) goto out; } else { mangle(m, r->mnt_devname); } seq_puts(m, sb_rdonly(sb) ? " ro" : " rw"); err = show_sb_opts(m, sb); if (err) goto out; if (sb->s_op->show_options) err = sb->s_op->show_options(m, mnt->mnt_root); seq_putc(m, '\n'); out: return err; } static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt) { struct proc_mounts *p = m->private; struct mount *r = real_mount(mnt); struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; struct super_block *sb = mnt_path.dentry->d_sb; int err; /* device */ seq_puts(m, "device "); if (sb->s_op->show_devname) { err = sb->s_op->show_devname(m, mnt_path.dentry); if (err) goto out; } else { mangle(m, r->mnt_devname); } /* mount point */ seq_puts(m, " mounted on "); /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ err = seq_path_root(m, &mnt_path, &p->root, " \t\n\\"); if (err) goto out; seq_putc(m, ' '); /* file system type */ seq_puts(m, "with fstype "); show_type(m, sb); /* optional statistics */ if (sb->s_op->show_stats) { seq_putc(m, ' '); err = sb->s_op->show_stats(m, mnt_path.dentry); } seq_putc(m, '\n'); out: return err; } static int mounts_open_common(struct inode *inode, struct file *file, int (*show)(struct seq_file *, struct vfsmount *)) { struct task_struct *task = get_proc_task(inode); struct nsproxy *nsp; struct mnt_namespace *ns = NULL; struct path root; struct proc_mounts *p; struct seq_file *m; int ret = -EINVAL; if (!task) goto err; task_lock(task); nsp = task->nsproxy; if (!nsp || !nsp->mnt_ns) { task_unlock(task); put_task_struct(task); goto err; } ns = nsp->mnt_ns; get_mnt_ns(ns); if (!task->fs) { task_unlock(task); put_task_struct(task); ret = -ENOENT; goto err_put_ns; } get_fs_root(task->fs, &root); task_unlock(task); put_task_struct(task); ret = seq_open_private(file, &mounts_op, sizeof(struct proc_mounts)); if (ret) goto err_put_path; m = file->private_data; m->poll_event = ns->event; p = m->private; p->ns = ns; p->root = root; p->show = show; return 0; err_put_path: path_put(&root); err_put_ns: put_mnt_ns(ns); err: return ret; } static int mounts_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; struct proc_mounts *p = m->private; path_put(&p->root); put_mnt_ns(p->ns); return seq_release_private(inode, file); } static int mounts_open(struct inode *inode, struct file *file) { return mounts_open_common(inode, file, show_vfsmnt); } static int mountinfo_open(struct inode *inode, struct file *file) { return mounts_open_common(inode, file, show_mountinfo); } static int mountstats_open(struct inode *inode, struct file *file) { return mounts_open_common(inode, file, show_vfsstat); } const struct file_operations proc_mounts_operations = { .open = mounts_open, .read_iter = seq_read_iter, .splice_read = copy_splice_read, .llseek = seq_lseek, .release = mounts_release, .poll = mounts_poll, }; const struct file_operations proc_mountinfo_operations = { .open = mountinfo_open, .read_iter = seq_read_iter, .splice_read = copy_splice_read, .llseek = seq_lseek, .release = mounts_release, .poll = mounts_poll, }; const struct file_operations proc_mountstats_operations = { .open = mountstats_open, .read_iter = seq_read_iter, .splice_read = copy_splice_read, .llseek = seq_lseek, .release = mounts_release, };
15 15 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 // SPDX-License-Identifier: GPL-2.0-only /* (C) 1999-2001 Michal Ludvig <michal@logix.cz> */ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/if_ether.h> #include <linux/if_packet.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/netfilter/xt_pkttype.h> #include <linux/netfilter/x_tables.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Michal Ludvig <michal@logix.cz>"); MODULE_DESCRIPTION("Xtables: link layer packet type match"); MODULE_ALIAS("ipt_pkttype"); MODULE_ALIAS("ip6t_pkttype"); static bool pkttype_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_pkttype_info *info = par->matchinfo; u_int8_t type; if (skb->pkt_type != PACKET_LOOPBACK) type = skb->pkt_type; else if (xt_family(par) == NFPROTO_IPV4 && ipv4_is_multicast(ip_hdr(skb)->daddr)) type = PACKET_MULTICAST; else if (xt_family(par) == NFPROTO_IPV6) type = PACKET_MULTICAST; else type = PACKET_BROADCAST; return (type == info->pkttype) ^ info->invert; } static struct xt_match pkttype_mt_reg __read_mostly = { .name = "pkttype", .revision = 0, .family = NFPROTO_UNSPEC, .match = pkttype_mt, .matchsize = sizeof(struct xt_pkttype_info), .me = THIS_MODULE, }; static int __init pkttype_mt_init(void) { return xt_register_match(&pkttype_mt_reg); } static void __exit pkttype_mt_exit(void) { xt_unregister_match(&pkttype_mt_reg); } module_init(pkttype_mt_init); module_exit(pkttype_mt_exit);
10 8 8 7 10 6 5 5 1 1 5 7 4 9 8 2 2 2 2 17 17 17 11 11 3 6 9 9 13 17 4 1 3 2 4 29 9 29 6 8 28 4 28 1 28 9 28 1 27 22 5 22 3 6 4 6 1 4 3 27 7 6 18 18 5 5 4 6 5 6 4 4 4 4 4 2 2 4 8 8 1 2 2 1 4 1 4 4 4 6 6 6 2 2 2 2 4 4 4 4 16 16 16 26 26 26 24 24 4 1 2 19 18 18 17 17 14 13 17 72 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 // SPDX-License-Identifier: GPL-2.0-only #include <linux/module.h> #include <linux/errno.h> #include <linux/socket.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/icmp.h> #include <linux/udp.h> #include <linux/types.h> #include <linux/kernel.h> #include <net/genetlink.h> #include <net/gro.h> #include <net/gue.h> #include <net/fou.h> #include <net/ip.h> #include <net/protocol.h> #include <net/udp.h> #include <net/udp_tunnel.h> #include <uapi/linux/fou.h> #include <uapi/linux/genetlink.h> #include "fou_nl.h" struct fou { struct socket *sock; u8 protocol; u8 flags; __be16 port; u8 family; u16 type; struct list_head list; struct rcu_head rcu; }; #define FOU_F_REMCSUM_NOPARTIAL BIT(0) struct fou_cfg { u16 type; u8 protocol; u8 flags; struct udp_port_cfg udp_config; }; static unsigned int fou_net_id; struct fou_net { struct list_head fou_list; struct mutex fou_lock; }; static inline struct fou *fou_from_sock(struct sock *sk) { return rcu_dereference_sk_user_data(sk); } static int fou_recv_pull(struct sk_buff *skb, struct fou *fou, size_t len) { /* Remove 'len' bytes from the packet (UDP header and * FOU header if present). */ if (fou->family == AF_INET) ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len); else ipv6_hdr(skb)->payload_len = htons(ntohs(ipv6_hdr(skb)->payload_len) - len); __skb_pull(skb, len); skb_postpull_rcsum(skb, udp_hdr(skb), len); skb_reset_transport_header(skb); return iptunnel_pull_offloads(skb); } static int fou_udp_recv(struct sock *sk, struct sk_buff *skb) { struct fou *fou = fou_from_sock(sk); if (!fou) return 1; if (fou_recv_pull(skb, fou, sizeof(struct udphdr))) goto drop; return -fou->protocol; drop: kfree_skb(skb); return 0; } static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr, void *data, size_t hdrlen, u8 ipproto, bool nopartial) { __be16 *pd = data; size_t start = ntohs(pd[0]); size_t offset = ntohs(pd[1]); size_t plen = sizeof(struct udphdr) + hdrlen + max_t(size_t, offset + sizeof(u16), start); if (skb->remcsum_offload) return guehdr; if (!pskb_may_pull(skb, plen)) return NULL; guehdr = (struct guehdr *)&udp_hdr(skb)[1]; skb_remcsum_process(skb, (void *)guehdr + hdrlen, start, offset, nopartial); return guehdr; } static int gue_control_message(struct sk_buff *skb, struct guehdr *guehdr) { /* No support yet */ kfree_skb(skb); return 0; } static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) { struct fou *fou = fou_from_sock(sk); size_t len, optlen, hdrlen; struct guehdr *guehdr; void *data; u16 doffset = 0; u8 proto_ctype; if (!fou) return 1; len = sizeof(struct udphdr) + sizeof(struct guehdr); if (!pskb_may_pull(skb, len)) goto drop; guehdr = (struct guehdr *)&udp_hdr(skb)[1]; switch (guehdr->version) { case 0: /* Full GUE header present */ break; case 1: { /* Direct encapsulation of IPv4 or IPv6 */ int prot; switch (((struct iphdr *)guehdr)->version) { case 4: prot = IPPROTO_IPIP; break; case 6: prot = IPPROTO_IPV6; break; default: goto drop; } if (fou_recv_pull(skb, fou, sizeof(struct udphdr))) goto drop; return -prot; } default: /* Undefined version */ goto drop; } optlen = guehdr->hlen << 2; len += optlen; if (!pskb_may_pull(skb, len)) goto drop; /* guehdr may change after pull */ guehdr = (struct guehdr *)&udp_hdr(skb)[1]; if (validate_gue_flags(guehdr, optlen)) goto drop; hdrlen = sizeof(struct guehdr) + optlen; if (fou->family == AF_INET) ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len); else ipv6_hdr(skb)->payload_len = htons(ntohs(ipv6_hdr(skb)->payload_len) - len); /* Pull csum through the guehdr now . This can be used if * there is a remote checksum offload. */ skb_postpull_rcsum(skb, udp_hdr(skb), len); data = &guehdr[1]; if (guehdr->flags & GUE_FLAG_PRIV) { __be32 flags = *(__be32 *)(data + doffset); doffset += GUE_LEN_PRIV; if (flags & GUE_PFLAG_REMCSUM) { guehdr = gue_remcsum(skb, guehdr, data + doffset, hdrlen, guehdr->proto_ctype, !!(fou->flags & FOU_F_REMCSUM_NOPARTIAL)); if (!guehdr) goto drop; data = &guehdr[1]; doffset += GUE_PLEN_REMCSUM; } } if (unlikely(guehdr->control)) return gue_control_message(skb, guehdr); proto_ctype = guehdr->proto_ctype; __skb_pull(skb, sizeof(struct udphdr) + hdrlen); skb_reset_transport_header(skb); if (iptunnel_pull_offloads(skb)) goto drop; return -proto_ctype; drop: kfree_skb(skb); return 0; } static const struct net_offload *fou_gro_ops(const struct sock *sk, int proto) { const struct net_offload __rcu **offloads; /* FOU doesn't allow IPv4 on IPv6 sockets. */ offloads = sk->sk_family == AF_INET6 ? inet6_offloads : inet_offloads; return rcu_dereference(offloads[proto]); } static struct sk_buff *fou_gro_receive(struct sock *sk, struct list_head *head, struct sk_buff *skb) { struct fou *fou = fou_from_sock(sk); const struct net_offload *ops; struct sk_buff *pp = NULL; if (!fou) goto out; /* We can clear the encap_mark for FOU as we are essentially doing * one of two possible things. We are either adding an L4 tunnel * header to the outer L3 tunnel header, or we are simply * treating the GRE tunnel header as though it is a UDP protocol * specific header such as VXLAN or GENEVE. */ NAPI_GRO_CB(skb)->encap_mark = 0; /* Flag this frame as already having an outer encap header */ NAPI_GRO_CB(skb)->is_fou = 1; ops = fou_gro_ops(sk, fou->protocol); if (!ops || !ops->callbacks.gro_receive) goto out; pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); out: return pp; } static int fou_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) { struct fou *fou = fou_from_sock(sk); const struct net_offload *ops; int err; if (!fou) { err = -ENOENT; goto out; } ops = fou_gro_ops(sk, fou->protocol); if (WARN_ON(!ops || !ops->callbacks.gro_complete)) { err = -ENOSYS; goto out; } err = ops->callbacks.gro_complete(skb, nhoff); skb_set_inner_mac_header(skb, nhoff); out: return err; } static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, struct guehdr *guehdr, void *data, size_t hdrlen, struct gro_remcsum *grc, bool nopartial) { __be16 *pd = data; size_t start = ntohs(pd[0]); size_t offset = ntohs(pd[1]); if (skb->remcsum_offload) return guehdr; if (!NAPI_GRO_CB(skb)->csum_valid) return NULL; guehdr = skb_gro_remcsum_process(skb, (void *)guehdr, off, hdrlen, start, offset, grc, nopartial); skb->remcsum_offload = 1; return guehdr; } static struct sk_buff *gue_gro_receive(struct sock *sk, struct list_head *head, struct sk_buff *skb) { const struct net_offload *ops; struct sk_buff *pp = NULL; struct sk_buff *p; struct guehdr *guehdr; size_t len, optlen, hdrlen, off; void *data; u16 doffset = 0; int flush = 1; struct fou *fou = fou_from_sock(sk); struct gro_remcsum grc; u8 proto; skb_gro_remcsum_init(&grc); if (!fou) goto out; off = skb_gro_offset(skb); len = off + sizeof(*guehdr); guehdr = skb_gro_header(skb, len, off); if (unlikely(!guehdr)) goto out; switch (guehdr->version) { case 0: break; case 1: switch (((struct iphdr *)guehdr)->version) { case 4: proto = IPPROTO_IPIP; break; case 6: proto = IPPROTO_IPV6; break; default: goto out; } goto next_proto; default: goto out; } optlen = guehdr->hlen << 2; len += optlen; if (!skb_gro_may_pull(skb, len)) { guehdr = skb_gro_header_slow(skb, len, off); if (unlikely(!guehdr)) goto out; } if (unlikely(guehdr->control) || guehdr->version != 0 || validate_gue_flags(guehdr, optlen)) goto out; hdrlen = sizeof(*guehdr) + optlen; /* Adjust NAPI_GRO_CB(skb)->csum to account for guehdr, * this is needed if there is a remote checkcsum offload. */ skb_gro_postpull_rcsum(skb, guehdr, hdrlen); data = &guehdr[1]; if (guehdr->flags & GUE_FLAG_PRIV) { __be32 flags = *(__be32 *)(data + doffset); doffset += GUE_LEN_PRIV; if (flags & GUE_PFLAG_REMCSUM) { guehdr = gue_gro_remcsum(skb, off, guehdr, data + doffset, hdrlen, &grc, !!(fou->flags & FOU_F_REMCSUM_NOPARTIAL)); if (!guehdr) goto out; data = &guehdr[1]; doffset += GUE_PLEN_REMCSUM; } } skb_gro_pull(skb, hdrlen); list_for_each_entry(p, head, list) { const struct guehdr *guehdr2; if (!NAPI_GRO_CB(p)->same_flow) continue; guehdr2 = (struct guehdr *)(p->data + off); /* Compare base GUE header to be equal (covers * hlen, version, proto_ctype, and flags. */ if (guehdr->word != guehdr2->word) { NAPI_GRO_CB(p)->same_flow = 0; continue; } /* Compare optional fields are the same. */ if (guehdr->hlen && memcmp(&guehdr[1], &guehdr2[1], guehdr->hlen << 2)) { NAPI_GRO_CB(p)->same_flow = 0; continue; } } proto = guehdr->proto_ctype; next_proto: /* We can clear the encap_mark for GUE as we are essentially doing * one of two possible things. We are either adding an L4 tunnel * header to the outer L3 tunnel header, or we are simply * treating the GRE tunnel header as though it is a UDP protocol * specific header such as VXLAN or GENEVE. */ NAPI_GRO_CB(skb)->encap_mark = 0; /* Flag this frame as already having an outer encap header */ NAPI_GRO_CB(skb)->is_fou = 1; ops = fou_gro_ops(sk, proto); if (!ops || !ops->callbacks.gro_receive) goto out; pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); flush = 0; out: skb_gro_flush_final_remcsum(skb, pp, flush, &grc); return pp; } static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) { struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff); const struct net_offload *ops; unsigned int guehlen = 0; u8 proto; int err = -ENOENT; switch (guehdr->version) { case 0: proto = guehdr->proto_ctype; guehlen = sizeof(*guehdr) + (guehdr->hlen << 2); break; case 1: switch (((struct iphdr *)guehdr)->version) { case 4: proto = IPPROTO_IPIP; break; case 6: proto = IPPROTO_IPV6; break; default: return err; } break; default: return err; } ops = fou_gro_ops(sk, proto); if (WARN_ON(!ops || !ops->callbacks.gro_complete)) goto out; err = ops->callbacks.gro_complete(skb, nhoff + guehlen); skb_set_inner_mac_header(skb, nhoff + guehlen); out: return err; } static bool fou_cfg_cmp(struct fou *fou, struct fou_cfg *cfg) { struct sock *sk = fou->sock->sk; struct udp_port_cfg *udp_cfg = &cfg->udp_config; if (fou->family != udp_cfg->family || fou->port != udp_cfg->local_udp_port || sk->sk_dport != udp_cfg->peer_udp_port || sk->sk_bound_dev_if != udp_cfg->bind_ifindex) return false; if (fou->family == AF_INET) { if (sk->sk_rcv_saddr != udp_cfg->local_ip.s_addr || sk->sk_daddr != udp_cfg->peer_ip.s_addr) return false; else return true; #if IS_ENABLED(CONFIG_IPV6) } else { if (ipv6_addr_cmp(&sk->sk_v6_rcv_saddr, &udp_cfg->local_ip6) || ipv6_addr_cmp(&sk->sk_v6_daddr, &udp_cfg->peer_ip6)) return false; else return true; #endif } return false; } static int fou_add_to_port_list(struct net *net, struct fou *fou, struct fou_cfg *cfg) { struct fou_net *fn = net_generic(net, fou_net_id); struct fou *fout; mutex_lock(&fn->fou_lock); list_for_each_entry(fout, &fn->fou_list, list) { if (fou_cfg_cmp(fout, cfg)) { mutex_unlock(&fn->fou_lock); return -EALREADY; } } list_add(&fou->list, &fn->fou_list); mutex_unlock(&fn->fou_lock); return 0; } static void fou_release(struct fou *fou) { struct socket *sock = fou->sock; list_del(&fou->list); udp_tunnel_sock_release(sock); kfree_rcu(fou, rcu); } static int fou_create(struct net *net, struct fou_cfg *cfg, struct socket **sockp) { struct socket *sock = NULL; struct fou *fou = NULL; struct sock *sk; struct udp_tunnel_sock_cfg tunnel_cfg; int err; /* Open UDP socket */ err = udp_sock_create(net, &cfg->udp_config, &sock); if (err < 0) goto error; /* Allocate FOU port structure */ fou = kzalloc(sizeof(*fou), GFP_KERNEL); if (!fou) { err = -ENOMEM; goto error; } sk = sock->sk; fou->port = cfg->udp_config.local_udp_port; fou->family = cfg->udp_config.family; fou->flags = cfg->flags; fou->type = cfg->type; fou->sock = sock; memset(&tunnel_cfg, 0, sizeof(tunnel_cfg)); tunnel_cfg.encap_type = 1; tunnel_cfg.sk_user_data = fou; tunnel_cfg.encap_destroy = NULL; /* Initial for fou type */ switch (cfg->type) { case FOU_ENCAP_DIRECT: tunnel_cfg.encap_rcv = fou_udp_recv; tunnel_cfg.gro_receive = fou_gro_receive; tunnel_cfg.gro_complete = fou_gro_complete; fou->protocol = cfg->protocol; break; case FOU_ENCAP_GUE: tunnel_cfg.encap_rcv = gue_udp_recv; tunnel_cfg.gro_receive = gue_gro_receive; tunnel_cfg.gro_complete = gue_gro_complete; break; default: err = -EINVAL; goto error; } setup_udp_tunnel_sock(net, sock, &tunnel_cfg); sk->sk_allocation = GFP_ATOMIC; err = fou_add_to_port_list(net, fou, cfg); if (err) goto error; if (sockp) *sockp = sock; return 0; error: kfree(fou); if (sock) udp_tunnel_sock_release(sock); return err; } static int fou_destroy(struct net *net, struct fou_cfg *cfg) { struct fou_net *fn = net_generic(net, fou_net_id); int err = -EINVAL; struct fou *fou; mutex_lock(&fn->fou_lock); list_for_each_entry(fou, &fn->fou_list, list) { if (fou_cfg_cmp(fou, cfg)) { fou_release(fou); err = 0; break; } } mutex_unlock(&fn->fou_lock); return err; } static struct genl_family fou_nl_family; static int parse_nl_config(struct genl_info *info, struct fou_cfg *cfg) { bool has_local = false, has_peer = false; struct nlattr *attr; int ifindex; __be16 port; memset(cfg, 0, sizeof(*cfg)); cfg->udp_config.family = AF_INET; if (info->attrs[FOU_ATTR_AF]) { u8 family = nla_get_u8(info->attrs[FOU_ATTR_AF]); switch (family) { case AF_INET: break; case AF_INET6: cfg->udp_config.ipv6_v6only = 1; break; default: return -EAFNOSUPPORT; } cfg->udp_config.family = family; } if (info->attrs[FOU_ATTR_PORT]) { port = nla_get_be16(info->attrs[FOU_ATTR_PORT]); cfg->udp_config.local_udp_port = port; } if (info->attrs[FOU_ATTR_IPPROTO]) cfg->protocol = nla_get_u8(info->attrs[FOU_ATTR_IPPROTO]); if (info->attrs[FOU_ATTR_TYPE]) cfg->type = nla_get_u8(info->attrs[FOU_ATTR_TYPE]); if (info->attrs[FOU_ATTR_REMCSUM_NOPARTIAL]) cfg->flags |= FOU_F_REMCSUM_NOPARTIAL; if (cfg->udp_config.family == AF_INET) { if (info->attrs[FOU_ATTR_LOCAL_V4]) { attr = info->attrs[FOU_ATTR_LOCAL_V4]; cfg->udp_config.local_ip.s_addr = nla_get_in_addr(attr); has_local = true; } if (info->attrs[FOU_ATTR_PEER_V4]) { attr = info->attrs[FOU_ATTR_PEER_V4]; cfg->udp_config.peer_ip.s_addr = nla_get_in_addr(attr); has_peer = true; } #if IS_ENABLED(CONFIG_IPV6) } else { if (info->attrs[FOU_ATTR_LOCAL_V6]) { attr = info->attrs[FOU_ATTR_LOCAL_V6]; cfg->udp_config.local_ip6 = nla_get_in6_addr(attr); has_local = true; } if (info->attrs[FOU_ATTR_PEER_V6]) { attr = info->attrs[FOU_ATTR_PEER_V6]; cfg->udp_config.peer_ip6 = nla_get_in6_addr(attr); has_peer = true; } #endif } if (has_peer) { if (info->attrs[FOU_ATTR_PEER_PORT]) { port = nla_get_be16(info->attrs[FOU_ATTR_PEER_PORT]); cfg->udp_config.peer_udp_port = port; } else { return -EINVAL; } } if (info->attrs[FOU_ATTR_IFINDEX]) { if (!has_local) return -EINVAL; ifindex = nla_get_s32(info->attrs[FOU_ATTR_IFINDEX]); cfg->udp_config.bind_ifindex = ifindex; } return 0; } int fou_nl_add_doit(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct fou_cfg cfg; int err; err = parse_nl_config(info, &cfg); if (err) return err; return fou_create(net, &cfg, NULL); } int fou_nl_del_doit(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct fou_cfg cfg; int err; err = parse_nl_config(info, &cfg); if (err) return err; return fou_destroy(net, &cfg); } static int fou_fill_info(struct fou *fou, struct sk_buff *msg) { struct sock *sk = fou->sock->sk; if (nla_put_u8(msg, FOU_ATTR_AF, fou->sock->sk->sk_family) || nla_put_be16(msg, FOU_ATTR_PORT, fou->port) || nla_put_be16(msg, FOU_ATTR_PEER_PORT, sk->sk_dport) || nla_put_u8(msg, FOU_ATTR_IPPROTO, fou->protocol) || nla_put_u8(msg, FOU_ATTR_TYPE, fou->type) || nla_put_s32(msg, FOU_ATTR_IFINDEX, sk->sk_bound_dev_if)) return -1; if (fou->flags & FOU_F_REMCSUM_NOPARTIAL) if (nla_put_flag(msg, FOU_ATTR_REMCSUM_NOPARTIAL)) return -1; if (fou->sock->sk->sk_family == AF_INET) { if (nla_put_in_addr(msg, FOU_ATTR_LOCAL_V4, sk->sk_rcv_saddr)) return -1; if (nla_put_in_addr(msg, FOU_ATTR_PEER_V4, sk->sk_daddr)) return -1; #if IS_ENABLED(CONFIG_IPV6) } else { if (nla_put_in6_addr(msg, FOU_ATTR_LOCAL_V6, &sk->sk_v6_rcv_saddr)) return -1; if (nla_put_in6_addr(msg, FOU_ATTR_PEER_V6, &sk->sk_v6_daddr)) return -1; #endif } return 0; } static int fou_dump_info(struct fou *fou, u32 portid, u32 seq, u32 flags, struct sk_buff *skb, u8 cmd) { void *hdr; hdr = genlmsg_put(skb, portid, seq, &fou_nl_family, flags, cmd); if (!hdr) return -ENOMEM; if (fou_fill_info(fou, skb) < 0) goto nla_put_failure; genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } int fou_nl_get_doit(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct fou_net *fn = net_generic(net, fou_net_id); struct sk_buff *msg; struct fou_cfg cfg; struct fou *fout; __be16 port; u8 family; int ret; ret = parse_nl_config(info, &cfg); if (ret) return ret; port = cfg.udp_config.local_udp_port; if (port == 0) return -EINVAL; family = cfg.udp_config.family; if (family != AF_INET && family != AF_INET6) return -EINVAL; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; ret = -ESRCH; mutex_lock(&fn->fou_lock); list_for_each_entry(fout, &fn->fou_list, list) { if (fou_cfg_cmp(fout, &cfg)) { ret = fou_dump_info(fout, info->snd_portid, info->snd_seq, 0, msg, info->genlhdr->cmd); break; } } mutex_unlock(&fn->fou_lock); if (ret < 0) goto out_free; return genlmsg_reply(msg, info); out_free: nlmsg_free(msg); return ret; } int fou_nl_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct fou_net *fn = net_generic(net, fou_net_id); struct fou *fout; int idx = 0, ret; mutex_lock(&fn->fou_lock); list_for_each_entry(fout, &fn->fou_list, list) { if (idx++ < cb->args[0]) continue; ret = fou_dump_info(fout, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, skb, FOU_CMD_GET); if (ret) break; } mutex_unlock(&fn->fou_lock); cb->args[0] = idx; return skb->len; } static struct genl_family fou_nl_family __ro_after_init = { .hdrsize = 0, .name = FOU_GENL_NAME, .version = FOU_GENL_VERSION, .maxattr = FOU_ATTR_MAX, .policy = fou_nl_policy, .netnsok = true, .module = THIS_MODULE, .small_ops = fou_nl_ops, .n_small_ops = ARRAY_SIZE(fou_nl_ops), .resv_start_op = FOU_CMD_GET + 1, }; size_t fou_encap_hlen(struct ip_tunnel_encap *e) { return sizeof(struct udphdr); } EXPORT_SYMBOL(fou_encap_hlen); size_t gue_encap_hlen(struct ip_tunnel_encap *e) { size_t len; bool need_priv = false; len = sizeof(struct udphdr) + sizeof(struct guehdr); if (e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) { len += GUE_PLEN_REMCSUM; need_priv = true; } len += need_priv ? GUE_LEN_PRIV : 0; return len; } EXPORT_SYMBOL(gue_encap_hlen); int __fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, __be16 *sport, int type) { int err; err = iptunnel_handle_offloads(skb, type); if (err) return err; *sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev), skb, 0, 0, false); return 0; } EXPORT_SYMBOL(__fou_build_header); int __gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, __be16 *sport, int type) { struct guehdr *guehdr; size_t hdrlen, optlen = 0; void *data; bool need_priv = false; int err; if ((e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) && skb->ip_summed == CHECKSUM_PARTIAL) { optlen += GUE_PLEN_REMCSUM; type |= SKB_GSO_TUNNEL_REMCSUM; need_priv = true; } optlen += need_priv ? GUE_LEN_PRIV : 0; err = iptunnel_handle_offloads(skb, type); if (err) return err; /* Get source port (based on flow hash) before skb_push */ *sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev), skb, 0, 0, false); hdrlen = sizeof(struct guehdr) + optlen; skb_push(skb, hdrlen); guehdr = (struct guehdr *)skb->data; guehdr->control = 0; guehdr->version = 0; guehdr->hlen = optlen >> 2; guehdr->flags = 0; guehdr->proto_ctype = *protocol; data = &guehdr[1]; if (need_priv) { __be32 *flags = data; guehdr->flags |= GUE_FLAG_PRIV; *flags = 0; data += GUE_LEN_PRIV; if (type & SKB_GSO_TUNNEL_REMCSUM) { u16 csum_start = skb_checksum_start_offset(skb); __be16 *pd = data; if (csum_start < hdrlen) return -EINVAL; csum_start -= hdrlen; pd[0] = htons(csum_start); pd[1] = htons(csum_start + skb->csum_offset); if (!skb_is_gso(skb)) { skb->ip_summed = CHECKSUM_NONE; skb->encapsulation = 0; } *flags |= GUE_PFLAG_REMCSUM; data += GUE_PLEN_REMCSUM; } } return 0; } EXPORT_SYMBOL(__gue_build_header); #ifdef CONFIG_NET_FOU_IP_TUNNELS static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e, struct flowi4 *fl4, u8 *protocol, __be16 sport) { struct udphdr *uh; skb_push(skb, sizeof(struct udphdr)); skb_reset_transport_header(skb); uh = udp_hdr(skb); uh->dest = e->dport; uh->source = sport; uh->len = htons(skb->len); udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb, fl4->saddr, fl4->daddr, skb->len); *protocol = IPPROTO_UDP; } static int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi4 *fl4) { int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; __be16 sport; int err; err = __fou_build_header(skb, e, protocol, &sport, type); if (err) return err; fou_build_udp(skb, e, fl4, protocol, sport); return 0; } static int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi4 *fl4) { int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; __be16 sport; int err; err = __gue_build_header(skb, e, protocol, &sport, type); if (err) return err; fou_build_udp(skb, e, fl4, protocol, sport); return 0; } static int gue_err_proto_handler(int proto, struct sk_buff *skb, u32 info) { const struct net_protocol *ipprot = rcu_dereference(inet_protos[proto]); if (ipprot && ipprot->err_handler) { if (!ipprot->err_handler(skb, info)) return 0; } return -ENOENT; } static int gue_err(struct sk_buff *skb, u32 info) { int transport_offset = skb_transport_offset(skb); struct guehdr *guehdr; size_t len, optlen; int ret; len = sizeof(struct udphdr) + sizeof(struct guehdr); if (!pskb_may_pull(skb, transport_offset + len)) return -EINVAL; guehdr = (struct guehdr *)&udp_hdr(skb)[1]; switch (guehdr->version) { case 0: /* Full GUE header present */ break; case 1: { /* Direct encapsulation of IPv4 or IPv6 */ skb_set_transport_header(skb, -(int)sizeof(struct icmphdr)); switch (((struct iphdr *)guehdr)->version) { case 4: ret = gue_err_proto_handler(IPPROTO_IPIP, skb, info); goto out; #if IS_ENABLED(CONFIG_IPV6) case 6: ret = gue_err_proto_handler(IPPROTO_IPV6, skb, info); goto out; #endif default: ret = -EOPNOTSUPP; goto out; } } default: /* Undefined version */ return -EOPNOTSUPP; } if (guehdr->control) return -ENOENT; optlen = guehdr->hlen << 2; if (!pskb_may_pull(skb, transport_offset + len + optlen)) return -EINVAL; guehdr = (struct guehdr *)&udp_hdr(skb)[1]; if (validate_gue_flags(guehdr, optlen)) return -EINVAL; /* Handling exceptions for direct UDP encapsulation in GUE would lead to * recursion. Besides, this kind of encapsulation can't even be * configured currently. Discard this. */ if (guehdr->proto_ctype == IPPROTO_UDP || guehdr->proto_ctype == IPPROTO_UDPLITE) return -EOPNOTSUPP; skb_set_transport_header(skb, -(int)sizeof(struct icmphdr)); ret = gue_err_proto_handler(guehdr->proto_ctype, skb, info); out: skb_set_transport_header(skb, transport_offset); return ret; } static const struct ip_tunnel_encap_ops fou_iptun_ops = { .encap_hlen = fou_encap_hlen, .build_header = fou_build_header, .err_handler = gue_err, }; static const struct ip_tunnel_encap_ops gue_iptun_ops = { .encap_hlen = gue_encap_hlen, .build_header = gue_build_header, .err_handler = gue_err, }; static int ip_tunnel_encap_add_fou_ops(void) { int ret; ret = ip_tunnel_encap_add_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU); if (ret < 0) { pr_err("can't add fou ops\n"); return ret; } ret = ip_tunnel_encap_add_ops(&gue_iptun_ops, TUNNEL_ENCAP_GUE); if (ret < 0) { pr_err("can't add gue ops\n"); ip_tunnel_encap_del_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU); return ret; } return 0; } static void ip_tunnel_encap_del_fou_ops(void) { ip_tunnel_encap_del_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU); ip_tunnel_encap_del_ops(&gue_iptun_ops, TUNNEL_ENCAP_GUE); } #else static int ip_tunnel_encap_add_fou_ops(void) { return 0; } static void ip_tunnel_encap_del_fou_ops(void) { } #endif static __net_init int fou_init_net(struct net *net) { struct fou_net *fn = net_generic(net, fou_net_id); INIT_LIST_HEAD(&fn->fou_list); mutex_init(&fn->fou_lock); return 0; } static __net_exit void fou_exit_net(struct net *net) { struct fou_net *fn = net_generic(net, fou_net_id); struct fou *fou, *next; /* Close all the FOU sockets */ mutex_lock(&fn->fou_lock); list_for_each_entry_safe(fou, next, &fn->fou_list, list) fou_release(fou); mutex_unlock(&fn->fou_lock); } static struct pernet_operations fou_net_ops = { .init = fou_init_net, .exit = fou_exit_net, .id = &fou_net_id, .size = sizeof(struct fou_net), }; static int __init fou_init(void) { int ret; ret = register_pernet_device(&fou_net_ops); if (ret) goto exit; ret = genl_register_family(&fou_nl_family); if (ret < 0) goto unregister; ret = register_fou_bpf(); if (ret < 0) goto kfunc_failed; ret = ip_tunnel_encap_add_fou_ops(); if (ret == 0) return 0; kfunc_failed: genl_unregister_family(&fou_nl_family); unregister: unregister_pernet_device(&fou_net_ops); exit: return ret; } static void __exit fou_fini(void) { ip_tunnel_encap_del_fou_ops(); genl_unregister_family(&fou_nl_family); unregister_pernet_device(&fou_net_ops); } module_init(fou_init); module_exit(fou_fini); MODULE_AUTHOR("Tom Herbert <therbert@google.com>"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Foo over UDP");
240 307 307 306 306 305 307 2 2 2 2 2 307 307 307 304 2 2 1 1 307 2 2 2 2 2 2 1 1 1 1 2 1 2 219 195 196 374 65 68 64 56 68 51 52 51 43 51 8 7 1 1 1 1 1 1 1 1 1 1 198 198 198 198 14 7 198 197 198 51 44 51 197 90 88 90 90 90 1 1 90 89 1 89 3 3 2 2 2 22 8 8 8 22 3 21 1 22 2 22 31 31 31 35 32 61 125 8 8 1 1 1 1 1 1 1 1 1 1 1 1 1 37 36 36 30 36 38 30 30 30 29 498 10 491 39 142 84 144 141 135 144 197 198 90 188 84 1 317 316 308 303 307 306 307 305 268 269 198 145 36 36 181 84 85 182 269 270 18 18 17 18 2 18 18 17 29 27 29 26 18 18 18 17 18 7 3 6 6 6 6 4 25 25 25 5 28 27 28 26 18 18 17 8 8 7 6 7 3 3 3 3 3 3 2 3 5 4 5 2 2 1 1 1 1 1 1 1 1 1 1 1 30 16 29 6 29 5 29 1 30 30 6 22 6 6 6 6 22 34 34 34 37 35 4 2 3 35 5 33 37 3 1 34 34 35 3 34 35 18 34 35 32 17 17 33 31 4 33 34 33 34 34 34 34 33 30 29 30 29 30 29 30 29 29 28 29 17 16 29 26 27 33 29 26 28 28 33 34 35 39 36 38 105 104 35 35 105 105 3 102 40 41 87 87 1 1 1 1 1 116 115 110 108 107 95 93 95 14 13 11 10 102 14 11 10 11 7 7 7 6 6 6 6 4 3 4 1 1 1 1 1 1 1 1 6 5 2 6 1 1 1 1 1 1 1 1 3 1 1 1 1 3 7 6 7 4 1 1 1 1 1 5 4 5 2 1 1 1 1 1 16 15 15 14 14 13 10 8 11 2 1 2 1 3 3 1 1 2 2 2 2 2 4 14 16 95 95 95 95 95 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> * Patrick Schaaf <bof@bof.de> * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module for IP set management */ #include <linux/init.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/spinlock.h> #include <linux/rculist.h> #include <net/netlink.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <linux/netfilter.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/ipset/ip_set.h> static LIST_HEAD(ip_set_type_list); /* all registered set types */ static DEFINE_MUTEX(ip_set_type_mutex); /* protects ip_set_type_list */ static DEFINE_RWLOCK(ip_set_ref_lock); /* protects the set refs */ struct ip_set_net { struct ip_set * __rcu *ip_set_list; /* all individual sets */ ip_set_id_t ip_set_max; /* max number of sets */ bool is_deleted; /* deleted by ip_set_net_exit */ bool is_destroyed; /* all sets are destroyed */ }; static unsigned int ip_set_net_id __read_mostly; static struct ip_set_net *ip_set_pernet(struct net *net) { return net_generic(net, ip_set_net_id); } #define IP_SET_INC 64 #define STRNCMP(a, b) (strncmp(a, b, IPSET_MAXNAMELEN) == 0) static unsigned int max_sets; module_param(max_sets, int, 0600); MODULE_PARM_DESC(max_sets, "maximal number of sets"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); MODULE_DESCRIPTION("core IP set support"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); /* When the nfnl mutex or ip_set_ref_lock is held: */ #define ip_set_dereference(inst) \ rcu_dereference_protected((inst)->ip_set_list, \ lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET) || \ lockdep_is_held(&ip_set_ref_lock) || \ (inst)->is_deleted) #define ip_set(inst, id) \ ip_set_dereference(inst)[id] #define ip_set_ref_netlink(inst,id) \ rcu_dereference_raw((inst)->ip_set_list)[id] #define ip_set_dereference_nfnl(p) \ rcu_dereference_check(p, lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET)) /* The set types are implemented in modules and registered set types * can be found in ip_set_type_list. Adding/deleting types is * serialized by ip_set_type_mutex. */ static void ip_set_type_lock(void) { mutex_lock(&ip_set_type_mutex); } static void ip_set_type_unlock(void) { mutex_unlock(&ip_set_type_mutex); } /* Register and deregister settype */ static struct ip_set_type * find_set_type(const char *name, u8 family, u8 revision) { struct ip_set_type *type; list_for_each_entry_rcu(type, &ip_set_type_list, list, lockdep_is_held(&ip_set_type_mutex)) if (STRNCMP(type->name, name) && (type->family == family || type->family == NFPROTO_UNSPEC) && revision >= type->revision_min && revision <= type->revision_max) return type; return NULL; } /* Unlock, try to load a set type module and lock again */ static bool load_settype(const char *name) { if (!try_module_get(THIS_MODULE)) return false; nfnl_unlock(NFNL_SUBSYS_IPSET); pr_debug("try to load ip_set_%s\n", name); if (request_module("ip_set_%s", name) < 0) { pr_warn("Can't find ip_set type %s\n", name); nfnl_lock(NFNL_SUBSYS_IPSET); module_put(THIS_MODULE); return false; } nfnl_lock(NFNL_SUBSYS_IPSET); module_put(THIS_MODULE); return true; } /* Find a set type and reference it */ #define find_set_type_get(name, family, revision, found) \ __find_set_type_get(name, family, revision, found, false) static int __find_set_type_get(const char *name, u8 family, u8 revision, struct ip_set_type **found, bool retry) { struct ip_set_type *type; int err; if (retry && !load_settype(name)) return -IPSET_ERR_FIND_TYPE; rcu_read_lock(); *found = find_set_type(name, family, revision); if (*found) { err = !try_module_get((*found)->me) ? -EFAULT : 0; goto unlock; } /* Make sure the type is already loaded * but we don't support the revision */ list_for_each_entry_rcu(type, &ip_set_type_list, list) if (STRNCMP(type->name, name)) { err = -IPSET_ERR_FIND_TYPE; goto unlock; } rcu_read_unlock(); return retry ? -IPSET_ERR_FIND_TYPE : __find_set_type_get(name, family, revision, found, true); unlock: rcu_read_unlock(); return err; } /* Find a given set type by name and family. * If we succeeded, the supported minimal and maximum revisions are * filled out. */ #define find_set_type_minmax(name, family, min, max) \ __find_set_type_minmax(name, family, min, max, false) static int __find_set_type_minmax(const char *name, u8 family, u8 *min, u8 *max, bool retry) { struct ip_set_type *type; bool found = false; if (retry && !load_settype(name)) return -IPSET_ERR_FIND_TYPE; *min = 255; *max = 0; rcu_read_lock(); list_for_each_entry_rcu(type, &ip_set_type_list, list) if (STRNCMP(type->name, name) && (type->family == family || type->family == NFPROTO_UNSPEC)) { found = true; if (type->revision_min < *min) *min = type->revision_min; if (type->revision_max > *max) *max = type->revision_max; } rcu_read_unlock(); if (found) return 0; return retry ? -IPSET_ERR_FIND_TYPE : __find_set_type_minmax(name, family, min, max, true); } #define family_name(f) ((f) == NFPROTO_IPV4 ? "inet" : \ (f) == NFPROTO_IPV6 ? "inet6" : "any") /* Register a set type structure. The type is identified by * the unique triple of name, family and revision. */ int ip_set_type_register(struct ip_set_type *type) { int ret = 0; if (type->protocol != IPSET_PROTOCOL) { pr_warn("ip_set type %s, family %s, revision %u:%u uses wrong protocol version %u (want %u)\n", type->name, family_name(type->family), type->revision_min, type->revision_max, type->protocol, IPSET_PROTOCOL); return -EINVAL; } ip_set_type_lock(); if (find_set_type(type->name, type->family, type->revision_min)) { /* Duplicate! */ pr_warn("ip_set type %s, family %s with revision min %u already registered!\n", type->name, family_name(type->family), type->revision_min); ip_set_type_unlock(); return -EINVAL; } list_add_rcu(&type->list, &ip_set_type_list); pr_debug("type %s, family %s, revision %u:%u registered.\n", type->name, family_name(type->family), type->revision_min, type->revision_max); ip_set_type_unlock(); return ret; } EXPORT_SYMBOL_GPL(ip_set_type_register); /* Unregister a set type. There's a small race with ip_set_create */ void ip_set_type_unregister(struct ip_set_type *type) { ip_set_type_lock(); if (!find_set_type(type->name, type->family, type->revision_min)) { pr_warn("ip_set type %s, family %s with revision min %u not registered\n", type->name, family_name(type->family), type->revision_min); ip_set_type_unlock(); return; } list_del_rcu(&type->list); pr_debug("type %s, family %s with revision min %u unregistered.\n", type->name, family_name(type->family), type->revision_min); ip_set_type_unlock(); synchronize_rcu(); } EXPORT_SYMBOL_GPL(ip_set_type_unregister); /* Utility functions */ void * ip_set_alloc(size_t size) { return kvzalloc(size, GFP_KERNEL_ACCOUNT); } EXPORT_SYMBOL_GPL(ip_set_alloc); void ip_set_free(void *members) { pr_debug("%p: free with %s\n", members, is_vmalloc_addr(members) ? "vfree" : "kfree"); kvfree(members); } EXPORT_SYMBOL_GPL(ip_set_free); static bool flag_nested(const struct nlattr *nla) { return nla->nla_type & NLA_F_NESTED; } static const struct nla_policy ipaddr_policy[IPSET_ATTR_IPADDR_MAX + 1] = { [IPSET_ATTR_IPADDR_IPV4] = { .type = NLA_U32 }, [IPSET_ATTR_IPADDR_IPV6] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), }; int ip_set_get_ipaddr4(struct nlattr *nla, __be32 *ipaddr) { struct nlattr *tb[IPSET_ATTR_IPADDR_MAX + 1]; if (unlikely(!flag_nested(nla))) return -IPSET_ERR_PROTOCOL; if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy, NULL)) return -IPSET_ERR_PROTOCOL; if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV4))) return -IPSET_ERR_PROTOCOL; *ipaddr = nla_get_be32(tb[IPSET_ATTR_IPADDR_IPV4]); return 0; } EXPORT_SYMBOL_GPL(ip_set_get_ipaddr4); int ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr) { struct nlattr *tb[IPSET_ATTR_IPADDR_MAX + 1]; if (unlikely(!flag_nested(nla))) return -IPSET_ERR_PROTOCOL; if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy, NULL)) return -IPSET_ERR_PROTOCOL; if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV6))) return -IPSET_ERR_PROTOCOL; memcpy(ipaddr, nla_data(tb[IPSET_ATTR_IPADDR_IPV6]), sizeof(struct in6_addr)); return 0; } EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6); static u32 ip_set_timeout_get(const unsigned long *timeout) { u32 t; if (*timeout == IPSET_ELEM_PERMANENT) return 0; t = jiffies_to_msecs(*timeout - jiffies) / MSEC_PER_SEC; /* Zero value in userspace means no timeout */ return t == 0 ? 1 : t; } static char * ip_set_comment_uget(struct nlattr *tb) { return nla_data(tb); } /* Called from uadd only, protected by the set spinlock. * The kadt functions don't use the comment extensions in any way. */ void ip_set_init_comment(struct ip_set *set, struct ip_set_comment *comment, const struct ip_set_ext *ext) { struct ip_set_comment_rcu *c = rcu_dereference_protected(comment->c, 1); size_t len = ext->comment ? strlen(ext->comment) : 0; if (unlikely(c)) { set->ext_size -= sizeof(*c) + strlen(c->str) + 1; kfree_rcu(c, rcu); rcu_assign_pointer(comment->c, NULL); } if (!len) return; if (unlikely(len > IPSET_MAX_COMMENT_SIZE)) len = IPSET_MAX_COMMENT_SIZE; c = kmalloc(sizeof(*c) + len + 1, GFP_ATOMIC); if (unlikely(!c)) return; strscpy(c->str, ext->comment, len + 1); set->ext_size += sizeof(*c) + strlen(c->str) + 1; rcu_assign_pointer(comment->c, c); } EXPORT_SYMBOL_GPL(ip_set_init_comment); /* Used only when dumping a set, protected by rcu_read_lock() */ static int ip_set_put_comment(struct sk_buff *skb, const struct ip_set_comment *comment) { struct ip_set_comment_rcu *c = rcu_dereference(comment->c); if (!c) return 0; return nla_put_string(skb, IPSET_ATTR_COMMENT, c->str); } /* Called from uadd/udel, flush or the garbage collectors protected * by the set spinlock. * Called when the set is destroyed and when there can't be any user * of the set data anymore. */ static void ip_set_comment_free(struct ip_set *set, void *ptr) { struct ip_set_comment *comment = ptr; struct ip_set_comment_rcu *c; c = rcu_dereference_protected(comment->c, 1); if (unlikely(!c)) return; set->ext_size -= sizeof(*c) + strlen(c->str) + 1; kfree_rcu(c, rcu); rcu_assign_pointer(comment->c, NULL); } typedef void (*destroyer)(struct ip_set *, void *); /* ipset data extension types, in size order */ const struct ip_set_ext_type ip_set_extensions[] = { [IPSET_EXT_ID_COUNTER] = { .type = IPSET_EXT_COUNTER, .flag = IPSET_FLAG_WITH_COUNTERS, .len = sizeof(struct ip_set_counter), .align = __alignof__(struct ip_set_counter), }, [IPSET_EXT_ID_TIMEOUT] = { .type = IPSET_EXT_TIMEOUT, .len = sizeof(unsigned long), .align = __alignof__(unsigned long), }, [IPSET_EXT_ID_SKBINFO] = { .type = IPSET_EXT_SKBINFO, .flag = IPSET_FLAG_WITH_SKBINFO, .len = sizeof(struct ip_set_skbinfo), .align = __alignof__(struct ip_set_skbinfo), }, [IPSET_EXT_ID_COMMENT] = { .type = IPSET_EXT_COMMENT | IPSET_EXT_DESTROY, .flag = IPSET_FLAG_WITH_COMMENT, .len = sizeof(struct ip_set_comment), .align = __alignof__(struct ip_set_comment), .destroy = ip_set_comment_free, }, }; EXPORT_SYMBOL_GPL(ip_set_extensions); static bool add_extension(enum ip_set_ext_id id, u32 flags, struct nlattr *tb[]) { return ip_set_extensions[id].flag ? (flags & ip_set_extensions[id].flag) : !!tb[IPSET_ATTR_TIMEOUT]; } size_t ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len, size_t align) { enum ip_set_ext_id id; u32 cadt_flags = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_WITH_FORCEADD) set->flags |= IPSET_CREATE_FLAG_FORCEADD; if (!align) align = 1; for (id = 0; id < IPSET_EXT_ID_MAX; id++) { if (!add_extension(id, cadt_flags, tb)) continue; if (align < ip_set_extensions[id].align) align = ip_set_extensions[id].align; len = ALIGN(len, ip_set_extensions[id].align); set->offset[id] = len; set->extensions |= ip_set_extensions[id].type; len += ip_set_extensions[id].len; } return ALIGN(len, align); } EXPORT_SYMBOL_GPL(ip_set_elem_len); int ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext *ext) { u64 fullmark; if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_TIMEOUT]) { if (!SET_WITH_TIMEOUT(set)) return -IPSET_ERR_TIMEOUT; ext->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); } if (tb[IPSET_ATTR_BYTES] || tb[IPSET_ATTR_PACKETS]) { if (!SET_WITH_COUNTER(set)) return -IPSET_ERR_COUNTER; if (tb[IPSET_ATTR_BYTES]) ext->bytes = be64_to_cpu(nla_get_be64( tb[IPSET_ATTR_BYTES])); if (tb[IPSET_ATTR_PACKETS]) ext->packets = be64_to_cpu(nla_get_be64( tb[IPSET_ATTR_PACKETS])); } if (tb[IPSET_ATTR_COMMENT]) { if (!SET_WITH_COMMENT(set)) return -IPSET_ERR_COMMENT; ext->comment = ip_set_comment_uget(tb[IPSET_ATTR_COMMENT]); } if (tb[IPSET_ATTR_SKBMARK]) { if (!SET_WITH_SKBINFO(set)) return -IPSET_ERR_SKBINFO; fullmark = be64_to_cpu(nla_get_be64(tb[IPSET_ATTR_SKBMARK])); ext->skbinfo.skbmark = fullmark >> 32; ext->skbinfo.skbmarkmask = fullmark & 0xffffffff; } if (tb[IPSET_ATTR_SKBPRIO]) { if (!SET_WITH_SKBINFO(set)) return -IPSET_ERR_SKBINFO; ext->skbinfo.skbprio = be32_to_cpu(nla_get_be32(tb[IPSET_ATTR_SKBPRIO])); } if (tb[IPSET_ATTR_SKBQUEUE]) { if (!SET_WITH_SKBINFO(set)) return -IPSET_ERR_SKBINFO; ext->skbinfo.skbqueue = be16_to_cpu(nla_get_be16(tb[IPSET_ATTR_SKBQUEUE])); } return 0; } EXPORT_SYMBOL_GPL(ip_set_get_extensions); static u64 ip_set_get_bytes(const struct ip_set_counter *counter) { return (u64)atomic64_read(&(counter)->bytes); } static u64 ip_set_get_packets(const struct ip_set_counter *counter) { return (u64)atomic64_read(&(counter)->packets); } static bool ip_set_put_counter(struct sk_buff *skb, const struct ip_set_counter *counter) { return nla_put_net64(skb, IPSET_ATTR_BYTES, cpu_to_be64(ip_set_get_bytes(counter)), IPSET_ATTR_PAD) || nla_put_net64(skb, IPSET_ATTR_PACKETS, cpu_to_be64(ip_set_get_packets(counter)), IPSET_ATTR_PAD); } static bool ip_set_put_skbinfo(struct sk_buff *skb, const struct ip_set_skbinfo *skbinfo) { /* Send nonzero parameters only */ return ((skbinfo->skbmark || skbinfo->skbmarkmask) && nla_put_net64(skb, IPSET_ATTR_SKBMARK, cpu_to_be64((u64)skbinfo->skbmark << 32 | skbinfo->skbmarkmask), IPSET_ATTR_PAD)) || (skbinfo->skbprio && nla_put_net32(skb, IPSET_ATTR_SKBPRIO, cpu_to_be32(skbinfo->skbprio))) || (skbinfo->skbqueue && nla_put_net16(skb, IPSET_ATTR_SKBQUEUE, cpu_to_be16(skbinfo->skbqueue))); } int ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set, const void *e, bool active) { if (SET_WITH_TIMEOUT(set)) { unsigned long *timeout = ext_timeout(e, set); if (nla_put_net32(skb, IPSET_ATTR_TIMEOUT, htonl(active ? ip_set_timeout_get(timeout) : *timeout))) return -EMSGSIZE; } if (SET_WITH_COUNTER(set) && ip_set_put_counter(skb, ext_counter(e, set))) return -EMSGSIZE; if (SET_WITH_COMMENT(set) && ip_set_put_comment(skb, ext_comment(e, set))) return -EMSGSIZE; if (SET_WITH_SKBINFO(set) && ip_set_put_skbinfo(skb, ext_skbinfo(e, set))) return -EMSGSIZE; return 0; } EXPORT_SYMBOL_GPL(ip_set_put_extensions); static bool ip_set_match_counter(u64 counter, u64 match, u8 op) { switch (op) { case IPSET_COUNTER_NONE: return true; case IPSET_COUNTER_EQ: return counter == match; case IPSET_COUNTER_NE: return counter != match; case IPSET_COUNTER_LT: return counter < match; case IPSET_COUNTER_GT: return counter > match; } return false; } static void ip_set_add_bytes(u64 bytes, struct ip_set_counter *counter) { atomic64_add((long long)bytes, &(counter)->bytes); } static void ip_set_add_packets(u64 packets, struct ip_set_counter *counter) { atomic64_add((long long)packets, &(counter)->packets); } static void ip_set_update_counter(struct ip_set_counter *counter, const struct ip_set_ext *ext, u32 flags) { if (ext->packets != ULLONG_MAX && !(flags & IPSET_FLAG_SKIP_COUNTER_UPDATE)) { ip_set_add_bytes(ext->bytes, counter); ip_set_add_packets(ext->packets, counter); } } static void ip_set_get_skbinfo(struct ip_set_skbinfo *skbinfo, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags) { mext->skbinfo = *skbinfo; } bool ip_set_match_extensions(struct ip_set *set, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags, void *data) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(data, set))) return false; if (SET_WITH_COUNTER(set)) { struct ip_set_counter *counter = ext_counter(data, set); ip_set_update_counter(counter, ext, flags); if (flags & IPSET_FLAG_MATCH_COUNTERS && !(ip_set_match_counter(ip_set_get_packets(counter), mext->packets, mext->packets_op) && ip_set_match_counter(ip_set_get_bytes(counter), mext->bytes, mext->bytes_op))) return false; } if (SET_WITH_SKBINFO(set)) ip_set_get_skbinfo(ext_skbinfo(data, set), ext, mext, flags); return true; } EXPORT_SYMBOL_GPL(ip_set_match_extensions); /* Creating/destroying/renaming/swapping affect the existence and * the properties of a set. All of these can be executed from userspace * only and serialized by the nfnl mutex indirectly from nfnetlink. * * Sets are identified by their index in ip_set_list and the index * is used by the external references (set/SET netfilter modules). * * The set behind an index may change by swapping only, from userspace. */ static void __ip_set_get(struct ip_set *set) { write_lock_bh(&ip_set_ref_lock); set->ref++; write_unlock_bh(&ip_set_ref_lock); } static void __ip_set_put(struct ip_set *set) { write_lock_bh(&ip_set_ref_lock); BUG_ON(set->ref == 0); set->ref--; write_unlock_bh(&ip_set_ref_lock); } /* set->ref can be swapped out by ip_set_swap, netlink events (like dump) need * a separate reference counter */ static void __ip_set_get_netlink(struct ip_set *set) { write_lock_bh(&ip_set_ref_lock); set->ref_netlink++; write_unlock_bh(&ip_set_ref_lock); } static void __ip_set_put_netlink(struct ip_set *set) { write_lock_bh(&ip_set_ref_lock); BUG_ON(set->ref_netlink == 0); set->ref_netlink--; write_unlock_bh(&ip_set_ref_lock); } /* Add, del and test set entries from kernel. * * The set behind the index must exist and must be referenced * so it can't be destroyed (or changed) under our foot. */ static struct ip_set * ip_set_rcu_get(struct net *net, ip_set_id_t index) { struct ip_set_net *inst = ip_set_pernet(net); /* ip_set_list and the set pointer need to be protected */ return ip_set_dereference_nfnl(inst->ip_set_list)[index]; } static inline void ip_set_lock(struct ip_set *set) { if (!set->variant->region_lock) spin_lock_bh(&set->lock); } static inline void ip_set_unlock(struct ip_set *set) { if (!set->variant->region_lock) spin_unlock_bh(&set->lock); } int ip_set_test(ip_set_id_t index, const struct sk_buff *skb, const struct xt_action_param *par, struct ip_set_adt_opt *opt) { struct ip_set *set = ip_set_rcu_get(xt_net(par), index); int ret = 0; BUG_ON(!set); pr_debug("set %s, index %u\n", set->name, index); if (opt->dim < set->type->dimension || !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) return 0; ret = set->variant->kadt(set, skb, par, IPSET_TEST, opt); if (ret == -EAGAIN) { /* Type requests element to be completed */ pr_debug("element must be completed, ADD is triggered\n"); ip_set_lock(set); set->variant->kadt(set, skb, par, IPSET_ADD, opt); ip_set_unlock(set); ret = 1; } else { /* --return-nomatch: invert matched element */ if ((opt->cmdflags & IPSET_FLAG_RETURN_NOMATCH) && (set->type->features & IPSET_TYPE_NOMATCH) && (ret > 0 || ret == -ENOTEMPTY)) ret = -ret; } /* Convert error codes to nomatch */ return (ret < 0 ? 0 : ret); } EXPORT_SYMBOL_GPL(ip_set_test); int ip_set_add(ip_set_id_t index, const struct sk_buff *skb, const struct xt_action_param *par, struct ip_set_adt_opt *opt) { struct ip_set *set = ip_set_rcu_get(xt_net(par), index); int ret; BUG_ON(!set); pr_debug("set %s, index %u\n", set->name, index); if (opt->dim < set->type->dimension || !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) return -IPSET_ERR_TYPE_MISMATCH; ip_set_lock(set); ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt); ip_set_unlock(set); return ret; } EXPORT_SYMBOL_GPL(ip_set_add); int ip_set_del(ip_set_id_t index, const struct sk_buff *skb, const struct xt_action_param *par, struct ip_set_adt_opt *opt) { struct ip_set *set = ip_set_rcu_get(xt_net(par), index); int ret = 0; BUG_ON(!set); pr_debug("set %s, index %u\n", set->name, index); if (opt->dim < set->type->dimension || !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) return -IPSET_ERR_TYPE_MISMATCH; ip_set_lock(set); ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt); ip_set_unlock(set); return ret; } EXPORT_SYMBOL_GPL(ip_set_del); /* Find set by name, reference it once. The reference makes sure the * thing pointed to, does not go away under our feet. * */ ip_set_id_t ip_set_get_byname(struct net *net, const char *name, struct ip_set **set) { ip_set_id_t i, index = IPSET_INVALID_ID; struct ip_set *s; struct ip_set_net *inst = ip_set_pernet(net); rcu_read_lock(); for (i = 0; i < inst->ip_set_max; i++) { s = rcu_dereference(inst->ip_set_list)[i]; if (s && STRNCMP(s->name, name)) { __ip_set_get(s); index = i; *set = s; break; } } rcu_read_unlock(); return index; } EXPORT_SYMBOL_GPL(ip_set_get_byname); /* If the given set pointer points to a valid set, decrement * reference count by 1. The caller shall not assume the index * to be valid, after calling this function. * */ static void __ip_set_put_byindex(struct ip_set_net *inst, ip_set_id_t index) { struct ip_set *set; rcu_read_lock(); set = rcu_dereference(inst->ip_set_list)[index]; if (set) __ip_set_put(set); rcu_read_unlock(); } void ip_set_put_byindex(struct net *net, ip_set_id_t index) { struct ip_set_net *inst = ip_set_pernet(net); __ip_set_put_byindex(inst, index); } EXPORT_SYMBOL_GPL(ip_set_put_byindex); /* Get the name of a set behind a set index. * Set itself is protected by RCU, but its name isn't: to protect against * renaming, grab ip_set_ref_lock as reader (see ip_set_rename()) and copy the * name. */ void ip_set_name_byindex(struct net *net, ip_set_id_t index, char *name) { struct ip_set *set = ip_set_rcu_get(net, index); BUG_ON(!set); read_lock_bh(&ip_set_ref_lock); strscpy_pad(name, set->name, IPSET_MAXNAMELEN); read_unlock_bh(&ip_set_ref_lock); } EXPORT_SYMBOL_GPL(ip_set_name_byindex); /* Routines to call by external subsystems, which do not * call nfnl_lock for us. */ /* Find set by index, reference it once. The reference makes sure the * thing pointed to, does not go away under our feet. * * The nfnl mutex is used in the function. */ ip_set_id_t ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index) { struct ip_set *set; struct ip_set_net *inst = ip_set_pernet(net); if (index >= inst->ip_set_max) return IPSET_INVALID_ID; nfnl_lock(NFNL_SUBSYS_IPSET); set = ip_set(inst, index); if (set) __ip_set_get(set); else index = IPSET_INVALID_ID; nfnl_unlock(NFNL_SUBSYS_IPSET); return index; } EXPORT_SYMBOL_GPL(ip_set_nfnl_get_byindex); /* If the given set pointer points to a valid set, decrement * reference count by 1. The caller shall not assume the index * to be valid, after calling this function. * * The nfnl mutex is used in the function. */ void ip_set_nfnl_put(struct net *net, ip_set_id_t index) { struct ip_set *set; struct ip_set_net *inst = ip_set_pernet(net); nfnl_lock(NFNL_SUBSYS_IPSET); if (!inst->is_deleted) { /* already deleted from ip_set_net_exit() */ set = ip_set(inst, index); if (set) __ip_set_put(set); } nfnl_unlock(NFNL_SUBSYS_IPSET); } EXPORT_SYMBOL_GPL(ip_set_nfnl_put); /* Communication protocol with userspace over netlink. * * The commands are serialized by the nfnl mutex. */ static inline u8 protocol(const struct nlattr * const tb[]) { return nla_get_u8(tb[IPSET_ATTR_PROTOCOL]); } static inline bool protocol_failed(const struct nlattr * const tb[]) { return !tb[IPSET_ATTR_PROTOCOL] || protocol(tb) != IPSET_PROTOCOL; } static inline bool protocol_min_failed(const struct nlattr * const tb[]) { return !tb[IPSET_ATTR_PROTOCOL] || protocol(tb) < IPSET_PROTOCOL_MIN; } static inline u32 flag_exist(const struct nlmsghdr *nlh) { return nlh->nlmsg_flags & NLM_F_EXCL ? 0 : IPSET_FLAG_EXIST; } static struct nlmsghdr * start_msg(struct sk_buff *skb, u32 portid, u32 seq, unsigned int flags, enum ipset_cmd cmd) { return nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_IPSET, cmd), flags, NFPROTO_IPV4, NFNETLINK_V0, 0); } /* Create a set */ static const struct nla_policy ip_set_create_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING, .len = IPSET_MAXNAMELEN - 1 }, [IPSET_ATTR_TYPENAME] = { .type = NLA_NUL_STRING, .len = IPSET_MAXNAMELEN - 1}, [IPSET_ATTR_REVISION] = { .type = NLA_U8 }, [IPSET_ATTR_FAMILY] = { .type = NLA_U8 }, [IPSET_ATTR_DATA] = { .type = NLA_NESTED }, }; static struct ip_set * find_set_and_id(struct ip_set_net *inst, const char *name, ip_set_id_t *id) { struct ip_set *set = NULL; ip_set_id_t i; *id = IPSET_INVALID_ID; for (i = 0; i < inst->ip_set_max; i++) { set = ip_set(inst, i); if (set && STRNCMP(set->name, name)) { *id = i; break; } } return (*id == IPSET_INVALID_ID ? NULL : set); } static inline struct ip_set * find_set(struct ip_set_net *inst, const char *name) { ip_set_id_t id; return find_set_and_id(inst, name, &id); } static int find_free_id(struct ip_set_net *inst, const char *name, ip_set_id_t *index, struct ip_set **set) { struct ip_set *s; ip_set_id_t i; *index = IPSET_INVALID_ID; for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); if (!s) { if (*index == IPSET_INVALID_ID) *index = i; } else if (STRNCMP(name, s->name)) { /* Name clash */ *set = s; return -EEXIST; } } if (*index == IPSET_INVALID_ID) /* No free slot remained */ return -IPSET_ERR_MAX_SETS; return 0; } static int ip_set_none(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { return -EOPNOTSUPP; } static int ip_set_create(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { struct ip_set_net *inst = ip_set_pernet(info->net); struct ip_set *set, *clash = NULL; ip_set_id_t index = IPSET_INVALID_ID; struct nlattr *tb[IPSET_ATTR_CREATE_MAX + 1] = {}; const char *name, *typename; u8 family, revision; u32 flags = flag_exist(info->nlh); int ret = 0; if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_SETNAME] || !attr[IPSET_ATTR_TYPENAME] || !attr[IPSET_ATTR_REVISION] || !attr[IPSET_ATTR_FAMILY] || (attr[IPSET_ATTR_DATA] && !flag_nested(attr[IPSET_ATTR_DATA])))) return -IPSET_ERR_PROTOCOL; name = nla_data(attr[IPSET_ATTR_SETNAME]); typename = nla_data(attr[IPSET_ATTR_TYPENAME]); family = nla_get_u8(attr[IPSET_ATTR_FAMILY]); revision = nla_get_u8(attr[IPSET_ATTR_REVISION]); pr_debug("setname: %s, typename: %s, family: %s, revision: %u\n", name, typename, family_name(family), revision); /* First, and without any locks, allocate and initialize * a normal base set structure. */ set = kzalloc(sizeof(*set), GFP_KERNEL); if (!set) return -ENOMEM; spin_lock_init(&set->lock); strscpy(set->name, name, IPSET_MAXNAMELEN); set->family = family; set->revision = revision; /* Next, check that we know the type, and take * a reference on the type, to make sure it stays available * while constructing our new set. * * After referencing the type, we try to create the type * specific part of the set without holding any locks. */ ret = find_set_type_get(typename, family, revision, &set->type); if (ret) goto out; /* Without holding any locks, create private part. */ if (attr[IPSET_ATTR_DATA] && nla_parse_nested(tb, IPSET_ATTR_CREATE_MAX, attr[IPSET_ATTR_DATA], set->type->create_policy, NULL)) { ret = -IPSET_ERR_PROTOCOL; goto put_out; } /* Set create flags depending on the type revision */ set->flags |= set->type->create_flags[revision]; ret = set->type->create(info->net, set, tb, flags); if (ret != 0) goto put_out; /* BTW, ret==0 here. */ /* Here, we have a valid, constructed set and we are protected * by the nfnl mutex. Find the first free index in ip_set_list * and check clashing. */ ret = find_free_id(inst, set->name, &index, &clash); if (ret == -EEXIST) { /* If this is the same set and requested, ignore error */ if ((flags & IPSET_FLAG_EXIST) && STRNCMP(set->type->name, clash->type->name) && set->type->family == clash->type->family && set->type->revision_min == clash->type->revision_min && set->type->revision_max == clash->type->revision_max && set->variant->same_set(set, clash)) ret = 0; goto cleanup; } else if (ret == -IPSET_ERR_MAX_SETS) { struct ip_set **list, **tmp; ip_set_id_t i = inst->ip_set_max + IP_SET_INC; if (i < inst->ip_set_max || i == IPSET_INVALID_ID) /* Wraparound */ goto cleanup; list = kvcalloc(i, sizeof(struct ip_set *), GFP_KERNEL); if (!list) goto cleanup; /* nfnl mutex is held, both lists are valid */ tmp = ip_set_dereference(inst); memcpy(list, tmp, sizeof(struct ip_set *) * inst->ip_set_max); rcu_assign_pointer(inst->ip_set_list, list); /* Make sure all current packets have passed through */ synchronize_net(); /* Use new list */ index = inst->ip_set_max; inst->ip_set_max = i; kvfree(tmp); ret = 0; } else if (ret) { goto cleanup; } /* Finally! Add our shiny new set to the list, and be done. */ pr_debug("create: '%s' created with index %u!\n", set->name, index); ip_set(inst, index) = set; return ret; cleanup: set->variant->cancel_gc(set); set->variant->destroy(set); put_out: module_put(set->type->me); out: kfree(set); return ret; } /* Destroy sets */ static const struct nla_policy ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING, .len = IPSET_MAXNAMELEN - 1 }, }; /* In order to return quickly when destroying a single set, it is split * into two stages: * - Cancel garbage collector * - Destroy the set itself via call_rcu() */ static void ip_set_destroy_set_rcu(struct rcu_head *head) { struct ip_set *set = container_of(head, struct ip_set, rcu); set->variant->destroy(set); module_put(set->type->me); kfree(set); } static void _destroy_all_sets(struct ip_set_net *inst) { struct ip_set *set; ip_set_id_t i; bool need_wait = false; /* First cancel gc's: set:list sets are flushed as well */ for (i = 0; i < inst->ip_set_max; i++) { set = ip_set(inst, i); if (set) { set->variant->cancel_gc(set); if (set->type->features & IPSET_TYPE_NAME) need_wait = true; } } /* Must wait for flush to be really finished */ if (need_wait) rcu_barrier(); for (i = 0; i < inst->ip_set_max; i++) { set = ip_set(inst, i); if (set) { ip_set(inst, i) = NULL; set->variant->destroy(set); module_put(set->type->me); kfree(set); } } } static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { struct ip_set_net *inst = ip_set_pernet(info->net); struct ip_set *s; ip_set_id_t i; int ret = 0; if (unlikely(protocol_min_failed(attr))) return -IPSET_ERR_PROTOCOL; /* Commands are serialized and references are * protected by the ip_set_ref_lock. * External systems (i.e. xt_set) must call * ip_set_nfnl_get_* functions, that way we * can safely check references here. * * list:set timer can only decrement the reference * counter, so if it's already zero, we can proceed * without holding the lock. */ if (!attr[IPSET_ATTR_SETNAME]) { read_lock_bh(&ip_set_ref_lock); for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); if (s && (s->ref || s->ref_netlink)) { ret = -IPSET_ERR_BUSY; goto out; } } inst->is_destroyed = true; read_unlock_bh(&ip_set_ref_lock); _destroy_all_sets(inst); /* Modified by ip_set_destroy() only, which is serialized */ inst->is_destroyed = false; } else { u32 flags = flag_exist(info->nlh); u16 features = 0; read_lock_bh(&ip_set_ref_lock); s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), &i); if (!s) { if (!(flags & IPSET_FLAG_EXIST)) ret = -ENOENT; goto out; } else if (s->ref || s->ref_netlink) { ret = -IPSET_ERR_BUSY; goto out; } features = s->type->features; ip_set(inst, i) = NULL; read_unlock_bh(&ip_set_ref_lock); /* Must cancel garbage collectors */ s->variant->cancel_gc(s); if (features & IPSET_TYPE_NAME) { /* Must wait for flush to be really finished */ rcu_barrier(); } call_rcu(&s->rcu, ip_set_destroy_set_rcu); } return 0; out: read_unlock_bh(&ip_set_ref_lock); return ret; } /* Flush sets */ static void ip_set_flush_set(struct ip_set *set) { pr_debug("set: %s\n", set->name); ip_set_lock(set); set->variant->flush(set); ip_set_unlock(set); } static int ip_set_flush(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { struct ip_set_net *inst = ip_set_pernet(info->net); struct ip_set *s; ip_set_id_t i; if (unlikely(protocol_min_failed(attr))) return -IPSET_ERR_PROTOCOL; if (!attr[IPSET_ATTR_SETNAME]) { for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); if (s) ip_set_flush_set(s); } } else { s = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); if (!s) return -ENOENT; ip_set_flush_set(s); } return 0; } /* Rename a set */ static const struct nla_policy ip_set_setname2_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING, .len = IPSET_MAXNAMELEN - 1 }, [IPSET_ATTR_SETNAME2] = { .type = NLA_NUL_STRING, .len = IPSET_MAXNAMELEN - 1 }, }; static int ip_set_rename(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { struct ip_set_net *inst = ip_set_pernet(info->net); struct ip_set *set, *s; const char *name2; ip_set_id_t i; int ret = 0; if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_SETNAME] || !attr[IPSET_ATTR_SETNAME2])) return -IPSET_ERR_PROTOCOL; set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); if (!set) return -ENOENT; write_lock_bh(&ip_set_ref_lock); if (set->ref != 0 || set->ref_netlink != 0) { ret = -IPSET_ERR_REFERENCED; goto out; } name2 = nla_data(attr[IPSET_ATTR_SETNAME2]); for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); if (s && STRNCMP(s->name, name2)) { ret = -IPSET_ERR_EXIST_SETNAME2; goto out; } } strscpy_pad(set->name, name2, IPSET_MAXNAMELEN); out: write_unlock_bh(&ip_set_ref_lock); return ret; } /* Swap two sets so that name/index points to the other. * References and set names are also swapped. * * The commands are serialized by the nfnl mutex and references are * protected by the ip_set_ref_lock. The kernel interfaces * do not hold the mutex but the pointer settings are atomic * so the ip_set_list always contains valid pointers to the sets. */ static int ip_set_swap(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { struct ip_set_net *inst = ip_set_pernet(info->net); struct ip_set *from, *to; ip_set_id_t from_id, to_id; char from_name[IPSET_MAXNAMELEN]; if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_SETNAME] || !attr[IPSET_ATTR_SETNAME2])) return -IPSET_ERR_PROTOCOL; from = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), &from_id); if (!from) return -ENOENT; to = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME2]), &to_id); if (!to) return -IPSET_ERR_EXIST_SETNAME2; /* Features must not change. * Not an artifical restriction anymore, as we must prevent * possible loops created by swapping in setlist type of sets. */ if (!(from->type->features == to->type->features && from->family == to->family)) return -IPSET_ERR_TYPE_MISMATCH; write_lock_bh(&ip_set_ref_lock); if (from->ref_netlink || to->ref_netlink) { write_unlock_bh(&ip_set_ref_lock); return -EBUSY; } strscpy_pad(from_name, from->name, IPSET_MAXNAMELEN); strscpy_pad(from->name, to->name, IPSET_MAXNAMELEN); strscpy_pad(to->name, from_name, IPSET_MAXNAMELEN); swap(from->ref, to->ref); ip_set(inst, from_id) = to; ip_set(inst, to_id) = from; write_unlock_bh(&ip_set_ref_lock); return 0; } /* List/save set data */ #define DUMP_INIT 0 #define DUMP_ALL 1 #define DUMP_ONE 2 #define DUMP_LAST 3 #define DUMP_TYPE(arg) (((u32)(arg)) & 0x0000FFFF) #define DUMP_FLAGS(arg) (((u32)(arg)) >> 16) int ip_set_put_flags(struct sk_buff *skb, struct ip_set *set) { u32 cadt_flags = 0; if (SET_WITH_TIMEOUT(set)) if (unlikely(nla_put_net32(skb, IPSET_ATTR_TIMEOUT, htonl(set->timeout)))) return -EMSGSIZE; if (SET_WITH_COUNTER(set)) cadt_flags |= IPSET_FLAG_WITH_COUNTERS; if (SET_WITH_COMMENT(set)) cadt_flags |= IPSET_FLAG_WITH_COMMENT; if (SET_WITH_SKBINFO(set)) cadt_flags |= IPSET_FLAG_WITH_SKBINFO; if (SET_WITH_FORCEADD(set)) cadt_flags |= IPSET_FLAG_WITH_FORCEADD; if (!cadt_flags) return 0; return nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(cadt_flags)); } EXPORT_SYMBOL_GPL(ip_set_put_flags); static int ip_set_dump_done(struct netlink_callback *cb) { if (cb->args[IPSET_CB_ARG0]) { struct ip_set_net *inst = (struct ip_set_net *)cb->args[IPSET_CB_NET]; ip_set_id_t index = (ip_set_id_t)cb->args[IPSET_CB_INDEX]; struct ip_set *set = ip_set_ref_netlink(inst, index); if (set->variant->uref) set->variant->uref(set, cb, false); pr_debug("release set %s\n", set->name); __ip_set_put_netlink(set); } return 0; } static inline void dump_attrs(struct nlmsghdr *nlh) { const struct nlattr *attr; int rem; pr_debug("dump nlmsg\n"); nlmsg_for_each_attr(attr, nlh, sizeof(struct nfgenmsg), rem) { pr_debug("type: %u, len %u\n", nla_type(attr), attr->nla_len); } } static const struct nla_policy ip_set_dump_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING, .len = IPSET_MAXNAMELEN - 1 }, [IPSET_ATTR_FLAGS] = { .type = NLA_U32 }, }; static int ip_set_dump_start(struct netlink_callback *cb) { struct nlmsghdr *nlh = nlmsg_hdr(cb->skb); int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); struct nlattr *cda[IPSET_ATTR_CMD_MAX + 1]; struct nlattr *attr = (void *)nlh + min_len; struct sk_buff *skb = cb->skb; struct ip_set_net *inst = ip_set_pernet(sock_net(skb->sk)); u32 dump_type; int ret; ret = nla_parse(cda, IPSET_ATTR_CMD_MAX, attr, nlh->nlmsg_len - min_len, ip_set_dump_policy, NULL); if (ret) goto error; cb->args[IPSET_CB_PROTO] = nla_get_u8(cda[IPSET_ATTR_PROTOCOL]); if (cda[IPSET_ATTR_SETNAME]) { ip_set_id_t index; struct ip_set *set; set = find_set_and_id(inst, nla_data(cda[IPSET_ATTR_SETNAME]), &index); if (!set) { ret = -ENOENT; goto error; } dump_type = DUMP_ONE; cb->args[IPSET_CB_INDEX] = index; } else { dump_type = DUMP_ALL; } if (cda[IPSET_ATTR_FLAGS]) { u32 f = ip_set_get_h32(cda[IPSET_ATTR_FLAGS]); dump_type |= (f << 16); } cb->args[IPSET_CB_NET] = (unsigned long)inst; cb->args[IPSET_CB_DUMP] = dump_type; return 0; error: /* We have to create and send the error message manually :-( */ if (nlh->nlmsg_flags & NLM_F_ACK) { netlink_ack(cb->skb, nlh, ret, NULL); } return ret; } static int ip_set_dump_do(struct sk_buff *skb, struct netlink_callback *cb) { ip_set_id_t index = IPSET_INVALID_ID, max; struct ip_set *set = NULL; struct nlmsghdr *nlh = NULL; unsigned int flags = NETLINK_CB(cb->skb).portid ? NLM_F_MULTI : 0; struct ip_set_net *inst = ip_set_pernet(sock_net(skb->sk)); u32 dump_type, dump_flags; bool is_destroyed; int ret = 0; if (!cb->args[IPSET_CB_DUMP]) return -EINVAL; if (cb->args[IPSET_CB_INDEX] >= inst->ip_set_max) goto out; dump_type = DUMP_TYPE(cb->args[IPSET_CB_DUMP]); dump_flags = DUMP_FLAGS(cb->args[IPSET_CB_DUMP]); max = dump_type == DUMP_ONE ? cb->args[IPSET_CB_INDEX] + 1 : inst->ip_set_max; dump_last: pr_debug("dump type, flag: %u %u index: %ld\n", dump_type, dump_flags, cb->args[IPSET_CB_INDEX]); for (; cb->args[IPSET_CB_INDEX] < max; cb->args[IPSET_CB_INDEX]++) { index = (ip_set_id_t)cb->args[IPSET_CB_INDEX]; write_lock_bh(&ip_set_ref_lock); set = ip_set(inst, index); is_destroyed = inst->is_destroyed; if (!set || is_destroyed) { write_unlock_bh(&ip_set_ref_lock); if (dump_type == DUMP_ONE) { ret = -ENOENT; goto out; } if (is_destroyed) { /* All sets are just being destroyed */ ret = 0; goto out; } continue; } /* When dumping all sets, we must dump "sorted" * so that lists (unions of sets) are dumped last. */ if (dump_type != DUMP_ONE && ((dump_type == DUMP_ALL) == !!(set->type->features & IPSET_DUMP_LAST))) { write_unlock_bh(&ip_set_ref_lock); continue; } pr_debug("List set: %s\n", set->name); if (!cb->args[IPSET_CB_ARG0]) { /* Start listing: make sure set won't be destroyed */ pr_debug("reference set\n"); set->ref_netlink++; } write_unlock_bh(&ip_set_ref_lock); nlh = start_msg(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags, IPSET_CMD_LIST); if (!nlh) { ret = -EMSGSIZE; goto release_refcount; } if (nla_put_u8(skb, IPSET_ATTR_PROTOCOL, cb->args[IPSET_CB_PROTO]) || nla_put_string(skb, IPSET_ATTR_SETNAME, set->name)) goto nla_put_failure; if (dump_flags & IPSET_FLAG_LIST_SETNAME) goto next_set; switch (cb->args[IPSET_CB_ARG0]) { case 0: /* Core header data */ if (nla_put_string(skb, IPSET_ATTR_TYPENAME, set->type->name) || nla_put_u8(skb, IPSET_ATTR_FAMILY, set->family) || nla_put_u8(skb, IPSET_ATTR_REVISION, set->revision)) goto nla_put_failure; if (cb->args[IPSET_CB_PROTO] > IPSET_PROTOCOL_MIN && nla_put_net16(skb, IPSET_ATTR_INDEX, htons(index))) goto nla_put_failure; ret = set->variant->head(set, skb); if (ret < 0) goto release_refcount; if (dump_flags & IPSET_FLAG_LIST_HEADER) goto next_set; if (set->variant->uref) set->variant->uref(set, cb, true); fallthrough; default: ret = set->variant->list(set, skb, cb); if (!cb->args[IPSET_CB_ARG0]) /* Set is done, proceed with next one */ goto next_set; goto release_refcount; } } /* If we dump all sets, continue with dumping last ones */ if (dump_type == DUMP_ALL) { dump_type = DUMP_LAST; cb->args[IPSET_CB_DUMP] = dump_type | (dump_flags << 16); cb->args[IPSET_CB_INDEX] = 0; if (set && set->variant->uref) set->variant->uref(set, cb, false); goto dump_last; } goto out; nla_put_failure: ret = -EFAULT; next_set: if (dump_type == DUMP_ONE) cb->args[IPSET_CB_INDEX] = IPSET_INVALID_ID; else cb->args[IPSET_CB_INDEX]++; release_refcount: /* If there was an error or set is done, release set */ if (ret || !cb->args[IPSET_CB_ARG0]) { set = ip_set_ref_netlink(inst, index); if (set->variant->uref) set->variant->uref(set, cb, false); pr_debug("release set %s\n", set->name); __ip_set_put_netlink(set); cb->args[IPSET_CB_ARG0] = 0; } out: if (nlh) { nlmsg_end(skb, nlh); pr_debug("nlmsg_len: %u\n", nlh->nlmsg_len); dump_attrs(nlh); } return ret < 0 ? ret : skb->len; } static int ip_set_dump(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { if (unlikely(protocol_min_failed(attr))) return -IPSET_ERR_PROTOCOL; { struct netlink_dump_control c = { .start = ip_set_dump_start, .dump = ip_set_dump_do, .done = ip_set_dump_done, }; return netlink_dump_start(info->sk, skb, info->nlh, &c); } } /* Add, del and test */ static const struct nla_policy ip_set_adt_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING, .len = IPSET_MAXNAMELEN - 1 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_DATA] = { .type = NLA_NESTED }, [IPSET_ATTR_ADT] = { .type = NLA_NESTED }, }; static int call_ad(struct net *net, struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 flags, bool use_lineno) { int ret; u32 lineno = 0; bool eexist = flags & IPSET_FLAG_EXIST, retried = false; do { if (retried) { __ip_set_get_netlink(set); nfnl_unlock(NFNL_SUBSYS_IPSET); cond_resched(); nfnl_lock(NFNL_SUBSYS_IPSET); __ip_set_put_netlink(set); } ip_set_lock(set); ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried); ip_set_unlock(set); retried = true; } while (ret == -ERANGE || (ret == -EAGAIN && set->variant->resize && (ret = set->variant->resize(set, retried)) == 0)); if (!ret || (ret == -IPSET_ERR_EXIST && eexist)) return 0; if (lineno && use_lineno) { /* Error in restore/batch mode: send back lineno */ struct nlmsghdr *rep, *nlh = nlmsg_hdr(skb); struct sk_buff *skb2; struct nlmsgerr *errmsg; size_t payload = min(SIZE_MAX, sizeof(*errmsg) + nlmsg_len(nlh)); int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); struct nlattr *cda[IPSET_ATTR_CMD_MAX + 1]; struct nlattr *cmdattr; u32 *errline; skb2 = nlmsg_new(payload, GFP_KERNEL); if (!skb2) return -ENOMEM; rep = nlmsg_put(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, NLMSG_ERROR, payload, 0); errmsg = nlmsg_data(rep); errmsg->error = ret; unsafe_memcpy(&errmsg->msg, nlh, nlh->nlmsg_len, /* Bounds checked by the skb layer. */); cmdattr = (void *)&errmsg->msg + min_len; ret = nla_parse(cda, IPSET_ATTR_CMD_MAX, cmdattr, nlh->nlmsg_len - min_len, ip_set_adt_policy, NULL); if (ret) { nlmsg_free(skb2); return ret; } errline = nla_data(cda[IPSET_ATTR_LINENO]); *errline = lineno; nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); /* Signal netlink not to send its ACK/errmsg. */ return -EINTR; } return ret; } static int ip_set_ad(struct net *net, struct sock *ctnl, struct sk_buff *skb, enum ipset_adt adt, const struct nlmsghdr *nlh, const struct nlattr * const attr[], struct netlink_ext_ack *extack) { struct ip_set_net *inst = ip_set_pernet(net); struct ip_set *set; struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {}; const struct nlattr *nla; u32 flags = flag_exist(nlh); bool use_lineno; int ret = 0; if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_SETNAME] || !((attr[IPSET_ATTR_DATA] != NULL) ^ (attr[IPSET_ATTR_ADT] != NULL)) || (attr[IPSET_ATTR_DATA] && !flag_nested(attr[IPSET_ATTR_DATA])) || (attr[IPSET_ATTR_ADT] && (!flag_nested(attr[IPSET_ATTR_ADT]) || !attr[IPSET_ATTR_LINENO])))) return -IPSET_ERR_PROTOCOL; set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); if (!set) return -ENOENT; use_lineno = !!attr[IPSET_ATTR_LINENO]; if (attr[IPSET_ATTR_DATA]) { if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; ret = call_ad(net, ctnl, skb, set, tb, adt, flags, use_lineno); } else { int nla_rem; nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) { if (nla_type(nla) != IPSET_ATTR_DATA || !flag_nested(nla) || nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla, set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; ret = call_ad(net, ctnl, skb, set, tb, adt, flags, use_lineno); if (ret < 0) return ret; } } return ret; } static int ip_set_uadd(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { return ip_set_ad(info->net, info->sk, skb, IPSET_ADD, info->nlh, attr, info->extack); } static int ip_set_udel(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { return ip_set_ad(info->net, info->sk, skb, IPSET_DEL, info->nlh, attr, info->extack); } static int ip_set_utest(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { struct ip_set_net *inst = ip_set_pernet(info->net); struct ip_set *set; struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {}; int ret = 0; u32 lineno; if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_SETNAME] || !attr[IPSET_ATTR_DATA] || !flag_nested(attr[IPSET_ATTR_DATA]))) return -IPSET_ERR_PROTOCOL; set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); if (!set) return -ENOENT; if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; rcu_read_lock_bh(); ret = set->variant->uadt(set, tb, IPSET_TEST, &lineno, 0, 0); rcu_read_unlock_bh(); /* Userspace can't trigger element to be re-added */ if (ret == -EAGAIN) ret = 1; return ret > 0 ? 0 : -IPSET_ERR_EXIST; } /* Get headed data of a set */ static int ip_set_header(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { struct ip_set_net *inst = ip_set_pernet(info->net); const struct ip_set *set; struct sk_buff *skb2; struct nlmsghdr *nlh2; if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_SETNAME])) return -IPSET_ERR_PROTOCOL; set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); if (!set) return -ENOENT; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb2) return -ENOMEM; nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0, IPSET_CMD_HEADER); if (!nlh2) goto nlmsg_failure; if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, protocol(attr)) || nla_put_string(skb2, IPSET_ATTR_SETNAME, set->name) || nla_put_string(skb2, IPSET_ATTR_TYPENAME, set->type->name) || nla_put_u8(skb2, IPSET_ATTR_FAMILY, set->family) || nla_put_u8(skb2, IPSET_ATTR_REVISION, set->revision)) goto nla_put_failure; nlmsg_end(skb2, nlh2); return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); nla_put_failure: nlmsg_cancel(skb2, nlh2); nlmsg_failure: kfree_skb(skb2); return -EMSGSIZE; } /* Get type data */ static const struct nla_policy ip_set_type_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, [IPSET_ATTR_TYPENAME] = { .type = NLA_NUL_STRING, .len = IPSET_MAXNAMELEN - 1 }, [IPSET_ATTR_FAMILY] = { .type = NLA_U8 }, }; static int ip_set_type(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { struct sk_buff *skb2; struct nlmsghdr *nlh2; u8 family, min, max; const char *typename; int ret = 0; if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_TYPENAME] || !attr[IPSET_ATTR_FAMILY])) return -IPSET_ERR_PROTOCOL; family = nla_get_u8(attr[IPSET_ATTR_FAMILY]); typename = nla_data(attr[IPSET_ATTR_TYPENAME]); ret = find_set_type_minmax(typename, family, &min, &max); if (ret) return ret; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb2) return -ENOMEM; nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0, IPSET_CMD_TYPE); if (!nlh2) goto nlmsg_failure; if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, protocol(attr)) || nla_put_string(skb2, IPSET_ATTR_TYPENAME, typename) || nla_put_u8(skb2, IPSET_ATTR_FAMILY, family) || nla_put_u8(skb2, IPSET_ATTR_REVISION, max) || nla_put_u8(skb2, IPSET_ATTR_REVISION_MIN, min)) goto nla_put_failure; nlmsg_end(skb2, nlh2); pr_debug("Send TYPE, nlmsg_len: %u\n", nlh2->nlmsg_len); return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); nla_put_failure: nlmsg_cancel(skb2, nlh2); nlmsg_failure: kfree_skb(skb2); return -EMSGSIZE; } /* Get protocol version */ static const struct nla_policy ip_set_protocol_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, }; static int ip_set_protocol(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { struct sk_buff *skb2; struct nlmsghdr *nlh2; if (unlikely(!attr[IPSET_ATTR_PROTOCOL])) return -IPSET_ERR_PROTOCOL; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb2) return -ENOMEM; nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0, IPSET_CMD_PROTOCOL); if (!nlh2) goto nlmsg_failure; if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL)) goto nla_put_failure; if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL_MIN, IPSET_PROTOCOL_MIN)) goto nla_put_failure; nlmsg_end(skb2, nlh2); return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); nla_put_failure: nlmsg_cancel(skb2, nlh2); nlmsg_failure: kfree_skb(skb2); return -EMSGSIZE; } /* Get set by name or index, from userspace */ static int ip_set_byname(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { struct ip_set_net *inst = ip_set_pernet(info->net); struct sk_buff *skb2; struct nlmsghdr *nlh2; ip_set_id_t id = IPSET_INVALID_ID; const struct ip_set *set; if (unlikely(protocol_failed(attr) || !attr[IPSET_ATTR_SETNAME])) return -IPSET_ERR_PROTOCOL; set = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), &id); if (id == IPSET_INVALID_ID) return -ENOENT; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb2) return -ENOMEM; nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0, IPSET_CMD_GET_BYNAME); if (!nlh2) goto nlmsg_failure; if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, protocol(attr)) || nla_put_u8(skb2, IPSET_ATTR_FAMILY, set->family) || nla_put_net16(skb2, IPSET_ATTR_INDEX, htons(id))) goto nla_put_failure; nlmsg_end(skb2, nlh2); return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); nla_put_failure: nlmsg_cancel(skb2, nlh2); nlmsg_failure: kfree_skb(skb2); return -EMSGSIZE; } static const struct nla_policy ip_set_index_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, [IPSET_ATTR_INDEX] = { .type = NLA_U16 }, }; static int ip_set_byindex(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { struct ip_set_net *inst = ip_set_pernet(info->net); struct sk_buff *skb2; struct nlmsghdr *nlh2; ip_set_id_t id = IPSET_INVALID_ID; const struct ip_set *set; if (unlikely(protocol_failed(attr) || !attr[IPSET_ATTR_INDEX])) return -IPSET_ERR_PROTOCOL; id = ip_set_get_h16(attr[IPSET_ATTR_INDEX]); if (id >= inst->ip_set_max) return -ENOENT; set = ip_set(inst, id); if (set == NULL) return -ENOENT; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb2) return -ENOMEM; nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0, IPSET_CMD_GET_BYINDEX); if (!nlh2) goto nlmsg_failure; if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, protocol(attr)) || nla_put_string(skb2, IPSET_ATTR_SETNAME, set->name)) goto nla_put_failure; nlmsg_end(skb2, nlh2); return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); nla_put_failure: nlmsg_cancel(skb2, nlh2); nlmsg_failure: kfree_skb(skb2); return -EMSGSIZE; } static const struct nfnl_callback ip_set_netlink_subsys_cb[IPSET_MSG_MAX] = { [IPSET_CMD_NONE] = { .call = ip_set_none, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, }, [IPSET_CMD_CREATE] = { .call = ip_set_create, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_create_policy, }, [IPSET_CMD_DESTROY] = { .call = ip_set_destroy, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname_policy, }, [IPSET_CMD_FLUSH] = { .call = ip_set_flush, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname_policy, }, [IPSET_CMD_RENAME] = { .call = ip_set_rename, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname2_policy, }, [IPSET_CMD_SWAP] = { .call = ip_set_swap, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname2_policy, }, [IPSET_CMD_LIST] = { .call = ip_set_dump, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_dump_policy, }, [IPSET_CMD_SAVE] = { .call = ip_set_dump, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname_policy, }, [IPSET_CMD_ADD] = { .call = ip_set_uadd, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_adt_policy, }, [IPSET_CMD_DEL] = { .call = ip_set_udel, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_adt_policy, }, [IPSET_CMD_TEST] = { .call = ip_set_utest, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_adt_policy, }, [IPSET_CMD_HEADER] = { .call = ip_set_header, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname_policy, }, [IPSET_CMD_TYPE] = { .call = ip_set_type, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_type_policy, }, [IPSET_CMD_PROTOCOL] = { .call = ip_set_protocol, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_protocol_policy, }, [IPSET_CMD_GET_BYNAME] = { .call = ip_set_byname, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname_policy, }, [IPSET_CMD_GET_BYINDEX] = { .call = ip_set_byindex, .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_index_policy, }, }; static struct nfnetlink_subsystem ip_set_netlink_subsys __read_mostly = { .name = "ip_set", .subsys_id = NFNL_SUBSYS_IPSET, .cb_count = IPSET_MSG_MAX, .cb = ip_set_netlink_subsys_cb, }; /* Interface to iptables/ip6tables */ static int ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) { unsigned int *op; void *data; int copylen = *len, ret = 0; struct net *net = sock_net(sk); struct ip_set_net *inst = ip_set_pernet(net); if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (optval != SO_IP_SET) return -EBADF; if (*len < sizeof(unsigned int)) return -EINVAL; data = vmalloc(*len); if (!data) return -ENOMEM; if (copy_from_user(data, user, *len) != 0) { ret = -EFAULT; goto done; } op = data; if (*op < IP_SET_OP_VERSION) { /* Check the version at the beginning of operations */ struct ip_set_req_version *req_version = data; if (*len < sizeof(struct ip_set_req_version)) { ret = -EINVAL; goto done; } if (req_version->version < IPSET_PROTOCOL_MIN) { ret = -EPROTO; goto done; } } switch (*op) { case IP_SET_OP_VERSION: { struct ip_set_req_version *req_version = data; if (*len != sizeof(struct ip_set_req_version)) { ret = -EINVAL; goto done; } req_version->version = IPSET_PROTOCOL; if (copy_to_user(user, req_version, sizeof(struct ip_set_req_version))) ret = -EFAULT; goto done; } case IP_SET_OP_GET_BYNAME: { struct ip_set_req_get_set *req_get = data; ip_set_id_t id; if (*len != sizeof(struct ip_set_req_get_set)) { ret = -EINVAL; goto done; } req_get->set.name[IPSET_MAXNAMELEN - 1] = '\0'; nfnl_lock(NFNL_SUBSYS_IPSET); find_set_and_id(inst, req_get->set.name, &id); req_get->set.index = id; nfnl_unlock(NFNL_SUBSYS_IPSET); goto copy; } case IP_SET_OP_GET_FNAME: { struct ip_set_req_get_set_family *req_get = data; ip_set_id_t id; if (*len != sizeof(struct ip_set_req_get_set_family)) { ret = -EINVAL; goto done; } req_get->set.name[IPSET_MAXNAMELEN - 1] = '\0'; nfnl_lock(NFNL_SUBSYS_IPSET); find_set_and_id(inst, req_get->set.name, &id); req_get->set.index = id; if (id != IPSET_INVALID_ID) req_get->family = ip_set(inst, id)->family; nfnl_unlock(NFNL_SUBSYS_IPSET); goto copy; } case IP_SET_OP_GET_BYINDEX: { struct ip_set_req_get_set *req_get = data; struct ip_set *set; if (*len != sizeof(struct ip_set_req_get_set) || req_get->set.index >= inst->ip_set_max) { ret = -EINVAL; goto done; } nfnl_lock(NFNL_SUBSYS_IPSET); set = ip_set(inst, req_get->set.index); ret = strscpy(req_get->set.name, set ? set->name : "", IPSET_MAXNAMELEN); nfnl_unlock(NFNL_SUBSYS_IPSET); if (ret < 0) goto done; goto copy; } default: ret = -EBADMSG; goto done; } /* end of switch(op) */ copy: if (copy_to_user(user, data, copylen)) ret = -EFAULT; done: vfree(data); if (ret > 0) ret = 0; return ret; } static struct nf_sockopt_ops so_set __read_mostly = { .pf = PF_INET, .get_optmin = SO_IP_SET, .get_optmax = SO_IP_SET + 1, .get = ip_set_sockfn_get, .owner = THIS_MODULE, }; static int __net_init ip_set_net_init(struct net *net) { struct ip_set_net *inst = ip_set_pernet(net); struct ip_set **list; inst->ip_set_max = max_sets ? max_sets : CONFIG_IP_SET_MAX; if (inst->ip_set_max >= IPSET_INVALID_ID) inst->ip_set_max = IPSET_INVALID_ID - 1; list = kvcalloc(inst->ip_set_max, sizeof(struct ip_set *), GFP_KERNEL); if (!list) return -ENOMEM; inst->is_deleted = false; inst->is_destroyed = false; rcu_assign_pointer(inst->ip_set_list, list); return 0; } static void __net_exit ip_set_net_pre_exit(struct net *net) { struct ip_set_net *inst = ip_set_pernet(net); inst->is_deleted = true; /* flag for ip_set_nfnl_put */ } static void __net_exit ip_set_net_exit(struct net *net) { struct ip_set_net *inst = ip_set_pernet(net); _destroy_all_sets(inst); kvfree(rcu_dereference_protected(inst->ip_set_list, 1)); } static struct pernet_operations ip_set_net_ops = { .init = ip_set_net_init, .pre_exit = ip_set_net_pre_exit, .exit = ip_set_net_exit, .id = &ip_set_net_id, .size = sizeof(struct ip_set_net), }; static int __init ip_set_init(void) { int ret = register_pernet_subsys(&ip_set_net_ops); if (ret) { pr_err("ip_set: cannot register pernet_subsys.\n"); return ret; } ret = nfnetlink_subsys_register(&ip_set_netlink_subsys); if (ret != 0) { pr_err("ip_set: cannot register with nfnetlink.\n"); unregister_pernet_subsys(&ip_set_net_ops); return ret; } ret = nf_register_sockopt(&so_set); if (ret != 0) { pr_err("SO_SET registry failed: %d\n", ret); nfnetlink_subsys_unregister(&ip_set_netlink_subsys); unregister_pernet_subsys(&ip_set_net_ops); return ret; } return 0; } static void __exit ip_set_fini(void) { nf_unregister_sockopt(&so_set); nfnetlink_subsys_unregister(&ip_set_netlink_subsys); unregister_pernet_subsys(&ip_set_net_ops); /* Wait for call_rcu() in destroy */ rcu_barrier(); pr_debug("these are the famous last words\n"); } module_init(ip_set_init); module_exit(ip_set_fini); MODULE_DESCRIPTION("ip_set: protocol " __stringify(IPSET_PROTOCOL));
31 127 88 88 87 7 40 27 12 17 13 12 12 27 6 7 27 6 27 27 26 27 15 95 94 11 12 12 113 37 111 13 107 217 215 140 35 33 35 34 53 53 53 53 53 53 141 142 20 20 14 20 13 209 208 134 82 52 18 2665 2664 35 35 35 35 18 4 16 34 18 18 35 35 35 2125 1 2663 681 31 2649 2127 2126 2137 20 2655 2130 682 682 683 35 33 49 49 18 18 18 18 18 18 18 18 18 18 1 18 20 14 2 15 15 15 15 15 14 13 15 15 35 35 34 35 21 21 32 32 32 32 32 32 32 32 32 32 32 32 31 35 34 34 35 14 15 32 32 32 31 32 32 32 32 32 32 32 32 32 32 4 4 4 16 77 19 74 79 44 37 19 18 17 41 41 41 41 41 4 4 1 76 76 21 98 88 88 88 88 87 86 98 88 88 88 88 93 92 93 93 142 86 61 141 115 141 85 64 142 140 20 124 142 19 19 19 14 14 19 24 19 13 13 12 12 1 19 17 88 93 79 79 23 23 23 56 88 80 88 93 41 41 41 41 41 41 41 40 6 4 1 6 1 1 1 1 1 34 74 75 75 74 75 75 75 2 2 79 78 79 39 39 38 38 38 39 39 39 67 34 33 34 64 63 64 64 64 64 64 63 19 4 13 31 39 33 33 33 33 32 29 30 30 30 29 30 27 28 28 27 28 27 28 19 28 28 37 30 21 37 30 30 30 30 30 30 38 30 28 36 30 38 30 30 37 14 14 14 14 1 38 38 31 29 38 31 31 38 34 34 33 34 33 33 34 34 32 34 34 38 38 34 34 34 2 3 3 31 31 5 31 30 37 22 24 13 24 13 24 24 13 24 23 24 24 25 23 25 23 22 6 17 22 14 22 22 22 22 22 16 16 7 22 14 1 22 22 3 2 2 2 1 1 2 15 15 15 15 15 13 11 15 15 14 11 11 11 15 15 47 47 47 47 45 47 47 2 2 2 47 15 15 93 93 93 93 10 93 37 93 93 93 93 93 88 93 93 93 93 15 15 15 15 15 15 6 15 10 3 15 11 12 9 9 2 17 17 17 16 16 15 12 12 5 5 5 17 15 16 16 12 3 11 16 5 1 5 5 14 12 12 12 11 7 14 4 4 4 4 3 3 3 3 3 3 3 3 3 2 2 2 2 2 5 4 4 2 5 2 1 2 3 1 3 2 2 2 4 2 2 2 1 2 4 1 3 2 2 2 4 1 2 2 2 2 2 2 2 2 2 2 2 2 2 4 4 4 4 4 2 1 4 4 4 1 1 1 2 1 2 2 1 2 8 8 8 2 7 7 2 5 3 8 2 2 4 3 5 29 29 29 2 6 6 6 6 6 3 3 3 3 1 1 1 3 3 3 3 3 5 4 3 3 3 3 5 44 44 44 44 44 17 8 17 143 142 142 144 13 13 13 9 8 4 3 2 12 9 3 6 26 23 26 14 95 76 20 19 2 1 88 88 88 88 26 98 98 98 98 98 98 98 98 88 19 98 26 22 22 6 22 27 171 170 75 75 168 171 130 129 129 130 129 44 44 1 1 1 51 49 51 51 51 51 44 50 46 16 16 5 16 41 41 9 9 41 39 25 1 24 24 10 3 10 10 10 41 41 25 25 3 3 39 2 25 25 25 1 1 41 41 25 25 25 23 25 25 23 41 41 25 10 9 1 15 41 41 33 42 26 25 41 42 25 25 25 42 35 19 19 35 5 35 2 3 2 12 11 12 12 9 9 8 3 7 12 10 10 10 11 15 15 15 15 14 14 7 14 13 14 14 14 14 14 14 14 1 13 13 12 13 14 14 10 2 5 88 88 88 88 88 87 88 88 84 88 88 88 88 88 88 88 88 88 88 88 88 88 79 79 79 79 79 79 79 79 1 79 1 79 30 79 79 79 79 1 79 79 79 56 79 80 80 80 80 81 80 80 79 79 79 79 78 79 80 81 19 19 19 4 19 19 19 13 13 13 13 13 13 13 13 13 1 13 13 13 13 13 13 13 13 13 13 13 13 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 2 4 4 4 4 4 983 8 8 8 6 665 13 666 665 666 666 666 666 653 13 12 8 4 2 2 1 1 1 1 12 12 1 12 12 653 654 654 4 4 653 1 1 1 666 12 654 665 4 4 4 4 650 652 1 651 653 650 653 653 653 653 653 652 9 9 9 653 653 2098 2104 2101 1351 254 2095 2097 2107 2099 2108 39 12 12 9 12 8 8 8 8 8 8 4 4 3 3 3 3 3 3 4 1 1 1 1 2931 2926 2931 2931 2919 2925 2931 39 39 1026 1025 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 /* * Generic process-grouping system. * * Based originally on the cpuset system, extracted by Paul Menage * Copyright (C) 2006 Google, Inc * * Notifications support * Copyright (C) 2009 Nokia Corporation * Author: Kirill A. Shutemov * * Copyright notices from the original cpuset code: * -------------------------------------------------- * Copyright (C) 2003 BULL SA. * Copyright (C) 2004-2006 Silicon Graphics, Inc. * * Portions derived from Patrick Mochel's sysfs code. * sysfs is Copyright (c) 2001-3 Patrick Mochel * * 2003-10-10 Written by Simon Derr. * 2003-10-22 Updates by Stephen Hemminger. * 2004 May-July Rework by Paul Jackson. * --------------------------------------------------- * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of the Linux * distribution for more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "cgroup-internal.h" #include <linux/bpf-cgroup.h> #include <linux/cred.h> #include <linux/errno.h> #include <linux/init_task.h> #include <linux/kernel.h> #include <linux/magic.h> #include <linux/mutex.h> #include <linux/mount.h> #include <linux/pagemap.h> #include <linux/proc_fs.h> #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/percpu-rwsem.h> #include <linux/string.h> #include <linux/hashtable.h> #include <linux/idr.h> #include <linux/kthread.h> #include <linux/atomic.h> #include <linux/cpuset.h> #include <linux/proc_ns.h> #include <linux/nsproxy.h> #include <linux/file.h> #include <linux/fs_parser.h> #include <linux/sched/cputime.h> #include <linux/sched/deadline.h> #include <linux/psi.h> #include <linux/nstree.h> #include <linux/irq_work.h> #include <net/sock.h> #define CREATE_TRACE_POINTS #include <trace/events/cgroup.h> #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ MAX_CFTYPE_NAME + 2) /* let's not notify more than 100 times per second */ #define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100) /* * To avoid confusing the compiler (and generating warnings) with code * that attempts to access what would be a 0-element array (i.e. sized * to a potentially empty array when CGROUP_SUBSYS_COUNT == 0), this * constant expression can be added. */ #define CGROUP_HAS_SUBSYS_CONFIG (CGROUP_SUBSYS_COUNT > 0) /* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. * * css_set_lock protects task->cgroups pointer, the list of css_set * objects, and the chain of tasks off each css_set. * * These locks are exported if CONFIG_PROVE_RCU so that accessors in * cgroup.h can use them for lockdep annotations. */ DEFINE_MUTEX(cgroup_mutex); DEFINE_SPINLOCK(css_set_lock); #if (defined CONFIG_PROVE_RCU || defined CONFIG_LOCKDEP) EXPORT_SYMBOL_GPL(cgroup_mutex); EXPORT_SYMBOL_GPL(css_set_lock); #endif struct blocking_notifier_head cgroup_lifetime_notifier = BLOCKING_NOTIFIER_INIT(cgroup_lifetime_notifier); DEFINE_SPINLOCK(trace_cgroup_path_lock); char trace_cgroup_path[TRACE_CGROUP_PATH_LEN]; static bool cgroup_debug __read_mostly; /* * Protects cgroup_idr and css_idr so that IDs can be released without * grabbing cgroup_mutex. */ static DEFINE_SPINLOCK(cgroup_idr_lock); /* * Protects cgroup_file->kn for !self csses. It synchronizes notifications * against file removal/re-creation across css hiding. */ static DEFINE_SPINLOCK(cgroup_file_kn_lock); DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem); #define cgroup_assert_mutex_or_rcu_locked() \ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&cgroup_mutex), \ "cgroup_mutex or RCU read lock required"); /* * cgroup destruction makes heavy use of work items and there can be a lot * of concurrent destructions. Use a separate workqueue so that cgroup * destruction work items don't end up filling up max_active of system_percpu_wq * which may lead to deadlock. * * A cgroup destruction should enqueue work sequentially to: * cgroup_offline_wq: use for css offline work * cgroup_release_wq: use for css release work * cgroup_free_wq: use for free work * * Rationale for using separate workqueues: * The cgroup root free work may depend on completion of other css offline * operations. If all tasks were enqueued to a single workqueue, this could * create a deadlock scenario where: * - Free work waits for other css offline work to complete. * - But other css offline work is queued after free work in the same queue. * * Example deadlock scenario with single workqueue (cgroup_destroy_wq): * 1. umount net_prio * 2. net_prio root destruction enqueues work to cgroup_destroy_wq (CPUx) * 3. perf_event CSS A offline enqueues work to same cgroup_destroy_wq (CPUx) * 4. net_prio cgroup_destroy_root->cgroup_lock_and_drain_offline. * 5. net_prio root destruction blocks waiting for perf_event CSS A offline, * which can never complete as it's behind in the same queue and * workqueue's max_active is 1. */ static struct workqueue_struct *cgroup_offline_wq; static struct workqueue_struct *cgroup_release_wq; static struct workqueue_struct *cgroup_free_wq; /* generate an array of cgroup subsystem pointers */ #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, struct cgroup_subsys *cgroup_subsys[] = { #include <linux/cgroup_subsys.h> }; #undef SUBSYS /* array of cgroup subsystem names */ #define SUBSYS(_x) [_x ## _cgrp_id] = #_x, static const char *cgroup_subsys_name[] = { #include <linux/cgroup_subsys.h> }; #undef SUBSYS /* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */ #define SUBSYS(_x) \ DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key); \ DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key); \ EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key); \ EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key); #include <linux/cgroup_subsys.h> #undef SUBSYS #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key, static struct static_key_true *cgroup_subsys_enabled_key[] = { #include <linux/cgroup_subsys.h> }; #undef SUBSYS #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key, static struct static_key_true *cgroup_subsys_on_dfl_key[] = { #include <linux/cgroup_subsys.h> }; #undef SUBSYS static DEFINE_PER_CPU(struct css_rstat_cpu, root_rstat_cpu); static DEFINE_PER_CPU(struct cgroup_rstat_base_cpu, root_rstat_base_cpu); /* the default hierarchy */ struct cgroup_root cgrp_dfl_root = { .cgrp.self.rstat_cpu = &root_rstat_cpu, .cgrp.rstat_base_cpu = &root_rstat_base_cpu, }; EXPORT_SYMBOL_GPL(cgrp_dfl_root); /* * The default hierarchy always exists but is hidden until mounted for the * first time. This is for backward compatibility. */ bool cgrp_dfl_visible; /* some controllers are not supported in the default hierarchy */ static u16 cgrp_dfl_inhibit_ss_mask; /* some controllers are implicitly enabled on the default hierarchy */ static u16 cgrp_dfl_implicit_ss_mask; /* some controllers can be threaded on the default hierarchy */ static u16 cgrp_dfl_threaded_ss_mask; /* The list of hierarchy roots */ LIST_HEAD(cgroup_roots); static int cgroup_root_count; /* hierarchy ID allocation and mapping, protected by cgroup_mutex */ static DEFINE_IDR(cgroup_hierarchy_idr); /* * Assign a monotonically increasing serial number to csses. It guarantees * cgroups with bigger numbers are newer than those with smaller numbers. * Also, as csses are always appended to the parent's ->children list, it * guarantees that sibling csses are always sorted in the ascending serial * number order on the list. Protected by cgroup_mutex. */ static u64 css_serial_nr_next = 1; /* * These bitmasks identify subsystems with specific features to avoid * having to do iterative checks repeatedly. */ static u16 have_fork_callback __read_mostly; static u16 have_exit_callback __read_mostly; static u16 have_release_callback __read_mostly; static u16 have_canfork_callback __read_mostly; static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS); /* * Write protected by cgroup_mutex and write-lock of cgroup_threadgroup_rwsem, * read protected by either. * * Can only be turned on, but not turned off. */ bool cgroup_enable_per_threadgroup_rwsem __read_mostly; /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { .ns = NS_COMMON_INIT(init_cgroup_ns), .user_ns = &init_user_ns, .root_cset = &init_css_set, }; static struct file_system_type cgroup2_fs_type; static struct cftype cgroup_base_files[]; static struct cftype cgroup_psi_files[]; /* cgroup optional features */ enum cgroup_opt_features { #ifdef CONFIG_PSI OPT_FEATURE_PRESSURE, #endif OPT_FEATURE_COUNT }; static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = { #ifdef CONFIG_PSI "pressure", #endif }; static u16 cgroup_feature_disable_mask __read_mostly; static int cgroup_apply_control(struct cgroup *cgrp); static void cgroup_finalize_control(struct cgroup *cgrp, int ret); static void css_task_iter_skip(struct css_task_iter *it, struct task_struct *task); static int cgroup_destroy_locked(struct cgroup *cgrp); static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, struct cgroup_subsys *ss); static void css_release(struct percpu_ref *ref); static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype cfts[], bool is_add); static void cgroup_rt_init(void); #ifdef CONFIG_DEBUG_CGROUP_REF #define CGROUP_REF_FN_ATTRS noinline #define CGROUP_REF_EXPORT(fn) EXPORT_SYMBOL_GPL(fn); #include <linux/cgroup_refcnt.h> #endif /** * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID * @ssid: subsys ID of interest * * cgroup_subsys_enabled() can only be used with literal subsys names which * is fine for individual subsystems but unsuitable for cgroup core. This * is slower static_key_enabled() based test indexed by @ssid. */ bool cgroup_ssid_enabled(int ssid) { if (!CGROUP_HAS_SUBSYS_CONFIG) return false; return static_key_enabled(cgroup_subsys_enabled_key[ssid]); } /** * cgroup_on_dfl - test whether a cgroup is on the default hierarchy * @cgrp: the cgroup of interest * * The default hierarchy is the v2 interface of cgroup and this function * can be used to test whether a cgroup is on the default hierarchy for * cases where a subsystem should behave differently depending on the * interface version. * * List of changed behaviors: * * - Mount options "noprefix", "xattr", "clone_children", "release_agent" * and "name" are disallowed. * * - When mounting an existing superblock, mount options should match. * * - rename(2) is disallowed. * * - "tasks" is removed. Everything should be at process granularity. Use * "cgroup.procs" instead. * * - "cgroup.procs" is not sorted. pids will be unique unless they got * recycled in-between reads. * * - "release_agent" and "notify_on_release" are removed. Replacement * notification mechanism will be implemented. * * - "cgroup.clone_children" is removed. * * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup * and its descendants contain no task; otherwise, 1. The file also * generates kernfs notification which can be monitored through poll and * [di]notify when the value of the file changes. * * - cpuset: tasks will be kept in empty cpusets when hotplug happens and * take masks of ancestors with non-empty cpus/mems, instead of being * moved to an ancestor. * * - cpuset: a task can be moved into an empty cpuset, and again it takes * masks of ancestors. * * - blkcg: blk-throttle becomes properly hierarchical. */ bool cgroup_on_dfl(const struct cgroup *cgrp) { return cgrp->root == &cgrp_dfl_root; } /* IDR wrappers which synchronize using cgroup_idr_lock */ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask) { int ret; idr_preload(gfp_mask); spin_lock_bh(&cgroup_idr_lock); ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM); spin_unlock_bh(&cgroup_idr_lock); idr_preload_end(); return ret; } static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id) { void *ret; spin_lock_bh(&cgroup_idr_lock); ret = idr_replace(idr, ptr, id); spin_unlock_bh(&cgroup_idr_lock); return ret; } static void cgroup_idr_remove(struct idr *idr, int id) { spin_lock_bh(&cgroup_idr_lock); idr_remove(idr, id); spin_unlock_bh(&cgroup_idr_lock); } static bool cgroup_has_tasks(struct cgroup *cgrp) { return cgrp->nr_populated_csets; } static bool cgroup_is_threaded(struct cgroup *cgrp) { return cgrp->dom_cgrp != cgrp; } /* can @cgrp host both domain and threaded children? */ static bool cgroup_is_mixable(struct cgroup *cgrp) { /* * Root isn't under domain level resource control exempting it from * the no-internal-process constraint, so it can serve as a thread * root and a parent of resource domains at the same time. */ return !cgroup_parent(cgrp); } /* can @cgrp become a thread root? Should always be true for a thread root */ static bool cgroup_can_be_thread_root(struct cgroup *cgrp) { /* mixables don't care */ if (cgroup_is_mixable(cgrp)) return true; /* domain roots can't be nested under threaded */ if (cgroup_is_threaded(cgrp)) return false; /* can only have either domain or threaded children */ if (cgrp->nr_populated_domain_children) return false; /* and no domain controllers can be enabled */ if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask) return false; return true; } /* is @cgrp root of a threaded subtree? */ static bool cgroup_is_thread_root(struct cgroup *cgrp) { /* thread root should be a domain */ if (cgroup_is_threaded(cgrp)) return false; /* a domain w/ threaded children is a thread root */ if (cgrp->nr_threaded_children) return true; /* * A domain which has tasks and explicit threaded controllers * enabled is a thread root. */ if (cgroup_has_tasks(cgrp) && (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask)) return true; return false; } /* a domain which isn't connected to the root w/o brekage can't be used */ static bool cgroup_is_valid_domain(struct cgroup *cgrp) { /* the cgroup itself can be a thread root */ if (cgroup_is_threaded(cgrp)) return false; /* but the ancestors can't be unless mixable */ while ((cgrp = cgroup_parent(cgrp))) { if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp)) return false; if (cgroup_is_threaded(cgrp)) return false; } return true; } /* subsystems visibly enabled on a cgroup */ static u16 cgroup_control(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); u16 root_ss_mask = cgrp->root->subsys_mask; if (parent) { u16 ss_mask = parent->subtree_control; /* threaded cgroups can only have threaded controllers */ if (cgroup_is_threaded(cgrp)) ss_mask &= cgrp_dfl_threaded_ss_mask; return ss_mask; } if (cgroup_on_dfl(cgrp)) root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask | cgrp_dfl_implicit_ss_mask); return root_ss_mask; } /* subsystems enabled on a cgroup */ static u16 cgroup_ss_mask(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); if (parent) { u16 ss_mask = parent->subtree_ss_mask; /* threaded cgroups can only have threaded controllers */ if (cgroup_is_threaded(cgrp)) ss_mask &= cgrp_dfl_threaded_ss_mask; return ss_mask; } return cgrp->root->subsys_mask; } /** * cgroup_css - obtain a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest (%NULL returns @cgrp->self) * * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This * function must be called either under cgroup_mutex or rcu_read_lock() and * the caller is responsible for pinning the returned css if it wants to * keep accessing it outside the said locks. This function may return * %NULL if @cgrp doesn't have @subsys_id enabled. */ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { if (CGROUP_HAS_SUBSYS_CONFIG && ss) return rcu_dereference_check(cgrp->subsys[ss->id], lockdep_is_held(&cgroup_mutex)); else return &cgrp->self; } /** * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss * @cgrp: the cgroup of interest * @ss: the subsystem of interest (%NULL returns @cgrp->self) * * Similar to cgroup_css() but returns the effective css, which is defined * as the matching css of the nearest ancestor including self which has @ss * enabled. If @ss is associated with the hierarchy @cgrp is on, this * function is guaranteed to return non-NULL css. */ static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp, struct cgroup_subsys *ss) { lockdep_assert_held(&cgroup_mutex); if (!ss) return &cgrp->self; /* * This function is used while updating css associations and thus * can't test the csses directly. Test ss_mask. */ while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) { cgrp = cgroup_parent(cgrp); if (!cgrp) return NULL; } return cgroup_css(cgrp, ss); } /** * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest * * Find and get the effective css of @cgrp for @ss. The effective css is * defined as the matching css of the nearest ancestor including self which * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on, * the root css is returned, so this function always returns a valid css. * * The returned css is not guaranteed to be online, and therefore it is the * callers responsibility to try get a reference for it. */ struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; if (!CGROUP_HAS_SUBSYS_CONFIG) return NULL; do { css = cgroup_css(cgrp, ss); if (css) return css; cgrp = cgroup_parent(cgrp); } while (cgrp); return init_css_set.subsys[ss->id]; } /** * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest * * Find and get the effective css of @cgrp for @ss. The effective css is * defined as the matching css of the nearest ancestor including self which * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on, * the root css is returned, so this function always returns a valid css. * The returned css must be put using css_put(). */ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; if (!CGROUP_HAS_SUBSYS_CONFIG) return NULL; rcu_read_lock(); do { css = cgroup_css(cgrp, ss); if (css && css_tryget_online(css)) goto out_unlock; cgrp = cgroup_parent(cgrp); } while (cgrp); css = init_css_set.subsys[ss->id]; css_get(css); out_unlock: rcu_read_unlock(); return css; } EXPORT_SYMBOL_GPL(cgroup_get_e_css); static void cgroup_get_live(struct cgroup *cgrp) { WARN_ON_ONCE(cgroup_is_dead(cgrp)); cgroup_get(cgrp); } /** * __cgroup_task_count - count the number of tasks in a cgroup. The caller * is responsible for taking the css_set_lock. * @cgrp: the cgroup in question */ int __cgroup_task_count(const struct cgroup *cgrp) { int count = 0; struct cgrp_cset_link *link; lockdep_assert_held(&css_set_lock); list_for_each_entry(link, &cgrp->cset_links, cset_link) count += link->cset->nr_tasks; return count; } /** * cgroup_task_count - count the number of tasks in a cgroup. * @cgrp: the cgroup in question */ int cgroup_task_count(const struct cgroup *cgrp) { int count; spin_lock_irq(&css_set_lock); count = __cgroup_task_count(cgrp); spin_unlock_irq(&css_set_lock); return count; } static struct cgroup *kn_priv(struct kernfs_node *kn) { struct kernfs_node *parent; /* * The parent can not be replaced due to KERNFS_ROOT_INVARIANT_PARENT. * Therefore it is always safe to dereference this pointer outside of a * RCU section. */ parent = rcu_dereference_check(kn->__parent, kernfs_root_flags(kn) & KERNFS_ROOT_INVARIANT_PARENT); return parent->priv; } struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { struct cgroup *cgrp = kn_priv(of->kn); struct cftype *cft = of_cft(of); /* * This is open and unprotected implementation of cgroup_css(). * seq_css() is only called from a kernfs file operation which has * an active reference on the file. Because all the subsystem * files are drained before a css is disassociated with a cgroup, * the matching css from the cgroup's subsys table is guaranteed to * be and stay valid until the enclosing operation is complete. */ if (CGROUP_HAS_SUBSYS_CONFIG && cft->ss) return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); else return &cgrp->self; } EXPORT_SYMBOL_GPL(of_css); /** * for_each_css - iterate all css's of a cgroup * @css: the iteration cursor * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end * @cgrp: the target cgroup to iterate css's of * * Should be called under cgroup_mutex. */ #define for_each_css(css, ssid, cgrp) \ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ if (!((css) = rcu_dereference_check( \ (cgrp)->subsys[(ssid)], \ lockdep_is_held(&cgroup_mutex)))) { } \ else /** * do_each_subsys_mask - filter for_each_subsys with a bitmask * @ss: the iteration cursor * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end * @ss_mask: the bitmask * * The block will only run for cases where the ssid-th bit (1 << ssid) of * @ss_mask is set. */ #define do_each_subsys_mask(ss, ssid, ss_mask) do { \ unsigned long __ss_mask = (ss_mask); \ if (!CGROUP_HAS_SUBSYS_CONFIG) { \ (ssid) = 0; \ break; \ } \ for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) { \ (ss) = cgroup_subsys[ssid]; \ { #define while_each_subsys_mask() \ } \ } \ } while (false) /* iterate over child cgrps, lock should be held throughout iteration */ #define cgroup_for_each_live_child(child, cgrp) \ list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ if (({ lockdep_assert_held(&cgroup_mutex); \ cgroup_is_dead(child); })) \ ; \ else /* walk live descendants in pre order */ #define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \ css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \ if (({ lockdep_assert_held(&cgroup_mutex); \ (dsct) = (d_css)->cgroup; \ cgroup_is_dead(dsct); })) \ ; \ else /* walk live descendants in postorder */ #define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \ css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \ if (({ lockdep_assert_held(&cgroup_mutex); \ (dsct) = (d_css)->cgroup; \ cgroup_is_dead(dsct); })) \ ; \ else /* * The default css_set - used by init and its children prior to any * hierarchies being mounted. It contains a pointer to the root state * for each subsystem. Also used to anchor the list of css_sets. Not * reference-counted, to improve performance when child cgroups * haven't been created. */ struct css_set init_css_set = { .refcount = REFCOUNT_INIT(1), .dom_cset = &init_css_set, .tasks = LIST_HEAD_INIT(init_css_set.tasks), .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks), .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), .mg_src_preload_node = LIST_HEAD_INIT(init_css_set.mg_src_preload_node), .mg_dst_preload_node = LIST_HEAD_INIT(init_css_set.mg_dst_preload_node), .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), /* * The following field is re-initialized when this cset gets linked * in cgroup_init(). However, let's initialize the field * statically too so that the default cgroup can be accessed safely * early during boot. */ .dfl_cgrp = &cgrp_dfl_root.cgrp, }; static int css_set_count = 1; /* 1 for init_css_set */ static bool css_set_threaded(struct css_set *cset) { return cset->dom_cset != cset; } /** * css_set_populated - does a css_set contain any tasks? * @cset: target css_set * * css_set_populated() should be the same as !!cset->nr_tasks at steady * state. However, css_set_populated() can be called while a task is being * added to or removed from the linked list before the nr_tasks is * properly updated. Hence, we can't just look at ->nr_tasks here. */ static bool css_set_populated(struct css_set *cset) { lockdep_assert_held(&css_set_lock); return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks); } /** * cgroup_update_populated - update the populated count of a cgroup * @cgrp: the target cgroup * @populated: inc or dec populated count * * One of the css_sets associated with @cgrp is either getting its first * task or losing the last. Update @cgrp->nr_populated_* accordingly. The * count is propagated towards root so that a given cgroup's * nr_populated_children is zero iff none of its descendants contain any * tasks. * * @cgrp's interface file "cgroup.populated" is zero if both * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and * 1 otherwise. When the sum changes from or to zero, userland is notified * that the content of the interface file has changed. This can be used to * detect when @cgrp and its descendants become populated or empty. */ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) { struct cgroup *child = NULL; int adj = populated ? 1 : -1; lockdep_assert_held(&css_set_lock); do { bool was_populated = cgroup_is_populated(cgrp); if (!child) { cgrp->nr_populated_csets += adj; } else { if (cgroup_is_threaded(child)) cgrp->nr_populated_threaded_children += adj; else cgrp->nr_populated_domain_children += adj; } if (was_populated == cgroup_is_populated(cgrp)) break; cgroup1_check_for_release(cgrp); TRACE_CGROUP_PATH(notify_populated, cgrp, cgroup_is_populated(cgrp)); cgroup_file_notify(&cgrp->events_file); child = cgrp; cgrp = cgroup_parent(cgrp); } while (cgrp); } /** * css_set_update_populated - update populated state of a css_set * @cset: target css_set * @populated: whether @cset is populated or depopulated * * @cset is either getting the first task or losing the last. Update the * populated counters of all associated cgroups accordingly. */ static void css_set_update_populated(struct css_set *cset, bool populated) { struct cgrp_cset_link *link; lockdep_assert_held(&css_set_lock); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) cgroup_update_populated(link->cgrp, populated); } /* * @task is leaving, advance task iterators which are pointing to it so * that they can resume at the next position. Advancing an iterator might * remove it from the list, use safe walk. See css_task_iter_skip() for * details. */ static void css_set_skip_task_iters(struct css_set *cset, struct task_struct *task) { struct css_task_iter *it, *pos; list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node) css_task_iter_skip(it, task); } /** * css_set_move_task - move a task from one css_set to another * @task: task being moved * @from_cset: css_set @task currently belongs to (may be NULL) * @to_cset: new css_set @task is being moved to (may be NULL) * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks * * Move @task from @from_cset to @to_cset. If @task didn't belong to any * css_set, @from_cset can be NULL. If @task is being disassociated * instead of moved, @to_cset can be NULL. * * This function automatically handles populated counter updates and * css_task_iter adjustments but the caller is responsible for managing * @from_cset and @to_cset's reference counts. */ static void css_set_move_task(struct task_struct *task, struct css_set *from_cset, struct css_set *to_cset, bool use_mg_tasks) { lockdep_assert_held(&css_set_lock); if (to_cset && !css_set_populated(to_cset)) css_set_update_populated(to_cset, true); if (from_cset) { WARN_ON_ONCE(list_empty(&task->cg_list)); css_set_skip_task_iters(from_cset, task); list_del_init(&task->cg_list); if (!css_set_populated(from_cset)) css_set_update_populated(from_cset, false); } else { WARN_ON_ONCE(!list_empty(&task->cg_list)); } if (to_cset) { /* * We are synchronized through cgroup_threadgroup_rwsem * against PF_EXITING setting such that we can't race * against cgroup_task_dead()/cgroup_task_free() dropping * the css_set. */ WARN_ON_ONCE(task->flags & PF_EXITING); cgroup_move_task(task, to_cset); list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks : &to_cset->tasks); } } /* * hash table for cgroup groups. This improves the performance to find * an existing css_set. This hash doesn't (currently) take into * account cgroups in empty hierarchies. */ #define CSS_SET_HASH_BITS 7 static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS); static unsigned long css_set_hash(struct cgroup_subsys_state **css) { unsigned long key = 0UL; struct cgroup_subsys *ss; int i; for_each_subsys(ss, i) key += (unsigned long)css[i]; key = (key >> 16) ^ key; return key; } void put_css_set_locked(struct css_set *cset) { struct cgrp_cset_link *link, *tmp_link; struct cgroup_subsys *ss; int ssid; lockdep_assert_held(&css_set_lock); if (!refcount_dec_and_test(&cset->refcount)) return; WARN_ON_ONCE(!list_empty(&cset->threaded_csets)); /* This css_set is dead. Unlink it and release cgroup and css refs */ for_each_subsys(ss, ssid) { list_del(&cset->e_cset_node[ssid]); css_put(cset->subsys[ssid]); } hash_del(&cset->hlist); css_set_count--; list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) { list_del(&link->cset_link); list_del(&link->cgrp_link); if (cgroup_parent(link->cgrp)) cgroup_put(link->cgrp); kfree(link); } if (css_set_threaded(cset)) { list_del(&cset->threaded_csets_node); put_css_set_locked(cset->dom_cset); } kfree_rcu(cset, rcu_head); } /** * compare_css_sets - helper function for find_existing_css_set(). * @cset: candidate css_set being tested * @old_cset: existing css_set for a task * @new_cgrp: cgroup that's being entered by the task * @template: desired set of css pointers in css_set (pre-calculated) * * Returns true if "cset" matches "old_cset" except for the hierarchy * which "new_cgrp" belongs to, for which it should match "new_cgrp". */ static bool compare_css_sets(struct css_set *cset, struct css_set *old_cset, struct cgroup *new_cgrp, struct cgroup_subsys_state *template[]) { struct cgroup *new_dfl_cgrp; struct list_head *l1, *l2; /* * On the default hierarchy, there can be csets which are * associated with the same set of cgroups but different csses. * Let's first ensure that csses match. */ if (memcmp(template, cset->subsys, sizeof(cset->subsys))) return false; /* @cset's domain should match the default cgroup's */ if (cgroup_on_dfl(new_cgrp)) new_dfl_cgrp = new_cgrp; else new_dfl_cgrp = old_cset->dfl_cgrp; if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp) return false; /* * Compare cgroup pointers in order to distinguish between * different cgroups in hierarchies. As different cgroups may * share the same effective css, this comparison is always * necessary. */ l1 = &cset->cgrp_links; l2 = &old_cset->cgrp_links; while (1) { struct cgrp_cset_link *link1, *link2; struct cgroup *cgrp1, *cgrp2; l1 = l1->next; l2 = l2->next; /* See if we reached the end - both lists are equal length. */ if (l1 == &cset->cgrp_links) { BUG_ON(l2 != &old_cset->cgrp_links); break; } else { BUG_ON(l2 == &old_cset->cgrp_links); } /* Locate the cgroups associated with these links. */ link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link); link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link); cgrp1 = link1->cgrp; cgrp2 = link2->cgrp; /* Hierarchies should be linked in the same order. */ BUG_ON(cgrp1->root != cgrp2->root); /* * If this hierarchy is the hierarchy of the cgroup * that's changing, then we need to check that this * css_set points to the new cgroup; if it's any other * hierarchy, then this css_set should point to the * same cgroup as the old css_set. */ if (cgrp1->root == new_cgrp->root) { if (cgrp1 != new_cgrp) return false; } else { if (cgrp1 != cgrp2) return false; } } return true; } /** * find_existing_css_set - init css array and find the matching css_set * @old_cset: the css_set that we're using before the cgroup transition * @cgrp: the cgroup that we're moving into * @template: out param for the new set of csses, should be clear on entry */ static struct css_set *find_existing_css_set(struct css_set *old_cset, struct cgroup *cgrp, struct cgroup_subsys_state **template) { struct cgroup_root *root = cgrp->root; struct cgroup_subsys *ss; struct css_set *cset; unsigned long key; int i; /* * Build the set of subsystem state objects that we want to see in the * new css_set. While subsystems can change globally, the entries here * won't change, so no need for locking. */ for_each_subsys(ss, i) { if (root->subsys_mask & (1UL << i)) { /* * @ss is in this hierarchy, so we want the * effective css from @cgrp. */ template[i] = cgroup_e_css_by_mask(cgrp, ss); } else { /* * @ss is not in this hierarchy, so we don't want * to change the css. */ template[i] = old_cset->subsys[i]; } } key = css_set_hash(template); hash_for_each_possible(css_set_table, cset, hlist, key) { if (!compare_css_sets(cset, old_cset, cgrp, template)) continue; /* This css_set matches what we need */ return cset; } /* No existing cgroup group matched */ return NULL; } static void free_cgrp_cset_links(struct list_head *links_to_free) { struct cgrp_cset_link *link, *tmp_link; list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) { list_del(&link->cset_link); kfree(link); } } /** * allocate_cgrp_cset_links - allocate cgrp_cset_links * @count: the number of links to allocate * @tmp_links: list_head the allocated links are put on * * Allocate @count cgrp_cset_link structures and chain them on @tmp_links * through ->cset_link. Returns 0 on success or -errno. */ static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links) { struct cgrp_cset_link *link; int i; INIT_LIST_HEAD(tmp_links); for (i = 0; i < count; i++) { link = kzalloc(sizeof(*link), GFP_KERNEL); if (!link) { free_cgrp_cset_links(tmp_links); return -ENOMEM; } list_add(&link->cset_link, tmp_links); } return 0; } /** * link_css_set - a helper function to link a css_set to a cgroup * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links() * @cset: the css_set to be linked * @cgrp: the destination cgroup */ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, struct cgroup *cgrp) { struct cgrp_cset_link *link; BUG_ON(list_empty(tmp_links)); if (cgroup_on_dfl(cgrp)) cset->dfl_cgrp = cgrp; link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); link->cset = cset; link->cgrp = cgrp; /* * Always add links to the tail of the lists so that the lists are * in chronological order. */ list_move_tail(&link->cset_link, &cgrp->cset_links); list_add_tail(&link->cgrp_link, &cset->cgrp_links); if (cgroup_parent(cgrp)) cgroup_get_live(cgrp); } /** * find_css_set - return a new css_set with one cgroup updated * @old_cset: the baseline css_set * @cgrp: the cgroup to be updated * * Return a new css_set that's equivalent to @old_cset, but with @cgrp * substituted into the appropriate hierarchy. */ static struct css_set *find_css_set(struct css_set *old_cset, struct cgroup *cgrp) { struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { }; struct css_set *cset; struct list_head tmp_links; struct cgrp_cset_link *link; struct cgroup_subsys *ss; unsigned long key; int ssid; lockdep_assert_held(&cgroup_mutex); /* First see if we already have a cgroup group that matches * the desired set */ spin_lock_irq(&css_set_lock); cset = find_existing_css_set(old_cset, cgrp, template); if (cset) get_css_set(cset); spin_unlock_irq(&css_set_lock); if (cset) return cset; cset = kzalloc(sizeof(*cset), GFP_KERNEL); if (!cset) return NULL; /* Allocate all the cgrp_cset_link objects that we'll need */ if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) { kfree(cset); return NULL; } refcount_set(&cset->refcount, 1); cset->dom_cset = cset; INIT_LIST_HEAD(&cset->tasks); INIT_LIST_HEAD(&cset->mg_tasks); INIT_LIST_HEAD(&cset->dying_tasks); INIT_LIST_HEAD(&cset->task_iters); INIT_LIST_HEAD(&cset->threaded_csets); INIT_HLIST_NODE(&cset->hlist); INIT_LIST_HEAD(&cset->cgrp_links); INIT_LIST_HEAD(&cset->mg_src_preload_node); INIT_LIST_HEAD(&cset->mg_dst_preload_node); INIT_LIST_HEAD(&cset->mg_node); /* Copy the set of subsystem state objects generated in * find_existing_css_set() */ memcpy(cset->subsys, template, sizeof(cset->subsys)); spin_lock_irq(&css_set_lock); /* Add reference counts and links from the new css_set. */ list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; if (c->root == cgrp->root) c = cgrp; link_css_set(&tmp_links, cset, c); } BUG_ON(!list_empty(&tmp_links)); css_set_count++; /* Add @cset to the hash table */ key = css_set_hash(cset->subsys); hash_add(css_set_table, &cset->hlist, key); for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cset->subsys[ssid]; list_add_tail(&cset->e_cset_node[ssid], &css->cgroup->e_csets[ssid]); css_get(css); } spin_unlock_irq(&css_set_lock); /* * If @cset should be threaded, look up the matching dom_cset and * link them up. We first fully initialize @cset then look for the * dom_cset. It's simpler this way and safe as @cset is guaranteed * to stay empty until we return. */ if (cgroup_is_threaded(cset->dfl_cgrp)) { struct css_set *dcset; dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp); if (!dcset) { put_css_set(cset); return NULL; } spin_lock_irq(&css_set_lock); cset->dom_cset = dcset; list_add_tail(&cset->threaded_csets_node, &dcset->threaded_csets); spin_unlock_irq(&css_set_lock); } return cset; } struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) { struct cgroup *root_cgrp = kernfs_root_to_node(kf_root)->priv; return root_cgrp->root; } void cgroup_favor_dynmods(struct cgroup_root *root, bool favor) { bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS; /* * see the comment above CGRP_ROOT_FAVOR_DYNMODS definition. * favordynmods can flip while task is between * cgroup_threadgroup_change_begin() and end(), so down_write global * cgroup_threadgroup_rwsem to synchronize them. * * Once cgroup_enable_per_threadgroup_rwsem is enabled, holding * cgroup_threadgroup_rwsem doesn't exlude tasks between * cgroup_thread_group_change_begin() and end() and thus it's unsafe to * turn off. As the scenario is unlikely, simply disallow disabling once * enabled and print out a warning. */ percpu_down_write(&cgroup_threadgroup_rwsem); if (favor && !favoring) { cgroup_enable_per_threadgroup_rwsem = true; rcu_sync_enter(&cgroup_threadgroup_rwsem.rss); root->flags |= CGRP_ROOT_FAVOR_DYNMODS; } else if (!favor && favoring) { if (cgroup_enable_per_threadgroup_rwsem) pr_warn_once("cgroup favordynmods: per threadgroup rwsem mechanism can't be disabled\n"); rcu_sync_exit(&cgroup_threadgroup_rwsem.rss); root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS; } percpu_up_write(&cgroup_threadgroup_rwsem); } static int cgroup_init_root_id(struct cgroup_root *root) { int id; lockdep_assert_held(&cgroup_mutex); id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL); if (id < 0) return id; root->hierarchy_id = id; return 0; } static void cgroup_exit_root_id(struct cgroup_root *root) { lockdep_assert_held(&cgroup_mutex); idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); } void cgroup_free_root(struct cgroup_root *root) { kfree_rcu(root, rcu); } static void cgroup_destroy_root(struct cgroup_root *root) { struct cgroup *cgrp = &root->cgrp; struct cgrp_cset_link *link, *tmp_link; int ret; trace_cgroup_destroy_root(root); cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); BUG_ON(atomic_read(&root->nr_cgrps)); BUG_ON(!list_empty(&cgrp->self.children)); ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier, CGROUP_LIFETIME_OFFLINE, cgrp); WARN_ON_ONCE(notifier_to_errno(ret)); /* Rebind all subsystems back to the default hierarchy */ WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask)); /* * Release all the links from cset_links to this hierarchy's * root cgroup */ spin_lock_irq(&css_set_lock); list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { list_del(&link->cset_link); list_del(&link->cgrp_link); kfree(link); } spin_unlock_irq(&css_set_lock); WARN_ON_ONCE(list_empty(&root->root_list)); list_del_rcu(&root->root_list); cgroup_root_count--; if (!have_favordynmods) cgroup_favor_dynmods(root, false); cgroup_exit_root_id(root); cgroup_unlock(); kernfs_destroy_root(root->kf_root); cgroup_free_root(root); } /* * Returned cgroup is without refcount but it's valid as long as cset pins it. */ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root) { struct cgroup *res_cgroup = NULL; if (cset == &init_css_set) { res_cgroup = &root->cgrp; } else if (root == &cgrp_dfl_root) { res_cgroup = cset->dfl_cgrp; } else { struct cgrp_cset_link *link; lockdep_assert_held(&css_set_lock); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; if (c->root == root) { res_cgroup = c; break; } } } /* * If cgroup_mutex is not held, the cgrp_cset_link will be freed * before we remove the cgroup root from the root_list. Consequently, * when accessing a cgroup root, the cset_link may have already been * freed, resulting in a NULL res_cgroup. However, by holding the * cgroup_mutex, we ensure that res_cgroup can't be NULL. * If we don't hold cgroup_mutex in the caller, we must do the NULL * check. */ return res_cgroup; } /* * look up cgroup associated with current task's cgroup namespace on the * specified hierarchy */ static struct cgroup * current_cgns_cgroup_from_root(struct cgroup_root *root) { struct cgroup *res = NULL; struct css_set *cset; lockdep_assert_held(&css_set_lock); rcu_read_lock(); cset = current->nsproxy->cgroup_ns->root_cset; res = __cset_cgroup_from_root(cset, root); rcu_read_unlock(); /* * The namespace_sem is held by current, so the root cgroup can't * be umounted. Therefore, we can ensure that the res is non-NULL. */ WARN_ON_ONCE(!res); return res; } /* * Look up cgroup associated with current task's cgroup namespace on the default * hierarchy. * * Unlike current_cgns_cgroup_from_root(), this doesn't need locks: * - Internal rcu_read_lock is unnecessary because we don't dereference any rcu * pointers. * - css_set_lock is not needed because we just read cset->dfl_cgrp. * - As a bonus returned cgrp is pinned with the current because it cannot * switch cgroup_ns asynchronously. */ static struct cgroup *current_cgns_cgroup_dfl(void) { struct css_set *cset; if (current->nsproxy) { cset = current->nsproxy->cgroup_ns->root_cset; return __cset_cgroup_from_root(cset, &cgrp_dfl_root); } else { /* * NOTE: This function may be called from bpf_cgroup_from_id() * on a task which has already passed exit_nsproxy_namespaces() * and nsproxy == NULL. Fall back to cgrp_dfl_root which will * make all cgroups visible for lookups. */ return &cgrp_dfl_root.cgrp; } } /* look up cgroup associated with given css_set on the specified hierarchy */ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root) { lockdep_assert_held(&css_set_lock); return __cset_cgroup_from_root(cset, root); } /* * Return the cgroup for "task" from the given hierarchy. Must be * called with css_set_lock held to prevent task's groups from being modified. * Must be called with either cgroup_mutex or rcu read lock to prevent the * cgroup root from being destroyed. */ struct cgroup *task_cgroup_from_root(struct task_struct *task, struct cgroup_root *root) { /* * No need to lock the task - since we hold css_set_lock the * task can't change groups. */ return cset_cgroup_from_root(task_css_set(task), root); } /* * A task must hold cgroup_mutex to modify cgroups. * * Any task can increment and decrement the count field without lock. * So in general, code holding cgroup_mutex can't rely on the count * field not changing. However, if the count goes to zero, then only * cgroup_attach_task() can increment it again. Because a count of zero * means that no tasks are currently attached, therefore there is no * way a task attached to that cgroup can fork (the other way to * increment the count). So code holding cgroup_mutex can safely * assume that if the count is zero, it will stay zero. Similarly, if * a task holds cgroup_mutex on a cgroup with zero count, it * knows that the cgroup won't be removed, as cgroup_rmdir() * needs that mutex. * * A cgroup can only be deleted if both its 'count' of using tasks * is zero, and its list of 'children' cgroups is empty. Since all * tasks in the system use _some_ cgroup, and since there is always at * least one task in the system (init, pid == 1), therefore, root cgroup * always has either children cgroups and/or using tasks. So we don't * need a special hack to ensure that root cgroup cannot be deleted. * * P.S. One more locking exception. RCU is used to guard the * update of a tasks cgroup pointer by cgroup_attach_task() */ static struct kernfs_syscall_ops cgroup_kf_syscall_ops; static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, char *buf) { struct cgroup_subsys *ss = cft->ss; if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { const char *dbg = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : ""; snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s", dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, cft->name); } else { strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX); } return buf; } /** * cgroup_file_mode - deduce file mode of a control file * @cft: the control file in question * * S_IRUGO for read, S_IWUSR for write. */ static umode_t cgroup_file_mode(const struct cftype *cft) { umode_t mode = 0; if (cft->read_u64 || cft->read_s64 || cft->seq_show) mode |= S_IRUGO; if (cft->write_u64 || cft->write_s64 || cft->write) { if (cft->flags & CFTYPE_WORLD_WRITABLE) mode |= S_IWUGO; else mode |= S_IWUSR; } return mode; } /** * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask * @subtree_control: the new subtree_control mask to consider * @this_ss_mask: available subsystems * * On the default hierarchy, a subsystem may request other subsystems to be * enabled together through its ->depends_on mask. In such cases, more * subsystems than specified in "cgroup.subtree_control" may be enabled. * * This function calculates which subsystems need to be enabled if * @subtree_control is to be applied while restricted to @this_ss_mask. */ static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask) { u16 cur_ss_mask = subtree_control; struct cgroup_subsys *ss; int ssid; lockdep_assert_held(&cgroup_mutex); cur_ss_mask |= cgrp_dfl_implicit_ss_mask; while (true) { u16 new_ss_mask = cur_ss_mask; do_each_subsys_mask(ss, ssid, cur_ss_mask) { new_ss_mask |= ss->depends_on; } while_each_subsys_mask(); /* * Mask out subsystems which aren't available. This can * happen only if some depended-upon subsystems were bound * to non-default hierarchies. */ new_ss_mask &= this_ss_mask; if (new_ss_mask == cur_ss_mask) break; cur_ss_mask = new_ss_mask; } return cur_ss_mask; } /** * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods * @kn: the kernfs_node being serviced * * This helper undoes cgroup_kn_lock_live() and should be invoked before * the method finishes if locking succeeded. Note that once this function * returns the cgroup returned by cgroup_kn_lock_live() may become * inaccessible any time. If the caller intends to continue to access the * cgroup, it should pin it before invoking this function. */ void cgroup_kn_unlock(struct kernfs_node *kn) { struct cgroup *cgrp; if (kernfs_type(kn) == KERNFS_DIR) cgrp = kn->priv; else cgrp = kn_priv(kn); cgroup_unlock(); kernfs_unbreak_active_protection(kn); cgroup_put(cgrp); } /** * cgroup_kn_lock_live - locking helper for cgroup kernfs methods * @kn: the kernfs_node being serviced * @drain_offline: perform offline draining on the cgroup * * This helper is to be used by a cgroup kernfs method currently servicing * @kn. It breaks the active protection, performs cgroup locking and * verifies that the associated cgroup is alive. Returns the cgroup if * alive; otherwise, %NULL. A successful return should be undone by a * matching cgroup_kn_unlock() invocation. If @drain_offline is %true, the * cgroup is drained of offlining csses before return. * * Any cgroup kernfs method implementation which requires locking the * associated cgroup should use this helper. It avoids nesting cgroup * locking under kernfs active protection and allows all kernfs operations * including self-removal. */ struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline) { struct cgroup *cgrp; if (kernfs_type(kn) == KERNFS_DIR) cgrp = kn->priv; else cgrp = kn_priv(kn); /* * We're gonna grab cgroup_mutex which nests outside kernfs * active_ref. cgroup liveliness check alone provides enough * protection against removal. Ensure @cgrp stays accessible and * break the active_ref protection. */ if (!cgroup_tryget(cgrp)) return NULL; kernfs_break_active_protection(kn); if (drain_offline) cgroup_lock_and_drain_offline(cgrp); else cgroup_lock(); if (!cgroup_is_dead(cgrp)) return cgrp; cgroup_kn_unlock(kn); return NULL; } static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) { char name[CGROUP_FILE_NAME_MAX]; lockdep_assert_held(&cgroup_mutex); if (cft->file_offset) { struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss); struct cgroup_file *cfile = (void *)css + cft->file_offset; spin_lock_irq(&cgroup_file_kn_lock); cfile->kn = NULL; spin_unlock_irq(&cgroup_file_kn_lock); timer_delete_sync(&cfile->notify_timer); } kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); } /** * css_clear_dir - remove subsys files in a cgroup directory * @css: target css */ static void css_clear_dir(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; struct cftype *cfts; if (!(css->flags & CSS_VISIBLE)) return; css->flags &= ~CSS_VISIBLE; if (css_is_self(css)) { if (cgroup_on_dfl(cgrp)) { cgroup_addrm_files(css, cgrp, cgroup_base_files, false); if (cgroup_psi_enabled()) cgroup_addrm_files(css, cgrp, cgroup_psi_files, false); } else { cgroup_addrm_files(css, cgrp, cgroup1_base_files, false); } } else { list_for_each_entry(cfts, &css->ss->cfts, node) cgroup_addrm_files(css, cgrp, cfts, false); } } /** * css_populate_dir - create subsys files in a cgroup directory * @css: target css * * On failure, no file is added. */ static int css_populate_dir(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; struct cftype *cfts, *failed_cfts; int ret; if (css->flags & CSS_VISIBLE) return 0; if (css_is_self(css)) { if (cgroup_on_dfl(cgrp)) { ret = cgroup_addrm_files(css, cgrp, cgroup_base_files, true); if (ret < 0) return ret; if (cgroup_psi_enabled()) { ret = cgroup_addrm_files(css, cgrp, cgroup_psi_files, true); if (ret < 0) { cgroup_addrm_files(css, cgrp, cgroup_base_files, false); return ret; } } } else { ret = cgroup_addrm_files(css, cgrp, cgroup1_base_files, true); if (ret < 0) return ret; } } else { list_for_each_entry(cfts, &css->ss->cfts, node) { ret = cgroup_addrm_files(css, cgrp, cfts, true); if (ret < 0) { failed_cfts = cfts; goto err; } } } css->flags |= CSS_VISIBLE; return 0; err: list_for_each_entry(cfts, &css->ss->cfts, node) { if (cfts == failed_cfts) break; cgroup_addrm_files(css, cgrp, cfts, false); } return ret; } int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) { struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; int ssid, ret; u16 dfl_disable_ss_mask = 0; lockdep_assert_held(&cgroup_mutex); do_each_subsys_mask(ss, ssid, ss_mask) { /* * If @ss has non-root csses attached to it, can't move. * If @ss is an implicit controller, it is exempt from this * rule and can be stolen. */ if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) && !ss->implicit_on_dfl) return -EBUSY; /* can't move between two non-dummy roots either */ if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root) return -EBUSY; /* * Collect ssid's that need to be disabled from default * hierarchy. */ if (ss->root == &cgrp_dfl_root) dfl_disable_ss_mask |= 1 << ssid; } while_each_subsys_mask(); if (dfl_disable_ss_mask) { struct cgroup *scgrp = &cgrp_dfl_root.cgrp; /* * Controllers from default hierarchy that need to be rebound * are all disabled together in one go. */ cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask; WARN_ON(cgroup_apply_control(scgrp)); cgroup_finalize_control(scgrp, 0); } do_each_subsys_mask(ss, ssid, ss_mask) { struct cgroup_root *src_root = ss->root; struct cgroup *scgrp = &src_root->cgrp; struct cgroup_subsys_state *css = cgroup_css(scgrp, ss); struct css_set *cset, *cset_pos; struct css_task_iter *it; WARN_ON(!css || cgroup_css(dcgrp, ss)); if (src_root != &cgrp_dfl_root) { /* disable from the source */ src_root->subsys_mask &= ~(1 << ssid); WARN_ON(cgroup_apply_control(scgrp)); cgroup_finalize_control(scgrp, 0); } /* rebind */ RCU_INIT_POINTER(scgrp->subsys[ssid], NULL); rcu_assign_pointer(dcgrp->subsys[ssid], css); ss->root = dst_root; spin_lock_irq(&css_set_lock); css->cgroup = dcgrp; WARN_ON(!list_empty(&dcgrp->e_csets[ss->id])); list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id], e_cset_node[ss->id]) { list_move_tail(&cset->e_cset_node[ss->id], &dcgrp->e_csets[ss->id]); /* * all css_sets of scgrp together in same order to dcgrp, * patch in-flight iterators to preserve correct iteration. * since the iterator is always advanced right away and * finished when it->cset_pos meets it->cset_head, so only * update it->cset_head is enough here. */ list_for_each_entry(it, &cset->task_iters, iters_node) if (it->cset_head == &scgrp->e_csets[ss->id]) it->cset_head = &dcgrp->e_csets[ss->id]; } spin_unlock_irq(&css_set_lock); /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; if (dst_root == &cgrp_dfl_root) { static_branch_enable(cgroup_subsys_on_dfl_key[ssid]); } else { dcgrp->subtree_control |= 1 << ssid; static_branch_disable(cgroup_subsys_on_dfl_key[ssid]); } ret = cgroup_apply_control(dcgrp); if (ret) pr_warn("partial failure to rebind %s controller (err=%d)\n", ss->name, ret); if (ss->bind) ss->bind(css); } while_each_subsys_mask(); kernfs_activate(dcgrp->kn); return 0; } int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, struct kernfs_root *kf_root) { int len = 0; char *buf = NULL; struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root); struct cgroup *ns_cgroup; buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) return -ENOMEM; spin_lock_irq(&css_set_lock); ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot); len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX); spin_unlock_irq(&css_set_lock); if (len == -E2BIG) len = -ERANGE; else if (len > 0) { seq_escape(sf, buf, " \t\n\\"); len = 0; } kfree(buf); return len; } enum cgroup2_param { Opt_nsdelegate, Opt_favordynmods, Opt_memory_localevents, Opt_memory_recursiveprot, Opt_memory_hugetlb_accounting, Opt_pids_localevents, nr__cgroup2_params }; static const struct fs_parameter_spec cgroup2_fs_parameters[] = { fsparam_flag("nsdelegate", Opt_nsdelegate), fsparam_flag("favordynmods", Opt_favordynmods), fsparam_flag("memory_localevents", Opt_memory_localevents), fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot), fsparam_flag("memory_hugetlb_accounting", Opt_memory_hugetlb_accounting), fsparam_flag("pids_localevents", Opt_pids_localevents), {} }; static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); struct fs_parse_result result; int opt; opt = fs_parse(fc, cgroup2_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_nsdelegate: ctx->flags |= CGRP_ROOT_NS_DELEGATE; return 0; case Opt_favordynmods: ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; return 0; case Opt_memory_localevents: ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; return 0; case Opt_memory_recursiveprot: ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT; return 0; case Opt_memory_hugetlb_accounting: ctx->flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; return 0; case Opt_pids_localevents: ctx->flags |= CGRP_ROOT_PIDS_LOCAL_EVENTS; return 0; } return -EINVAL; } struct cgroup_of_peak *of_peak(struct kernfs_open_file *of) { struct cgroup_file_ctx *ctx = of->priv; return &ctx->peak; } static void apply_cgroup_root_flags(unsigned int root_flags) { if (current->nsproxy->cgroup_ns == &init_cgroup_ns) { if (root_flags & CGRP_ROOT_NS_DELEGATE) cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE; else cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE; cgroup_favor_dynmods(&cgrp_dfl_root, root_flags & CGRP_ROOT_FAVOR_DYNMODS); if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; else cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS; if (root_flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT) cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT; else cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT; if (root_flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING) cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; else cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; if (root_flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) cgrp_dfl_root.flags |= CGRP_ROOT_PIDS_LOCAL_EVENTS; else cgrp_dfl_root.flags &= ~CGRP_ROOT_PIDS_LOCAL_EVENTS; } } static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root) { if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) seq_puts(seq, ",nsdelegate"); if (cgrp_dfl_root.flags & CGRP_ROOT_FAVOR_DYNMODS) seq_puts(seq, ",favordynmods"); if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) seq_puts(seq, ",memory_localevents"); if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT) seq_puts(seq, ",memory_recursiveprot"); if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING) seq_puts(seq, ",memory_hugetlb_accounting"); if (cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) seq_puts(seq, ",pids_localevents"); return 0; } static int cgroup_reconfigure(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); apply_cgroup_root_flags(ctx->flags); return 0; } static void init_cgroup_housekeeping(struct cgroup *cgrp) { struct cgroup_subsys *ss; int ssid; INIT_LIST_HEAD(&cgrp->self.sibling); INIT_LIST_HEAD(&cgrp->self.children); INIT_LIST_HEAD(&cgrp->cset_links); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); cgrp->self.cgroup = cgrp; cgrp->self.flags |= CSS_ONLINE; cgrp->dom_cgrp = cgrp; cgrp->max_descendants = INT_MAX; cgrp->max_depth = INT_MAX; prev_cputime_init(&cgrp->prev_cputime); for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); #ifdef CONFIG_CGROUP_BPF for (int i = 0; i < ARRAY_SIZE(cgrp->bpf.revisions); i++) cgrp->bpf.revisions[i] = 1; #endif init_waitqueue_head(&cgrp->offline_waitq); INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent); } void init_cgroup_root(struct cgroup_fs_context *ctx) { struct cgroup_root *root = ctx->root; struct cgroup *cgrp = &root->cgrp; INIT_LIST_HEAD_RCU(&root->root_list); atomic_set(&root->nr_cgrps, 1); cgrp->root = root; init_cgroup_housekeeping(cgrp); /* DYNMODS must be modified through cgroup_favor_dynmods() */ root->flags = ctx->flags & ~CGRP_ROOT_FAVOR_DYNMODS; if (ctx->release_agent) strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX); if (ctx->name) strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN); if (ctx->cpuset_clone_children) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; struct kernfs_syscall_ops *kf_sops; struct css_set *cset; int i, ret; lockdep_assert_held(&cgroup_mutex); ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, GFP_KERNEL); if (ret) goto out; /* * We're accessing css_set_count without locking css_set_lock here, * but that's OK - it can only be increased by someone holding * cgroup_lock, and that's us. Later rebinding may disable * controllers on the default hierarchy and thus create new csets, * which can't be more than the existing ones. Allocate 2x. */ ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links); if (ret) goto cancel_ref; ret = cgroup_init_root_id(root); if (ret) goto cancel_ref; kf_sops = root == &cgrp_dfl_root ? &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops; root->kf_root = kernfs_create_root(kf_sops, KERNFS_ROOT_CREATE_DEACTIVATED | KERNFS_ROOT_SUPPORT_EXPORTOP | KERNFS_ROOT_SUPPORT_USER_XATTR | KERNFS_ROOT_INVARIANT_PARENT, root_cgrp); if (IS_ERR(root->kf_root)) { ret = PTR_ERR(root->kf_root); goto exit_root_id; } root_cgrp->kn = kernfs_root_to_node(root->kf_root); WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1); root_cgrp->ancestors[0] = root_cgrp; ret = css_populate_dir(&root_cgrp->self); if (ret) goto destroy_root; ret = css_rstat_init(&root_cgrp->self); if (ret) goto destroy_root; ret = rebind_subsystems(root, ss_mask); if (ret) goto exit_stats; ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier, CGROUP_LIFETIME_ONLINE, root_cgrp); WARN_ON_ONCE(notifier_to_errno(ret)); trace_cgroup_setup_root(root); /* * There must be no failure case after here, since rebinding takes * care of subsystems' refcounts, which are explicitly dropped in * the failure exit path. */ list_add_rcu(&root->root_list, &cgroup_roots); cgroup_root_count++; /* * Link the root cgroup in this hierarchy into all the css_set * objects. */ spin_lock_irq(&css_set_lock); hash_for_each(css_set_table, i, cset, hlist) { link_css_set(&tmp_links, cset, root_cgrp); if (css_set_populated(cset)) cgroup_update_populated(root_cgrp, true); } spin_unlock_irq(&css_set_lock); BUG_ON(!list_empty(&root_cgrp->self.children)); BUG_ON(atomic_read(&root->nr_cgrps) != 1); ret = 0; goto out; exit_stats: css_rstat_exit(&root_cgrp->self); destroy_root: kernfs_destroy_root(root->kf_root); root->kf_root = NULL; exit_root_id: cgroup_exit_root_id(root); cancel_ref: percpu_ref_exit(&root_cgrp->self.refcnt); out: free_cgrp_cset_links(&tmp_links); return ret; } int cgroup_do_get_tree(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); int ret; ctx->kfc.root = ctx->root->kf_root; if (fc->fs_type == &cgroup2_fs_type) ctx->kfc.magic = CGROUP2_SUPER_MAGIC; else ctx->kfc.magic = CGROUP_SUPER_MAGIC; ret = kernfs_get_tree(fc); /* * In non-init cgroup namespace, instead of root cgroup's dentry, * we return the dentry corresponding to the cgroupns->root_cgrp. */ if (!ret && ctx->ns != &init_cgroup_ns) { struct dentry *nsdentry; struct super_block *sb = fc->root->d_sb; struct cgroup *cgrp; cgroup_lock(); spin_lock_irq(&css_set_lock); cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root); spin_unlock_irq(&css_set_lock); cgroup_unlock(); nsdentry = kernfs_node_dentry(cgrp->kn, sb); dput(fc->root); if (IS_ERR(nsdentry)) { deactivate_locked_super(sb); ret = PTR_ERR(nsdentry); nsdentry = NULL; } fc->root = nsdentry; } if (!ctx->kfc.new_sb_created) cgroup_put(&ctx->root->cgrp); return ret; } /* * Destroy a cgroup filesystem context. */ static void cgroup_fs_context_free(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); kfree(ctx->name); kfree(ctx->release_agent); put_cgroup_ns(ctx->ns); kernfs_free_fs_context(fc); kfree(ctx); } static int cgroup_get_tree(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); int ret; WRITE_ONCE(cgrp_dfl_visible, true); cgroup_get_live(&cgrp_dfl_root.cgrp); ctx->root = &cgrp_dfl_root; ret = cgroup_do_get_tree(fc); if (!ret) apply_cgroup_root_flags(ctx->flags); return ret; } static const struct fs_context_operations cgroup_fs_context_ops = { .free = cgroup_fs_context_free, .parse_param = cgroup2_parse_param, .get_tree = cgroup_get_tree, .reconfigure = cgroup_reconfigure, }; static const struct fs_context_operations cgroup1_fs_context_ops = { .free = cgroup_fs_context_free, .parse_param = cgroup1_parse_param, .get_tree = cgroup1_get_tree, .reconfigure = cgroup1_reconfigure, }; /* * Initialise the cgroup filesystem creation/reconfiguration context. Notably, * we select the namespace we're going to use. */ static int cgroup_init_fs_context(struct fs_context *fc) { struct cgroup_fs_context *ctx; ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->ns = current->nsproxy->cgroup_ns; get_cgroup_ns(ctx->ns); fc->fs_private = &ctx->kfc; if (fc->fs_type == &cgroup2_fs_type) fc->ops = &cgroup_fs_context_ops; else fc->ops = &cgroup1_fs_context_ops; put_user_ns(fc->user_ns); fc->user_ns = get_user_ns(ctx->ns->user_ns); fc->global = true; if (have_favordynmods) ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; return 0; } static void cgroup_kill_sb(struct super_block *sb) { struct kernfs_root *kf_root = kernfs_root_from_sb(sb); struct cgroup_root *root = cgroup_root_from_kf(kf_root); /* * If @root doesn't have any children, start killing it. * This prevents new mounts by disabling percpu_ref_tryget_live(). * * And don't kill the default root. */ if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root && !percpu_ref_is_dying(&root->cgrp.self.refcnt)) percpu_ref_kill(&root->cgrp.self.refcnt); cgroup_put(&root->cgrp); kernfs_kill_sb(sb); } struct file_system_type cgroup_fs_type = { .name = "cgroup", .init_fs_context = cgroup_init_fs_context, .parameters = cgroup1_fs_parameters, .kill_sb = cgroup_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; static struct file_system_type cgroup2_fs_type = { .name = "cgroup2", .init_fs_context = cgroup_init_fs_context, .parameters = cgroup2_fs_parameters, .kill_sb = cgroup_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; #ifdef CONFIG_CPUSETS_V1 enum cpuset_param { Opt_cpuset_v2_mode, }; static const struct fs_parameter_spec cpuset_fs_parameters[] = { fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode), {} }; static int cpuset_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); struct fs_parse_result result; int opt; opt = fs_parse(fc, cpuset_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_cpuset_v2_mode: ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE; return 0; } return -EINVAL; } static const struct fs_context_operations cpuset_fs_context_ops = { .get_tree = cgroup1_get_tree, .free = cgroup_fs_context_free, .parse_param = cpuset_parse_param, }; /* * This is ugly, but preserves the userspace API for existing cpuset * users. If someone tries to mount the "cpuset" filesystem, we * silently switch it to mount "cgroup" instead */ static int cpuset_init_fs_context(struct fs_context *fc) { char *agent = kstrdup("/sbin/cpuset_release_agent", GFP_USER); struct cgroup_fs_context *ctx; int err; err = cgroup_init_fs_context(fc); if (err) { kfree(agent); return err; } fc->ops = &cpuset_fs_context_ops; ctx = cgroup_fc2context(fc); ctx->subsys_mask = 1 << cpuset_cgrp_id; ctx->flags |= CGRP_ROOT_NOPREFIX; ctx->release_agent = agent; get_filesystem(&cgroup_fs_type); put_filesystem(fc->fs_type); fc->fs_type = &cgroup_fs_type; return 0; } static struct file_system_type cpuset_fs_type = { .name = "cpuset", .init_fs_context = cpuset_init_fs_context, .parameters = cpuset_fs_parameters, .fs_flags = FS_USERNS_MOUNT, }; #endif int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns) { struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); } int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns) { int ret; cgroup_lock(); spin_lock_irq(&css_set_lock); ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns); spin_unlock_irq(&css_set_lock); cgroup_unlock(); return ret; } EXPORT_SYMBOL_GPL(cgroup_path_ns); /** * cgroup_attach_lock - Lock for ->attach() * @lock_mode: whether acquire and acquire which rwsem * @tsk: thread group to lock * * cgroup migration sometimes needs to stabilize threadgroups against forks and * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach() * implementations (e.g. cpuset), also need to disable CPU hotplug. * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can * lead to deadlocks. * * Bringing up a CPU may involve creating and destroying tasks which requires * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while * write-locking threadgroup_rwsem, the locking order is reversed and we end up * waiting for an on-going CPU hotplug operation which in turn is waiting for * the threadgroup_rwsem to be released to create new tasks. For more details: * * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu * * Resolve the situation by always acquiring cpus_read_lock() before optionally * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that * CPU hotplug is disabled on entry. * * When favordynmods is enabled, take per threadgroup rwsem to reduce overhead * on dynamic cgroup modifications. see the comment above * CGRP_ROOT_FAVOR_DYNMODS definition. * * tsk is not NULL only when writing to cgroup.procs. */ void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode, struct task_struct *tsk) { cpus_read_lock(); switch (lock_mode) { case CGRP_ATTACH_LOCK_NONE: break; case CGRP_ATTACH_LOCK_GLOBAL: percpu_down_write(&cgroup_threadgroup_rwsem); break; case CGRP_ATTACH_LOCK_PER_THREADGROUP: down_write(&tsk->signal->cgroup_threadgroup_rwsem); break; default: pr_warn("cgroup: Unexpected attach lock mode."); break; } } /** * cgroup_attach_unlock - Undo cgroup_attach_lock() * @lock_mode: whether release and release which rwsem * @tsk: thread group to lock */ void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode, struct task_struct *tsk) { switch (lock_mode) { case CGRP_ATTACH_LOCK_NONE: break; case CGRP_ATTACH_LOCK_GLOBAL: percpu_up_write(&cgroup_threadgroup_rwsem); break; case CGRP_ATTACH_LOCK_PER_THREADGROUP: up_write(&tsk->signal->cgroup_threadgroup_rwsem); break; default: pr_warn("cgroup: Unexpected attach lock mode."); break; } cpus_read_unlock(); } /** * cgroup_migrate_add_task - add a migration target task to a migration context * @task: target task * @mgctx: target migration context * * Add @task, which is a migration target, to @mgctx->tset. This function * becomes noop if @task doesn't need to be migrated. @task's css_set * should have been added as a migration source and @task->cg_list will be * moved from the css_set's tasks list to mg_tasks one. */ static void cgroup_migrate_add_task(struct task_struct *task, struct cgroup_mgctx *mgctx) { struct css_set *cset; lockdep_assert_held(&css_set_lock); /* @task either already exited or can't exit until the end */ if (task->flags & PF_EXITING) return; /* cgroup_threadgroup_rwsem protects racing against forks */ WARN_ON_ONCE(list_empty(&task->cg_list)); cset = task_css_set(task); if (!cset->mg_src_cgrp) return; mgctx->tset.nr_tasks++; list_move_tail(&task->cg_list, &cset->mg_tasks); if (list_empty(&cset->mg_node)) list_add_tail(&cset->mg_node, &mgctx->tset.src_csets); if (list_empty(&cset->mg_dst_cset->mg_node)) list_add_tail(&cset->mg_dst_cset->mg_node, &mgctx->tset.dst_csets); } /** * cgroup_taskset_first - reset taskset and return the first task * @tset: taskset of interest * @dst_cssp: output variable for the destination css * * @tset iteration is initialized and the first task is returned. */ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp) { tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node); tset->cur_task = NULL; return cgroup_taskset_next(tset, dst_cssp); } /** * cgroup_taskset_next - iterate to the next task in taskset * @tset: taskset of interest * @dst_cssp: output variable for the destination css * * Return the next task in @tset. Iteration must have been initialized * with cgroup_taskset_first(). */ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp) { struct css_set *cset = tset->cur_cset; struct task_struct *task = tset->cur_task; while (CGROUP_HAS_SUBSYS_CONFIG && &cset->mg_node != tset->csets) { if (!task) task = list_first_entry(&cset->mg_tasks, struct task_struct, cg_list); else task = list_next_entry(task, cg_list); if (&task->cg_list != &cset->mg_tasks) { tset->cur_cset = cset; tset->cur_task = task; /* * This function may be called both before and * after cgroup_migrate_execute(). The two cases * can be distinguished by looking at whether @cset * has its ->mg_dst_cset set. */ if (cset->mg_dst_cset) *dst_cssp = cset->mg_dst_cset->subsys[tset->ssid]; else *dst_cssp = cset->subsys[tset->ssid]; return task; } cset = list_next_entry(cset, mg_node); task = NULL; } return NULL; } /** * cgroup_migrate_execute - migrate a taskset * @mgctx: migration context * * Migrate tasks in @mgctx as setup by migration preparation functions. * This function fails iff one of the ->can_attach callbacks fails and * guarantees that either all or none of the tasks in @mgctx are migrated. * @mgctx is consumed regardless of success. */ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) { struct cgroup_taskset *tset = &mgctx->tset; struct cgroup_subsys *ss; struct task_struct *task, *tmp_task; struct css_set *cset, *tmp_cset; int ssid, failed_ssid, ret; /* check that we can legitimately attach to the cgroup */ if (tset->nr_tasks) { do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ss->can_attach) { tset->ssid = ssid; ret = ss->can_attach(tset); if (ret) { failed_ssid = ssid; goto out_cancel_attach; } } } while_each_subsys_mask(); } /* * Now that we're guaranteed success, proceed to move all tasks to * the new cgroup. There are no failure cases after here, so this * is the commit point. */ spin_lock_irq(&css_set_lock); list_for_each_entry(cset, &tset->src_csets, mg_node) { list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) { struct css_set *from_cset = task_css_set(task); struct css_set *to_cset = cset->mg_dst_cset; get_css_set(to_cset); to_cset->nr_tasks++; css_set_move_task(task, from_cset, to_cset, true); from_cset->nr_tasks--; /* * If the source or destination cgroup is frozen, * the task might require to change its state. */ cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp, to_cset->dfl_cgrp); put_css_set_locked(from_cset); } } spin_unlock_irq(&css_set_lock); /* * Migration is committed, all target tasks are now on dst_csets. * Nothing is sensitive to fork() after this point. Notify * controllers that migration is complete. */ tset->csets = &tset->dst_csets; if (tset->nr_tasks) { do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ss->attach) { tset->ssid = ssid; ss->attach(tset); } } while_each_subsys_mask(); } ret = 0; goto out_release_tset; out_cancel_attach: if (tset->nr_tasks) { do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ssid == failed_ssid) break; if (ss->cancel_attach) { tset->ssid = ssid; ss->cancel_attach(tset); } } while_each_subsys_mask(); } out_release_tset: spin_lock_irq(&css_set_lock); list_splice_init(&tset->dst_csets, &tset->src_csets); list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) { list_splice_tail_init(&cset->mg_tasks, &cset->tasks); list_del_init(&cset->mg_node); } spin_unlock_irq(&css_set_lock); /* * Re-initialize the cgroup_taskset structure in case it is reused * again in another cgroup_migrate_add_task()/cgroup_migrate_execute() * iteration. */ tset->nr_tasks = 0; tset->csets = &tset->src_csets; return ret; } /** * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination * @dst_cgrp: destination cgroup to test * * On the default hierarchy, except for the mixable, (possible) thread root * and threaded cgroups, subtree_control must be zero for migration * destination cgroups with tasks so that child cgroups don't compete * against tasks. */ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp) { /* v1 doesn't have any restriction */ if (!cgroup_on_dfl(dst_cgrp)) return 0; /* verify @dst_cgrp can host resources */ if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp)) return -EOPNOTSUPP; /* * If @dst_cgrp is already or can become a thread root or is * threaded, it doesn't matter. */ if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp)) return 0; /* apply no-internal-process constraint */ if (dst_cgrp->subtree_control) return -EBUSY; return 0; } /** * cgroup_migrate_finish - cleanup after attach * @mgctx: migration context * * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See * those functions for details. */ void cgroup_migrate_finish(struct cgroup_mgctx *mgctx) { struct css_set *cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets, mg_src_preload_node) { cset->mg_src_cgrp = NULL; cset->mg_dst_cgrp = NULL; cset->mg_dst_cset = NULL; list_del_init(&cset->mg_src_preload_node); put_css_set_locked(cset); } list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets, mg_dst_preload_node) { cset->mg_src_cgrp = NULL; cset->mg_dst_cgrp = NULL; cset->mg_dst_cset = NULL; list_del_init(&cset->mg_dst_preload_node); put_css_set_locked(cset); } spin_unlock_irq(&css_set_lock); } /** * cgroup_migrate_add_src - add a migration source css_set * @src_cset: the source css_set to add * @dst_cgrp: the destination cgroup * @mgctx: migration context * * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin * @src_cset and add it to @mgctx->src_csets, which should later be cleaned * up by cgroup_migrate_finish(). * * This function may be called without holding cgroup_threadgroup_rwsem * even if the target is a process. Threads may be created and destroyed * but as long as cgroup_mutex is not dropped, no new css_set can be put * into play and the preloaded css_sets are guaranteed to cover all * migrations. */ void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, struct cgroup_mgctx *mgctx) { struct cgroup *src_cgrp; lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&css_set_lock); /* * If ->dead, @src_set is associated with one or more dead cgroups * and doesn't contain any migratable tasks. Ignore it early so * that the rest of migration path doesn't get confused by it. */ if (src_cset->dead) return; if (!list_empty(&src_cset->mg_src_preload_node)) return; src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); WARN_ON(src_cset->mg_src_cgrp); WARN_ON(src_cset->mg_dst_cgrp); WARN_ON(!list_empty(&src_cset->mg_tasks)); WARN_ON(!list_empty(&src_cset->mg_node)); src_cset->mg_src_cgrp = src_cgrp; src_cset->mg_dst_cgrp = dst_cgrp; get_css_set(src_cset); list_add_tail(&src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets); } /** * cgroup_migrate_prepare_dst - prepare destination css_sets for migration * @mgctx: migration context * * Tasks are about to be moved and all the source css_sets have been * preloaded to @mgctx->preloaded_src_csets. This function looks up and * pins all destination css_sets, links each to its source, and append them * to @mgctx->preloaded_dst_csets. * * This function must be called after cgroup_migrate_add_src() has been * called on each migration source css_set. After migration is performed * using cgroup_migrate(), cgroup_migrate_finish() must be called on * @mgctx. */ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) { struct css_set *src_cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); /* look up the dst cset for each src cset and link it to src */ list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets, mg_src_preload_node) { struct css_set *dst_cset; struct cgroup_subsys *ss; int ssid; dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp); if (!dst_cset) return -ENOMEM; WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); /* * If src cset equals dst, it's noop. Drop the src. * cgroup_migrate() will skip the cset too. Note that we * can't handle src == dst as some nodes are used by both. */ if (src_cset == dst_cset) { src_cset->mg_src_cgrp = NULL; src_cset->mg_dst_cgrp = NULL; list_del_init(&src_cset->mg_src_preload_node); put_css_set(src_cset); put_css_set(dst_cset); continue; } src_cset->mg_dst_cset = dst_cset; if (list_empty(&dst_cset->mg_dst_preload_node)) list_add_tail(&dst_cset->mg_dst_preload_node, &mgctx->preloaded_dst_csets); else put_css_set(dst_cset); for_each_subsys(ss, ssid) if (src_cset->subsys[ssid] != dst_cset->subsys[ssid]) mgctx->ss_mask |= 1 << ssid; } return 0; } /** * cgroup_migrate - migrate a process or task to a cgroup * @leader: the leader of the process or the task to migrate * @threadgroup: whether @leader points to the whole process or a single task * @mgctx: migration context * * Migrate a process or task denoted by @leader. If migrating a process, * the caller must be holding cgroup_threadgroup_rwsem. The caller is also * responsible for invoking cgroup_migrate_add_src() and * cgroup_migrate_prepare_dst() on the targets before invoking this * function and following up with cgroup_migrate_finish(). * * As long as a controller's ->can_attach() doesn't fail, this function is * guaranteed to succeed. This means that, excluding ->can_attach() * failure, when migrating multiple targets, the success or failure can be * decided for all targets by invoking group_migrate_prepare_dst() before * actually starting migrating. */ int cgroup_migrate(struct task_struct *leader, bool threadgroup, struct cgroup_mgctx *mgctx) { struct task_struct *task; /* * The following thread iteration should be inside an RCU critical * section to prevent tasks from being freed while taking the snapshot. * spin_lock_irq() implies RCU critical section here. */ spin_lock_irq(&css_set_lock); task = leader; do { cgroup_migrate_add_task(task, mgctx); if (!threadgroup) break; } while_each_thread(leader, task); spin_unlock_irq(&css_set_lock); return cgroup_migrate_execute(mgctx); } /** * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup * @dst_cgrp: the cgroup to attach to * @leader: the task or the leader of the threadgroup to be attached * @threadgroup: attach the whole threadgroup? * * Call holding cgroup_mutex and cgroup_threadgroup_rwsem. */ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup) { DEFINE_CGROUP_MGCTX(mgctx); struct task_struct *task; int ret = 0; /* look up all src csets */ spin_lock_irq(&css_set_lock); task = leader; do { cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx); if (!threadgroup) break; } while_each_thread(leader, task); spin_unlock_irq(&css_set_lock); /* prepare dst csets and commit */ ret = cgroup_migrate_prepare_dst(&mgctx); if (!ret) ret = cgroup_migrate(leader, threadgroup, &mgctx); cgroup_migrate_finish(&mgctx); if (!ret) TRACE_CGROUP_PATH(attach_task, dst_cgrp, leader, threadgroup); return ret; } struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, enum cgroup_attach_lock_mode *lock_mode) { struct task_struct *tsk; pid_t pid; if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return ERR_PTR(-EINVAL); retry_find_task: rcu_read_lock(); if (pid) { tsk = find_task_by_vpid(pid); if (!tsk) { tsk = ERR_PTR(-ESRCH); goto out_unlock_rcu; } } else { tsk = current; } if (threadgroup) tsk = tsk->group_leader; /* * kthreads may acquire PF_NO_SETAFFINITY during initialization. * If userland migrates such a kthread to a non-root cgroup, it can * become trapped in a cpuset, or RT kthread may be born in a * cgroup with no rt_runtime allocated. Just say no. */ if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) { tsk = ERR_PTR(-EINVAL); goto out_unlock_rcu; } get_task_struct(tsk); rcu_read_unlock(); /* * If we migrate a single thread, we don't care about threadgroup * stability. If the thread is `current`, it won't exit(2) under our * hands or change PID through exec(2). We exclude * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write callers * by cgroup_mutex. Therefore, we can skip the global lock. */ lockdep_assert_held(&cgroup_mutex); if (pid || threadgroup) { if (cgroup_enable_per_threadgroup_rwsem) *lock_mode = CGRP_ATTACH_LOCK_PER_THREADGROUP; else *lock_mode = CGRP_ATTACH_LOCK_GLOBAL; } else { *lock_mode = CGRP_ATTACH_LOCK_NONE; } cgroup_attach_lock(*lock_mode, tsk); if (threadgroup) { if (!thread_group_leader(tsk)) { /* * A race with de_thread from another thread's exec() * may strip us of our leadership. If this happens, * throw this task away and try again. */ cgroup_attach_unlock(*lock_mode, tsk); put_task_struct(tsk); goto retry_find_task; } } return tsk; out_unlock_rcu: rcu_read_unlock(); return tsk; } void cgroup_procs_write_finish(struct task_struct *task, enum cgroup_attach_lock_mode lock_mode) { cgroup_attach_unlock(lock_mode, task); /* release reference from cgroup_procs_write_start() */ put_task_struct(task); } static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) { struct cgroup_subsys *ss; bool printed = false; int ssid; do_each_subsys_mask(ss, ssid, ss_mask) { if (printed) seq_putc(seq, ' '); seq_puts(seq, ss->name); printed = true; } while_each_subsys_mask(); if (printed) seq_putc(seq, '\n'); } /* show controllers which are enabled from the parent */ static int cgroup_controllers_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; cgroup_print_ss_mask(seq, cgroup_control(cgrp)); return 0; } /* show controllers which are enabled for a given cgroup's children */ static int cgroup_subtree_control_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; cgroup_print_ss_mask(seq, cgrp->subtree_control); return 0; } /** * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy * @cgrp: root of the subtree to update csses for * * @cgrp's control masks have changed and its subtree's css associations * need to be updated accordingly. This function looks up all css_sets * which are attached to the subtree, creates the matching updated css_sets * and migrates the tasks to the new ones. */ static int cgroup_update_dfl_csses(struct cgroup *cgrp) { DEFINE_CGROUP_MGCTX(mgctx); struct cgroup_subsys_state *d_css; struct cgroup *dsct; struct css_set *src_cset; enum cgroup_attach_lock_mode lock_mode; bool has_tasks; int ret; lockdep_assert_held(&cgroup_mutex); /* look up all csses currently attached to @cgrp's subtree */ spin_lock_irq(&css_set_lock); cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { struct cgrp_cset_link *link; /* * As cgroup_update_dfl_csses() is only called by * cgroup_apply_control(). The csses associated with the * given cgrp will not be affected by changes made to * its subtree_control file. We can skip them. */ if (dsct == cgrp) continue; list_for_each_entry(link, &dsct->cset_links, cset_link) cgroup_migrate_add_src(link->cset, dsct, &mgctx); } spin_unlock_irq(&css_set_lock); /* * We need to write-lock threadgroup_rwsem while migrating tasks. * However, if there are no source csets for @cgrp, changing its * controllers isn't gonna produce any task migrations and the * write-locking can be skipped safely. */ has_tasks = !list_empty(&mgctx.preloaded_src_csets); if (has_tasks) lock_mode = CGRP_ATTACH_LOCK_GLOBAL; else lock_mode = CGRP_ATTACH_LOCK_NONE; cgroup_attach_lock(lock_mode, NULL); /* NULL dst indicates self on default hierarchy */ ret = cgroup_migrate_prepare_dst(&mgctx); if (ret) goto out_finish; spin_lock_irq(&css_set_lock); list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_src_preload_node) { struct task_struct *task, *ntask; /* all tasks in src_csets need to be migrated */ list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list) cgroup_migrate_add_task(task, &mgctx); } spin_unlock_irq(&css_set_lock); ret = cgroup_migrate_execute(&mgctx); out_finish: cgroup_migrate_finish(&mgctx); cgroup_attach_unlock(lock_mode, NULL); return ret; } /** * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses * @cgrp: root of the target subtree * * Because css offlining is asynchronous, userland may try to re-enable a * controller while the previous css is still around. This function grabs * cgroup_mutex and drains the previous css instances of @cgrp's subtree. */ void cgroup_lock_and_drain_offline(struct cgroup *cgrp) __acquires(&cgroup_mutex) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; struct cgroup_subsys *ss; int ssid; restart: cgroup_lock(); cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); DEFINE_WAIT(wait); if (!css || !percpu_ref_is_dying(&css->refcnt)) continue; cgroup_get_live(dsct); prepare_to_wait(&dsct->offline_waitq, &wait, TASK_UNINTERRUPTIBLE); cgroup_unlock(); schedule(); finish_wait(&dsct->offline_waitq, &wait); cgroup_put(dsct); goto restart; } } } /** * cgroup_save_control - save control masks and dom_cgrp of a subtree * @cgrp: root of the target subtree * * Save ->subtree_control, ->subtree_ss_mask and ->dom_cgrp to the * respective old_ prefixed fields for @cgrp's subtree including @cgrp * itself. */ static void cgroup_save_control(struct cgroup *cgrp) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { dsct->old_subtree_control = dsct->subtree_control; dsct->old_subtree_ss_mask = dsct->subtree_ss_mask; dsct->old_dom_cgrp = dsct->dom_cgrp; } } /** * cgroup_propagate_control - refresh control masks of a subtree * @cgrp: root of the target subtree * * For @cgrp and its subtree, ensure ->subtree_ss_mask matches * ->subtree_control and propagate controller availability through the * subtree so that descendants don't have unavailable controllers enabled. */ static void cgroup_propagate_control(struct cgroup *cgrp) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { dsct->subtree_control &= cgroup_control(dsct); dsct->subtree_ss_mask = cgroup_calc_subtree_ss_mask(dsct->subtree_control, cgroup_ss_mask(dsct)); } } /** * cgroup_restore_control - restore control masks and dom_cgrp of a subtree * @cgrp: root of the target subtree * * Restore ->subtree_control, ->subtree_ss_mask and ->dom_cgrp from the * respective old_ prefixed fields for @cgrp's subtree including @cgrp * itself. */ static void cgroup_restore_control(struct cgroup *cgrp) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { dsct->subtree_control = dsct->old_subtree_control; dsct->subtree_ss_mask = dsct->old_subtree_ss_mask; dsct->dom_cgrp = dsct->old_dom_cgrp; } } static bool css_visible(struct cgroup_subsys_state *css) { struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; if (cgroup_control(cgrp) & (1 << ss->id)) return true; if (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) return false; return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl; } /** * cgroup_apply_control_enable - enable or show csses according to control * @cgrp: root of the target subtree * * Walk @cgrp's subtree and create new csses or make the existing ones * visible. A css is created invisible if it's being implicitly enabled * through dependency. An invisible css is made visible when the userland * explicitly enables it. * * Returns 0 on success, -errno on failure. On failure, csses which have * been processed already aren't cleaned up. The caller is responsible for * cleaning up with cgroup_apply_control_disable(). */ static int cgroup_apply_control_enable(struct cgroup *cgrp) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; struct cgroup_subsys *ss; int ssid, ret; cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); if (!(cgroup_ss_mask(dsct) & (1 << ss->id))) continue; if (!css) { css = css_create(dsct, ss); if (IS_ERR(css)) return PTR_ERR(css); } WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt)); if (css_visible(css)) { ret = css_populate_dir(css); if (ret) return ret; } } } return 0; } /** * cgroup_apply_control_disable - kill or hide csses according to control * @cgrp: root of the target subtree * * Walk @cgrp's subtree and kill and hide csses so that they match * cgroup_ss_mask() and cgroup_visible_mask(). * * A css is hidden when the userland requests it to be disabled while other * subsystems are still depending on it. The css must not actively control * resources and be in the vanilla state if it's made visible again later. * Controllers which may be depended upon should provide ->css_reset() for * this purpose. */ static void cgroup_apply_control_disable(struct cgroup *cgrp) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; struct cgroup_subsys *ss; int ssid; cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); if (!css) continue; WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt)); if (css->parent && !(cgroup_ss_mask(dsct) & (1 << ss->id))) { kill_css(css); } else if (!css_visible(css)) { css_clear_dir(css); if (ss->css_reset) ss->css_reset(css); } } } } /** * cgroup_apply_control - apply control mask updates to the subtree * @cgrp: root of the target subtree * * subsystems can be enabled and disabled in a subtree using the following * steps. * * 1. Call cgroup_save_control() to stash the current state. * 2. Update ->subtree_control masks in the subtree as desired. * 3. Call cgroup_apply_control() to apply the changes. * 4. Optionally perform other related operations. * 5. Call cgroup_finalize_control() to finish up. * * This function implements step 3 and propagates the mask changes * throughout @cgrp's subtree, updates csses accordingly and perform * process migrations. */ static int cgroup_apply_control(struct cgroup *cgrp) { int ret; cgroup_propagate_control(cgrp); ret = cgroup_apply_control_enable(cgrp); if (ret) return ret; /* * At this point, cgroup_e_css_by_mask() results reflect the new csses * making the following cgroup_update_dfl_csses() properly update * css associations of all tasks in the subtree. */ return cgroup_update_dfl_csses(cgrp); } /** * cgroup_finalize_control - finalize control mask update * @cgrp: root of the target subtree * @ret: the result of the update * * Finalize control mask update. See cgroup_apply_control() for more info. */ static void cgroup_finalize_control(struct cgroup *cgrp, int ret) { if (ret) { cgroup_restore_control(cgrp); cgroup_propagate_control(cgrp); } cgroup_apply_control_disable(cgrp); } static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable) { u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask; /* if nothing is getting enabled, nothing to worry about */ if (!enable) return 0; /* can @cgrp host any resources? */ if (!cgroup_is_valid_domain(cgrp->dom_cgrp)) return -EOPNOTSUPP; /* mixables don't care */ if (cgroup_is_mixable(cgrp)) return 0; if (domain_enable) { /* can't enable domain controllers inside a thread subtree */ if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp)) return -EOPNOTSUPP; } else { /* * Threaded controllers can handle internal competitions * and are always allowed inside a (prospective) thread * subtree. */ if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp)) return 0; } /* * Controllers can't be enabled for a cgroup with tasks to avoid * child cgroups competing against tasks. */ if (cgroup_has_tasks(cgrp)) return -EBUSY; return 0; } /* change the enabled child controllers for a cgroup in the default hierarchy */ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { u16 enable = 0, disable = 0; struct cgroup *cgrp, *child; struct cgroup_subsys *ss; char *tok; int ssid, ret; /* * Parse input - space separated list of subsystem names prefixed * with either + or -. */ buf = strstrip(buf); while ((tok = strsep(&buf, " "))) { if (tok[0] == '\0') continue; do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) { if (!cgroup_ssid_enabled(ssid) || strcmp(tok + 1, ss->name)) continue; if (*tok == '+') { enable |= 1 << ssid; disable &= ~(1 << ssid); } else if (*tok == '-') { disable |= 1 << ssid; enable &= ~(1 << ssid); } else { return -EINVAL; } break; } while_each_subsys_mask(); if (ssid == CGROUP_SUBSYS_COUNT) return -EINVAL; } cgrp = cgroup_kn_lock_live(of->kn, true); if (!cgrp) return -ENODEV; for_each_subsys(ss, ssid) { if (enable & (1 << ssid)) { if (cgrp->subtree_control & (1 << ssid)) { enable &= ~(1 << ssid); continue; } if (!(cgroup_control(cgrp) & (1 << ssid))) { ret = -ENOENT; goto out_unlock; } } else if (disable & (1 << ssid)) { if (!(cgrp->subtree_control & (1 << ssid))) { disable &= ~(1 << ssid); continue; } /* a child has it enabled? */ cgroup_for_each_live_child(child, cgrp) { if (child->subtree_control & (1 << ssid)) { ret = -EBUSY; goto out_unlock; } } } } if (!enable && !disable) { ret = 0; goto out_unlock; } ret = cgroup_vet_subtree_control_enable(cgrp, enable); if (ret) goto out_unlock; /* save and update control masks and prepare csses */ cgroup_save_control(cgrp); cgrp->subtree_control |= enable; cgrp->subtree_control &= ~disable; ret = cgroup_apply_control(cgrp); cgroup_finalize_control(cgrp, ret); if (ret) goto out_unlock; kernfs_activate(cgrp->kn); out_unlock: cgroup_kn_unlock(of->kn); return ret ?: nbytes; } /** * cgroup_enable_threaded - make @cgrp threaded * @cgrp: the target cgroup * * Called when "threaded" is written to the cgroup.type interface file and * tries to make @cgrp threaded and join the parent's resource domain. * This function is never called on the root cgroup as cgroup.type doesn't * exist on it. */ static int cgroup_enable_threaded(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); struct cgroup *dom_cgrp = parent->dom_cgrp; struct cgroup *dsct; struct cgroup_subsys_state *d_css; int ret; lockdep_assert_held(&cgroup_mutex); /* noop if already threaded */ if (cgroup_is_threaded(cgrp)) return 0; /* * If @cgroup is populated or has domain controllers enabled, it * can't be switched. While the below cgroup_can_be_thread_root() * test can catch the same conditions, that's only when @parent is * not mixable, so let's check it explicitly. */ if (cgroup_is_populated(cgrp) || cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask) return -EOPNOTSUPP; /* we're joining the parent's domain, ensure its validity */ if (!cgroup_is_valid_domain(dom_cgrp) || !cgroup_can_be_thread_root(dom_cgrp)) return -EOPNOTSUPP; /* * The following shouldn't cause actual migrations and should * always succeed. */ cgroup_save_control(cgrp); cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) if (dsct == cgrp || cgroup_is_threaded(dsct)) dsct->dom_cgrp = dom_cgrp; ret = cgroup_apply_control(cgrp); if (!ret) parent->nr_threaded_children++; cgroup_finalize_control(cgrp, ret); return ret; } static int cgroup_type_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; if (cgroup_is_threaded(cgrp)) seq_puts(seq, "threaded\n"); else if (!cgroup_is_valid_domain(cgrp)) seq_puts(seq, "domain invalid\n"); else if (cgroup_is_thread_root(cgrp)) seq_puts(seq, "domain threaded\n"); else seq_puts(seq, "domain\n"); return 0; } static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; int ret; /* only switching to threaded mode is supported */ if (strcmp(strstrip(buf), "threaded")) return -EINVAL; /* drain dying csses before we re-apply (threaded) subtree control */ cgrp = cgroup_kn_lock_live(of->kn, true); if (!cgrp) return -ENOENT; /* threaded can only be enabled */ ret = cgroup_enable_threaded(cgrp); cgroup_kn_unlock(of->kn); return ret ?: nbytes; } static int cgroup_max_descendants_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; int descendants = READ_ONCE(cgrp->max_descendants); if (descendants == INT_MAX) seq_puts(seq, "max\n"); else seq_printf(seq, "%d\n", descendants); return 0; } static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; int descendants; ssize_t ret; buf = strstrip(buf); if (!strcmp(buf, "max")) { descendants = INT_MAX; } else { ret = kstrtoint(buf, 0, &descendants); if (ret) return ret; } if (descendants < 0) return -ERANGE; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENOENT; cgrp->max_descendants = descendants; cgroup_kn_unlock(of->kn); return nbytes; } static int cgroup_max_depth_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; int depth = READ_ONCE(cgrp->max_depth); if (depth == INT_MAX) seq_puts(seq, "max\n"); else seq_printf(seq, "%d\n", depth); return 0; } static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; ssize_t ret; int depth; buf = strstrip(buf); if (!strcmp(buf, "max")) { depth = INT_MAX; } else { ret = kstrtoint(buf, 0, &depth); if (ret) return ret; } if (depth < 0) return -ERANGE; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENOENT; cgrp->max_depth = depth; cgroup_kn_unlock(of->kn); return nbytes; } static int cgroup_events_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp)); seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags)); return 0; } static int cgroup_stat_show(struct seq_file *seq, void *v) { struct cgroup *cgroup = seq_css(seq)->cgroup; struct cgroup_subsys_state *css; int dying_cnt[CGROUP_SUBSYS_COUNT]; int ssid; seq_printf(seq, "nr_descendants %d\n", cgroup->nr_descendants); /* * Show the number of live and dying csses associated with each of * non-inhibited cgroup subsystems that is bound to cgroup v2. * * Without proper lock protection, racing is possible. So the * numbers may not be consistent when that happens. */ rcu_read_lock(); for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) { dying_cnt[ssid] = -1; if ((BIT(ssid) & cgrp_dfl_inhibit_ss_mask) || (cgroup_subsys[ssid]->root != &cgrp_dfl_root)) continue; css = rcu_dereference_raw(cgroup->subsys[ssid]); dying_cnt[ssid] = cgroup->nr_dying_subsys[ssid]; seq_printf(seq, "nr_subsys_%s %d\n", cgroup_subsys[ssid]->name, css ? (css->nr_descendants + 1) : 0); } seq_printf(seq, "nr_dying_descendants %d\n", cgroup->nr_dying_descendants); for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) { if (dying_cnt[ssid] >= 0) seq_printf(seq, "nr_dying_subsys_%s %d\n", cgroup_subsys[ssid]->name, dying_cnt[ssid]); } rcu_read_unlock(); return 0; } static int cgroup_core_local_stat_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; unsigned int sequence; u64 freeze_time; do { sequence = read_seqcount_begin(&cgrp->freezer.freeze_seq); freeze_time = cgrp->freezer.frozen_nsec; /* Add in current freezer interval if the cgroup is freezing. */ if (test_bit(CGRP_FREEZE, &cgrp->flags)) freeze_time += (ktime_get_ns() - cgrp->freezer.freeze_start_nsec); } while (read_seqcount_retry(&cgrp->freezer.freeze_seq, sequence)); do_div(freeze_time, NSEC_PER_USEC); seq_printf(seq, "frozen_usec %llu\n", freeze_time); return 0; } #ifdef CONFIG_CGROUP_SCHED /** * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest * * Find and get @cgrp's css associated with @ss. If the css doesn't exist * or is offline, %NULL is returned. */ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; rcu_read_lock(); css = cgroup_css(cgrp, ss); if (css && !css_tryget_online(css)) css = NULL; rcu_read_unlock(); return css; } static int cgroup_extra_stat_show(struct seq_file *seq, int ssid) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct cgroup_subsys *ss = cgroup_subsys[ssid]; struct cgroup_subsys_state *css; int ret; if (!ss->css_extra_stat_show) return 0; css = cgroup_tryget_css(cgrp, ss); if (!css) return 0; ret = ss->css_extra_stat_show(seq, css); css_put(css); return ret; } static int cgroup_local_stat_show(struct seq_file *seq, struct cgroup *cgrp, int ssid) { struct cgroup_subsys *ss = cgroup_subsys[ssid]; struct cgroup_subsys_state *css; int ret; if (!ss->css_local_stat_show) return 0; css = cgroup_tryget_css(cgrp, ss); if (!css) return 0; ret = ss->css_local_stat_show(seq, css); css_put(css); return ret; } #endif static int cpu_stat_show(struct seq_file *seq, void *v) { int ret = 0; cgroup_base_stat_cputime_show(seq); #ifdef CONFIG_CGROUP_SCHED ret = cgroup_extra_stat_show(seq, cpu_cgrp_id); #endif return ret; } static int cpu_local_stat_show(struct seq_file *seq, void *v) { struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup; int ret = 0; #ifdef CONFIG_CGROUP_SCHED ret = cgroup_local_stat_show(seq, cgrp, cpu_cgrp_id); #endif return ret; } #ifdef CONFIG_PSI static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_IO); } static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_MEM); } static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_CPU); } static ssize_t pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, enum psi_res res) { struct cgroup_file_ctx *ctx = of->priv; struct psi_trigger *new; struct cgroup *cgrp; struct psi_group *psi; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; cgroup_get(cgrp); cgroup_kn_unlock(of->kn); /* Allow only one trigger per file descriptor */ if (ctx->psi.trigger) { cgroup_put(cgrp); return -EBUSY; } psi = cgroup_psi(cgrp); new = psi_trigger_create(psi, buf, res, of->file, of); if (IS_ERR(new)) { cgroup_put(cgrp); return PTR_ERR(new); } smp_store_release(&ctx->psi.trigger, new); cgroup_put(cgrp); return nbytes; } static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return pressure_write(of, buf, nbytes, PSI_IO); } static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return pressure_write(of, buf, nbytes, PSI_MEM); } static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return pressure_write(of, buf, nbytes, PSI_CPU); } #ifdef CONFIG_IRQ_TIME_ACCOUNTING static int cgroup_irq_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_IRQ); } static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return pressure_write(of, buf, nbytes, PSI_IRQ); } #endif static int cgroup_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct psi_group *psi = cgroup_psi(cgrp); seq_printf(seq, "%d\n", psi->enabled); return 0; } static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { ssize_t ret; int enable; struct cgroup *cgrp; struct psi_group *psi; ret = kstrtoint(strstrip(buf), 0, &enable); if (ret) return ret; if (enable < 0 || enable > 1) return -ERANGE; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENOENT; psi = cgroup_psi(cgrp); if (psi->enabled != enable) { int i; /* show or hide {cpu,memory,io,irq}.pressure files */ for (i = 0; i < NR_PSI_RESOURCES; i++) cgroup_file_show(&cgrp->psi_files[i], enable); psi->enabled = enable; if (enable) psi_cgroup_restart(psi); } cgroup_kn_unlock(of->kn); return nbytes; } static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, poll_table *pt) { struct cgroup_file_ctx *ctx = of->priv; return psi_trigger_poll(&ctx->psi.trigger, of->file, pt); } static void cgroup_pressure_release(struct kernfs_open_file *of) { struct cgroup_file_ctx *ctx = of->priv; psi_trigger_destroy(ctx->psi.trigger); } bool cgroup_psi_enabled(void) { if (static_branch_likely(&psi_disabled)) return false; return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0; } #else /* CONFIG_PSI */ bool cgroup_psi_enabled(void) { return false; } #endif /* CONFIG_PSI */ static int cgroup_freeze_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; seq_printf(seq, "%d\n", cgrp->freezer.freeze); return 0; } static ssize_t cgroup_freeze_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; ssize_t ret; int freeze; ret = kstrtoint(strstrip(buf), 0, &freeze); if (ret) return ret; if (freeze < 0 || freeze > 1) return -ERANGE; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENOENT; cgroup_freeze(cgrp, freeze); cgroup_kn_unlock(of->kn); return nbytes; } static void __cgroup_kill(struct cgroup *cgrp) { struct css_task_iter it; struct task_struct *task; lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); cgrp->kill_seq++; spin_unlock_irq(&css_set_lock); css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it); while ((task = css_task_iter_next(&it))) { /* Ignore kernel threads here. */ if (task->flags & PF_KTHREAD) continue; /* Skip tasks that are already dying. */ if (__fatal_signal_pending(task)) continue; send_sig(SIGKILL, task, 0); } css_task_iter_end(&it); } static void cgroup_kill(struct cgroup *cgrp) { struct cgroup_subsys_state *css; struct cgroup *dsct; lockdep_assert_held(&cgroup_mutex); cgroup_for_each_live_descendant_pre(dsct, css, cgrp) __cgroup_kill(dsct); } static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { ssize_t ret = 0; int kill; struct cgroup *cgrp; ret = kstrtoint(strstrip(buf), 0, &kill); if (ret) return ret; if (kill != 1) return -ERANGE; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENOENT; /* * Killing is a process directed operation, i.e. the whole thread-group * is taken down so act like we do for cgroup.procs and only make this * writable in non-threaded cgroups. */ if (cgroup_is_threaded(cgrp)) ret = -EOPNOTSUPP; else cgroup_kill(cgrp); cgroup_kn_unlock(of->kn); return ret ?: nbytes; } static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of_cft(of); struct cgroup_file_ctx *ctx; int ret; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->ns = current->nsproxy->cgroup_ns; get_cgroup_ns(ctx->ns); of->priv = ctx; if (!cft->open) return 0; ret = cft->open(of); if (ret) { put_cgroup_ns(ctx->ns); kfree(ctx); } return ret; } static void cgroup_file_release(struct kernfs_open_file *of) { struct cftype *cft = of_cft(of); struct cgroup_file_ctx *ctx = of->priv; if (cft->release) cft->release(of); put_cgroup_ns(ctx->ns); kfree(ctx); of->priv = NULL; } static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup_file_ctx *ctx = of->priv; struct cgroup *cgrp = kn_priv(of->kn); struct cftype *cft = of_cft(of); struct cgroup_subsys_state *css; int ret; if (!nbytes) return 0; /* * If namespaces are delegation boundaries, disallow writes to * files in an non-init namespace root from inside the namespace * except for the files explicitly marked delegatable - * eg. cgroup.procs, cgroup.threads and cgroup.subtree_control. */ if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) && !(cft->flags & CFTYPE_NS_DELEGATABLE) && ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp) return -EPERM; if (cft->write) return cft->write(of, buf, nbytes, off); /* * kernfs guarantees that a file isn't deleted with operations in * flight, which means that the matching css is and stays alive and * doesn't need to be pinned. The RCU locking is not necessary * either. It's just for the convenience of using cgroup_css(). */ rcu_read_lock(); css = cgroup_css(cgrp, cft->ss); rcu_read_unlock(); if (cft->write_u64) { unsigned long long v; ret = kstrtoull(buf, 0, &v); if (!ret) ret = cft->write_u64(css, cft, v); } else if (cft->write_s64) { long long v; ret = kstrtoll(buf, 0, &v); if (!ret) ret = cft->write_s64(css, cft, v); } else { ret = -EINVAL; } return ret ?: nbytes; } static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt) { struct cftype *cft = of_cft(of); if (cft->poll) return cft->poll(of, pt); return kernfs_generic_poll(of, pt); } static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) { return seq_cft(seq)->seq_start(seq, ppos); } static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) { return seq_cft(seq)->seq_next(seq, v, ppos); } static void cgroup_seqfile_stop(struct seq_file *seq, void *v) { if (seq_cft(seq)->seq_stop) seq_cft(seq)->seq_stop(seq, v); } static int cgroup_seqfile_show(struct seq_file *m, void *arg) { struct cftype *cft = seq_cft(m); struct cgroup_subsys_state *css = seq_css(m); if (cft->seq_show) return cft->seq_show(m, arg); if (cft->read_u64) seq_printf(m, "%llu\n", cft->read_u64(css, cft)); else if (cft->read_s64) seq_printf(m, "%lld\n", cft->read_s64(css, cft)); else return -EINVAL; return 0; } static struct kernfs_ops cgroup_kf_single_ops = { .atomic_write_len = PAGE_SIZE, .open = cgroup_file_open, .release = cgroup_file_release, .write = cgroup_file_write, .poll = cgroup_file_poll, .seq_show = cgroup_seqfile_show, }; static struct kernfs_ops cgroup_kf_ops = { .atomic_write_len = PAGE_SIZE, .open = cgroup_file_open, .release = cgroup_file_release, .write = cgroup_file_write, .poll = cgroup_file_poll, .seq_start = cgroup_seqfile_start, .seq_next = cgroup_seqfile_next, .seq_stop = cgroup_seqfile_stop, .seq_show = cgroup_seqfile_show, }; static void cgroup_file_notify_timer(struct timer_list *timer) { cgroup_file_notify(container_of(timer, struct cgroup_file, notify_timer)); } static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype *cft) { char name[CGROUP_FILE_NAME_MAX]; struct kernfs_node *kn; struct lock_class_key *key = NULL; #ifdef CONFIG_DEBUG_LOCK_ALLOC key = &cft->lockdep_key; #endif kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), cgroup_file_mode(cft), current_fsuid(), current_fsgid(), 0, cft->kf_ops, cft, NULL, key); if (IS_ERR(kn)) return PTR_ERR(kn); if (cft->file_offset) { struct cgroup_file *cfile = (void *)css + cft->file_offset; timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0); spin_lock_irq(&cgroup_file_kn_lock); cfile->kn = kn; spin_unlock_irq(&cgroup_file_kn_lock); } return 0; } /** * cgroup_addrm_files - add or remove files to a cgroup directory * @css: the target css * @cgrp: the target cgroup (usually css->cgroup) * @cfts: array of cftypes to be added * @is_add: whether to add or remove * * Depending on @is_add, add or remove files defined by @cfts on @cgrp. * For removals, this function never fails. */ static int cgroup_addrm_files(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype cfts[], bool is_add) { struct cftype *cft, *cft_end = NULL; int ret = 0; lockdep_assert_held(&cgroup_mutex); restart: for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? */ if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) continue; if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp)) continue; if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp)) continue; if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp)) continue; if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug) continue; if (is_add) { ret = cgroup_add_file(css, cgrp, cft); if (ret) { pr_warn("%s: failed to add %s, err=%d\n", __func__, cft->name, ret); cft_end = cft; is_add = false; goto restart; } } else { cgroup_rm_file(cgrp, cft); } } return ret; } static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) { struct cgroup_subsys *ss = cfts[0].ss; struct cgroup *root = &ss->root->cgrp; struct cgroup_subsys_state *css; int ret = 0; lockdep_assert_held(&cgroup_mutex); /* add/rm files for all cgroups created before */ css_for_each_descendant_pre(css, cgroup_css(root, ss)) { struct cgroup *cgrp = css->cgroup; if (!(css->flags & CSS_VISIBLE)) continue; ret = cgroup_addrm_files(css, cgrp, cfts, is_add); if (ret) break; } if (is_add && !ret) kernfs_activate(root->kn); return ret; } static void cgroup_exit_cftypes(struct cftype *cfts) { struct cftype *cft; for (cft = cfts; cft->name[0] != '\0'; cft++) { /* free copy for custom atomic_write_len, see init_cftypes() */ if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) kfree(cft->kf_ops); cft->kf_ops = NULL; cft->ss = NULL; /* revert flags set by cgroup core while adding @cfts */ cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL | __CFTYPE_ADDED); } } static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype *cft; int ret = 0; for (cft = cfts; cft->name[0] != '\0'; cft++) { struct kernfs_ops *kf_ops; WARN_ON(cft->ss || cft->kf_ops); if (cft->flags & __CFTYPE_ADDED) { ret = -EBUSY; break; } if (cft->seq_start) kf_ops = &cgroup_kf_ops; else kf_ops = &cgroup_kf_single_ops; /* * Ugh... if @cft wants a custom max_write_len, we need to * make a copy of kf_ops to set its atomic_write_len. */ if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) { kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL); if (!kf_ops) { ret = -ENOMEM; break; } kf_ops->atomic_write_len = cft->max_write_len; } cft->kf_ops = kf_ops; cft->ss = ss; cft->flags |= __CFTYPE_ADDED; } if (ret) cgroup_exit_cftypes(cfts); return ret; } static void cgroup_rm_cftypes_locked(struct cftype *cfts) { lockdep_assert_held(&cgroup_mutex); list_del(&cfts->node); cgroup_apply_cftypes(cfts, false); cgroup_exit_cftypes(cfts); } /** * cgroup_rm_cftypes - remove an array of cftypes from a subsystem * @cfts: zero-length name terminated array of cftypes * * Unregister @cfts. Files described by @cfts are removed from all * existing cgroups and all future cgroups won't have them either. This * function can be called anytime whether @cfts' subsys is attached or not. * * Returns 0 on successful unregistration, -ENOENT if @cfts is not * registered. */ int cgroup_rm_cftypes(struct cftype *cfts) { if (!cfts || cfts[0].name[0] == '\0') return 0; if (!(cfts[0].flags & __CFTYPE_ADDED)) return -ENOENT; cgroup_lock(); cgroup_rm_cftypes_locked(cfts); cgroup_unlock(); return 0; } /** * cgroup_add_cftypes - add an array of cftypes to a subsystem * @ss: target cgroup subsystem * @cfts: zero-length name terminated array of cftypes * * Register @cfts to @ss. Files described by @cfts are created for all * existing cgroups to which @ss is attached and all future cgroups will * have them too. This function can be called anytime whether @ss is * attached or not. * * Returns 0 on successful registration, -errno on failure. Note that this * function currently returns 0 as long as @cfts registration is successful * even if some file creation attempts on existing cgroups fail. */ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { int ret; if (!cgroup_ssid_enabled(ss->id)) return 0; if (!cfts || cfts[0].name[0] == '\0') return 0; ret = cgroup_init_cftypes(ss, cfts); if (ret) return ret; cgroup_lock(); list_add_tail(&cfts->node, &ss->cfts); ret = cgroup_apply_cftypes(cfts, true); if (ret) cgroup_rm_cftypes_locked(cfts); cgroup_unlock(); return ret; } /** * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy * @ss: target cgroup subsystem * @cfts: zero-length name terminated array of cftypes * * Similar to cgroup_add_cftypes() but the added files are only used for * the default hierarchy. */ int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype *cft; for (cft = cfts; cft && cft->name[0] != '\0'; cft++) cft->flags |= __CFTYPE_ONLY_ON_DFL; return cgroup_add_cftypes(ss, cfts); } /** * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies * @ss: target cgroup subsystem * @cfts: zero-length name terminated array of cftypes * * Similar to cgroup_add_cftypes() but the added files are only used for * the legacy hierarchies. */ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype *cft; for (cft = cfts; cft && cft->name[0] != '\0'; cft++) cft->flags |= __CFTYPE_NOT_ON_DFL; return cgroup_add_cftypes(ss, cfts); } /** * cgroup_file_notify - generate a file modified event for a cgroup_file * @cfile: target cgroup_file * * @cfile must have been obtained by setting cftype->file_offset. */ void cgroup_file_notify(struct cgroup_file *cfile) { unsigned long flags; spin_lock_irqsave(&cgroup_file_kn_lock, flags); if (cfile->kn) { unsigned long last = cfile->notified_at; unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV; if (time_in_range(jiffies, last, next)) { timer_reduce(&cfile->notify_timer, next); } else { kernfs_notify(cfile->kn); cfile->notified_at = jiffies; } } spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); } EXPORT_SYMBOL_GPL(cgroup_file_notify); /** * cgroup_file_show - show or hide a hidden cgroup file * @cfile: target cgroup_file obtained by setting cftype->file_offset * @show: whether to show or hide */ void cgroup_file_show(struct cgroup_file *cfile, bool show) { struct kernfs_node *kn; spin_lock_irq(&cgroup_file_kn_lock); kn = cfile->kn; kernfs_get(kn); spin_unlock_irq(&cgroup_file_kn_lock); if (kn) kernfs_show(kn, show); kernfs_put(kn); } /** * css_next_child - find the next child of a given css * @pos: the current position (%NULL to initiate traversal) * @parent: css whose children to walk * * This function returns the next child of @parent and should be called * under either cgroup_mutex or RCU read lock. The only requirement is * that @parent and @pos are accessible. The next sibling is guaranteed to * be returned regardless of their states. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. */ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *parent) { struct cgroup_subsys_state *next; cgroup_assert_mutex_or_rcu_locked(); /* * @pos could already have been unlinked from the sibling list. * Once a cgroup is removed, its ->sibling.next is no longer * updated when its next sibling changes. CSS_RELEASED is set when * @pos is taken off list, at which time its next pointer is valid, * and, as releases are serialized, the one pointed to by the next * pointer is guaranteed to not have started release yet. This * implies that if we observe !CSS_RELEASED on @pos in this RCU * critical section, the one pointed to by its next pointer is * guaranteed to not have finished its RCU grace period even if we * have dropped rcu_read_lock() in-between iterations. * * If @pos has CSS_RELEASED set, its next pointer can't be * dereferenced; however, as each css is given a monotonically * increasing unique serial number and always appended to the * sibling list, the next one can be found by walking the parent's * children until the first css with higher serial number than * @pos's. While this path can be slower, it happens iff iteration * races against release and the race window is very small. */ if (!pos) { next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling); } else if (likely(!(pos->flags & CSS_RELEASED))) { next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling); } else { list_for_each_entry_rcu(next, &parent->children, sibling, lockdep_is_held(&cgroup_mutex)) if (next->serial_nr > pos->serial_nr) break; } /* * @next, if not pointing to the head, can be dereferenced and is * the next sibling. */ if (&next->sibling != &parent->children) return next; return NULL; } /** * css_next_descendant_pre - find the next descendant for pre-order walk * @pos: the current position (%NULL to initiate traversal) * @root: css whose descendants to walk * * To be used by css_for_each_descendant_pre(). Find the next descendant * to visit for pre-order traversal of @root's descendants. @root is * included in the iteration and the first node to be visited. * * While this function requires cgroup_mutex or RCU read locking, it * doesn't require the whole traversal to be contained in a single critical * section. Additionally, it isn't necessary to hold onto a reference to @pos. * This function will return the correct next descendant as long as both @pos * and @root are accessible and @pos is a descendant of @root. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. */ struct cgroup_subsys_state * css_next_descendant_pre(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *root) { struct cgroup_subsys_state *next; cgroup_assert_mutex_or_rcu_locked(); /* if first iteration, visit @root */ if (!pos) return root; /* visit the first child if exists */ next = css_next_child(NULL, pos); if (next) return next; /* no child, visit my or the closest ancestor's next sibling */ while (pos != root) { next = css_next_child(pos, pos->parent); if (next) return next; pos = pos->parent; } return NULL; } EXPORT_SYMBOL_GPL(css_next_descendant_pre); /** * css_rightmost_descendant - return the rightmost descendant of a css * @pos: css of interest * * Return the rightmost descendant of @pos. If there's no descendant, @pos * is returned. This can be used during pre-order traversal to skip * subtree of @pos. * * While this function requires cgroup_mutex or RCU read locking, it * doesn't require the whole traversal to be contained in a single critical * section. Additionally, it isn't necessary to hold onto a reference to @pos. * This function will return the correct rightmost descendant as long as @pos * is accessible. */ struct cgroup_subsys_state * css_rightmost_descendant(struct cgroup_subsys_state *pos) { struct cgroup_subsys_state *last, *tmp; cgroup_assert_mutex_or_rcu_locked(); do { last = pos; /* ->prev isn't RCU safe, walk ->next till the end */ pos = NULL; css_for_each_child(tmp, last) pos = tmp; } while (pos); return last; } static struct cgroup_subsys_state * css_leftmost_descendant(struct cgroup_subsys_state *pos) { struct cgroup_subsys_state *last; do { last = pos; pos = css_next_child(NULL, pos); } while (pos); return last; } /** * css_next_descendant_post - find the next descendant for post-order walk * @pos: the current position (%NULL to initiate traversal) * @root: css whose descendants to walk * * To be used by css_for_each_descendant_post(). Find the next descendant * to visit for post-order traversal of @root's descendants. @root is * included in the iteration and the last node to be visited. * * While this function requires cgroup_mutex or RCU read locking, it * doesn't require the whole traversal to be contained in a single critical * section. Additionally, it isn't necessary to hold onto a reference to @pos. * This function will return the correct next descendant as long as both @pos * and @cgroup are accessible and @pos is a descendant of @cgroup. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. */ struct cgroup_subsys_state * css_next_descendant_post(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *root) { struct cgroup_subsys_state *next; cgroup_assert_mutex_or_rcu_locked(); /* if first iteration, visit leftmost descendant which may be @root */ if (!pos) return css_leftmost_descendant(root); /* if we visited @root, we're done */ if (pos == root) return NULL; /* if there's an unvisited sibling, visit its leftmost descendant */ next = css_next_child(pos, pos->parent); if (next) return css_leftmost_descendant(next); /* no sibling left, visit parent */ return pos->parent; } /** * css_has_online_children - does a css have online children * @css: the target css * * Returns %true if @css has any online children; otherwise, %false. This * function can be called from any context but the caller is responsible * for synchronizing against on/offlining as necessary. */ bool css_has_online_children(struct cgroup_subsys_state *css) { struct cgroup_subsys_state *child; bool ret = false; rcu_read_lock(); css_for_each_child(child, css) { if (child->flags & CSS_ONLINE) { ret = true; break; } } rcu_read_unlock(); return ret; } static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it) { struct list_head *l; struct cgrp_cset_link *link; struct css_set *cset; lockdep_assert_held(&css_set_lock); /* find the next threaded cset */ if (it->tcset_pos) { l = it->tcset_pos->next; if (l != it->tcset_head) { it->tcset_pos = l; return container_of(l, struct css_set, threaded_csets_node); } it->tcset_pos = NULL; } /* find the next cset */ l = it->cset_pos; l = l->next; if (l == it->cset_head) { it->cset_pos = NULL; return NULL; } if (it->ss) { cset = container_of(l, struct css_set, e_cset_node[it->ss->id]); } else { link = list_entry(l, struct cgrp_cset_link, cset_link); cset = link->cset; } it->cset_pos = l; /* initialize threaded css_set walking */ if (it->flags & CSS_TASK_ITER_THREADED) { if (it->cur_dcset) put_css_set_locked(it->cur_dcset); it->cur_dcset = cset; get_css_set(cset); it->tcset_head = &cset->threaded_csets; it->tcset_pos = &cset->threaded_csets; } return cset; } /** * css_task_iter_advance_css_set - advance a task iterator to the next css_set * @it: the iterator to advance * * Advance @it to the next css_set to walk. */ static void css_task_iter_advance_css_set(struct css_task_iter *it) { struct css_set *cset; lockdep_assert_held(&css_set_lock); /* Advance to the next non-empty css_set and find first non-empty tasks list*/ while ((cset = css_task_iter_next_css_set(it))) { if (!list_empty(&cset->tasks)) { it->cur_tasks_head = &cset->tasks; break; } else if (!list_empty(&cset->mg_tasks)) { it->cur_tasks_head = &cset->mg_tasks; break; } else if (!list_empty(&cset->dying_tasks)) { it->cur_tasks_head = &cset->dying_tasks; break; } } if (!cset) { it->task_pos = NULL; return; } it->task_pos = it->cur_tasks_head->next; /* * We don't keep css_sets locked across iteration steps and thus * need to take steps to ensure that iteration can be resumed after * the lock is re-acquired. Iteration is performed at two levels - * css_sets and tasks in them. * * Once created, a css_set never leaves its cgroup lists, so a * pinned css_set is guaranteed to stay put and we can resume * iteration afterwards. * * Tasks may leave @cset across iteration steps. This is resolved * by registering each iterator with the css_set currently being * walked and making css_set_move_task() advance iterators whose * next task is leaving. */ if (it->cur_cset) { list_del(&it->iters_node); put_css_set_locked(it->cur_cset); } get_css_set(cset); it->cur_cset = cset; list_add(&it->iters_node, &cset->task_iters); } static void css_task_iter_skip(struct css_task_iter *it, struct task_struct *task) { lockdep_assert_held(&css_set_lock); if (it->task_pos == &task->cg_list) { it->task_pos = it->task_pos->next; it->flags |= CSS_TASK_ITER_SKIPPED; } } static void css_task_iter_advance(struct css_task_iter *it) { struct task_struct *task; lockdep_assert_held(&css_set_lock); repeat: if (it->task_pos) { /* * Advance iterator to find next entry. We go through cset * tasks, mg_tasks and dying_tasks, when consumed we move onto * the next cset. */ if (it->flags & CSS_TASK_ITER_SKIPPED) it->flags &= ~CSS_TASK_ITER_SKIPPED; else it->task_pos = it->task_pos->next; if (it->task_pos == &it->cur_cset->tasks) { it->cur_tasks_head = &it->cur_cset->mg_tasks; it->task_pos = it->cur_tasks_head->next; } if (it->task_pos == &it->cur_cset->mg_tasks) { it->cur_tasks_head = &it->cur_cset->dying_tasks; it->task_pos = it->cur_tasks_head->next; } if (it->task_pos == &it->cur_cset->dying_tasks) css_task_iter_advance_css_set(it); } else { /* called from start, proceed to the first cset */ css_task_iter_advance_css_set(it); } if (!it->task_pos) return; task = list_entry(it->task_pos, struct task_struct, cg_list); if (it->flags & CSS_TASK_ITER_PROCS) { /* if PROCS, skip over tasks which aren't group leaders */ if (!thread_group_leader(task)) goto repeat; /* and dying leaders w/o live member threads */ if (it->cur_tasks_head == &it->cur_cset->dying_tasks && !atomic_read(&task->signal->live)) goto repeat; } else { /* skip all dying ones */ if (it->cur_tasks_head == &it->cur_cset->dying_tasks) goto repeat; } } /** * css_task_iter_start - initiate task iteration * @css: the css to walk tasks of * @flags: CSS_TASK_ITER_* flags * @it: the task iterator to use * * Initiate iteration through the tasks of @css. The caller can call * css_task_iter_next() to walk through the tasks until the function * returns NULL. On completion of iteration, css_task_iter_end() must be * called. */ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it) { unsigned long irqflags; memset(it, 0, sizeof(*it)); spin_lock_irqsave(&css_set_lock, irqflags); it->ss = css->ss; it->flags = flags; if (CGROUP_HAS_SUBSYS_CONFIG && it->ss) it->cset_pos = &css->cgroup->e_csets[css->ss->id]; else it->cset_pos = &css->cgroup->cset_links; it->cset_head = it->cset_pos; css_task_iter_advance(it); spin_unlock_irqrestore(&css_set_lock, irqflags); } /** * css_task_iter_next - return the next task for the iterator * @it: the task iterator being iterated * * The "next" function for task iteration. @it should have been * initialized via css_task_iter_start(). Returns NULL when the iteration * reaches the end. */ struct task_struct *css_task_iter_next(struct css_task_iter *it) { unsigned long irqflags; if (it->cur_task) { put_task_struct(it->cur_task); it->cur_task = NULL; } spin_lock_irqsave(&css_set_lock, irqflags); /* @it may be half-advanced by skips, finish advancing */ if (it->flags & CSS_TASK_ITER_SKIPPED) css_task_iter_advance(it); if (it->task_pos) { it->cur_task = list_entry(it->task_pos, struct task_struct, cg_list); get_task_struct(it->cur_task); css_task_iter_advance(it); } spin_unlock_irqrestore(&css_set_lock, irqflags); return it->cur_task; } /** * css_task_iter_end - finish task iteration * @it: the task iterator to finish * * Finish task iteration started by css_task_iter_start(). */ void css_task_iter_end(struct css_task_iter *it) { unsigned long irqflags; if (it->cur_cset) { spin_lock_irqsave(&css_set_lock, irqflags); list_del(&it->iters_node); put_css_set_locked(it->cur_cset); spin_unlock_irqrestore(&css_set_lock, irqflags); } if (it->cur_dcset) put_css_set(it->cur_dcset); if (it->cur_task) put_task_struct(it->cur_task); } static void cgroup_procs_release(struct kernfs_open_file *of) { struct cgroup_file_ctx *ctx = of->priv; if (ctx->procs.started) css_task_iter_end(&ctx->procs.iter); } static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) { struct kernfs_open_file *of = s->private; struct cgroup_file_ctx *ctx = of->priv; if (pos) (*pos)++; return css_task_iter_next(&ctx->procs.iter); } static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, unsigned int iter_flags) { struct kernfs_open_file *of = s->private; struct cgroup *cgrp = seq_css(s)->cgroup; struct cgroup_file_ctx *ctx = of->priv; struct css_task_iter *it = &ctx->procs.iter; /* * When a seq_file is seeked, it's always traversed sequentially * from position 0, so we can simply keep iterating on !0 *pos. */ if (!ctx->procs.started) { if (WARN_ON_ONCE((*pos))) return ERR_PTR(-EINVAL); css_task_iter_start(&cgrp->self, iter_flags, it); ctx->procs.started = true; } else if (!(*pos)) { css_task_iter_end(it); css_task_iter_start(&cgrp->self, iter_flags, it); } else return it->cur_task; return cgroup_procs_next(s, NULL, NULL); } static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) { struct cgroup *cgrp = seq_css(s)->cgroup; /* * All processes of a threaded subtree belong to the domain cgroup * of the subtree. Only threads can be distributed across the * subtree. Reject reads on cgroup.procs in the subtree proper. * They're always empty anyway. */ if (cgroup_is_threaded(cgrp)) return ERR_PTR(-EOPNOTSUPP); return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED); } static int cgroup_procs_show(struct seq_file *s, void *v) { seq_printf(s, "%d\n", task_pid_vnr(v)); return 0; } static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb) { int ret; struct inode *inode; lockdep_assert_held(&cgroup_mutex); inode = kernfs_get_inode(sb, cgrp->procs_file.kn); if (!inode) return -ENOMEM; ret = inode_permission(&nop_mnt_idmap, inode, MAY_WRITE); iput(inode); return ret; } static int cgroup_procs_write_permission(struct cgroup *src_cgrp, struct cgroup *dst_cgrp, struct super_block *sb, struct cgroup_namespace *ns) { struct cgroup *com_cgrp = src_cgrp; int ret; lockdep_assert_held(&cgroup_mutex); /* find the common ancestor */ while (!cgroup_is_descendant(dst_cgrp, com_cgrp)) com_cgrp = cgroup_parent(com_cgrp); /* %current should be authorized to migrate to the common ancestor */ ret = cgroup_may_write(com_cgrp, sb); if (ret) return ret; /* * If namespaces are delegation boundaries, %current must be able * to see both source and destination cgroups from its namespace. */ if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) && (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) || !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp))) return -ENOENT; return 0; } static int cgroup_attach_permissions(struct cgroup *src_cgrp, struct cgroup *dst_cgrp, struct super_block *sb, bool threadgroup, struct cgroup_namespace *ns) { int ret = 0; ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns); if (ret) return ret; ret = cgroup_migrate_vet_dst(dst_cgrp); if (ret) return ret; if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)) ret = -EOPNOTSUPP; return ret; } static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, bool threadgroup) { struct cgroup_file_ctx *ctx = of->priv; struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; ssize_t ret; enum cgroup_attach_lock_mode lock_mode; dst_cgrp = cgroup_kn_lock_live(of->kn, false); if (!dst_cgrp) return -ENODEV; task = cgroup_procs_write_start(buf, threadgroup, &lock_mode); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; /* find the source cgroup */ spin_lock_irq(&css_set_lock); src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); spin_unlock_irq(&css_set_lock); /* * Process and thread migrations follow same delegation rule. Check * permissions using the credentials from file open to protect against * inherited fd attacks. */ scoped_with_creds(of->file->f_cred) ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, of->file->f_path.dentry->d_sb, threadgroup, ctx->ns); if (ret) goto out_finish; ret = cgroup_attach_task(dst_cgrp, task, threadgroup); out_finish: cgroup_procs_write_finish(task, lock_mode); out_unlock: cgroup_kn_unlock(of->kn); return ret; } static ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return __cgroup_procs_write(of, buf, true) ?: nbytes; } static void *cgroup_threads_start(struct seq_file *s, loff_t *pos) { return __cgroup_procs_start(s, pos, 0); } static ssize_t cgroup_threads_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return __cgroup_procs_write(of, buf, false) ?: nbytes; } /* cgroup core interface files for the default hierarchy */ static struct cftype cgroup_base_files[] = { { .name = "cgroup.type", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_type_show, .write = cgroup_type_write, }, { .name = "cgroup.procs", .flags = CFTYPE_NS_DELEGATABLE, .file_offset = offsetof(struct cgroup, procs_file), .release = cgroup_procs_release, .seq_start = cgroup_procs_start, .seq_next = cgroup_procs_next, .seq_show = cgroup_procs_show, .write = cgroup_procs_write, }, { .name = "cgroup.threads", .flags = CFTYPE_NS_DELEGATABLE, .release = cgroup_procs_release, .seq_start = cgroup_threads_start, .seq_next = cgroup_procs_next, .seq_show = cgroup_procs_show, .write = cgroup_threads_write, }, { .name = "cgroup.controllers", .seq_show = cgroup_controllers_show, }, { .name = "cgroup.subtree_control", .flags = CFTYPE_NS_DELEGATABLE, .seq_show = cgroup_subtree_control_show, .write = cgroup_subtree_control_write, }, { .name = "cgroup.events", .flags = CFTYPE_NOT_ON_ROOT, .file_offset = offsetof(struct cgroup, events_file), .seq_show = cgroup_events_show, }, { .name = "cgroup.max.descendants", .seq_show = cgroup_max_descendants_show, .write = cgroup_max_descendants_write, }, { .name = "cgroup.max.depth", .seq_show = cgroup_max_depth_show, .write = cgroup_max_depth_write, }, { .name = "cgroup.stat", .seq_show = cgroup_stat_show, }, { .name = "cgroup.stat.local", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_core_local_stat_show, }, { .name = "cgroup.freeze", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_freeze_show, .write = cgroup_freeze_write, }, { .name = "cgroup.kill", .flags = CFTYPE_NOT_ON_ROOT, .write = cgroup_kill_write, }, { .name = "cpu.stat", .seq_show = cpu_stat_show, }, { .name = "cpu.stat.local", .seq_show = cpu_local_stat_show, }, { } /* terminate */ }; static struct cftype cgroup_psi_files[] = { #ifdef CONFIG_PSI { .name = "io.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]), .seq_show = cgroup_io_pressure_show, .write = cgroup_io_pressure_write, .poll = cgroup_pressure_poll, .release = cgroup_pressure_release, }, { .name = "memory.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]), .seq_show = cgroup_memory_pressure_show, .write = cgroup_memory_pressure_write, .poll = cgroup_pressure_poll, .release = cgroup_pressure_release, }, { .name = "cpu.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]), .seq_show = cgroup_cpu_pressure_show, .write = cgroup_cpu_pressure_write, .poll = cgroup_pressure_poll, .release = cgroup_pressure_release, }, #ifdef CONFIG_IRQ_TIME_ACCOUNTING { .name = "irq.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]), .seq_show = cgroup_irq_pressure_show, .write = cgroup_irq_pressure_write, .poll = cgroup_pressure_poll, .release = cgroup_pressure_release, }, #endif { .name = "cgroup.pressure", .seq_show = cgroup_pressure_show, .write = cgroup_pressure_write, }, #endif /* CONFIG_PSI */ { } /* terminate */ }; /* * css destruction is four-stage process. * * 1. Destruction starts. Killing of the percpu_ref is initiated. * Implemented in kill_css(). * * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs * and thus css_tryget_online() is guaranteed to fail, the css can be * offlined by invoking offline_css(). After offlining, the base ref is * put. Implemented in css_killed_work_fn(). * * 3. When the percpu_ref reaches zero, the only possible remaining * accessors are inside RCU read sections. css_release() schedules the * RCU callback. * * 4. After the grace period, the css can be freed. Implemented in * css_free_rwork_fn(). * * It is actually hairier because both step 2 and 4 require process context * and thus involve punting to css->destroy_work adding two additional * steps to the already complex sequence. */ static void css_free_rwork_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(to_rcu_work(work), struct cgroup_subsys_state, destroy_rwork); struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; percpu_ref_exit(&css->refcnt); css_rstat_exit(css); if (!css_is_self(css)) { /* css free path */ struct cgroup_subsys_state *parent = css->parent; int id = css->id; ss->css_free(css); cgroup_idr_remove(&ss->css_idr, id); cgroup_put(cgrp); if (parent) css_put(parent); } else { /* cgroup free path */ atomic_dec(&cgrp->root->nr_cgrps); if (!cgroup_on_dfl(cgrp)) cgroup1_pidlist_destroy_all(cgrp); cancel_work_sync(&cgrp->release_agent_work); bpf_cgrp_storage_free(cgrp); if (cgroup_parent(cgrp)) { /* * We get a ref to the parent, and put the ref when * this cgroup is being freed, so it's guaranteed * that the parent won't be destroyed before its * children. */ cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); psi_cgroup_free(cgrp); kfree(cgrp); } else { /* * This is root cgroup's refcnt reaching zero, * which indicates that the root should be * released. */ cgroup_destroy_root(cgrp->root); } } } static void css_release_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; cgroup_lock(); css->flags |= CSS_RELEASED; list_del_rcu(&css->sibling); if (!css_is_self(css)) { struct cgroup *parent_cgrp; css_rstat_flush(css); cgroup_idr_replace(&ss->css_idr, NULL, css->id); if (ss->css_released) ss->css_released(css); cgrp->nr_dying_subsys[ss->id]--; /* * When a css is released and ready to be freed, its * nr_descendants must be zero. However, the corresponding * cgrp->nr_dying_subsys[ss->id] may not be 0 if a subsystem * is activated and deactivated multiple times with one or * more of its previous activation leaving behind dying csses. */ WARN_ON_ONCE(css->nr_descendants); parent_cgrp = cgroup_parent(cgrp); while (parent_cgrp) { parent_cgrp->nr_dying_subsys[ss->id]--; parent_cgrp = cgroup_parent(parent_cgrp); } } else { struct cgroup *tcgrp; /* cgroup release path */ TRACE_CGROUP_PATH(release, cgrp); css_rstat_flush(&cgrp->self); spin_lock_irq(&css_set_lock); for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) tcgrp->nr_dying_descendants--; spin_unlock_irq(&css_set_lock); /* * There are two control paths which try to determine * cgroup from dentry without going through kernfs - * cgroupstats_build() and css_tryget_online_from_dir(). * Those are supported by RCU protecting clearing of * cgrp->kn->priv backpointer. */ if (cgrp->kn) RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); } cgroup_unlock(); INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); queue_rcu_work(cgroup_free_wq, &css->destroy_rwork); } static void css_release(struct percpu_ref *ref) { struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); INIT_WORK(&css->destroy_work, css_release_work_fn); queue_work(cgroup_release_wq, &css->destroy_work); } static void init_and_link_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, struct cgroup *cgrp) { lockdep_assert_held(&cgroup_mutex); cgroup_get_live(cgrp); memset(css, 0, sizeof(*css)); css->cgroup = cgrp; css->ss = ss; css->id = -1; INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); css->serial_nr = css_serial_nr_next++; atomic_set(&css->online_cnt, 0); if (cgroup_parent(cgrp)) { css->parent = cgroup_css(cgroup_parent(cgrp), ss); css_get(css->parent); } BUG_ON(cgroup_css(cgrp, ss)); } /* invoke ->css_online() on a new CSS and mark it online if successful */ static int online_css(struct cgroup_subsys_state *css) { struct cgroup_subsys *ss = css->ss; int ret = 0; lockdep_assert_held(&cgroup_mutex); if (ss->css_online) ret = ss->css_online(css); if (!ret) { css->flags |= CSS_ONLINE; rcu_assign_pointer(css->cgroup->subsys[ss->id], css); atomic_inc(&css->online_cnt); if (css->parent) { atomic_inc(&css->parent->online_cnt); while ((css = css->parent)) css->nr_descendants++; } } return ret; } /* if the CSS is online, invoke ->css_offline() on it and mark it offline */ static void offline_css(struct cgroup_subsys_state *css) { struct cgroup_subsys *ss = css->ss; lockdep_assert_held(&cgroup_mutex); if (!(css->flags & CSS_ONLINE)) return; if (ss->css_offline) ss->css_offline(css); css->flags &= ~CSS_ONLINE; RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); wake_up_all(&css->cgroup->offline_waitq); css->cgroup->nr_dying_subsys[ss->id]++; /* * Parent css and cgroup cannot be freed until after the freeing * of child css, see css_free_rwork_fn(). */ while ((css = css->parent)) { css->nr_descendants--; css->cgroup->nr_dying_subsys[ss->id]++; } } /** * css_create - create a cgroup_subsys_state * @cgrp: the cgroup new css will be associated with * @ss: the subsys of new css * * Create a new css associated with @cgrp - @ss pair. On success, the new * css is online and installed in @cgrp. This function doesn't create the * interface files. Returns 0 on success, -errno on failure. */ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, struct cgroup_subsys *ss) { struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); struct cgroup_subsys_state *css; int err; lockdep_assert_held(&cgroup_mutex); css = ss->css_alloc(parent_css); if (!css) css = ERR_PTR(-ENOMEM); if (IS_ERR(css)) return css; init_and_link_css(css, ss, cgrp); err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL); if (err) goto err_free_css; err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL); if (err < 0) goto err_free_css; css->id = err; err = css_rstat_init(css); if (err) goto err_free_css; /* @css is ready to be brought online now, make it visible */ list_add_tail_rcu(&css->sibling, &parent_css->children); cgroup_idr_replace(&ss->css_idr, css, css->id); err = online_css(css); if (err) goto err_list_del; return css; err_list_del: list_del_rcu(&css->sibling); err_free_css: INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); queue_rcu_work(cgroup_free_wq, &css->destroy_rwork); return ERR_PTR(err); } /* * The returned cgroup is fully initialized including its control mask, but * it doesn't have the control mask applied. */ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, umode_t mode) { struct cgroup_root *root = parent->root; struct cgroup *cgrp, *tcgrp; struct kernfs_node *kn; int i, level = parent->level + 1; int ret; /* allocate the cgroup and its ID, 0 is reserved for the root */ cgrp = kzalloc(struct_size(cgrp, ancestors, (level + 1)), GFP_KERNEL); if (!cgrp) return ERR_PTR(-ENOMEM); ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL); if (ret) goto out_free_cgrp; /* create the directory */ kn = kernfs_create_dir_ns(parent->kn, name, mode, current_fsuid(), current_fsgid(), cgrp, NULL); if (IS_ERR(kn)) { ret = PTR_ERR(kn); goto out_cancel_ref; } cgrp->kn = kn; init_cgroup_housekeeping(cgrp); cgrp->self.parent = &parent->self; cgrp->root = root; cgrp->level = level; /* * Now that init_cgroup_housekeeping() has been called and cgrp->self * is setup, it is safe to perform rstat initialization on it. */ ret = css_rstat_init(&cgrp->self); if (ret) goto out_kernfs_remove; ret = psi_cgroup_alloc(cgrp); if (ret) goto out_stat_exit; for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) cgrp->ancestors[tcgrp->level] = tcgrp; /* * New cgroup inherits effective freeze counter, and * if the parent has to be frozen, the child has too. */ cgrp->freezer.e_freeze = parent->freezer.e_freeze; seqcount_spinlock_init(&cgrp->freezer.freeze_seq, &css_set_lock); if (cgrp->freezer.e_freeze) { /* * Set the CGRP_FREEZE flag, so when a process will be * attached to the child cgroup, it will become frozen. * At this point the new cgroup is unpopulated, so we can * consider it frozen immediately. */ set_bit(CGRP_FREEZE, &cgrp->flags); cgrp->freezer.freeze_start_nsec = ktime_get_ns(); set_bit(CGRP_FROZEN, &cgrp->flags); } if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); cgrp->self.serial_nr = css_serial_nr_next++; ret = blocking_notifier_call_chain_robust(&cgroup_lifetime_notifier, CGROUP_LIFETIME_ONLINE, CGROUP_LIFETIME_OFFLINE, cgrp); ret = notifier_to_errno(ret); if (ret) goto out_psi_free; /* allocation complete, commit to creation */ spin_lock_irq(&css_set_lock); for (i = 0; i < level; i++) { tcgrp = cgrp->ancestors[i]; tcgrp->nr_descendants++; /* * If the new cgroup is frozen, all ancestor cgroups get a new * frozen descendant, but their state can't change because of * this. */ if (cgrp->freezer.e_freeze) tcgrp->freezer.nr_frozen_descendants++; } spin_unlock_irq(&css_set_lock); list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children); atomic_inc(&root->nr_cgrps); cgroup_get_live(parent); /* * On the default hierarchy, a child doesn't automatically inherit * subtree_control from the parent. Each is configured manually. */ if (!cgroup_on_dfl(cgrp)) cgrp->subtree_control = cgroup_control(cgrp); cgroup_propagate_control(cgrp); return cgrp; out_psi_free: psi_cgroup_free(cgrp); out_stat_exit: css_rstat_exit(&cgrp->self); out_kernfs_remove: kernfs_remove(cgrp->kn); out_cancel_ref: percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: kfree(cgrp); return ERR_PTR(ret); } static bool cgroup_check_hierarchy_limits(struct cgroup *parent) { struct cgroup *cgroup; int ret = false; int level = 0; lockdep_assert_held(&cgroup_mutex); for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) { if (cgroup->nr_descendants >= cgroup->max_descendants) goto fail; if (level >= cgroup->max_depth) goto fail; level++; } ret = true; fail: return ret; } int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { struct cgroup *parent, *cgrp; int ret; /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */ if (strchr(name, '\n')) return -EINVAL; parent = cgroup_kn_lock_live(parent_kn, false); if (!parent) return -ENODEV; if (!cgroup_check_hierarchy_limits(parent)) { ret = -EAGAIN; goto out_unlock; } cgrp = cgroup_create(parent, name, mode); if (IS_ERR(cgrp)) { ret = PTR_ERR(cgrp); goto out_unlock; } /* * This extra ref will be put in css_free_rwork_fn() and guarantees * that @cgrp->kn is always accessible. */ kernfs_get(cgrp->kn); ret = css_populate_dir(&cgrp->self); if (ret) goto out_destroy; ret = cgroup_apply_control_enable(cgrp); if (ret) goto out_destroy; TRACE_CGROUP_PATH(mkdir, cgrp); /* let's create and online css's */ kernfs_activate(cgrp->kn); ret = 0; goto out_unlock; out_destroy: cgroup_destroy_locked(cgrp); out_unlock: cgroup_kn_unlock(parent_kn); return ret; } /* * This is called when the refcnt of a css is confirmed to be killed. * css_tryget_online() is now guaranteed to fail. Tell the subsystem to * initiate destruction and put the css ref from kill_css(). */ static void css_killed_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, destroy_work); cgroup_lock(); do { offline_css(css); css_put(css); /* @css can't go away while we're holding cgroup_mutex */ css = css->parent; } while (css && atomic_dec_and_test(&css->online_cnt)); cgroup_unlock(); } /* css kill confirmation processing requires process context, bounce */ static void css_killed_ref_fn(struct percpu_ref *ref) { struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); if (atomic_dec_and_test(&css->online_cnt)) { INIT_WORK(&css->destroy_work, css_killed_work_fn); queue_work(cgroup_offline_wq, &css->destroy_work); } } /** * kill_css - destroy a css * @css: css to destroy * * This function initiates destruction of @css by removing cgroup interface * files and putting its base reference. ->css_offline() will be invoked * asynchronously once css_tryget_online() is guaranteed to fail and when * the reference count reaches zero, @css will be released. */ static void kill_css(struct cgroup_subsys_state *css) { lockdep_assert_held(&cgroup_mutex); if (css->flags & CSS_DYING) return; /* * Call css_killed(), if defined, before setting the CSS_DYING flag */ if (css->ss->css_killed) css->ss->css_killed(css); css->flags |= CSS_DYING; /* * This must happen before css is disassociated with its cgroup. * See seq_css() for details. */ css_clear_dir(css); /* * Killing would put the base ref, but we need to keep it alive * until after ->css_offline(). */ css_get(css); /* * cgroup core guarantees that, by the time ->css_offline() is * invoked, no new css reference will be given out via * css_tryget_online(). We can't simply call percpu_ref_kill() and * proceed to offlining css's because percpu_ref_kill() doesn't * guarantee that the ref is seen as killed on all CPUs on return. * * Use percpu_ref_kill_and_confirm() to get notifications as each * css is confirmed to be seen as killed on all CPUs. */ percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); } /** * cgroup_destroy_locked - the first stage of cgroup destruction * @cgrp: cgroup to be destroyed * * css's make use of percpu refcnts whose killing latency shouldn't be * exposed to userland and are RCU protected. Also, cgroup core needs to * guarantee that css_tryget_online() won't succeed by the time * ->css_offline() is invoked. To satisfy all the requirements, * destruction is implemented in the following two steps. * * s1. Verify @cgrp can be destroyed and mark it dying. Remove all * userland visible parts and start killing the percpu refcnts of * css's. Set up so that the next stage will be kicked off once all * the percpu refcnts are confirmed to be killed. * * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the * rest of destruction. Once all cgroup references are gone, the * cgroup is RCU-freed. * * This function implements s1. After this step, @cgrp is gone as far as * the userland is concerned and a new cgroup with the same name may be * created. As cgroup doesn't care about the names internally, this * doesn't cause any problem. */ static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct cgroup *tcgrp, *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *css; struct cgrp_cset_link *link; int ssid, ret; lockdep_assert_held(&cgroup_mutex); /* * Only migration can raise populated from zero and we're already * holding cgroup_mutex. */ if (cgroup_is_populated(cgrp)) return -EBUSY; /* * Make sure there's no live children. We can't test emptiness of * ->self.children as dead children linger on it while being * drained; otherwise, "rmdir parent/child parent" may fail. */ if (css_has_online_children(&cgrp->self)) return -EBUSY; /* * Mark @cgrp and the associated csets dead. The former prevents * further task migration and child creation by disabling * cgroup_kn_lock_live(). The latter makes the csets ignored by * the migration path. */ cgrp->self.flags &= ~CSS_ONLINE; spin_lock_irq(&css_set_lock); list_for_each_entry(link, &cgrp->cset_links, cset_link) link->cset->dead = true; spin_unlock_irq(&css_set_lock); /* initiate massacre of all css's */ for_each_css(css, ssid, cgrp) kill_css(css); /* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */ css_clear_dir(&cgrp->self); kernfs_remove(cgrp->kn); if (cgroup_is_threaded(cgrp)) parent->nr_threaded_children--; spin_lock_irq(&css_set_lock); for (tcgrp = parent; tcgrp; tcgrp = cgroup_parent(tcgrp)) { tcgrp->nr_descendants--; tcgrp->nr_dying_descendants++; /* * If the dying cgroup is frozen, decrease frozen descendants * counters of ancestor cgroups. */ if (test_bit(CGRP_FROZEN, &cgrp->flags)) tcgrp->freezer.nr_frozen_descendants--; } spin_unlock_irq(&css_set_lock); cgroup1_check_for_release(parent); ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier, CGROUP_LIFETIME_OFFLINE, cgrp); WARN_ON_ONCE(notifier_to_errno(ret)); /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); return 0; }; int cgroup_rmdir(struct kernfs_node *kn) { struct cgroup *cgrp; int ret = 0; cgrp = cgroup_kn_lock_live(kn, false); if (!cgrp) return 0; ret = cgroup_destroy_locked(cgrp); if (!ret) TRACE_CGROUP_PATH(rmdir, cgrp); cgroup_kn_unlock(kn); return ret; } static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { .show_options = cgroup_show_options, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .show_path = cgroup_show_path, }; static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) { struct cgroup_subsys_state *css; pr_debug("Initializing cgroup subsys %s\n", ss->name); cgroup_lock(); idr_init(&ss->css_idr); INIT_LIST_HEAD(&ss->cfts); /* Create the root cgroup state for this subsystem */ ss->root = &cgrp_dfl_root; css = ss->css_alloc(NULL); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_and_link_css(css, ss, &cgrp_dfl_root.cgrp); /* * Root csses are never destroyed and we can't initialize * percpu_ref during early init. Disable refcnting. */ css->flags |= CSS_NO_REF; if (early) { /* allocation can't be done safely during early init */ css->id = 1; } else { css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); BUG_ON(css->id < 0); BUG_ON(ss_rstat_init(ss)); BUG_ON(css_rstat_init(css)); } /* Update the init_css_set to contain a subsys * pointer to this state - since the subsystem is * newly registered, all tasks and hence the * init_css_set is in the subsystem's root cgroup. */ init_css_set.subsys[ss->id] = css; have_fork_callback |= (bool)ss->fork << ss->id; have_exit_callback |= (bool)ss->exit << ss->id; have_release_callback |= (bool)ss->release << ss->id; have_canfork_callback |= (bool)ss->can_fork << ss->id; /* At system boot, before all subsystems have been * registered, no tasks have been forked, so we don't * need to invoke fork callbacks here. */ BUG_ON(!list_empty(&init_task.tasks)); BUG_ON(online_css(css)); cgroup_unlock(); } /** * cgroup_init_early - cgroup initialization at system boot * * Initialize cgroups at system boot, and initialize any * subsystems that request early init. */ int __init cgroup_init_early(void) { static struct cgroup_fs_context __initdata ctx; struct cgroup_subsys *ss; int i; ctx.root = &cgrp_dfl_root; init_cgroup_root(&ctx); cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF; RCU_INIT_POINTER(init_task.cgroups, &init_css_set); for_each_subsys(ss, i) { WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id, "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n", i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free, ss->id, ss->name); WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN, "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]); WARN(ss->early_init && ss->css_rstat_flush, "cgroup rstat cannot be used with early init subsystem\n"); ss->id = i; ss->name = cgroup_subsys_name[i]; if (!ss->legacy_name) ss->legacy_name = cgroup_subsys_name[i]; if (ss->early_init) cgroup_init_subsys(ss, true); } return 0; } /** * cgroup_init - cgroup initialization * * Register cgroup filesystem and /proc file, and initialize * any subsystems that didn't request early init. */ int __init cgroup_init(void) { struct cgroup_subsys *ss; int ssid; BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); BUG_ON(ss_rstat_init(NULL)); get_user_ns(init_cgroup_ns.user_ns); cgroup_rt_init(); cgroup_lock(); /* * Add init_css_set to the hash table so that dfl_root can link to * it during init. */ hash_add(css_set_table, &init_css_set.hlist, css_set_hash(init_css_set.subsys)); cgroup_bpf_lifetime_notifier_init(); BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); cgroup_unlock(); for_each_subsys(ss, ssid) { if (ss->early_init) { struct cgroup_subsys_state *css = init_css_set.subsys[ss->id]; css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); BUG_ON(css->id < 0); } else { cgroup_init_subsys(ss, false); } list_add_tail(&init_css_set.e_cset_node[ssid], &cgrp_dfl_root.cgrp.e_csets[ssid]); /* * Setting dfl_root subsys_mask needs to consider the * disabled flag and cftype registration needs kmalloc, * both of which aren't available during early_init. */ if (!cgroup_ssid_enabled(ssid)) continue; if (cgroup1_ssid_disabled(ssid)) pr_info("Disabling %s control group subsystem in v1 mounts\n", ss->legacy_name); cgrp_dfl_root.subsys_mask |= 1 << ss->id; /* implicit controllers must be threaded too */ WARN_ON(ss->implicit_on_dfl && !ss->threaded); if (ss->implicit_on_dfl) cgrp_dfl_implicit_ss_mask |= 1 << ss->id; else if (!ss->dfl_cftypes) cgrp_dfl_inhibit_ss_mask |= 1 << ss->id; if (ss->threaded) cgrp_dfl_threaded_ss_mask |= 1 << ss->id; if (ss->dfl_cftypes == ss->legacy_cftypes) { WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes)); } else { WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes)); WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes)); } if (ss->bind) ss->bind(init_css_set.subsys[ssid]); cgroup_lock(); css_populate_dir(init_css_set.subsys[ssid]); cgroup_unlock(); } /* init_css_set.subsys[] has been updated, re-hash */ hash_del(&init_css_set.hlist); hash_add(css_set_table, &init_css_set.hlist, css_set_hash(init_css_set.subsys)); WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup")); WARN_ON(register_filesystem(&cgroup_fs_type)); WARN_ON(register_filesystem(&cgroup2_fs_type)); WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show)); #ifdef CONFIG_CPUSETS_V1 WARN_ON(register_filesystem(&cpuset_fs_type)); #endif ns_tree_add(&init_cgroup_ns); return 0; } static int __init cgroup_wq_init(void) { /* * There isn't much point in executing destruction path in * parallel. Good chunk is serialized with cgroup_mutex anyway. * Use 1 for @max_active. * * We would prefer to do this in cgroup_init() above, but that * is called before init_workqueues(): so leave this until after. */ cgroup_offline_wq = alloc_workqueue("cgroup_offline", WQ_PERCPU, 1); BUG_ON(!cgroup_offline_wq); cgroup_release_wq = alloc_workqueue("cgroup_release", WQ_PERCPU, 1); BUG_ON(!cgroup_release_wq); cgroup_free_wq = alloc_workqueue("cgroup_free", WQ_PERCPU, 1); BUG_ON(!cgroup_free_wq); return 0; } core_initcall(cgroup_wq_init); void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) { struct kernfs_node *kn; kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id); if (!kn) return; kernfs_path(kn, buf, buflen); kernfs_put(kn); } /* * __cgroup_get_from_id : get the cgroup associated with cgroup id * @id: cgroup id * On success return the cgrp or ERR_PTR on failure * There are no cgroup NS restrictions. */ struct cgroup *__cgroup_get_from_id(u64 id) { struct kernfs_node *kn; struct cgroup *cgrp; kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id); if (!kn) return ERR_PTR(-ENOENT); if (kernfs_type(kn) != KERNFS_DIR) { kernfs_put(kn); return ERR_PTR(-ENOENT); } rcu_read_lock(); cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); if (cgrp && !cgroup_tryget(cgrp)) cgrp = NULL; rcu_read_unlock(); kernfs_put(kn); if (!cgrp) return ERR_PTR(-ENOENT); return cgrp; } /* * cgroup_get_from_id : get the cgroup associated with cgroup id * @id: cgroup id * On success return the cgrp or ERR_PTR on failure * Only cgroups within current task's cgroup NS are valid. */ struct cgroup *cgroup_get_from_id(u64 id) { struct cgroup *cgrp, *root_cgrp; cgrp = __cgroup_get_from_id(id); if (IS_ERR(cgrp)) return cgrp; root_cgrp = current_cgns_cgroup_dfl(); if (!cgroup_is_descendant(cgrp, root_cgrp)) { cgroup_put(cgrp); return ERR_PTR(-ENOENT); } return cgrp; } EXPORT_SYMBOL_GPL(cgroup_get_from_id); /* * proc_cgroup_show() * - Print task's cgroup paths into seq_file, one line for each hierarchy * - Used for /proc/<pid>/cgroup. */ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk) { char *buf; int retval; struct cgroup_root *root; retval = -ENOMEM; buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) goto out; rcu_read_lock(); spin_lock_irq(&css_set_lock); for_each_root(root) { struct cgroup_subsys *ss; struct cgroup *cgrp; int ssid, count = 0; if (root == &cgrp_dfl_root && !READ_ONCE(cgrp_dfl_visible)) continue; cgrp = task_cgroup_from_root(tsk, root); /* The root has already been unmounted. */ if (!cgrp) continue; seq_printf(m, "%d:", root->hierarchy_id); if (root != &cgrp_dfl_root) for_each_subsys(ss, ssid) if (root->subsys_mask & (1 << ssid)) seq_printf(m, "%s%s", count++ ? "," : "", ss->legacy_name); if (strlen(root->name)) seq_printf(m, "%sname=%s", count ? "," : "", root->name); seq_putc(m, ':'); /* * On traditional hierarchies, all zombie tasks show up as * belonging to the root cgroup. On the default hierarchy, * while a zombie doesn't show up in "cgroup.procs" and * thus can't be migrated, its /proc/PID/cgroup keeps * reporting the cgroup it belonged to before exiting. If * the cgroup is removed before the zombie is reaped, * " (deleted)" is appended to the cgroup path. */ if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, current->nsproxy->cgroup_ns); if (retval == -E2BIG) retval = -ENAMETOOLONG; if (retval < 0) goto out_unlock; seq_puts(m, buf); } else { seq_puts(m, "/"); } if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp)) seq_puts(m, " (deleted)\n"); else seq_putc(m, '\n'); } retval = 0; out_unlock: spin_unlock_irq(&css_set_lock); rcu_read_unlock(); kfree(buf); out: return retval; } /** * cgroup_fork - initialize cgroup related fields during copy_process() * @child: pointer to task_struct of forking parent process. * * A task is associated with the init_css_set until cgroup_post_fork() * attaches it to the target css_set. */ void cgroup_fork(struct task_struct *child) { RCU_INIT_POINTER(child->cgroups, &init_css_set); INIT_LIST_HEAD(&child->cg_list); } /** * cgroup_v1v2_get_from_file - get a cgroup pointer from a file pointer * @f: file corresponding to cgroup_dir * * Find the cgroup from a file pointer associated with a cgroup directory. * Returns a pointer to the cgroup on success. ERR_PTR is returned if the * cgroup cannot be found. */ static struct cgroup *cgroup_v1v2_get_from_file(struct file *f) { struct cgroup_subsys_state *css; css = css_tryget_online_from_dir(f->f_path.dentry, NULL); if (IS_ERR(css)) return ERR_CAST(css); return css->cgroup; } /** * cgroup_get_from_file - same as cgroup_v1v2_get_from_file, but only supports * cgroup2. * @f: file corresponding to cgroup2_dir */ static struct cgroup *cgroup_get_from_file(struct file *f) { struct cgroup *cgrp = cgroup_v1v2_get_from_file(f); if (IS_ERR(cgrp)) return ERR_CAST(cgrp); if (!cgroup_on_dfl(cgrp)) { cgroup_put(cgrp); return ERR_PTR(-EBADF); } return cgrp; } /** * cgroup_css_set_fork - find or create a css_set for a child process * @kargs: the arguments passed to create the child process * * This functions finds or creates a new css_set which the child * process will be attached to in cgroup_post_fork(). By default, * the child process will be given the same css_set as its parent. * * If CLONE_INTO_CGROUP is specified this function will try to find an * existing css_set which includes the requested cgroup and if not create * a new css_set that the child will be attached to later. If this function * succeeds it will hold cgroup_threadgroup_rwsem on return. If * CLONE_INTO_CGROUP is requested this function will grab cgroup mutex * before grabbing cgroup_threadgroup_rwsem and will hold a reference * to the target cgroup. */ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) __acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem) { int ret; struct cgroup *dst_cgrp = NULL; struct css_set *cset; struct super_block *sb; if (kargs->flags & CLONE_INTO_CGROUP) cgroup_lock(); cgroup_threadgroup_change_begin(current); spin_lock_irq(&css_set_lock); cset = task_css_set(current); get_css_set(cset); if (kargs->cgrp) kargs->kill_seq = kargs->cgrp->kill_seq; else kargs->kill_seq = cset->dfl_cgrp->kill_seq; spin_unlock_irq(&css_set_lock); if (!(kargs->flags & CLONE_INTO_CGROUP)) { kargs->cset = cset; return 0; } CLASS(fd_raw, f)(kargs->cgroup); if (fd_empty(f)) { ret = -EBADF; goto err; } sb = fd_file(f)->f_path.dentry->d_sb; dst_cgrp = cgroup_get_from_file(fd_file(f)); if (IS_ERR(dst_cgrp)) { ret = PTR_ERR(dst_cgrp); dst_cgrp = NULL; goto err; } if (cgroup_is_dead(dst_cgrp)) { ret = -ENODEV; goto err; } /* * Verify that we the target cgroup is writable for us. This is * usually done by the vfs layer but since we're not going through * the vfs layer here we need to do it "manually". */ ret = cgroup_may_write(dst_cgrp, sb); if (ret) goto err; /* * Spawning a task directly into a cgroup works by passing a file * descriptor to the target cgroup directory. This can even be an O_PATH * file descriptor. But it can never be a cgroup.procs file descriptor. * This was done on purpose so spawning into a cgroup could be * conceptualized as an atomic * * fd = openat(dfd_cgroup, "cgroup.procs", ...); * write(fd, <child-pid>, ...); * * sequence, i.e. it's a shorthand for the caller opening and writing * cgroup.procs of the cgroup indicated by @dfd_cgroup. This allows us * to always use the caller's credentials. */ ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb, !(kargs->flags & CLONE_THREAD), current->nsproxy->cgroup_ns); if (ret) goto err; kargs->cset = find_css_set(cset, dst_cgrp); if (!kargs->cset) { ret = -ENOMEM; goto err; } put_css_set(cset); kargs->cgrp = dst_cgrp; return ret; err: cgroup_threadgroup_change_end(current); cgroup_unlock(); if (dst_cgrp) cgroup_put(dst_cgrp); put_css_set(cset); if (kargs->cset) put_css_set(kargs->cset); return ret; } /** * cgroup_css_set_put_fork - drop references we took during fork * @kargs: the arguments passed to create the child process * * Drop references to the prepared css_set and target cgroup if * CLONE_INTO_CGROUP was requested. */ static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs) __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) { struct cgroup *cgrp = kargs->cgrp; struct css_set *cset = kargs->cset; cgroup_threadgroup_change_end(current); if (cset) { put_css_set(cset); kargs->cset = NULL; } if (kargs->flags & CLONE_INTO_CGROUP) { cgroup_unlock(); if (cgrp) { cgroup_put(cgrp); kargs->cgrp = NULL; } } } /** * cgroup_can_fork - called on a new task before the process is exposed * @child: the child process * @kargs: the arguments passed to create the child process * * This prepares a new css_set for the child process which the child will * be attached to in cgroup_post_fork(). * This calls the subsystem can_fork() callbacks. If the cgroup_can_fork() * callback returns an error, the fork aborts with that error code. This * allows for a cgroup subsystem to conditionally allow or deny new forks. */ int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs) { struct cgroup_subsys *ss; int i, j, ret; ret = cgroup_css_set_fork(kargs); if (ret) return ret; do_each_subsys_mask(ss, i, have_canfork_callback) { ret = ss->can_fork(child, kargs->cset); if (ret) goto out_revert; } while_each_subsys_mask(); return 0; out_revert: for_each_subsys(ss, j) { if (j >= i) break; if (ss->cancel_fork) ss->cancel_fork(child, kargs->cset); } cgroup_css_set_put_fork(kargs); return ret; } /** * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork() * @child: the child process * @kargs: the arguments passed to create the child process * * This calls the cancel_fork() callbacks if a fork failed *after* * cgroup_can_fork() succeeded and cleans up references we took to * prepare a new css_set for the child process in cgroup_can_fork(). */ void cgroup_cancel_fork(struct task_struct *child, struct kernel_clone_args *kargs) { struct cgroup_subsys *ss; int i; for_each_subsys(ss, i) if (ss->cancel_fork) ss->cancel_fork(child, kargs->cset); cgroup_css_set_put_fork(kargs); } /** * cgroup_post_fork - finalize cgroup setup for the child process * @child: the child process * @kargs: the arguments passed to create the child process * * Attach the child process to its css_set calling the subsystem fork() * callbacks. */ void cgroup_post_fork(struct task_struct *child, struct kernel_clone_args *kargs) __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) { unsigned int cgrp_kill_seq = 0; unsigned long cgrp_flags = 0; bool kill = false; struct cgroup_subsys *ss; struct css_set *cset; int i; cset = kargs->cset; kargs->cset = NULL; spin_lock_irq(&css_set_lock); /* init tasks are special, only link regular threads */ if (likely(child->pid)) { if (kargs->cgrp) { cgrp_flags = kargs->cgrp->flags; cgrp_kill_seq = kargs->cgrp->kill_seq; } else { cgrp_flags = cset->dfl_cgrp->flags; cgrp_kill_seq = cset->dfl_cgrp->kill_seq; } WARN_ON_ONCE(!list_empty(&child->cg_list)); cset->nr_tasks++; css_set_move_task(child, NULL, cset, false); } else { put_css_set(cset); cset = NULL; } if (!(child->flags & PF_KTHREAD)) { if (unlikely(test_bit(CGRP_FREEZE, &cgrp_flags))) { /* * If the cgroup has to be frozen, the new task has * too. Let's set the JOBCTL_TRAP_FREEZE jobctl bit to * get the task into the frozen state. */ spin_lock(&child->sighand->siglock); WARN_ON_ONCE(child->frozen); child->jobctl |= JOBCTL_TRAP_FREEZE; spin_unlock(&child->sighand->siglock); /* * Calling cgroup_update_frozen() isn't required here, * because it will be called anyway a bit later from * do_freezer_trap(). So we avoid cgroup's transient * switch from the frozen state and back. */ } /* * If the cgroup is to be killed notice it now and take the * child down right after we finished preparing it for * userspace. */ kill = kargs->kill_seq != cgrp_kill_seq; } spin_unlock_irq(&css_set_lock); /* * Call ss->fork(). This must happen after @child is linked on * css_set; otherwise, @child might change state between ->fork() * and addition to css_set. */ do_each_subsys_mask(ss, i, have_fork_callback) { ss->fork(child); } while_each_subsys_mask(); /* Make the new cset the root_cset of the new cgroup namespace. */ if (kargs->flags & CLONE_NEWCGROUP) { struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset; get_css_set(cset); child->nsproxy->cgroup_ns->root_cset = cset; put_css_set(rcset); } /* Cgroup has to be killed so take down child immediately. */ if (unlikely(kill)) do_send_sig_info(SIGKILL, SEND_SIG_NOINFO, child, PIDTYPE_TGID); cgroup_css_set_put_fork(kargs); } /** * cgroup_task_exit - detach cgroup from exiting task * @tsk: pointer to task_struct of exiting process * * Description: Detach cgroup from @tsk. * */ void cgroup_task_exit(struct task_struct *tsk) { struct cgroup_subsys *ss; int i; /* see cgroup_post_fork() for details */ do_each_subsys_mask(ss, i, have_exit_callback) { ss->exit(tsk); } while_each_subsys_mask(); } static void do_cgroup_task_dead(struct task_struct *tsk) { struct css_set *cset; unsigned long flags; spin_lock_irqsave(&css_set_lock, flags); WARN_ON_ONCE(list_empty(&tsk->cg_list)); cset = task_css_set(tsk); css_set_move_task(tsk, cset, NULL, false); cset->nr_tasks--; /* matches the signal->live check in css_task_iter_advance() */ if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live)) list_add_tail(&tsk->cg_list, &cset->dying_tasks); if (dl_task(tsk)) dec_dl_tasks_cs(tsk); WARN_ON_ONCE(cgroup_task_frozen(tsk)); if (unlikely(!(tsk->flags & PF_KTHREAD) && test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags))) cgroup_update_frozen(task_dfl_cgroup(tsk)); spin_unlock_irqrestore(&css_set_lock, flags); } #ifdef CONFIG_PREEMPT_RT /* * cgroup_task_dead() is called from finish_task_switch() which doesn't allow * scheduling even in RT. As the task_dead path requires grabbing css_set_lock, * this lead to sleeping in the invalid context warning bug. css_set_lock is too * big to become a raw_spinlock. The task_dead path doesn't need to run * synchronously but can't be delayed indefinitely either as the dead task pins * the cgroup and task_struct can be pinned indefinitely. Bounce through lazy * irq_work to allow batching while ensuring timely completion. */ static DEFINE_PER_CPU(struct llist_head, cgrp_dead_tasks); static DEFINE_PER_CPU(struct irq_work, cgrp_dead_tasks_iwork); static void cgrp_dead_tasks_iwork_fn(struct irq_work *iwork) { struct llist_node *lnode; struct task_struct *task, *next; lnode = llist_del_all(this_cpu_ptr(&cgrp_dead_tasks)); llist_for_each_entry_safe(task, next, lnode, cg_dead_lnode) { do_cgroup_task_dead(task); put_task_struct(task); } } static void __init cgroup_rt_init(void) { int cpu; for_each_possible_cpu(cpu) { init_llist_head(per_cpu_ptr(&cgrp_dead_tasks, cpu)); per_cpu(cgrp_dead_tasks_iwork, cpu) = IRQ_WORK_INIT_LAZY(cgrp_dead_tasks_iwork_fn); } } void cgroup_task_dead(struct task_struct *task) { get_task_struct(task); llist_add(&task->cg_dead_lnode, this_cpu_ptr(&cgrp_dead_tasks)); irq_work_queue(this_cpu_ptr(&cgrp_dead_tasks_iwork)); } #else /* CONFIG_PREEMPT_RT */ static void __init cgroup_rt_init(void) {} void cgroup_task_dead(struct task_struct *task) { do_cgroup_task_dead(task); } #endif /* CONFIG_PREEMPT_RT */ void cgroup_task_release(struct task_struct *task) { struct cgroup_subsys *ss; int ssid; do_each_subsys_mask(ss, ssid, have_release_callback) { ss->release(task); } while_each_subsys_mask(); } void cgroup_task_free(struct task_struct *task) { struct css_set *cset = task_css_set(task); if (!list_empty(&task->cg_list)) { spin_lock_irq(&css_set_lock); css_set_skip_task_iters(task_css_set(task), task); list_del_init(&task->cg_list); spin_unlock_irq(&css_set_lock); } put_css_set(cset); } static int __init cgroup_disable(char *str) { struct cgroup_subsys *ss; char *token; int i; while ((token = strsep(&str, ",")) != NULL) { if (!*token) continue; for_each_subsys(ss, i) { if (strcmp(token, ss->name) && strcmp(token, ss->legacy_name)) continue; static_branch_disable(cgroup_subsys_enabled_key[i]); pr_info("Disabling %s control group subsystem\n", ss->name); } for (i = 0; i < OPT_FEATURE_COUNT; i++) { if (strcmp(token, cgroup_opt_feature_names[i])) continue; cgroup_feature_disable_mask |= 1 << i; pr_info("Disabling %s control group feature\n", cgroup_opt_feature_names[i]); break; } } return 1; } __setup("cgroup_disable=", cgroup_disable); void __init __weak enable_debug_cgroup(void) { } static int __init enable_cgroup_debug(char *str) { cgroup_debug = true; enable_debug_cgroup(); return 1; } __setup("cgroup_debug", enable_cgroup_debug); static int __init cgroup_favordynmods_setup(char *str) { return (kstrtobool(str, &have_favordynmods) == 0); } __setup("cgroup_favordynmods=", cgroup_favordynmods_setup); /** * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest * @ss: subsystem of interest * * If @dentry is a directory for a cgroup which has @ss enabled on it, try * to get the corresponding css and return it. If such css doesn't exist * or can't be pinned, an ERR_PTR value is returned. */ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys *ss) { struct kernfs_node *kn = kernfs_node_from_dentry(dentry); struct file_system_type *s_type = dentry->d_sb->s_type; struct cgroup_subsys_state *css = NULL; struct cgroup *cgrp; /* is @dentry a cgroup dir? */ if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) || !kn || kernfs_type(kn) != KERNFS_DIR) return ERR_PTR(-EBADF); rcu_read_lock(); /* * This path doesn't originate from kernfs and @kn could already * have been or be removed at any point. @kn->priv is RCU * protected for this access. See css_release_work_fn() for details. */ cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); if (cgrp) css = cgroup_css(cgrp, ss); if (!css || !css_tryget_online(css)) css = ERR_PTR(-ENOENT); rcu_read_unlock(); return css; } /** * css_from_id - lookup css by id * @id: the cgroup id * @ss: cgroup subsys to be looked into * * Returns the css if there's valid one with @id, otherwise returns NULL. * Should be called under rcu_read_lock(). */ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) { WARN_ON_ONCE(!rcu_read_lock_held()); return idr_find(&ss->css_idr, id); } /** * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path * @path: path on the default hierarchy * * Find the cgroup at @path on the default hierarchy, increment its * reference count and return it. Returns pointer to the found cgroup on * success, ERR_PTR(-ENOENT) if @path doesn't exist or if the cgroup has already * been released and ERR_PTR(-ENOTDIR) if @path points to a non-directory. */ struct cgroup *cgroup_get_from_path(const char *path) { struct kernfs_node *kn; struct cgroup *cgrp = ERR_PTR(-ENOENT); struct cgroup *root_cgrp; root_cgrp = current_cgns_cgroup_dfl(); kn = kernfs_walk_and_get(root_cgrp->kn, path); if (!kn) goto out; if (kernfs_type(kn) != KERNFS_DIR) { cgrp = ERR_PTR(-ENOTDIR); goto out_kernfs; } rcu_read_lock(); cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); if (!cgrp || !cgroup_tryget(cgrp)) cgrp = ERR_PTR(-ENOENT); rcu_read_unlock(); out_kernfs: kernfs_put(kn); out: return cgrp; } EXPORT_SYMBOL_GPL(cgroup_get_from_path); /** * cgroup_v1v2_get_from_fd - get a cgroup pointer from a fd * @fd: fd obtained by open(cgroup_dir) * * Find the cgroup from a fd which should be obtained * by opening a cgroup directory. Returns a pointer to the * cgroup on success. ERR_PTR is returned if the cgroup * cannot be found. */ struct cgroup *cgroup_v1v2_get_from_fd(int fd) { CLASS(fd_raw, f)(fd); if (fd_empty(f)) return ERR_PTR(-EBADF); return cgroup_v1v2_get_from_file(fd_file(f)); } /** * cgroup_get_from_fd - same as cgroup_v1v2_get_from_fd, but only supports * cgroup2. * @fd: fd obtained by open(cgroup2_dir) */ struct cgroup *cgroup_get_from_fd(int fd) { struct cgroup *cgrp = cgroup_v1v2_get_from_fd(fd); if (IS_ERR(cgrp)) return ERR_CAST(cgrp); if (!cgroup_on_dfl(cgrp)) { cgroup_put(cgrp); return ERR_PTR(-EBADF); } return cgrp; } EXPORT_SYMBOL_GPL(cgroup_get_from_fd); static u64 power_of_ten(int power) { u64 v = 1; while (power--) v *= 10; return v; } /** * cgroup_parse_float - parse a floating number * @input: input string * @dec_shift: number of decimal digits to shift * @v: output * * Parse a decimal floating point number in @input and store the result in * @v with decimal point right shifted @dec_shift times. For example, if * @input is "12.3456" and @dec_shift is 3, *@v will be set to 12345. * Returns 0 on success, -errno otherwise. * * There's nothing cgroup specific about this function except that it's * currently the only user. */ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v) { s64 whole, frac = 0; int fstart = 0, fend = 0, flen; if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend)) return -EINVAL; if (frac < 0) return -EINVAL; flen = fend > fstart ? fend - fstart : 0; if (flen < dec_shift) frac *= power_of_ten(dec_shift - flen); else frac = DIV_ROUND_CLOSEST_ULL(frac, power_of_ten(flen - dec_shift)); *v = whole * power_of_ten(dec_shift) + frac; return 0; } /* * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data * definition in cgroup-defs.h. */ #ifdef CONFIG_SOCK_CGROUP_DATA void cgroup_sk_alloc(struct sock_cgroup_data *skcd) { struct cgroup *cgroup; rcu_read_lock(); /* Don't associate the sock with unrelated interrupted task's cgroup. */ if (in_interrupt()) { cgroup = &cgrp_dfl_root.cgrp; cgroup_get(cgroup); goto out; } while (true) { struct css_set *cset; cset = task_css_set(current); if (likely(cgroup_tryget(cset->dfl_cgrp))) { cgroup = cset->dfl_cgrp; break; } cpu_relax(); } out: skcd->cgroup = cgroup; cgroup_bpf_get(cgroup); rcu_read_unlock(); } void cgroup_sk_clone(struct sock_cgroup_data *skcd) { struct cgroup *cgrp = sock_cgroup_ptr(skcd); /* * We might be cloning a socket which is left in an empty * cgroup and the cgroup might have already been rmdir'd. * Don't use cgroup_get_live(). */ cgroup_get(cgrp); cgroup_bpf_get(cgrp); } void cgroup_sk_free(struct sock_cgroup_data *skcd) { struct cgroup *cgrp = sock_cgroup_ptr(skcd); cgroup_bpf_put(cgrp); cgroup_put(cgrp); } #endif /* CONFIG_SOCK_CGROUP_DATA */ #ifdef CONFIG_SYSFS static ssize_t show_delegatable_files(struct cftype *files, char *buf, ssize_t size, const char *prefix) { struct cftype *cft; ssize_t ret = 0; for (cft = files; cft && cft->name[0] != '\0'; cft++) { if (!(cft->flags & CFTYPE_NS_DELEGATABLE)) continue; if (prefix) ret += snprintf(buf + ret, size - ret, "%s.", prefix); ret += snprintf(buf + ret, size - ret, "%s\n", cft->name); if (WARN_ON(ret >= size)) break; } return ret; } static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct cgroup_subsys *ss; int ssid; ssize_t ret = 0; ret = show_delegatable_files(cgroup_base_files, buf + ret, PAGE_SIZE - ret, NULL); if (cgroup_psi_enabled()) ret += show_delegatable_files(cgroup_psi_files, buf + ret, PAGE_SIZE - ret, NULL); for_each_subsys(ss, ssid) ret += show_delegatable_files(ss->dfl_cftypes, buf + ret, PAGE_SIZE - ret, cgroup_subsys_name[ssid]); return ret; } static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate); static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return snprintf(buf, PAGE_SIZE, "nsdelegate\n" "favordynmods\n" "memory_localevents\n" "memory_recursiveprot\n" "memory_hugetlb_accounting\n" "pids_localevents\n"); } static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); static struct attribute *cgroup_sysfs_attrs[] = { &cgroup_delegate_attr.attr, &cgroup_features_attr.attr, NULL, }; static const struct attribute_group cgroup_sysfs_attr_group = { .attrs = cgroup_sysfs_attrs, .name = "cgroup", }; static int __init cgroup_sysfs_init(void) { return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group); } subsys_initcall(cgroup_sysfs_init); #endif /* CONFIG_SYSFS */
13 7 6 1 12 10 1 9 3 3 3 1 2 1 7 2 2 2 7 6 6 6 1 6 1 6 6 6 2 13 1 5 5 5 5 5 5 5 1 1 5 1 5 5 5 5 5 2 1 95 95 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 // SPDX-License-Identifier: GPL-2.0+ /* net/sched/act_ctinfo.c netfilter ctinfo connmark actions * * Copyright (c) 2019 Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk> */ #include <linux/module.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/pkt_cls.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/act_api.h> #include <net/pkt_cls.h> #include <uapi/linux/tc_act/tc_ctinfo.h> #include <net/tc_act/tc_ctinfo.h> #include <net/tc_wrapper.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_zones.h> static struct tc_action_ops act_ctinfo_ops; static void tcf_ctinfo_dscp_set(struct nf_conn *ct, struct tcf_ctinfo *ca, struct tcf_ctinfo_params *cp, struct sk_buff *skb, int wlen, int proto) { u8 dscp, newdscp; newdscp = (((READ_ONCE(ct->mark) & cp->dscpmask) >> cp->dscpmaskshift) << 2) & ~INET_ECN_MASK; switch (proto) { case NFPROTO_IPV4: dscp = ipv4_get_dsfield(ip_hdr(skb)) & ~INET_ECN_MASK; if (dscp != newdscp) { if (likely(!skb_try_make_writable(skb, wlen))) { ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, newdscp); atomic64_inc(&ca->stats_dscp_set); } else { atomic64_inc(&ca->stats_dscp_error); } } break; case NFPROTO_IPV6: dscp = ipv6_get_dsfield(ipv6_hdr(skb)) & ~INET_ECN_MASK; if (dscp != newdscp) { if (likely(!skb_try_make_writable(skb, wlen))) { ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, newdscp); atomic64_inc(&ca->stats_dscp_set); } else { atomic64_inc(&ca->stats_dscp_error); } } break; default: break; } } static void tcf_ctinfo_cpmark_set(struct nf_conn *ct, struct tcf_ctinfo *ca, struct tcf_ctinfo_params *cp, struct sk_buff *skb) { atomic64_inc(&ca->stats_cpmark_set); skb->mark = READ_ONCE(ct->mark) & cp->cpmarkmask; } TC_INDIRECT_SCOPE int tcf_ctinfo_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { const struct nf_conntrack_tuple_hash *thash = NULL; struct tcf_ctinfo *ca = to_ctinfo(a); struct nf_conntrack_tuple tuple; struct nf_conntrack_zone zone; enum ip_conntrack_info ctinfo; struct tcf_ctinfo_params *cp; struct nf_conn *ct; int proto, wlen; cp = rcu_dereference_bh(ca->params); tcf_lastuse_update(&ca->tcf_tm); tcf_action_update_bstats(&ca->common, skb); wlen = skb_network_offset(skb); switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): wlen += sizeof(struct iphdr); if (!pskb_may_pull(skb, wlen)) goto out; proto = NFPROTO_IPV4; break; case htons(ETH_P_IPV6): wlen += sizeof(struct ipv6hdr); if (!pskb_may_pull(skb, wlen)) goto out; proto = NFPROTO_IPV6; break; default: goto out; } ct = nf_ct_get(skb, &ctinfo); if (!ct) { /* look harder, usually ingress */ if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, cp->net, &tuple)) goto out; zone.id = cp->zone; zone.dir = NF_CT_DEFAULT_ZONE_DIR; thash = nf_conntrack_find_get(cp->net, &zone, &tuple); if (!thash) goto out; ct = nf_ct_tuplehash_to_ctrack(thash); } if (cp->mode & CTINFO_MODE_DSCP) if (!cp->dscpstatemask || (READ_ONCE(ct->mark) & cp->dscpstatemask)) tcf_ctinfo_dscp_set(ct, ca, cp, skb, wlen, proto); if (cp->mode & CTINFO_MODE_CPMARK) tcf_ctinfo_cpmark_set(ct, ca, cp, skb); if (thash) nf_ct_put(ct); out: return cp->action; } static const struct nla_policy ctinfo_policy[TCA_CTINFO_MAX + 1] = { [TCA_CTINFO_ACT] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_ctinfo)), [TCA_CTINFO_ZONE] = { .type = NLA_U16 }, [TCA_CTINFO_PARMS_DSCP_MASK] = { .type = NLA_U32 }, [TCA_CTINFO_PARMS_DSCP_STATEMASK] = { .type = NLA_U32 }, [TCA_CTINFO_PARMS_CPMARK_MASK] = { .type = NLA_U32 }, }; static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_ctinfo_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; u32 dscpmask = 0, dscpstatemask, index; struct nlattr *tb[TCA_CTINFO_MAX + 1]; struct tcf_ctinfo_params *cp_new; struct tcf_chain *goto_ch = NULL; struct tc_ctinfo *actparm; struct tcf_ctinfo *ci; u8 dscpmaskshift; int ret = 0, err; if (!nla) { NL_SET_ERR_MSG_MOD(extack, "ctinfo requires attributes to be passed"); return -EINVAL; } err = nla_parse_nested(tb, TCA_CTINFO_MAX, nla, ctinfo_policy, extack); if (err < 0) return err; if (!tb[TCA_CTINFO_ACT]) { NL_SET_ERR_MSG_MOD(extack, "Missing required TCA_CTINFO_ACT attribute"); return -EINVAL; } actparm = nla_data(tb[TCA_CTINFO_ACT]); /* do some basic validation here before dynamically allocating things */ /* that we would otherwise have to clean up. */ if (tb[TCA_CTINFO_PARMS_DSCP_MASK]) { dscpmask = nla_get_u32(tb[TCA_CTINFO_PARMS_DSCP_MASK]); /* need contiguous 6 bit mask */ dscpmaskshift = dscpmask ? __ffs(dscpmask) : 0; if ((~0 & (dscpmask >> dscpmaskshift)) != 0x3f) { NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CTINFO_PARMS_DSCP_MASK], "dscp mask must be 6 contiguous bits"); return -EINVAL; } dscpstatemask = nla_get_u32_default(tb[TCA_CTINFO_PARMS_DSCP_STATEMASK], 0); /* mask & statemask must not overlap */ if (dscpmask & dscpstatemask) { NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CTINFO_PARMS_DSCP_STATEMASK], "dscp statemask must not overlap dscp mask"); return -EINVAL; } } /* done the validation:now to the actual action allocation */ index = actparm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { ret = tcf_idr_create_from_flags(tn, index, est, a, &act_ctinfo_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; } ret = ACT_P_CREATED; } else if (err > 0) { if (bind) /* don't override defaults */ return ACT_P_BOUND; if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } } else { return err; } err = tcf_action_check_ctrlact(actparm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; ci = to_ctinfo(*a); cp_new = kzalloc(sizeof(*cp_new), GFP_KERNEL); if (unlikely(!cp_new)) { err = -ENOMEM; goto put_chain; } cp_new->net = net; cp_new->zone = nla_get_u16_default(tb[TCA_CTINFO_ZONE], 0); if (dscpmask) { cp_new->dscpmask = dscpmask; cp_new->dscpmaskshift = dscpmaskshift; cp_new->dscpstatemask = dscpstatemask; cp_new->mode |= CTINFO_MODE_DSCP; } if (tb[TCA_CTINFO_PARMS_CPMARK_MASK]) { cp_new->cpmarkmask = nla_get_u32(tb[TCA_CTINFO_PARMS_CPMARK_MASK]); cp_new->mode |= CTINFO_MODE_CPMARK; } cp_new->action = actparm->action; spin_lock_bh(&ci->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, actparm->action, goto_ch); cp_new = rcu_replace_pointer(ci->params, cp_new, lockdep_is_held(&ci->tcf_lock)); spin_unlock_bh(&ci->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); if (cp_new) kfree_rcu(cp_new, rcu); return ret; put_chain: if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: tcf_idr_release(*a, bind); return err; } static int tcf_ctinfo_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { const struct tcf_ctinfo *ci = to_ctinfo(a); unsigned char *b = skb_tail_pointer(skb); const struct tcf_ctinfo_params *cp; struct tc_ctinfo opt = { .index = ci->tcf_index, .refcnt = refcount_read(&ci->tcf_refcnt) - ref, .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind, }; struct tcf_t t; rcu_read_lock(); cp = rcu_dereference(ci->params); tcf_tm_dump(&t, &ci->tcf_tm); if (nla_put_64bit(skb, TCA_CTINFO_TM, sizeof(t), &t, TCA_CTINFO_PAD)) goto nla_put_failure; opt.action = cp->action; if (nla_put(skb, TCA_CTINFO_ACT, sizeof(opt), &opt)) goto nla_put_failure; if (nla_put_u16(skb, TCA_CTINFO_ZONE, cp->zone)) goto nla_put_failure; if (cp->mode & CTINFO_MODE_DSCP) { if (nla_put_u32(skb, TCA_CTINFO_PARMS_DSCP_MASK, cp->dscpmask)) goto nla_put_failure; if (nla_put_u32(skb, TCA_CTINFO_PARMS_DSCP_STATEMASK, cp->dscpstatemask)) goto nla_put_failure; } if (cp->mode & CTINFO_MODE_CPMARK) { if (nla_put_u32(skb, TCA_CTINFO_PARMS_CPMARK_MASK, cp->cpmarkmask)) goto nla_put_failure; } if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_SET, atomic64_read(&ci->stats_dscp_set), TCA_CTINFO_PAD)) goto nla_put_failure; if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_ERROR, atomic64_read(&ci->stats_dscp_error), TCA_CTINFO_PAD)) goto nla_put_failure; if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_CPMARK_SET, atomic64_read(&ci->stats_cpmark_set), TCA_CTINFO_PAD)) goto nla_put_failure; rcu_read_unlock(); return skb->len; nla_put_failure: rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } static void tcf_ctinfo_cleanup(struct tc_action *a) { struct tcf_ctinfo *ci = to_ctinfo(a); struct tcf_ctinfo_params *cp; cp = rcu_dereference_protected(ci->params, 1); if (cp) kfree_rcu(cp, rcu); } static struct tc_action_ops act_ctinfo_ops = { .kind = "ctinfo", .id = TCA_ID_CTINFO, .owner = THIS_MODULE, .act = tcf_ctinfo_act, .dump = tcf_ctinfo_dump, .init = tcf_ctinfo_init, .cleanup= tcf_ctinfo_cleanup, .size = sizeof(struct tcf_ctinfo), }; MODULE_ALIAS_NET_ACT("ctinfo"); static __net_init int ctinfo_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_ctinfo_ops.net_id); return tc_action_net_init(net, tn, &act_ctinfo_ops); } static void __net_exit ctinfo_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_ctinfo_ops.net_id); } static struct pernet_operations ctinfo_net_ops = { .init = ctinfo_init_net, .exit_batch = ctinfo_exit_net, .id = &act_ctinfo_ops.net_id, .size = sizeof(struct tc_action_net), }; static int __init ctinfo_init_module(void) { return tcf_register_action(&act_ctinfo_ops, &ctinfo_net_ops); } static void __exit ctinfo_cleanup_module(void) { tcf_unregister_action(&act_ctinfo_ops, &ctinfo_net_ops); } module_init(ctinfo_init_module); module_exit(ctinfo_cleanup_module); MODULE_AUTHOR("Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>"); MODULE_DESCRIPTION("Connection tracking mark actions"); MODULE_LICENSE("GPL");
30 30 30 12 12 12 7 7 7 7 3 24 24 24 24 5 5 5 5 5 5 5 5 5 5 5 5 30 30 30 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 // SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2012 by Alan Stern */ /* This file is part of ehci-hcd.c */ /*-------------------------------------------------------------------------*/ /* Set a bit in the USBCMD register */ static void ehci_set_command_bit(struct ehci_hcd *ehci, u32 bit) { ehci->command |= bit; ehci_writel(ehci, ehci->command, &ehci->regs->command); /* unblock posted write */ ehci_readl(ehci, &ehci->regs->command); } /* Clear a bit in the USBCMD register */ static void ehci_clear_command_bit(struct ehci_hcd *ehci, u32 bit) { ehci->command &= ~bit; ehci_writel(ehci, ehci->command, &ehci->regs->command); /* unblock posted write */ ehci_readl(ehci, &ehci->regs->command); } /*-------------------------------------------------------------------------*/ /* * EHCI timer support... Now using hrtimers. * * Lots of different events are triggered from ehci->hrtimer. Whenever * the timer routine runs, it checks each possible event; events that are * currently enabled and whose expiration time has passed get handled. * The set of enabled events is stored as a collection of bitflags in * ehci->enabled_hrtimer_events, and they are numbered in order of * increasing delay values (ranging between 1 ms and 100 ms). * * Rather than implementing a sorted list or tree of all pending events, * we keep track only of the lowest-numbered pending event, in * ehci->next_hrtimer_event. Whenever ehci->hrtimer gets restarted, its * expiration time is set to the timeout value for this event. * * As a result, events might not get handled right away; the actual delay * could be anywhere up to twice the requested delay. This doesn't * matter, because none of the events are especially time-critical. The * ones that matter most all have a delay of 1 ms, so they will be * handled after 2 ms at most, which is okay. In addition to this, we * allow for an expiration range of 1 ms. */ /* * Delay lengths for the hrtimer event types. * Keep this list sorted by delay length, in the same order as * the event types indexed by enum ehci_hrtimer_event in ehci.h. */ static unsigned event_delays_ns[] = { 1 * NSEC_PER_MSEC, /* EHCI_HRTIMER_POLL_ASS */ 1 * NSEC_PER_MSEC, /* EHCI_HRTIMER_POLL_PSS */ 1 * NSEC_PER_MSEC, /* EHCI_HRTIMER_POLL_DEAD */ 1125 * NSEC_PER_USEC, /* EHCI_HRTIMER_UNLINK_INTR */ 2 * NSEC_PER_MSEC, /* EHCI_HRTIMER_FREE_ITDS */ 2 * NSEC_PER_MSEC, /* EHCI_HRTIMER_ACTIVE_UNLINK */ 5 * NSEC_PER_MSEC, /* EHCI_HRTIMER_START_UNLINK_INTR */ 6 * NSEC_PER_MSEC, /* EHCI_HRTIMER_ASYNC_UNLINKS */ 10 * NSEC_PER_MSEC, /* EHCI_HRTIMER_IAA_WATCHDOG */ 10 * NSEC_PER_MSEC, /* EHCI_HRTIMER_DISABLE_PERIODIC */ 15 * NSEC_PER_MSEC, /* EHCI_HRTIMER_DISABLE_ASYNC */ 100 * NSEC_PER_MSEC, /* EHCI_HRTIMER_IO_WATCHDOG */ }; /* Enable a pending hrtimer event */ static void ehci_enable_event(struct ehci_hcd *ehci, unsigned event, bool resched) { ktime_t *timeout = &ehci->hr_timeouts[event]; if (resched) *timeout = ktime_add(ktime_get(), event_delays_ns[event]); ehci->enabled_hrtimer_events |= (1 << event); /* Track only the lowest-numbered pending event */ if (event < ehci->next_hrtimer_event) { ehci->next_hrtimer_event = event; hrtimer_start_range_ns(&ehci->hrtimer, *timeout, NSEC_PER_MSEC, HRTIMER_MODE_ABS); } } /* Poll the STS_ASS status bit; see when it agrees with CMD_ASE */ static void ehci_poll_ASS(struct ehci_hcd *ehci) { unsigned actual, want; /* Don't enable anything if the controller isn't running (e.g., died) */ if (ehci->rh_state != EHCI_RH_RUNNING) return; want = (ehci->command & CMD_ASE) ? STS_ASS : 0; actual = ehci_readl(ehci, &ehci->regs->status) & STS_ASS; if (want != actual) { /* Poll again later, but give up after about 2-4 ms */ if (ehci->ASS_poll_count++ < 2) { ehci_enable_event(ehci, EHCI_HRTIMER_POLL_ASS, true); return; } ehci_dbg(ehci, "Waited too long for the async schedule status (%x/%x), giving up\n", want, actual); } ehci->ASS_poll_count = 0; /* The status is up-to-date; restart or stop the schedule as needed */ if (want == 0) { /* Stopped */ if (ehci->async_count > 0) ehci_set_command_bit(ehci, CMD_ASE); } else { /* Running */ if (ehci->async_count == 0) { /* Turn off the schedule after a while */ ehci_enable_event(ehci, EHCI_HRTIMER_DISABLE_ASYNC, true); } } } /* Turn off the async schedule after a brief delay */ static void ehci_disable_ASE(struct ehci_hcd *ehci) { ehci_clear_command_bit(ehci, CMD_ASE); } /* Poll the STS_PSS status bit; see when it agrees with CMD_PSE */ static void ehci_poll_PSS(struct ehci_hcd *ehci) { unsigned actual, want; /* Don't do anything if the controller isn't running (e.g., died) */ if (ehci->rh_state != EHCI_RH_RUNNING) return; want = (ehci->command & CMD_PSE) ? STS_PSS : 0; actual = ehci_readl(ehci, &ehci->regs->status) & STS_PSS; if (want != actual) { /* Poll again later, but give up after about 2-4 ms */ if (ehci->PSS_poll_count++ < 2) { ehci_enable_event(ehci, EHCI_HRTIMER_POLL_PSS, true); return; } ehci_dbg(ehci, "Waited too long for the periodic schedule status (%x/%x), giving up\n", want, actual); } ehci->PSS_poll_count = 0; /* The status is up-to-date; restart or stop the schedule as needed */ if (want == 0) { /* Stopped */ if (ehci->periodic_count > 0) ehci_set_command_bit(ehci, CMD_PSE); } else { /* Running */ if (ehci->periodic_count == 0) { /* Turn off the schedule after a while */ ehci_enable_event(ehci, EHCI_HRTIMER_DISABLE_PERIODIC, true); } } } /* Turn off the periodic schedule after a brief delay */ static void ehci_disable_PSE(struct ehci_hcd *ehci) { ehci_clear_command_bit(ehci, CMD_PSE); } /* Poll the STS_HALT status bit; see when a dead controller stops */ static void ehci_handle_controller_death(struct ehci_hcd *ehci) { if (!(ehci_readl(ehci, &ehci->regs->status) & STS_HALT)) { /* Give up after a few milliseconds */ if (ehci->died_poll_count++ < 5) { /* Try again later */ ehci_enable_event(ehci, EHCI_HRTIMER_POLL_DEAD, true); return; } ehci_warn(ehci, "Waited too long for the controller to stop, giving up\n"); } /* Clean up the mess */ ehci->rh_state = EHCI_RH_HALTED; ehci_writel(ehci, 0, &ehci->regs->configured_flag); ehci_writel(ehci, 0, &ehci->regs->intr_enable); ehci_work(ehci); end_unlink_async(ehci); /* Not in process context, so don't try to reset the controller */ } /* start to unlink interrupt QHs */ static void ehci_handle_start_intr_unlinks(struct ehci_hcd *ehci) { bool stopped = (ehci->rh_state < EHCI_RH_RUNNING); /* * Process all the QHs on the intr_unlink list that were added * before the current unlink cycle began. The list is in * temporal order, so stop when we reach the first entry in the * current cycle. But if the root hub isn't running then * process all the QHs on the list. */ while (!list_empty(&ehci->intr_unlink_wait)) { struct ehci_qh *qh; qh = list_first_entry(&ehci->intr_unlink_wait, struct ehci_qh, unlink_node); if (!stopped && (qh->unlink_cycle == ehci->intr_unlink_wait_cycle)) break; list_del_init(&qh->unlink_node); qh->unlink_reason |= QH_UNLINK_QUEUE_EMPTY; start_unlink_intr(ehci, qh); } /* Handle remaining entries later */ if (!list_empty(&ehci->intr_unlink_wait)) { ehci_enable_event(ehci, EHCI_HRTIMER_START_UNLINK_INTR, true); ++ehci->intr_unlink_wait_cycle; } } /* Handle unlinked interrupt QHs once they are gone from the hardware */ static void ehci_handle_intr_unlinks(struct ehci_hcd *ehci) { bool stopped = (ehci->rh_state < EHCI_RH_RUNNING); /* * Process all the QHs on the intr_unlink list that were added * before the current unlink cycle began. The list is in * temporal order, so stop when we reach the first entry in the * current cycle. But if the root hub isn't running then * process all the QHs on the list. */ ehci->intr_unlinking = true; while (!list_empty(&ehci->intr_unlink)) { struct ehci_qh *qh; qh = list_first_entry(&ehci->intr_unlink, struct ehci_qh, unlink_node); if (!stopped && qh->unlink_cycle == ehci->intr_unlink_cycle) break; list_del_init(&qh->unlink_node); end_unlink_intr(ehci, qh); } /* Handle remaining entries later */ if (!list_empty(&ehci->intr_unlink)) { ehci_enable_event(ehci, EHCI_HRTIMER_UNLINK_INTR, true); ++ehci->intr_unlink_cycle; } ehci->intr_unlinking = false; } /* Start another free-iTDs/siTDs cycle */ static void start_free_itds(struct ehci_hcd *ehci) { if (!(ehci->enabled_hrtimer_events & BIT(EHCI_HRTIMER_FREE_ITDS))) { ehci->last_itd_to_free = list_entry( ehci->cached_itd_list.prev, struct ehci_itd, itd_list); ehci->last_sitd_to_free = list_entry( ehci->cached_sitd_list.prev, struct ehci_sitd, sitd_list); ehci_enable_event(ehci, EHCI_HRTIMER_FREE_ITDS, true); } } /* Wait for controller to stop using old iTDs and siTDs */ static void end_free_itds(struct ehci_hcd *ehci) { struct ehci_itd *itd, *n; struct ehci_sitd *sitd, *sn; if (ehci->rh_state < EHCI_RH_RUNNING) { ehci->last_itd_to_free = NULL; ehci->last_sitd_to_free = NULL; } list_for_each_entry_safe(itd, n, &ehci->cached_itd_list, itd_list) { list_del(&itd->itd_list); dma_pool_free(ehci->itd_pool, itd, itd->itd_dma); if (itd == ehci->last_itd_to_free) break; } list_for_each_entry_safe(sitd, sn, &ehci->cached_sitd_list, sitd_list) { list_del(&sitd->sitd_list); dma_pool_free(ehci->sitd_pool, sitd, sitd->sitd_dma); if (sitd == ehci->last_sitd_to_free) break; } if (!list_empty(&ehci->cached_itd_list) || !list_empty(&ehci->cached_sitd_list)) start_free_itds(ehci); } /* Handle lost (or very late) IAA interrupts */ static void ehci_iaa_watchdog(struct ehci_hcd *ehci) { u32 cmd, status; /* * Lost IAA irqs wedge things badly; seen first with a vt8235. * So we need this watchdog, but must protect it against both * (a) SMP races against real IAA firing and retriggering, and * (b) clean HC shutdown, when IAA watchdog was pending. */ if (!ehci->iaa_in_progress || ehci->rh_state != EHCI_RH_RUNNING) return; /* If we get here, IAA is *REALLY* late. It's barely * conceivable that the system is so busy that CMD_IAAD * is still legitimately set, so let's be sure it's * clear before we read STS_IAA. (The HC should clear * CMD_IAAD when it sets STS_IAA.) */ cmd = ehci_readl(ehci, &ehci->regs->command); /* * If IAA is set here it either legitimately triggered * after the watchdog timer expired (_way_ late, so we'll * still count it as lost) ... or a silicon erratum: * - VIA seems to set IAA without triggering the IRQ; * - IAAD potentially cleared without setting IAA. */ status = ehci_readl(ehci, &ehci->regs->status); if ((status & STS_IAA) || !(cmd & CMD_IAAD)) { INCR(ehci->stats.lost_iaa); ehci_writel(ehci, STS_IAA, &ehci->regs->status); } ehci_dbg(ehci, "IAA watchdog: status %x cmd %x\n", status, cmd); end_iaa_cycle(ehci); } /* Enable the I/O watchdog, if appropriate */ static void turn_on_io_watchdog(struct ehci_hcd *ehci) { /* Not needed if the controller isn't running or it's already enabled */ if (ehci->rh_state != EHCI_RH_RUNNING || (ehci->enabled_hrtimer_events & BIT(EHCI_HRTIMER_IO_WATCHDOG))) return; /* * Isochronous transfers always need the watchdog. * For other sorts we use it only if the flag is set. */ if (ehci->isoc_count > 0 || (ehci->need_io_watchdog && ehci->async_count + ehci->intr_count > 0)) ehci_enable_event(ehci, EHCI_HRTIMER_IO_WATCHDOG, true); } /* * Handler functions for the hrtimer event types. * Keep this array in the same order as the event types indexed by * enum ehci_hrtimer_event in ehci.h. */ static void (*event_handlers[])(struct ehci_hcd *) = { ehci_poll_ASS, /* EHCI_HRTIMER_POLL_ASS */ ehci_poll_PSS, /* EHCI_HRTIMER_POLL_PSS */ ehci_handle_controller_death, /* EHCI_HRTIMER_POLL_DEAD */ ehci_handle_intr_unlinks, /* EHCI_HRTIMER_UNLINK_INTR */ end_free_itds, /* EHCI_HRTIMER_FREE_ITDS */ end_unlink_async, /* EHCI_HRTIMER_ACTIVE_UNLINK */ ehci_handle_start_intr_unlinks, /* EHCI_HRTIMER_START_UNLINK_INTR */ unlink_empty_async, /* EHCI_HRTIMER_ASYNC_UNLINKS */ ehci_iaa_watchdog, /* EHCI_HRTIMER_IAA_WATCHDOG */ ehci_disable_PSE, /* EHCI_HRTIMER_DISABLE_PERIODIC */ ehci_disable_ASE, /* EHCI_HRTIMER_DISABLE_ASYNC */ ehci_work, /* EHCI_HRTIMER_IO_WATCHDOG */ }; static enum hrtimer_restart ehci_hrtimer_func(struct hrtimer *t) { struct ehci_hcd *ehci = container_of(t, struct ehci_hcd, hrtimer); ktime_t now; unsigned long events; unsigned long flags; unsigned e; spin_lock_irqsave(&ehci->lock, flags); events = ehci->enabled_hrtimer_events; ehci->enabled_hrtimer_events = 0; ehci->next_hrtimer_event = EHCI_HRTIMER_NO_EVENT; /* * Check each pending event. If its time has expired, handle * the event; otherwise re-enable it. */ now = ktime_get(); for_each_set_bit(e, &events, EHCI_HRTIMER_NUM_EVENTS) { if (ktime_compare(now, ehci->hr_timeouts[e]) >= 0) event_handlers[e](ehci); else ehci_enable_event(ehci, e, false); } spin_unlock_irqrestore(&ehci->lock, flags); return HRTIMER_NORESTART; }
37 37 37 37 37 18 18 18 18 18 18 18 18 30 30 2 8 8 8 8 8 8 8 8 4 8 6 5 6 8 8 8 7 2 6 5 3 8 2 2 1 2 14 14 14 3 3 3 3 3 3 3 3 7 7 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 // SPDX-License-Identifier: GPL-2.0-or-later /* * ALSA sequencer FIFO * Copyright (c) 1998 by Frank van de Pol <fvdpol@coil.demon.nl> */ #include <sound/core.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include "seq_fifo.h" #include "seq_lock.h" /* FIFO */ /* create new fifo */ struct snd_seq_fifo *snd_seq_fifo_new(int poolsize) { struct snd_seq_fifo *f; f = kzalloc(sizeof(*f), GFP_KERNEL); if (!f) return NULL; f->pool = snd_seq_pool_new(poolsize); if (f->pool == NULL) { kfree(f); return NULL; } if (snd_seq_pool_init(f->pool) < 0) { snd_seq_pool_delete(&f->pool); kfree(f); return NULL; } spin_lock_init(&f->lock); snd_use_lock_init(&f->use_lock); init_waitqueue_head(&f->input_sleep); atomic_set(&f->overflow, 0); f->head = NULL; f->tail = NULL; f->cells = 0; return f; } void snd_seq_fifo_delete(struct snd_seq_fifo **fifo) { struct snd_seq_fifo *f; if (snd_BUG_ON(!fifo)) return; f = *fifo; if (snd_BUG_ON(!f)) return; *fifo = NULL; if (f->pool) snd_seq_pool_mark_closing(f->pool); snd_seq_fifo_clear(f); /* wake up clients if any */ if (waitqueue_active(&f->input_sleep)) wake_up(&f->input_sleep); /* release resources...*/ /*....................*/ if (f->pool) { snd_seq_pool_done(f->pool); snd_seq_pool_delete(&f->pool); } kfree(f); } static struct snd_seq_event_cell *fifo_cell_out(struct snd_seq_fifo *f); /* clear queue */ void snd_seq_fifo_clear(struct snd_seq_fifo *f) { struct snd_seq_event_cell *cell; /* clear overflow flag */ atomic_set(&f->overflow, 0); snd_use_lock_sync(&f->use_lock); guard(spinlock_irq)(&f->lock); /* drain the fifo */ while ((cell = fifo_cell_out(f)) != NULL) { snd_seq_cell_free(cell); } } /* enqueue event to fifo */ int snd_seq_fifo_event_in(struct snd_seq_fifo *f, struct snd_seq_event *event) { struct snd_seq_event_cell *cell; int err; if (snd_BUG_ON(!f)) return -EINVAL; guard(snd_seq_fifo)(f); err = snd_seq_event_dup(f->pool, event, &cell, 1, NULL, NULL); /* always non-blocking */ if (err < 0) { if ((err == -ENOMEM) || (err == -EAGAIN)) atomic_inc(&f->overflow); return err; } /* append new cells to fifo */ scoped_guard(spinlock_irqsave, &f->lock) { if (f->tail != NULL) f->tail->next = cell; f->tail = cell; if (f->head == NULL) f->head = cell; cell->next = NULL; f->cells++; } /* wakeup client */ if (waitqueue_active(&f->input_sleep)) wake_up(&f->input_sleep); return 0; /* success */ } /* dequeue cell from fifo */ static struct snd_seq_event_cell *fifo_cell_out(struct snd_seq_fifo *f) { struct snd_seq_event_cell *cell; cell = f->head; if (cell) { f->head = cell->next; /* reset tail if this was the last element */ if (f->tail == cell) f->tail = NULL; cell->next = NULL; f->cells--; } return cell; } /* dequeue cell from fifo and copy on user space */ int snd_seq_fifo_cell_out(struct snd_seq_fifo *f, struct snd_seq_event_cell **cellp, int nonblock) { struct snd_seq_event_cell *cell; unsigned long flags; wait_queue_entry_t wait; if (snd_BUG_ON(!f)) return -EINVAL; *cellp = NULL; init_waitqueue_entry(&wait, current); spin_lock_irqsave(&f->lock, flags); while ((cell = fifo_cell_out(f)) == NULL) { if (nonblock) { /* non-blocking - return immediately */ spin_unlock_irqrestore(&f->lock, flags); return -EAGAIN; } set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&f->input_sleep, &wait); spin_unlock_irqrestore(&f->lock, flags); schedule(); spin_lock_irqsave(&f->lock, flags); remove_wait_queue(&f->input_sleep, &wait); if (signal_pending(current)) { spin_unlock_irqrestore(&f->lock, flags); return -ERESTARTSYS; } } spin_unlock_irqrestore(&f->lock, flags); *cellp = cell; return 0; } void snd_seq_fifo_cell_putback(struct snd_seq_fifo *f, struct snd_seq_event_cell *cell) { if (cell) { guard(spinlock_irqsave)(&f->lock); cell->next = f->head; f->head = cell; if (!f->tail) f->tail = cell; f->cells++; } } /* polling; return non-zero if queue is available */ int snd_seq_fifo_poll_wait(struct snd_seq_fifo *f, struct file *file, poll_table *wait) { poll_wait(file, &f->input_sleep, wait); guard(spinlock_irq)(&f->lock); return (f->cells > 0); } /* change the size of pool; all old events are removed */ int snd_seq_fifo_resize(struct snd_seq_fifo *f, int poolsize) { struct snd_seq_pool *newpool, *oldpool; struct snd_seq_event_cell *cell, *next, *oldhead; if (snd_BUG_ON(!f || !f->pool)) return -EINVAL; /* allocate new pool */ newpool = snd_seq_pool_new(poolsize); if (newpool == NULL) return -ENOMEM; if (snd_seq_pool_init(newpool) < 0) { snd_seq_pool_delete(&newpool); return -ENOMEM; } scoped_guard(spinlock_irq, &f->lock) { /* remember old pool */ oldpool = f->pool; oldhead = f->head; /* exchange pools */ f->pool = newpool; f->head = NULL; f->tail = NULL; f->cells = 0; /* NOTE: overflow flag is not cleared */ } /* close the old pool and wait until all users are gone */ snd_seq_pool_mark_closing(oldpool); snd_use_lock_sync(&f->use_lock); /* release cells in old pool */ for (cell = oldhead; cell; cell = next) { next = cell->next; snd_seq_cell_free(cell); } snd_seq_pool_delete(&oldpool); return 0; } /* get the number of unused cells safely */ int snd_seq_fifo_unused_cells(struct snd_seq_fifo *f) { if (!f) return 0; guard(snd_seq_fifo)(f); guard(spinlock_irqsave)(&f->lock); return snd_seq_unused_cells(f->pool); }
12 12 8 71 71 71 71 12 11 11 10 8 12 88 88 86 87 87 88 389 390 388 309 307 306 385 74 385 5 157 159 158 153 152 151 154 6 155 2 84 84 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 // SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/compat.c * * Kernel compatibililty routines for e.g. 32 bit syscall support * on 64 bit kernels. * * Copyright (C) 2002-2003 Stephen Rothwell, IBM Corporation */ #include <linux/linkage.h> #include <linux/compat.h> #include <linux/errno.h> #include <linux/time.h> #include <linux/signal.h> #include <linux/sched.h> /* for MAX_SCHEDULE_TIMEOUT */ #include <linux/syscalls.h> #include <linux/unistd.h> #include <linux/security.h> #include <linux/export.h> #include <linux/migrate.h> #include <linux/posix-timers.h> #include <linux/times.h> #include <linux/ptrace.h> #include <linux/gfp.h> #include <linux/uaccess.h> #ifdef __ARCH_WANT_SYS_SIGPROCMASK /* * sys_sigprocmask SIG_SETMASK sets the first (compat) word of the * blocked set of signals to the supplied signal set */ static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set) { memcpy(blocked->sig, &set, sizeof(set)); } COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how, compat_old_sigset_t __user *, nset, compat_old_sigset_t __user *, oset) { old_sigset_t old_set, new_set; sigset_t new_blocked; old_set = current->blocked.sig[0]; if (nset) { if (get_user(new_set, nset)) return -EFAULT; new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); new_blocked = current->blocked; switch (how) { case SIG_BLOCK: sigaddsetmask(&new_blocked, new_set); break; case SIG_UNBLOCK: sigdelsetmask(&new_blocked, new_set); break; case SIG_SETMASK: compat_sig_setmask(&new_blocked, new_set); break; default: return -EINVAL; } set_current_blocked(&new_blocked); } if (oset) { if (put_user(old_set, oset)) return -EFAULT; } return 0; } #endif int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru) { struct compat_rusage r32; memset(&r32, 0, sizeof(r32)); r32.ru_utime.tv_sec = r->ru_utime.tv_sec; r32.ru_utime.tv_usec = r->ru_utime.tv_usec; r32.ru_stime.tv_sec = r->ru_stime.tv_sec; r32.ru_stime.tv_usec = r->ru_stime.tv_usec; r32.ru_maxrss = r->ru_maxrss; r32.ru_ixrss = r->ru_ixrss; r32.ru_idrss = r->ru_idrss; r32.ru_isrss = r->ru_isrss; r32.ru_minflt = r->ru_minflt; r32.ru_majflt = r->ru_majflt; r32.ru_nswap = r->ru_nswap; r32.ru_inblock = r->ru_inblock; r32.ru_oublock = r->ru_oublock; r32.ru_msgsnd = r->ru_msgsnd; r32.ru_msgrcv = r->ru_msgrcv; r32.ru_nsignals = r->ru_nsignals; r32.ru_nvcsw = r->ru_nvcsw; r32.ru_nivcsw = r->ru_nivcsw; if (copy_to_user(ru, &r32, sizeof(r32))) return -EFAULT; return 0; } static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr, unsigned len, struct cpumask *new_mask) { unsigned long *k; if (len < cpumask_size()) memset(new_mask, 0, cpumask_size()); else if (len > cpumask_size()) len = cpumask_size(); k = cpumask_bits(new_mask); return compat_get_bitmap(k, user_mask_ptr, len * 8); } COMPAT_SYSCALL_DEFINE3(sched_setaffinity, compat_pid_t, pid, unsigned int, len, compat_ulong_t __user *, user_mask_ptr) { cpumask_var_t new_mask; int retval; if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) return -ENOMEM; retval = compat_get_user_cpu_mask(user_mask_ptr, len, new_mask); if (retval) goto out; retval = sched_setaffinity(pid, new_mask); out: free_cpumask_var(new_mask); return retval; } COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len, compat_ulong_t __user *, user_mask_ptr) { int ret; cpumask_var_t mask; if ((len * BITS_PER_BYTE) < nr_cpu_ids) return -EINVAL; if (len & (sizeof(compat_ulong_t)-1)) return -EINVAL; if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; ret = sched_getaffinity(pid, mask); if (ret == 0) { unsigned int retlen = min(len, cpumask_size()); if (compat_put_bitmap(user_mask_ptr, cpumask_bits(mask), retlen * 8)) ret = -EFAULT; else ret = retlen; } free_cpumask_var(mask); return ret; } /* * We currently only need the following fields from the sigevent * structure: sigev_value, sigev_signo, sig_notify and (sometimes * sigev_notify_thread_id). The others are handled in user mode. * We also assume that copying sigev_value.sival_int is sufficient * to keep all the bits of sigev_value.sival_ptr intact. */ int get_compat_sigevent(struct sigevent *event, const struct compat_sigevent __user *u_event) { memset(event, 0, sizeof(*event)); return (!access_ok(u_event, sizeof(*u_event)) || __get_user(event->sigev_value.sival_int, &u_event->sigev_value.sival_int) || __get_user(event->sigev_signo, &u_event->sigev_signo) || __get_user(event->sigev_notify, &u_event->sigev_notify) || __get_user(event->sigev_notify_thread_id, &u_event->sigev_notify_thread_id)) ? -EFAULT : 0; } long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, unsigned long bitmap_size) { unsigned long nr_compat_longs; /* align bitmap up to nearest compat_long_t boundary */ bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); if (!user_read_access_begin(umask, bitmap_size / 8)) return -EFAULT; while (nr_compat_longs > 1) { compat_ulong_t l1, l2; unsafe_get_user(l1, umask++, Efault); unsafe_get_user(l2, umask++, Efault); *mask++ = ((unsigned long)l2 << BITS_PER_COMPAT_LONG) | l1; nr_compat_longs -= 2; } if (nr_compat_longs) unsafe_get_user(*mask, umask++, Efault); user_read_access_end(); return 0; Efault: user_read_access_end(); return -EFAULT; } long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, unsigned long bitmap_size) { unsigned long nr_compat_longs; /* align bitmap up to nearest compat_long_t boundary */ bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); if (!user_write_access_begin(umask, bitmap_size / 8)) return -EFAULT; while (nr_compat_longs > 1) { unsigned long m = *mask++; unsafe_put_user((compat_ulong_t)m, umask++, Efault); unsafe_put_user(m >> BITS_PER_COMPAT_LONG, umask++, Efault); nr_compat_longs -= 2; } if (nr_compat_longs) unsafe_put_user((compat_ulong_t)*mask, umask++, Efault); user_write_access_end(); return 0; Efault: user_write_access_end(); return -EFAULT; } int get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat) { #ifdef __BIG_ENDIAN compat_sigset_t v; if (copy_from_user(&v, compat, sizeof(compat_sigset_t))) return -EFAULT; switch (_NSIG_WORDS) { case 4: set->sig[3] = v.sig[6] | (((long)v.sig[7]) << 32 ); fallthrough; case 3: set->sig[2] = v.sig[4] | (((long)v.sig[5]) << 32 ); fallthrough; case 2: set->sig[1] = v.sig[2] | (((long)v.sig[3]) << 32 ); fallthrough; case 1: set->sig[0] = v.sig[0] | (((long)v.sig[1]) << 32 ); } #else if (copy_from_user(set, compat, sizeof(compat_sigset_t))) return -EFAULT; #endif return 0; } EXPORT_SYMBOL_GPL(get_compat_sigset);
23 23 23 17 17 23 23 23 17 23 23 23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 // SPDX-License-Identifier: GPL-2.0-or-later /* * Poly1305 authenticator algorithm, RFC7539 * * Copyright (C) 2015 Martin Willi * * Based on public domain code by Andrew Moon and Daniel J. Bernstein. */ #include <crypto/internal/poly1305.h> #include <linux/export.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/string.h> #include <linux/unaligned.h> #ifdef CONFIG_CRYPTO_LIB_POLY1305_ARCH #include "poly1305.h" /* $(SRCARCH)/poly1305.h */ #else #define poly1305_block_init poly1305_block_init_generic #define poly1305_blocks poly1305_blocks_generic #define poly1305_emit poly1305_emit_generic #endif void poly1305_init(struct poly1305_desc_ctx *desc, const u8 key[POLY1305_KEY_SIZE]) { desc->s[0] = get_unaligned_le32(key + 16); desc->s[1] = get_unaligned_le32(key + 20); desc->s[2] = get_unaligned_le32(key + 24); desc->s[3] = get_unaligned_le32(key + 28); desc->buflen = 0; poly1305_block_init(&desc->state, key); } EXPORT_SYMBOL(poly1305_init); void poly1305_update(struct poly1305_desc_ctx *desc, const u8 *src, unsigned int nbytes) { if (desc->buflen + nbytes >= POLY1305_BLOCK_SIZE) { unsigned int bulk_len; if (desc->buflen) { unsigned int l = POLY1305_BLOCK_SIZE - desc->buflen; memcpy(&desc->buf[desc->buflen], src, l); src += l; nbytes -= l; poly1305_blocks(&desc->state, desc->buf, POLY1305_BLOCK_SIZE, 1); desc->buflen = 0; } bulk_len = round_down(nbytes, POLY1305_BLOCK_SIZE); nbytes %= POLY1305_BLOCK_SIZE; if (bulk_len) { poly1305_blocks(&desc->state, src, bulk_len, 1); src += bulk_len; } } if (nbytes) { memcpy(&desc->buf[desc->buflen], src, nbytes); desc->buflen += nbytes; } } EXPORT_SYMBOL(poly1305_update); void poly1305_final(struct poly1305_desc_ctx *desc, u8 *dst) { if (unlikely(desc->buflen)) { desc->buf[desc->buflen++] = 1; memset(desc->buf + desc->buflen, 0, POLY1305_BLOCK_SIZE - desc->buflen); poly1305_blocks(&desc->state, desc->buf, POLY1305_BLOCK_SIZE, 0); } poly1305_emit(&desc->state.h, dst, desc->s); *desc = (struct poly1305_desc_ctx){}; } EXPORT_SYMBOL(poly1305_final); #ifdef poly1305_mod_init_arch static int __init poly1305_mod_init(void) { poly1305_mod_init_arch(); return 0; } subsys_initcall(poly1305_mod_init); static void __exit poly1305_mod_exit(void) { } module_exit(poly1305_mod_exit); #endif MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Poly1305 authenticator algorithm, RFC7539");
10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PSI_H #define _LINUX_PSI_H #include <linux/jump_label.h> #include <linux/psi_types.h> #include <linux/sched.h> #include <linux/poll.h> #include <linux/cgroup-defs.h> #include <linux/cgroup.h> struct seq_file; struct css_set; #ifdef CONFIG_PSI extern struct static_key_false psi_disabled; extern struct psi_group psi_system; void psi_init(void); void psi_memstall_enter(unsigned long *flags); void psi_memstall_leave(unsigned long *flags); int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res); struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, enum psi_res res, struct file *file, struct kernfs_open_file *of); void psi_trigger_destroy(struct psi_trigger *t); __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, poll_table *wait); #ifdef CONFIG_CGROUPS static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) { return cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi; } int psi_cgroup_alloc(struct cgroup *cgrp); void psi_cgroup_free(struct cgroup *cgrp); void cgroup_move_task(struct task_struct *p, struct css_set *to); void psi_cgroup_restart(struct psi_group *group); #endif #else /* CONFIG_PSI */ static inline void psi_init(void) {} static inline void psi_memstall_enter(unsigned long *flags) {} static inline void psi_memstall_leave(unsigned long *flags) {} #ifdef CONFIG_CGROUPS static inline int psi_cgroup_alloc(struct cgroup *cgrp) { return 0; } static inline void psi_cgroup_free(struct cgroup *cgrp) { } static inline void cgroup_move_task(struct task_struct *p, struct css_set *to) { rcu_assign_pointer(p->cgroups, to); } static inline void psi_cgroup_restart(struct psi_group *group) {} #endif #endif /* CONFIG_PSI */ #endif /* _LINUX_PSI_H */
1 1 1 1 1 1 1 1 1 1 1 1 526 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 // SPDX-License-Identifier: GPL-2.0-only /* * linux/net/sunrpc/auth.c * * Generic RPC client authentication API. * * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de> */ #include <linux/types.h> #include <linux/sched.h> #include <linux/cred.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/errno.h> #include <linux/hash.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/gss_api.h> #include <linux/spinlock.h> #include <trace/events/sunrpc.h> #define RPC_CREDCACHE_DEFAULT_HASHBITS (4) struct rpc_cred_cache { struct hlist_head *hashtable; unsigned int hashbits; spinlock_t lock; }; static unsigned int auth_hashbits = RPC_CREDCACHE_DEFAULT_HASHBITS; static const struct rpc_authops __rcu *auth_flavors[RPC_AUTH_MAXFLAVOR] = { [RPC_AUTH_NULL] = (const struct rpc_authops __force __rcu *)&authnull_ops, [RPC_AUTH_UNIX] = (const struct rpc_authops __force __rcu *)&authunix_ops, [RPC_AUTH_TLS] = (const struct rpc_authops __force __rcu *)&authtls_ops, }; static LIST_HEAD(cred_unused); static unsigned long number_cred_unused; static struct cred machine_cred = { .usage = ATOMIC_INIT(1), }; /* * Return the machine_cred pointer to be used whenever * the a generic machine credential is needed. */ const struct cred *rpc_machine_cred(void) { return &machine_cred; } EXPORT_SYMBOL_GPL(rpc_machine_cred); #define MAX_HASHTABLE_BITS (14) static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp) { unsigned long num; unsigned int nbits; int ret; if (!val) goto out_inval; ret = kstrtoul(val, 0, &num); if (ret) goto out_inval; nbits = fls(num - 1); if (nbits > MAX_HASHTABLE_BITS || nbits < 2) goto out_inval; *(unsigned int *)kp->arg = nbits; return 0; out_inval: return -EINVAL; } static int param_get_hashtbl_sz(char *buffer, const struct kernel_param *kp) { unsigned int nbits; nbits = *(unsigned int *)kp->arg; return sprintf(buffer, "%u\n", 1U << nbits); } #define param_check_hashtbl_sz(name, p) __param_check(name, p, unsigned int); static const struct kernel_param_ops param_ops_hashtbl_sz = { .set = param_set_hashtbl_sz, .get = param_get_hashtbl_sz, }; module_param_named(auth_hashtable_size, auth_hashbits, hashtbl_sz, 0644); MODULE_PARM_DESC(auth_hashtable_size, "RPC credential cache hashtable size"); static unsigned long auth_max_cred_cachesize = ULONG_MAX; module_param(auth_max_cred_cachesize, ulong, 0644); MODULE_PARM_DESC(auth_max_cred_cachesize, "RPC credential maximum total cache size"); static u32 pseudoflavor_to_flavor(u32 flavor) { if (flavor > RPC_AUTH_MAXFLAVOR) return RPC_AUTH_GSS; return flavor; } int rpcauth_register(const struct rpc_authops *ops) { const struct rpc_authops *old; rpc_authflavor_t flavor; if ((flavor = ops->au_flavor) >= RPC_AUTH_MAXFLAVOR) return -EINVAL; old = cmpxchg((const struct rpc_authops ** __force)&auth_flavors[flavor], NULL, ops); if (old == NULL || old == ops) return 0; return -EPERM; } EXPORT_SYMBOL_GPL(rpcauth_register); int rpcauth_unregister(const struct rpc_authops *ops) { const struct rpc_authops *old; rpc_authflavor_t flavor; if ((flavor = ops->au_flavor) >= RPC_AUTH_MAXFLAVOR) return -EINVAL; old = cmpxchg((const struct rpc_authops ** __force)&auth_flavors[flavor], ops, NULL); if (old == ops || old == NULL) return 0; return -EPERM; } EXPORT_SYMBOL_GPL(rpcauth_unregister); static const struct rpc_authops * rpcauth_get_authops(rpc_authflavor_t flavor) { const struct rpc_authops *ops; if (flavor >= RPC_AUTH_MAXFLAVOR) return NULL; rcu_read_lock(); ops = rcu_dereference(auth_flavors[flavor]); if (ops == NULL) { rcu_read_unlock(); request_module("rpc-auth-%u", flavor); rcu_read_lock(); ops = rcu_dereference(auth_flavors[flavor]); if (ops == NULL) goto out; } if (!try_module_get(ops->owner)) ops = NULL; out: rcu_read_unlock(); return ops; } static void rpcauth_put_authops(const struct rpc_authops *ops) { module_put(ops->owner); } /** * rpcauth_get_pseudoflavor - check if security flavor is supported * @flavor: a security flavor * @info: a GSS mech OID, quality of protection, and service value * * Verifies that an appropriate kernel module is available or already loaded. * Returns an equivalent pseudoflavor, or RPC_AUTH_MAXFLAVOR if "flavor" is * not supported locally. */ rpc_authflavor_t rpcauth_get_pseudoflavor(rpc_authflavor_t flavor, struct rpcsec_gss_info *info) { const struct rpc_authops *ops = rpcauth_get_authops(flavor); rpc_authflavor_t pseudoflavor; if (!ops) return RPC_AUTH_MAXFLAVOR; pseudoflavor = flavor; if (ops->info2flavor != NULL) pseudoflavor = ops->info2flavor(info); rpcauth_put_authops(ops); return pseudoflavor; } EXPORT_SYMBOL_GPL(rpcauth_get_pseudoflavor); /** * rpcauth_get_gssinfo - find GSS tuple matching a GSS pseudoflavor * @pseudoflavor: GSS pseudoflavor to match * @info: rpcsec_gss_info structure to fill in * * Returns zero and fills in "info" if pseudoflavor matches a * supported mechanism. */ int rpcauth_get_gssinfo(rpc_authflavor_t pseudoflavor, struct rpcsec_gss_info *info) { rpc_authflavor_t flavor = pseudoflavor_to_flavor(pseudoflavor); const struct rpc_authops *ops; int result; ops = rpcauth_get_authops(flavor); if (ops == NULL) return -ENOENT; result = -ENOENT; if (ops->flavor2info != NULL) result = ops->flavor2info(pseudoflavor, info); rpcauth_put_authops(ops); return result; } EXPORT_SYMBOL_GPL(rpcauth_get_gssinfo); struct rpc_auth * rpcauth_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) { struct rpc_auth *auth = ERR_PTR(-EINVAL); const struct rpc_authops *ops; u32 flavor = pseudoflavor_to_flavor(args->pseudoflavor); ops = rpcauth_get_authops(flavor); if (ops == NULL) goto out; auth = ops->create(args, clnt); rpcauth_put_authops(ops); if (IS_ERR(auth)) return auth; if (clnt->cl_auth) rpcauth_release(clnt->cl_auth); clnt->cl_auth = auth; out: return auth; } EXPORT_SYMBOL_GPL(rpcauth_create); void rpcauth_release(struct rpc_auth *auth) { if (!refcount_dec_and_test(&auth->au_count)) return; auth->au_ops->destroy(auth); } static DEFINE_SPINLOCK(rpc_credcache_lock); /* * On success, the caller is responsible for freeing the reference * held by the hashtable */ static bool rpcauth_unhash_cred_locked(struct rpc_cred *cred) { if (!test_and_clear_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags)) return false; hlist_del_rcu(&cred->cr_hash); return true; } static bool rpcauth_unhash_cred(struct rpc_cred *cred) { spinlock_t *cache_lock; bool ret; if (!test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags)) return false; cache_lock = &cred->cr_auth->au_credcache->lock; spin_lock(cache_lock); ret = rpcauth_unhash_cred_locked(cred); spin_unlock(cache_lock); return ret; } /* * Initialize RPC credential cache */ int rpcauth_init_credcache(struct rpc_auth *auth) { struct rpc_cred_cache *new; unsigned int hashsize; new = kmalloc(sizeof(*new), GFP_KERNEL); if (!new) goto out_nocache; new->hashbits = auth_hashbits; hashsize = 1U << new->hashbits; new->hashtable = kcalloc(hashsize, sizeof(new->hashtable[0]), GFP_KERNEL); if (!new->hashtable) goto out_nohashtbl; spin_lock_init(&new->lock); auth->au_credcache = new; return 0; out_nohashtbl: kfree(new); out_nocache: return -ENOMEM; } EXPORT_SYMBOL_GPL(rpcauth_init_credcache); char * rpcauth_stringify_acceptor(struct rpc_cred *cred) { if (!cred->cr_ops->crstringify_acceptor) return NULL; return cred->cr_ops->crstringify_acceptor(cred); } EXPORT_SYMBOL_GPL(rpcauth_stringify_acceptor); /* * Destroy a list of credentials */ static inline void rpcauth_destroy_credlist(struct list_head *head) { struct rpc_cred *cred; while (!list_empty(head)) { cred = list_entry(head->next, struct rpc_cred, cr_lru); list_del_init(&cred->cr_lru); put_rpccred(cred); } } static void rpcauth_lru_add_locked(struct rpc_cred *cred) { if (!list_empty(&cred->cr_lru)) return; number_cred_unused++; list_add_tail(&cred->cr_lru, &cred_unused); } static void rpcauth_lru_add(struct rpc_cred *cred) { if (!list_empty(&cred->cr_lru)) return; spin_lock(&rpc_credcache_lock); rpcauth_lru_add_locked(cred); spin_unlock(&rpc_credcache_lock); } static void rpcauth_lru_remove_locked(struct rpc_cred *cred) { if (list_empty(&cred->cr_lru)) return; number_cred_unused--; list_del_init(&cred->cr_lru); } static void rpcauth_lru_remove(struct rpc_cred *cred) { if (list_empty(&cred->cr_lru)) return; spin_lock(&rpc_credcache_lock); rpcauth_lru_remove_locked(cred); spin_unlock(&rpc_credcache_lock); } /* * Clear the RPC credential cache, and delete those credentials * that are not referenced. */ void rpcauth_clear_credcache(struct rpc_cred_cache *cache) { LIST_HEAD(free); struct hlist_head *head; struct rpc_cred *cred; unsigned int hashsize = 1U << cache->hashbits; int i; spin_lock(&rpc_credcache_lock); spin_lock(&cache->lock); for (i = 0; i < hashsize; i++) { head = &cache->hashtable[i]; while (!hlist_empty(head)) { cred = hlist_entry(head->first, struct rpc_cred, cr_hash); rpcauth_unhash_cred_locked(cred); /* Note: We now hold a reference to cred */ rpcauth_lru_remove_locked(cred); list_add_tail(&cred->cr_lru, &free); } } spin_unlock(&cache->lock); spin_unlock(&rpc_credcache_lock); rpcauth_destroy_credlist(&free); } /* * Destroy the RPC credential cache */ void rpcauth_destroy_credcache(struct rpc_auth *auth) { struct rpc_cred_cache *cache = auth->au_credcache; if (cache) { auth->au_credcache = NULL; rpcauth_clear_credcache(cache); kfree(cache->hashtable); kfree(cache); } } EXPORT_SYMBOL_GPL(rpcauth_destroy_credcache); #define RPC_AUTH_EXPIRY_MORATORIUM (60 * HZ) /* * Remove stale credentials. Avoid sleeping inside the loop. */ static long rpcauth_prune_expired(struct list_head *free, int nr_to_scan) { struct rpc_cred *cred, *next; unsigned long expired = jiffies - RPC_AUTH_EXPIRY_MORATORIUM; long freed = 0; list_for_each_entry_safe(cred, next, &cred_unused, cr_lru) { if (nr_to_scan-- == 0) break; if (refcount_read(&cred->cr_count) > 1) { rpcauth_lru_remove_locked(cred); continue; } /* * Enforce a 60 second garbage collection moratorium * Note that the cred_unused list must be time-ordered. */ if (time_in_range(cred->cr_expire, expired, jiffies)) continue; if (!rpcauth_unhash_cred(cred)) continue; rpcauth_lru_remove_locked(cred); freed++; list_add_tail(&cred->cr_lru, free); } return freed ? freed : SHRINK_STOP; } static unsigned long rpcauth_cache_do_shrink(int nr_to_scan) { LIST_HEAD(free); unsigned long freed; spin_lock(&rpc_credcache_lock); freed = rpcauth_prune_expired(&free, nr_to_scan); spin_unlock(&rpc_credcache_lock); rpcauth_destroy_credlist(&free); return freed; } /* * Run memory cache shrinker. */ static unsigned long rpcauth_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL) return SHRINK_STOP; /* nothing left, don't come back */ if (list_empty(&cred_unused)) return SHRINK_STOP; return rpcauth_cache_do_shrink(sc->nr_to_scan); } static unsigned long rpcauth_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { return number_cred_unused; } static void rpcauth_cache_enforce_limit(void) { unsigned long diff; unsigned int nr_to_scan; if (number_cred_unused <= auth_max_cred_cachesize) return; diff = number_cred_unused - auth_max_cred_cachesize; nr_to_scan = 100; if (diff < nr_to_scan) nr_to_scan = diff; rpcauth_cache_do_shrink(nr_to_scan); } /* * Look up a process' credentials in the authentication cache */ struct rpc_cred * rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, int flags, gfp_t gfp) { LIST_HEAD(free); struct rpc_cred_cache *cache = auth->au_credcache; struct rpc_cred *cred = NULL, *entry, *new; unsigned int nr; nr = auth->au_ops->hash_cred(acred, cache->hashbits); rcu_read_lock(); hlist_for_each_entry_rcu(entry, &cache->hashtable[nr], cr_hash) { if (!entry->cr_ops->crmatch(acred, entry, flags)) continue; cred = get_rpccred(entry); if (cred) break; } rcu_read_unlock(); if (cred != NULL) goto found; new = auth->au_ops->crcreate(auth, acred, flags, gfp); if (IS_ERR(new)) { cred = new; goto out; } spin_lock(&cache->lock); hlist_for_each_entry(entry, &cache->hashtable[nr], cr_hash) { if (!entry->cr_ops->crmatch(acred, entry, flags)) continue; cred = get_rpccred(entry); if (cred) break; } if (cred == NULL) { cred = new; set_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags); refcount_inc(&cred->cr_count); hlist_add_head_rcu(&cred->cr_hash, &cache->hashtable[nr]); } else list_add_tail(&new->cr_lru, &free); spin_unlock(&cache->lock); rpcauth_cache_enforce_limit(); found: if (test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags) && cred->cr_ops->cr_init != NULL && !(flags & RPCAUTH_LOOKUP_NEW)) { int res = cred->cr_ops->cr_init(auth, cred); if (res < 0) { put_rpccred(cred); cred = ERR_PTR(res); } } rpcauth_destroy_credlist(&free); out: return cred; } EXPORT_SYMBOL_GPL(rpcauth_lookup_credcache); struct rpc_cred * rpcauth_lookupcred(struct rpc_auth *auth, int flags) { struct auth_cred acred; struct rpc_cred *ret; const struct cred *cred = current_cred(); memset(&acred, 0, sizeof(acred)); acred.cred = cred; ret = auth->au_ops->lookup_cred(auth, &acred, flags); return ret; } EXPORT_SYMBOL_GPL(rpcauth_lookupcred); void rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred, struct rpc_auth *auth, const struct rpc_credops *ops) { INIT_HLIST_NODE(&cred->cr_hash); INIT_LIST_HEAD(&cred->cr_lru); refcount_set(&cred->cr_count, 1); cred->cr_auth = auth; cred->cr_flags = 0; cred->cr_ops = ops; cred->cr_expire = jiffies; cred->cr_cred = get_cred(acred->cred); } EXPORT_SYMBOL_GPL(rpcauth_init_cred); static struct rpc_cred * rpcauth_bind_root_cred(struct rpc_task *task, int lookupflags) { struct rpc_auth *auth = task->tk_client->cl_auth; struct auth_cred acred = { .cred = get_task_cred(&init_task), }; struct rpc_cred *ret; if (RPC_IS_ASYNC(task)) lookupflags |= RPCAUTH_LOOKUP_ASYNC; ret = auth->au_ops->lookup_cred(auth, &acred, lookupflags); put_cred(acred.cred); return ret; } static struct rpc_cred * rpcauth_bind_machine_cred(struct rpc_task *task, int lookupflags) { struct rpc_auth *auth = task->tk_client->cl_auth; struct auth_cred acred = { .principal = task->tk_client->cl_principal, .cred = init_task.cred, }; if (!acred.principal) return NULL; if (RPC_IS_ASYNC(task)) lookupflags |= RPCAUTH_LOOKUP_ASYNC; return auth->au_ops->lookup_cred(auth, &acred, lookupflags); } static struct rpc_cred * rpcauth_bind_new_cred(struct rpc_task *task, int lookupflags) { struct rpc_auth *auth = task->tk_client->cl_auth; return rpcauth_lookupcred(auth, lookupflags); } static int rpcauth_bindcred(struct rpc_task *task, const struct cred *cred, int flags) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_cred *new = NULL; int lookupflags = 0; struct rpc_auth *auth = task->tk_client->cl_auth; struct auth_cred acred = { .cred = cred, }; if (flags & RPC_TASK_ASYNC) lookupflags |= RPCAUTH_LOOKUP_NEW | RPCAUTH_LOOKUP_ASYNC; if (task->tk_op_cred) /* Task must use exactly this rpc_cred */ new = get_rpccred(task->tk_op_cred); else if (cred != NULL && cred != &machine_cred) new = auth->au_ops->lookup_cred(auth, &acred, lookupflags); else if (cred == &machine_cred) new = rpcauth_bind_machine_cred(task, lookupflags); /* If machine cred couldn't be bound, try a root cred */ if (new) ; else if (cred == &machine_cred) new = rpcauth_bind_root_cred(task, lookupflags); else if (flags & RPC_TASK_NULLCREDS) new = authnull_ops.lookup_cred(NULL, NULL, 0); else new = rpcauth_bind_new_cred(task, lookupflags); if (IS_ERR(new)) return PTR_ERR(new); put_rpccred(req->rq_cred); req->rq_cred = new; return 0; } void put_rpccred(struct rpc_cred *cred) { if (cred == NULL) return; rcu_read_lock(); if (refcount_dec_and_test(&cred->cr_count)) goto destroy; if (refcount_read(&cred->cr_count) != 1 || !test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags)) goto out; if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0) { cred->cr_expire = jiffies; rpcauth_lru_add(cred); /* Race breaker */ if (unlikely(!test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags))) rpcauth_lru_remove(cred); } else if (rpcauth_unhash_cred(cred)) { rpcauth_lru_remove(cred); if (refcount_dec_and_test(&cred->cr_count)) goto destroy; } out: rcu_read_unlock(); return; destroy: rcu_read_unlock(); cred->cr_ops->crdestroy(cred); } EXPORT_SYMBOL_GPL(put_rpccred); /** * rpcauth_marshcred - Append RPC credential to end of @xdr * @task: controlling RPC task * @xdr: xdr_stream containing initial portion of RPC Call header * * On success, an appropriate verifier is added to @xdr, @xdr is * updated to point past the verifier, and zero is returned. * Otherwise, @xdr is in an undefined state and a negative errno * is returned. */ int rpcauth_marshcred(struct rpc_task *task, struct xdr_stream *xdr) { const struct rpc_credops *ops = task->tk_rqstp->rq_cred->cr_ops; return ops->crmarshal(task, xdr); } /** * rpcauth_wrap_req_encode - XDR encode the RPC procedure * @task: controlling RPC task * @xdr: stream where on-the-wire bytes are to be marshalled * * On success, @xdr contains the encoded and wrapped message. * Otherwise, @xdr is in an undefined state. */ int rpcauth_wrap_req_encode(struct rpc_task *task, struct xdr_stream *xdr) { kxdreproc_t encode = task->tk_msg.rpc_proc->p_encode; encode(task->tk_rqstp, xdr, task->tk_msg.rpc_argp); return 0; } EXPORT_SYMBOL_GPL(rpcauth_wrap_req_encode); /** * rpcauth_wrap_req - XDR encode and wrap the RPC procedure * @task: controlling RPC task * @xdr: stream where on-the-wire bytes are to be marshalled * * On success, @xdr contains the encoded and wrapped message, * and zero is returned. Otherwise, @xdr is in an undefined * state and a negative errno is returned. */ int rpcauth_wrap_req(struct rpc_task *task, struct xdr_stream *xdr) { const struct rpc_credops *ops = task->tk_rqstp->rq_cred->cr_ops; return ops->crwrap_req(task, xdr); } /** * rpcauth_checkverf - Validate verifier in RPC Reply header * @task: controlling RPC task * @xdr: xdr_stream containing RPC Reply header * * Return values: * %0: Verifier is valid. @xdr now points past the verifier. * %-EIO: Verifier is corrupted or message ended early. * %-EACCES: Verifier is intact but not valid. * %-EPROTONOSUPPORT: Server does not support the requested auth type. * * When a negative errno is returned, @xdr is left in an undefined * state. */ int rpcauth_checkverf(struct rpc_task *task, struct xdr_stream *xdr) { const struct rpc_credops *ops = task->tk_rqstp->rq_cred->cr_ops; return ops->crvalidate(task, xdr); } /** * rpcauth_unwrap_resp_decode - Invoke XDR decode function * @task: controlling RPC task * @xdr: stream where the Reply message resides * * Returns zero on success; otherwise a negative errno is returned. */ int rpcauth_unwrap_resp_decode(struct rpc_task *task, struct xdr_stream *xdr) { kxdrdproc_t decode = task->tk_msg.rpc_proc->p_decode; return decode(task->tk_rqstp, xdr, task->tk_msg.rpc_resp); } EXPORT_SYMBOL_GPL(rpcauth_unwrap_resp_decode); /** * rpcauth_unwrap_resp - Invoke unwrap and decode function for the cred * @task: controlling RPC task * @xdr: stream where the Reply message resides * * Returns zero on success; otherwise a negative errno is returned. */ int rpcauth_unwrap_resp(struct rpc_task *task, struct xdr_stream *xdr) { const struct rpc_credops *ops = task->tk_rqstp->rq_cred->cr_ops; return ops->crunwrap_resp(task, xdr); } bool rpcauth_xmit_need_reencode(struct rpc_task *task) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; if (!cred || !cred->cr_ops->crneed_reencode) return false; return cred->cr_ops->crneed_reencode(task); } int rpcauth_refreshcred(struct rpc_task *task) { struct rpc_cred *cred; int err; cred = task->tk_rqstp->rq_cred; if (cred == NULL) { err = rpcauth_bindcred(task, task->tk_msg.rpc_cred, task->tk_flags); if (err < 0) goto out; cred = task->tk_rqstp->rq_cred; } err = cred->cr_ops->crrefresh(task); out: if (err < 0) task->tk_status = err; return err; } void rpcauth_invalcred(struct rpc_task *task) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; if (cred) clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); } int rpcauth_uptodatecred(struct rpc_task *task) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; return cred == NULL || test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0; } static struct shrinker *rpc_cred_shrinker; int __init rpcauth_init_module(void) { int err; err = rpc_init_authunix(); if (err < 0) goto out1; rpc_cred_shrinker = shrinker_alloc(0, "sunrpc_cred"); if (!rpc_cred_shrinker) { err = -ENOMEM; goto out2; } rpc_cred_shrinker->count_objects = rpcauth_cache_shrink_count; rpc_cred_shrinker->scan_objects = rpcauth_cache_shrink_scan; shrinker_register(rpc_cred_shrinker); return 0; out2: rpc_destroy_authunix(); out1: return err; } void rpcauth_remove_module(void) { rpc_destroy_authunix(); shrinker_free(rpc_cred_shrinker); }
7 1 22 34 75 75 34 15 41 40 2 2 1 14 107 1898 1898 2 76 5 5 1409 1068 14 296 38 336 427 201 197 145 201 900 875 83 72 126 21 45 34 85 2003 2001 1936 1934 34 31 11 9 14 1679 115 2301 29 37 3 744 2 2562 2243 1 119 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 /* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */ #ifndef _LINUX_BPF_H #define _LINUX_BPF_H 1 #include <uapi/linux/bpf.h> #include <uapi/linux/filter.h> #include <crypto/sha2.h> #include <linux/workqueue.h> #include <linux/file.h> #include <linux/percpu.h> #include <linux/err.h> #include <linux/rbtree_latch.h> #include <linux/numa.h> #include <linux/mm_types.h> #include <linux/wait.h> #include <linux/refcount.h> #include <linux/mutex.h> #include <linux/module.h> #include <linux/kallsyms.h> #include <linux/capability.h> #include <linux/sched/mm.h> #include <linux/slab.h> #include <linux/percpu-refcount.h> #include <linux/stddef.h> #include <linux/bpfptr.h> #include <linux/btf.h> #include <linux/rcupdate_trace.h> #include <linux/static_call.h> #include <linux/memcontrol.h> #include <linux/cfi.h> #include <asm/rqspinlock.h> struct bpf_verifier_env; struct bpf_verifier_log; struct perf_event; struct bpf_prog; struct bpf_prog_aux; struct bpf_map; struct bpf_arena; struct sock; struct seq_file; struct btf; struct btf_type; struct exception_table_entry; struct seq_operations; struct bpf_iter_aux_info; struct bpf_local_storage; struct bpf_local_storage_map; struct kobject; struct mem_cgroup; struct module; struct bpf_func_state; struct ftrace_ops; struct cgroup; struct bpf_token; struct user_namespace; struct super_block; struct inode; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; extern struct kobject *btf_kobj; extern struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma; extern bool bpf_global_ma_set; typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64); typedef int (*bpf_iter_init_seq_priv_t)(void *private_data, struct bpf_iter_aux_info *aux); typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data); typedef unsigned int (*bpf_func_t)(const void *, const struct bpf_insn *); struct bpf_iter_seq_info { const struct seq_operations *seq_ops; bpf_iter_init_seq_priv_t init_seq_private; bpf_iter_fini_seq_priv_t fini_seq_private; u32 seq_priv_size; }; /* map is generic key/value storage optionally accessible by eBPF programs */ struct bpf_map_ops { /* funcs callable from userspace (via syscall) */ int (*map_alloc_check)(union bpf_attr *attr); struct bpf_map *(*map_alloc)(union bpf_attr *attr); void (*map_release)(struct bpf_map *map, struct file *map_file); void (*map_free)(struct bpf_map *map); int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key); void (*map_release_uref)(struct bpf_map *map); void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key); int (*map_lookup_batch)(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); int (*map_lookup_and_delete_elem)(struct bpf_map *map, void *key, void *value, u64 flags); int (*map_lookup_and_delete_batch)(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); int (*map_update_batch)(struct bpf_map *map, struct file *map_file, const union bpf_attr *attr, union bpf_attr __user *uattr); int (*map_delete_batch)(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); /* funcs callable from userspace and from eBPF programs */ void *(*map_lookup_elem)(struct bpf_map *map, void *key); long (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags); long (*map_delete_elem)(struct bpf_map *map, void *key); long (*map_push_elem)(struct bpf_map *map, void *value, u64 flags); long (*map_pop_elem)(struct bpf_map *map, void *value); long (*map_peek_elem)(struct bpf_map *map, void *value); void *(*map_lookup_percpu_elem)(struct bpf_map *map, void *key, u32 cpu); int (*map_get_hash)(struct bpf_map *map, u32 hash_buf_size, void *hash_buf); /* funcs called by prog_array and perf_event_array map */ void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, int fd); /* If need_defer is true, the implementation should guarantee that * the to-be-put element is still alive before the bpf program, which * may manipulate it, exists. */ void (*map_fd_put_ptr)(struct bpf_map *map, void *ptr, bool need_defer); int (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); u32 (*map_fd_sys_lookup_elem)(void *ptr); void (*map_seq_show_elem)(struct bpf_map *map, void *key, struct seq_file *m); int (*map_check_btf)(const struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type); /* Prog poke tracking helpers. */ int (*map_poke_track)(struct bpf_map *map, struct bpf_prog_aux *aux); void (*map_poke_untrack)(struct bpf_map *map, struct bpf_prog_aux *aux); void (*map_poke_run)(struct bpf_map *map, u32 key, struct bpf_prog *old, struct bpf_prog *new); /* Direct value access helpers. */ int (*map_direct_value_addr)(const struct bpf_map *map, u64 *imm, u32 off); int (*map_direct_value_meta)(const struct bpf_map *map, u64 imm, u32 *off); int (*map_mmap)(struct bpf_map *map, struct vm_area_struct *vma); __poll_t (*map_poll)(struct bpf_map *map, struct file *filp, struct poll_table_struct *pts); unsigned long (*map_get_unmapped_area)(struct file *filep, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); /* Functions called by bpf_local_storage maps */ int (*map_local_storage_charge)(struct bpf_local_storage_map *smap, void *owner, u32 size); void (*map_local_storage_uncharge)(struct bpf_local_storage_map *smap, void *owner, u32 size); struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner); /* Misc helpers.*/ long (*map_redirect)(struct bpf_map *map, u64 key, u64 flags); /* map_meta_equal must be implemented for maps that can be * used as an inner map. It is a runtime check to ensure * an inner map can be inserted to an outer map. * * Some properties of the inner map has been used during the * verification time. When inserting an inner map at the runtime, * map_meta_equal has to ensure the inserting map has the same * properties that the verifier has used earlier. */ bool (*map_meta_equal)(const struct bpf_map *meta0, const struct bpf_map *meta1); int (*map_set_for_each_callback_args)(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee); long (*map_for_each_callback)(struct bpf_map *map, bpf_callback_t callback_fn, void *callback_ctx, u64 flags); u64 (*map_mem_usage)(const struct bpf_map *map); /* BTF id of struct allocated by map_alloc */ int *map_btf_id; /* bpf_iter info used to open a seq_file */ const struct bpf_iter_seq_info *iter_seq_info; }; enum { /* Support at most 11 fields in a BTF type */ BTF_FIELDS_MAX = 11, }; enum btf_field_type { BPF_SPIN_LOCK = (1 << 0), BPF_TIMER = (1 << 1), BPF_KPTR_UNREF = (1 << 2), BPF_KPTR_REF = (1 << 3), BPF_KPTR_PERCPU = (1 << 4), BPF_KPTR = BPF_KPTR_UNREF | BPF_KPTR_REF | BPF_KPTR_PERCPU, BPF_LIST_HEAD = (1 << 5), BPF_LIST_NODE = (1 << 6), BPF_RB_ROOT = (1 << 7), BPF_RB_NODE = (1 << 8), BPF_GRAPH_NODE = BPF_RB_NODE | BPF_LIST_NODE, BPF_GRAPH_ROOT = BPF_RB_ROOT | BPF_LIST_HEAD, BPF_REFCOUNT = (1 << 9), BPF_WORKQUEUE = (1 << 10), BPF_UPTR = (1 << 11), BPF_RES_SPIN_LOCK = (1 << 12), BPF_TASK_WORK = (1 << 13), }; enum bpf_cgroup_storage_type { BPF_CGROUP_STORAGE_SHARED, BPF_CGROUP_STORAGE_PERCPU, __BPF_CGROUP_STORAGE_MAX #define MAX_BPF_CGROUP_STORAGE_TYPE __BPF_CGROUP_STORAGE_MAX }; #ifdef CONFIG_CGROUP_BPF # define for_each_cgroup_storage_type(stype) \ for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++) #else # define for_each_cgroup_storage_type(stype) for (; false; ) #endif /* CONFIG_CGROUP_BPF */ typedef void (*btf_dtor_kfunc_t)(void *); struct btf_field_kptr { struct btf *btf; struct module *module; /* dtor used if btf_is_kernel(btf), otherwise the type is * program-allocated, dtor is NULL, and __bpf_obj_drop_impl is used */ btf_dtor_kfunc_t dtor; u32 btf_id; }; struct btf_field_graph_root { struct btf *btf; u32 value_btf_id; u32 node_offset; struct btf_record *value_rec; }; struct btf_field { u32 offset; u32 size; enum btf_field_type type; union { struct btf_field_kptr kptr; struct btf_field_graph_root graph_root; }; }; struct btf_record { u32 cnt; u32 field_mask; int spin_lock_off; int res_spin_lock_off; int timer_off; int wq_off; int refcount_off; int task_work_off; struct btf_field fields[]; }; /* Non-opaque version of bpf_rb_node in uapi/linux/bpf.h */ struct bpf_rb_node_kern { struct rb_node rb_node; void *owner; } __attribute__((aligned(8))); /* Non-opaque version of bpf_list_node in uapi/linux/bpf.h */ struct bpf_list_node_kern { struct list_head list_head; void *owner; } __attribute__((aligned(8))); /* 'Ownership' of program-containing map is claimed by the first program * that is going to use this map or by the first program which FD is * stored in the map to make sure that all callers and callees have the * same prog type, JITed flag and xdp_has_frags flag. */ struct bpf_map_owner { enum bpf_prog_type type; bool jited; bool xdp_has_frags; u64 storage_cookie[MAX_BPF_CGROUP_STORAGE_TYPE]; const struct btf_type *attach_func_proto; enum bpf_attach_type expected_attach_type; }; struct bpf_map { u8 sha[SHA256_DIGEST_SIZE]; const struct bpf_map_ops *ops; struct bpf_map *inner_map_meta; #ifdef CONFIG_SECURITY void *security; #endif enum bpf_map_type map_type; u32 key_size; u32 value_size; u32 max_entries; u64 map_extra; /* any per-map-type extra fields */ u32 map_flags; u32 id; struct btf_record *record; int numa_node; u32 btf_key_type_id; u32 btf_value_type_id; u32 btf_vmlinux_value_type_id; struct btf *btf; #ifdef CONFIG_MEMCG struct obj_cgroup *objcg; #endif char name[BPF_OBJ_NAME_LEN]; struct mutex freeze_mutex; atomic64_t refcnt; atomic64_t usercnt; /* rcu is used before freeing and work is only used during freeing */ union { struct work_struct work; struct rcu_head rcu; }; atomic64_t writecnt; spinlock_t owner_lock; struct bpf_map_owner *owner; bool bypass_spec_v1; bool frozen; /* write-once; write-protected by freeze_mutex */ bool free_after_mult_rcu_gp; bool free_after_rcu_gp; atomic64_t sleepable_refcnt; s64 __percpu *elem_count; u64 cookie; /* write-once */ char *excl_prog_sha; }; static inline const char *btf_field_type_name(enum btf_field_type type) { switch (type) { case BPF_SPIN_LOCK: return "bpf_spin_lock"; case BPF_RES_SPIN_LOCK: return "bpf_res_spin_lock"; case BPF_TIMER: return "bpf_timer"; case BPF_WORKQUEUE: return "bpf_wq"; case BPF_KPTR_UNREF: case BPF_KPTR_REF: return "kptr"; case BPF_KPTR_PERCPU: return "percpu_kptr"; case BPF_UPTR: return "uptr"; case BPF_LIST_HEAD: return "bpf_list_head"; case BPF_LIST_NODE: return "bpf_list_node"; case BPF_RB_ROOT: return "bpf_rb_root"; case BPF_RB_NODE: return "bpf_rb_node"; case BPF_REFCOUNT: return "bpf_refcount"; case BPF_TASK_WORK: return "bpf_task_work"; default: WARN_ON_ONCE(1); return "unknown"; } } #if IS_ENABLED(CONFIG_DEBUG_KERNEL) #define BPF_WARN_ONCE(cond, format...) WARN_ONCE(cond, format) #else #define BPF_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) #endif static inline u32 btf_field_type_size(enum btf_field_type type) { switch (type) { case BPF_SPIN_LOCK: return sizeof(struct bpf_spin_lock); case BPF_RES_SPIN_LOCK: return sizeof(struct bpf_res_spin_lock); case BPF_TIMER: return sizeof(struct bpf_timer); case BPF_WORKQUEUE: return sizeof(struct bpf_wq); case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: case BPF_UPTR: return sizeof(u64); case BPF_LIST_HEAD: return sizeof(struct bpf_list_head); case BPF_LIST_NODE: return sizeof(struct bpf_list_node); case BPF_RB_ROOT: return sizeof(struct bpf_rb_root); case BPF_RB_NODE: return sizeof(struct bpf_rb_node); case BPF_REFCOUNT: return sizeof(struct bpf_refcount); case BPF_TASK_WORK: return sizeof(struct bpf_task_work); default: WARN_ON_ONCE(1); return 0; } } static inline u32 btf_field_type_align(enum btf_field_type type) { switch (type) { case BPF_SPIN_LOCK: return __alignof__(struct bpf_spin_lock); case BPF_RES_SPIN_LOCK: return __alignof__(struct bpf_res_spin_lock); case BPF_TIMER: return __alignof__(struct bpf_timer); case BPF_WORKQUEUE: return __alignof__(struct bpf_wq); case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: case BPF_UPTR: return __alignof__(u64); case BPF_LIST_HEAD: return __alignof__(struct bpf_list_head); case BPF_LIST_NODE: return __alignof__(struct bpf_list_node); case BPF_RB_ROOT: return __alignof__(struct bpf_rb_root); case BPF_RB_NODE: return __alignof__(struct bpf_rb_node); case BPF_REFCOUNT: return __alignof__(struct bpf_refcount); case BPF_TASK_WORK: return __alignof__(struct bpf_task_work); default: WARN_ON_ONCE(1); return 0; } } static inline void bpf_obj_init_field(const struct btf_field *field, void *addr) { memset(addr, 0, field->size); switch (field->type) { case BPF_REFCOUNT: refcount_set((refcount_t *)addr, 1); break; case BPF_RB_NODE: RB_CLEAR_NODE((struct rb_node *)addr); break; case BPF_LIST_HEAD: case BPF_LIST_NODE: INIT_LIST_HEAD((struct list_head *)addr); break; case BPF_RB_ROOT: /* RB_ROOT_CACHED 0-inits, no need to do anything after memset */ case BPF_SPIN_LOCK: case BPF_RES_SPIN_LOCK: case BPF_TIMER: case BPF_WORKQUEUE: case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: case BPF_UPTR: case BPF_TASK_WORK: break; default: WARN_ON_ONCE(1); return; } } static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_field_type type) { if (IS_ERR_OR_NULL(rec)) return false; return rec->field_mask & type; } static inline void bpf_obj_init(const struct btf_record *rec, void *obj) { int i; if (IS_ERR_OR_NULL(rec)) return; for (i = 0; i < rec->cnt; i++) bpf_obj_init_field(&rec->fields[i], obj + rec->fields[i].offset); } /* 'dst' must be a temporary buffer and should not point to memory that is being * used in parallel by a bpf program or bpf syscall, otherwise the access from * the bpf program or bpf syscall may be corrupted by the reinitialization, * leading to weird problems. Even 'dst' is newly-allocated from bpf memory * allocator, it is still possible for 'dst' to be used in parallel by a bpf * program or bpf syscall. */ static inline void check_and_init_map_value(struct bpf_map *map, void *dst) { bpf_obj_init(map->record, dst); } /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and * forced to use 'long' read/writes to try to atomically copy long counters. * Best-effort only. No barriers here, since it _will_ race with concurrent * updates from BPF programs. Called from bpf syscall and mostly used with * size 8 or 16 bytes, so ask compiler to inline it. */ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) { const long *lsrc = src; long *ldst = dst; size /= sizeof(long); while (size--) data_race(*ldst++ = *lsrc++); } /* copy everything but bpf_spin_lock, bpf_timer, and kptrs. There could be one of each. */ static inline void bpf_obj_memcpy(struct btf_record *rec, void *dst, void *src, u32 size, bool long_memcpy) { u32 curr_off = 0; int i; if (IS_ERR_OR_NULL(rec)) { if (long_memcpy) bpf_long_memcpy(dst, src, round_up(size, 8)); else memcpy(dst, src, size); return; } for (i = 0; i < rec->cnt; i++) { u32 next_off = rec->fields[i].offset; u32 sz = next_off - curr_off; memcpy(dst + curr_off, src + curr_off, sz); curr_off += rec->fields[i].size + sz; } memcpy(dst + curr_off, src + curr_off, size - curr_off); } static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) { bpf_obj_memcpy(map->record, dst, src, map->value_size, false); } static inline void copy_map_value_long(struct bpf_map *map, void *dst, void *src) { bpf_obj_memcpy(map->record, dst, src, map->value_size, true); } static inline void bpf_obj_swap_uptrs(const struct btf_record *rec, void *dst, void *src) { unsigned long *src_uptr, *dst_uptr; const struct btf_field *field; int i; if (!btf_record_has_field(rec, BPF_UPTR)) return; for (i = 0, field = rec->fields; i < rec->cnt; i++, field++) { if (field->type != BPF_UPTR) continue; src_uptr = src + field->offset; dst_uptr = dst + field->offset; swap(*src_uptr, *dst_uptr); } } static inline void bpf_obj_memzero(struct btf_record *rec, void *dst, u32 size) { u32 curr_off = 0; int i; if (IS_ERR_OR_NULL(rec)) { memset(dst, 0, size); return; } for (i = 0; i < rec->cnt; i++) { u32 next_off = rec->fields[i].offset; u32 sz = next_off - curr_off; memset(dst + curr_off, 0, sz); curr_off += rec->fields[i].size + sz; } memset(dst + curr_off, 0, size - curr_off); } static inline void zero_map_value(struct bpf_map *map, void *dst) { bpf_obj_memzero(map->record, dst, map->value_size); } void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); void bpf_timer_cancel_and_free(void *timer); void bpf_wq_cancel_and_free(void *timer); void bpf_task_work_cancel_and_free(void *timer); void bpf_list_head_free(const struct btf_field *field, void *list_head, struct bpf_spin_lock *spin_lock); void bpf_rb_root_free(const struct btf_field *field, void *rb_root, struct bpf_spin_lock *spin_lock); u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena); u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena); int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size); struct bpf_offload_dev; struct bpf_offloaded_map; struct bpf_map_dev_ops { int (*map_get_next_key)(struct bpf_offloaded_map *map, void *key, void *next_key); int (*map_lookup_elem)(struct bpf_offloaded_map *map, void *key, void *value); int (*map_update_elem)(struct bpf_offloaded_map *map, void *key, void *value, u64 flags); int (*map_delete_elem)(struct bpf_offloaded_map *map, void *key); }; struct bpf_offloaded_map { struct bpf_map map; struct net_device *netdev; const struct bpf_map_dev_ops *dev_ops; void *dev_priv; struct list_head offloads; }; static inline struct bpf_offloaded_map *map_to_offmap(struct bpf_map *map) { return container_of(map, struct bpf_offloaded_map, map); } static inline bool bpf_map_offload_neutral(const struct bpf_map *map) { return map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY; } static inline bool bpf_map_support_seq_show(const struct bpf_map *map) { return (map->btf_value_type_id || map->btf_vmlinux_value_type_id) && map->ops->map_seq_show_elem; } int map_check_no_btf(const struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type); bool bpf_map_meta_equal(const struct bpf_map *meta0, const struct bpf_map *meta1); static inline bool bpf_map_has_internal_structs(struct bpf_map *map) { return btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK); } void bpf_map_free_internal_structs(struct bpf_map *map, void *obj); int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit); extern const struct bpf_map_ops bpf_map_offload_ops; /* bpf_type_flag contains a set of flags that are applicable to the values of * arg_type, ret_type and reg_type. For example, a pointer value may be null, * or a memory is read-only. We classify types into two categories: base types * and extended types. Extended types are base types combined with a type flag. * * Currently there are no more than 32 base types in arg_type, ret_type and * reg_types. */ #define BPF_BASE_TYPE_BITS 8 enum bpf_type_flag { /* PTR may be NULL. */ PTR_MAYBE_NULL = BIT(0 + BPF_BASE_TYPE_BITS), /* MEM is read-only. When applied on bpf_arg, it indicates the arg is * compatible with both mutable and immutable memory. */ MEM_RDONLY = BIT(1 + BPF_BASE_TYPE_BITS), /* MEM points to BPF ring buffer reservation. */ MEM_RINGBUF = BIT(2 + BPF_BASE_TYPE_BITS), /* MEM is in user address space. */ MEM_USER = BIT(3 + BPF_BASE_TYPE_BITS), /* MEM is a percpu memory. MEM_PERCPU tags PTR_TO_BTF_ID. When tagged * with MEM_PERCPU, PTR_TO_BTF_ID _cannot_ be directly accessed. In * order to drop this tag, it must be passed into bpf_per_cpu_ptr() * or bpf_this_cpu_ptr(), which will return the pointer corresponding * to the specified cpu. */ MEM_PERCPU = BIT(4 + BPF_BASE_TYPE_BITS), /* Indicates that the argument will be released. */ OBJ_RELEASE = BIT(5 + BPF_BASE_TYPE_BITS), /* PTR is not trusted. This is only used with PTR_TO_BTF_ID, to mark * unreferenced and referenced kptr loaded from map value using a load * instruction, so that they can only be dereferenced but not escape the * BPF program into the kernel (i.e. cannot be passed as arguments to * kfunc or bpf helpers). */ PTR_UNTRUSTED = BIT(6 + BPF_BASE_TYPE_BITS), /* MEM can be uninitialized. */ MEM_UNINIT = BIT(7 + BPF_BASE_TYPE_BITS), /* DYNPTR points to memory local to the bpf program. */ DYNPTR_TYPE_LOCAL = BIT(8 + BPF_BASE_TYPE_BITS), /* DYNPTR points to a kernel-produced ringbuf record. */ DYNPTR_TYPE_RINGBUF = BIT(9 + BPF_BASE_TYPE_BITS), /* Size is known at compile time. */ MEM_FIXED_SIZE = BIT(10 + BPF_BASE_TYPE_BITS), /* MEM is of an allocated object of type in program BTF. This is used to * tag PTR_TO_BTF_ID allocated using bpf_obj_new. */ MEM_ALLOC = BIT(11 + BPF_BASE_TYPE_BITS), /* PTR was passed from the kernel in a trusted context, and may be * passed to KF_TRUSTED_ARGS kfuncs or BPF helper functions. * Confusingly, this is _not_ the opposite of PTR_UNTRUSTED above. * PTR_UNTRUSTED refers to a kptr that was read directly from a map * without invoking bpf_kptr_xchg(). What we really need to know is * whether a pointer is safe to pass to a kfunc or BPF helper function. * While PTR_UNTRUSTED pointers are unsafe to pass to kfuncs and BPF * helpers, they do not cover all possible instances of unsafe * pointers. For example, a pointer that was obtained from walking a * struct will _not_ get the PTR_UNTRUSTED type modifier, despite the * fact that it may be NULL, invalid, etc. This is due to backwards * compatibility requirements, as this was the behavior that was first * introduced when kptrs were added. The behavior is now considered * deprecated, and PTR_UNTRUSTED will eventually be removed. * * PTR_TRUSTED, on the other hand, is a pointer that the kernel * guarantees to be valid and safe to pass to kfuncs and BPF helpers. * For example, pointers passed to tracepoint arguments are considered * PTR_TRUSTED, as are pointers that are passed to struct_ops * callbacks. As alluded to above, pointers that are obtained from * walking PTR_TRUSTED pointers are _not_ trusted. For example, if a * struct task_struct *task is PTR_TRUSTED, then accessing * task->last_wakee will lose the PTR_TRUSTED modifier when it's stored * in a BPF register. Similarly, pointers passed to certain programs * types such as kretprobes are not guaranteed to be valid, as they may * for example contain an object that was recently freed. */ PTR_TRUSTED = BIT(12 + BPF_BASE_TYPE_BITS), /* MEM is tagged with rcu and memory access needs rcu_read_lock protection. */ MEM_RCU = BIT(13 + BPF_BASE_TYPE_BITS), /* Used to tag PTR_TO_BTF_ID | MEM_ALLOC references which are non-owning. * Currently only valid for linked-list and rbtree nodes. If the nodes * have a bpf_refcount_field, they must be tagged MEM_RCU as well. */ NON_OWN_REF = BIT(14 + BPF_BASE_TYPE_BITS), /* DYNPTR points to sk_buff */ DYNPTR_TYPE_SKB = BIT(15 + BPF_BASE_TYPE_BITS), /* DYNPTR points to xdp_buff */ DYNPTR_TYPE_XDP = BIT(16 + BPF_BASE_TYPE_BITS), /* Memory must be aligned on some architectures, used in combination with * MEM_FIXED_SIZE. */ MEM_ALIGNED = BIT(17 + BPF_BASE_TYPE_BITS), /* MEM is being written to, often combined with MEM_UNINIT. Non-presence * of MEM_WRITE means that MEM is only being read. MEM_WRITE without the * MEM_UNINIT means that memory needs to be initialized since it is also * read. */ MEM_WRITE = BIT(18 + BPF_BASE_TYPE_BITS), /* DYNPTR points to skb_metadata_end()-skb_metadata_len() */ DYNPTR_TYPE_SKB_META = BIT(19 + BPF_BASE_TYPE_BITS), /* DYNPTR points to file */ DYNPTR_TYPE_FILE = BIT(20 + BPF_BASE_TYPE_BITS), __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; #define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB \ | DYNPTR_TYPE_XDP | DYNPTR_TYPE_SKB_META | DYNPTR_TYPE_FILE) /* Max number of base types. */ #define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS) /* Max number of all types. */ #define BPF_TYPE_LIMIT (__BPF_TYPE_LAST_FLAG | (__BPF_TYPE_LAST_FLAG - 1)) /* function argument constraints */ enum bpf_arg_type { ARG_DONTCARE = 0, /* unused argument in helper function */ /* the following constraints used to prototype * bpf_map_lookup/update/delete_elem() functions */ ARG_CONST_MAP_PTR, /* const argument used as pointer to bpf_map */ ARG_PTR_TO_MAP_KEY, /* pointer to stack used as map key */ ARG_PTR_TO_MAP_VALUE, /* pointer to stack used as map value */ /* Used to prototype bpf_memcmp() and other functions that access data * on eBPF program stack */ ARG_PTR_TO_MEM, /* pointer to valid memory (stack, packet, map value) */ ARG_PTR_TO_ARENA, ARG_CONST_SIZE, /* number of bytes accessed from memory */ ARG_CONST_SIZE_OR_ZERO, /* number of bytes accessed from memory or 0 */ ARG_PTR_TO_CTX, /* pointer to context */ ARG_ANYTHING, /* any (initialized) argument is ok */ ARG_PTR_TO_SPIN_LOCK, /* pointer to bpf_spin_lock */ ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */ ARG_PTR_TO_RINGBUF_MEM, /* pointer to dynamically reserved ringbuf memory */ ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */ ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */ ARG_PTR_TO_FUNC, /* pointer to a bpf program function */ ARG_PTR_TO_STACK, /* pointer to stack */ ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */ ARG_PTR_TO_TIMER, /* pointer to bpf_timer */ ARG_KPTR_XCHG_DEST, /* pointer to destination that kptrs are bpf_kptr_xchg'd into */ ARG_PTR_TO_DYNPTR, /* pointer to bpf_dynptr. See bpf_type_flag for dynptr type */ __BPF_ARG_TYPE_MAX, /* Extended arg_types. */ ARG_PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MAP_VALUE, ARG_PTR_TO_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MEM, ARG_PTR_TO_CTX_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_CTX, ARG_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_SOCKET, ARG_PTR_TO_STACK_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_STACK, ARG_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, /* Pointer to memory does not need to be initialized, since helper function * fills all bytes or clears them in error case. */ ARG_PTR_TO_UNINIT_MEM = MEM_UNINIT | MEM_WRITE | ARG_PTR_TO_MEM, /* Pointer to valid memory of size known at compile time. */ ARG_PTR_TO_FIXED_SIZE_MEM = MEM_FIXED_SIZE | ARG_PTR_TO_MEM, /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. */ __BPF_ARG_TYPE_LIMIT = BPF_TYPE_LIMIT, }; static_assert(__BPF_ARG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT); /* type of values returned from helper functions */ enum bpf_return_type { RET_INTEGER, /* function returns integer */ RET_VOID, /* function doesn't return anything */ RET_PTR_TO_MAP_VALUE, /* returns a pointer to map elem value */ RET_PTR_TO_SOCKET, /* returns a pointer to a socket */ RET_PTR_TO_TCP_SOCK, /* returns a pointer to a tcp_sock */ RET_PTR_TO_SOCK_COMMON, /* returns a pointer to a sock_common */ RET_PTR_TO_MEM, /* returns a pointer to memory */ RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */ RET_PTR_TO_BTF_ID, /* returns a pointer to a btf_id */ __BPF_RET_TYPE_MAX, /* Extended ret_types. */ RET_PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_MAP_VALUE, RET_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCKET, RET_PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK, RET_PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON, RET_PTR_TO_RINGBUF_MEM_OR_NULL = PTR_MAYBE_NULL | MEM_RINGBUF | RET_PTR_TO_MEM, RET_PTR_TO_DYNPTR_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_MEM, RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID, RET_PTR_TO_BTF_ID_TRUSTED = PTR_TRUSTED | RET_PTR_TO_BTF_ID, /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. */ __BPF_RET_TYPE_LIMIT = BPF_TYPE_LIMIT, }; static_assert(__BPF_RET_TYPE_MAX <= BPF_BASE_TYPE_LIMIT); /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs * to in-kernel helper functions and for adjusting imm32 field in BPF_CALL * instructions after verifying */ struct bpf_func_proto { u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); bool gpl_only; bool pkt_access; bool might_sleep; /* set to true if helper follows contract for llvm * attribute bpf_fastcall: * - void functions do not scratch r0 * - functions taking N arguments scratch only registers r1-rN */ bool allow_fastcall; enum bpf_return_type ret_type; union { struct { enum bpf_arg_type arg1_type; enum bpf_arg_type arg2_type; enum bpf_arg_type arg3_type; enum bpf_arg_type arg4_type; enum bpf_arg_type arg5_type; }; enum bpf_arg_type arg_type[5]; }; union { struct { u32 *arg1_btf_id; u32 *arg2_btf_id; u32 *arg3_btf_id; u32 *arg4_btf_id; u32 *arg5_btf_id; }; u32 *arg_btf_id[5]; struct { size_t arg1_size; size_t arg2_size; size_t arg3_size; size_t arg4_size; size_t arg5_size; }; size_t arg_size[5]; }; int *ret_btf_id; /* return value btf_id */ bool (*allowed)(const struct bpf_prog *prog); }; /* bpf_context is intentionally undefined structure. Pointer to bpf_context is * the first argument to eBPF programs. * For socket filters: 'struct bpf_context *' == 'struct sk_buff *' */ struct bpf_context; enum bpf_access_type { BPF_READ = 1, BPF_WRITE = 2 }; /* types of values stored in eBPF registers */ /* Pointer types represent: * pointer * pointer + imm * pointer + (u16) var * pointer + (u16) var + imm * if (range > 0) then [ptr, ptr + range - off) is safe to access * if (id > 0) means that some 'var' was added * if (off > 0) means that 'imm' was added */ enum bpf_reg_type { NOT_INIT = 0, /* nothing was written into register */ SCALAR_VALUE, /* reg doesn't contain a valid pointer */ PTR_TO_CTX, /* reg points to bpf_context */ CONST_PTR_TO_MAP, /* reg points to struct bpf_map */ PTR_TO_MAP_VALUE, /* reg points to map element value */ PTR_TO_MAP_KEY, /* reg points to a map element key */ PTR_TO_STACK, /* reg == frame_pointer + offset */ PTR_TO_PACKET_META, /* skb->data - meta_len */ PTR_TO_PACKET, /* reg points to skb->data */ PTR_TO_PACKET_END, /* skb->data + headlen */ PTR_TO_FLOW_KEYS, /* reg points to bpf_flow_keys */ PTR_TO_SOCKET, /* reg points to struct bpf_sock */ PTR_TO_SOCK_COMMON, /* reg points to sock_common */ PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */ PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */ PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */ /* PTR_TO_BTF_ID points to a kernel struct that does not need * to be null checked by the BPF program. This does not imply the * pointer is _not_ null and in practice this can easily be a null * pointer when reading pointer chains. The assumption is program * context will handle null pointer dereference typically via fault * handling. The verifier must keep this in mind and can make no * assumptions about null or non-null when doing branch analysis. * Further, when passed into helpers the helpers can not, without * additional context, assume the value is non-null. */ PTR_TO_BTF_ID, PTR_TO_MEM, /* reg points to valid memory region */ PTR_TO_ARENA, PTR_TO_BUF, /* reg points to a read/write buffer */ PTR_TO_FUNC, /* reg points to a bpf program function */ PTR_TO_INSN, /* reg points to a bpf program instruction */ CONST_PTR_TO_DYNPTR, /* reg points to a const struct bpf_dynptr */ __BPF_REG_TYPE_MAX, /* Extended reg_types. */ PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCKET, PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCK_COMMON, PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | PTR_TO_TCP_SOCK, /* PTR_TO_BTF_ID_OR_NULL points to a kernel struct that has not * been checked for null. Used primarily to inform the verifier * an explicit null check is required for this struct. */ PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | PTR_TO_BTF_ID, /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. */ __BPF_REG_TYPE_LIMIT = BPF_TYPE_LIMIT, }; static_assert(__BPF_REG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT); /* The information passed from prog-specific *_is_valid_access * back to the verifier. */ struct bpf_insn_access_aux { enum bpf_reg_type reg_type; bool is_ldsx; union { int ctx_field_size; struct { struct btf *btf; u32 btf_id; u32 ref_obj_id; }; }; struct bpf_verifier_log *log; /* for verbose logs */ bool is_retval; /* is accessing function return value ? */ }; static inline void bpf_ctx_record_field_size(struct bpf_insn_access_aux *aux, u32 size) { aux->ctx_field_size = size; } static bool bpf_is_ldimm64(const struct bpf_insn *insn) { return insn->code == (BPF_LD | BPF_IMM | BPF_DW); } static inline bool bpf_pseudo_func(const struct bpf_insn *insn) { return bpf_is_ldimm64(insn) && insn->src_reg == BPF_PSEUDO_FUNC; } /* Given a BPF_ATOMIC instruction @atomic_insn, return true if it is an * atomic load or store, and false if it is a read-modify-write instruction. */ static inline bool bpf_atomic_is_load_store(const struct bpf_insn *atomic_insn) { switch (atomic_insn->imm) { case BPF_LOAD_ACQ: case BPF_STORE_REL: return true; default: return false; } } struct bpf_prog_ops { int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); }; struct bpf_reg_state; struct bpf_verifier_ops { /* return eBPF function prototype for verification */ const struct bpf_func_proto * (*get_func_proto)(enum bpf_func_id func_id, const struct bpf_prog *prog); /* return true if 'size' wide access at offset 'off' within bpf_context * with 'type' (read or write) is allowed */ bool (*is_valid_access)(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info); int (*gen_prologue)(struct bpf_insn *insn, bool direct_write, const struct bpf_prog *prog); int (*gen_epilogue)(struct bpf_insn *insn, const struct bpf_prog *prog, s16 ctx_stack_off); int (*gen_ld_abs)(const struct bpf_insn *orig, struct bpf_insn *insn_buf); u32 (*convert_ctx_access)(enum bpf_access_type type, const struct bpf_insn *src, struct bpf_insn *dst, struct bpf_prog *prog, u32 *target_size); int (*btf_struct_access)(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size); }; struct bpf_prog_offload_ops { /* verifier basic callbacks */ int (*insn_hook)(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx); int (*finalize)(struct bpf_verifier_env *env); /* verifier optimization callbacks (called after .finalize) */ int (*replace_insn)(struct bpf_verifier_env *env, u32 off, struct bpf_insn *insn); int (*remove_insns)(struct bpf_verifier_env *env, u32 off, u32 cnt); /* program management callbacks */ int (*prepare)(struct bpf_prog *prog); int (*translate)(struct bpf_prog *prog); void (*destroy)(struct bpf_prog *prog); }; struct bpf_prog_offload { struct bpf_prog *prog; struct net_device *netdev; struct bpf_offload_dev *offdev; void *dev_priv; struct list_head offloads; bool dev_state; bool opt_failed; void *jited_image; u32 jited_len; }; /* The longest tracepoint has 12 args. * See include/trace/bpf_probe.h */ #define MAX_BPF_FUNC_ARGS 12 /* The maximum number of arguments passed through registers * a single function may have. */ #define MAX_BPF_FUNC_REG_ARGS 5 /* The argument is a structure or a union. */ #define BTF_FMODEL_STRUCT_ARG BIT(0) /* The argument is signed. */ #define BTF_FMODEL_SIGNED_ARG BIT(1) struct btf_func_model { u8 ret_size; u8 ret_flags; u8 nr_args; u8 arg_size[MAX_BPF_FUNC_ARGS]; u8 arg_flags[MAX_BPF_FUNC_ARGS]; }; /* Restore arguments before returning from trampoline to let original function * continue executing. This flag is used for fentry progs when there are no * fexit progs. */ #define BPF_TRAMP_F_RESTORE_REGS BIT(0) /* Call original function after fentry progs, but before fexit progs. * Makes sense for fentry/fexit, normal calls and indirect calls. */ #define BPF_TRAMP_F_CALL_ORIG BIT(1) /* Skip current frame and return to parent. Makes sense for fentry/fexit * programs only. Should not be used with normal calls and indirect calls. */ #define BPF_TRAMP_F_SKIP_FRAME BIT(2) /* Store IP address of the caller on the trampoline stack, * so it's available for trampoline's programs. */ #define BPF_TRAMP_F_IP_ARG BIT(3) /* Return the return value of fentry prog. Only used by bpf_struct_ops. */ #define BPF_TRAMP_F_RET_FENTRY_RET BIT(4) /* Get original function from stack instead of from provided direct address. * Makes sense for trampolines with fexit or fmod_ret programs. */ #define BPF_TRAMP_F_ORIG_STACK BIT(5) /* This trampoline is on a function with another ftrace_ops with IPMODIFY, * e.g., a live patch. This flag is set and cleared by ftrace call backs, */ #define BPF_TRAMP_F_SHARE_IPMODIFY BIT(6) /* Indicate that current trampoline is in a tail call context. Then, it has to * cache and restore tail_call_cnt to avoid infinite tail call loop. */ #define BPF_TRAMP_F_TAIL_CALL_CTX BIT(7) /* * Indicate the trampoline should be suitable to receive indirect calls; * without this indirectly calling the generated code can result in #UD/#CP, * depending on the CFI options. * * Used by bpf_struct_ops. * * Incompatible with FENTRY usage, overloads @func_addr argument. */ #define BPF_TRAMP_F_INDIRECT BIT(8) /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 * bytes on x86. */ enum { #if defined(__s390x__) BPF_MAX_TRAMP_LINKS = 27, #else BPF_MAX_TRAMP_LINKS = 38, #endif }; struct bpf_tramp_links { struct bpf_tramp_link *links[BPF_MAX_TRAMP_LINKS]; int nr_links; }; struct bpf_tramp_run_ctx; /* Different use cases for BPF trampoline: * 1. replace nop at the function entry (kprobe equivalent) * flags = BPF_TRAMP_F_RESTORE_REGS * fentry = a set of programs to run before returning from trampoline * * 2. replace nop at the function entry (kprobe + kretprobe equivalent) * flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME * orig_call = fentry_ip + MCOUNT_INSN_SIZE * fentry = a set of program to run before calling original function * fexit = a set of program to run after original function * * 3. replace direct call instruction anywhere in the function body * or assign a function pointer for indirect call (like tcp_congestion_ops->cong_avoid) * With flags = 0 * fentry = a set of programs to run before returning from trampoline * With flags = BPF_TRAMP_F_CALL_ORIG * orig_call = original callback addr or direct function addr * fentry = a set of program to run before calling original function * fexit = a set of program to run after original function */ struct bpf_tramp_image; int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, void *func_addr); void *arch_alloc_bpf_trampoline(unsigned int size); void arch_free_bpf_trampoline(void *image, unsigned int size); int __must_check arch_protect_bpf_trampoline(void *image, unsigned int size); int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, void *func_addr); u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr); void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr); typedef u64 (*bpf_trampoline_enter_t)(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); typedef void (*bpf_trampoline_exit_t)(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx); bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog); bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP static inline bool bpf_trampoline_use_jmp(u64 flags) { return flags & BPF_TRAMP_F_CALL_ORIG && !(flags & BPF_TRAMP_F_SKIP_FRAME); } #else static inline bool bpf_trampoline_use_jmp(u64 flags) { return false; } #endif struct bpf_ksym { unsigned long start; unsigned long end; char name[KSYM_NAME_LEN]; struct list_head lnode; struct latch_tree_node tnode; bool prog; u32 fp_start; u32 fp_end; }; enum bpf_tramp_prog_type { BPF_TRAMP_FENTRY, BPF_TRAMP_FEXIT, BPF_TRAMP_MODIFY_RETURN, BPF_TRAMP_MAX, BPF_TRAMP_REPLACE, /* more than MAX */ }; struct bpf_tramp_image { void *image; int size; struct bpf_ksym ksym; struct percpu_ref pcref; void *ip_after_call; void *ip_epilogue; union { struct rcu_head rcu; struct work_struct work; }; }; struct bpf_trampoline { /* hlist for trampoline_table */ struct hlist_node hlist; struct ftrace_ops *fops; /* serializes access to fields of this trampoline */ struct mutex mutex; refcount_t refcnt; u32 flags; u64 key; struct { struct btf_func_model model; void *addr; bool ftrace_managed; } func; /* if !NULL this is BPF_PROG_TYPE_EXT program that extends another BPF * program by replacing one of its functions. func.addr is the address * of the function it replaced. */ struct bpf_prog *extension_prog; /* list of BPF programs using this trampoline */ struct hlist_head progs_hlist[BPF_TRAMP_MAX]; /* Number of attached programs. A counter per kind. */ int progs_cnt[BPF_TRAMP_MAX]; /* Executable image of trampoline */ struct bpf_tramp_image *cur_image; }; struct bpf_attach_target_info { struct btf_func_model fmodel; long tgt_addr; struct module *tgt_mod; const char *tgt_name; const struct btf_type *tgt_type; }; #define BPF_DISPATCHER_MAX 48 /* Fits in 2048B */ struct bpf_dispatcher_prog { struct bpf_prog *prog; refcount_t users; }; struct bpf_dispatcher { /* dispatcher mutex */ struct mutex mutex; void *func; struct bpf_dispatcher_prog progs[BPF_DISPATCHER_MAX]; int num_progs; void *image; void *rw_image; u32 image_off; struct bpf_ksym ksym; #ifdef CONFIG_HAVE_STATIC_CALL struct static_call_key *sc_key; void *sc_tramp; #endif }; #ifndef __bpfcall #define __bpfcall __nocfi #endif static __always_inline __bpfcall unsigned int bpf_dispatcher_nop_func( const void *ctx, const struct bpf_insn *insnsi, bpf_func_t bpf_func) { return bpf_func(ctx, insnsi); } /* the implementation of the opaque uapi struct bpf_dynptr */ struct bpf_dynptr_kern { void *data; /* Size represents the number of usable bytes of dynptr data. * If for example the offset is at 4 for a local dynptr whose data is * of type u64, the number of usable bytes is 4. * * The upper 8 bits are reserved. It is as follows: * Bits 0 - 23 = size * Bits 24 - 30 = dynptr type * Bit 31 = whether dynptr is read-only */ u32 size; u32 offset; } __aligned(8); enum bpf_dynptr_type { BPF_DYNPTR_TYPE_INVALID, /* Points to memory that is local to the bpf program */ BPF_DYNPTR_TYPE_LOCAL, /* Underlying data is a ringbuf record */ BPF_DYNPTR_TYPE_RINGBUF, /* Underlying data is a sk_buff */ BPF_DYNPTR_TYPE_SKB, /* Underlying data is a xdp_buff */ BPF_DYNPTR_TYPE_XDP, /* Points to skb_metadata_end()-skb_metadata_len() */ BPF_DYNPTR_TYPE_SKB_META, /* Underlying data is a file */ BPF_DYNPTR_TYPE_FILE, }; int bpf_dynptr_check_size(u64 size); u64 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr); const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u64 len); void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u64 len); bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr); int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u64 offset, void *src, u64 len, u64 flags); void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset, void *buffer__opt, u64 buffer__szk); static inline int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u64 offset, u64 len) { u64 size = __bpf_dynptr_size(ptr); if (len > size || offset > size - len) return -E2BIG; return 0; } #ifdef CONFIG_BPF_JIT int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog); int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog); struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info); void bpf_trampoline_put(struct bpf_trampoline *tr); int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs); /* * When the architecture supports STATIC_CALL replace the bpf_dispatcher_fn * indirection with a direct call to the bpf program. If the architecture does * not have STATIC_CALL, avoid a double-indirection. */ #ifdef CONFIG_HAVE_STATIC_CALL #define __BPF_DISPATCHER_SC_INIT(_name) \ .sc_key = &STATIC_CALL_KEY(_name), \ .sc_tramp = STATIC_CALL_TRAMP_ADDR(_name), #define __BPF_DISPATCHER_SC(name) \ DEFINE_STATIC_CALL(bpf_dispatcher_##name##_call, bpf_dispatcher_nop_func) #define __BPF_DISPATCHER_CALL(name) \ static_call(bpf_dispatcher_##name##_call)(ctx, insnsi, bpf_func) #define __BPF_DISPATCHER_UPDATE(_d, _new) \ __static_call_update((_d)->sc_key, (_d)->sc_tramp, (_new)) #else #define __BPF_DISPATCHER_SC_INIT(name) #define __BPF_DISPATCHER_SC(name) #define __BPF_DISPATCHER_CALL(name) bpf_func(ctx, insnsi) #define __BPF_DISPATCHER_UPDATE(_d, _new) #endif #define BPF_DISPATCHER_INIT(_name) { \ .mutex = __MUTEX_INITIALIZER(_name.mutex), \ .func = &_name##_func, \ .progs = {}, \ .num_progs = 0, \ .image = NULL, \ .image_off = 0, \ .ksym = { \ .name = #_name, \ .lnode = LIST_HEAD_INIT(_name.ksym.lnode), \ }, \ __BPF_DISPATCHER_SC_INIT(_name##_call) \ } #define DEFINE_BPF_DISPATCHER(name) \ __BPF_DISPATCHER_SC(name); \ noinline __bpfcall unsigned int bpf_dispatcher_##name##_func( \ const void *ctx, \ const struct bpf_insn *insnsi, \ bpf_func_t bpf_func) \ { \ return __BPF_DISPATCHER_CALL(name); \ } \ EXPORT_SYMBOL(bpf_dispatcher_##name##_func); \ struct bpf_dispatcher bpf_dispatcher_##name = \ BPF_DISPATCHER_INIT(bpf_dispatcher_##name); #define DECLARE_BPF_DISPATCHER(name) \ unsigned int bpf_dispatcher_##name##_func( \ const void *ctx, \ const struct bpf_insn *insnsi, \ bpf_func_t bpf_func); \ extern struct bpf_dispatcher bpf_dispatcher_##name; #define BPF_DISPATCHER_FUNC(name) bpf_dispatcher_##name##_func #define BPF_DISPATCHER_PTR(name) (&bpf_dispatcher_##name) void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, struct bpf_prog *to); /* Called only from JIT-enabled code, so there's no need for stubs. */ void bpf_image_ksym_init(void *data, unsigned int size, struct bpf_ksym *ksym); void bpf_image_ksym_add(struct bpf_ksym *ksym); void bpf_image_ksym_del(struct bpf_ksym *ksym); void bpf_ksym_add(struct bpf_ksym *ksym); void bpf_ksym_del(struct bpf_ksym *ksym); bool bpf_has_frame_pointer(unsigned long ip); int bpf_jit_charge_modmem(u32 size); void bpf_jit_uncharge_modmem(u32 size); bool bpf_prog_has_trampoline(const struct bpf_prog *prog); #else static inline int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog) { return -ENOTSUPP; } static inline int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog) { return -ENOTSUPP; } static inline struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info) { return NULL; } static inline void bpf_trampoline_put(struct bpf_trampoline *tr) {} #define DEFINE_BPF_DISPATCHER(name) #define DECLARE_BPF_DISPATCHER(name) #define BPF_DISPATCHER_FUNC(name) bpf_dispatcher_nop_func #define BPF_DISPATCHER_PTR(name) NULL static inline void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, struct bpf_prog *to) {} static inline bool is_bpf_image_address(unsigned long address) { return false; } static inline bool bpf_prog_has_trampoline(const struct bpf_prog *prog) { return false; } #endif struct bpf_func_info_aux { u16 linkage; bool unreliable; bool called : 1; bool verified : 1; }; enum bpf_jit_poke_reason { BPF_POKE_REASON_TAIL_CALL, }; /* Descriptor of pokes pointing /into/ the JITed image. */ struct bpf_jit_poke_descriptor { void *tailcall_target; void *tailcall_bypass; void *bypass_addr; void *aux; union { struct { struct bpf_map *map; u32 key; } tail_call; }; bool tailcall_target_stable; u8 adj_off; u16 reason; u32 insn_idx; }; /* reg_type info for ctx arguments */ struct bpf_ctx_arg_aux { u32 offset; enum bpf_reg_type reg_type; struct btf *btf; u32 btf_id; u32 ref_obj_id; bool refcounted; }; struct btf_mod_pair { struct btf *btf; struct module *module; }; struct bpf_kfunc_desc_tab; enum bpf_stream_id { BPF_STDOUT = 1, BPF_STDERR = 2, }; struct bpf_stream_elem { struct llist_node node; int total_len; int consumed_len; char str[]; }; enum { /* 100k bytes */ BPF_STREAM_MAX_CAPACITY = 100000ULL, }; struct bpf_stream { atomic_t capacity; struct llist_head log; /* list of in-flight stream elements in LIFO order */ struct mutex lock; /* lock protecting backlog_{head,tail} */ struct llist_node *backlog_head; /* list of in-flight stream elements in FIFO order */ struct llist_node *backlog_tail; /* tail of the list above */ }; struct bpf_stream_stage { struct llist_head log; int len; }; struct bpf_prog_aux { atomic64_t refcnt; u32 used_map_cnt; u32 used_btf_cnt; u32 max_ctx_offset; u32 max_pkt_offset; u32 max_tp_access; u32 stack_depth; u32 id; u32 func_cnt; /* used by non-func prog as the number of func progs */ u32 real_func_cnt; /* includes hidden progs, only used for JIT and freeing progs */ u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */ u32 attach_btf_id; /* in-kernel BTF type id to attach to */ u32 attach_st_ops_member_off; u32 ctx_arg_info_size; u32 max_rdonly_access; u32 max_rdwr_access; u32 subprog_start; struct btf *attach_btf; struct bpf_ctx_arg_aux *ctx_arg_info; void __percpu *priv_stack_ptr; struct mutex dst_mutex; /* protects dst_* pointers below, *after* prog becomes visible */ struct bpf_prog *dst_prog; struct bpf_trampoline *dst_trampoline; enum bpf_prog_type saved_dst_prog_type; enum bpf_attach_type saved_dst_attach_type; bool verifier_zext; /* Zero extensions has been inserted by verifier. */ bool dev_bound; /* Program is bound to the netdev. */ bool offload_requested; /* Program is bound and offloaded to the netdev. */ bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ bool attach_tracing_prog; /* true if tracing another tracing program */ bool func_proto_unreliable; bool tail_call_reachable; bool xdp_has_frags; bool exception_cb; bool exception_boundary; bool is_extended; /* true if extended by freplace program */ bool jits_use_priv_stack; bool priv_stack_requested; bool changes_pkt_data; bool might_sleep; bool kprobe_write_ctx; u64 prog_array_member_cnt; /* counts how many times as member of prog_array */ struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */ struct bpf_arena *arena; void (*recursion_detected)(struct bpf_prog *prog); /* callback if recursion is detected */ /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ const struct btf_type *attach_func_proto; /* function name for valid attach_btf_id */ const char *attach_func_name; struct bpf_prog **func; struct bpf_prog_aux *main_prog_aux; void *jit_data; /* JIT specific data. arch dependent */ struct bpf_jit_poke_descriptor *poke_tab; struct bpf_kfunc_desc_tab *kfunc_tab; struct bpf_kfunc_btf_tab *kfunc_btf_tab; u32 size_poke_tab; #ifdef CONFIG_FINEIBT struct bpf_ksym ksym_prefix; #endif struct bpf_ksym ksym; const struct bpf_prog_ops *ops; const struct bpf_struct_ops *st_ops; struct bpf_map **used_maps; struct mutex used_maps_mutex; /* mutex for used_maps and used_map_cnt */ struct btf_mod_pair *used_btfs; struct bpf_prog *prog; struct user_struct *user; u64 load_time; /* ns since boottime */ u32 verified_insns; int cgroup_atype; /* enum cgroup_bpf_attach_type */ struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; char name[BPF_OBJ_NAME_LEN]; u64 (*bpf_exception_cb)(u64 cookie, u64 sp, u64 bp, u64, u64); #ifdef CONFIG_SECURITY void *security; #endif struct bpf_token *token; struct bpf_prog_offload *offload; struct btf *btf; struct bpf_func_info *func_info; struct bpf_func_info_aux *func_info_aux; /* bpf_line_info loaded from userspace. linfo->insn_off * has the xlated insn offset. * Both the main and sub prog share the same linfo. * The subprog can access its first linfo by * using the linfo_idx. */ struct bpf_line_info *linfo; /* jited_linfo is the jited addr of the linfo. It has a * one to one mapping to linfo: * jited_linfo[i] is the jited addr for the linfo[i]->insn_off. * Both the main and sub prog share the same jited_linfo. * The subprog can access its first jited_linfo by * using the linfo_idx. */ void **jited_linfo; u32 func_info_cnt; u32 nr_linfo; /* subprog can use linfo_idx to access its first linfo and * jited_linfo. * main prog always has linfo_idx == 0 */ u32 linfo_idx; struct module *mod; u32 num_exentries; struct exception_table_entry *extable; union { struct work_struct work; struct rcu_head rcu; }; struct bpf_stream stream[2]; }; struct bpf_prog { u16 pages; /* Number of allocated pages */ u16 jited:1, /* Is our filter JIT'ed? */ jit_requested:1,/* archs need to JIT the prog */ gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ dst_needed:1, /* Do we need dst entry? */ blinding_requested:1, /* needs constant blinding */ blinded:1, /* Was blinded */ is_func:1, /* program is a bpf function */ kprobe_override:1, /* Do we override a kprobe? */ has_callchain_buf:1, /* callchain buffer allocated? */ enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */ call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */ call_get_func_ip:1, /* Do we call get_func_ip() */ tstamp_type_access:1, /* Accessed __sk_buff->tstamp_type */ sleepable:1; /* BPF program is sleepable */ enum bpf_prog_type type; /* Type of BPF program */ enum bpf_attach_type expected_attach_type; /* For some prog types */ u32 len; /* Number of filter blocks */ u32 jited_len; /* Size of jited insns in bytes */ union { u8 digest[SHA256_DIGEST_SIZE]; u8 tag[BPF_TAG_SIZE]; }; struct bpf_prog_stats __percpu *stats; int __percpu *active; unsigned int (*bpf_func)(const void *ctx, const struct bpf_insn *insn); struct bpf_prog_aux *aux; /* Auxiliary fields */ struct sock_fprog_kern *orig_prog; /* Original BPF program */ /* Instructions for interpreter */ union { DECLARE_FLEX_ARRAY(struct sock_filter, insns); DECLARE_FLEX_ARRAY(struct bpf_insn, insnsi); }; }; struct bpf_array_aux { /* Programs with direct jumps into programs part of this array. */ struct list_head poke_progs; struct bpf_map *map; struct mutex poke_mutex; struct work_struct work; }; struct bpf_link { atomic64_t refcnt; u32 id; enum bpf_link_type type; const struct bpf_link_ops *ops; struct bpf_prog *prog; u32 flags; enum bpf_attach_type attach_type; /* rcu is used before freeing, work can be used to schedule that * RCU-based freeing before that, so they never overlap */ union { struct rcu_head rcu; struct work_struct work; }; /* whether BPF link itself has "sleepable" semantics, which can differ * from underlying BPF program having a "sleepable" semantics, as BPF * link's semantics is determined by target attach hook */ bool sleepable; }; struct bpf_link_ops { void (*release)(struct bpf_link *link); /* deallocate link resources callback, called without RCU grace period * waiting */ void (*dealloc)(struct bpf_link *link); /* deallocate link resources callback, called after RCU grace period; * if either the underlying BPF program is sleepable or BPF link's * target hook is sleepable, we'll go through tasks trace RCU GP and * then "classic" RCU GP; this need for chaining tasks trace and * classic RCU GPs is designated by setting bpf_link->sleepable flag */ void (*dealloc_deferred)(struct bpf_link *link); int (*detach)(struct bpf_link *link); int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog); void (*show_fdinfo)(const struct bpf_link *link, struct seq_file *seq); int (*fill_link_info)(const struct bpf_link *link, struct bpf_link_info *info); int (*update_map)(struct bpf_link *link, struct bpf_map *new_map, struct bpf_map *old_map); __poll_t (*poll)(struct file *file, struct poll_table_struct *pts); }; struct bpf_tramp_link { struct bpf_link link; struct hlist_node tramp_hlist; u64 cookie; }; struct bpf_shim_tramp_link { struct bpf_tramp_link link; struct bpf_trampoline *trampoline; }; struct bpf_tracing_link { struct bpf_tramp_link link; struct bpf_trampoline *trampoline; struct bpf_prog *tgt_prog; }; struct bpf_raw_tp_link { struct bpf_link link; struct bpf_raw_event_map *btp; u64 cookie; }; struct bpf_link_primer { struct bpf_link *link; struct file *file; int fd; u32 id; }; struct bpf_mount_opts { kuid_t uid; kgid_t gid; umode_t mode; /* BPF token-related delegation options */ u64 delegate_cmds; u64 delegate_maps; u64 delegate_progs; u64 delegate_attachs; }; struct bpf_token { struct work_struct work; atomic64_t refcnt; struct user_namespace *userns; u64 allowed_cmds; u64 allowed_maps; u64 allowed_progs; u64 allowed_attachs; #ifdef CONFIG_SECURITY void *security; #endif }; struct bpf_struct_ops_value; struct btf_member; #define BPF_STRUCT_OPS_MAX_NR_MEMBERS 64 /** * struct bpf_struct_ops - A structure of callbacks allowing a subsystem to * define a BPF_MAP_TYPE_STRUCT_OPS map type composed * of BPF_PROG_TYPE_STRUCT_OPS progs. * @verifier_ops: A structure of callbacks that are invoked by the verifier * when determining whether the struct_ops progs in the * struct_ops map are valid. * @init: A callback that is invoked a single time, and before any other * callback, to initialize the structure. A nonzero return value means * the subsystem could not be initialized. * @check_member: When defined, a callback invoked by the verifier to allow * the subsystem to determine if an entry in the struct_ops map * is valid. A nonzero return value means that the map is * invalid and should be rejected by the verifier. * @init_member: A callback that is invoked for each member of the struct_ops * map to allow the subsystem to initialize the member. A nonzero * value means the member could not be initialized. This callback * is exclusive with the @type, @type_id, @value_type, and * @value_id fields. * @reg: A callback that is invoked when the struct_ops map has been * initialized and is being attached to. Zero means the struct_ops map * has been successfully registered and is live. A nonzero return value * means the struct_ops map could not be registered. * @unreg: A callback that is invoked when the struct_ops map should be * unregistered. * @update: A callback that is invoked when the live struct_ops map is being * updated to contain new values. This callback is only invoked when * the struct_ops map is loaded with BPF_F_LINK. If not defined, the * it is assumed that the struct_ops map cannot be updated. * @validate: A callback that is invoked after all of the members have been * initialized. This callback should perform static checks on the * map, meaning that it should either fail or succeed * deterministically. A struct_ops map that has been validated may * not necessarily succeed in being registered if the call to @reg * fails. For example, a valid struct_ops map may be loaded, but * then fail to be registered due to there being another active * struct_ops map on the system in the subsystem already. For this * reason, if this callback is not defined, the check is skipped as * the struct_ops map will have final verification performed in * @reg. * @cfi_stubs: Pointer to a structure of stub functions for CFI. These stubs * provide the correct Control Flow Integrity hashes for the * trampolines generated by BPF struct_ops. * @owner: The module that owns this struct_ops. Used for module reference * counting to ensure the module providing the struct_ops cannot be * unloaded while in use. * @name: The name of the struct bpf_struct_ops object. * @func_models: Func models */ struct bpf_struct_ops { const struct bpf_verifier_ops *verifier_ops; int (*init)(struct btf *btf); int (*check_member)(const struct btf_type *t, const struct btf_member *member, const struct bpf_prog *prog); int (*init_member)(const struct btf_type *t, const struct btf_member *member, void *kdata, const void *udata); int (*reg)(void *kdata, struct bpf_link *link); void (*unreg)(void *kdata, struct bpf_link *link); int (*update)(void *kdata, void *old_kdata, struct bpf_link *link); int (*validate)(void *kdata); void *cfi_stubs; struct module *owner; const char *name; struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS]; }; /* Every member of a struct_ops type has an instance even a member is not * an operator (function pointer). The "info" field will be assigned to * prog->aux->ctx_arg_info of BPF struct_ops programs to provide the * argument information required by the verifier to verify the program. * * btf_ctx_access() will lookup prog->aux->ctx_arg_info to find the * corresponding entry for an given argument. */ struct bpf_struct_ops_arg_info { struct bpf_ctx_arg_aux *info; u32 cnt; }; struct bpf_struct_ops_desc { struct bpf_struct_ops *st_ops; const struct btf_type *type; const struct btf_type *value_type; u32 type_id; u32 value_id; /* Collection of argument information for each member */ struct bpf_struct_ops_arg_info *arg_info; }; enum bpf_struct_ops_state { BPF_STRUCT_OPS_STATE_INIT, BPF_STRUCT_OPS_STATE_INUSE, BPF_STRUCT_OPS_STATE_TOBEFREE, BPF_STRUCT_OPS_STATE_READY, }; struct bpf_struct_ops_common_value { refcount_t refcnt; enum bpf_struct_ops_state state; }; #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) /* This macro helps developer to register a struct_ops type and generate * type information correctly. Developers should use this macro to register * a struct_ops type instead of calling __register_bpf_struct_ops() directly. */ #define register_bpf_struct_ops(st_ops, type) \ ({ \ struct bpf_struct_ops_##type { \ struct bpf_struct_ops_common_value common; \ struct type data ____cacheline_aligned_in_smp; \ }; \ BTF_TYPE_EMIT(struct bpf_struct_ops_##type); \ __register_bpf_struct_ops(st_ops); \ }) #define BPF_MODULE_OWNER ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA)) bool bpf_struct_ops_get(const void *kdata); void bpf_struct_ops_put(const void *kdata); int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff); int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, void *value); int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, struct bpf_tramp_link *link, const struct btf_func_model *model, void *stub_func, void **image, u32 *image_off, bool allow_alloc); void bpf_struct_ops_image_free(void *image); static inline bool bpf_try_module_get(const void *data, struct module *owner) { if (owner == BPF_MODULE_OWNER) return bpf_struct_ops_get(data); else return try_module_get(owner); } static inline void bpf_module_put(const void *data, struct module *owner) { if (owner == BPF_MODULE_OWNER) bpf_struct_ops_put(data); else module_put(owner); } int bpf_struct_ops_link_create(union bpf_attr *attr); u32 bpf_struct_ops_id(const void *kdata); #ifdef CONFIG_NET /* Define it here to avoid the use of forward declaration */ struct bpf_dummy_ops_state { int val; }; struct bpf_dummy_ops { int (*test_1)(struct bpf_dummy_ops_state *cb); int (*test_2)(struct bpf_dummy_ops_state *cb, int a1, unsigned short a2, char a3, unsigned long a4); int (*test_sleepable)(struct bpf_dummy_ops_state *cb); }; int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); #endif int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc, struct btf *btf, struct bpf_verifier_log *log); void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map); void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_ops_desc); #else #define register_bpf_struct_ops(st_ops, type) ({ (void *)(st_ops); 0; }) static inline bool bpf_try_module_get(const void *data, struct module *owner) { return try_module_get(owner); } static inline void bpf_module_put(const void *data, struct module *owner) { module_put(owner); } static inline int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff) { return -ENOTSUPP; } static inline int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, void *value) { return -EINVAL; } static inline int bpf_struct_ops_link_create(union bpf_attr *attr) { return -EOPNOTSUPP; } static inline void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map) { } static inline void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_ops_desc) { } #endif int bpf_prog_ctx_arg_info_init(struct bpf_prog *prog, const struct bpf_ctx_arg_aux *info, u32 cnt); #if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM) int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, int cgroup_atype, enum bpf_attach_type attach_type); void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog); #else static inline int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, int cgroup_atype, enum bpf_attach_type attach_type) { return -EOPNOTSUPP; } static inline void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog) { } #endif struct bpf_array { struct bpf_map map; u32 elem_size; u32 index_mask; struct bpf_array_aux *aux; union { DECLARE_FLEX_ARRAY(char, value) __aligned(8); DECLARE_FLEX_ARRAY(void *, ptrs) __aligned(8); DECLARE_FLEX_ARRAY(void __percpu *, pptrs) __aligned(8); }; }; /* * The bpf_array_get_next_key() function may be used for all array-like * maps, i.e., maps with u32 keys with range [0 ,..., max_entries) */ int bpf_array_get_next_key(struct bpf_map *map, void *key, void *next_key); #define BPF_COMPLEXITY_LIMIT_INSNS 1000000 /* yes. 1M insns */ #define MAX_TAIL_CALL_CNT 33 /* Maximum number of loops for bpf_loop and bpf_iter_num. * It's enum to expose it (and thus make it discoverable) through BTF. */ enum { BPF_MAX_LOOPS = 8 * 1024 * 1024, BPF_MAX_TIMED_LOOPS = 0xffff, }; #define BPF_F_ACCESS_MASK (BPF_F_RDONLY | \ BPF_F_RDONLY_PROG | \ BPF_F_WRONLY | \ BPF_F_WRONLY_PROG) #define BPF_MAP_CAN_READ BIT(0) #define BPF_MAP_CAN_WRITE BIT(1) /* Maximum number of user-producer ring buffer samples that can be drained in * a call to bpf_user_ringbuf_drain(). */ #define BPF_MAX_USER_RINGBUF_SAMPLES (128 * 1024) static inline u32 bpf_map_flags_to_cap(struct bpf_map *map) { u32 access_flags = map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG); /* Combination of BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG is * not possible. */ if (access_flags & BPF_F_RDONLY_PROG) return BPF_MAP_CAN_READ; else if (access_flags & BPF_F_WRONLY_PROG) return BPF_MAP_CAN_WRITE; else return BPF_MAP_CAN_READ | BPF_MAP_CAN_WRITE; } static inline bool bpf_map_flags_access_ok(u32 access_flags) { return (access_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) != (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG); } static inline struct bpf_map_owner *bpf_map_owner_alloc(struct bpf_map *map) { return kzalloc(sizeof(*map->owner), GFP_ATOMIC); } static inline void bpf_map_owner_free(struct bpf_map *map) { kfree(map->owner); } struct bpf_event_entry { struct perf_event *event; struct file *perf_file; struct file *map_file; struct rcu_head rcu; }; static inline bool map_type_contains_progs(struct bpf_map *map) { return map->map_type == BPF_MAP_TYPE_PROG_ARRAY || map->map_type == BPF_MAP_TYPE_DEVMAP || map->map_type == BPF_MAP_TYPE_CPUMAP; } bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp); int bpf_prog_calc_tag(struct bpf_prog *fp); const struct bpf_func_proto *bpf_get_trace_printk_proto(void); const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void); const struct bpf_func_proto *bpf_get_perf_event_read_value_proto(void); typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const void *src, unsigned long off, unsigned long len); typedef u32 (*bpf_convert_ctx_access_t)(enum bpf_access_type type, const struct bpf_insn *src, struct bpf_insn *dst, struct bpf_prog *prog, u32 *target_size); u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy); /* an array of programs to be executed under rcu_lock. * * Typical usage: * ret = bpf_prog_run_array(rcu_dereference(&bpf_prog_array), ctx, bpf_prog_run); * * the structure returned by bpf_prog_array_alloc() should be populated * with program pointers and the last pointer must be NULL. * The user has to keep refcnt on the program and make sure the program * is removed from the array before bpf_prog_put(). * The 'struct bpf_prog_array *' should only be replaced with xchg() * since other cpus are walking the array of pointers in parallel. */ struct bpf_prog_array_item { struct bpf_prog *prog; union { struct bpf_cgroup_storage *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; u64 bpf_cookie; }; }; struct bpf_prog_array { struct rcu_head rcu; struct bpf_prog_array_item items[]; }; struct bpf_empty_prog_array { struct bpf_prog_array hdr; struct bpf_prog *null_prog; }; /* to avoid allocating empty bpf_prog_array for cgroups that * don't have bpf program attached use one global 'bpf_empty_prog_array' * It will not be modified the caller of bpf_prog_array_alloc() * (since caller requested prog_cnt == 0) * that pointer should be 'freed' by bpf_prog_array_free() */ extern struct bpf_empty_prog_array bpf_empty_prog_array; struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); void bpf_prog_array_free(struct bpf_prog_array *progs); /* Use when traversal over the bpf_prog_array uses tasks_trace rcu */ void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs); int bpf_prog_array_length(struct bpf_prog_array *progs); bool bpf_prog_array_is_empty(struct bpf_prog_array *array); int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs, __u32 __user *prog_ids, u32 cnt); void bpf_prog_array_delete_safe(struct bpf_prog_array *progs, struct bpf_prog *old_prog); int bpf_prog_array_delete_safe_at(struct bpf_prog_array *array, int index); int bpf_prog_array_update_at(struct bpf_prog_array *array, int index, struct bpf_prog *prog); int bpf_prog_array_copy_info(struct bpf_prog_array *array, u32 *prog_ids, u32 request_cnt, u32 *prog_cnt); int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, u64 bpf_cookie, struct bpf_prog_array **new_array); struct bpf_run_ctx {}; struct bpf_cg_run_ctx { struct bpf_run_ctx run_ctx; const struct bpf_prog_array_item *prog_item; int retval; }; struct bpf_trace_run_ctx { struct bpf_run_ctx run_ctx; u64 bpf_cookie; bool is_uprobe; }; struct bpf_tramp_run_ctx { struct bpf_run_ctx run_ctx; u64 bpf_cookie; struct bpf_run_ctx *saved_run_ctx; }; static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx) { struct bpf_run_ctx *old_ctx = NULL; #ifdef CONFIG_BPF_SYSCALL old_ctx = current->bpf_ctx; current->bpf_ctx = new_ctx; #endif return old_ctx; } static inline void bpf_reset_run_ctx(struct bpf_run_ctx *old_ctx) { #ifdef CONFIG_BPF_SYSCALL current->bpf_ctx = old_ctx; #endif } /* BPF program asks to bypass CAP_NET_BIND_SERVICE in bind. */ #define BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE (1 << 0) /* BPF program asks to set CN on the packet. */ #define BPF_RET_SET_CN (1 << 0) typedef u32 (*bpf_prog_run_fn)(const struct bpf_prog *prog, const void *ctx); static __always_inline u32 bpf_prog_run_array(const struct bpf_prog_array *array, const void *ctx, bpf_prog_run_fn run_prog) { const struct bpf_prog_array_item *item; const struct bpf_prog *prog; struct bpf_run_ctx *old_run_ctx; struct bpf_trace_run_ctx run_ctx; u32 ret = 1; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held"); if (unlikely(!array)) return ret; run_ctx.is_uprobe = false; migrate_disable(); old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); item = &array->items[0]; while ((prog = READ_ONCE(item->prog))) { run_ctx.bpf_cookie = item->bpf_cookie; ret &= run_prog(prog, ctx); item++; } bpf_reset_run_ctx(old_run_ctx); migrate_enable(); return ret; } /* Notes on RCU design for bpf_prog_arrays containing sleepable programs: * * We use the tasks_trace rcu flavor read section to protect the bpf_prog_array * overall. As a result, we must use the bpf_prog_array_free_sleepable * in order to use the tasks_trace rcu grace period. * * When a non-sleepable program is inside the array, we take the rcu read * section and disable preemption for that program alone, so it can access * rcu-protected dynamically sized maps. */ static __always_inline u32 bpf_prog_run_array_uprobe(const struct bpf_prog_array *array, const void *ctx, bpf_prog_run_fn run_prog) { const struct bpf_prog_array_item *item; const struct bpf_prog *prog; struct bpf_run_ctx *old_run_ctx; struct bpf_trace_run_ctx run_ctx; u32 ret = 1; might_fault(); RCU_LOCKDEP_WARN(!rcu_read_lock_trace_held(), "no rcu lock held"); if (unlikely(!array)) return ret; migrate_disable(); run_ctx.is_uprobe = true; old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); item = &array->items[0]; while ((prog = READ_ONCE(item->prog))) { if (!prog->sleepable) rcu_read_lock(); run_ctx.bpf_cookie = item->bpf_cookie; ret &= run_prog(prog, ctx); item++; if (!prog->sleepable) rcu_read_unlock(); } bpf_reset_run_ctx(old_run_ctx); migrate_enable(); return ret; } bool bpf_jit_bypass_spec_v1(void); bool bpf_jit_bypass_spec_v4(void); #define bpf_rcu_lock_held() \ (rcu_read_lock_held() || rcu_read_lock_trace_held() || rcu_read_lock_bh_held()) #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); extern struct mutex bpf_stats_enabled_mutex; /* * Block execution of BPF programs attached to instrumentation (perf, * kprobes, tracepoints) to prevent deadlocks on map operations as any of * these events can happen inside a region which holds a map bucket lock * and can deadlock on it. */ static inline void bpf_disable_instrumentation(void) { migrate_disable(); this_cpu_inc(bpf_prog_active); } static inline void bpf_enable_instrumentation(void) { this_cpu_dec(bpf_prog_active); migrate_enable(); } extern const struct super_operations bpf_super_ops; extern const struct file_operations bpf_map_fops; extern const struct file_operations bpf_prog_fops; extern const struct file_operations bpf_iter_fops; extern const struct file_operations bpf_token_fops; #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ extern const struct bpf_prog_ops _name ## _prog_ops; \ extern const struct bpf_verifier_ops _name ## _verifier_ops; #define BPF_MAP_TYPE(_id, _ops) \ extern const struct bpf_map_ops _ops; #define BPF_LINK_TYPE(_id, _name) #include <linux/bpf_types.h> #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE #undef BPF_LINK_TYPE extern const struct bpf_prog_ops bpf_offload_prog_ops; extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops; extern const struct bpf_verifier_ops xdp_analyzer_ops; struct bpf_prog *bpf_prog_get(u32 ufd); struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, bool attach_drv); void bpf_prog_add(struct bpf_prog *prog, int i); void bpf_prog_sub(struct bpf_prog *prog, int i); void bpf_prog_inc(struct bpf_prog *prog); struct bpf_prog * __must_check bpf_prog_inc_not_zero(struct bpf_prog *prog); void bpf_prog_put(struct bpf_prog *prog); void bpf_prog_free_id(struct bpf_prog *prog); void bpf_map_free_id(struct bpf_map *map); struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset, u32 field_mask); void btf_record_free(struct btf_record *rec); void bpf_map_free_record(struct bpf_map *map); struct btf_record *btf_record_dup(const struct btf_record *rec); bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b); void bpf_obj_free_timer(const struct btf_record *rec, void *obj); void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj); void bpf_obj_free_task_work(const struct btf_record *rec, void *obj); void bpf_obj_free_fields(const struct btf_record *rec, void *obj); void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu); struct bpf_map *bpf_map_get(u32 ufd); struct bpf_map *bpf_map_get_with_uref(u32 ufd); /* * The __bpf_map_get() and __btf_get_by_fd() functions parse a file * descriptor and return a corresponding map or btf object. * Their names are double underscored to emphasize the fact that they * do not increase refcnt. To also increase refcnt use corresponding * bpf_map_get() and btf_get_by_fd() functions. */ static inline struct bpf_map *__bpf_map_get(struct fd f) { if (fd_empty(f)) return ERR_PTR(-EBADF); if (unlikely(fd_file(f)->f_op != &bpf_map_fops)) return ERR_PTR(-EINVAL); return fd_file(f)->private_data; } static inline struct btf *__btf_get_by_fd(struct fd f) { if (fd_empty(f)) return ERR_PTR(-EBADF); if (unlikely(fd_file(f)->f_op != &btf_fops)) return ERR_PTR(-EINVAL); return fd_file(f)->private_data; } void bpf_map_inc(struct bpf_map *map); void bpf_map_inc_with_uref(struct bpf_map *map); struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref); struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map); void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); void *bpf_map_area_alloc(u64 size, int numa_node); void *bpf_map_area_mmapable_alloc(u64 size, int numa_node); void bpf_map_area_free(void *base); bool bpf_map_write_active(const struct bpf_map *map); void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); int generic_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); int generic_map_update_batch(struct bpf_map *map, struct file *map_file, const union bpf_attr *attr, union bpf_attr __user *uattr); int generic_map_delete_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); struct bpf_map *bpf_map_get_curr_or_next(u32 *id); struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id); int bpf_map_alloc_pages(const struct bpf_map *map, int nid, unsigned long nr_pages, struct page **page_array); #ifdef CONFIG_MEMCG void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, int node); void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags, int node); void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags); void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size, gfp_t flags); void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align, gfp_t flags); #else /* * These specialized allocators have to be macros for their allocations to be * accounted separately (to have separate alloc_tag). */ #define bpf_map_kmalloc_node(_map, _size, _flags, _node) \ kmalloc_node(_size, _flags, _node) #define bpf_map_kmalloc_nolock(_map, _size, _flags, _node) \ kmalloc_nolock(_size, _flags, _node) #define bpf_map_kzalloc(_map, _size, _flags) \ kzalloc(_size, _flags) #define bpf_map_kvcalloc(_map, _n, _size, _flags) \ kvcalloc(_n, _size, _flags) #define bpf_map_alloc_percpu(_map, _size, _align, _flags) \ __alloc_percpu_gfp(_size, _align, _flags) #endif static inline int bpf_map_init_elem_count(struct bpf_map *map) { size_t size = sizeof(*map->elem_count), align = size; gfp_t flags = GFP_USER | __GFP_NOWARN; map->elem_count = bpf_map_alloc_percpu(map, size, align, flags); if (!map->elem_count) return -ENOMEM; return 0; } static inline void bpf_map_free_elem_count(struct bpf_map *map) { free_percpu(map->elem_count); } static inline void bpf_map_inc_elem_count(struct bpf_map *map) { this_cpu_inc(*map->elem_count); } static inline void bpf_map_dec_elem_count(struct bpf_map *map) { this_cpu_dec(*map->elem_count); } extern int sysctl_unprivileged_bpf_disabled; bool bpf_token_capable(const struct bpf_token *token, int cap); static inline bool bpf_allow_ptr_leaks(const struct bpf_token *token) { return bpf_token_capable(token, CAP_PERFMON); } static inline bool bpf_allow_uninit_stack(const struct bpf_token *token) { return bpf_token_capable(token, CAP_PERFMON); } static inline bool bpf_bypass_spec_v1(const struct bpf_token *token) { return bpf_jit_bypass_spec_v1() || cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON); } static inline bool bpf_bypass_spec_v4(const struct bpf_token *token) { return bpf_jit_bypass_spec_v4() || cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON); } int bpf_map_new_fd(struct bpf_map *map, int flags); int bpf_prog_new_fd(struct bpf_prog *prog); void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog, enum bpf_attach_type attach_type); void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog, enum bpf_attach_type attach_type, bool sleepable); int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer); int bpf_link_settle(struct bpf_link_primer *primer); void bpf_link_cleanup(struct bpf_link_primer *primer); void bpf_link_inc(struct bpf_link *link); struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link); void bpf_link_put(struct bpf_link *link); int bpf_link_new_fd(struct bpf_link *link); struct bpf_link *bpf_link_get_from_fd(u32 ufd); struct bpf_link *bpf_link_get_curr_or_next(u32 *id); void bpf_token_inc(struct bpf_token *token); void bpf_token_put(struct bpf_token *token); int bpf_token_create(union bpf_attr *attr); struct bpf_token *bpf_token_get_from_fd(u32 ufd); int bpf_token_get_info_by_fd(struct bpf_token *token, const union bpf_attr *attr, union bpf_attr __user *uattr); bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd); bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type); bool bpf_token_allow_prog_type(const struct bpf_token *token, enum bpf_prog_type prog_type, enum bpf_attach_type attach_type); int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname); int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags); struct inode *bpf_get_inode(struct super_block *sb, const struct inode *dir, umode_t mode); #define BPF_ITER_FUNC_PREFIX "bpf_iter_" #define DEFINE_BPF_ITER_FUNC(target, args...) \ extern int bpf_iter_ ## target(args); \ int __init bpf_iter_ ## target(args) { return 0; } /* * The task type of iterators. * * For BPF task iterators, they can be parameterized with various * parameters to visit only some of tasks. * * BPF_TASK_ITER_ALL (default) * Iterate over resources of every task. * * BPF_TASK_ITER_TID * Iterate over resources of a task/tid. * * BPF_TASK_ITER_TGID * Iterate over resources of every task of a process / task group. */ enum bpf_iter_task_type { BPF_TASK_ITER_ALL = 0, BPF_TASK_ITER_TID, BPF_TASK_ITER_TGID, }; struct bpf_iter_aux_info { /* for map_elem iter */ struct bpf_map *map; /* for cgroup iter */ struct { struct cgroup *start; /* starting cgroup */ enum bpf_cgroup_iter_order order; } cgroup; struct { enum bpf_iter_task_type type; u32 pid; } task; }; typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, union bpf_iter_link_info *linfo, struct bpf_iter_aux_info *aux); typedef void (*bpf_iter_detach_target_t)(struct bpf_iter_aux_info *aux); typedef void (*bpf_iter_show_fdinfo_t) (const struct bpf_iter_aux_info *aux, struct seq_file *seq); typedef int (*bpf_iter_fill_link_info_t)(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info); typedef const struct bpf_func_proto * (*bpf_iter_get_func_proto_t)(enum bpf_func_id func_id, const struct bpf_prog *prog); enum bpf_iter_feature { BPF_ITER_RESCHED = BIT(0), }; #define BPF_ITER_CTX_ARG_MAX 2 struct bpf_iter_reg { const char *target; bpf_iter_attach_target_t attach_target; bpf_iter_detach_target_t detach_target; bpf_iter_show_fdinfo_t show_fdinfo; bpf_iter_fill_link_info_t fill_link_info; bpf_iter_get_func_proto_t get_func_proto; u32 ctx_arg_info_size; u32 feature; struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX]; const struct bpf_iter_seq_info *seq_info; }; struct bpf_iter_meta { __bpf_md_ptr(struct seq_file *, seq); u64 session_id; u64 seq_num; }; struct bpf_iter__bpf_map_elem { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct bpf_map *, map); __bpf_md_ptr(void *, key); __bpf_md_ptr(void *, value); }; int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info); void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info); int bpf_iter_prog_supported(struct bpf_prog *prog); const struct bpf_func_proto * bpf_iter_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_prog *prog); int bpf_iter_new_fd(struct bpf_link *link); bool bpf_link_is_iter(struct bpf_link *link); struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop); int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx); void bpf_iter_map_show_fdinfo(const struct bpf_iter_aux_info *aux, struct seq_file *seq); int bpf_iter_map_fill_link_info(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info); int map_set_for_each_callback_args(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee); int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_stackmap_extract(struct bpf_map *map, void *key, void *value, bool delete); int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags); int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value); int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags); int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value); int bpf_get_file_flag(int flags); int bpf_check_uarg_tail_zero(bpfptr_t uaddr, size_t expected_size, size_t actual_size); /* verify correctness of eBPF program */ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size); #ifndef CONFIG_BPF_JIT_ALWAYS_ON void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); #endif struct btf *bpf_get_btf_vmlinux(void); /* Map specifics */ struct xdp_frame; struct sk_buff; struct bpf_dtab_netdev; struct bpf_cpu_map_entry; void __dev_flush(struct list_head *flush_list); int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx); int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, struct net_device *dev_rx); int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_map *map, bool exclude_ingress); int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, const struct bpf_prog *xdp_prog); int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, const struct bpf_prog *xdp_prog, struct bpf_map *map, bool exclude_ingress); void __cpu_map_flush(struct list_head *flush_list); int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf, struct net_device *dev_rx); int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu, struct sk_buff *skb); /* Return map's numa specified by userspace */ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) { return (attr->map_flags & BPF_F_NUMA_NODE) ? attr->numa_node : NUMA_NO_NODE; } struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type); int array_map_alloc_check(union bpf_attr *attr); int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); int bpf_prog_test_run_tracing(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); int bpf_prog_test_run_nf(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info); static inline bool bpf_tracing_ctx_access(int off, int size, enum bpf_access_type type) { if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) return false; if (type != BPF_READ) return false; if (off % size != 0) return false; return true; } static inline bool bpf_tracing_btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (!bpf_tracing_ctx_access(off, size, type)) return false; return btf_ctx_access(off, size, type, prog, info); } int btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size, enum bpf_access_type atype, u32 *next_btf_id, enum bpf_type_flag *flag, const char **field_name); bool btf_struct_ids_match(struct bpf_verifier_log *log, const struct btf *btf, u32 id, int off, const struct btf *need_btf, u32 need_type_id, bool strict); int btf_distill_func_proto(struct bpf_verifier_log *log, struct btf *btf, const struct btf_type *func_proto, const char *func_name, struct btf_func_model *m); struct bpf_reg_state; int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog); int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, struct btf *btf, const struct btf_type *t); const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt, int comp_idx, const char *tag_key); int btf_find_next_decl_tag(const struct btf *btf, const struct btf_type *pt, int comp_idx, const char *tag_key, int last_id); struct bpf_prog *bpf_prog_by_id(u32 id); struct bpf_link *bpf_link_by_id(u32 id); const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); void bpf_task_storage_free(struct task_struct *task); void bpf_cgrp_storage_free(struct cgroup *cgroup); bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog); const struct btf_func_model * bpf_jit_find_kfunc_model(const struct bpf_prog *prog, const struct bpf_insn *insn); int bpf_get_kfunc_addr(const struct bpf_prog *prog, u32 func_id, u16 btf_fd_idx, u8 **func_addr); struct bpf_core_ctx { struct bpf_verifier_log *log; const struct btf *btf; }; bool btf_nested_type_is_trusted(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, const char *field_name, u32 btf_id, const char *suffix); bool btf_type_ids_nocast_alias(struct bpf_verifier_log *log, const struct btf *reg_btf, u32 reg_id, const struct btf *arg_btf, u32 arg_id); int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, int relo_idx, void *insn); static inline bool unprivileged_ebpf_enabled(void) { return !sysctl_unprivileged_bpf_disabled; } /* Not all bpf prog type has the bpf_ctx. * For the bpf prog type that has initialized the bpf_ctx, * this function can be used to decide if a kernel function * is called by a bpf program. */ static inline bool has_current_bpf_ctx(void) { return !!current->bpf_ctx; } void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog); void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type, u32 offset, u32 size); void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr); void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { return ERR_PTR(-EOPNOTSUPP); } static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, bool attach_drv) { return ERR_PTR(-EOPNOTSUPP); } static inline void bpf_prog_add(struct bpf_prog *prog, int i) { } static inline void bpf_prog_sub(struct bpf_prog *prog, int i) { } static inline void bpf_prog_put(struct bpf_prog *prog) { } static inline void bpf_prog_inc(struct bpf_prog *prog) { } static inline struct bpf_prog *__must_check bpf_prog_inc_not_zero(struct bpf_prog *prog) { return ERR_PTR(-EOPNOTSUPP); } static inline void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog, enum bpf_attach_type attach_type) { } static inline void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog, enum bpf_attach_type attach_type, bool sleepable) { } static inline int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer) { return -EOPNOTSUPP; } static inline int bpf_link_settle(struct bpf_link_primer *primer) { return -EOPNOTSUPP; } static inline void bpf_link_cleanup(struct bpf_link_primer *primer) { } static inline void bpf_link_inc(struct bpf_link *link) { } static inline struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link) { return NULL; } static inline void bpf_link_put(struct bpf_link *link) { } static inline int bpf_obj_get_user(const char __user *pathname, int flags) { return -EOPNOTSUPP; } static inline bool bpf_token_capable(const struct bpf_token *token, int cap) { return capable(cap) || (cap != CAP_SYS_ADMIN && capable(CAP_SYS_ADMIN)); } static inline void bpf_token_inc(struct bpf_token *token) { } static inline void bpf_token_put(struct bpf_token *token) { } static inline struct bpf_token *bpf_token_get_from_fd(u32 ufd) { return ERR_PTR(-EOPNOTSUPP); } static inline int bpf_token_get_info_by_fd(struct bpf_token *token, const union bpf_attr *attr, union bpf_attr __user *uattr) { return -EOPNOTSUPP; } static inline void __dev_flush(struct list_head *flush_list) { } struct xdp_frame; struct bpf_dtab_netdev; struct bpf_cpu_map_entry; static inline int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx) { return 0; } static inline int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, struct net_device *dev_rx) { return 0; } static inline int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_map *map, bool exclude_ingress) { return 0; } struct sk_buff; static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, const struct bpf_prog *xdp_prog) { return 0; } static inline int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, const struct bpf_prog *xdp_prog, struct bpf_map *map, bool exclude_ingress) { return 0; } static inline void __cpu_map_flush(struct list_head *flush_list) { } static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf, struct net_device *dev_rx) { return 0; } static inline int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu, struct sk_buff *skb) { return -EOPNOTSUPP; } static inline struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type) { return ERR_PTR(-EOPNOTSUPP); } static inline int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { return -ENOTSUPP; } static inline int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { return -ENOTSUPP; } static inline int bpf_prog_test_run_tracing(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { return -ENOTSUPP; } static inline int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { return -ENOTSUPP; } static inline int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { return -ENOTSUPP; } static inline void bpf_map_put(struct bpf_map *map) { } static inline struct bpf_prog *bpf_prog_by_id(u32 id) { return ERR_PTR(-ENOTSUPP); } static inline int btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size, enum bpf_access_type atype, u32 *next_btf_id, enum bpf_type_flag *flag, const char **field_name) { return -EACCES; } static inline const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { return NULL; } static inline void bpf_task_storage_free(struct task_struct *task) { } static inline bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog) { return false; } static inline const struct btf_func_model * bpf_jit_find_kfunc_model(const struct bpf_prog *prog, const struct bpf_insn *insn) { return NULL; } static inline int bpf_get_kfunc_addr(const struct bpf_prog *prog, u32 func_id, u16 btf_fd_idx, u8 **func_addr) { return -ENOTSUPP; } static inline bool unprivileged_ebpf_enabled(void) { return false; } static inline bool has_current_bpf_ctx(void) { return false; } static inline void bpf_prog_inc_misses_counter(struct bpf_prog *prog) { } static inline void bpf_cgrp_storage_free(struct cgroup *cgroup) { } static inline void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type, u32 offset, u32 size) { } static inline void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) { } static inline void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr) { } static inline void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip) { } #endif /* CONFIG_BPF_SYSCALL */ static __always_inline int bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr) { int ret = -EFAULT; if (IS_ENABLED(CONFIG_BPF_EVENTS)) ret = copy_from_kernel_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) memset(dst, 0, size); return ret; } void __bpf_free_used_btfs(struct btf_mod_pair *used_btfs, u32 len); static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) { return bpf_prog_get_type_dev(ufd, type, false); } void __bpf_free_used_maps(struct bpf_prog_aux *aux, struct bpf_map **used_maps, u32 len); bool bpf_prog_get_ok(struct bpf_prog *, enum bpf_prog_type *, bool); int bpf_prog_offload_compile(struct bpf_prog *prog); void bpf_prog_dev_bound_destroy(struct bpf_prog *prog); int bpf_prog_offload_info_fill(struct bpf_prog_info *info, struct bpf_prog *prog); int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map); int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value); int bpf_map_offload_update_elem(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_map_offload_delete_elem(struct bpf_map *map, void *key); int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key); bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map); struct bpf_offload_dev * bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv); void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev); void *bpf_offload_dev_priv(struct bpf_offload_dev *offdev); int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev, struct net_device *netdev); void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev, struct net_device *netdev); bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev); void unpriv_ebpf_notify(int new_state); #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_dev_bound_kfunc_check(struct bpf_verifier_log *log, struct bpf_prog_aux *prog_aux); void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id); int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr); int bpf_prog_dev_bound_inherit(struct bpf_prog *new_prog, struct bpf_prog *old_prog); void bpf_dev_bound_netdev_unregister(struct net_device *dev); static inline bool bpf_prog_is_dev_bound(const struct bpf_prog_aux *aux) { return aux->dev_bound; } static inline bool bpf_prog_is_offloaded(const struct bpf_prog_aux *aux) { return aux->offload_requested; } bool bpf_prog_dev_bound_match(const struct bpf_prog *lhs, const struct bpf_prog *rhs); static inline bool bpf_map_is_offloaded(struct bpf_map *map) { return unlikely(map->ops == &bpf_map_offload_ops); } struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr); void bpf_map_offload_map_free(struct bpf_map *map); u64 bpf_map_offload_map_mem_usage(const struct bpf_map *map); int bpf_prog_test_run_syscall(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog); int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags); int sock_map_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog); void sock_map_unhash(struct sock *sk); void sock_map_destroy(struct sock *sk); void sock_map_close(struct sock *sk, long timeout); #else static inline int bpf_dev_bound_kfunc_check(struct bpf_verifier_log *log, struct bpf_prog_aux *prog_aux) { return -EOPNOTSUPP; } static inline void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id) { return NULL; } static inline int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr) { return -EOPNOTSUPP; } static inline int bpf_prog_dev_bound_inherit(struct bpf_prog *new_prog, struct bpf_prog *old_prog) { return -EOPNOTSUPP; } static inline void bpf_dev_bound_netdev_unregister(struct net_device *dev) { } static inline bool bpf_prog_is_dev_bound(const struct bpf_prog_aux *aux) { return false; } static inline bool bpf_prog_is_offloaded(struct bpf_prog_aux *aux) { return false; } static inline bool bpf_prog_dev_bound_match(const struct bpf_prog *lhs, const struct bpf_prog *rhs) { return false; } static inline bool bpf_map_is_offloaded(struct bpf_map *map) { return false; } static inline struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) { return ERR_PTR(-EOPNOTSUPP); } static inline void bpf_map_offload_map_free(struct bpf_map *map) { } static inline u64 bpf_map_offload_map_mem_usage(const struct bpf_map *map) { return 0; } static inline int bpf_prog_test_run_syscall(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { return -ENOTSUPP; } #ifdef CONFIG_BPF_SYSCALL static inline int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) { return -EINVAL; } static inline int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) { return -EOPNOTSUPP; } static inline int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags) { return -EOPNOTSUPP; } static inline int sock_map_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { return -EINVAL; } static inline int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog) { return -EOPNOTSUPP; } #endif /* CONFIG_BPF_SYSCALL */ #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ static __always_inline void bpf_prog_inc_misses_counters(const struct bpf_prog_array *array) { const struct bpf_prog_array_item *item; struct bpf_prog *prog; if (unlikely(!array)) return; item = &array->items[0]; while ((prog = READ_ONCE(item->prog))) { bpf_prog_inc_misses_counter(prog); item++; } } #if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) void bpf_sk_reuseport_detach(struct sock *sk); int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, void *value); int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags); #else static inline void bpf_sk_reuseport_detach(struct sock *sk) { } #ifdef CONFIG_BPF_SYSCALL static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, void *value) { return -EOPNOTSUPP; } static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { return -EOPNOTSUPP; } #endif /* CONFIG_BPF_SYSCALL */ #endif /* defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) */ #if defined(CONFIG_KEYS) && defined(CONFIG_BPF_SYSCALL) struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags); struct bpf_key *bpf_lookup_system_key(u64 id); void bpf_key_put(struct bpf_key *bkey); int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, struct bpf_dynptr *sig_p, struct bpf_key *trusted_keyring); #else static inline struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags) { return NULL; } static inline struct bpf_key *bpf_lookup_system_key(u64 id) { return NULL; } static inline void bpf_key_put(struct bpf_key *bkey) { } static inline int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, struct bpf_dynptr *sig_p, struct bpf_key *trusted_keyring) { return -EOPNOTSUPP; } #endif /* defined(CONFIG_KEYS) && defined(CONFIG_BPF_SYSCALL) */ /* verifier prototypes for helper functions called from eBPF programs */ extern const struct bpf_func_proto bpf_map_lookup_elem_proto; extern const struct bpf_func_proto bpf_map_update_elem_proto; extern const struct bpf_func_proto bpf_map_delete_elem_proto; extern const struct bpf_func_proto bpf_map_push_elem_proto; extern const struct bpf_func_proto bpf_map_pop_elem_proto; extern const struct bpf_func_proto bpf_map_peek_elem_proto; extern const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto; extern const struct bpf_func_proto bpf_get_prandom_u32_proto; extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; extern const struct bpf_func_proto bpf_get_numa_node_id_proto; extern const struct bpf_func_proto bpf_tail_call_proto; extern const struct bpf_func_proto bpf_ktime_get_ns_proto; extern const struct bpf_func_proto bpf_ktime_get_boot_ns_proto; extern const struct bpf_func_proto bpf_ktime_get_tai_ns_proto; extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto; extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; extern const struct bpf_func_proto bpf_get_current_comm_proto; extern const struct bpf_func_proto bpf_get_stackid_proto; extern const struct bpf_func_proto bpf_get_stack_proto; extern const struct bpf_func_proto bpf_get_stack_sleepable_proto; extern const struct bpf_func_proto bpf_get_task_stack_proto; extern const struct bpf_func_proto bpf_get_task_stack_sleepable_proto; extern const struct bpf_func_proto bpf_get_stackid_proto_pe; extern const struct bpf_func_proto bpf_get_stack_proto_pe; extern const struct bpf_func_proto bpf_sock_map_update_proto; extern const struct bpf_func_proto bpf_sock_hash_update_proto; extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; extern const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto; extern const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto; extern const struct bpf_func_proto bpf_current_task_under_cgroup_proto; extern const struct bpf_func_proto bpf_msg_redirect_hash_proto; extern const struct bpf_func_proto bpf_msg_redirect_map_proto; extern const struct bpf_func_proto bpf_sk_redirect_hash_proto; extern const struct bpf_func_proto bpf_sk_redirect_map_proto; extern const struct bpf_func_proto bpf_spin_lock_proto; extern const struct bpf_func_proto bpf_spin_unlock_proto; extern const struct bpf_func_proto bpf_get_local_storage_proto; extern const struct bpf_func_proto bpf_strtol_proto; extern const struct bpf_func_proto bpf_strtoul_proto; extern const struct bpf_func_proto bpf_tcp_sock_proto; extern const struct bpf_func_proto bpf_jiffies64_proto; extern const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto; extern const struct bpf_func_proto bpf_event_output_data_proto; extern const struct bpf_func_proto bpf_ringbuf_output_proto; extern const struct bpf_func_proto bpf_ringbuf_reserve_proto; extern const struct bpf_func_proto bpf_ringbuf_submit_proto; extern const struct bpf_func_proto bpf_ringbuf_discard_proto; extern const struct bpf_func_proto bpf_ringbuf_query_proto; extern const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto; extern const struct bpf_func_proto bpf_ringbuf_submit_dynptr_proto; extern const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto; extern const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; extern const struct bpf_func_proto bpf_skc_to_unix_sock_proto; extern const struct bpf_func_proto bpf_skc_to_mptcp_sock_proto; extern const struct bpf_func_proto bpf_copy_from_user_proto; extern const struct bpf_func_proto bpf_snprintf_btf_proto; extern const struct bpf_func_proto bpf_snprintf_proto; extern const struct bpf_func_proto bpf_per_cpu_ptr_proto; extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto; extern const struct bpf_func_proto bpf_sock_from_file_proto; extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto; extern const struct bpf_func_proto bpf_task_storage_get_recur_proto; extern const struct bpf_func_proto bpf_task_storage_get_proto; extern const struct bpf_func_proto bpf_task_storage_delete_recur_proto; extern const struct bpf_func_proto bpf_task_storage_delete_proto; extern const struct bpf_func_proto bpf_for_each_map_elem_proto; extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto; extern const struct bpf_func_proto bpf_sk_setsockopt_proto; extern const struct bpf_func_proto bpf_sk_getsockopt_proto; extern const struct bpf_func_proto bpf_unlocked_sk_setsockopt_proto; extern const struct bpf_func_proto bpf_unlocked_sk_getsockopt_proto; extern const struct bpf_func_proto bpf_find_vma_proto; extern const struct bpf_func_proto bpf_loop_proto; extern const struct bpf_func_proto bpf_copy_from_user_task_proto; extern const struct bpf_func_proto bpf_set_retval_proto; extern const struct bpf_func_proto bpf_get_retval_proto; extern const struct bpf_func_proto bpf_user_ringbuf_drain_proto; extern const struct bpf_func_proto bpf_cgrp_storage_get_proto; extern const struct bpf_func_proto bpf_cgrp_storage_delete_proto; const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); u64 bpf_get_raw_cpu_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); #if defined(CONFIG_NET) bool bpf_sock_common_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info); bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info); u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size); int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags, struct bpf_dynptr *ptr); #else static inline bool bpf_sock_common_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { return false; } static inline bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { return false; } static inline u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { return 0; } static inline int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags, struct bpf_dynptr *ptr) { return -EOPNOTSUPP; } #endif #ifdef CONFIG_INET struct sk_reuseport_kern { struct sk_buff *skb; struct sock *sk; struct sock *selected_sk; struct sock *migrating_sk; void *data_end; u32 hash; u32 reuseport_id; bool bind_inany; }; bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info); u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size); bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info); u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size); #else static inline bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { return false; } static inline u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { return 0; } static inline bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { return false; } static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { return 0; } #endif /* CONFIG_INET */ enum bpf_text_poke_type { BPF_MOD_NOP, BPF_MOD_CALL, BPF_MOD_JUMP, }; int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t, enum bpf_text_poke_type new_t, void *old_addr, void *new_addr); void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, struct bpf_prog *new, struct bpf_prog *old); void *bpf_arch_text_copy(void *dst, void *src, size_t len); int bpf_arch_text_invalidate(void *dst, size_t len); struct btf_id_set; bool btf_id_set_contains(const struct btf_id_set *set, u32 id); #define MAX_BPRINTF_VARARGS 12 #define MAX_BPRINTF_BUF 1024 /* Per-cpu temp buffers used by printf-like helpers to store the bprintf binary * arguments representation. */ #define MAX_BPRINTF_BIN_ARGS 512 struct bpf_bprintf_buffers { char bin_args[MAX_BPRINTF_BIN_ARGS]; char buf[MAX_BPRINTF_BUF]; }; struct bpf_bprintf_data { u32 *bin_args; char *buf; bool get_bin_args; bool get_buf; }; int bpf_bprintf_prepare(const char *fmt, u32 fmt_size, const u64 *raw_args, u32 num_args, struct bpf_bprintf_data *data); void bpf_bprintf_cleanup(struct bpf_bprintf_data *data); int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs); void bpf_put_buffers(void); void bpf_prog_stream_init(struct bpf_prog *prog); void bpf_prog_stream_free(struct bpf_prog *prog); int bpf_prog_stream_read(struct bpf_prog *prog, enum bpf_stream_id stream_id, void __user *buf, int len); void bpf_stream_stage_init(struct bpf_stream_stage *ss); void bpf_stream_stage_free(struct bpf_stream_stage *ss); __printf(2, 3) int bpf_stream_stage_printk(struct bpf_stream_stage *ss, const char *fmt, ...); int bpf_stream_stage_commit(struct bpf_stream_stage *ss, struct bpf_prog *prog, enum bpf_stream_id stream_id); int bpf_stream_stage_dump_stack(struct bpf_stream_stage *ss); #define bpf_stream_printk(ss, ...) bpf_stream_stage_printk(&ss, __VA_ARGS__) #define bpf_stream_dump_stack(ss) bpf_stream_stage_dump_stack(&ss) #define bpf_stream_stage(ss, prog, stream_id, expr) \ ({ \ bpf_stream_stage_init(&ss); \ (expr); \ bpf_stream_stage_commit(&ss, prog, stream_id); \ bpf_stream_stage_free(&ss); \ }) #ifdef CONFIG_BPF_LSM void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype); void bpf_cgroup_atype_put(int cgroup_atype); #else static inline void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype) {} static inline void bpf_cgroup_atype_put(int cgroup_atype) {} #endif /* CONFIG_BPF_LSM */ struct key; #ifdef CONFIG_KEYS struct bpf_key { struct key *key; bool has_ref; }; #endif /* CONFIG_KEYS */ static inline bool type_is_alloc(u32 type) { return type & MEM_ALLOC; } static inline gfp_t bpf_memcg_flags(gfp_t flags) { if (memcg_bpf_enabled()) return flags | __GFP_ACCOUNT; return flags; } static inline bool bpf_is_subprog(const struct bpf_prog *prog) { return prog->aux->func_idx != 0; } int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char **filep, const char **linep, int *nump); struct bpf_prog *bpf_prog_find_from_stack(void); int bpf_insn_array_init(struct bpf_map *map, const struct bpf_prog *prog); int bpf_insn_array_ready(struct bpf_map *map); void bpf_insn_array_release(struct bpf_map *map); void bpf_insn_array_adjust(struct bpf_map *map, u32 off, u32 len); void bpf_insn_array_adjust_after_remove(struct bpf_map *map, u32 off, u32 len); #ifdef CONFIG_BPF_SYSCALL void bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image); #else static inline void bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image) { } #endif static inline int bpf_map_check_op_flags(struct bpf_map *map, u64 flags, u64 allowed_flags) { if (flags & ~allowed_flags) return -EINVAL; if ((flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)) return -EINVAL; return 0; } #endif /* _LINUX_BPF_H */
475 36 430 529 1457 323 276 172 182 84 11 121 503 249 5 17 27 6 12 23 23 423 423 321 650 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 /* SPDX-License-Identifier: GPL-2.0 */ /* * Connection state tracking for netfilter. This is separated from, * but required by, the (future) NAT layer; it can also be used by an iptables * extension. * * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> * - generalize L3 protocol dependent part. * * Derived from include/linux/netfiter_ipv4/ip_conntrack.h */ #ifndef _NF_CONNTRACK_H #define _NF_CONNTRACK_H #include <linux/bitops.h> #include <linux/compiler.h> #include <linux/netfilter/nf_conntrack_common.h> #include <linux/netfilter/nf_conntrack_tcp.h> #include <linux/netfilter/nf_conntrack_sctp.h> #include <linux/netfilter/nf_conntrack_proto_gre.h> #include <net/netfilter/nf_conntrack_tuple.h> struct nf_ct_udp { unsigned long stream_ts; }; /* per conntrack: protocol private data */ union nf_conntrack_proto { /* insert conntrack proto private data here */ struct ip_ct_sctp sctp; struct ip_ct_tcp tcp; struct nf_ct_udp udp; struct nf_ct_gre gre; unsigned int tmpl_padto; }; union nf_conntrack_expect_proto { /* insert expect proto private data here */ }; struct nf_conntrack_net_ecache { struct delayed_work dwork; spinlock_t dying_lock; struct hlist_nulls_head dying_list; }; struct nf_conntrack_net { /* only used when new connection is allocated: */ atomic_t count; unsigned int expect_count; /* only used from work queues, configuration plane, and so on: */ unsigned int users4; unsigned int users6; unsigned int users_bridge; #ifdef CONFIG_SYSCTL struct ctl_table_header *sysctl_header; #endif #ifdef CONFIG_NF_CONNTRACK_EVENTS struct nf_conntrack_net_ecache ecache; #endif }; #include <linux/types.h> #include <linux/skbuff.h> #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> struct nf_conn { /* Usage count in here is 1 for hash table, 1 per skb, * plus 1 for any connection(s) we are `master' for * * Hint, SKB address this struct and refcnt via skb->_nfct and * helpers nf_conntrack_get() and nf_conntrack_put(). * Helper nf_ct_put() equals nf_conntrack_put() by dec refcnt, * except that the latter uses internal indirection and does not * result in a conntrack module dependency. * beware nf_ct_get() is different and don't inc refcnt. */ struct nf_conntrack ct_general; spinlock_t lock; /* jiffies32 when this ct is considered dead */ u32 timeout; #ifdef CONFIG_NF_CONNTRACK_ZONES struct nf_conntrack_zone zone; #endif /* XXX should I move this to the tail ? - Y.K */ /* These are my tuples; original and reply */ struct nf_conntrack_tuple_hash tuplehash[IP_CT_DIR_MAX]; /* Have we seen traffic both ways yet? (bitset) */ unsigned long status; possible_net_t ct_net; #if IS_ENABLED(CONFIG_NF_NAT) struct hlist_node nat_bysource; #endif /* all members below initialized via memset */ struct { } __nfct_init_offset; /* If we were expected by an expectation, this will be it */ struct nf_conn *master; #if defined(CONFIG_NF_CONNTRACK_MARK) u_int32_t mark; #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK u_int32_t secmark; #endif /* Extensions */ struct nf_ct_ext *ext; /* Storage reserved for other modules, must be the last member */ union nf_conntrack_proto proto; }; static inline struct nf_conn * nf_ct_to_nf_conn(const struct nf_conntrack *nfct) { return container_of(nfct, struct nf_conn, ct_general); } static inline struct nf_conn * nf_ct_tuplehash_to_ctrack(const struct nf_conntrack_tuple_hash *hash) { return container_of(hash, struct nf_conn, tuplehash[hash->tuple.dst.dir]); } static inline u_int16_t nf_ct_l3num(const struct nf_conn *ct) { return ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; } static inline u_int8_t nf_ct_protonum(const struct nf_conn *ct) { return ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum; } #define nf_ct_tuple(ct, dir) (&(ct)->tuplehash[dir].tuple) /* get master conntrack via master expectation */ #define master_ct(conntr) (conntr->master) extern struct net init_net; static inline struct net *nf_ct_net(const struct nf_conn *ct) { return read_pnet(&ct->ct_net); } /* Is this tuple taken? (ignoring any belonging to the given conntrack). */ int nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_conntrack); /* Return conntrack_info and tuple hash for given skb. */ static inline struct nf_conn * nf_ct_get(const struct sk_buff *skb, enum ip_conntrack_info *ctinfo) { unsigned long nfct = skb_get_nfct(skb); *ctinfo = nfct & NFCT_INFOMASK; return (struct nf_conn *)(nfct & NFCT_PTRMASK); } void nf_ct_destroy(struct nf_conntrack *nfct); void nf_conntrack_tcp_set_closing(struct nf_conn *ct); /* decrement reference count on a conntrack */ static inline void nf_ct_put(struct nf_conn *ct) { if (ct && refcount_dec_and_test(&ct->ct_general.use)) nf_ct_destroy(&ct->ct_general); } /* load module; enable/disable conntrack in this namespace */ int nf_ct_netns_get(struct net *net, u8 nfproto); void nf_ct_netns_put(struct net *net, u8 nfproto); /* * Allocate a hashtable of hlist_head (if nulls == 0), * or hlist_nulls_head (if nulls == 1) */ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls); int nf_conntrack_hash_check_insert(struct nf_conn *ct); bool nf_ct_delete(struct nf_conn *ct, u32 pid, int report); bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, u_int16_t l3num, struct net *net, struct nf_conntrack_tuple *tuple); void __nf_ct_refresh_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo, u32 extra_jiffies, unsigned int bytes); /* Refresh conntrack for this many jiffies and do accounting */ static inline void nf_ct_refresh_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo, const struct sk_buff *skb, u32 extra_jiffies) { __nf_ct_refresh_acct(ct, ctinfo, extra_jiffies, skb->len); } /* Refresh conntrack for this many jiffies */ static inline void nf_ct_refresh(struct nf_conn *ct, u32 extra_jiffies) { __nf_ct_refresh_acct(ct, 0, extra_jiffies, 0); } /* kill conntrack and do accounting */ bool nf_ct_kill_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo, const struct sk_buff *skb); /* kill conntrack without accounting */ static inline bool nf_ct_kill(struct nf_conn *ct) { return nf_ct_delete(ct, 0, 0); } struct nf_ct_iter_data { struct net *net; void *data; u32 portid; int report; }; /* Iterate over all conntracks: if iter returns true, it's deleted. */ void nf_ct_iterate_cleanup_net(int (*iter)(struct nf_conn *i, void *data), const struct nf_ct_iter_data *iter_data); /* also set unconfirmed conntracks as dying. Only use in module exit path. */ void nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data); struct nf_conntrack_zone; void nf_conntrack_free(struct nf_conn *ct); struct nf_conn *nf_conntrack_alloc(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *orig, const struct nf_conntrack_tuple *repl, gfp_t gfp); static inline int nf_ct_is_template(const struct nf_conn *ct) { return test_bit(IPS_TEMPLATE_BIT, &ct->status); } /* It's confirmed if it is, or has been in the hash table. */ static inline int nf_ct_is_confirmed(const struct nf_conn *ct) { return test_bit(IPS_CONFIRMED_BIT, &ct->status); } static inline int nf_ct_is_dying(const struct nf_conn *ct) { return test_bit(IPS_DYING_BIT, &ct->status); } /* Packet is received from loopback */ static inline bool nf_is_loopback_packet(const struct sk_buff *skb) { return skb->dev && skb->skb_iif && skb->dev->flags & IFF_LOOPBACK; } static inline void nf_conntrack_alter_reply(struct nf_conn *ct, const struct nf_conntrack_tuple *newreply) { /* Must be unconfirmed, so not in hash table yet */ if (WARN_ON(nf_ct_is_confirmed(ct))) return; ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; } #define nfct_time_stamp ((u32)(jiffies)) /* jiffies until ct expires, 0 if already expired */ static inline unsigned long nf_ct_expires(const struct nf_conn *ct) { s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp; return max(timeout, 0); } static inline bool nf_ct_is_expired(const struct nf_conn *ct) { return (__s32)(READ_ONCE(ct->timeout) - nfct_time_stamp) <= 0; } /* use after obtaining a reference count */ static inline bool nf_ct_should_gc(const struct nf_conn *ct) { if (!nf_ct_is_confirmed(ct)) return false; /* load ct->timeout after is_confirmed() test. * Pairs with __nf_conntrack_confirm() which: * 1. Increases ct->timeout value * 2. Inserts ct into rcu hlist * 3. Sets the confirmed bit * 4. Unlocks the hlist lock */ smp_acquire__after_ctrl_dep(); return nf_ct_is_expired(ct) && !nf_ct_is_dying(ct); } #define NF_CT_DAY (86400 * HZ) struct kernel_param; int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp); int nf_conntrack_hash_resize(unsigned int hashsize); extern struct hlist_nulls_head *nf_conntrack_hash; extern unsigned int nf_conntrack_htable_size; extern seqcount_spinlock_t nf_conntrack_generation; extern unsigned int nf_conntrack_max; /* must be called with rcu read lock held */ static inline void nf_conntrack_get_ht(struct hlist_nulls_head **hash, unsigned int *hsize) { struct hlist_nulls_head *hptr; unsigned int sequence, hsz; do { sequence = read_seqcount_begin(&nf_conntrack_generation); hsz = nf_conntrack_htable_size; hptr = nf_conntrack_hash; } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); *hash = hptr; *hsize = hsz; } struct nf_conn *nf_ct_tmpl_alloc(struct net *net, const struct nf_conntrack_zone *zone, gfp_t flags); void nf_ct_tmpl_free(struct nf_conn *tmpl); u32 nf_ct_get_id(const struct nf_conn *ct); u32 nf_conntrack_count(const struct net *net); static inline void nf_ct_set(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info info) { skb_set_nfct(skb, (unsigned long)ct | info); } extern unsigned int nf_conntrack_net_id; static inline struct nf_conntrack_net *nf_ct_pernet(const struct net *net) { return net_generic(net, nf_conntrack_net_id); } int nf_ct_skb_network_trim(struct sk_buff *skb, int family); int nf_ct_handle_fragments(struct net *net, struct sk_buff *skb, u16 zone, u8 family, u8 *proto, u16 *mru); #define NF_CT_STAT_INC(net, count) __this_cpu_inc((net)->ct.stat->count) #define NF_CT_STAT_INC_ATOMIC(net, count) this_cpu_inc((net)->ct.stat->count) #define NF_CT_STAT_ADD_ATOMIC(net, count, v) this_cpu_add((net)->ct.stat->count, (v)) #define MODULE_ALIAS_NFCT_HELPER(helper) \ MODULE_ALIAS("nfct-helper-" helper) #endif /* _NF_CONNTRACK_H */
24 21 18 18 1 24 16 24 24 24 9 339 7 97 189 337 337 296 337 164 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 /* SPDX-License-Identifier: GPL-2.0 * * page_pool/helpers.h * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com> * Copyright (C) 2016 Red Hat, Inc. */ /** * DOC: page_pool allocator * * The page_pool allocator is optimized for recycling page or page fragment used * by skb packet and xdp frame. * * Basic use involves replacing any alloc_pages() calls with page_pool_alloc(), * which allocate memory with or without page splitting depending on the * requested memory size. * * If the driver knows that it always requires full pages or its allocations are * always smaller than half a page, it can use one of the more specific API * calls: * * 1. page_pool_alloc_pages(): allocate memory without page splitting when * driver knows that the memory it need is always bigger than half of the page * allocated from page pool. There is no cache line dirtying for 'struct page' * when a page is recycled back to the page pool. * * 2. page_pool_alloc_frag(): allocate memory with page splitting when driver * knows that the memory it need is always smaller than or equal to half of the * page allocated from page pool. Page splitting enables memory saving and thus * avoids TLB/cache miss for data access, but there also is some cost to * implement page splitting, mainly some cache line dirtying/bouncing for * 'struct page' and atomic operation for page->pp_ref_count. * * The API keeps track of in-flight pages, in order to let API users know when * it is safe to free a page_pool object, the API users must call * page_pool_put_page() or page_pool_free_va() to free the page_pool object, or * attach the page_pool object to a page_pool-aware object like skbs marked with * skb_mark_for_recycle(). * * page_pool_put_page() may be called multiple times on the same page if a page * is split into multiple fragments. For the last fragment, it will either * recycle the page, or in case of page->_refcount > 1, it will release the DMA * mapping and in-flight state accounting. * * dma_sync_single_range_for_device() is only called for the last fragment when * page_pool is created with PP_FLAG_DMA_SYNC_DEV flag, so it depends on the * last freed fragment to do the sync_for_device operation for all fragments in * the same page when a page is split. The API user must setup pool->p.max_len * and pool->p.offset correctly and ensure that page_pool_put_page() is called * with dma_sync_size being -1 for fragment API. */ #ifndef _NET_PAGE_POOL_HELPERS_H #define _NET_PAGE_POOL_HELPERS_H #include <linux/dma-mapping.h> #include <net/page_pool/types.h> #include <net/net_debug.h> #include <net/netmem.h> #ifdef CONFIG_PAGE_POOL_STATS /* Deprecated driver-facing API, use netlink instead */ int page_pool_ethtool_stats_get_count(void); u8 *page_pool_ethtool_stats_get_strings(u8 *data); u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats); bool page_pool_get_stats(const struct page_pool *pool, struct page_pool_stats *stats); #else static inline int page_pool_ethtool_stats_get_count(void) { return 0; } static inline u8 *page_pool_ethtool_stats_get_strings(u8 *data) { return data; } static inline u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats) { return data; } #endif /** * page_pool_dev_alloc_pages() - allocate a page. * @pool: pool from which to allocate * * Get a page from the page allocator or page_pool caches. */ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool) { gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); return page_pool_alloc_pages(pool, gfp); } /** * page_pool_dev_alloc_frag() - allocate a page fragment. * @pool: pool from which to allocate * @offset: offset to the allocated page * @size: requested size * * Get a page fragment from the page allocator or page_pool caches. * * Return: allocated page fragment, otherwise return NULL. */ static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool, unsigned int *offset, unsigned int size) { gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); return page_pool_alloc_frag(pool, offset, size, gfp); } static inline netmem_ref page_pool_alloc_netmem(struct page_pool *pool, unsigned int *offset, unsigned int *size, gfp_t gfp) { unsigned int max_size = PAGE_SIZE << pool->p.order; netmem_ref netmem; if ((*size << 1) > max_size) { *size = max_size; *offset = 0; return page_pool_alloc_netmems(pool, gfp); } netmem = page_pool_alloc_frag_netmem(pool, offset, *size, gfp); if (unlikely(!netmem)) return 0; /* There is very likely not enough space for another fragment, so append * the remaining size to the current fragment to avoid truesize * underestimate problem. */ if (pool->frag_offset + *size > max_size) { *size = max_size - *offset; pool->frag_offset = max_size; } return netmem; } static inline netmem_ref page_pool_dev_alloc_netmem(struct page_pool *pool, unsigned int *offset, unsigned int *size) { gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; return page_pool_alloc_netmem(pool, offset, size, gfp); } static inline netmem_ref page_pool_dev_alloc_netmems(struct page_pool *pool) { gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; return page_pool_alloc_netmems(pool, gfp); } static inline struct page *page_pool_alloc(struct page_pool *pool, unsigned int *offset, unsigned int *size, gfp_t gfp) { return netmem_to_page(page_pool_alloc_netmem(pool, offset, size, gfp)); } /** * page_pool_dev_alloc() - allocate a page or a page fragment. * @pool: pool from which to allocate * @offset: offset to the allocated page * @size: in as the requested size, out as the allocated size * * Get a page or a page fragment from the page allocator or page_pool caches * depending on the requested size in order to allocate memory with least memory * utilization and performance penalty. * * Return: allocated page or page fragment, otherwise return NULL. */ static inline struct page *page_pool_dev_alloc(struct page_pool *pool, unsigned int *offset, unsigned int *size) { gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); return page_pool_alloc(pool, offset, size, gfp); } static inline void *page_pool_alloc_va(struct page_pool *pool, unsigned int *size, gfp_t gfp) { unsigned int offset; struct page *page; /* Mask off __GFP_HIGHMEM to ensure we can use page_address() */ page = page_pool_alloc(pool, &offset, size, gfp & ~__GFP_HIGHMEM); if (unlikely(!page)) return NULL; return page_address(page) + offset; } /** * page_pool_dev_alloc_va() - allocate a page or a page fragment and return its * va. * @pool: pool from which to allocate * @size: in as the requested size, out as the allocated size * * This is just a thin wrapper around the page_pool_alloc() API, and * it returns va of the allocated page or page fragment. * * Return: the va for the allocated page or page fragment, otherwise return NULL. */ static inline void *page_pool_dev_alloc_va(struct page_pool *pool, unsigned int *size) { gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); return page_pool_alloc_va(pool, size, gfp); } /** * page_pool_get_dma_dir() - Retrieve the stored DMA direction. * @pool: pool from which page was allocated * * Get the stored dma direction. A driver might decide to store this locally * and avoid the extra cache line from page_pool to determine the direction. */ static inline enum dma_data_direction page_pool_get_dma_dir(const struct page_pool *pool) { return pool->p.dma_dir; } static inline void page_pool_fragment_netmem(netmem_ref netmem, long nr) { atomic_long_set(netmem_get_pp_ref_count_ref(netmem), nr); } /** * page_pool_fragment_page() - split a fresh page into fragments * @page: page to split * @nr: references to set * * pp_ref_count represents the number of outstanding references to the page, * which will be freed using page_pool APIs (rather than page allocator APIs * like put_page()). Such references are usually held by page_pool-aware * objects like skbs marked for page pool recycling. * * This helper allows the caller to take (set) multiple references to a * freshly allocated page. The page must be freshly allocated (have a * pp_ref_count of 1). This is commonly done by drivers and * "fragment allocators" to save atomic operations - either when they know * upfront how many references they will need; or to take MAX references and * return the unused ones with a single atomic dec(), instead of performing * multiple atomic inc() operations. */ static inline void page_pool_fragment_page(struct page *page, long nr) { page_pool_fragment_netmem(page_to_netmem(page), nr); } static inline long page_pool_unref_netmem(netmem_ref netmem, long nr) { atomic_long_t *pp_ref_count = netmem_get_pp_ref_count_ref(netmem); long ret; /* If nr == pp_ref_count then we have cleared all remaining * references to the page: * 1. 'n == 1': no need to actually overwrite it. * 2. 'n != 1': overwrite it with one, which is the rare case * for pp_ref_count draining. * * The main advantage to doing this is that not only we avoid a atomic * update, as an atomic_read is generally a much cheaper operation than * an atomic update, especially when dealing with a page that may be * referenced by only 2 or 3 users; but also unify the pp_ref_count * handling by ensuring all pages have partitioned into only 1 piece * initially, and only overwrite it when the page is partitioned into * more than one piece. */ if (atomic_long_read(pp_ref_count) == nr) { /* As we have ensured nr is always one for constant case using * the BUILD_BUG_ON(), only need to handle the non-constant case * here for pp_ref_count draining, which is a rare case. */ BUILD_BUG_ON(__builtin_constant_p(nr) && nr != 1); if (!__builtin_constant_p(nr)) atomic_long_set(pp_ref_count, 1); return 0; } ret = atomic_long_sub_return(nr, pp_ref_count); WARN_ON(ret < 0); /* We are the last user here too, reset pp_ref_count back to 1 to * ensure all pages have been partitioned into 1 piece initially, * this should be the rare case when the last two fragment users call * page_pool_unref_page() currently. */ if (unlikely(!ret)) atomic_long_set(pp_ref_count, 1); return ret; } static inline long page_pool_unref_page(struct page *page, long nr) { return page_pool_unref_netmem(page_to_netmem(page), nr); } static inline void page_pool_ref_netmem(netmem_ref netmem) { atomic_long_inc(netmem_get_pp_ref_count_ref(netmem)); } static inline void page_pool_ref_page(struct page *page) { page_pool_ref_netmem(page_to_netmem(page)); } static inline bool page_pool_unref_and_test(netmem_ref netmem) { /* If page_pool_unref_page() returns 0, we were the last user */ return page_pool_unref_netmem(netmem, 1) == 0; } static inline void page_pool_put_netmem(struct page_pool *pool, netmem_ref netmem, unsigned int dma_sync_size, bool allow_direct) { /* When page_pool isn't compiled-in, net/core/xdp.c doesn't * allow registering MEM_TYPE_PAGE_POOL, but shield linker. */ #ifdef CONFIG_PAGE_POOL if (!page_pool_unref_and_test(netmem)) return; page_pool_put_unrefed_netmem(pool, netmem, dma_sync_size, allow_direct); #endif } /** * page_pool_put_page() - release a reference to a page pool page * @pool: pool from which page was allocated * @page: page to release a reference on * @dma_sync_size: how much of the page may have been touched by the device * @allow_direct: released by the consumer, allow lockless caching * * The outcome of this depends on the page refcnt. If the driver bumps * the refcnt > 1 this will unmap the page. If the page refcnt is 1 * the allocator owns the page and will try to recycle it in one of the pool * caches. If PP_FLAG_DMA_SYNC_DEV is set, the page will be synced for_device * using dma_sync_single_range_for_device(). */ static inline void page_pool_put_page(struct page_pool *pool, struct page *page, unsigned int dma_sync_size, bool allow_direct) { page_pool_put_netmem(pool, page_to_netmem(page), dma_sync_size, allow_direct); } static inline void page_pool_put_full_netmem(struct page_pool *pool, netmem_ref netmem, bool allow_direct) { page_pool_put_netmem(pool, netmem, -1, allow_direct); } /** * page_pool_put_full_page() - release a reference on a page pool page * @pool: pool from which page was allocated * @page: page to release a reference on * @allow_direct: released by the consumer, allow lockless caching * * Similar to page_pool_put_page(), but will DMA sync the entire memory area * as configured in &page_pool_params.max_len. */ static inline void page_pool_put_full_page(struct page_pool *pool, struct page *page, bool allow_direct) { page_pool_put_netmem(pool, page_to_netmem(page), -1, allow_direct); } /** * page_pool_recycle_direct() - release a reference on a page pool page * @pool: pool from which page was allocated * @page: page to release a reference on * * Similar to page_pool_put_full_page() but caller must guarantee safe context * (e.g NAPI), since it will recycle the page directly into the pool fast cache. */ static inline void page_pool_recycle_direct(struct page_pool *pool, struct page *page) { page_pool_put_full_page(pool, page, true); } static inline void page_pool_recycle_direct_netmem(struct page_pool *pool, netmem_ref netmem) { page_pool_put_full_netmem(pool, netmem, true); } #define PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA \ (sizeof(dma_addr_t) > sizeof(unsigned long)) /** * page_pool_free_va() - free a va into the page_pool * @pool: pool from which va was allocated * @va: va to be freed * @allow_direct: freed by the consumer, allow lockless caching * * Free a va allocated from page_pool_allo_va(). */ static inline void page_pool_free_va(struct page_pool *pool, void *va, bool allow_direct) { page_pool_put_page(pool, virt_to_head_page(va), -1, allow_direct); } static inline dma_addr_t page_pool_get_dma_addr_netmem(netmem_ref netmem) { dma_addr_t ret = netmem_get_dma_addr(netmem); if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) ret <<= PAGE_SHIFT; return ret; } /** * page_pool_get_dma_addr() - Retrieve the stored DMA address. * @page: page allocated from a page pool * * Fetch the DMA address of the page. The page pool to which the page belongs * must had been created with PP_FLAG_DMA_MAP. */ static inline dma_addr_t page_pool_get_dma_addr(const struct page *page) { return page_pool_get_dma_addr_netmem(page_to_netmem(page)); } static inline void __page_pool_dma_sync_for_cpu(const struct page_pool *pool, const dma_addr_t dma_addr, u32 offset, u32 dma_sync_size) { dma_sync_single_range_for_cpu(pool->p.dev, dma_addr, offset + pool->p.offset, dma_sync_size, page_pool_get_dma_dir(pool)); } /** * page_pool_dma_sync_for_cpu - sync Rx page for CPU after it's written by HW * @pool: &page_pool the @page belongs to * @page: page to sync * @offset: offset from page start to "hard" start if using PP frags * @dma_sync_size: size of the data written to the page * * Can be used as a shorthand to sync Rx pages before accessing them in the * driver. Caller must ensure the pool was created with ``PP_FLAG_DMA_MAP``. * Note that this version performs DMA sync unconditionally, even if the * associated PP doesn't perform sync-for-device. */ static inline void page_pool_dma_sync_for_cpu(const struct page_pool *pool, const struct page *page, u32 offset, u32 dma_sync_size) { __page_pool_dma_sync_for_cpu(pool, page_pool_get_dma_addr(page), offset, dma_sync_size); } static inline void page_pool_dma_sync_netmem_for_cpu(const struct page_pool *pool, const netmem_ref netmem, u32 offset, u32 dma_sync_size) { if (!pool->dma_sync_for_cpu) return; __page_pool_dma_sync_for_cpu(pool, page_pool_get_dma_addr_netmem(netmem), offset, dma_sync_size); } static inline void page_pool_get(struct page_pool *pool) { refcount_inc(&pool->user_cnt); } static inline bool page_pool_put(struct page_pool *pool) { return refcount_dec_and_test(&pool->user_cnt); } static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid) { if (unlikely(pool->p.nid != new_nid)) page_pool_update_nid(pool, new_nid); } /** * page_pool_is_unreadable() - will allocated buffers be unreadable for the CPU * @pool: queried page pool * * Check if page pool will return buffers which are unreadable to the CPU / * kernel. This will only be the case if user space bound a memory provider (mp) * which returns unreadable memory to the queue served by the page pool. * If %PP_FLAG_ALLOW_UNREADABLE_NETMEM was set but there is no mp bound * this helper will return false. See also netif_rxq_has_unreadable_mp(). * * Return: true if memory allocated by the page pool may be unreadable */ static inline bool page_pool_is_unreadable(struct page_pool *pool) { return !!pool->mp_ops; } #endif /* _NET_PAGE_POOL_HELPERS_H */
5 5 5 5 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 // SPDX-License-Identifier: GPL-2.0 /* * linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c. * * Written by Hennus Bergman, 1992. * * 1994/12/26: Changes by Alex Nash to fix a minor bug in /proc/dma. * In the previous version the reported device could end up being wrong, * if a device requested a DMA channel that was already in use. * [It also happened to remove the sizeof(char *) == sizeof(int) * assumption introduced because of those /proc/dma patches. -- Hennus] */ #include <linux/export.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/spinlock.h> #include <linux/string.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/init.h> #include <asm/dma.h> /* A note on resource allocation: * * All drivers needing DMA channels, should allocate and release them * through the public routines `request_dma()' and `free_dma()'. * * In order to avoid problems, all processes should allocate resources in * the same sequence and release them in the reverse order. * * So, when allocating DMAs and IRQs, first allocate the IRQ, then the DMA. * When releasing them, first release the DMA, then release the IRQ. * If you don't, you may cause allocation requests to fail unnecessarily. * This doesn't really matter now, but it will once we get real semaphores * in the kernel. */ DEFINE_SPINLOCK(dma_spin_lock); /* * If our port doesn't define this it has no PC like DMA */ #ifdef MAX_DMA_CHANNELS /* Channel n is busy iff dma_chan_busy[n].lock != 0. * DMA0 used to be reserved for DRAM refresh, but apparently not any more... * DMA4 is reserved for cascading. */ struct dma_chan { int lock; const char *device_id; }; static struct dma_chan dma_chan_busy[MAX_DMA_CHANNELS] = { [4] = { 1, "cascade" }, }; /** * request_dma - request and reserve a system DMA channel * @dmanr: DMA channel number * @device_id: reserving device ID string, used in /proc/dma */ int request_dma(unsigned int dmanr, const char * device_id) { if (dmanr >= MAX_DMA_CHANNELS) return -EINVAL; if (xchg(&dma_chan_busy[dmanr].lock, 1) != 0) return -EBUSY; dma_chan_busy[dmanr].device_id = device_id; /* old flag was 0, now contains 1 to indicate busy */ return 0; } /* request_dma */ /** * free_dma - free a reserved system DMA channel * @dmanr: DMA channel number */ void free_dma(unsigned int dmanr) { if (dmanr >= MAX_DMA_CHANNELS) { printk(KERN_WARNING "Trying to free DMA%d\n", dmanr); return; } if (xchg(&dma_chan_busy[dmanr].lock, 0) == 0) { printk(KERN_WARNING "Trying to free free DMA%d\n", dmanr); return; } } /* free_dma */ #else int request_dma(unsigned int dmanr, const char *device_id) { return -EINVAL; } void free_dma(unsigned int dmanr) { } #endif #ifdef CONFIG_PROC_FS #ifdef MAX_DMA_CHANNELS static int proc_dma_show(struct seq_file *m, void *v) { int i; for (i = 0 ; i < MAX_DMA_CHANNELS ; i++) { if (dma_chan_busy[i].lock) { seq_printf(m, "%2d: %s\n", i, dma_chan_busy[i].device_id); } } return 0; } #else static int proc_dma_show(struct seq_file *m, void *v) { seq_puts(m, "No DMA\n"); return 0; } #endif /* MAX_DMA_CHANNELS */ static int __init proc_dma_init(void) { proc_create_single("dma", 0, NULL, proc_dma_show); return 0; } __initcall(proc_dma_init); #endif EXPORT_SYMBOL(request_dma); EXPORT_SYMBOL(free_dma); EXPORT_SYMBOL(dma_spin_lock);
7 7 7 7 7 7 7 7 23 21 21 21 19 16 16 16 16 19 19 19 19 19 23 23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 // SPDX-License-Identifier: GPL-2.0 #define pr_fmt(fmt) "irq: " fmt #include <linux/acpi.h> #include <linux/debugfs.h> #include <linux/hardirq.h> #include <linux/interrupt.h> #include <linux/irq.h> #include <linux/irqdesc.h> #include <linux/irqdomain.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/of.h> #include <linux/of_address.h> #include <linux/of_irq.h> #include <linux/topology.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/smp.h> #include <linux/fs.h> static LIST_HEAD(irq_domain_list); static DEFINE_MUTEX(irq_domain_mutex); static struct irq_domain *irq_default_domain; static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base, unsigned int nr_irqs, int node, void *arg, bool realloc, const struct irq_affinity_desc *affinity); static void irq_domain_check_hierarchy(struct irq_domain *domain); static void irq_domain_free_one_irq(struct irq_domain *domain, unsigned int virq); struct irqchip_fwid { struct fwnode_handle fwnode; unsigned int type; char *name; phys_addr_t *pa; }; #ifdef CONFIG_GENERIC_IRQ_DEBUGFS static void debugfs_add_domain_dir(struct irq_domain *d); static void debugfs_remove_domain_dir(struct irq_domain *d); #else static inline void debugfs_add_domain_dir(struct irq_domain *d) { } static inline void debugfs_remove_domain_dir(struct irq_domain *d) { } #endif static const char *irqchip_fwnode_get_name(const struct fwnode_handle *fwnode) { struct irqchip_fwid *fwid = container_of(fwnode, struct irqchip_fwid, fwnode); return fwid->name; } const struct fwnode_operations irqchip_fwnode_ops = { .get_name = irqchip_fwnode_get_name, }; EXPORT_SYMBOL_GPL(irqchip_fwnode_ops); /** * __irq_domain_alloc_fwnode - Allocate a fwnode_handle suitable for * identifying an irq domain * @type: Type of irqchip_fwnode. See linux/irqdomain.h * @id: Optional user provided id if name != NULL * @name: Optional user provided domain name * @pa: Optional user-provided physical address * * Allocate a struct irqchip_fwid, and return a pointer to the embedded * fwnode_handle (or NULL on failure). * * Note: The types IRQCHIP_FWNODE_NAMED and IRQCHIP_FWNODE_NAMED_ID are * solely to transport name information to irqdomain creation code. The * node is not stored. For other types the pointer is kept in the irq * domain struct. */ struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id, const char *name, phys_addr_t *pa) { struct irqchip_fwid *fwid; char *n; fwid = kzalloc(sizeof(*fwid), GFP_KERNEL); switch (type) { case IRQCHIP_FWNODE_NAMED: n = kasprintf(GFP_KERNEL, "%s", name); break; case IRQCHIP_FWNODE_NAMED_ID: n = kasprintf(GFP_KERNEL, "%s-%d", name, id); break; default: n = kasprintf(GFP_KERNEL, "irqchip@%pa", pa); break; } if (!fwid || !n) { kfree(fwid); kfree(n); return NULL; } fwid->type = type; fwid->name = n; fwid->pa = pa; fwnode_init(&fwid->fwnode, &irqchip_fwnode_ops); return &fwid->fwnode; } EXPORT_SYMBOL_GPL(__irq_domain_alloc_fwnode); /** * irq_domain_free_fwnode - Free a non-OF-backed fwnode_handle * @fwnode: fwnode_handle to free * * Free a fwnode_handle allocated with irq_domain_alloc_fwnode. */ void irq_domain_free_fwnode(struct fwnode_handle *fwnode) { struct irqchip_fwid *fwid; if (!fwnode || WARN_ON(!is_fwnode_irqchip(fwnode))) return; fwid = container_of(fwnode, struct irqchip_fwid, fwnode); kfree(fwid->name); kfree(fwid); } EXPORT_SYMBOL_GPL(irq_domain_free_fwnode); static int alloc_name(struct irq_domain *domain, char *base, enum irq_domain_bus_token bus_token) { if (bus_token == DOMAIN_BUS_ANY) domain->name = kasprintf(GFP_KERNEL, "%s", base); else domain->name = kasprintf(GFP_KERNEL, "%s-%d", base, bus_token); if (!domain->name) return -ENOMEM; domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; return 0; } static int alloc_fwnode_name(struct irq_domain *domain, const struct fwnode_handle *fwnode, enum irq_domain_bus_token bus_token, const char *suffix) { const char *sep = suffix ? "-" : ""; const char *suf = suffix ? : ""; char *name; if (bus_token == DOMAIN_BUS_ANY) name = kasprintf(GFP_KERNEL, "%pfw%s%s", fwnode, sep, suf); else name = kasprintf(GFP_KERNEL, "%pfw%s%s-%d", fwnode, sep, suf, bus_token); if (!name) return -ENOMEM; /* * fwnode paths contain '/', which debugfs is legitimately unhappy * about. Replace them with ':', which does the trick and is not as * offensive as '\'... */ domain->name = strreplace(name, '/', ':'); domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; return 0; } static int alloc_unknown_name(struct irq_domain *domain, enum irq_domain_bus_token bus_token) { static atomic_t unknown_domains; int id = atomic_inc_return(&unknown_domains); if (bus_token == DOMAIN_BUS_ANY) domain->name = kasprintf(GFP_KERNEL, "unknown-%d", id); else domain->name = kasprintf(GFP_KERNEL, "unknown-%d-%d", id, bus_token); if (!domain->name) return -ENOMEM; domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; return 0; } static int irq_domain_set_name(struct irq_domain *domain, const struct irq_domain_info *info) { enum irq_domain_bus_token bus_token = info->bus_token; const struct fwnode_handle *fwnode = info->fwnode; if (is_fwnode_irqchip(fwnode)) { struct irqchip_fwid *fwid = container_of(fwnode, struct irqchip_fwid, fwnode); /* * The name_suffix is only intended to be used to avoid a name * collision when multiple domains are created for a single * device and the name is picked using a real device node. * (Typical use-case is regmap-IRQ controllers for devices * providing more than one physical IRQ.) There should be no * need to use name_suffix with irqchip-fwnode. */ if (info->name_suffix) return -EINVAL; switch (fwid->type) { case IRQCHIP_FWNODE_NAMED: case IRQCHIP_FWNODE_NAMED_ID: return alloc_name(domain, fwid->name, bus_token); default: domain->name = fwid->name; if (bus_token != DOMAIN_BUS_ANY) return alloc_name(domain, fwid->name, bus_token); } } else if (is_of_node(fwnode) || is_acpi_device_node(fwnode) || is_software_node(fwnode)) { return alloc_fwnode_name(domain, fwnode, bus_token, info->name_suffix); } if (domain->name) return 0; if (fwnode) pr_err("Invalid fwnode type for irqdomain\n"); return alloc_unknown_name(domain, bus_token); } static struct irq_domain *__irq_domain_create(const struct irq_domain_info *info) { struct irq_domain *domain; int err; if (WARN_ON((info->size && info->direct_max) || (!IS_ENABLED(CONFIG_IRQ_DOMAIN_NOMAP) && info->direct_max) || (info->direct_max && info->direct_max != info->hwirq_max))) return ERR_PTR(-EINVAL); domain = kzalloc_node(struct_size(domain, revmap, info->size), GFP_KERNEL, of_node_to_nid(to_of_node(info->fwnode))); if (!domain) return ERR_PTR(-ENOMEM); err = irq_domain_set_name(domain, info); if (err) { kfree(domain); return ERR_PTR(err); } domain->fwnode = fwnode_handle_get(info->fwnode); fwnode_dev_initialized(domain->fwnode, true); /* Fill structure */ INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL); domain->ops = info->ops; domain->host_data = info->host_data; domain->bus_token = info->bus_token; domain->hwirq_max = info->hwirq_max; if (info->direct_max) domain->flags |= IRQ_DOMAIN_FLAG_NO_MAP; domain->revmap_size = info->size; /* * Hierarchical domains use the domain lock of the root domain * (innermost domain). * * For non-hierarchical domains (as for root domains), the root * pointer is set to the domain itself so that &domain->root->mutex * always points to the right lock. */ mutex_init(&domain->mutex); domain->root = domain; irq_domain_check_hierarchy(domain); return domain; } static void __irq_domain_publish(struct irq_domain *domain) { mutex_lock(&irq_domain_mutex); debugfs_add_domain_dir(domain); list_add(&domain->link, &irq_domain_list); mutex_unlock(&irq_domain_mutex); pr_debug("Added domain %s\n", domain->name); } static void irq_domain_free(struct irq_domain *domain) { fwnode_dev_initialized(domain->fwnode, false); fwnode_handle_put(domain->fwnode); if (domain->flags & IRQ_DOMAIN_NAME_ALLOCATED) kfree(domain->name); kfree(domain); } static void irq_domain_instantiate_descs(const struct irq_domain_info *info) { if (!IS_ENABLED(CONFIG_SPARSE_IRQ)) return; if (irq_alloc_descs(info->virq_base, info->virq_base, info->size, of_node_to_nid(to_of_node(info->fwnode))) < 0) { pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", info->virq_base); } } static struct irq_domain *__irq_domain_instantiate(const struct irq_domain_info *info, bool cond_alloc_descs, bool force_associate) { struct irq_domain *domain; int err; domain = __irq_domain_create(info); if (IS_ERR(domain)) return domain; domain->flags |= info->domain_flags; domain->exit = info->exit; domain->dev = info->dev; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY if (info->parent) { domain->root = info->parent->root; domain->parent = info->parent; } #endif if (info->dgc_info) { err = irq_domain_alloc_generic_chips(domain, info->dgc_info); if (err) goto err_domain_free; } if (info->init) { err = info->init(domain); if (err) goto err_domain_gc_remove; } __irq_domain_publish(domain); if (cond_alloc_descs && info->virq_base > 0) irq_domain_instantiate_descs(info); /* * Legacy interrupt domains have a fixed Linux interrupt number * associated. Other interrupt domains can request association by * providing a Linux interrupt number > 0. */ if (force_associate || info->virq_base > 0) { irq_domain_associate_many(domain, info->virq_base, info->hwirq_base, info->size - info->hwirq_base); } return domain; err_domain_gc_remove: if (info->dgc_info) irq_domain_remove_generic_chips(domain); err_domain_free: irq_domain_free(domain); return ERR_PTR(err); } /** * irq_domain_instantiate() - Instantiate a new irq domain data structure * @info: Domain information pointer pointing to the information for this domain * * Return: A pointer to the instantiated irq domain or an ERR_PTR value. */ struct irq_domain *irq_domain_instantiate(const struct irq_domain_info *info) { return __irq_domain_instantiate(info, false, false); } EXPORT_SYMBOL_GPL(irq_domain_instantiate); /** * irq_domain_remove() - Remove an irq domain. * @domain: domain to remove * * This routine is used to remove an irq domain. The caller must ensure * that all mappings within the domain have been disposed of prior to * use, depending on the revmap type. */ void irq_domain_remove(struct irq_domain *domain) { if (domain->exit) domain->exit(domain); mutex_lock(&irq_domain_mutex); debugfs_remove_domain_dir(domain); WARN_ON(!radix_tree_empty(&domain->revmap_tree)); list_del(&domain->link); /* * If the going away domain is the default one, reset it. */ if (unlikely(irq_default_domain == domain)) irq_set_default_domain(NULL); mutex_unlock(&irq_domain_mutex); if (domain->flags & IRQ_DOMAIN_FLAG_DESTROY_GC) irq_domain_remove_generic_chips(domain); pr_debug("Removed domain %s\n", domain->name); irq_domain_free(domain); } EXPORT_SYMBOL_GPL(irq_domain_remove); void irq_domain_update_bus_token(struct irq_domain *domain, enum irq_domain_bus_token bus_token) { char *name; if (domain->bus_token == bus_token) return; mutex_lock(&irq_domain_mutex); domain->bus_token = bus_token; name = kasprintf(GFP_KERNEL, "%s-%d", domain->name, bus_token); if (!name) { mutex_unlock(&irq_domain_mutex); return; } debugfs_remove_domain_dir(domain); if (domain->flags & IRQ_DOMAIN_NAME_ALLOCATED) kfree(domain->name); else domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; domain->name = name; debugfs_add_domain_dir(domain); mutex_unlock(&irq_domain_mutex); } EXPORT_SYMBOL_GPL(irq_domain_update_bus_token); /** * irq_domain_create_simple() - Register an irq_domain and optionally map a range of irqs * @fwnode: firmware node for the interrupt controller * @size: total number of irqs in mapping * @first_irq: first number of irq block assigned to the domain, * pass zero to assign irqs on-the-fly. If first_irq is non-zero, then * pre-map all of the irqs in the domain to virqs starting at first_irq. * @ops: domain callbacks * @host_data: Controller private data pointer * * Allocates an irq_domain, and optionally if first_irq is positive then also * allocate irq_descs and map all of the hwirqs to virqs starting at first_irq. * * This is intended to implement the expected behaviour for most * interrupt controllers. If device tree is used, then first_irq will be 0 and * irqs get mapped dynamically on the fly. However, if the controller requires * static virq assignments (non-DT boot) then it will set that up correctly. */ struct irq_domain *irq_domain_create_simple(struct fwnode_handle *fwnode, unsigned int size, unsigned int first_irq, const struct irq_domain_ops *ops, void *host_data) { struct irq_domain_info info = { .fwnode = fwnode, .size = size, .hwirq_max = size, .virq_base = first_irq, .ops = ops, .host_data = host_data, }; struct irq_domain *domain = __irq_domain_instantiate(&info, true, false); return IS_ERR(domain) ? NULL : domain; } EXPORT_SYMBOL_GPL(irq_domain_create_simple); struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode, unsigned int size, unsigned int first_irq, irq_hw_number_t first_hwirq, const struct irq_domain_ops *ops, void *host_data) { struct irq_domain_info info = { .fwnode = fwnode, .size = first_hwirq + size, .hwirq_max = first_hwirq + size, .hwirq_base = first_hwirq, .virq_base = first_irq, .ops = ops, .host_data = host_data, }; struct irq_domain *domain = __irq_domain_instantiate(&info, false, true); return IS_ERR(domain) ? NULL : domain; } EXPORT_SYMBOL_GPL(irq_domain_create_legacy); /** * irq_find_matching_fwspec() - Locates a domain for a given fwspec * @fwspec: FW specifier for an interrupt * @bus_token: domain-specific data */ struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec, enum irq_domain_bus_token bus_token) { struct irq_domain *h, *found = NULL; struct fwnode_handle *fwnode = fwspec->fwnode; int rc; /* * We might want to match the legacy controller last since * it might potentially be set to match all interrupts in * the absence of a device node. This isn't a problem so far * yet though... * * bus_token == DOMAIN_BUS_ANY matches any domain, any other * values must generate an exact match for the domain to be * selected. */ mutex_lock(&irq_domain_mutex); list_for_each_entry(h, &irq_domain_list, link) { if (h->ops->select && bus_token != DOMAIN_BUS_ANY) rc = h->ops->select(h, fwspec, bus_token); else if (h->ops->match) rc = h->ops->match(h, to_of_node(fwnode), bus_token); else rc = ((fwnode != NULL) && (h->fwnode == fwnode) && ((bus_token == DOMAIN_BUS_ANY) || (h->bus_token == bus_token))); if (rc) { found = h; break; } } mutex_unlock(&irq_domain_mutex); return found; } EXPORT_SYMBOL_GPL(irq_find_matching_fwspec); /** * irq_set_default_domain() - Set a "default" irq domain * @domain: default domain pointer * * For convenience, it's possible to set a "default" domain that will be used * whenever NULL is passed to irq_create_mapping(). It makes life easier for * platforms that want to manipulate a few hard coded interrupt numbers that * aren't properly represented in the device-tree. */ void irq_set_default_domain(struct irq_domain *domain) { pr_debug("Default domain set to @0x%p\n", domain); irq_default_domain = domain; } EXPORT_SYMBOL_GPL(irq_set_default_domain); /** * irq_get_default_domain() - Retrieve the "default" irq domain * * Returns: the default domain, if any. * * Modern code should never use this. This should only be used on * systems that cannot implement a firmware->fwnode mapping (which * both DT and ACPI provide). */ struct irq_domain *irq_get_default_domain(void) { return irq_default_domain; } EXPORT_SYMBOL_GPL(irq_get_default_domain); static bool irq_domain_is_nomap(struct irq_domain *domain) { return IS_ENABLED(CONFIG_IRQ_DOMAIN_NOMAP) && (domain->flags & IRQ_DOMAIN_FLAG_NO_MAP); } static void irq_domain_clear_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) { lockdep_assert_held(&domain->root->mutex); if (irq_domain_is_nomap(domain)) return; if (hwirq < domain->revmap_size) rcu_assign_pointer(domain->revmap[hwirq], NULL); else radix_tree_delete(&domain->revmap_tree, hwirq); } static void irq_domain_set_mapping(struct irq_domain *domain, irq_hw_number_t hwirq, struct irq_data *irq_data) { /* * This also makes sure that all domains point to the same root when * called from irq_domain_insert_irq() for each domain in a hierarchy. */ lockdep_assert_held(&domain->root->mutex); if (irq_domain_is_nomap(domain)) return; if (hwirq < domain->revmap_size) rcu_assign_pointer(domain->revmap[hwirq], irq_data); else radix_tree_insert(&domain->revmap_tree, hwirq, irq_data); } static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) { struct irq_data *irq_data = irq_get_irq_data(irq); irq_hw_number_t hwirq; if (WARN(!irq_data || irq_data->domain != domain, "virq%i doesn't exist; cannot disassociate\n", irq)) return; hwirq = irq_data->hwirq; mutex_lock(&domain->root->mutex); irq_set_status_flags(irq, IRQ_NOREQUEST); /* remove chip and handler */ irq_set_chip_and_handler(irq, NULL, NULL); /* Make sure it's completed */ synchronize_irq(irq); /* Tell the PIC about it */ if (domain->ops->unmap) domain->ops->unmap(domain, irq); smp_mb(); irq_data->domain = NULL; irq_data->hwirq = 0; domain->mapcount--; /* Clear reverse map for this hwirq */ irq_domain_clear_mapping(domain, hwirq); mutex_unlock(&domain->root->mutex); } static int irq_domain_associate_locked(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq) { struct irq_data *irq_data = irq_get_irq_data(virq); int ret; if (WARN(hwirq >= domain->hwirq_max, "error: hwirq 0x%x is too large for %s\n", (int)hwirq, domain->name)) return -EINVAL; if (WARN(!irq_data, "error: virq%i is not allocated", virq)) return -EINVAL; if (WARN(irq_data->domain, "error: virq%i is already associated", virq)) return -EINVAL; irq_data->hwirq = hwirq; irq_data->domain = domain; if (domain->ops->map) { ret = domain->ops->map(domain, virq, hwirq); if (ret != 0) { /* * If map() returns -EPERM, this interrupt is protected * by the firmware or some other service and shall not * be mapped. Don't bother telling the user about it. */ if (ret != -EPERM) { pr_info("%s didn't like hwirq-0x%lx to VIRQ%i mapping (rc=%d)\n", domain->name, hwirq, virq, ret); } irq_data->domain = NULL; irq_data->hwirq = 0; return ret; } } domain->mapcount++; irq_domain_set_mapping(domain, hwirq, irq_data); irq_clear_status_flags(virq, IRQ_NOREQUEST); return 0; } int irq_domain_associate(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq) { int ret; mutex_lock(&domain->root->mutex); ret = irq_domain_associate_locked(domain, virq, hwirq); mutex_unlock(&domain->root->mutex); return ret; } EXPORT_SYMBOL_GPL(irq_domain_associate); void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, irq_hw_number_t hwirq_base, int count) { struct device_node *of_node; int i; of_node = irq_domain_get_of_node(domain); pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__, of_node_full_name(of_node), irq_base, (int)hwirq_base, count); for (i = 0; i < count; i++) irq_domain_associate(domain, irq_base + i, hwirq_base + i); } EXPORT_SYMBOL_GPL(irq_domain_associate_many); #ifdef CONFIG_IRQ_DOMAIN_NOMAP /** * irq_create_direct_mapping() - Allocate an irq for direct mapping * @domain: domain to allocate the irq for or NULL for default domain * * This routine is used for irq controllers which can choose the hardware * interrupt numbers they generate. In such a case it's simplest to use * the linux irq as the hardware interrupt number. It still uses the linear * or radix tree to store the mapping, but the irq controller can optimize * the revmap path by using the hwirq directly. */ unsigned int irq_create_direct_mapping(struct irq_domain *domain) { struct device_node *of_node; unsigned int virq; if (domain == NULL) domain = irq_default_domain; of_node = irq_domain_get_of_node(domain); virq = irq_alloc_desc_from(1, of_node_to_nid(of_node)); if (!virq) { pr_debug("create_direct virq allocation failed\n"); return 0; } if (virq >= domain->hwirq_max) { pr_err("ERROR: no free irqs available below %lu maximum\n", domain->hwirq_max); irq_free_desc(virq); return 0; } pr_debug("create_direct obtained virq %d\n", virq); if (irq_domain_associate(domain, virq, virq)) { irq_free_desc(virq); return 0; } return virq; } EXPORT_SYMBOL_GPL(irq_create_direct_mapping); #endif static unsigned int irq_create_mapping_affinity_locked(struct irq_domain *domain, irq_hw_number_t hwirq, const struct irq_affinity_desc *affinity) { struct device_node *of_node = irq_domain_get_of_node(domain); int virq; pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); /* Allocate a virtual interrupt number */ virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), affinity); if (virq <= 0) { pr_debug("-> virq allocation failed\n"); return 0; } if (irq_domain_associate_locked(domain, virq, hwirq)) { irq_free_desc(virq); return 0; } pr_debug("irq %lu on domain %s mapped to virtual irq %u\n", hwirq, of_node_full_name(of_node), virq); return virq; } /** * irq_create_mapping_affinity() - Map a hardware interrupt into linux irq space * @domain: domain owning this hardware interrupt or NULL for default domain * @hwirq: hardware irq number in that domain space * @affinity: irq affinity * * Only one mapping per hardware interrupt is permitted. Returns a linux * irq number. * If the sense/trigger is to be specified, set_irq_type() should be called * on the number returned from that call. */ unsigned int irq_create_mapping_affinity(struct irq_domain *domain, irq_hw_number_t hwirq, const struct irq_affinity_desc *affinity) { int virq; /* Look for default domain if necessary */ if (domain == NULL) domain = irq_default_domain; if (domain == NULL) { WARN(1, "%s(, %lx) called with NULL domain\n", __func__, hwirq); return 0; } mutex_lock(&domain->root->mutex); /* Check if mapping already exists */ virq = irq_find_mapping(domain, hwirq); if (virq) { pr_debug("existing mapping on virq %d\n", virq); goto out; } virq = irq_create_mapping_affinity_locked(domain, hwirq, affinity); out: mutex_unlock(&domain->root->mutex); return virq; } EXPORT_SYMBOL_GPL(irq_create_mapping_affinity); static int irq_domain_translate(struct irq_domain *d, struct irq_fwspec *fwspec, irq_hw_number_t *hwirq, unsigned int *type) { #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY if (d->ops->translate) return d->ops->translate(d, fwspec, hwirq, type); #endif if (d->ops->xlate) return d->ops->xlate(d, to_of_node(fwspec->fwnode), fwspec->param, fwspec->param_count, hwirq, type); /* If domain has no translation, then we assume interrupt line */ *hwirq = fwspec->param[0]; return 0; } void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, unsigned int count, struct irq_fwspec *fwspec) { int i; fwspec->fwnode = of_fwnode_handle(np); fwspec->param_count = count; for (i = 0; i < count; i++) fwspec->param[i] = args[i]; } EXPORT_SYMBOL_GPL(of_phandle_args_to_fwspec); static struct irq_domain *fwspec_to_domain(struct irq_fwspec *fwspec) { struct irq_domain *domain; if (fwspec->fwnode) { domain = irq_find_matching_fwspec(fwspec, DOMAIN_BUS_WIRED); if (!domain) domain = irq_find_matching_fwspec(fwspec, DOMAIN_BUS_ANY); } else { domain = irq_default_domain; } return domain; } #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY int irq_populate_fwspec_info(struct irq_fwspec *fwspec, struct irq_fwspec_info *info) { struct irq_domain *domain = fwspec_to_domain(fwspec); memset(info, 0, sizeof(*info)); if (!domain || !domain->ops->get_fwspec_info) return 0; return domain->ops->get_fwspec_info(fwspec, info); } #endif unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) { unsigned int type = IRQ_TYPE_NONE; struct irq_domain *domain; struct irq_data *irq_data; irq_hw_number_t hwirq; int virq; domain = fwspec_to_domain(fwspec); if (!domain) { pr_warn("no irq domain found for %s !\n", of_node_full_name(to_of_node(fwspec->fwnode))); return 0; } if (irq_domain_translate(domain, fwspec, &hwirq, &type)) return 0; /* * WARN if the irqchip returns a type with bits * outside the sense mask set and clear these bits. */ if (WARN_ON(type & ~IRQ_TYPE_SENSE_MASK)) type &= IRQ_TYPE_SENSE_MASK; mutex_lock(&domain->root->mutex); /* * If we've already configured this interrupt, * don't do it again, or hell will break loose. */ virq = irq_find_mapping(domain, hwirq); if (virq) { /* * If the trigger type is not specified or matches the * current trigger type then we are done so return the * interrupt number. */ if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq)) goto out; /* * If the trigger type has not been set yet, then set * it now and return the interrupt number. */ if (irq_get_trigger_type(virq) == IRQ_TYPE_NONE) { irq_data = irq_get_irq_data(virq); if (!irq_data) { virq = 0; goto out; } irqd_set_trigger_type(irq_data, type); goto out; } pr_warn("type mismatch, failed to map hwirq-%lu for %s!\n", hwirq, of_node_full_name(to_of_node(fwspec->fwnode))); virq = 0; goto out; } if (irq_domain_is_hierarchy(domain)) { if (irq_domain_is_msi_device(domain)) { mutex_unlock(&domain->root->mutex); virq = msi_device_domain_alloc_wired(domain, hwirq, type); mutex_lock(&domain->root->mutex); } else virq = irq_domain_alloc_irqs_locked(domain, -1, 1, NUMA_NO_NODE, fwspec, false, NULL); if (virq <= 0) { virq = 0; goto out; } } else { /* Create mapping */ virq = irq_create_mapping_affinity_locked(domain, hwirq, NULL); if (!virq) goto out; } irq_data = irq_get_irq_data(virq); if (WARN_ON(!irq_data)) { virq = 0; goto out; } /* Store trigger type */ irqd_set_trigger_type(irq_data, type); out: mutex_unlock(&domain->root->mutex); return virq; } EXPORT_SYMBOL_GPL(irq_create_fwspec_mapping); unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) { struct irq_fwspec fwspec; of_phandle_args_to_fwspec(irq_data->np, irq_data->args, irq_data->args_count, &fwspec); return irq_create_fwspec_mapping(&fwspec); } EXPORT_SYMBOL_GPL(irq_create_of_mapping); /** * irq_dispose_mapping() - Unmap an interrupt * @virq: linux irq number of the interrupt to unmap */ void irq_dispose_mapping(unsigned int virq) { struct irq_data *irq_data; struct irq_domain *domain; irq_data = virq ? irq_get_irq_data(virq) : NULL; if (!irq_data) return; domain = irq_data->domain; if (WARN_ON(domain == NULL)) return; if (irq_domain_is_hierarchy(domain)) { irq_domain_free_one_irq(domain, virq); } else { irq_domain_disassociate(domain, virq); irq_free_desc(virq); } } EXPORT_SYMBOL_GPL(irq_dispose_mapping); /** * __irq_resolve_mapping() - Find a linux irq from a hw irq number. * @domain: domain owning this hardware interrupt * @hwirq: hardware irq number in that domain space * @irq: optional pointer to return the Linux irq if required * * Returns the interrupt descriptor. */ struct irq_desc *__irq_resolve_mapping(struct irq_domain *domain, irq_hw_number_t hwirq, unsigned int *irq) { struct irq_desc *desc = NULL; struct irq_data *data; /* Look for default domain if necessary */ if (domain == NULL) domain = irq_default_domain; if (domain == NULL) return desc; if (irq_domain_is_nomap(domain)) { if (hwirq < domain->hwirq_max) { data = irq_domain_get_irq_data(domain, hwirq); if (data && data->hwirq == hwirq) desc = irq_data_to_desc(data); if (irq && desc) *irq = hwirq; } return desc; } rcu_read_lock(); /* Check if the hwirq is in the linear revmap. */ if (hwirq < domain->revmap_size) data = rcu_dereference(domain->revmap[hwirq]); else data = radix_tree_lookup(&domain->revmap_tree, hwirq); if (likely(data)) { desc = irq_data_to_desc(data); if (irq) *irq = data->irq; } rcu_read_unlock(); return desc; } EXPORT_SYMBOL_GPL(__irq_resolve_mapping); /** * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings * @d: Interrupt domain involved in the translation * @ctrlr: The device tree node for the device whose interrupt is translated * @intspec: The interrupt specifier data from the device tree * @intsize: The number of entries in @intspec * @out_hwirq: Pointer to storage for the hardware interrupt number * @out_type: Pointer to storage for the interrupt type * * Device Tree IRQ specifier translation function which works with one cell * bindings where the cell value maps directly to the hwirq number. */ int irq_domain_xlate_onecell(struct irq_domain *d, struct device_node *ctrlr, const u32 *intspec, unsigned int intsize, unsigned long *out_hwirq, unsigned int *out_type) { if (WARN_ON(intsize < 1)) return -EINVAL; *out_hwirq = intspec[0]; *out_type = IRQ_TYPE_NONE; return 0; } EXPORT_SYMBOL_GPL(irq_domain_xlate_onecell); /** * irq_domain_xlate_twocell() - Generic xlate for direct two cell bindings * @d: Interrupt domain involved in the translation * @ctrlr: The device tree node for the device whose interrupt is translated * @intspec: The interrupt specifier data from the device tree * @intsize: The number of entries in @intspec * @out_hwirq: Pointer to storage for the hardware interrupt number * @out_type: Pointer to storage for the interrupt type * * Device Tree IRQ specifier translation function which works with two cell * bindings where the cell values map directly to the hwirq number * and linux irq flags. */ int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr, const u32 *intspec, unsigned int intsize, irq_hw_number_t *out_hwirq, unsigned int *out_type) { struct irq_fwspec fwspec; of_phandle_args_to_fwspec(ctrlr, intspec, intsize, &fwspec); return irq_domain_translate_twocell(d, &fwspec, out_hwirq, out_type); } EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell); /** * irq_domain_xlate_twothreecell() - Generic xlate for direct two or three cell bindings * @d: Interrupt domain involved in the translation * @ctrlr: The device tree node for the device whose interrupt is translated * @intspec: The interrupt specifier data from the device tree * @intsize: The number of entries in @intspec * @out_hwirq: Pointer to storage for the hardware interrupt number * @out_type: Pointer to storage for the interrupt type * * Device Tree interrupt specifier translation function for two or three * cell bindings, where the cell values map directly to the hardware * interrupt number and the type specifier. */ int irq_domain_xlate_twothreecell(struct irq_domain *d, struct device_node *ctrlr, const u32 *intspec, unsigned int intsize, irq_hw_number_t *out_hwirq, unsigned int *out_type) { struct irq_fwspec fwspec; of_phandle_args_to_fwspec(ctrlr, intspec, intsize, &fwspec); return irq_domain_translate_twothreecell(d, &fwspec, out_hwirq, out_type); } EXPORT_SYMBOL_GPL(irq_domain_xlate_twothreecell); /** * irq_domain_xlate_onetwocell() - Generic xlate for one or two cell bindings * @d: Interrupt domain involved in the translation * @ctrlr: The device tree node for the device whose interrupt is translated * @intspec: The interrupt specifier data from the device tree * @intsize: The number of entries in @intspec * @out_hwirq: Pointer to storage for the hardware interrupt number * @out_type: Pointer to storage for the interrupt type * * Device Tree IRQ specifier translation function which works with either one * or two cell bindings where the cell values map directly to the hwirq number * and linux irq flags. * * Note: don't use this function unless your interrupt controller explicitly * supports both one and two cell bindings. For the majority of controllers * the _onecell() or _twocell() variants above should be used. */ int irq_domain_xlate_onetwocell(struct irq_domain *d, struct device_node *ctrlr, const u32 *intspec, unsigned int intsize, unsigned long *out_hwirq, unsigned int *out_type) { if (WARN_ON(intsize < 1)) return -EINVAL; *out_hwirq = intspec[0]; if (intsize > 1) *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK; else *out_type = IRQ_TYPE_NONE; return 0; } EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell); const struct irq_domain_ops irq_domain_simple_ops = { .xlate = irq_domain_xlate_onetwocell, }; EXPORT_SYMBOL_GPL(irq_domain_simple_ops); /** * irq_domain_translate_onecell() - Generic translate for direct one cell * bindings * @d: Interrupt domain involved in the translation * @fwspec: The firmware interrupt specifier to translate * @out_hwirq: Pointer to storage for the hardware interrupt number * @out_type: Pointer to storage for the interrupt type */ int irq_domain_translate_onecell(struct irq_domain *d, struct irq_fwspec *fwspec, unsigned long *out_hwirq, unsigned int *out_type) { if (WARN_ON(fwspec->param_count < 1)) return -EINVAL; *out_hwirq = fwspec->param[0]; *out_type = IRQ_TYPE_NONE; return 0; } EXPORT_SYMBOL_GPL(irq_domain_translate_onecell); /** * irq_domain_translate_twocell() - Generic translate for direct two cell * bindings * @d: Interrupt domain involved in the translation * @fwspec: The firmware interrupt specifier to translate * @out_hwirq: Pointer to storage for the hardware interrupt number * @out_type: Pointer to storage for the interrupt type * * Device Tree IRQ specifier translation function which works with two cell * bindings where the cell values map directly to the hwirq number * and linux irq flags. */ int irq_domain_translate_twocell(struct irq_domain *d, struct irq_fwspec *fwspec, unsigned long *out_hwirq, unsigned int *out_type) { if (WARN_ON(fwspec->param_count < 2)) return -EINVAL; *out_hwirq = fwspec->param[0]; *out_type = fwspec->param[1] & IRQ_TYPE_SENSE_MASK; return 0; } EXPORT_SYMBOL_GPL(irq_domain_translate_twocell); /** * irq_domain_translate_twothreecell() - Generic translate for direct two or three cell * bindings * @d: Interrupt domain involved in the translation * @fwspec: The firmware interrupt specifier to translate * @out_hwirq: Pointer to storage for the hardware interrupt number * @out_type: Pointer to storage for the interrupt type * * Firmware interrupt specifier translation function for two or three cell * specifications, where the parameter values map directly to the hardware * interrupt number and the type specifier. */ int irq_domain_translate_twothreecell(struct irq_domain *d, struct irq_fwspec *fwspec, unsigned long *out_hwirq, unsigned int *out_type) { if (fwspec->param_count == 2) { *out_hwirq = fwspec->param[0]; *out_type = fwspec->param[1] & IRQ_TYPE_SENSE_MASK; return 0; } if (fwspec->param_count == 3) { *out_hwirq = fwspec->param[1]; *out_type = fwspec->param[2] & IRQ_TYPE_SENSE_MASK; return 0; } return -EINVAL; } EXPORT_SYMBOL_GPL(irq_domain_translate_twothreecell); int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, int node, const struct irq_affinity_desc *affinity) { unsigned int hint; if (virq >= 0) { virq = __irq_alloc_descs(virq, virq, cnt, node, THIS_MODULE, affinity); } else { hint = hwirq % irq_get_nr_irqs(); if (hint == 0) hint++; virq = __irq_alloc_descs(-1, hint, cnt, node, THIS_MODULE, affinity); if (virq <= 0 && hint > 1) { virq = __irq_alloc_descs(-1, 1, cnt, node, THIS_MODULE, affinity); } } return virq; } /** * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data * @irq_data: The pointer to irq_data */ void irq_domain_reset_irq_data(struct irq_data *irq_data) { irq_data->hwirq = 0; irq_data->chip = &no_irq_chip; irq_data->chip_data = NULL; } EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY static void irq_domain_insert_irq(int virq) { struct irq_data *data; for (data = irq_get_irq_data(virq); data; data = data->parent_data) { struct irq_domain *domain = data->domain; domain->mapcount++; irq_domain_set_mapping(domain, data->hwirq, data); } irq_clear_status_flags(virq, IRQ_NOREQUEST); } static void irq_domain_remove_irq(int virq) { struct irq_data *data; irq_set_status_flags(virq, IRQ_NOREQUEST); irq_set_chip_and_handler(virq, NULL, NULL); synchronize_irq(virq); smp_mb(); for (data = irq_get_irq_data(virq); data; data = data->parent_data) { struct irq_domain *domain = data->domain; irq_hw_number_t hwirq = data->hwirq; domain->mapcount--; irq_domain_clear_mapping(domain, hwirq); } } static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain, struct irq_data *child) { struct irq_data *irq_data; irq_data = kzalloc_node(sizeof(*irq_data), GFP_KERNEL, irq_data_get_node(child)); if (irq_data) { child->parent_data = irq_data; irq_data->irq = child->irq; irq_data->common = child->common; irq_data->domain = domain; } return irq_data; } static void __irq_domain_free_hierarchy(struct irq_data *irq_data) { struct irq_data *tmp; while (irq_data) { tmp = irq_data; irq_data = irq_data->parent_data; kfree(tmp); } } static void irq_domain_free_irq_data(unsigned int virq, unsigned int nr_irqs) { struct irq_data *irq_data, *tmp; int i; for (i = 0; i < nr_irqs; i++) { irq_data = irq_get_irq_data(virq + i); tmp = irq_data->parent_data; irq_data->parent_data = NULL; irq_data->domain = NULL; __irq_domain_free_hierarchy(tmp); } } /** * irq_domain_disconnect_hierarchy - Mark the first unused level of a hierarchy * @domain: IRQ domain from which the hierarchy is to be disconnected * @virq: IRQ number where the hierarchy is to be trimmed * * Marks the @virq level belonging to @domain as disconnected. * Returns -EINVAL if @virq doesn't have a valid irq_data pointing * to @domain. * * Its only use is to be able to trim levels of hierarchy that do not * have any real meaning for this interrupt, and that the driver marks * as such from its .alloc() callback. */ int irq_domain_disconnect_hierarchy(struct irq_domain *domain, unsigned int virq) { struct irq_data *irqd; irqd = irq_domain_get_irq_data(domain, virq); if (!irqd) return -EINVAL; irqd->chip = ERR_PTR(-ENOTCONN); return 0; } EXPORT_SYMBOL_GPL(irq_domain_disconnect_hierarchy); static int irq_domain_trim_hierarchy(unsigned int virq) { struct irq_data *tail, *irqd, *irq_data; irq_data = irq_get_irq_data(virq); tail = NULL; /* The first entry must have a valid irqchip */ if (IS_ERR_OR_NULL(irq_data->chip)) return -EINVAL; /* * Validate that the irq_data chain is sane in the presence of * a hierarchy trimming marker. */ for (irqd = irq_data->parent_data; irqd; irq_data = irqd, irqd = irqd->parent_data) { /* Can't have a valid irqchip after a trim marker */ if (irqd->chip && tail) return -EINVAL; /* Can't have an empty irqchip before a trim marker */ if (!irqd->chip && !tail) return -EINVAL; if (IS_ERR(irqd->chip)) { /* Only -ENOTCONN is a valid trim marker */ if (PTR_ERR(irqd->chip) != -ENOTCONN) return -EINVAL; tail = irq_data; } } /* No trim marker, nothing to do */ if (!tail) return 0; pr_info("IRQ%d: trimming hierarchy from %s\n", virq, tail->parent_data->domain->name); /* Sever the inner part of the hierarchy... */ irqd = tail; tail = tail->parent_data; irqd->parent_data = NULL; __irq_domain_free_hierarchy(tail); return 0; } static int irq_domain_alloc_irq_data(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs) { struct irq_data *irq_data; struct irq_domain *parent; int i; /* The outermost irq_data is embedded in struct irq_desc */ for (i = 0; i < nr_irqs; i++) { irq_data = irq_get_irq_data(virq + i); irq_data->domain = domain; for (parent = domain->parent; parent; parent = parent->parent) { irq_data = irq_domain_insert_irq_data(parent, irq_data); if (!irq_data) { irq_domain_free_irq_data(virq, i + 1); return -ENOMEM; } } } return 0; } /** * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain * @domain: domain to match * @virq: IRQ number to get irq_data */ struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, unsigned int virq) { struct irq_data *irq_data; for (irq_data = irq_get_irq_data(virq); irq_data; irq_data = irq_data->parent_data) if (irq_data->domain == domain) return irq_data; return NULL; } EXPORT_SYMBOL_GPL(irq_domain_get_irq_data); /** * irq_domain_set_hwirq_and_chip - Set hwirq and irqchip of @virq at @domain * @domain: Interrupt domain to match * @virq: IRQ number * @hwirq: The hwirq number * @chip: The associated interrupt chip * @chip_data: The associated chip data */ int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq, const struct irq_chip *chip, void *chip_data) { struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq); if (!irq_data) return -ENOENT; irq_data->hwirq = hwirq; irq_data->chip = (struct irq_chip *)(chip ? chip : &no_irq_chip); irq_data->chip_data = chip_data; return 0; } EXPORT_SYMBOL_GPL(irq_domain_set_hwirq_and_chip); /** * irq_domain_set_info - Set the complete data for a @virq in @domain * @domain: Interrupt domain to match * @virq: IRQ number * @hwirq: The hardware interrupt number * @chip: The associated interrupt chip * @chip_data: The associated interrupt chip data * @handler: The interrupt flow handler * @handler_data: The interrupt flow handler data * @handler_name: The interrupt handler name */ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq, const struct irq_chip *chip, void *chip_data, irq_flow_handler_t handler, void *handler_data, const char *handler_name) { irq_domain_set_hwirq_and_chip(domain, virq, hwirq, chip, chip_data); __irq_set_handler(virq, handler, 0, handler_name); irq_set_handler_data(virq, handler_data); } EXPORT_SYMBOL(irq_domain_set_info); /** * irq_domain_free_irqs_common - Clear irq_data and free the parent * @domain: Interrupt domain to match * @virq: IRQ number to start with * @nr_irqs: The number of irqs to free */ void irq_domain_free_irqs_common(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs) { struct irq_data *irq_data; int i; for (i = 0; i < nr_irqs; i++) { irq_data = irq_domain_get_irq_data(domain, virq + i); if (irq_data) irq_domain_reset_irq_data(irq_data); } irq_domain_free_irqs_parent(domain, virq, nr_irqs); } EXPORT_SYMBOL_GPL(irq_domain_free_irqs_common); /** * irq_domain_free_irqs_top - Clear handler and handler data, clear irqdata and free parent * @domain: Interrupt domain to match * @virq: IRQ number to start with * @nr_irqs: The number of irqs to free */ void irq_domain_free_irqs_top(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs) { int i; for (i = 0; i < nr_irqs; i++) { irq_set_handler_data(virq + i, NULL); irq_set_handler(virq + i, NULL); } irq_domain_free_irqs_common(domain, virq, nr_irqs); } EXPORT_SYMBOL_GPL(irq_domain_free_irqs_top); static void irq_domain_free_irqs_hierarchy(struct irq_domain *domain, unsigned int irq_base, unsigned int nr_irqs) { unsigned int i; if (!domain->ops->free) return; for (i = 0; i < nr_irqs; i++) { if (irq_domain_get_irq_data(domain, irq_base + i)) domain->ops->free(domain, irq_base + i, 1); } } static int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, unsigned int irq_base, unsigned int nr_irqs, void *arg) { if (!domain->ops->alloc) { pr_debug("domain->ops->alloc() is NULL\n"); return -ENOSYS; } return domain->ops->alloc(domain, irq_base, nr_irqs, arg); } static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base, unsigned int nr_irqs, int node, void *arg, bool realloc, const struct irq_affinity_desc *affinity) { int i, ret, virq; if (realloc && irq_base >= 0) { virq = irq_base; } else { virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node, affinity); if (virq < 0) { pr_debug("cannot allocate IRQ(base %d, count %d)\n", irq_base, nr_irqs); return virq; } } if (irq_domain_alloc_irq_data(domain, virq, nr_irqs)) { pr_debug("cannot allocate memory for IRQ%d\n", virq); ret = -ENOMEM; goto out_free_desc; } ret = irq_domain_alloc_irqs_hierarchy(domain, virq, nr_irqs, arg); if (ret < 0) goto out_free_irq_data; for (i = 0; i < nr_irqs; i++) { ret = irq_domain_trim_hierarchy(virq + i); if (ret) goto out_free_irq_data; } for (i = 0; i < nr_irqs; i++) irq_domain_insert_irq(virq + i); return virq; out_free_irq_data: irq_domain_free_irq_data(virq, nr_irqs); out_free_desc: irq_free_descs(virq, nr_irqs); return ret; } /** * __irq_domain_alloc_irqs - Allocate IRQs from domain * @domain: domain to allocate from * @irq_base: allocate specified IRQ number if irq_base >= 0 * @nr_irqs: number of IRQs to allocate * @node: NUMA node id for memory allocation * @arg: domain specific argument * @realloc: IRQ descriptors have already been allocated if true * @affinity: Optional irq affinity mask for multiqueue devices * * Allocate IRQ numbers and initialized all data structures to support * hierarchy IRQ domains. * Parameter @realloc is mainly to support legacy IRQs. * Returns error code or allocated IRQ number * * The whole process to setup an IRQ has been split into two steps. * The first step, __irq_domain_alloc_irqs(), is to allocate IRQ * descriptor and required hardware resources. The second step, * irq_domain_activate_irq(), is to program the hardware with preallocated * resources. In this way, it's easier to rollback when failing to * allocate resources. */ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, unsigned int nr_irqs, int node, void *arg, bool realloc, const struct irq_affinity_desc *affinity) { int ret; if (domain == NULL) { domain = irq_default_domain; if (WARN(!domain, "domain is NULL; cannot allocate IRQ\n")) return -EINVAL; } mutex_lock(&domain->root->mutex); ret = irq_domain_alloc_irqs_locked(domain, irq_base, nr_irqs, node, arg, realloc, affinity); mutex_unlock(&domain->root->mutex); return ret; } EXPORT_SYMBOL_GPL(__irq_domain_alloc_irqs); /* The irq_data was moved, fix the revmap to refer to the new location */ static void irq_domain_fix_revmap(struct irq_data *d) { void __rcu **slot; lockdep_assert_held(&d->domain->root->mutex); if (irq_domain_is_nomap(d->domain)) return; /* Fix up the revmap. */ if (d->hwirq < d->domain->revmap_size) { /* Not using radix tree */ rcu_assign_pointer(d->domain->revmap[d->hwirq], d); } else { slot = radix_tree_lookup_slot(&d->domain->revmap_tree, d->hwirq); if (slot) radix_tree_replace_slot(&d->domain->revmap_tree, slot, d); } } /** * irq_domain_push_irq() - Push a domain in to the top of a hierarchy. * @domain: Domain to push. * @virq: Irq to push the domain in to. * @arg: Passed to the irq_domain_ops alloc() function. * * For an already existing irqdomain hierarchy, as might be obtained * via a call to pci_enable_msix(), add an additional domain to the * head of the processing chain. Must be called before request_irq() * has been called. */ int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg) { struct irq_data *irq_data = irq_get_irq_data(virq); struct irq_data *parent_irq_data; struct irq_desc *desc; int rv = 0; /* * Check that no action has been set, which indicates the virq * is in a state where this function doesn't have to deal with * races between interrupt handling and maintaining the * hierarchy. This will catch gross misuse. Attempting to * make the check race free would require holding locks across * calls to struct irq_domain_ops->alloc(), which could lead * to deadlock, so we just do a simple check before starting. */ desc = irq_to_desc(virq); if (!desc) return -EINVAL; if (WARN_ON(desc->action)) return -EBUSY; if (domain == NULL) return -EINVAL; if (WARN_ON(!irq_domain_is_hierarchy(domain))) return -EINVAL; if (!irq_data) return -EINVAL; if (domain->parent != irq_data->domain) return -EINVAL; parent_irq_data = kzalloc_node(sizeof(*parent_irq_data), GFP_KERNEL, irq_data_get_node(irq_data)); if (!parent_irq_data) return -ENOMEM; mutex_lock(&domain->root->mutex); /* Copy the original irq_data. */ *parent_irq_data = *irq_data; /* * Overwrite the irq_data, which is embedded in struct irq_desc, with * values for this domain. */ irq_data->parent_data = parent_irq_data; irq_data->domain = domain; irq_data->mask = 0; irq_data->hwirq = 0; irq_data->chip = NULL; irq_data->chip_data = NULL; /* May (probably does) set hwirq, chip, etc. */ rv = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg); if (rv) { /* Restore the original irq_data. */ *irq_data = *parent_irq_data; kfree(parent_irq_data); goto error; } irq_domain_fix_revmap(parent_irq_data); irq_domain_set_mapping(domain, irq_data->hwirq, irq_data); error: mutex_unlock(&domain->root->mutex); return rv; } EXPORT_SYMBOL_GPL(irq_domain_push_irq); /** * irq_domain_pop_irq() - Remove a domain from the top of a hierarchy. * @domain: Domain to remove. * @virq: Irq to remove the domain from. * * Undo the effects of a call to irq_domain_push_irq(). Must be * called either before request_irq() or after free_irq(). */ int irq_domain_pop_irq(struct irq_domain *domain, int virq) { struct irq_data *irq_data = irq_get_irq_data(virq); struct irq_data *parent_irq_data; struct irq_data *tmp_irq_data; struct irq_desc *desc; /* * Check that no action is set, which indicates the virq is in * a state where this function doesn't have to deal with races * between interrupt handling and maintaining the hierarchy. * This will catch gross misuse. Attempting to make the check * race free would require holding locks across calls to * struct irq_domain_ops->free(), which could lead to * deadlock, so we just do a simple check before starting. */ desc = irq_to_desc(virq); if (!desc) return -EINVAL; if (WARN_ON(desc->action)) return -EBUSY; if (domain == NULL) return -EINVAL; if (!irq_data) return -EINVAL; tmp_irq_data = irq_domain_get_irq_data(domain, virq); /* We can only "pop" if this domain is at the top of the list */ if (WARN_ON(irq_data != tmp_irq_data)) return -EINVAL; if (WARN_ON(irq_data->domain != domain)) return -EINVAL; parent_irq_data = irq_data->parent_data; if (WARN_ON(!parent_irq_data)) return -EINVAL; mutex_lock(&domain->root->mutex); irq_data->parent_data = NULL; irq_domain_clear_mapping(domain, irq_data->hwirq); irq_domain_free_irqs_hierarchy(domain, virq, 1); /* Restore the original irq_data. */ *irq_data = *parent_irq_data; irq_domain_fix_revmap(irq_data); mutex_unlock(&domain->root->mutex); kfree(parent_irq_data); return 0; } EXPORT_SYMBOL_GPL(irq_domain_pop_irq); /** * irq_domain_free_irqs - Free IRQ number and associated data structures * @virq: base IRQ number * @nr_irqs: number of IRQs to free */ void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs) { struct irq_data *data = irq_get_irq_data(virq); struct irq_domain *domain; int i; if (WARN(!data || !data->domain || !data->domain->ops->free, "NULL pointer, cannot free irq\n")) return; domain = data->domain; mutex_lock(&domain->root->mutex); for (i = 0; i < nr_irqs; i++) irq_domain_remove_irq(virq + i); irq_domain_free_irqs_hierarchy(domain, virq, nr_irqs); mutex_unlock(&domain->root->mutex); irq_domain_free_irq_data(virq, nr_irqs); irq_free_descs(virq, nr_irqs); } static void irq_domain_free_one_irq(struct irq_domain *domain, unsigned int virq) { if (irq_domain_is_msi_device(domain)) msi_device_domain_free_wired(domain, virq); else irq_domain_free_irqs(virq, 1); } /** * irq_domain_alloc_irqs_parent - Allocate interrupts from parent domain * @domain: Domain below which interrupts must be allocated * @irq_base: Base IRQ number * @nr_irqs: Number of IRQs to allocate * @arg: Allocation data (arch/domain specific) */ int irq_domain_alloc_irqs_parent(struct irq_domain *domain, unsigned int irq_base, unsigned int nr_irqs, void *arg) { if (!domain->parent) return -ENOSYS; return irq_domain_alloc_irqs_hierarchy(domain->parent, irq_base, nr_irqs, arg); } EXPORT_SYMBOL_GPL(irq_domain_alloc_irqs_parent); /** * irq_domain_free_irqs_parent - Free interrupts from parent domain * @domain: Domain below which interrupts must be freed * @irq_base: Base IRQ number * @nr_irqs: Number of IRQs to free */ void irq_domain_free_irqs_parent(struct irq_domain *domain, unsigned int irq_base, unsigned int nr_irqs) { if (!domain->parent) return; irq_domain_free_irqs_hierarchy(domain->parent, irq_base, nr_irqs); } EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent); static void __irq_domain_deactivate_irq(struct irq_data *irq_data) { if (irq_data && irq_data->domain) { struct irq_domain *domain = irq_data->domain; if (domain->ops->deactivate) domain->ops->deactivate(domain, irq_data); if (irq_data->parent_data) __irq_domain_deactivate_irq(irq_data->parent_data); } } static int __irq_domain_activate_irq(struct irq_data *irqd, bool reserve) { int ret = 0; if (irqd && irqd->domain) { struct irq_domain *domain = irqd->domain; if (irqd->parent_data) ret = __irq_domain_activate_irq(irqd->parent_data, reserve); if (!ret && domain->ops->activate) { ret = domain->ops->activate(domain, irqd, reserve); /* Rollback in case of error */ if (ret && irqd->parent_data) __irq_domain_deactivate_irq(irqd->parent_data); } } return ret; } /** * irq_domain_activate_irq - Call domain_ops->activate recursively to activate * interrupt * @irq_data: Outermost irq_data associated with interrupt * @reserve: If set only reserve an interrupt vector instead of assigning one * * This is the second step to call domain_ops->activate to program interrupt * controllers, so the interrupt could actually get delivered. */ int irq_domain_activate_irq(struct irq_data *irq_data, bool reserve) { int ret = 0; if (!irqd_is_activated(irq_data)) ret = __irq_domain_activate_irq(irq_data, reserve); if (!ret) irqd_set_activated(irq_data); return ret; } /** * irq_domain_deactivate_irq - Call domain_ops->deactivate recursively to * deactivate interrupt * @irq_data: outermost irq_data associated with interrupt * * It calls domain_ops->deactivate to program interrupt controllers to disable * interrupt delivery. */ void irq_domain_deactivate_irq(struct irq_data *irq_data) { if (irqd_is_activated(irq_data)) { __irq_domain_deactivate_irq(irq_data); irqd_clr_activated(irq_data); } } static void irq_domain_check_hierarchy(struct irq_domain *domain) { /* Hierarchy irq_domains must implement callback alloc() */ if (domain->ops->alloc) domain->flags |= IRQ_DOMAIN_FLAG_HIERARCHY; } #else /* CONFIG_IRQ_DOMAIN_HIERARCHY */ /* * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain * @domain: domain to match * @virq: IRQ number to get irq_data */ struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, unsigned int virq) { struct irq_data *irq_data = irq_get_irq_data(virq); return (irq_data && irq_data->domain == domain) ? irq_data : NULL; } EXPORT_SYMBOL_GPL(irq_domain_get_irq_data); /* * irq_domain_set_info - Set the complete data for a @virq in @domain * @domain: Interrupt domain to match * @virq: IRQ number * @hwirq: The hardware interrupt number * @chip: The associated interrupt chip * @chip_data: The associated interrupt chip data * @handler: The interrupt flow handler * @handler_data: The interrupt flow handler data * @handler_name: The interrupt handler name */ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq, const struct irq_chip *chip, void *chip_data, irq_flow_handler_t handler, void *handler_data, const char *handler_name) { irq_set_chip_and_handler_name(virq, chip, handler, handler_name); irq_set_chip_data(virq, chip_data); irq_set_handler_data(virq, handler_data); } static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base, unsigned int nr_irqs, int node, void *arg, bool realloc, const struct irq_affinity_desc *affinity) { return -EINVAL; } static void irq_domain_check_hierarchy(struct irq_domain *domain) { } static void irq_domain_free_one_irq(struct irq_domain *domain, unsigned int virq) { } #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ #ifdef CONFIG_GENERIC_IRQ_DEBUGFS #include "internals.h" static struct dentry *domain_dir; static const struct irq_bit_descr irqdomain_flags[] = { BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_HIERARCHY), BIT_MASK_DESCR(IRQ_DOMAIN_NAME_ALLOCATED), BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_IPI_PER_CPU), BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_IPI_SINGLE), BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_MSI), BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_ISOLATED_MSI), BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_NO_MAP), BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_MSI_PARENT), BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_MSI_DEVICE), BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_NONCORE), }; static void irq_domain_debug_show_one(struct seq_file *m, struct irq_domain *d, int ind) { seq_printf(m, "%*sname: %s\n", ind, "", d->name); seq_printf(m, "%*ssize: %u\n", ind + 1, "", d->revmap_size); seq_printf(m, "%*smapped: %u\n", ind + 1, "", d->mapcount); seq_printf(m, "%*sflags: 0x%08x\n", ind +1 , "", d->flags); irq_debug_show_bits(m, ind, d->flags, irqdomain_flags, ARRAY_SIZE(irqdomain_flags)); if (d->ops && d->ops->debug_show) d->ops->debug_show(m, d, NULL, ind + 1); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY if (!d->parent) return; seq_printf(m, "%*sparent: %s\n", ind + 1, "", d->parent->name); irq_domain_debug_show_one(m, d->parent, ind + 4); #endif } static int irq_domain_debug_show(struct seq_file *m, void *p) { struct irq_domain *d = m->private; /* Default domain? Might be NULL */ if (!d) { if (!irq_default_domain) return 0; d = irq_default_domain; } irq_domain_debug_show_one(m, d, 0); return 0; } DEFINE_SHOW_ATTRIBUTE(irq_domain_debug); static void debugfs_add_domain_dir(struct irq_domain *d) { if (!d->name || !domain_dir) return; debugfs_create_file(d->name, 0444, domain_dir, d, &irq_domain_debug_fops); } static void debugfs_remove_domain_dir(struct irq_domain *d) { debugfs_lookup_and_remove(d->name, domain_dir); } void __init irq_domain_debugfs_init(struct dentry *root) { struct irq_domain *d; domain_dir = debugfs_create_dir("domains", root); debugfs_create_file("default", 0444, domain_dir, NULL, &irq_domain_debug_fops); mutex_lock(&irq_domain_mutex); list_for_each_entry(d, &irq_domain_list, link) debugfs_add_domain_dir(d); mutex_unlock(&irq_domain_mutex); } #endif
3 3 6 6 6 6 6 1 2 1 1 1 1 1 1 1 1 6 5 1 4 3 1 3 1 1 1 4 3 4 2 6 6 1 8 1 7 7 2 2 2 2 1 2 2 2 2 2 20 20 20 20 14 13 7 6 1 6 3 6 6 6 6 7 6 7 6 1 6 5 6 6 6 7 3 4 3 1 3 2 2 2 2 3 3 2 1 1 1 1 1 1 1 8 8 7 7 1 8 8 7 7 1 5 5 5 5 4 1 4 1 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2015 Nicira, Inc. */ #include <linux/module.h> #include <linux/openvswitch.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/sctp.h> #include <linux/static_key.h> #include <linux/string_helpers.h> #include <net/ip.h> #include <net/genetlink.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_count.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_labels.h> #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> #include <net/ipv6_frag.h> #if IS_ENABLED(CONFIG_NF_NAT) #include <net/netfilter/nf_nat.h> #endif #include <net/netfilter/nf_conntrack_act_ct.h> #include "datapath.h" #include "drop.h" #include "conntrack.h" #include "flow.h" #include "flow_netlink.h" struct ovs_ct_len_tbl { int maxlen; int minlen; }; /* Metadata mark for masked write to conntrack mark */ struct md_mark { u32 value; u32 mask; }; /* Metadata label for masked write to conntrack label. */ struct md_labels { struct ovs_key_ct_labels value; struct ovs_key_ct_labels mask; }; enum ovs_ct_nat { OVS_CT_NAT = 1 << 0, /* NAT for committed connections only. */ OVS_CT_SRC_NAT = 1 << 1, /* Source NAT for NEW connections. */ OVS_CT_DST_NAT = 1 << 2, /* Destination NAT for NEW connections. */ }; /* Conntrack action context for execution. */ struct ovs_conntrack_info { struct nf_conntrack_helper *helper; struct nf_conntrack_zone zone; struct nf_conn *ct; u8 commit : 1; u8 nat : 3; /* enum ovs_ct_nat */ u8 force : 1; u8 have_eventmask : 1; u16 family; u32 eventmask; /* Mask of 1 << IPCT_*. */ struct md_mark mark; struct md_labels labels; char timeout[CTNL_TIMEOUT_NAME_MAX]; struct nf_ct_timeout *nf_ct_timeout; #if IS_ENABLED(CONFIG_NF_NAT) struct nf_nat_range2 range; /* Only present for SRC NAT and DST NAT. */ #endif }; #if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) #define OVS_CT_LIMIT_UNLIMITED 0 #define OVS_CT_LIMIT_DEFAULT OVS_CT_LIMIT_UNLIMITED #define CT_LIMIT_HASH_BUCKETS 512 static DEFINE_STATIC_KEY_FALSE(ovs_ct_limit_enabled); struct ovs_ct_limit { /* Elements in ovs_ct_limit_info->limits hash table */ struct hlist_node hlist_node; struct rcu_head rcu; u16 zone; u32 limit; }; struct ovs_ct_limit_info { u32 default_limit; struct hlist_head *limits; struct nf_conncount_data *data; }; static const struct nla_policy ct_limit_policy[OVS_CT_LIMIT_ATTR_MAX + 1] = { [OVS_CT_LIMIT_ATTR_ZONE_LIMIT] = { .type = NLA_NESTED, }, }; #endif static bool labels_nonzero(const struct ovs_key_ct_labels *labels); static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info); static u16 key_to_nfproto(const struct sw_flow_key *key) { switch (ntohs(key->eth.type)) { case ETH_P_IP: return NFPROTO_IPV4; case ETH_P_IPV6: return NFPROTO_IPV6; default: return NFPROTO_UNSPEC; } } /* Map SKB connection state into the values used by flow definition. */ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo) { u8 ct_state = OVS_CS_F_TRACKED; switch (ctinfo) { case IP_CT_ESTABLISHED_REPLY: case IP_CT_RELATED_REPLY: ct_state |= OVS_CS_F_REPLY_DIR; break; default: break; } switch (ctinfo) { case IP_CT_ESTABLISHED: case IP_CT_ESTABLISHED_REPLY: ct_state |= OVS_CS_F_ESTABLISHED; break; case IP_CT_RELATED: case IP_CT_RELATED_REPLY: ct_state |= OVS_CS_F_RELATED; break; case IP_CT_NEW: ct_state |= OVS_CS_F_NEW; break; default: break; } return ct_state; } static u32 ovs_ct_get_mark(const struct nf_conn *ct) { #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) return ct ? READ_ONCE(ct->mark) : 0; #else return 0; #endif } /* Guard against conntrack labels max size shrinking below 128 bits. */ #if NF_CT_LABELS_MAX_SIZE < 16 #error NF_CT_LABELS_MAX_SIZE must be at least 16 bytes #endif static void ovs_ct_get_labels(const struct nf_conn *ct, struct ovs_key_ct_labels *labels) { struct nf_conn_labels *cl = NULL; if (ct) { if (ct->master && !nf_ct_is_confirmed(ct)) ct = ct->master; cl = nf_ct_labels_find(ct); } if (cl) memcpy(labels, cl->bits, OVS_CT_LABELS_LEN); else memset(labels, 0, OVS_CT_LABELS_LEN); } static void __ovs_ct_update_key_orig_tp(struct sw_flow_key *key, const struct nf_conntrack_tuple *orig, u8 icmp_proto) { key->ct_orig_proto = orig->dst.protonum; if (orig->dst.protonum == icmp_proto) { key->ct.orig_tp.src = htons(orig->dst.u.icmp.type); key->ct.orig_tp.dst = htons(orig->dst.u.icmp.code); } else { key->ct.orig_tp.src = orig->src.u.all; key->ct.orig_tp.dst = orig->dst.u.all; } } static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state, const struct nf_conntrack_zone *zone, const struct nf_conn *ct) { key->ct_state = state; key->ct_zone = zone->id; key->ct.mark = ovs_ct_get_mark(ct); ovs_ct_get_labels(ct, &key->ct.labels); if (ct) { const struct nf_conntrack_tuple *orig; /* Use the master if we have one. */ if (ct->master) ct = ct->master; orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; /* IP version must match with the master connection. */ if (key->eth.type == htons(ETH_P_IP) && nf_ct_l3num(ct) == NFPROTO_IPV4) { key->ipv4.ct_orig.src = orig->src.u3.ip; key->ipv4.ct_orig.dst = orig->dst.u3.ip; __ovs_ct_update_key_orig_tp(key, orig, IPPROTO_ICMP); return; } else if (key->eth.type == htons(ETH_P_IPV6) && !sw_flow_key_is_nd(key) && nf_ct_l3num(ct) == NFPROTO_IPV6) { key->ipv6.ct_orig.src = orig->src.u3.in6; key->ipv6.ct_orig.dst = orig->dst.u3.in6; __ovs_ct_update_key_orig_tp(key, orig, NEXTHDR_ICMP); return; } } /* Clear 'ct_orig_proto' to mark the non-existence of conntrack * original direction key fields. */ key->ct_orig_proto = 0; } /* Update 'key' based on sk