Total coverage: 375543 (18%)of 2093325
7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. */ #include "xfs_platform.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_ialloc.h" #include "xfs_alloc.h" #include "xfs_error.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_log.h" #include "xfs_rmap_btree.h" #include "xfs_refcount_btree.h" #include "xfs_da_format.h" #include "xfs_health.h" #include "xfs_ag.h" #include "xfs_rtbitmap.h" #include "xfs_exchrange.h" #include "xfs_rtgroup.h" #include "xfs_rtrmap_btree.h" #include "xfs_rtrefcount_btree.h" /* * Physical superblock buffer manipulations. Shared with libxfs in userspace. */ /* * Check that all the V4 feature bits that the V5 filesystem format requires are * correctly set. */ static bool xfs_sb_validate_v5_features( struct xfs_sb *sbp) { /* We must not have any unknown V4 feature bits set */ if (sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) return false; /* * The CRC bit is considered an invalid V4 flag, so we have to add it * manually to the OKBITS mask. */ if (sbp->sb_features2 & ~(XFS_SB_VERSION2_OKBITS | XFS_SB_VERSION2_CRCBIT)) return false; /* Now check all the required V4 feature flags are set. */ #define V5_VERS_FLAGS (XFS_SB_VERSION_NLINKBIT | \ XFS_SB_VERSION_ALIGNBIT | \ XFS_SB_VERSION_LOGV2BIT | \ XFS_SB_VERSION_EXTFLGBIT | \ XFS_SB_VERSION_DIRV2BIT | \ XFS_SB_VERSION_MOREBITSBIT) #define V5_FEAT_FLAGS (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ XFS_SB_VERSION2_ATTR2BIT | \ XFS_SB_VERSION2_PROJID32BIT | \ XFS_SB_VERSION2_CRCBIT) if ((sbp->sb_versionnum & V5_VERS_FLAGS) != V5_VERS_FLAGS) return false; if ((sbp->sb_features2 & V5_FEAT_FLAGS) != V5_FEAT_FLAGS) return false; return true; } /* * We current support XFS v5 formats with known features and v4 superblocks with * at least V2 directories. */ bool xfs_sb_good_version( struct xfs_sb *sbp) { /* * All v5 filesystems are supported, but we must check that all the * required v4 feature flags are enabled correctly as the code checks * those flags and not for v5 support. */ if (xfs_sb_is_v5(sbp)) return xfs_sb_validate_v5_features(sbp); /* versions prior to v4 are not supported */ if (XFS_SB_VERSION_NUM(sbp) != XFS_SB_VERSION_4) return false; /* We must not have any unknown v4 feature bits set */ if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) || ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS))) return false; /* V4 filesystems need v2 directories and unwritten extents */ if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT)) return false; if (!(sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT)) return false; /* It's a supported v4 filesystem */ return true; } uint64_t xfs_sb_version_to_features( struct xfs_sb *sbp) { uint64_t features = 0; /* optional V4 features */ if (sbp->sb_rblocks > 0) features |= XFS_FEAT_REALTIME; if (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT) features |= XFS_FEAT_NLINK; if (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT) features |= XFS_FEAT_ATTR; if (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT) features |= XFS_FEAT_QUOTA; if (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT) features |= XFS_FEAT_ALIGN; if (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT) features |= XFS_FEAT_LOGV2; if (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT) features |= XFS_FEAT_DALIGN; if (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT) features |= XFS_FEAT_EXTFLG; if (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT) features |= XFS_FEAT_SECTOR; if (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT) features |= XFS_FEAT_ASCIICI; if (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) { if (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT) features |= XFS_FEAT_LAZYSBCOUNT; if (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT) features |= XFS_FEAT_PROJID32; if (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE) features |= XFS_FEAT_FTYPE; } if (!xfs_sb_is_v5(sbp)) return features; /* Always on V5 features */ features |= XFS_FEAT_ALIGN | XFS_FEAT_LOGV2 | XFS_FEAT_EXTFLG | XFS_FEAT_LAZYSBCOUNT | XFS_FEAT_PROJID32 | XFS_FEAT_V3INODES | XFS_FEAT_CRC | XFS_FEAT_PQUOTINO; /* Optional V5 features */ if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT) features |= XFS_FEAT_FINOBT; if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_RMAPBT) features |= XFS_FEAT_RMAPBT; if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_REFLINK) features |= XFS_FEAT_REFLINK; if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT) features |= XFS_FEAT_INOBTCNT; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_FTYPE) features |= XFS_FEAT_FTYPE; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES) features |= XFS_FEAT_SPINODES; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID) features |= XFS_FEAT_META_UUID; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_BIGTIME) features |= XFS_FEAT_BIGTIME; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR) features |= XFS_FEAT_NEEDSREPAIR; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NREXT64) features |= XFS_FEAT_NREXT64; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_EXCHRANGE) features |= XFS_FEAT_EXCHANGE_RANGE; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_PARENT) features |= XFS_FEAT_PARENT; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) features |= XFS_FEAT_METADIR; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) features |= XFS_FEAT_ZONED; return features; } /* Check all the superblock fields we care about when reading one in. */ STATIC int xfs_validate_sb_read( struct xfs_mount *mp, struct xfs_sb *sbp) { if (!xfs_sb_is_v5(sbp)) return 0; /* * Version 5 superblock feature mask validation. Reject combinations * the kernel cannot support up front before checking anything else. */ if (xfs_sb_has_compat_feature(sbp, XFS_SB_FEAT_COMPAT_UNKNOWN)) { xfs_warn(mp, "Superblock has unknown compatible features (0x%x) enabled.", (sbp->sb_features_compat & XFS_SB_FEAT_COMPAT_UNKNOWN)); xfs_warn(mp, "Using a more recent kernel is recommended."); } if (xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { xfs_alert(mp, "Superblock has unknown read-only compatible features (0x%x) enabled.", (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_UNKNOWN)); if (!xfs_is_readonly(mp)) { xfs_warn(mp, "Attempted to mount read-only compatible filesystem read-write."); xfs_warn(mp, "Filesystem can only be safely mounted read only."); return -EINVAL; } } if (xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_UNKNOWN)) { xfs_warn(mp, "Superblock has unknown incompatible features (0x%x) enabled.", (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_UNKNOWN)); xfs_warn(mp, "Filesystem cannot be safely mounted by this kernel."); return -EINVAL; } return 0; } /* Return the number of extents covered by a single rt bitmap file */ static xfs_rtbxlen_t xfs_extents_per_rbm( struct xfs_sb *sbp) { if (xfs_sb_is_v5(sbp) && (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) return sbp->sb_rgextents; return sbp->sb_rextents; } /* * Return the payload size of a single rt bitmap block (without the metadata * header if any). */ static inline unsigned int xfs_rtbmblock_size( struct xfs_sb *sbp) { if (xfs_sb_is_v5(sbp) && (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) return sbp->sb_blocksize - sizeof(struct xfs_rtbuf_blkinfo); return sbp->sb_blocksize; } static uint64_t xfs_expected_rbmblocks( struct xfs_sb *sbp) { if (xfs_sb_is_v5(sbp) && (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)) return 0; return howmany_64(xfs_extents_per_rbm(sbp), NBBY * xfs_rtbmblock_size(sbp)); } /* Validate the realtime geometry */ bool xfs_validate_rt_geometry( struct xfs_sb *sbp) { if (xfs_sb_is_v5(sbp) && (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)) { if (sbp->sb_rextsize != 1) return false; } else { if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE || sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) return false; } if (sbp->sb_rblocks == 0) { if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 || sbp->sb_rextslog != 0 || sbp->sb_frextents != 0) return false; return true; } if (sbp->sb_rextents == 0 || sbp->sb_rextents != div_u64(sbp->sb_rblocks, sbp->sb_rextsize) || sbp->sb_rextslog != xfs_compute_rextslog(sbp->sb_rextents) || sbp->sb_rbmblocks != xfs_expected_rbmblocks(sbp)) return false; if (xfs_sb_is_v5(sbp) && (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)) { uint32_t mod; /* * Zoned RT devices must be aligned to the RT group size, * because garbage collection assumes that all zones have the * same size to avoid insane complexity if that weren't the * case. */ div_u64_rem(sbp->sb_rextents, sbp->sb_rgextents, &mod); if (mod) return false; } return true; } /* Check all the superblock fields we care about when writing one out. */ STATIC int xfs_validate_sb_write( struct xfs_mount *mp, struct xfs_buf *bp, struct xfs_sb *sbp) { /* * Carry out additional sb summary counter sanity checks when we write * the superblock. We skip this in the read validator because there * could be newer superblocks in the log and if the values are garbage * even after replay we'll recalculate them at the end of log mount. * * mkfs has traditionally written zeroed counters to inprogress and * secondary superblocks, so allow this usage to continue because * we never read counters from such superblocks. */ if (xfs_buf_daddr(bp) == XFS_SB_DADDR && !sbp->sb_inprogress && (sbp->sb_fdblocks > sbp->sb_dblocks || !xfs_verify_icount(mp, sbp->sb_icount) || sbp->sb_ifree > sbp->sb_icount)) { xfs_warn(mp, "SB summary counter sanity check failed"); return -EFSCORRUPTED; } if (!xfs_sb_is_v5(sbp)) return 0; /* * Version 5 superblock feature mask validation. Reject combinations * the kernel cannot support since we checked for unsupported bits in * the read verifier, which means that memory is corrupt. */ if (!xfs_is_readonly(mp) && xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { xfs_alert(mp, "Corruption detected in superblock read-only compatible features (0x%x)!", (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_UNKNOWN)); return -EFSCORRUPTED; } if (xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_UNKNOWN)) { xfs_warn(mp, "Corruption detected in superblock incompatible features (0x%x)!", (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_UNKNOWN)); return -EFSCORRUPTED; } if (xfs_sb_has_incompat_log_feature(sbp, XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) { xfs_warn(mp, "Corruption detected in superblock incompatible log features (0x%x)!", (sbp->sb_features_log_incompat & XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)); return -EFSCORRUPTED; } /* * We can't read verify the sb LSN because the read verifier is called * before the log is allocated and processed. We know the log is set up * before write verifier calls, so check it here. */ if (!xfs_log_check_lsn(mp, sbp->sb_lsn)) return -EFSCORRUPTED; return 0; } int xfs_compute_rgblklog( xfs_rtxlen_t rgextents, xfs_rgblock_t rextsize) { uint64_t rgblocks = (uint64_t)rgextents * rextsize; return xfs_highbit64(rgblocks - 1) + 1; } static int xfs_validate_sb_rtgroups( struct xfs_mount *mp, struct xfs_sb *sbp) { uint64_t groups; int rgblklog; if (sbp->sb_rextsize == 0) { xfs_warn(mp, "Realtime extent size must not be zero."); return -EINVAL; } if (sbp->sb_rgextents > XFS_MAX_RGBLOCKS / sbp->sb_rextsize) { xfs_warn(mp, "Realtime group size (%u) must be less than %u rt extents.", sbp->sb_rgextents, XFS_MAX_RGBLOCKS / sbp->sb_rextsize); return -EINVAL; } if (sbp->sb_rgextents < XFS_MIN_RGEXTENTS) { xfs_warn(mp, "Realtime group size (%u) must be at least %u rt extents.", sbp->sb_rgextents, XFS_MIN_RGEXTENTS); return -EINVAL; } if (sbp->sb_rgcount > XFS_MAX_RGNUMBER) { xfs_warn(mp, "Realtime groups (%u) must be less than %u.", sbp->sb_rgcount, XFS_MAX_RGNUMBER); return -EINVAL; } groups = howmany_64(sbp->sb_rextents, sbp->sb_rgextents); if (groups != sbp->sb_rgcount) { xfs_warn(mp, "Realtime groups (%u) do not cover the entire rt section; need (%llu) groups.", sbp->sb_rgcount, groups); return -EINVAL; } /* Exchange-range is required for fsr to work on realtime files */ if (!(sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_EXCHRANGE)) { xfs_warn(mp, "Realtime groups feature requires exchange-range support."); return -EINVAL; } rgblklog = xfs_compute_rgblklog(sbp->sb_rgextents, sbp->sb_rextsize); if (sbp->sb_rgblklog != rgblklog) { xfs_warn(mp, "Realtime group log (%d) does not match expected value (%d).", sbp->sb_rgblklog, rgblklog); return -EINVAL; } return 0; } static int xfs_validate_sb_zoned( struct xfs_mount *mp, struct xfs_sb *sbp) { if (sbp->sb_frextents != 0) { xfs_warn(mp, "sb_frextents must be zero for zoned file systems."); return -EINVAL; } if (sbp->sb_rtstart && sbp->sb_rtstart < sbp->sb_dblocks) { xfs_warn(mp, "sb_rtstart (%lld) overlaps sb_dblocks (%lld).", sbp->sb_rtstart, sbp->sb_dblocks); return -EINVAL; } if (sbp->sb_rtreserved && sbp->sb_rtreserved >= sbp->sb_rblocks) { xfs_warn(mp, "sb_rtreserved (%lld) larger than sb_rblocks (%lld).", sbp->sb_rtreserved, sbp->sb_rblocks); return -EINVAL; } return 0; } /* Check the validity of the SB. */ STATIC int xfs_validate_sb_common( struct xfs_mount *mp, struct xfs_buf *bp, struct xfs_sb *sbp) { struct xfs_dsb *dsb = bp->b_addr; uint32_t agcount = 0; uint32_t rem; bool has_dalign; int error; if (!xfs_verify_magic(bp, dsb->sb_magicnum)) { xfs_warn(mp, "Superblock has bad magic number 0x%x. Not an XFS filesystem?", be32_to_cpu(dsb->sb_magicnum)); return -EWRONGFS; } if (!xfs_sb_good_version(sbp)) { xfs_warn(mp, "Superblock has unknown features enabled or corrupted feature masks."); return -EWRONGFS; } /* * Validate feature flags and state */ if (xfs_sb_is_v5(sbp)) { if (sbp->sb_blocksize < XFS_MIN_CRC_BLOCKSIZE) { xfs_notice(mp, "Block size (%u bytes) too small for Version 5 superblock (minimum %d bytes)", sbp->sb_blocksize, XFS_MIN_CRC_BLOCKSIZE); return -EFSCORRUPTED; } /* V5 has a separate project quota inode */ if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) { xfs_notice(mp, "Version 5 of Super block has XFS_OQUOTA bits."); return -EFSCORRUPTED; } /* * Full inode chunks must be aligned to inode chunk size when * sparse inodes are enabled to support the sparse chunk * allocation algorithm and prevent overlapping inode records. */ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES) { uint32_t align; align = XFS_INODES_PER_CHUNK * sbp->sb_inodesize >> sbp->sb_blocklog; if (sbp->sb_inoalignmt != align) { xfs_warn(mp, "Inode block alignment (%u) must match chunk size (%u) for sparse inodes.", sbp->sb_inoalignmt, align); return -EINVAL; } if (sbp->sb_spino_align && (sbp->sb_spino_align > sbp->sb_inoalignmt || (sbp->sb_inoalignmt % sbp->sb_spino_align) != 0)) { xfs_warn(mp, "Sparse inode alignment (%u) is invalid, must be integer factor of (%u).", sbp->sb_spino_align, sbp->sb_inoalignmt); return -EINVAL; } } else if (sbp->sb_spino_align) { xfs_warn(mp, "Sparse inode alignment (%u) should be zero.", sbp->sb_spino_align); return -EINVAL; } if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) { if (memchr_inv(sbp->sb_pad, 0, sizeof(sbp->sb_pad))) { xfs_warn(mp, "Metadir superblock padding fields must be zero."); return -EINVAL; } error = xfs_validate_sb_rtgroups(mp, sbp); if (error) return error; } if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) { error = xfs_validate_sb_zoned(mp, sbp); if (error) return error; } } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) { xfs_notice(mp, "Superblock earlier than Version 5 has XFS_{P|G}QUOTA_{ENFD|CHKD} bits."); return -EFSCORRUPTED; } if (unlikely( sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { xfs_warn(mp, "filesystem is marked as having an external log; " "specify logdev on the mount command line."); return -EINVAL; } if (unlikely( sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { xfs_warn(mp, "filesystem is marked as having an internal log; " "do not specify logdev on the mount command line."); return -EINVAL; } /* Compute agcount for this number of dblocks and agblocks */ if (sbp->sb_agblocks) { agcount = div_u64_rem(sbp->sb_dblocks, sbp->sb_agblocks, &rem); if (rem) agcount++; } /* * More sanity checking. Most of these were stolen directly from * xfs_repair. */ if (unlikely( sbp->sb_agcount <= 0 || sbp->sb_sectsize < XFS_MIN_SECTORSIZE || sbp->sb_sectsize > XFS_MAX_SECTORSIZE || sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG || sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG || sbp->sb_sectsize != (1 << sbp->sb_sectlog) || sbp->sb_blocksize < XFS_MIN_BLOCKSIZE || sbp->sb_blocksize > XFS_MAX_BLOCKSIZE || sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG || sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG || sbp->sb_blocksize != (1 << sbp->sb_blocklog) || sbp->sb_dirblklog + sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG || sbp->sb_inodesize < XFS_DINODE_MIN_SIZE || sbp->sb_inodesize > XFS_DINODE_MAX_SIZE || sbp->sb_inodelog < XFS_DINODE_MIN_LOG || sbp->sb_inodelog > XFS_DINODE_MAX_LOG || sbp->sb_inodesize != (1 << sbp->sb_inodelog) || sbp->sb_inopblock != howmany(sbp->sb_blocksize,sbp->sb_inodesize) || XFS_FSB_TO_B(mp, sbp->sb_agblocks) < XFS_MIN_AG_BYTES || XFS_FSB_TO_B(mp, sbp->sb_agblocks) > XFS_MAX_AG_BYTES || sbp->sb_agblklog != xfs_highbit32(sbp->sb_agblocks - 1) + 1 || agcount == 0 || agcount != sbp->sb_agcount || (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) || (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */) || sbp->sb_dblocks == 0 || sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) || sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp) || sbp->sb_shared_vn != 0)) { xfs_notice(mp, "SB sanity check failed"); return -EFSCORRUPTED; } /* * Logs that are too large are not supported at all. Reject them * outright. Logs that are too small are tolerated on v4 filesystems, * but we can only check that when mounting the log. Hence we skip * those checks here. */ if (sbp->sb_logblocks > XFS_MAX_LOG_BLOCKS) { xfs_notice(mp, "Log size 0x%x blocks too large, maximum size is 0x%llx blocks", sbp->sb_logblocks, XFS_MAX_LOG_BLOCKS); return -EFSCORRUPTED; } if (XFS_FSB_TO_B(mp, sbp->sb_logblocks) > XFS_MAX_LOG_BYTES) { xfs_warn(mp, "log size 0x%llx bytes too large, maximum size is 0x%llx bytes", XFS_FSB_TO_B(mp, sbp->sb_logblocks), XFS_MAX_LOG_BYTES); return -EFSCORRUPTED; } /* * Do not allow filesystems with corrupted log sector or stripe units to * be mounted. We cannot safely size the iclogs or write to the log if * the log stripe unit is not valid. */ if (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT) { if (sbp->sb_logsectsize != (1U << sbp->sb_logsectlog)) { xfs_notice(mp, "log sector size in bytes/log2 (0x%x/0x%x) must match", sbp->sb_logsectsize, 1U << sbp->sb_logsectlog); return -EFSCORRUPTED; } } else if (sbp->sb_logsectsize || sbp->sb_logsectlog) { xfs_notice(mp, "log sector size in bytes/log2 (0x%x/0x%x) are not zero", sbp->sb_logsectsize, sbp->sb_logsectlog); return -EFSCORRUPTED; } if (sbp->sb_logsunit > 1) { if (sbp->sb_logsunit % sbp->sb_blocksize) { xfs_notice(mp, "log stripe unit 0x%x bytes must be a multiple of block size", sbp->sb_logsunit); return -EFSCORRUPTED; } if (sbp->sb_logsunit > XLOG_MAX_RECORD_BSIZE) { xfs_notice(mp, "log stripe unit 0x%x bytes over maximum size (0x%x bytes)", sbp->sb_logsunit, XLOG_MAX_RECORD_BSIZE); return -EFSCORRUPTED; } } if (!xfs_validate_rt_geometry(sbp)) { xfs_notice(mp, "realtime %sgeometry check failed", sbp->sb_rblocks ? "" : "zeroed "); return -EFSCORRUPTED; } /* * Either (sb_unit and !hasdalign) or (!sb_unit and hasdalign) * would imply the image is corrupted. */ has_dalign = sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT; if (!!sbp->sb_unit ^ has_dalign) { xfs_notice(mp, "SB stripe alignment sanity check failed"); return -EFSCORRUPTED; } if (!xfs_validate_stripe_geometry(mp, XFS_FSB_TO_B(mp, sbp->sb_unit), XFS_FSB_TO_B(mp, sbp->sb_width), 0, xfs_buf_daddr(bp) == XFS_SB_DADDR, false)) return -EFSCORRUPTED; /* * Currently only very few inode sizes are supported. */ switch (sbp->sb_inodesize) { case 256: case 512: case 1024: case 2048: break; default: xfs_warn(mp, "inode size of %d bytes not supported", sbp->sb_inodesize); return -ENOSYS; } return 0; } void xfs_sb_quota_from_disk(struct xfs_sb *sbp) { if (xfs_sb_is_v5(sbp) && (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) { sbp->sb_uquotino = NULLFSINO; sbp->sb_gquotino = NULLFSINO; sbp->sb_pquotino = NULLFSINO; return; } /* * older mkfs doesn't initialize quota inodes to NULLFSINO. This * leads to in-core values having two different values for a quota * inode to be invalid: 0 and NULLFSINO. Change it to a single value * NULLFSINO. * * Note that this change affect only the in-core values. These * values are not written back to disk unless any quota information * is written to the disk. Even in that case, sb_pquotino field is * not written to disk unless the superblock supports pquotino. */ if (sbp->sb_uquotino == 0) sbp->sb_uquotino = NULLFSINO; if (sbp->sb_gquotino == 0) sbp->sb_gquotino = NULLFSINO; if (sbp->sb_pquotino == 0) sbp->sb_pquotino = NULLFSINO; /* * We need to do these manipilations only if we are working * with an older version of on-disk superblock. */ if (xfs_sb_is_v5(sbp)) return; if (sbp->sb_qflags & XFS_OQUOTA_ENFD) sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ? XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD; if (sbp->sb_qflags & XFS_OQUOTA_CHKD) sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ? XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD; sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD); if (sbp->sb_qflags & XFS_PQUOTA_ACCT && sbp->sb_gquotino != NULLFSINO) { /* * In older version of superblock, on-disk superblock only * has sb_gquotino, and in-core superblock has both sb_gquotino * and sb_pquotino. But, only one of them is supported at any * point of time. So, if PQUOTA is set in disk superblock, * copy over sb_gquotino to sb_pquotino. The NULLFSINO test * above is to make sure we don't do this twice and wipe them * both out! */ sbp->sb_pquotino = sbp->sb_gquotino; sbp->sb_gquotino = NULLFSINO; } } static void __xfs_sb_from_disk( struct xfs_sb *to, struct xfs_dsb *from, bool convert_xquota) { to->sb_magicnum = be32_to_cpu(from->sb_magicnum); to->sb_blocksize = be32_to_cpu(from->sb_blocksize); to->sb_dblocks = be64_to_cpu(from->sb_dblocks); to->sb_rblocks = be64_to_cpu(from->sb_rblocks); to->sb_rextents = be64_to_cpu(from->sb_rextents); memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid)); to->sb_logstart = be64_to_cpu(from->sb_logstart); to->sb_rootino = be64_to_cpu(from->sb_rootino); to->sb_rbmino = be64_to_cpu(from->sb_rbmino); to->sb_rsumino = be64_to_cpu(from->sb_rsumino); to->sb_rextsize = be32_to_cpu(from->sb_rextsize); to->sb_agblocks = be32_to_cpu(from->sb_agblocks); to->sb_agcount = be32_to_cpu(from->sb_agcount); to->sb_rbmblocks = be32_to_cpu(from->sb_rbmblocks); to->sb_logblocks = be32_to_cpu(from->sb_logblocks); to->sb_versionnum = be16_to_cpu(from->sb_versionnum); to->sb_sectsize = be16_to_cpu(from->sb_sectsize); to->sb_inodesize = be16_to_cpu(from->sb_inodesize); to->sb_inopblock = be16_to_cpu(from->sb_inopblock); memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname)); to->sb_blocklog = from->sb_blocklog; to->sb_sectlog = from->sb_sectlog; to->sb_inodelog = from->sb_inodelog; to->sb_inopblog = from->sb_inopblog; to->sb_agblklog = from->sb_agblklog; to->sb_rextslog = from->sb_rextslog; to->sb_inprogress = from->sb_inprogress; to->sb_imax_pct = from->sb_imax_pct; to->sb_icount = be64_to_cpu(from->sb_icount); to->sb_ifree = be64_to_cpu(from->sb_ifree); to->sb_fdblocks = be64_to_cpu(from->sb_fdblocks); to->sb_frextents = be64_to_cpu(from->sb_frextents); to->sb_uquotino = be64_to_cpu(from->sb_uquotino); to->sb_gquotino = be64_to_cpu(from->sb_gquotino); to->sb_qflags = be16_to_cpu(from->sb_qflags); to->sb_flags = from->sb_flags; to->sb_shared_vn = from->sb_shared_vn; to->sb_inoalignmt = be32_to_cpu(from->sb_inoalignmt); to->sb_unit = be32_to_cpu(from->sb_unit); to->sb_width = be32_to_cpu(from->sb_width); to->sb_dirblklog = from->sb_dirblklog; to->sb_logsectlog = from->sb_logsectlog; to->sb_logsectsize = be16_to_cpu(from->sb_logsectsize); to->sb_logsunit = be32_to_cpu(from->sb_logsunit); to->sb_features2 = be32_to_cpu(from->sb_features2); to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2); to->sb_features_compat = be32_to_cpu(from->sb_features_compat); to->sb_features_ro_compat = be32_to_cpu(from->sb_features_ro_compat); to->sb_features_incompat = be32_to_cpu(from->sb_features_incompat); to->sb_features_log_incompat = be32_to_cpu(from->sb_features_log_incompat); /* crc is only used on disk, not in memory; just init to 0 here. */ to->sb_crc = 0; to->sb_spino_align = be32_to_cpu(from->sb_spino_align); to->sb_pquotino = be64_to_cpu(from->sb_pquotino); to->sb_lsn = be64_to_cpu(from->sb_lsn); /* * sb_meta_uuid is only on disk if it differs from sb_uuid and the * feature flag is set; if not set we keep it only in memory. */ if (xfs_sb_is_v5(to) && (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID)) uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid); else uuid_copy(&to->sb_meta_uuid, &from->sb_uuid); /* Convert on-disk flags to in-memory flags? */ if (convert_xquota) xfs_sb_quota_from_disk(to); if (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) { to->sb_metadirino = be64_to_cpu(from->sb_metadirino); to->sb_rgblklog = from->sb_rgblklog; memcpy(to->sb_pad, from->sb_pad, sizeof(to->sb_pad)); to->sb_rgcount = be32_to_cpu(from->sb_rgcount); to->sb_rgextents = be32_to_cpu(from->sb_rgextents); to->sb_rbmino = NULLFSINO; to->sb_rsumino = NULLFSINO; } else { to->sb_metadirino = NULLFSINO; to->sb_rgcount = 1; to->sb_rgextents = 0; } if (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) { to->sb_rtstart = be64_to_cpu(from->sb_rtstart); to->sb_rtreserved = be64_to_cpu(from->sb_rtreserved); } else { to->sb_rtstart = 0; to->sb_rtreserved = 0; } } void xfs_sb_from_disk( struct xfs_sb *to, struct xfs_dsb *from) { __xfs_sb_from_disk(to, from, true); } static void xfs_sb_quota_to_disk( struct xfs_dsb *to, struct xfs_sb *from) { uint16_t qflags = from->sb_qflags; if (xfs_sb_is_v5(from) && (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) { to->sb_qflags = cpu_to_be16(from->sb_qflags); to->sb_uquotino = cpu_to_be64(0); to->sb_gquotino = cpu_to_be64(0); to->sb_pquotino = cpu_to_be64(0); return; } to->sb_uquotino = cpu_to_be64(from->sb_uquotino); /* * The in-memory superblock quota state matches the v5 on-disk format so * just write them out and return */ if (xfs_sb_is_v5(from)) { to->sb_qflags = cpu_to_be16(from->sb_qflags); to->sb_gquotino = cpu_to_be64(from->sb_gquotino); to->sb_pquotino = cpu_to_be64(from->sb_pquotino); return; } /* * For older superblocks (v4), the in-core version of sb_qflags do not * have XFS_OQUOTA_* flags, whereas the on-disk version does. So, * convert incore XFS_{PG}QUOTA_* flags to on-disk XFS_OQUOTA_* flags. */ qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD | XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD); if (from->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD)) qflags |= XFS_OQUOTA_ENFD; if (from->sb_qflags & (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) qflags |= XFS_OQUOTA_CHKD; to->sb_qflags = cpu_to_be16(qflags); /* * GQUOTINO and PQUOTINO cannot be used together in versions * of superblock that do not have pquotino. from->sb_flags * tells us which quota is active and should be copied to * disk. If neither are active, we should NULL the inode. * * In all cases, the separate pquotino must remain 0 because it * is beyond the "end" of the valid non-pquotino superblock. */ if (from->sb_qflags & XFS_GQUOTA_ACCT) to->sb_gquotino = cpu_to_be64(from->sb_gquotino); else if (from->sb_qflags & XFS_PQUOTA_ACCT) to->sb_gquotino = cpu_to_be64(from->sb_pquotino); else { /* * We can't rely on just the fields being logged to tell us * that it is safe to write NULLFSINO - we should only do that * if quotas are not actually enabled. Hence only write * NULLFSINO if both in-core quota inodes are NULL. */ if (from->sb_gquotino == NULLFSINO && from->sb_pquotino == NULLFSINO) to->sb_gquotino = cpu_to_be64(NULLFSINO); } to->sb_pquotino = 0; } void xfs_sb_to_disk( struct xfs_dsb *to, struct xfs_sb *from) { xfs_sb_quota_to_disk(to, from); to->sb_magicnum = cpu_to_be32(from->sb_magicnum); to->sb_blocksize = cpu_to_be32(from->sb_blocksize); to->sb_dblocks = cpu_to_be64(from->sb_dblocks); to->sb_rblocks = cpu_to_be64(from->sb_rblocks); to->sb_rextents = cpu_to_be64(from->sb_rextents); memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid)); to->sb_logstart = cpu_to_be64(from->sb_logstart); to->sb_rootino = cpu_to_be64(from->sb_rootino); to->sb_rbmino = cpu_to_be64(from->sb_rbmino); to->sb_rsumino = cpu_to_be64(from->sb_rsumino); to->sb_rextsize = cpu_to_be32(from->sb_rextsize); to->sb_agblocks = cpu_to_be32(from->sb_agblocks); to->sb_agcount = cpu_to_be32(from->sb_agcount); to->sb_rbmblocks = cpu_to_be32(from->sb_rbmblocks); to->sb_logblocks = cpu_to_be32(from->sb_logblocks); to->sb_versionnum = cpu_to_be16(from->sb_versionnum); to->sb_sectsize = cpu_to_be16(from->sb_sectsize); to->sb_inodesize = cpu_to_be16(from->sb_inodesize); to->sb_inopblock = cpu_to_be16(from->sb_inopblock); memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname)); to->sb_blocklog = from->sb_blocklog; to->sb_sectlog = from->sb_sectlog; to->sb_inodelog = from->sb_inodelog; to->sb_inopblog = from->sb_inopblog; to->sb_agblklog = from->sb_agblklog; to->sb_rextslog = from->sb_rextslog; to->sb_inprogress = from->sb_inprogress; to->sb_imax_pct = from->sb_imax_pct; to->sb_icount = cpu_to_be64(from->sb_icount); to->sb_ifree = cpu_to_be64(from->sb_ifree); to->sb_fdblocks = cpu_to_be64(from->sb_fdblocks); to->sb_frextents = cpu_to_be64(from->sb_frextents); to->sb_flags = from->sb_flags; to->sb_shared_vn = from->sb_shared_vn; to->sb_inoalignmt = cpu_to_be32(from->sb_inoalignmt); to->sb_unit = cpu_to_be32(from->sb_unit); to->sb_width = cpu_to_be32(from->sb_width); to->sb_dirblklog = from->sb_dirblklog; to->sb_logsectlog = from->sb_logsectlog; to->sb_logsectsize = cpu_to_be16(from->sb_logsectsize); to->sb_logsunit = cpu_to_be32(from->sb_logsunit); /* * We need to ensure that bad_features2 always matches features2. * Hence we enforce that here rather than having to remember to do it * everywhere else that updates features2. */ from->sb_bad_features2 = from->sb_features2; to->sb_features2 = cpu_to_be32(from->sb_features2); to->sb_bad_features2 = cpu_to_be32(from->sb_bad_features2); if (!xfs_sb_is_v5(from)) return; to->sb_features_compat = cpu_to_be32(from->sb_features_compat); to->sb_features_ro_compat = cpu_to_be32(from->sb_features_ro_compat); to->sb_features_incompat = cpu_to_be32(from->sb_features_incompat); to->sb_features_log_incompat = cpu_to_be32(from->sb_features_log_incompat); to->sb_spino_align = cpu_to_be32(from->sb_spino_align); to->sb_lsn = cpu_to_be64(from->sb_lsn); if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID) uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid); if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) { to->sb_metadirino = cpu_to_be64(from->sb_metadirino); to->sb_rgblklog = from->sb_rgblklog; memset(to->sb_pad, 0, sizeof(to->sb_pad)); to->sb_rgcount = cpu_to_be32(from->sb_rgcount); to->sb_rgextents = cpu_to_be32(from->sb_rgextents); to->sb_rbmino = cpu_to_be64(0); to->sb_rsumino = cpu_to_be64(0); } if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) { to->sb_rtstart = cpu_to_be64(from->sb_rtstart); to->sb_rtreserved = cpu_to_be64(from->sb_rtreserved); } } /* * If the superblock has the CRC feature bit set or the CRC field is non-null, * check that the CRC is valid. We check the CRC field is non-null because a * single bit error could clear the feature bit and unused parts of the * superblock are supposed to be zero. Hence a non-null crc field indicates that * we've potentially lost a feature bit and we should check it anyway. * * However, past bugs (i.e. in growfs) left non-zeroed regions beyond the * last field in V4 secondary superblocks. So for secondary superblocks, * we are more forgiving, and ignore CRC failures if the primary doesn't * indicate that the fs version is V5. */ static void xfs_sb_read_verify( struct xfs_buf *bp) { struct xfs_sb sb; struct xfs_mount *mp = bp->b_mount; struct xfs_dsb *dsb = bp->b_addr; int error; /* * open code the version check to avoid needing to convert the entire * superblock from disk order just to check the version number */ if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC) && (((be16_to_cpu(dsb->sb_versionnum) & XFS_SB_VERSION_NUMBITS) == XFS_SB_VERSION_5) || dsb->sb_crc != 0)) { if (!xfs_buf_verify_cksum(bp, XFS_SB_CRC_OFF)) { /* Only fail bad secondaries on a known V5 filesystem */ if (xfs_buf_daddr(bp) == XFS_SB_DADDR || xfs_has_crc(mp)) { error = -EFSBADCRC; goto out_error; } } } /* * Check all the superblock fields. Don't byteswap the xquota flags * because _verify_common checks the on-disk values. */ __xfs_sb_from_disk(&sb, dsb, false); error = xfs_validate_sb_common(mp, bp, &sb); if (error) goto out_error; error = xfs_validate_sb_read(mp, &sb); out_error: if (error == -EFSCORRUPTED || error == -EFSBADCRC) xfs_verifier_error(bp, error, __this_address); else if (error) xfs_buf_ioerror(bp, error); } /* * We may be probed for a filesystem match, so we may not want to emit * messages when the superblock buffer is not actually an XFS superblock. * If we find an XFS superblock, then run a normal, noisy mount because we are * really going to mount it and want to know about errors. */ static void xfs_sb_quiet_read_verify( struct xfs_buf *bp) { struct xfs_dsb *dsb = bp->b_addr; if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) { /* XFS filesystem, verify noisily! */ xfs_sb_read_verify(bp); return; } /* quietly fail */ xfs_buf_ioerror(bp, -EWRONGFS); } static void xfs_sb_write_verify( struct xfs_buf *bp) { struct xfs_sb sb; struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; struct xfs_dsb *dsb = bp->b_addr; int error; /* * Check all the superblock fields. Don't byteswap the xquota flags * because _verify_common checks the on-disk values. */ __xfs_sb_from_disk(&sb, dsb, false); error = xfs_validate_sb_common(mp, bp, &sb); if (error) goto out_error; error = xfs_validate_sb_write(mp, bp, &sb); if (error) goto out_error; if (!xfs_sb_is_v5(&sb)) return; if (bip) dsb->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn); xfs_buf_update_cksum(bp, XFS_SB_CRC_OFF); return; out_error: xfs_verifier_error(bp, error, __this_address); } const struct xfs_buf_ops xfs_sb_buf_ops = { .name = "xfs_sb", .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) }, .verify_read = xfs_sb_read_verify, .verify_write = xfs_sb_write_verify, }; const struct xfs_buf_ops xfs_sb_quiet_buf_ops = { .name = "xfs_sb_quiet", .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) }, .verify_read = xfs_sb_quiet_read_verify, .verify_write = xfs_sb_write_verify, }; /* Compute cached rt geometry from the incore sb. */ void xfs_sb_mount_rextsize( struct xfs_mount *mp, struct xfs_sb *sbp) { struct xfs_groups *rgs = &mp->m_groups[XG_TYPE_RTG]; mp->m_rtxblklog = log2_if_power2(sbp->sb_rextsize); mp->m_rtxblkmask = mask64_if_power2(sbp->sb_rextsize); if (xfs_sb_is_v5(sbp) && (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) { rgs->blocks = sbp->sb_rgextents * sbp->sb_rextsize; rgs->blklog = mp->m_sb.sb_rgblklog; rgs->blkmask = xfs_mask32lo(mp->m_sb.sb_rgblklog); rgs->start_fsb = mp->m_sb.sb_rtstart; if (xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_ZONE_GAPS)) rgs->has_daddr_gaps = true; } else { rgs->blocks = 0; rgs->blklog = 0; rgs->blkmask = (uint64_t)-1; } } /* Update incore sb rt extent size, then recompute the cached rt geometry. */ void xfs_mount_sb_set_rextsize( struct xfs_mount *mp, struct xfs_sb *sbp, xfs_agblock_t rextsize) { sbp->sb_rextsize = rextsize; if (xfs_sb_is_v5(sbp) && (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) sbp->sb_rgblklog = xfs_compute_rgblklog(sbp->sb_rgextents, rextsize); xfs_sb_mount_rextsize(mp, sbp); } /* * xfs_mount_common * * Mount initialization code establishing various mount * fields from the superblock associated with the given * mount structure. * * Inode geometry are calculated in xfs_ialloc_setup_geometry. */ void xfs_sb_mount_common( struct xfs_mount *mp, struct xfs_sb *sbp) { struct xfs_groups *ags = &mp->m_groups[XG_TYPE_AG]; mp->m_agfrotor = 0; atomic_set(&mp->m_agirotor, 0); mp->m_maxagi = mp->m_sb.sb_agcount; mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG; mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT; mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT; mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1; mp->m_blockmask = sbp->sb_blocksize - 1; mp->m_blockwsize = xfs_rtbmblock_size(sbp) >> XFS_WORDLOG; mp->m_rtx_per_rbmblock = mp->m_blockwsize << XFS_NBWORDLOG; ags->blocks = mp->m_sb.sb_agblocks; ags->blklog = mp->m_sb.sb_agblklog; ags->blkmask = xfs_mask32lo(mp->m_sb.sb_agblklog); xfs_sb_mount_rextsize(mp, sbp); mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, true); mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, false); mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2; mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2; mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, true); mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, false); mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2; mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2; mp->m_rmap_mxr[0] = xfs_rmapbt_maxrecs(mp, sbp->sb_blocksize, true); mp->m_rmap_mxr[1] = xfs_rmapbt_maxrecs(mp, sbp->sb_blocksize, false); mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2; mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2; mp->m_rtrmap_mxr[0] = xfs_rtrmapbt_maxrecs(mp, sbp->sb_blocksize, true); mp->m_rtrmap_mxr[1] = xfs_rtrmapbt_maxrecs(mp, sbp->sb_blocksize, false); mp->m_rtrmap_mnr[0] = mp->m_rtrmap_mxr[0] / 2; mp->m_rtrmap_mnr[1] = mp->m_rtrmap_mxr[1] / 2; mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, true); mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, false); mp->m_refc_mnr[0] = mp->m_refc_mxr[0] / 2; mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2; mp->m_rtrefc_mxr[0] = xfs_rtrefcountbt_maxrecs(mp, sbp->sb_blocksize, true); mp->m_rtrefc_mxr[1] = xfs_rtrefcountbt_maxrecs(mp, sbp->sb_blocksize, false); mp->m_rtrefc_mnr[0] = mp->m_rtrefc_mxr[0] / 2; mp->m_rtrefc_mnr[1] = mp->m_rtrefc_mxr[1] / 2; mp->m_bsize = XFS_FSB_TO_BB(mp, 1); mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); mp->m_ag_max_usable = xfs_alloc_ag_max_usable(mp); } /* * xfs_log_sb() can be used to copy arbitrary changes to the in-core superblock * into the superblock buffer to be logged. It does not provide the higher * level of locking that is needed to protect the in-core superblock from * concurrent access. */ void xfs_log_sb( struct xfs_trans *tp) { struct xfs_mount *mp = tp->t_mountp; struct xfs_buf *bp = xfs_trans_getsb(tp); /* * Lazy sb counters don't update the in-core superblock so do that now. * If this is at unmount, the counters will be exactly correct, but at * any other time they will only be ballpark correct because of * reservations that have been taken out percpu counters. If we have an * unclean shutdown, this will be corrected by log recovery rebuilding * the counters from the AGF block counts. */ if (xfs_has_lazysbcount(mp)) { mp->m_sb.sb_icount = percpu_counter_sum_positive(&mp->m_icount); mp->m_sb.sb_ifree = min_t(uint64_t, percpu_counter_sum_positive(&mp->m_ifree), mp->m_sb.sb_icount); mp->m_sb.sb_fdblocks = xfs_sum_freecounter(mp, XC_FREE_BLOCKS); } /* * sb_frextents was added to the lazy sb counters when the rt groups * feature was introduced. This counter can go negative due to the way * we handle nearly-lockless reservations, so we must use the _positive * variant here to avoid writing out nonsense frextents. * * RT groups are only supported on v5 file systems, which always * have lazy SB counters. */ if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp)) { mp->m_sb.sb_frextents = xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS); } xfs_sb_to_disk(bp->b_addr, &mp->m_sb); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1); } /* * xfs_sync_sb * * Sync the superblock to disk. * * Note that the caller is responsible for checking the frozen state of the * filesystem. This procedure uses the non-blocking transaction allocator and * thus will allow modifications to a frozen fs. This is required because this * code can be called during the process of freezing where use of the high-level * allocator would deadlock. */ int xfs_sync_sb( struct xfs_mount *mp, bool wait) { struct xfs_trans *tp; int error; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0, XFS_TRANS_NO_WRITECOUNT, &tp); if (error) return error; xfs_log_sb(tp); if (wait) xfs_trans_set_sync(tp); return xfs_trans_commit(tp); } /* * Update all the secondary superblocks to match the new state of the primary. * Because we are completely overwriting all the existing fields in the * secondary superblock buffers, there is no need to read them in from disk. * Just get a new buffer, stamp it and write it. * * The sb buffers need to be cached here so that we serialise against other * operations that access the secondary superblocks, but we don't want to keep * them in memory once it is written so we mark it as a one-shot buffer. */ int xfs_update_secondary_sbs( struct xfs_mount *mp) { struct xfs_perag *pag = NULL; int saved_error = 0; int error = 0; LIST_HEAD (buffer_list); /* update secondary superblocks. */ while ((pag = xfs_perag_next_from(mp, pag, 1))) { struct xfs_buf *bp; error = xfs_buf_get(mp->m_ddev_targp, XFS_AG_DADDR(mp, pag_agno(pag), XFS_SB_DADDR), XFS_FSS_TO_BB(mp, 1), &bp); /* * If we get an error reading or writing alternate superblocks, * continue. xfs_repair chooses the "best" superblock based * on most matches; if we break early, we'll leave more * superblocks un-updated than updated, and xfs_repair may * pick them over the properly-updated primary. */ if (error) { xfs_warn(mp, "error allocating secondary superblock for ag %d", pag_agno(pag)); if (!saved_error) saved_error = error; continue; } bp->b_ops = &xfs_sb_buf_ops; xfs_buf_oneshot(bp); xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); xfs_sb_to_disk(bp->b_addr, &mp->m_sb); xfs_buf_delwri_queue(bp, &buffer_list); xfs_buf_relse(bp); /* don't hold too many buffers at once */ if (pag_agno(pag) % 16) continue; error = xfs_buf_delwri_submit(&buffer_list); if (error) { xfs_warn(mp, "write error %d updating a secondary superblock near ag %d", error, pag_agno(pag)); if (!saved_error) saved_error = error; continue; } } error = xfs_buf_delwri_submit(&buffer_list); if (error) xfs_warn(mp, "error %d writing secondary superblocks", error); return saved_error ? saved_error : error; } /* * Same behavior as xfs_sync_sb, except that it is always synchronous and it * also writes the superblock buffer to disk sector 0 immediately. */ int xfs_sync_sb_buf( struct xfs_mount *mp, bool update_rtsb) { struct xfs_trans *tp; struct xfs_buf *bp; struct xfs_buf *rtsb_bp = NULL; int error; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0, 0, &tp); if (error) return error; bp = xfs_trans_getsb(tp); xfs_log_sb(tp); xfs_trans_bhold(tp, bp); if (update_rtsb) { rtsb_bp = xfs_log_rtsb(tp, bp); if (rtsb_bp) xfs_trans_bhold(tp, rtsb_bp); } xfs_trans_set_sync(tp); error = xfs_trans_commit(tp); if (error) goto out; /* * write out the sb buffer to get the changes to disk */ error = xfs_bwrite(bp); if (!error && rtsb_bp) error = xfs_bwrite(rtsb_bp); out: if (rtsb_bp) xfs_buf_relse(rtsb_bp); xfs_buf_relse(bp); return error; } void xfs_fs_geometry( struct xfs_mount *mp, struct xfs_fsop_geom *geo, int struct_version) { struct xfs_sb *sbp = &mp->m_sb; memset(geo, 0, sizeof(struct xfs_fsop_geom)); geo->blocksize = sbp->sb_blocksize; geo->rtextsize = sbp->sb_rextsize; geo->agblocks = sbp->sb_agblocks; geo->agcount = sbp->sb_agcount; geo->logblocks = sbp->sb_logblocks; geo->sectsize = sbp->sb_sectsize; geo->inodesize = sbp->sb_inodesize; geo->imaxpct = sbp->sb_imax_pct; geo->datablocks = sbp->sb_dblocks; geo->rtblocks = sbp->sb_rblocks; geo->rtextents = sbp->sb_rextents; geo->logstart = sbp->sb_logstart; BUILD_BUG_ON(sizeof(geo->uuid) != sizeof(sbp->sb_uuid)); memcpy(geo->uuid, &sbp->sb_uuid, sizeof(sbp->sb_uuid)); if (struct_version < 2) return; geo->sunit = sbp->sb_unit; geo->swidth = sbp->sb_width; if (struct_version < 3) return; geo->version = XFS_FSOP_GEOM_VERSION; geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK | XFS_FSOP_GEOM_FLAGS_DIRV2 | XFS_FSOP_GEOM_FLAGS_EXTFLG | XFS_FSOP_GEOM_FLAGS_ATTR2; if (xfs_has_attr(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR; if (xfs_has_quota(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_QUOTA; if (xfs_has_align(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_IALIGN; if (xfs_has_dalign(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_DALIGN; if (xfs_has_asciici(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_DIRV2CI; if (xfs_has_lazysbcount(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_LAZYSB; if (xfs_has_projid32(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_PROJID32; if (xfs_has_crc(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_V5SB; if (xfs_has_ftype(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_FTYPE; if (xfs_has_finobt(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_FINOBT; if (xfs_has_sparseinodes(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_SPINODES; if (xfs_has_rmapbt(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_RMAPBT; if (xfs_has_reflink(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_REFLINK; if (xfs_has_bigtime(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_BIGTIME; if (xfs_has_inobtcounts(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_INOBTCNT; if (xfs_has_parent(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_PARENT; if (xfs_has_sector(mp)) { geo->flags |= XFS_FSOP_GEOM_FLAGS_SECTOR; geo->logsectsize = sbp->sb_logsectsize; } else { geo->logsectsize = BBSIZE; } if (xfs_has_large_extent_counts(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_NREXT64; if (xfs_has_exchange_range(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE; if (xfs_has_metadir(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_METADIR; if (xfs_has_zoned(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_ZONED; geo->rtsectsize = sbp->sb_blocksize; geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp); if (struct_version < 4) return; if (xfs_has_logv2(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_LOGV2; geo->logsunit = sbp->sb_logsunit; if (struct_version < 5) return; geo->version = XFS_FSOP_GEOM_VERSION_V5; if (xfs_has_rtgroups(mp)) { geo->rgcount = sbp->sb_rgcount; geo->rgextents = sbp->sb_rgextents; } if (xfs_has_zoned(mp)) { geo->rtstart = sbp->sb_rtstart; geo->rtreserved = sbp->sb_rtreserved; } } /* Read a secondary superblock. */ int xfs_sb_read_secondary( struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **bpp) { struct xfs_buf *bp; int error; ASSERT(agno != 0 && agno != NULLAGNUMBER); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)), XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_sb_buf_ops); if (xfs_metadata_is_sick(error)) xfs_agno_mark_sick(mp, agno, XFS_SICK_AG_SB); if (error) return error; xfs_buf_set_ref(bp, XFS_SSB_REF); *bpp = bp; return 0; } /* Get an uninitialised secondary superblock buffer. */ int xfs_sb_get_secondary( struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **bpp) { struct xfs_buf *bp; int error; ASSERT(agno != 0 && agno != NULLAGNUMBER); error = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)), XFS_FSS_TO_BB(mp, 1), 0, &bp); if (error) return error; bp->b_ops = &xfs_sb_buf_ops; xfs_buf_oneshot(bp); *bpp = bp; return 0; } /* * sunit, swidth, sectorsize(optional with 0) should be all in bytes, so users * won't be confused by values in error messages. This function returns false * if the stripe geometry is invalid and the caller is unable to repair the * stripe configuration later in the mount process. */ bool xfs_validate_stripe_geometry( struct xfs_mount *mp, __s64 sunit, __s64 swidth, int sectorsize, bool may_repair, bool silent) { if (swidth > INT_MAX) { if (!silent) xfs_notice(mp, "stripe width (%lld) is too large", swidth); goto check_override; } if (sunit > swidth) { if (!silent) xfs_notice(mp, "stripe unit (%lld) is larger than the stripe width (%lld)", sunit, swidth); goto check_override; } if (sectorsize && (int)sunit % sectorsize) { if (!silent) xfs_notice(mp, "stripe unit (%lld) must be a multiple of the sector size (%d)", sunit, sectorsize); goto check_override; } if (sunit && !swidth) { if (!silent) xfs_notice(mp, "invalid stripe unit (%lld) and stripe width of 0", sunit); goto check_override; } if (!sunit && swidth) { if (!silent) xfs_notice(mp, "invalid stripe width (%lld) and stripe unit of 0", swidth); goto check_override; } if (sunit && (int)swidth % (int)sunit) { if (!silent) xfs_notice(mp, "stripe width (%lld) must be a multiple of the stripe unit (%lld)", swidth, sunit); goto check_override; } return true; check_override: if (!may_repair) return false; /* * During mount, mp->m_dalign will not be set unless the sunit mount * option was set. If it was set, ignore the bad stripe alignment values * and allow the validation and overwrite later in the mount process to * attempt to overwrite the bad stripe alignment values with the values * supplied by mount options. */ if (!mp->m_dalign) return false; if (!silent) xfs_notice(mp, "Will try to correct with specified mount options sunit (%d) and swidth (%d)", BBTOB(mp->m_dalign), BBTOB(mp->m_swidth)); return true; } /* * Compute the maximum level number of the realtime summary file, as defined by * mkfs. The historic use of highbit32 on a 64-bit quantity prohibited correct * use of rt volumes with more than 2^32 extents. */ uint8_t xfs_compute_rextslog( xfs_rtbxlen_t rtextents) { if (!rtextents) return 0; return xfs_highbit64(rtextents); }
105 273 2 266 366 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _SCSI_SCSI_CMND_H #define _SCSI_SCSI_CMND_H #include <linux/dma-mapping.h> #include <linux/blkdev.h> #include <linux/t10-pi.h> #include <linux/list.h> #include <linux/types.h> #include <linux/timer.h> #include <linux/scatterlist.h> #include <scsi/scsi_device.h> struct Scsi_Host; /* * MAX_COMMAND_SIZE is: * The longest fixed-length SCSI CDB as per the SCSI standard. * fixed-length means: commands that their size can be determined * by their opcode and the CDB does not carry a length specifier, (unlike * the VARIABLE_LENGTH_CMD(0x7f) command). This is actually not exactly * true and the SCSI standard also defines extended commands and * vendor specific commands that can be bigger than 16 bytes. The kernel * will support these using the same infrastructure used for VARLEN CDB's. * So in effect MAX_COMMAND_SIZE means the maximum size command scsi-ml * supports without specifying a cmd_len by ULD's */ #define MAX_COMMAND_SIZE 16 struct scsi_data_buffer { struct sg_table table; unsigned length; }; /* embedded in scsi_cmnd */ struct scsi_pointer { char *ptr; /* data pointer */ int this_residual; /* left in this buffer */ struct scatterlist *buffer; /* which buffer */ int buffers_residual; /* how many buffers left */ dma_addr_t dma_handle; volatile int Status; volatile int Message; volatile int have_data_in; volatile int sent_command; volatile int phase; }; /* for scmd->flags */ #define SCMD_TAGGED (1 << 0) #define SCMD_INITIALIZED (1 << 1) #define SCMD_LAST (1 << 2) /* * libata uses SCSI EH to fetch sense data for successful commands. * SCSI EH should not overwrite scmd->result when SCMD_FORCE_EH_SUCCESS is set. */ #define SCMD_FORCE_EH_SUCCESS (1 << 3) #define SCMD_FAIL_IF_RECOVERING (1 << 4) /* flags preserved across unprep / reprep */ #define SCMD_PRESERVED_FLAGS (SCMD_INITIALIZED | SCMD_FAIL_IF_RECOVERING) /* for scmd->state */ #define SCMD_STATE_COMPLETE 0 #define SCMD_STATE_INFLIGHT 1 enum scsi_cmnd_submitter { SUBMITTED_BY_BLOCK_LAYER = 0, SUBMITTED_BY_SCSI_ERROR_HANDLER = 1, SUBMITTED_BY_SCSI_RESET_IOCTL = 2, } __packed; struct scsi_cmnd { struct scsi_device *device; struct list_head eh_entry; /* entry for the host eh_abort_list/eh_cmd_q */ struct delayed_work abort_work; struct rcu_head rcu; int eh_eflags; /* Used by error handlr */ int budget_token; /* * This is set to jiffies as it was when the command was first * allocated. It is used to time how long the command has * been outstanding */ unsigned long jiffies_at_alloc; int retries; int allowed; unsigned char prot_op; unsigned char prot_type; unsigned char prot_flags; enum scsi_cmnd_submitter submitter; unsigned short cmd_len; enum dma_data_direction sc_data_direction; unsigned char cmnd[32]; /* SCSI CDB */ /* These elements define the operation we ultimately want to perform */ struct scsi_data_buffer sdb; struct scsi_data_buffer *prot_sdb; unsigned underflow; /* Return error if less than this amount is transferred */ unsigned transfersize; /* How much we are guaranteed to transfer with each SCSI transfer (ie, between disconnect / reconnects. Probably == sector size */ unsigned resid_len; /* residual count */ unsigned sense_len; unsigned char *sense_buffer; /* obtained by REQUEST SENSE when * CHECK CONDITION is received on original * command (auto-sense). Length must be * SCSI_SENSE_BUFFERSIZE bytes. */ int flags; /* Command flags */ unsigned long state; /* Command completion state */ unsigned int extra_len; /* length of alignment and padding */ /* * The fields below can be modified by the LLD but the fields above * must not be modified. */ unsigned char *host_scribble; /* The host adapter is allowed to * call scsi_malloc and get some memory * and hang it here. The host adapter * is also expected to call scsi_free * to release this memory. (The memory * obtained by scsi_malloc is guaranteed * to be at an address < 16Mb). */ int result; /* Status code from lower level driver */ }; /* Variant of blk_mq_rq_from_pdu() that verifies the type of its argument. */ static inline struct request *scsi_cmd_to_rq(struct scsi_cmnd *scmd) { return blk_mq_rq_from_pdu(scmd); } /* * Return the driver private allocation behind the command. * Only works if cmd_size is set in the host template. */ static inline void *scsi_cmd_priv(struct scsi_cmnd *cmd) { return cmd + 1; } void scsi_done(struct scsi_cmnd *cmd); void scsi_done_direct(struct scsi_cmnd *cmd); extern void scsi_finish_command(struct scsi_cmnd *cmd); extern void *scsi_kmap_atomic_sg(struct scatterlist *sg, int sg_count, size_t *offset, size_t *len); extern void scsi_kunmap_atomic_sg(void *virt); blk_status_t scsi_alloc_sgtables(struct scsi_cmnd *cmd); void scsi_free_sgtables(struct scsi_cmnd *cmd); #ifdef CONFIG_SCSI_DMA extern int scsi_dma_map(struct scsi_cmnd *cmd); extern void scsi_dma_unmap(struct scsi_cmnd *cmd); #else /* !CONFIG_SCSI_DMA */ static inline int scsi_dma_map(struct scsi_cmnd *cmd) { return -ENOSYS; } static inline void scsi_dma_unmap(struct scsi_cmnd *cmd) { } #endif /* !CONFIG_SCSI_DMA */ static inline unsigned scsi_sg_count(struct scsi_cmnd *cmd) { return cmd->sdb.table.nents; } static inline struct scatterlist *scsi_sglist(struct scsi_cmnd *cmd) { return cmd->sdb.table.sgl; } static inline unsigned scsi_bufflen(struct scsi_cmnd *cmd) { return cmd->sdb.length; } static inline void scsi_set_resid(struct scsi_cmnd *cmd, unsigned int resid) { cmd->resid_len = resid; } static inline unsigned int scsi_get_resid(struct scsi_cmnd *cmd) { return cmd->resid_len; } #define scsi_for_each_sg(cmd, sg, nseg, __i) \ for_each_sg(scsi_sglist(cmd), sg, nseg, __i) static inline int scsi_sg_copy_from_buffer(struct scsi_cmnd *cmd, const void *buf, int buflen) { return sg_copy_from_buffer(scsi_sglist(cmd), scsi_sg_count(cmd), buf, buflen); } static inline int scsi_sg_copy_to_buffer(struct scsi_cmnd *cmd, void *buf, int buflen) { return sg_copy_to_buffer(scsi_sglist(cmd), scsi_sg_count(cmd), buf, buflen); } static inline sector_t scsi_get_sector(struct scsi_cmnd *scmd) { return blk_rq_pos(scsi_cmd_to_rq(scmd)); } static inline sector_t scsi_get_lba(struct scsi_cmnd *scmd) { unsigned int shift = ilog2(scmd->device->sector_size) - SECTOR_SHIFT; return blk_rq_pos(scsi_cmd_to_rq(scmd)) >> shift; } static inline unsigned int scsi_logical_block_count(struct scsi_cmnd *scmd) { unsigned int shift = ilog2(scmd->device->sector_size); return blk_rq_bytes(scsi_cmd_to_rq(scmd)) >> shift; } /* * The operations below are hints that tell the controller driver how * to handle I/Os with DIF or similar types of protection information. */ enum scsi_prot_operations { /* Normal I/O */ SCSI_PROT_NORMAL = 0, /* OS-HBA: Protected, HBA-Target: Unprotected */ SCSI_PROT_READ_INSERT, SCSI_PROT_WRITE_STRIP, /* OS-HBA: Unprotected, HBA-Target: Protected */ SCSI_PROT_READ_STRIP, SCSI_PROT_WRITE_INSERT, /* OS-HBA: Protected, HBA-Target: Protected */ SCSI_PROT_READ_PASS, SCSI_PROT_WRITE_PASS, }; static inline void scsi_set_prot_op(struct scsi_cmnd *scmd, unsigned char op) { scmd->prot_op = op; } static inline unsigned char scsi_get_prot_op(struct scsi_cmnd *scmd) { return scmd->prot_op; } enum scsi_prot_flags { SCSI_PROT_TRANSFER_PI = 1 << 0, SCSI_PROT_GUARD_CHECK = 1 << 1, SCSI_PROT_REF_CHECK = 1 << 2, SCSI_PROT_REF_INCREMENT = 1 << 3, SCSI_PROT_IP_CHECKSUM = 1 << 4, }; /* * The controller usually does not know anything about the target it * is communicating with. However, when DIX is enabled the controller * must be know target type so it can verify the protection * information passed along with the I/O. */ enum scsi_prot_target_type { SCSI_PROT_DIF_TYPE0 = 0, SCSI_PROT_DIF_TYPE1, SCSI_PROT_DIF_TYPE2, SCSI_PROT_DIF_TYPE3, }; static inline void scsi_set_prot_type(struct scsi_cmnd *scmd, unsigned char type) { scmd->prot_type = type; } static inline unsigned char scsi_get_prot_type(struct scsi_cmnd *scmd) { return scmd->prot_type; } static inline u32 scsi_prot_ref_tag(struct scsi_cmnd *scmd) { struct request *rq = blk_mq_rq_from_pdu(scmd); return t10_pi_ref_tag(rq); } static inline unsigned int scsi_prot_interval(struct scsi_cmnd *scmd) { return scmd->device->sector_size; } static inline unsigned scsi_prot_sg_count(struct scsi_cmnd *cmd) { return cmd->prot_sdb ? cmd->prot_sdb->table.nents : 0; } static inline struct scatterlist *scsi_prot_sglist(struct scsi_cmnd *cmd) { return cmd->prot_sdb ? cmd->prot_sdb->table.sgl : NULL; } static inline struct scsi_data_buffer *scsi_prot(struct scsi_cmnd *cmd) { return cmd->prot_sdb; } #define scsi_for_each_prot_sg(cmd, sg, nseg, __i) \ for_each_sg(scsi_prot_sglist(cmd), sg, nseg, __i) static inline void set_status_byte(struct scsi_cmnd *cmd, char status) { cmd->result = (cmd->result & 0xffffff00) | status; } static inline u8 get_status_byte(struct scsi_cmnd *cmd) { return cmd->result & 0xff; } static inline void set_host_byte(struct scsi_cmnd *cmd, char status) { cmd->result = (cmd->result & 0xff00ffff) | (status << 16); } static inline u8 get_host_byte(struct scsi_cmnd *cmd) { return (cmd->result >> 16) & 0xff; } /** * scsi_msg_to_host_byte() - translate message byte * @cmd: the SCSI command * @msg: the SCSI parallel message byte to translate * * Translate the SCSI parallel message byte to a matching * host byte setting. A message of COMMAND_COMPLETE indicates * a successful command execution, any other message indicate * an error. As the messages themselves only have a meaning * for the SCSI parallel protocol this function translates * them into a matching host byte value for SCSI EH. */ static inline void scsi_msg_to_host_byte(struct scsi_cmnd *cmd, u8 msg) { switch (msg) { case COMMAND_COMPLETE: break; case ABORT_TASK_SET: set_host_byte(cmd, DID_ABORT); break; case TARGET_RESET: set_host_byte(cmd, DID_RESET); break; default: set_host_byte(cmd, DID_ERROR); break; } } static inline unsigned scsi_transfer_length(struct scsi_cmnd *scmd) { unsigned int xfer_len = scmd->sdb.length; unsigned int prot_interval = scsi_prot_interval(scmd); if (scmd->prot_flags & SCSI_PROT_TRANSFER_PI) xfer_len += (xfer_len >> ilog2(prot_interval)) * 8; return xfer_len; } extern void scsi_build_sense(struct scsi_cmnd *scmd, int desc, u8 key, u8 asc, u8 ascq); struct request *scsi_alloc_request(struct request_queue *q, blk_opf_t opf, blk_mq_req_flags_t flags); #endif /* _SCSI_SCSI_CMND_H */
1 181 124 122 6 65 128 110 33 127 8 128 128 124 6 128 122 7 128 3 128 32 32 32 126 32 126 2 17 126 71 39 39 39 32 32 3 3 3 31 72 72 72 46 46 109 31 126 1 126 125 109 109 32 32 32 32 32 3 3 3 32 32 32 32 109 105 105 109 126 125 126 125 126 126 126 126 32 126 71 49 39 40 39 49 49 72 72 72 39 39 39 72 72 54 54 54 26 26 2 1 26 26 26 53 44 3 3 3 3 3 3 3 44 1 4 5 1 5 1 5 4 4 4 1 5 4 4 4 4 1 4 1 74 70 74 5 45 3 3 3 45 2 2 2 2 2 2 109 110 110 110 110 110 110 109 109 105 109 52 52 52 49 49 49 30 30 176 180 1 8 8 8 8 7 8 1 9 7 9 1 8 8 7 7 7 1 266 3 3 2 2 2 1 1 1 1 2 1 1 1 1 1 10 9 10 5 2 4 2 5 11 17 17 6 6 6 6 6 4 4 4 6 18 6 12 12 12 12 12 3 10 10 1 10 12 7 6 7 7 7 1 7 12 12 18 96 96 96 96 96 21 21 21 21 21 21 21 21 21 32 33 33 1 1 1 1 33 32 32 32 16 16 13 16 28 28 19 19 19 19 7 11 3 5 5 5 4 4 3 4 4 4 4 4 1 6 6 3 3 3 3 6 2 2 2 1 1 6 3 5 6 6 3 2 62 2 2 265 95 30 37 123 228 99 265 2 13 5 2 3 3 113 114 2 2 2 2 2 2 2 162 165 165 164 128 164 129 165 164 1 165 133 133 165 160 160 160 62 62 117 117 160 160 159 159 128 128 124 160 160 158 1 1 6 6 4 6 4 6 3 3 3 6 6 6 6 6 6 6 1 1 1 1 1 6 1 5 5 6 5 6 9 4 9 7 3 7 7 3 2 4 4 4 4 4 4 9 3 7 7 2 2 2 9 9 2 6 1 6 5 6 5 4 4 3 4 1 4 4 4 19 19 17 16 2 15 14 9 9 6 10 19 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 // SPDX-License-Identifier: GPL-2.0-only #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <net/switchdev.h> #include "br_private.h" #include "br_private_tunnel.h" static void nbp_vlan_set_vlan_dev_state(struct net_bridge_port *p, u16 vid); static inline int br_vlan_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { const struct net_bridge_vlan *vle = ptr; u16 vid = *(u16 *)arg->key; return vle->vid != vid; } static const struct rhashtable_params br_vlan_rht_params = { .head_offset = offsetof(struct net_bridge_vlan, vnode), .key_offset = offsetof(struct net_bridge_vlan, vid), .key_len = sizeof(u16), .nelem_hint = 3, .max_size = VLAN_N_VID, .obj_cmpfn = br_vlan_cmp, .automatic_shrinking = true, }; static struct net_bridge_vlan *br_vlan_lookup(struct rhashtable *tbl, u16 vid) { return rhashtable_lookup_fast(tbl, &vid, br_vlan_rht_params); } static void __vlan_add_pvid(struct net_bridge_vlan_group *vg, const struct net_bridge_vlan *v) { if (vg->pvid == v->vid) return; smp_wmb(); br_vlan_set_pvid_state(vg, v->state); vg->pvid = v->vid; } static void __vlan_delete_pvid(struct net_bridge_vlan_group *vg, u16 vid) { if (vg->pvid != vid) return; smp_wmb(); vg->pvid = 0; } /* Update the BRIDGE_VLAN_INFO_PVID and BRIDGE_VLAN_INFO_UNTAGGED flags of @v. * If @commit is false, return just whether the BRIDGE_VLAN_INFO_PVID and * BRIDGE_VLAN_INFO_UNTAGGED bits of @flags would produce any change onto @v. */ static bool __vlan_flags_update(struct net_bridge_vlan *v, u16 flags, bool commit) { struct net_bridge_vlan_group *vg; bool change; if (br_vlan_is_master(v)) vg = br_vlan_group(v->br); else vg = nbp_vlan_group(v->port); /* check if anything would be changed on commit */ change = !!(flags & BRIDGE_VLAN_INFO_PVID) == !!(vg->pvid != v->vid) || ((flags ^ v->flags) & BRIDGE_VLAN_INFO_UNTAGGED); if (!commit) goto out; if (flags & BRIDGE_VLAN_INFO_PVID) __vlan_add_pvid(vg, v); else __vlan_delete_pvid(vg, v->vid); if (flags & BRIDGE_VLAN_INFO_UNTAGGED) v->flags |= BRIDGE_VLAN_INFO_UNTAGGED; else v->flags &= ~BRIDGE_VLAN_INFO_UNTAGGED; out: return change; } static bool __vlan_flags_would_change(struct net_bridge_vlan *v, u16 flags) { return __vlan_flags_update(v, flags, false); } static void __vlan_flags_commit(struct net_bridge_vlan *v, u16 flags) { __vlan_flags_update(v, flags, true); } static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br, struct net_bridge_vlan *v, u16 flags, struct netlink_ext_ack *extack) { int err; /* Try switchdev op first. In case it is not supported, fallback to * 8021q add. */ err = br_switchdev_port_vlan_no_foreign_add(dev, v->vid, flags, false, extack); if (err != -EOPNOTSUPP) { v->priv_flags |= BR_VLFLAG_ADDED_BY_SWITCHDEV | BR_VLFLAG_TAGGING_BY_SWITCHDEV; return err; } err = br_switchdev_port_vlan_add(dev, v->vid, flags, false, extack); if (err == -EOPNOTSUPP) return vlan_vid_add(dev, br->vlan_proto, v->vid); v->priv_flags |= BR_VLFLAG_ADDED_BY_SWITCHDEV; return err; } static void __vlan_add_list(struct net_bridge_vlan *v) { struct net_bridge_vlan_group *vg; struct list_head *headp, *hpos; struct net_bridge_vlan *vent; if (br_vlan_is_master(v)) vg = br_vlan_group(v->br); else vg = nbp_vlan_group(v->port); headp = &vg->vlan_list; list_for_each_prev(hpos, headp) { vent = list_entry(hpos, struct net_bridge_vlan, vlist); if (v->vid >= vent->vid) break; } list_add_rcu(&v->vlist, hpos); } static void __vlan_del_list(struct net_bridge_vlan *v) { list_del_rcu(&v->vlist); } static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br, const struct net_bridge_vlan *v) { int err; /* Try switchdev op first. In case it is not supported, fallback to * 8021q del. */ err = br_switchdev_port_vlan_del(dev, v->vid); if (!(v->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV)) vlan_vid_del(dev, br->vlan_proto, v->vid); return err == -EOPNOTSUPP ? 0 : err; } /* Returns a master vlan, if it didn't exist it gets created. In all cases * a reference is taken to the master vlan before returning. */ static struct net_bridge_vlan * br_vlan_get_master(struct net_bridge *br, u16 vid, struct netlink_ext_ack *extack) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *masterv; vg = br_vlan_group(br); masterv = br_vlan_find(vg, vid); if (!masterv) { bool changed; /* missing global ctx, create it now */ if (br_vlan_add(br, vid, 0, &changed, extack)) return NULL; masterv = br_vlan_find(vg, vid); if (WARN_ON(!masterv)) return NULL; refcount_set(&masterv->refcnt, 1); return masterv; } refcount_inc(&masterv->refcnt); return masterv; } static void br_master_vlan_rcu_free(struct rcu_head *rcu) { struct net_bridge_vlan *v; v = container_of(rcu, struct net_bridge_vlan, rcu); WARN_ON(!br_vlan_is_master(v)); free_percpu(v->stats); v->stats = NULL; kfree(v); } static void br_vlan_put_master(struct net_bridge_vlan *masterv) { struct net_bridge_vlan_group *vg; if (!br_vlan_is_master(masterv)) return; vg = br_vlan_group(masterv->br); if (refcount_dec_and_test(&masterv->refcnt)) { rhashtable_remove_fast(&vg->vlan_hash, &masterv->vnode, br_vlan_rht_params); __vlan_del_list(masterv); br_multicast_toggle_one_vlan(masterv, false); br_multicast_ctx_deinit(&masterv->br_mcast_ctx); call_rcu(&masterv->rcu, br_master_vlan_rcu_free); } } static void nbp_vlan_rcu_free(struct rcu_head *rcu) { struct net_bridge_vlan *v; v = container_of(rcu, struct net_bridge_vlan, rcu); WARN_ON(br_vlan_is_master(v)); /* if we had per-port stats configured then free them here */ if (v->priv_flags & BR_VLFLAG_PER_PORT_STATS) free_percpu(v->stats); v->stats = NULL; kfree(v); } static void br_vlan_init_state(struct net_bridge_vlan *v) { struct net_bridge *br; if (br_vlan_is_master(v)) br = v->br; else br = v->port->br; if (br_opt_get(br, BROPT_MST_ENABLED)) { br_mst_vlan_init_state(v); return; } v->state = BR_STATE_FORWARDING; v->msti = 0; } /* This is the shared VLAN add function which works for both ports and bridge * devices. There are four possible calls to this function in terms of the * vlan entry type: * 1. vlan is being added on a port (no master flags, global entry exists) * 2. vlan is being added on a bridge (both master and brentry flags) * 3. vlan is being added on a port, but a global entry didn't exist which * is being created right now (master flag set, brentry flag unset), the * global entry is used for global per-vlan features, but not for filtering * 4. same as 3 but with both master and brentry flags set so the entry * will be used for filtering in both the port and the bridge */ static int __vlan_add(struct net_bridge_vlan *v, u16 flags, struct netlink_ext_ack *extack) { struct net_bridge_vlan *masterv = NULL; struct net_bridge_port *p = NULL; struct net_bridge_vlan_group *vg; struct net_device *dev; struct net_bridge *br; int err; if (br_vlan_is_master(v)) { br = v->br; dev = br->dev; vg = br_vlan_group(br); } else { p = v->port; br = p->br; dev = p->dev; vg = nbp_vlan_group(p); } if (p) { /* Add VLAN to the device filter if it is supported. * This ensures tagged traffic enters the bridge when * promiscuous mode is disabled by br_manage_promisc(). */ err = __vlan_vid_add(dev, br, v, flags, extack); if (err) goto out; /* need to work on the master vlan too */ if (flags & BRIDGE_VLAN_INFO_MASTER) { bool changed; err = br_vlan_add(br, v->vid, flags | BRIDGE_VLAN_INFO_BRENTRY, &changed, extack); if (err) goto out_filt; if (changed) br_vlan_notify(br, NULL, v->vid, 0, RTM_NEWVLAN); } masterv = br_vlan_get_master(br, v->vid, extack); if (!masterv) { err = -ENOMEM; goto out_filt; } v->brvlan = masterv; if (br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)) { v->stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!v->stats) { err = -ENOMEM; goto out_filt; } v->priv_flags |= BR_VLFLAG_PER_PORT_STATS; } else { v->stats = masterv->stats; } br_multicast_port_ctx_init(p, v, &v->port_mcast_ctx); } else { if (br_vlan_should_use(v)) { err = br_switchdev_port_vlan_add(dev, v->vid, flags, false, extack); if (err && err != -EOPNOTSUPP) goto out; } br_multicast_ctx_init(br, v, &v->br_mcast_ctx); v->priv_flags |= BR_VLFLAG_GLOBAL_MCAST_ENABLED; } /* Add the dev mac and count the vlan only if it's usable */ if (br_vlan_should_use(v)) { if (!br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0)) { err = br_fdb_add_local(br, p, dev->dev_addr, v->vid); if (err) { br_err(br, "failed insert local address into bridge forwarding table\n"); goto out_filt; } } vg->num_vlans++; } /* set the state before publishing */ br_vlan_init_state(v); err = rhashtable_lookup_insert_fast(&vg->vlan_hash, &v->vnode, br_vlan_rht_params); if (err) goto out_fdb_insert; __vlan_add_list(v); __vlan_flags_commit(v, flags); br_multicast_toggle_one_vlan(v, true); if (p) nbp_vlan_set_vlan_dev_state(p, v->vid); out: return err; out_fdb_insert: if (br_vlan_should_use(v)) { br_fdb_find_delete_local(br, p, dev->dev_addr, v->vid); vg->num_vlans--; } out_filt: if (p) { __vlan_vid_del(dev, br, v); if (masterv) { if (v->stats && masterv->stats != v->stats) free_percpu(v->stats); v->stats = NULL; br_vlan_put_master(masterv); v->brvlan = NULL; } } else { br_switchdev_port_vlan_del(dev, v->vid); } goto out; } static int __vlan_del(struct net_bridge_vlan *v) { struct net_bridge_vlan *masterv = v; struct net_bridge_vlan_group *vg; struct net_bridge_port *p = NULL; int err = 0; if (br_vlan_is_master(v)) { vg = br_vlan_group(v->br); } else { p = v->port; vg = nbp_vlan_group(v->port); masterv = v->brvlan; } __vlan_delete_pvid(vg, v->vid); if (p) { err = __vlan_vid_del(p->dev, p->br, v); if (err) goto out; } else { err = br_switchdev_port_vlan_del(v->br->dev, v->vid); if (err && err != -EOPNOTSUPP) goto out; err = 0; } if (br_vlan_should_use(v)) { v->flags &= ~BRIDGE_VLAN_INFO_BRENTRY; vg->num_vlans--; } if (masterv != v) { vlan_tunnel_info_del(vg, v); rhashtable_remove_fast(&vg->vlan_hash, &v->vnode, br_vlan_rht_params); __vlan_del_list(v); nbp_vlan_set_vlan_dev_state(p, v->vid); br_multicast_toggle_one_vlan(v, false); br_multicast_port_ctx_deinit(&v->port_mcast_ctx); call_rcu(&v->rcu, nbp_vlan_rcu_free); } br_vlan_put_master(masterv); out: return err; } static void __vlan_group_free(struct net_bridge_vlan_group *vg) { WARN_ON(!list_empty(&vg->vlan_list)); rhashtable_destroy(&vg->vlan_hash); vlan_tunnel_deinit(vg); kfree(vg); } static void __vlan_flush(const struct net_bridge *br, const struct net_bridge_port *p, struct net_bridge_vlan_group *vg) { struct net_bridge_vlan *vlan, *tmp; u16 v_start = 0, v_end = 0; int err; __vlan_delete_pvid(vg, vg->pvid); list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist) { /* take care of disjoint ranges */ if (!v_start) { v_start = vlan->vid; } else if (vlan->vid - v_end != 1) { /* found range end, notify and start next one */ br_vlan_notify(br, p, v_start, v_end, RTM_DELVLAN); v_start = vlan->vid; } v_end = vlan->vid; err = __vlan_del(vlan); if (err) { br_err(br, "port %u(%s) failed to delete vlan %d: %pe\n", (unsigned int) p->port_no, p->dev->name, vlan->vid, ERR_PTR(err)); } } /* notify about the last/whole vlan range */ if (v_start) br_vlan_notify(br, p, v_start, v_end, RTM_DELVLAN); } struct sk_buff *br_handle_vlan(struct net_bridge *br, const struct net_bridge_port *p, struct net_bridge_vlan_group *vg, struct sk_buff *skb) { struct pcpu_sw_netstats *stats; struct net_bridge_vlan *v; u16 vid; /* If this packet was not filtered at input, let it pass */ if (!BR_INPUT_SKB_CB(skb)->vlan_filtered) goto out; /* At this point, we know that the frame was filtered and contains * a valid vlan id. If the vlan id has untagged flag set, * send untagged; otherwise, send tagged. */ br_vlan_get_tag(skb, &vid); v = br_vlan_find(vg, vid); /* Vlan entry must be configured at this point. The * only exception is the bridge is set in promisc mode and the * packet is destined for the bridge device. In this case * pass the packet as is. */ if (!v || !br_vlan_should_use(v)) { if ((br->dev->flags & IFF_PROMISC) && skb->dev == br->dev) { goto out; } else { kfree_skb(skb); return NULL; } } if (br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { stats = this_cpu_ptr(v->stats); u64_stats_update_begin(&stats->syncp); u64_stats_add(&stats->tx_bytes, skb->len); u64_stats_inc(&stats->tx_packets); u64_stats_update_end(&stats->syncp); } /* If the skb will be sent using forwarding offload, the assumption is * that the switchdev will inject the packet into hardware together * with the bridge VLAN, so that it can be forwarded according to that * VLAN. The switchdev should deal with popping the VLAN header in * hardware on each egress port as appropriate. So only strip the VLAN * header if forwarding offload is not being used. */ if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED && !br_switchdev_frame_uses_tx_fwd_offload(skb)) __vlan_hwaccel_clear_tag(skb); if (p && (p->flags & BR_VLAN_TUNNEL) && br_handle_egress_vlan_tunnel(skb, v)) { kfree_skb(skb); return NULL; } out: return skb; } /* Called under RCU */ static bool __allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb, u16 *vid, u8 *state, struct net_bridge_vlan **vlan) { struct pcpu_sw_netstats *stats; struct net_bridge_vlan *v; bool tagged; BR_INPUT_SKB_CB(skb)->vlan_filtered = true; /* If vlan tx offload is disabled on bridge device and frame was * sent from vlan device on the bridge device, it does not have * HW accelerated vlan tag. */ if (unlikely(!skb_vlan_tag_present(skb) && skb->protocol == br->vlan_proto)) { skb = skb_vlan_untag(skb); if (unlikely(!skb)) return false; } if (!br_vlan_get_tag(skb, vid)) { /* Tagged frame */ if (skb->vlan_proto != br->vlan_proto) { /* Protocol-mismatch, empty out vlan_tci for new tag */ skb_push(skb, ETH_HLEN); skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, skb_vlan_tag_get(skb)); if (unlikely(!skb)) return false; skb_pull(skb, ETH_HLEN); skb_reset_mac_len(skb); *vid = 0; tagged = false; } else { tagged = true; } } else { /* Untagged frame */ tagged = false; } if (!*vid) { u16 pvid = br_get_pvid(vg); /* Frame had a tag with VID 0 or did not have a tag. * See if pvid is set on this port. That tells us which * vlan untagged or priority-tagged traffic belongs to. */ if (!pvid) goto drop; /* PVID is set on this port. Any untagged or priority-tagged * ingress frame is considered to belong to this vlan. */ *vid = pvid; if (likely(!tagged)) /* Untagged Frame. */ __vlan_hwaccel_put_tag(skb, br->vlan_proto, pvid); else /* Priority-tagged Frame. * At this point, we know that skb->vlan_tci VID * field was 0. * We update only VID field and preserve PCP field. */ skb->vlan_tci |= pvid; /* if snooping and stats are disabled we can avoid the lookup */ if (!br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) && !br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { if (*state == BR_STATE_FORWARDING) { *state = br_vlan_get_pvid_state(vg); if (!br_vlan_state_allowed(*state, true)) goto drop; } return true; } } v = br_vlan_find(vg, *vid); if (!v || !br_vlan_should_use(v)) goto drop; if (*state == BR_STATE_FORWARDING) { *state = br_vlan_get_state(v); if (!br_vlan_state_allowed(*state, true)) goto drop; } if (br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { stats = this_cpu_ptr(v->stats); u64_stats_update_begin(&stats->syncp); u64_stats_add(&stats->rx_bytes, skb->len); u64_stats_inc(&stats->rx_packets); u64_stats_update_end(&stats->syncp); } *vlan = v; return true; drop: kfree_skb(skb); return false; } bool br_allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb, u16 *vid, u8 *state, struct net_bridge_vlan **vlan) { /* If VLAN filtering is disabled on the bridge, all packets are * permitted. */ *vlan = NULL; if (!br_opt_get(br, BROPT_VLAN_ENABLED)) { BR_INPUT_SKB_CB(skb)->vlan_filtered = false; return true; } return __allowed_ingress(br, vg, skb, vid, state, vlan); } /* Called under RCU. */ bool br_allowed_egress(struct net_bridge_vlan_group *vg, const struct sk_buff *skb) { const struct net_bridge_vlan *v; u16 vid; /* If this packet was not filtered at input, let it pass */ if (!BR_INPUT_SKB_CB(skb)->vlan_filtered) return true; br_vlan_get_tag(skb, &vid); v = br_vlan_find(vg, vid); if (v && br_vlan_should_use(v) && br_vlan_state_allowed(br_vlan_get_state(v), false)) return true; return false; } /* Called under RCU */ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid) { struct net_bridge_vlan_group *vg; struct net_bridge *br = p->br; struct net_bridge_vlan *v; /* If filtering was disabled at input, let it pass. */ if (!br_opt_get(br, BROPT_VLAN_ENABLED)) return true; vg = nbp_vlan_group_rcu(p); if (!vg || !vg->num_vlans) return false; if (!br_vlan_get_tag(skb, vid) && skb->vlan_proto != br->vlan_proto) *vid = 0; if (!*vid) { *vid = br_get_pvid(vg); if (!*vid || !br_vlan_state_allowed(br_vlan_get_pvid_state(vg), true)) return false; return true; } v = br_vlan_find(vg, *vid); if (v && br_vlan_state_allowed(br_vlan_get_state(v), true)) return true; return false; } static int br_vlan_add_existing(struct net_bridge *br, struct net_bridge_vlan_group *vg, struct net_bridge_vlan *vlan, u16 flags, bool *changed, struct netlink_ext_ack *extack) { bool becomes_brentry = false; bool would_change = false; int err; if (!br_vlan_is_brentry(vlan)) { /* Trying to change flags of non-existent bridge vlan */ if (!(flags & BRIDGE_VLAN_INFO_BRENTRY)) return -EINVAL; becomes_brentry = true; } else { would_change = __vlan_flags_would_change(vlan, flags); } /* Master VLANs that aren't brentries weren't notified before, * time to notify them now. */ if (becomes_brentry || would_change) { err = br_switchdev_port_vlan_add(br->dev, vlan->vid, flags, would_change, extack); if (err && err != -EOPNOTSUPP) return err; } if (becomes_brentry) { /* It was only kept for port vlans, now make it real */ err = br_fdb_add_local(br, NULL, br->dev->dev_addr, vlan->vid); if (err) { br_err(br, "failed to insert local address into bridge forwarding table\n"); goto err_fdb_insert; } refcount_inc(&vlan->refcnt); vlan->flags |= BRIDGE_VLAN_INFO_BRENTRY; vg->num_vlans++; *changed = true; br_multicast_toggle_one_vlan(vlan, true); } __vlan_flags_commit(vlan, flags); if (would_change) *changed = true; return 0; err_fdb_insert: br_switchdev_port_vlan_del(br->dev, vlan->vid); return err; } /* Must be protected by RTNL. * Must be called with vid in range from 1 to 4094 inclusive. * changed must be true only if the vlan was created or updated */ int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed, struct netlink_ext_ack *extack) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *vlan; int ret; ASSERT_RTNL(); *changed = false; vg = br_vlan_group(br); vlan = br_vlan_find(vg, vid); if (vlan) return br_vlan_add_existing(br, vg, vlan, flags, changed, extack); vlan = kzalloc_obj(*vlan); if (!vlan) return -ENOMEM; vlan->stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!vlan->stats) { kfree(vlan); return -ENOMEM; } vlan->vid = vid; vlan->flags = flags | BRIDGE_VLAN_INFO_MASTER; vlan->flags &= ~BRIDGE_VLAN_INFO_PVID; vlan->br = br; if (flags & BRIDGE_VLAN_INFO_BRENTRY) refcount_set(&vlan->refcnt, 1); ret = __vlan_add(vlan, flags, extack); if (ret) { free_percpu(vlan->stats); kfree(vlan); } else { *changed = true; } return ret; } /* Must be protected by RTNL. * Must be called with vid in range from 1 to 4094 inclusive. */ int br_vlan_delete(struct net_bridge *br, u16 vid) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; ASSERT_RTNL(); vg = br_vlan_group(br); v = br_vlan_find(vg, vid); if (!v || !br_vlan_is_brentry(v)) return -ENOENT; br_fdb_find_delete_local(br, NULL, br->dev->dev_addr, vid); br_fdb_delete_by_port(br, NULL, vid, 0); vlan_tunnel_info_del(vg, v); return __vlan_del(v); } void br_vlan_flush(struct net_bridge *br) { struct net_bridge_vlan_group *vg; ASSERT_RTNL(); vg = br_vlan_group(br); __vlan_flush(br, NULL, vg); RCU_INIT_POINTER(br->vlgrp, NULL); synchronize_net(); __vlan_group_free(vg); } struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, u16 vid) { if (!vg) return NULL; return br_vlan_lookup(&vg->vlan_hash, vid); } /* Must be protected by RTNL. */ static void recalculate_group_addr(struct net_bridge *br) { if (br_opt_get(br, BROPT_GROUP_ADDR_SET)) return; spin_lock_bh(&br->lock); if (!br_opt_get(br, BROPT_VLAN_ENABLED) || br->vlan_proto == htons(ETH_P_8021Q)) { /* Bridge Group Address */ br->group_addr[5] = 0x00; } else { /* vlan_enabled && ETH_P_8021AD */ /* Provider Bridge Group Address */ br->group_addr[5] = 0x08; } spin_unlock_bh(&br->lock); } /* Must be protected by RTNL. */ void br_recalculate_fwd_mask(struct net_bridge *br) { if (!br_opt_get(br, BROPT_VLAN_ENABLED) || br->vlan_proto == htons(ETH_P_8021Q)) br->group_fwd_mask_required = BR_GROUPFWD_DEFAULT; else /* vlan_enabled && ETH_P_8021AD */ br->group_fwd_mask_required = BR_GROUPFWD_8021AD & ~(1u << br->group_addr[5]); } int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { struct switchdev_attr attr = { .orig_dev = br->dev, .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING, .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP, .u.vlan_filtering = val, }; int err; if (br_opt_get(br, BROPT_VLAN_ENABLED) == !!val) return 0; br_opt_toggle(br, BROPT_VLAN_ENABLED, !!val); err = switchdev_port_attr_set(br->dev, &attr, extack); if (err && err != -EOPNOTSUPP) { br_opt_toggle(br, BROPT_VLAN_ENABLED, !val); return err; } br_manage_promisc(br); recalculate_group_addr(br); br_recalculate_fwd_mask(br); if (!val && br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { br_info(br, "vlan filtering disabled, automatically disabling multicast vlan snooping\n"); br_multicast_toggle_vlan_snooping(br, false, NULL); } return 0; } bool br_vlan_enabled(const struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); return br_opt_get(br, BROPT_VLAN_ENABLED); } EXPORT_SYMBOL_GPL(br_vlan_enabled); int br_vlan_get_proto(const struct net_device *dev, u16 *p_proto) { struct net_bridge *br = netdev_priv(dev); *p_proto = ntohs(br->vlan_proto); return 0; } EXPORT_SYMBOL_GPL(br_vlan_get_proto); int __br_vlan_set_proto(struct net_bridge *br, __be16 proto, struct netlink_ext_ack *extack) { struct switchdev_attr attr = { .orig_dev = br->dev, .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_PROTOCOL, .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP, .u.vlan_protocol = ntohs(proto), }; int err = 0; struct net_bridge_port *p; struct net_bridge_vlan *vlan; struct net_bridge_vlan_group *vg; __be16 oldproto = br->vlan_proto; if (br->vlan_proto == proto) return 0; err = switchdev_port_attr_set(br->dev, &attr, extack); if (err && err != -EOPNOTSUPP) return err; /* Add VLANs for the new proto to the device filter. */ list_for_each_entry(p, &br->port_list, list) { vg = nbp_vlan_group(p); list_for_each_entry(vlan, &vg->vlan_list, vlist) { if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) continue; err = vlan_vid_add(p->dev, proto, vlan->vid); if (err) goto err_filt; } } br->vlan_proto = proto; recalculate_group_addr(br); br_recalculate_fwd_mask(br); /* Delete VLANs for the old proto from the device filter. */ list_for_each_entry(p, &br->port_list, list) { vg = nbp_vlan_group(p); list_for_each_entry(vlan, &vg->vlan_list, vlist) { if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) continue; vlan_vid_del(p->dev, oldproto, vlan->vid); } } return 0; err_filt: attr.u.vlan_protocol = ntohs(oldproto); switchdev_port_attr_set(br->dev, &attr, NULL); list_for_each_entry_continue_reverse(vlan, &vg->vlan_list, vlist) { if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) continue; vlan_vid_del(p->dev, proto, vlan->vid); } list_for_each_entry_continue_reverse(p, &br->port_list, list) { vg = nbp_vlan_group(p); list_for_each_entry(vlan, &vg->vlan_list, vlist) { if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) continue; vlan_vid_del(p->dev, proto, vlan->vid); } } return err; } int br_vlan_set_proto(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { if (!eth_type_vlan(htons(val))) return -EPROTONOSUPPORT; return __br_vlan_set_proto(br, htons(val), extack); } int br_vlan_set_stats(struct net_bridge *br, unsigned long val) { switch (val) { case 0: case 1: br_opt_toggle(br, BROPT_VLAN_STATS_ENABLED, !!val); break; default: return -EINVAL; } return 0; } int br_vlan_set_stats_per_port(struct net_bridge *br, unsigned long val) { struct net_bridge_port *p; /* allow to change the option if there are no port vlans configured */ list_for_each_entry(p, &br->port_list, list) { struct net_bridge_vlan_group *vg = nbp_vlan_group(p); if (vg->num_vlans) return -EBUSY; } switch (val) { case 0: case 1: br_opt_toggle(br, BROPT_VLAN_STATS_PER_PORT, !!val); break; default: return -EINVAL; } return 0; } static bool vlan_default_pvid(struct net_bridge_vlan_group *vg, u16 vid) { struct net_bridge_vlan *v; if (vid != vg->pvid) return false; v = br_vlan_lookup(&vg->vlan_hash, vid); if (v && br_vlan_should_use(v) && (v->flags & BRIDGE_VLAN_INFO_UNTAGGED)) return true; return false; } static void br_vlan_disable_default_pvid(struct net_bridge *br) { struct net_bridge_port *p; u16 pvid = br->default_pvid; /* Disable default_pvid on all ports where it is still * configured. */ if (vlan_default_pvid(br_vlan_group(br), pvid)) { if (!br_vlan_delete(br, pvid)) br_vlan_notify(br, NULL, pvid, 0, RTM_DELVLAN); } list_for_each_entry(p, &br->port_list, list) { if (vlan_default_pvid(nbp_vlan_group(p), pvid) && !nbp_vlan_delete(p, pvid)) br_vlan_notify(br, p, pvid, 0, RTM_DELVLAN); } br->default_pvid = 0; } int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid, struct netlink_ext_ack *extack) { const struct net_bridge_vlan *pvent; struct net_bridge_vlan_group *vg; struct net_bridge_port *p; unsigned long *changed; bool vlchange; u16 old_pvid; int err = 0; if (!pvid) { br_vlan_disable_default_pvid(br); return 0; } changed = bitmap_zalloc(BR_MAX_PORTS, GFP_KERNEL); if (!changed) return -ENOMEM; old_pvid = br->default_pvid; /* Update default_pvid config only if we do not conflict with * user configuration. */ vg = br_vlan_group(br); pvent = br_vlan_find(vg, pvid); if ((!old_pvid || vlan_default_pvid(vg, old_pvid)) && (!pvent || !br_vlan_should_use(pvent))) { err = br_vlan_add(br, pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED | BRIDGE_VLAN_INFO_BRENTRY, &vlchange, extack); if (err) goto out; if (br_vlan_delete(br, old_pvid)) br_vlan_notify(br, NULL, old_pvid, 0, RTM_DELVLAN); br_vlan_notify(br, NULL, pvid, 0, RTM_NEWVLAN); __set_bit(0, changed); } list_for_each_entry(p, &br->port_list, list) { /* Update default_pvid config only if we do not conflict with * user configuration. */ vg = nbp_vlan_group(p); if ((old_pvid && !vlan_default_pvid(vg, old_pvid)) || br_vlan_find(vg, pvid)) continue; err = nbp_vlan_add(p, pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED, &vlchange, extack); if (err) goto err_port; if (nbp_vlan_delete(p, old_pvid)) br_vlan_notify(br, p, old_pvid, 0, RTM_DELVLAN); br_vlan_notify(p->br, p, pvid, 0, RTM_NEWVLAN); __set_bit(p->port_no, changed); } br->default_pvid = pvid; out: bitmap_free(changed); return err; err_port: list_for_each_entry_continue_reverse(p, &br->port_list, list) { if (!test_bit(p->port_no, changed)) continue; if (old_pvid) { nbp_vlan_add(p, old_pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED, &vlchange, NULL); br_vlan_notify(p->br, p, old_pvid, 0, RTM_NEWVLAN); } nbp_vlan_delete(p, pvid); br_vlan_notify(br, p, pvid, 0, RTM_DELVLAN); } if (test_bit(0, changed)) { if (old_pvid) { br_vlan_add(br, old_pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED | BRIDGE_VLAN_INFO_BRENTRY, &vlchange, NULL); br_vlan_notify(br, NULL, old_pvid, 0, RTM_NEWVLAN); } br_vlan_delete(br, pvid); br_vlan_notify(br, NULL, pvid, 0, RTM_DELVLAN); } goto out; } int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { u16 pvid = val; int err = 0; if (val >= VLAN_VID_MASK) return -EINVAL; if (pvid == br->default_pvid) goto out; /* Only allow default pvid change when filtering is disabled */ if (br_opt_get(br, BROPT_VLAN_ENABLED)) { pr_info_once("Please disable vlan filtering to change default_pvid\n"); err = -EPERM; goto out; } err = __br_vlan_set_default_pvid(br, pvid, extack); out: return err; } int br_vlan_init(struct net_bridge *br) { struct net_bridge_vlan_group *vg; int ret = -ENOMEM; vg = kzalloc_obj(*vg); if (!vg) goto out; ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params); if (ret) goto err_rhtbl; ret = vlan_tunnel_init(vg); if (ret) goto err_tunnel_init; INIT_LIST_HEAD(&vg->vlan_list); br->vlan_proto = htons(ETH_P_8021Q); br->default_pvid = 1; rcu_assign_pointer(br->vlgrp, vg); out: return ret; err_tunnel_init: rhashtable_destroy(&vg->vlan_hash); err_rhtbl: kfree(vg); goto out; } int nbp_vlan_init(struct net_bridge_port *p, struct netlink_ext_ack *extack) { struct switchdev_attr attr = { .orig_dev = p->br->dev, .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING, .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP, .u.vlan_filtering = br_opt_get(p->br, BROPT_VLAN_ENABLED), }; struct net_bridge_vlan_group *vg; int ret = -ENOMEM; vg = kzalloc_obj(struct net_bridge_vlan_group); if (!vg) goto out; ret = switchdev_port_attr_set(p->dev, &attr, extack); if (ret && ret != -EOPNOTSUPP) goto err_vlan_enabled; ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params); if (ret) goto err_rhtbl; ret = vlan_tunnel_init(vg); if (ret) goto err_tunnel_init; INIT_LIST_HEAD(&vg->vlan_list); rcu_assign_pointer(p->vlgrp, vg); if (p->br->default_pvid) { bool changed; ret = nbp_vlan_add(p, p->br->default_pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED, &changed, extack); if (ret) goto err_vlan_add; br_vlan_notify(p->br, p, p->br->default_pvid, 0, RTM_NEWVLAN); } out: return ret; err_vlan_add: RCU_INIT_POINTER(p->vlgrp, NULL); synchronize_rcu(); vlan_tunnel_deinit(vg); err_tunnel_init: rhashtable_destroy(&vg->vlan_hash); err_rhtbl: err_vlan_enabled: kfree(vg); goto out; } /* Must be protected by RTNL. * Must be called with vid in range from 1 to 4094 inclusive. * changed must be true only if the vlan was created or updated */ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags, bool *changed, struct netlink_ext_ack *extack) { struct net_bridge_vlan *vlan; int ret; ASSERT_RTNL(); *changed = false; vlan = br_vlan_find(nbp_vlan_group(port), vid); if (vlan) { bool would_change = __vlan_flags_would_change(vlan, flags); if (would_change) { /* Pass the flags to the hardware bridge */ ret = br_switchdev_port_vlan_add(port->dev, vid, flags, true, extack); if (ret && ret != -EOPNOTSUPP) return ret; } __vlan_flags_commit(vlan, flags); *changed = would_change; return 0; } vlan = kzalloc_obj(*vlan); if (!vlan) return -ENOMEM; vlan->vid = vid; vlan->port = port; ret = __vlan_add(vlan, flags, extack); if (ret) kfree(vlan); else *changed = true; return ret; } /* Must be protected by RTNL. * Must be called with vid in range from 1 to 4094 inclusive. */ int nbp_vlan_delete(struct net_bridge_port *port, u16 vid) { struct net_bridge_vlan *v; ASSERT_RTNL(); v = br_vlan_find(nbp_vlan_group(port), vid); if (!v) return -ENOENT; br_fdb_find_delete_local(port->br, port, port->dev->dev_addr, vid); br_fdb_delete_by_port(port->br, port, vid, 0); return __vlan_del(v); } void nbp_vlan_flush(struct net_bridge_port *port) { struct net_bridge_vlan_group *vg; ASSERT_RTNL(); vg = nbp_vlan_group(port); __vlan_flush(port->br, port, vg); RCU_INIT_POINTER(port->vlgrp, NULL); synchronize_net(); __vlan_group_free(vg); } void br_vlan_get_stats(const struct net_bridge_vlan *v, struct pcpu_sw_netstats *stats) { int i; memset(stats, 0, sizeof(*stats)); for_each_possible_cpu(i) { u64 rxpackets, rxbytes, txpackets, txbytes; struct pcpu_sw_netstats *cpu_stats; unsigned int start; cpu_stats = per_cpu_ptr(v->stats, i); do { start = u64_stats_fetch_begin(&cpu_stats->syncp); rxpackets = u64_stats_read(&cpu_stats->rx_packets); rxbytes = u64_stats_read(&cpu_stats->rx_bytes); txbytes = u64_stats_read(&cpu_stats->tx_bytes); txpackets = u64_stats_read(&cpu_stats->tx_packets); } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); u64_stats_add(&stats->rx_packets, rxpackets); u64_stats_add(&stats->rx_bytes, rxbytes); u64_stats_add(&stats->tx_bytes, txbytes); u64_stats_add(&stats->tx_packets, txpackets); } } int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid) { struct net_bridge_vlan_group *vg; struct net_bridge_port *p; ASSERT_RTNL(); p = br_port_get_check_rtnl(dev); if (p) vg = nbp_vlan_group(p); else if (netif_is_bridge_master(dev)) vg = br_vlan_group(netdev_priv(dev)); else return -EINVAL; *p_pvid = br_get_pvid(vg); return 0; } EXPORT_SYMBOL_GPL(br_vlan_get_pvid); int br_vlan_get_pvid_rcu(const struct net_device *dev, u16 *p_pvid) { struct net_bridge_vlan_group *vg; struct net_bridge_port *p; p = br_port_get_check_rcu(dev); if (p) vg = nbp_vlan_group_rcu(p); else if (netif_is_bridge_master(dev)) vg = br_vlan_group_rcu(netdev_priv(dev)); else return -EINVAL; *p_pvid = br_get_pvid(vg); return 0; } EXPORT_SYMBOL_GPL(br_vlan_get_pvid_rcu); void br_vlan_fill_forward_path_pvid(struct net_bridge *br, struct net_device_path_ctx *ctx, struct net_device_path *path) { struct net_bridge_vlan_group *vg; int idx = ctx->num_vlans - 1; u16 vid; path->bridge.vlan_mode = DEV_PATH_BR_VLAN_KEEP; if (!br_opt_get(br, BROPT_VLAN_ENABLED)) return; vg = br_vlan_group_rcu(br); if (idx >= 0 && ctx->vlan[idx].proto == br->vlan_proto) { vid = ctx->vlan[idx].id; } else { path->bridge.vlan_mode = DEV_PATH_BR_VLAN_TAG; vid = br_get_pvid(vg); } path->bridge.vlan_id = vid; path->bridge.vlan_proto = br->vlan_proto; } int br_vlan_fill_forward_path_mode(struct net_bridge *br, struct net_bridge_port *dst, struct net_device_path *path) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; if (!br_opt_get(br, BROPT_VLAN_ENABLED)) return 0; vg = nbp_vlan_group_rcu(dst); v = br_vlan_find(vg, path->bridge.vlan_id); if (!v || !br_vlan_should_use(v)) return -EINVAL; if (!(v->flags & BRIDGE_VLAN_INFO_UNTAGGED)) return 0; if (path->bridge.vlan_mode == DEV_PATH_BR_VLAN_TAG) path->bridge.vlan_mode = DEV_PATH_BR_VLAN_KEEP; else if (v->priv_flags & BR_VLFLAG_TAGGING_BY_SWITCHDEV) path->bridge.vlan_mode = DEV_PATH_BR_VLAN_UNTAG_HW; else path->bridge.vlan_mode = DEV_PATH_BR_VLAN_UNTAG; return 0; } int br_vlan_get_info(const struct net_device *dev, u16 vid, struct bridge_vlan_info *p_vinfo) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; struct net_bridge_port *p; ASSERT_RTNL(); p = br_port_get_check_rtnl(dev); if (p) vg = nbp_vlan_group(p); else if (netif_is_bridge_master(dev)) vg = br_vlan_group(netdev_priv(dev)); else return -EINVAL; v = br_vlan_find(vg, vid); if (!v) return -ENOENT; p_vinfo->vid = vid; p_vinfo->flags = v->flags; if (vid == br_get_pvid(vg)) p_vinfo->flags |= BRIDGE_VLAN_INFO_PVID; return 0; } EXPORT_SYMBOL_GPL(br_vlan_get_info); int br_vlan_get_info_rcu(const struct net_device *dev, u16 vid, struct bridge_vlan_info *p_vinfo) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; struct net_bridge_port *p; p = br_port_get_check_rcu(dev); if (p) vg = nbp_vlan_group_rcu(p); else if (netif_is_bridge_master(dev)) vg = br_vlan_group_rcu(netdev_priv(dev)); else return -EINVAL; v = br_vlan_find(vg, vid); if (!v) return -ENOENT; p_vinfo->vid = vid; p_vinfo->flags = v->flags; if (vid == br_get_pvid(vg)) p_vinfo->flags |= BRIDGE_VLAN_INFO_PVID; return 0; } EXPORT_SYMBOL_GPL(br_vlan_get_info_rcu); static int br_vlan_is_bind_vlan_dev(const struct net_device *dev) { return is_vlan_dev(dev) && !!(vlan_dev_priv(dev)->flags & VLAN_FLAG_BRIDGE_BINDING); } static int br_vlan_is_bind_vlan_dev_fn(struct net_device *dev, __always_unused struct netdev_nested_priv *priv) { return br_vlan_is_bind_vlan_dev(dev); } static bool br_vlan_has_upper_bind_vlan_dev(struct net_device *dev) { int found; rcu_read_lock(); found = netdev_walk_all_upper_dev_rcu(dev, br_vlan_is_bind_vlan_dev_fn, NULL); rcu_read_unlock(); return !!found; } struct br_vlan_bind_walk_data { u16 vid; struct net_device *result; }; static int br_vlan_match_bind_vlan_dev_fn(struct net_device *dev, struct netdev_nested_priv *priv) { struct br_vlan_bind_walk_data *data = priv->data; int found = 0; if (br_vlan_is_bind_vlan_dev(dev) && vlan_dev_priv(dev)->vlan_id == data->vid) { data->result = dev; found = 1; } return found; } static struct net_device * br_vlan_get_upper_bind_vlan_dev(struct net_device *dev, u16 vid) { struct br_vlan_bind_walk_data data = { .vid = vid, }; struct netdev_nested_priv priv = { .data = (void *)&data, }; rcu_read_lock(); netdev_walk_all_upper_dev_rcu(dev, br_vlan_match_bind_vlan_dev_fn, &priv); rcu_read_unlock(); return data.result; } static bool br_vlan_is_dev_up(const struct net_device *dev) { return !!(dev->flags & IFF_UP) && netif_oper_up(dev); } static void br_vlan_set_vlan_dev_state(const struct net_bridge *br, struct net_device *vlan_dev) { u16 vid = vlan_dev_priv(vlan_dev)->vlan_id; struct net_bridge_vlan_group *vg; struct net_bridge_port *p; bool has_carrier = false; if (!netif_carrier_ok(br->dev)) { netif_carrier_off(vlan_dev); return; } list_for_each_entry(p, &br->port_list, list) { vg = nbp_vlan_group(p); if (br_vlan_find(vg, vid) && br_vlan_is_dev_up(p->dev)) { has_carrier = true; break; } } if (has_carrier) netif_carrier_on(vlan_dev); else netif_carrier_off(vlan_dev); } static void br_vlan_set_all_vlan_dev_state(struct net_bridge_port *p) { struct net_bridge_vlan_group *vg = nbp_vlan_group(p); struct net_bridge_vlan *vlan; struct net_device *vlan_dev; list_for_each_entry(vlan, &vg->vlan_list, vlist) { vlan_dev = br_vlan_get_upper_bind_vlan_dev(p->br->dev, vlan->vid); if (vlan_dev) { if (br_vlan_is_dev_up(p->dev)) { if (netif_carrier_ok(p->br->dev)) netif_carrier_on(vlan_dev); } else { br_vlan_set_vlan_dev_state(p->br, vlan_dev); } } } } static void br_vlan_toggle_bridge_binding(struct net_device *br_dev, bool enable) { struct net_bridge *br = netdev_priv(br_dev); if (enable) br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING, true); else br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING, br_vlan_has_upper_bind_vlan_dev(br_dev)); } static void br_vlan_upper_change(struct net_device *dev, struct net_device *upper_dev, bool linking) { struct net_bridge *br = netdev_priv(dev); if (!br_vlan_is_bind_vlan_dev(upper_dev)) return; br_vlan_toggle_bridge_binding(dev, linking); if (linking) br_vlan_set_vlan_dev_state(br, upper_dev); } struct br_vlan_link_state_walk_data { struct net_bridge *br; }; static int br_vlan_link_state_change_fn(struct net_device *vlan_dev, struct netdev_nested_priv *priv) { struct br_vlan_link_state_walk_data *data = priv->data; if (br_vlan_is_bind_vlan_dev(vlan_dev)) br_vlan_set_vlan_dev_state(data->br, vlan_dev); return 0; } static void br_vlan_link_state_change(struct net_device *dev, struct net_bridge *br) { struct br_vlan_link_state_walk_data data = { .br = br }; struct netdev_nested_priv priv = { .data = (void *)&data, }; rcu_read_lock(); netdev_walk_all_upper_dev_rcu(dev, br_vlan_link_state_change_fn, &priv); rcu_read_unlock(); } /* Must be protected by RTNL. */ static void nbp_vlan_set_vlan_dev_state(struct net_bridge_port *p, u16 vid) { struct net_device *vlan_dev; if (!br_opt_get(p->br, BROPT_VLAN_BRIDGE_BINDING)) return; vlan_dev = br_vlan_get_upper_bind_vlan_dev(p->br->dev, vid); if (vlan_dev) br_vlan_set_vlan_dev_state(p->br, vlan_dev); } /* Must be protected by RTNL. */ int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr) { struct netdev_notifier_changeupper_info *info; struct net_bridge *br = netdev_priv(dev); int vlcmd = 0, ret = 0; bool changed = false; switch (event) { case NETDEV_REGISTER: ret = br_vlan_add(br, br->default_pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED | BRIDGE_VLAN_INFO_BRENTRY, &changed, NULL); vlcmd = RTM_NEWVLAN; break; case NETDEV_UNREGISTER: changed = !br_vlan_delete(br, br->default_pvid); vlcmd = RTM_DELVLAN; break; case NETDEV_CHANGEUPPER: info = ptr; br_vlan_upper_change(dev, info->upper_dev, info->linking); break; case NETDEV_CHANGE: case NETDEV_UP: if (!br_opt_get(br, BROPT_VLAN_BRIDGE_BINDING)) break; br_vlan_link_state_change(dev, br); break; } if (changed) br_vlan_notify(br, NULL, br->default_pvid, 0, vlcmd); return ret; } void br_vlan_vlan_upper_event(struct net_device *br_dev, struct net_device *vlan_dev, unsigned long event) { struct vlan_dev_priv *vlan = vlan_dev_priv(vlan_dev); struct net_bridge *br = netdev_priv(br_dev); bool bridge_binding; switch (event) { case NETDEV_CHANGE: case NETDEV_UP: break; default: return; } bridge_binding = vlan->flags & VLAN_FLAG_BRIDGE_BINDING; br_vlan_toggle_bridge_binding(br_dev, bridge_binding); if (bridge_binding) br_vlan_set_vlan_dev_state(br, vlan_dev); else if (!bridge_binding && netif_carrier_ok(br_dev)) netif_carrier_on(vlan_dev); } /* Must be protected by RTNL. */ void br_vlan_port_event(struct net_bridge_port *p, unsigned long event) { if (!br_opt_get(p->br, BROPT_VLAN_BRIDGE_BINDING)) return; switch (event) { case NETDEV_CHANGE: case NETDEV_DOWN: case NETDEV_UP: br_vlan_set_all_vlan_dev_state(p); break; } } static bool br_vlan_stats_fill(struct sk_buff *skb, const struct net_bridge_vlan *v) { struct pcpu_sw_netstats stats; struct nlattr *nest; nest = nla_nest_start(skb, BRIDGE_VLANDB_ENTRY_STATS); if (!nest) return false; br_vlan_get_stats(v, &stats); if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_RX_BYTES, u64_stats_read(&stats.rx_bytes), BRIDGE_VLANDB_STATS_PAD) || nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_RX_PACKETS, u64_stats_read(&stats.rx_packets), BRIDGE_VLANDB_STATS_PAD) || nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_TX_BYTES, u64_stats_read(&stats.tx_bytes), BRIDGE_VLANDB_STATS_PAD) || nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_TX_PACKETS, u64_stats_read(&stats.tx_packets), BRIDGE_VLANDB_STATS_PAD)) goto out_err; nla_nest_end(skb, nest); return true; out_err: nla_nest_cancel(skb, nest); return false; } /* v_opts is used to dump the options which must be equal in the whole range */ static bool br_vlan_fill_vids(struct sk_buff *skb, u16 vid, u16 vid_range, const struct net_bridge_vlan *v_opts, const struct net_bridge_port *p, u16 flags, bool dump_stats) { struct bridge_vlan_info info; struct nlattr *nest; nest = nla_nest_start(skb, BRIDGE_VLANDB_ENTRY); if (!nest) return false; memset(&info, 0, sizeof(info)); info.vid = vid; if (flags & BRIDGE_VLAN_INFO_UNTAGGED) info.flags |= BRIDGE_VLAN_INFO_UNTAGGED; if (flags & BRIDGE_VLAN_INFO_PVID) info.flags |= BRIDGE_VLAN_INFO_PVID; if (nla_put(skb, BRIDGE_VLANDB_ENTRY_INFO, sizeof(info), &info)) goto out_err; if (vid_range && vid < vid_range && !(flags & BRIDGE_VLAN_INFO_PVID) && nla_put_u16(skb, BRIDGE_VLANDB_ENTRY_RANGE, vid_range)) goto out_err; if (v_opts) { if (!br_vlan_opts_fill(skb, v_opts, p)) goto out_err; if (dump_stats && !br_vlan_stats_fill(skb, v_opts)) goto out_err; } nla_nest_end(skb, nest); return true; out_err: nla_nest_cancel(skb, nest); return false; } static size_t rtnl_vlan_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct br_vlan_msg)) + nla_total_size(0) /* BRIDGE_VLANDB_ENTRY */ + nla_total_size(sizeof(u16)) /* BRIDGE_VLANDB_ENTRY_RANGE */ + nla_total_size(sizeof(struct bridge_vlan_info)) /* BRIDGE_VLANDB_ENTRY_INFO */ + br_vlan_opts_nl_size(); /* bridge vlan options */ } void br_vlan_notify(const struct net_bridge *br, const struct net_bridge_port *p, u16 vid, u16 vid_range, int cmd) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v = NULL; struct br_vlan_msg *bvm; struct nlmsghdr *nlh; struct sk_buff *skb; int err = -ENOBUFS; struct net *net; u16 flags = 0; int ifindex; /* right now notifications are done only with rtnl held */ ASSERT_RTNL(); if (p) { ifindex = p->dev->ifindex; vg = nbp_vlan_group(p); net = dev_net(p->dev); } else { ifindex = br->dev->ifindex; vg = br_vlan_group(br); net = dev_net(br->dev); } skb = nlmsg_new(rtnl_vlan_nlmsg_size(), GFP_KERNEL); if (!skb) goto out_err; err = -EMSGSIZE; nlh = nlmsg_put(skb, 0, 0, cmd, sizeof(*bvm), 0); if (!nlh) goto out_err; bvm = nlmsg_data(nlh); memset(bvm, 0, sizeof(*bvm)); bvm->family = AF_BRIDGE; bvm->ifindex = ifindex; switch (cmd) { case RTM_NEWVLAN: /* need to find the vlan due to flags/options */ v = br_vlan_find(vg, vid); if (!v || !br_vlan_should_use(v)) goto out_kfree; flags = v->flags; if (br_get_pvid(vg) == v->vid) flags |= BRIDGE_VLAN_INFO_PVID; break; case RTM_DELVLAN: break; default: goto out_kfree; } if (!br_vlan_fill_vids(skb, vid, vid_range, v, p, flags, false)) goto out_err; nlmsg_end(skb, nlh); rtnl_notify(skb, net, 0, RTNLGRP_BRVLAN, NULL, GFP_KERNEL); return; out_err: rtnl_set_sk_err(net, RTNLGRP_BRVLAN, err); out_kfree: kfree_skb(skb); } /* check if v_curr can enter a range ending in range_end */ bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, const struct net_bridge_vlan *range_end) { return v_curr->vid - range_end->vid == 1 && range_end->flags == v_curr->flags && br_vlan_opts_eq_range(v_curr, range_end); } static int br_vlan_dump_dev(const struct net_device *dev, struct sk_buff *skb, struct netlink_callback *cb, u32 dump_flags) { struct net_bridge_vlan *v, *range_start = NULL, *range_end = NULL; bool dump_global = !!(dump_flags & BRIDGE_VLANDB_DUMPF_GLOBAL); bool dump_stats = !!(dump_flags & BRIDGE_VLANDB_DUMPF_STATS); struct net_bridge_vlan_group *vg; int idx = 0, s_idx = cb->args[1]; struct nlmsghdr *nlh = NULL; struct net_bridge_port *p; struct br_vlan_msg *bvm; struct net_bridge *br; int err = 0; u16 pvid; if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev)) return -EINVAL; if (netif_is_bridge_master(dev)) { br = netdev_priv(dev); vg = br_vlan_group_rcu(br); p = NULL; } else { /* global options are dumped only for bridge devices */ if (dump_global) return 0; p = br_port_get_rcu(dev); if (WARN_ON(!p)) return -EINVAL; vg = nbp_vlan_group_rcu(p); br = p->br; } if (!vg) return 0; nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWVLAN, sizeof(*bvm), NLM_F_MULTI); if (!nlh) return -EMSGSIZE; bvm = nlmsg_data(nlh); memset(bvm, 0, sizeof(*bvm)); bvm->family = PF_BRIDGE; bvm->ifindex = dev->ifindex; pvid = br_get_pvid(vg); /* idx must stay at range's beginning until it is filled in */ list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { if (!dump_global && !br_vlan_should_use(v)) continue; if (idx < s_idx) { idx++; continue; } if (!range_start) { range_start = v; range_end = v; continue; } if (dump_global) { if (br_vlan_global_opts_can_enter_range(v, range_end)) goto update_end; if (!br_vlan_global_opts_fill(skb, range_start->vid, range_end->vid, range_start)) { err = -EMSGSIZE; break; } /* advance number of filled vlans */ idx += range_end->vid - range_start->vid + 1; range_start = v; } else if (dump_stats || v->vid == pvid || !br_vlan_can_enter_range(v, range_end)) { u16 vlan_flags = br_vlan_flags(range_start, pvid); if (!br_vlan_fill_vids(skb, range_start->vid, range_end->vid, range_start, p, vlan_flags, dump_stats)) { err = -EMSGSIZE; break; } /* advance number of filled vlans */ idx += range_end->vid - range_start->vid + 1; range_start = v; } update_end: range_end = v; } /* err will be 0 and range_start will be set in 3 cases here: * - first vlan (range_start == range_end) * - last vlan (range_start == range_end, not in range) * - last vlan range (range_start != range_end, in range) */ if (!err && range_start) { if (dump_global && !br_vlan_global_opts_fill(skb, range_start->vid, range_end->vid, range_start)) err = -EMSGSIZE; else if (!dump_global && !br_vlan_fill_vids(skb, range_start->vid, range_end->vid, range_start, p, br_vlan_flags(range_start, pvid), dump_stats)) err = -EMSGSIZE; } cb->args[1] = err ? idx : 0; nlmsg_end(skb, nlh); return err; } static const struct nla_policy br_vlan_db_dump_pol[BRIDGE_VLANDB_DUMP_MAX + 1] = { [BRIDGE_VLANDB_DUMP_FLAGS] = { .type = NLA_U32 }, }; static int br_vlan_rtm_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct nlattr *dtb[BRIDGE_VLANDB_DUMP_MAX + 1]; int idx = 0, err = 0, s_idx = cb->args[0]; struct net *net = sock_net(skb->sk); struct br_vlan_msg *bvm; struct net_device *dev; u32 dump_flags = 0; err = nlmsg_parse(cb->nlh, sizeof(*bvm), dtb, BRIDGE_VLANDB_DUMP_MAX, br_vlan_db_dump_pol, cb->extack); if (err < 0) return err; bvm = nlmsg_data(cb->nlh); if (dtb[BRIDGE_VLANDB_DUMP_FLAGS]) dump_flags = nla_get_u32(dtb[BRIDGE_VLANDB_DUMP_FLAGS]); rcu_read_lock(); if (bvm->ifindex) { dev = dev_get_by_index_rcu(net, bvm->ifindex); if (!dev) { err = -ENODEV; goto out_err; } err = br_vlan_dump_dev(dev, skb, cb, dump_flags); /* if the dump completed without an error we return 0 here */ if (err != -EMSGSIZE) goto out_err; } else { for_each_netdev_rcu(net, dev) { if (idx < s_idx) goto skip; err = br_vlan_dump_dev(dev, skb, cb, dump_flags); if (err == -EMSGSIZE) break; skip: idx++; } } cb->args[0] = idx; rcu_read_unlock(); return skb->len; out_err: rcu_read_unlock(); return err; } static const struct nla_policy br_vlan_db_policy[BRIDGE_VLANDB_ENTRY_MAX + 1] = { [BRIDGE_VLANDB_ENTRY_INFO] = NLA_POLICY_EXACT_LEN(sizeof(struct bridge_vlan_info)), [BRIDGE_VLANDB_ENTRY_RANGE] = { .type = NLA_U16 }, [BRIDGE_VLANDB_ENTRY_STATE] = { .type = NLA_U8 }, [BRIDGE_VLANDB_ENTRY_TUNNEL_INFO] = { .type = NLA_NESTED }, [BRIDGE_VLANDB_ENTRY_MCAST_ROUTER] = { .type = NLA_U8 }, [BRIDGE_VLANDB_ENTRY_MCAST_N_GROUPS] = { .type = NLA_REJECT }, [BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS] = { .type = NLA_U32 }, [BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS] = NLA_POLICY_MAX(NLA_U8, 1), }; static int br_vlan_rtm_process_one(struct net_device *dev, const struct nlattr *attr, int cmd, struct netlink_ext_ack *extack) { struct bridge_vlan_info *vinfo, vrange_end, *vinfo_last = NULL; struct nlattr *tb[BRIDGE_VLANDB_ENTRY_MAX + 1]; bool changed = false, skip_processing = false; struct net_bridge_vlan_group *vg; struct net_bridge_port *p = NULL; int err = 0, cmdmap = 0; struct net_bridge *br; if (netif_is_bridge_master(dev)) { br = netdev_priv(dev); vg = br_vlan_group(br); } else { p = br_port_get_rtnl(dev); if (WARN_ON(!p)) return -ENODEV; br = p->br; vg = nbp_vlan_group(p); } if (WARN_ON(!vg)) return -ENODEV; err = nla_parse_nested(tb, BRIDGE_VLANDB_ENTRY_MAX, attr, br_vlan_db_policy, extack); if (err) return err; if (!tb[BRIDGE_VLANDB_ENTRY_INFO]) { NL_SET_ERR_MSG_MOD(extack, "Missing vlan entry info"); return -EINVAL; } memset(&vrange_end, 0, sizeof(vrange_end)); vinfo = nla_data(tb[BRIDGE_VLANDB_ENTRY_INFO]); if (vinfo->flags & (BRIDGE_VLAN_INFO_RANGE_BEGIN | BRIDGE_VLAN_INFO_RANGE_END)) { NL_SET_ERR_MSG_MOD(extack, "Old-style vlan ranges are not allowed when using RTM vlan calls"); return -EINVAL; } if (!br_vlan_valid_id(vinfo->vid, extack)) return -EINVAL; if (tb[BRIDGE_VLANDB_ENTRY_RANGE]) { vrange_end.vid = nla_get_u16(tb[BRIDGE_VLANDB_ENTRY_RANGE]); /* validate user-provided flags without RANGE_BEGIN */ vrange_end.flags = BRIDGE_VLAN_INFO_RANGE_END | vinfo->flags; vinfo->flags |= BRIDGE_VLAN_INFO_RANGE_BEGIN; /* vinfo_last is the range start, vinfo the range end */ vinfo_last = vinfo; vinfo = &vrange_end; if (!br_vlan_valid_id(vinfo->vid, extack) || !br_vlan_valid_range(vinfo, vinfo_last, extack)) return -EINVAL; } switch (cmd) { case RTM_NEWVLAN: cmdmap = RTM_SETLINK; skip_processing = !!(vinfo->flags & BRIDGE_VLAN_INFO_ONLY_OPTS); break; case RTM_DELVLAN: cmdmap = RTM_DELLINK; break; } if (!skip_processing) { struct bridge_vlan_info *tmp_last = vinfo_last; /* br_process_vlan_info may overwrite vinfo_last */ err = br_process_vlan_info(br, p, cmdmap, vinfo, &tmp_last, &changed, extack); /* notify first if anything changed */ if (changed) br_ifinfo_notify(cmdmap, br, p); if (err) return err; } /* deal with options */ if (cmd == RTM_NEWVLAN) { struct net_bridge_vlan *range_start, *range_end; if (vinfo_last) { range_start = br_vlan_find(vg, vinfo_last->vid); range_end = br_vlan_find(vg, vinfo->vid); } else { range_start = br_vlan_find(vg, vinfo->vid); range_end = range_start; } err = br_vlan_process_options(br, p, range_start, range_end, tb, extack); } return err; } static int br_vlan_rtm_process(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct br_vlan_msg *bvm; struct net_device *dev; struct nlattr *attr; int err, vlans = 0; int rem; /* this should validate the header and check for remaining bytes */ err = nlmsg_parse(nlh, sizeof(*bvm), NULL, BRIDGE_VLANDB_MAX, NULL, extack); if (err < 0) return err; bvm = nlmsg_data(nlh); dev = __dev_get_by_index(net, bvm->ifindex); if (!dev) return -ENODEV; if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev)) { NL_SET_ERR_MSG_MOD(extack, "The device is not a valid bridge or bridge port"); return -EINVAL; } nlmsg_for_each_attr(attr, nlh, sizeof(*bvm), rem) { switch (nla_type(attr)) { case BRIDGE_VLANDB_ENTRY: err = br_vlan_rtm_process_one(dev, attr, nlh->nlmsg_type, extack); break; case BRIDGE_VLANDB_GLOBAL_OPTIONS: err = br_vlan_rtm_process_global_options(dev, attr, nlh->nlmsg_type, extack); break; default: continue; } vlans++; if (err) break; } if (!vlans) { NL_SET_ERR_MSG_MOD(extack, "No vlans found to process"); err = -EINVAL; } return err; } static const struct rtnl_msg_handler br_vlan_rtnl_msg_handlers[] = { {THIS_MODULE, PF_BRIDGE, RTM_NEWVLAN, br_vlan_rtm_process, NULL, 0}, {THIS_MODULE, PF_BRIDGE, RTM_DELVLAN, br_vlan_rtm_process, NULL, 0}, {THIS_MODULE, PF_BRIDGE, RTM_GETVLAN, NULL, br_vlan_rtm_dump, 0}, }; int br_vlan_rtnl_init(void) { return rtnl_register_many(br_vlan_rtnl_msg_handlers); } void br_vlan_rtnl_uninit(void) { rtnl_unregister_many(br_vlan_rtnl_msg_handlers); }
11 11 11 11 11 11 11 11 11 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 // SPDX-License-Identifier: GPL-2.0 #include <linux/cred.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/quotaops.h> #include <linux/sched.h> #include <linux/slab.h> #include <net/netlink.h> #include <net/genetlink.h> static const struct genl_multicast_group quota_mcgrps[] = { { .name = "events", }, }; /* Netlink family structure for quota */ static struct genl_family quota_genl_family __ro_after_init = { .module = THIS_MODULE, .hdrsize = 0, .name = "VFS_DQUOT", .version = 1, .maxattr = QUOTA_NL_A_MAX, .mcgrps = quota_mcgrps, .n_mcgrps = ARRAY_SIZE(quota_mcgrps), }; /** * quota_send_warning - Send warning to userspace about exceeded quota * @qid: The kernel internal quota identifier. * @dev: The device on which the fs is mounted (sb->s_dev) * @warntype: The type of the warning: QUOTA_NL_... * * This can be used by filesystems (including those which don't use * dquot) to send a message to userspace relating to quota limits. * */ void quota_send_warning(struct kqid qid, dev_t dev, const char warntype) { static atomic_t seq; struct sk_buff *skb; void *msg_head; int ret; int msg_size = 4 * nla_total_size(sizeof(u32)) + 2 * nla_total_size_64bit(sizeof(u64)); /* We have to allocate using GFP_NOFS as we are called from a * filesystem performing write and thus further recursion into * the fs to free some data could cause deadlocks. */ skb = genlmsg_new(msg_size, GFP_NOFS); if (!skb) { printk(KERN_ERR "VFS: Not enough memory to send quota warning.\n"); return; } msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq), &quota_genl_family, 0, QUOTA_NL_C_WARNING); if (!msg_head) { printk(KERN_ERR "VFS: Cannot store netlink header in quota warning.\n"); goto err_out; } ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, qid.type); if (ret) goto attr_err_out; ret = nla_put_u64_64bit(skb, QUOTA_NL_A_EXCESS_ID, from_kqid_munged(&init_user_ns, qid), QUOTA_NL_A_PAD); if (ret) goto attr_err_out; ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype); if (ret) goto attr_err_out; ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR, MAJOR(dev)); if (ret) goto attr_err_out; ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev)); if (ret) goto attr_err_out; ret = nla_put_u64_64bit(skb, QUOTA_NL_A_CAUSED_ID, from_kuid_munged(&init_user_ns, current_uid()), QUOTA_NL_A_PAD); if (ret) goto attr_err_out; genlmsg_end(skb, msg_head); genlmsg_multicast(&quota_genl_family, skb, 0, 0, GFP_NOFS); return; attr_err_out: printk(KERN_ERR "VFS: Not enough space to compose quota message!\n"); err_out: kfree_skb(skb); } EXPORT_SYMBOL(quota_send_warning); static int __init quota_init(void) { if (genl_register_family(&quota_genl_family) != 0) printk(KERN_ERR "VFS: Failed to create quota netlink interface.\n"); return 0; }; fs_initcall(quota_init);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Resilient Queued Spin Lock defines * * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates. * * Authors: Kumar Kartikeya Dwivedi <memxor@gmail.com> */ #ifndef __LINUX_RQSPINLOCK_H #define __LINUX_RQSPINLOCK_H #include "../locking/qspinlock.h" /* * try_cmpxchg_tail - Return result of cmpxchg of tail word with a new value * @lock: Pointer to queued spinlock structure * @tail: The tail to compare against * @new_tail: The new queue tail code word * Return: Bool to indicate whether the cmpxchg operation succeeded * * This is used by the head of the wait queue to clean up the queue. * Provides relaxed ordering, since observers only rely on initialized * state of the node which was made visible through the xchg_tail operation, * i.e. through the smp_wmb preceding xchg_tail. * * We avoid using 16-bit cmpxchg, which is not available on all architectures. */ static __always_inline bool try_cmpxchg_tail(struct qspinlock *lock, u32 tail, u32 new_tail) { u32 old, new; old = atomic_read(&lock->val); do { /* * Is the tail part we compare to already stale? Fail. */ if ((old & _Q_TAIL_MASK) != tail) return false; /* * Encode latest locked/pending state for new tail. */ new = (old & _Q_LOCKED_PENDING_MASK) | new_tail; } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); return true; } #endif /* __LINUX_RQSPINLOCK_H */
13 11 148 11 13 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_UIDGID_H #define _LINUX_UIDGID_H /* * A set of types for the internal kernel types representing uids and gids. * * The types defined in this header allow distinguishing which uids and gids in * the kernel are values used by userspace and which uid and gid values are * the internal kernel values. With the addition of user namespaces the values * can be different. Using the type system makes it possible for the compiler * to detect when we overlook these differences. * */ #include <linux/uidgid_types.h> #include <linux/highuid.h> struct user_namespace; extern struct user_namespace init_user_ns; struct uid_gid_map; #define KUIDT_INIT(value) (kuid_t){ value } #define KGIDT_INIT(value) (kgid_t){ value } #ifdef CONFIG_MULTIUSER static inline uid_t __kuid_val(kuid_t uid) { return uid.val; } static inline gid_t __kgid_val(kgid_t gid) { return gid.val; } #else static inline uid_t __kuid_val(kuid_t uid) { return 0; } static inline gid_t __kgid_val(kgid_t gid) { return 0; } #endif #define GLOBAL_ROOT_UID KUIDT_INIT(0) #define GLOBAL_ROOT_GID KGIDT_INIT(0) #define INVALID_UID KUIDT_INIT(-1) #define INVALID_GID KGIDT_INIT(-1) static inline bool uid_eq(kuid_t left, kuid_t right) { return __kuid_val(left) == __kuid_val(right); } static inline bool gid_eq(kgid_t left, kgid_t right) { return __kgid_val(left) == __kgid_val(right); } static inline bool uid_gt(kuid_t left, kuid_t right) { return __kuid_val(left) > __kuid_val(right); } static inline bool gid_gt(kgid_t left, kgid_t right) { return __kgid_val(left) > __kgid_val(right); } static inline bool uid_gte(kuid_t left, kuid_t right) { return __kuid_val(left) >= __kuid_val(right); } static inline bool gid_gte(kgid_t left, kgid_t right) { return __kgid_val(left) >= __kgid_val(right); } static inline bool uid_lt(kuid_t left, kuid_t right) { return __kuid_val(left) < __kuid_val(right); } static inline bool gid_lt(kgid_t left, kgid_t right) { return __kgid_val(left) < __kgid_val(right); } static inline bool uid_lte(kuid_t left, kuid_t right) { return __kuid_val(left) <= __kuid_val(right); } static inline bool gid_lte(kgid_t left, kgid_t right) { return __kgid_val(left) <= __kgid_val(right); } static inline bool uid_valid(kuid_t uid) { return __kuid_val(uid) != (uid_t) -1; } static inline bool gid_valid(kgid_t gid) { return __kgid_val(gid) != (gid_t) -1; } #ifdef CONFIG_USER_NS extern kuid_t make_kuid(struct user_namespace *from, uid_t uid); extern kgid_t make_kgid(struct user_namespace *from, gid_t gid); extern uid_t from_kuid(struct user_namespace *to, kuid_t uid); extern gid_t from_kgid(struct user_namespace *to, kgid_t gid); extern uid_t from_kuid_munged(struct user_namespace *to, kuid_t uid); extern gid_t from_kgid_munged(struct user_namespace *to, kgid_t gid); static inline bool kuid_has_mapping(struct user_namespace *ns, kuid_t uid) { return from_kuid(ns, uid) != (uid_t) -1; } static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid) { return from_kgid(ns, gid) != (gid_t) -1; } u32 map_id_down(struct uid_gid_map *map, u32 id); u32 map_id_up(struct uid_gid_map *map, u32 id); u32 map_id_range_up(struct uid_gid_map *map, u32 id, u32 count); #else static inline kuid_t make_kuid(struct user_namespace *from, uid_t uid) { return KUIDT_INIT(uid); } static inline kgid_t make_kgid(struct user_namespace *from, gid_t gid) { return KGIDT_INIT(gid); } static inline uid_t from_kuid(struct user_namespace *to, kuid_t kuid) { return __kuid_val(kuid); } static inline gid_t from_kgid(struct user_namespace *to, kgid_t kgid) { return __kgid_val(kgid); } static inline uid_t from_kuid_munged(struct user_namespace *to, kuid_t kuid) { uid_t uid = from_kuid(to, kuid); if (uid == (uid_t)-1) uid = overflowuid; return uid; } static inline gid_t from_kgid_munged(struct user_namespace *to, kgid_t kgid) { gid_t gid = from_kgid(to, kgid); if (gid == (gid_t)-1) gid = overflowgid; return gid; } static inline bool kuid_has_mapping(struct user_namespace *ns, kuid_t uid) { return uid_valid(uid); } static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid) { return gid_valid(gid); } static inline u32 map_id_down(struct uid_gid_map *map, u32 id) { return id; } static inline u32 map_id_range_up(struct uid_gid_map *map, u32 id, u32 count) { return id; } static inline u32 map_id_up(struct uid_gid_map *map, u32 id) { return id; } #endif /* CONFIG_USER_NS */ #endif /* _LINUX_UIDGID_H */
38 31 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM alarmtimer #if !defined(_TRACE_ALARMTIMER_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_ALARMTIMER_H #include <linux/alarmtimer.h> #include <linux/rtc.h> #include <linux/tracepoint.h> TRACE_DEFINE_ENUM(ALARM_REALTIME); TRACE_DEFINE_ENUM(ALARM_BOOTTIME); TRACE_DEFINE_ENUM(ALARM_REALTIME_FREEZER); TRACE_DEFINE_ENUM(ALARM_BOOTTIME_FREEZER); #define show_alarm_type(type) __print_flags(type, " | ", \ { 1 << ALARM_REALTIME, "REALTIME" }, \ { 1 << ALARM_BOOTTIME, "BOOTTIME" }, \ { 1 << ALARM_REALTIME_FREEZER, "REALTIME Freezer" }, \ { 1 << ALARM_BOOTTIME_FREEZER, "BOOTTIME Freezer" }) #ifdef CONFIG_RTC_CLASS TRACE_EVENT(alarmtimer_suspend, TP_PROTO(ktime_t expires, int flag), TP_ARGS(expires, flag), TP_STRUCT__entry( __field(s64, expires) __field(unsigned char, alarm_type) ), TP_fast_assign( __entry->expires = expires; __entry->alarm_type = flag; ), TP_printk("alarmtimer type:%s expires:%llu", show_alarm_type((1 << __entry->alarm_type)), __entry->expires ) ); #endif /* CONFIG_RTC_CLASS */ DECLARE_EVENT_CLASS(alarm_class, TP_PROTO(struct alarm *alarm, ktime_t now), TP_ARGS(alarm, now), TP_STRUCT__entry( __field(void *, alarm) __field(unsigned char, alarm_type) __field(s64, expires) __field(s64, now) ), TP_fast_assign( __entry->alarm = alarm; __entry->alarm_type = alarm->type; __entry->expires = alarm->node.expires; __entry->now = now; ), TP_printk("alarmtimer:%p type:%s expires:%llu now:%llu", __entry->alarm, show_alarm_type((1 << __entry->alarm_type)), __entry->expires, __entry->now ) ); DEFINE_EVENT(alarm_class, alarmtimer_fired, TP_PROTO(struct alarm *alarm, ktime_t now), TP_ARGS(alarm, now) ); DEFINE_EVENT(alarm_class, alarmtimer_start, TP_PROTO(struct alarm *alarm, ktime_t now), TP_ARGS(alarm, now) ); DEFINE_EVENT(alarm_class, alarmtimer_cancel, TP_PROTO(struct alarm *alarm, ktime_t now), TP_ARGS(alarm, now) ); #endif /* _TRACE_ALARMTIMER_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
2 3 3 1 3 3 1 1 16 15 15 14 14 14 14 14 14 14 14 2 16 14 14 14 13 13 13 13 13 13 13 11 1 1 13 1 13 13 4 4 4 4 4 4 4 4 4 21 21 21 21 34 33 33 33 32 34 33 13 13 13 7 7 13 2 2 13 1 13 13 13 45 3 1 1 2 1 1 1 1 3 14 2 14 4 4 3 2 2 2 2 2 2 2 2 2 4 3 3 1 1 1 3 1 3 3 3 2 2 1 1 1 1 1 3 2 1 1 1 1 9 9 4 2 3 9 9 4 6 1 4 2 3 6 4 4 4 4 4 3 3 3 1 4 4 4 1 4 4 2 1 1 1 1 2 5 5 4 4 4 4 5 9 1 9 7 4 3 3 3 2 1 1 4 4 2 2 2 4 9 14 14 14 8 8 8 29 29 6 6 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 // SPDX-License-Identifier: GPL-2.0-only /* * VMware VMCI Driver * * Copyright (C) 2012 VMware, Inc. All rights reserved. */ #include <linux/vmw_vmci_defs.h> #include <linux/vmw_vmci_api.h> #include <linux/highmem.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/sched.h> #include <linux/cred.h> #include <linux/slab.h> #include "vmci_queue_pair.h" #include "vmci_datagram.h" #include "vmci_doorbell.h" #include "vmci_context.h" #include "vmci_driver.h" #include "vmci_event.h" /* Use a wide upper bound for the maximum contexts. */ #define VMCI_MAX_CONTEXTS 2000 /* * List of current VMCI contexts. Contexts can be added by * vmci_ctx_create() and removed via vmci_ctx_destroy(). * These, along with context lookup, are protected by the * list structure's lock. */ static struct { struct list_head head; spinlock_t lock; /* Spinlock for context list operations */ } ctx_list = { .head = LIST_HEAD_INIT(ctx_list.head), .lock = __SPIN_LOCK_UNLOCKED(ctx_list.lock), }; /* Used by contexts that did not set up notify flag pointers */ static bool ctx_dummy_notify; static void ctx_signal_notify(struct vmci_ctx *context) { *context->notify = true; } static void ctx_clear_notify(struct vmci_ctx *context) { *context->notify = false; } /* * If nothing requires the attention of the guest, clears both * notify flag and call. */ static void ctx_clear_notify_call(struct vmci_ctx *context) { if (context->pending_datagrams == 0 && vmci_handle_arr_get_size(context->pending_doorbell_array) == 0) ctx_clear_notify(context); } /* * Sets the context's notify flag iff datagrams are pending for this * context. Called from vmci_setup_notify(). */ void vmci_ctx_check_signal_notify(struct vmci_ctx *context) { spin_lock(&context->lock); if (context->pending_datagrams) ctx_signal_notify(context); spin_unlock(&context->lock); } /* * Allocates and initializes a VMCI context. */ struct vmci_ctx *vmci_ctx_create(u32 cid, u32 priv_flags, uintptr_t event_hnd, int user_version, const struct cred *cred) { struct vmci_ctx *context; int error; if (cid == VMCI_INVALID_ID) { pr_devel("Invalid context ID for VMCI context\n"); error = -EINVAL; goto err_out; } if (priv_flags & ~VMCI_PRIVILEGE_ALL_FLAGS) { pr_devel("Invalid flag (flags=0x%x) for VMCI context\n", priv_flags); error = -EINVAL; goto err_out; } if (user_version == 0) { pr_devel("Invalid suer_version %d\n", user_version); error = -EINVAL; goto err_out; } context = kzalloc_obj(*context); if (!context) { pr_warn("Failed to allocate memory for VMCI context\n"); error = -ENOMEM; goto err_out; } kref_init(&context->kref); spin_lock_init(&context->lock); INIT_LIST_HEAD(&context->list_item); INIT_LIST_HEAD(&context->datagram_queue); INIT_LIST_HEAD(&context->notifier_list); /* Initialize host-specific VMCI context. */ init_waitqueue_head(&context->host_context.wait_queue); context->queue_pair_array = vmci_handle_arr_create(0, VMCI_MAX_GUEST_QP_COUNT); if (!context->queue_pair_array) { error = -ENOMEM; goto err_free_ctx; } context->doorbell_array = vmci_handle_arr_create(0, VMCI_MAX_GUEST_DOORBELL_COUNT); if (!context->doorbell_array) { error = -ENOMEM; goto err_free_qp_array; } context->pending_doorbell_array = vmci_handle_arr_create(0, VMCI_MAX_GUEST_DOORBELL_COUNT); if (!context->pending_doorbell_array) { error = -ENOMEM; goto err_free_db_array; } context->user_version = user_version; context->priv_flags = priv_flags; if (cred) context->cred = get_cred(cred); context->notify = &ctx_dummy_notify; context->notify_page = NULL; /* * If we collide with an existing context we generate a new * and use it instead. The VMX will determine if regeneration * is okay. Since there isn't 4B - 16 VMs running on a given * host, the below loop will terminate. */ spin_lock(&ctx_list.lock); while (vmci_ctx_exists(cid)) { /* We reserve the lowest 16 ids for fixed contexts. */ cid = max(cid, VMCI_RESERVED_CID_LIMIT - 1) + 1; if (cid == VMCI_INVALID_ID) cid = VMCI_RESERVED_CID_LIMIT; } context->cid = cid; list_add_tail_rcu(&context->list_item, &ctx_list.head); spin_unlock(&ctx_list.lock); return context; err_free_db_array: vmci_handle_arr_destroy(context->doorbell_array); err_free_qp_array: vmci_handle_arr_destroy(context->queue_pair_array); err_free_ctx: kfree(context); err_out: return ERR_PTR(error); } /* * Destroy VMCI context. */ void vmci_ctx_destroy(struct vmci_ctx *context) { spin_lock(&ctx_list.lock); list_del_rcu(&context->list_item); spin_unlock(&ctx_list.lock); synchronize_rcu(); vmci_ctx_put(context); } /* * Fire notification for all contexts interested in given cid. */ static int ctx_fire_notification(u32 context_id, u32 priv_flags) { u32 i, array_size; struct vmci_ctx *sub_ctx; struct vmci_handle_arr *subscriber_array; struct vmci_handle context_handle = vmci_make_handle(context_id, VMCI_EVENT_HANDLER); /* * We create an array to hold the subscribers we find when * scanning through all contexts. */ subscriber_array = vmci_handle_arr_create(0, VMCI_MAX_CONTEXTS); if (subscriber_array == NULL) return VMCI_ERROR_NO_MEM; /* * Scan all contexts to find who is interested in being * notified about given contextID. */ rcu_read_lock(); list_for_each_entry_rcu(sub_ctx, &ctx_list.head, list_item) { struct vmci_handle_list *node; /* * We only deliver notifications of the removal of * contexts, if the two contexts are allowed to * interact. */ if (vmci_deny_interaction(priv_flags, sub_ctx->priv_flags)) continue; list_for_each_entry_rcu(node, &sub_ctx->notifier_list, node) { if (!vmci_handle_is_equal(node->handle, context_handle)) continue; vmci_handle_arr_append_entry(&subscriber_array, vmci_make_handle(sub_ctx->cid, VMCI_EVENT_HANDLER)); } } rcu_read_unlock(); /* Fire event to all subscribers. */ array_size = vmci_handle_arr_get_size(subscriber_array); for (i = 0; i < array_size; i++) { int result; struct vmci_event_ctx ev; ev.msg.hdr.dst = vmci_handle_arr_get_entry(subscriber_array, i); ev.msg.hdr.src = vmci_make_handle(VMCI_HYPERVISOR_CONTEXT_ID, VMCI_CONTEXT_RESOURCE_ID); ev.msg.hdr.payload_size = sizeof(ev) - sizeof(ev.msg.hdr); memset((char*)&ev + sizeof(ev.msg.hdr), 0, ev.msg.hdr.payload_size); ev.msg.event_data.event = VMCI_EVENT_CTX_REMOVED; ev.payload.context_id = context_id; result = vmci_datagram_dispatch(VMCI_HYPERVISOR_CONTEXT_ID, &ev.msg.hdr, false); if (result < VMCI_SUCCESS) { pr_devel("Failed to enqueue event datagram (type=%d) for context (ID=0x%x)\n", ev.msg.event_data.event, ev.msg.hdr.dst.context); /* We continue to enqueue on next subscriber. */ } } vmci_handle_arr_destroy(subscriber_array); return VMCI_SUCCESS; } /* * Queues a VMCI datagram for the appropriate target VM context. */ int vmci_ctx_enqueue_datagram(u32 cid, struct vmci_datagram *dg) { struct vmci_datagram_queue_entry *dq_entry; struct vmci_ctx *context; struct vmci_handle dg_src; size_t vmci_dg_size; vmci_dg_size = VMCI_DG_SIZE(dg); if (vmci_dg_size > VMCI_MAX_DG_SIZE) { pr_devel("Datagram too large (bytes=%zu)\n", vmci_dg_size); return VMCI_ERROR_INVALID_ARGS; } /* Get the target VM's VMCI context. */ context = vmci_ctx_get(cid); if (!context) { pr_devel("Invalid context (ID=0x%x)\n", cid); return VMCI_ERROR_INVALID_ARGS; } /* Allocate guest call entry and add it to the target VM's queue. */ dq_entry = kmalloc_obj(*dq_entry); if (dq_entry == NULL) { pr_warn("Failed to allocate memory for datagram\n"); vmci_ctx_put(context); return VMCI_ERROR_NO_MEM; } dq_entry->dg = dg; dq_entry->dg_size = vmci_dg_size; dg_src = dg->src; INIT_LIST_HEAD(&dq_entry->list_item); spin_lock(&context->lock); /* * We put a higher limit on datagrams from the hypervisor. If * the pending datagram is not from hypervisor, then we check * if enqueueing it would exceed the * VMCI_MAX_DATAGRAM_QUEUE_SIZE limit on the destination. If * the pending datagram is from hypervisor, we allow it to be * queued at the destination side provided we don't reach the * VMCI_MAX_DATAGRAM_AND_EVENT_QUEUE_SIZE limit. */ if (context->datagram_queue_size + vmci_dg_size >= VMCI_MAX_DATAGRAM_QUEUE_SIZE && (!vmci_handle_is_equal(dg_src, vmci_make_handle (VMCI_HYPERVISOR_CONTEXT_ID, VMCI_CONTEXT_RESOURCE_ID)) || context->datagram_queue_size + vmci_dg_size >= VMCI_MAX_DATAGRAM_AND_EVENT_QUEUE_SIZE)) { spin_unlock(&context->lock); vmci_ctx_put(context); kfree(dq_entry); pr_devel("Context (ID=0x%x) receive queue is full\n", cid); return VMCI_ERROR_NO_RESOURCES; } list_add(&dq_entry->list_item, &context->datagram_queue); context->pending_datagrams++; context->datagram_queue_size += vmci_dg_size; ctx_signal_notify(context); wake_up(&context->host_context.wait_queue); spin_unlock(&context->lock); vmci_ctx_put(context); return vmci_dg_size; } /* * Verifies whether a context with the specified context ID exists. * FIXME: utility is dubious as no decisions can be reliably made * using this data as context can appear and disappear at any time. */ bool vmci_ctx_exists(u32 cid) { struct vmci_ctx *context; bool exists = false; rcu_read_lock(); list_for_each_entry_rcu(context, &ctx_list.head, list_item) { if (context->cid == cid) { exists = true; break; } } rcu_read_unlock(); return exists; } /* * Retrieves VMCI context corresponding to the given cid. */ struct vmci_ctx *vmci_ctx_get(u32 cid) { struct vmci_ctx *c, *context = NULL; if (cid == VMCI_INVALID_ID) return NULL; rcu_read_lock(); list_for_each_entry_rcu(c, &ctx_list.head, list_item) { if (c->cid == cid) { /* * The context owner drops its own reference to the * context only after removing it from the list and * waiting for RCU grace period to expire. This * means that we are not about to increase the * reference count of something that is in the * process of being destroyed. */ context = c; kref_get(&context->kref); break; } } rcu_read_unlock(); return context; } /* * Deallocates all parts of a context data structure. This * function doesn't lock the context, because it assumes that * the caller was holding the last reference to context. */ static void ctx_free_ctx(struct kref *kref) { struct vmci_ctx *context = container_of(kref, struct vmci_ctx, kref); struct vmci_datagram_queue_entry *dq_entry, *dq_entry_tmp; struct vmci_handle temp_handle; struct vmci_handle_list *notifier, *tmp; /* * Fire event to all contexts interested in knowing this * context is dying. */ ctx_fire_notification(context->cid, context->priv_flags); /* * Cleanup all queue pair resources attached to context. If * the VM dies without cleaning up, this code will make sure * that no resources are leaked. */ temp_handle = vmci_handle_arr_get_entry(context->queue_pair_array, 0); while (!vmci_handle_is_equal(temp_handle, VMCI_INVALID_HANDLE)) { if (vmci_qp_broker_detach(temp_handle, context) < VMCI_SUCCESS) { /* * When vmci_qp_broker_detach() succeeds it * removes the handle from the array. If * detach fails, we must remove the handle * ourselves. */ vmci_handle_arr_remove_entry(context->queue_pair_array, temp_handle); } temp_handle = vmci_handle_arr_get_entry(context->queue_pair_array, 0); } /* * It is fine to destroy this without locking the callQueue, as * this is the only thread having a reference to the context. */ list_for_each_entry_safe(dq_entry, dq_entry_tmp, &context->datagram_queue, list_item) { WARN_ON(dq_entry->dg_size != VMCI_DG_SIZE(dq_entry->dg)); list_del(&dq_entry->list_item); kfree(dq_entry->dg); kfree(dq_entry); } list_for_each_entry_safe(notifier, tmp, &context->notifier_list, node) { list_del(&notifier->node); kfree(notifier); } vmci_handle_arr_destroy(context->queue_pair_array); vmci_handle_arr_destroy(context->doorbell_array); vmci_handle_arr_destroy(context->pending_doorbell_array); vmci_ctx_unset_notify(context); if (context->cred) put_cred(context->cred); kfree(context); } /* * Drops reference to VMCI context. If this is the last reference to * the context it will be deallocated. A context is created with * a reference count of one, and on destroy, it is removed from * the context list before its reference count is decremented. Thus, * if we reach zero, we are sure that nobody else are about to increment * it (they need the entry in the context list for that), and so there * is no need for locking. */ void vmci_ctx_put(struct vmci_ctx *context) { kref_put(&context->kref, ctx_free_ctx); } /* * Dequeues the next datagram and returns it to caller. * The caller passes in a pointer to the max size datagram * it can handle and the datagram is only unqueued if the * size is less than max_size. If larger max_size is set to * the size of the datagram to give the caller a chance to * set up a larger buffer for the guestcall. */ int vmci_ctx_dequeue_datagram(struct vmci_ctx *context, size_t *max_size, struct vmci_datagram **dg) { struct vmci_datagram_queue_entry *dq_entry; struct list_head *list_item; int rv; /* Dequeue the next datagram entry. */ spin_lock(&context->lock); if (context->pending_datagrams == 0) { ctx_clear_notify_call(context); spin_unlock(&context->lock); pr_devel("No datagrams pending\n"); return VMCI_ERROR_NO_MORE_DATAGRAMS; } list_item = context->datagram_queue.next; dq_entry = list_entry(list_item, struct vmci_datagram_queue_entry, list_item); /* Check size of caller's buffer. */ if (*max_size < dq_entry->dg_size) { *max_size = dq_entry->dg_size; spin_unlock(&context->lock); pr_devel("Caller's buffer should be at least (size=%u bytes)\n", (u32) *max_size); return VMCI_ERROR_NO_MEM; } list_del(list_item); context->pending_datagrams--; context->datagram_queue_size -= dq_entry->dg_size; if (context->pending_datagrams == 0) { ctx_clear_notify_call(context); rv = VMCI_SUCCESS; } else { /* * Return the size of the next datagram. */ struct vmci_datagram_queue_entry *next_entry; list_item = context->datagram_queue.next; next_entry = list_entry(list_item, struct vmci_datagram_queue_entry, list_item); /* * The following size_t -> int truncation is fine as * the maximum size of a (routable) datagram is 68KB. */ rv = (int)next_entry->dg_size; } spin_unlock(&context->lock); /* Caller must free datagram. */ *dg = dq_entry->dg; dq_entry->dg = NULL; kfree(dq_entry); return rv; } /* * Reverts actions set up by vmci_setup_notify(). Unmaps and unlocks the * page mapped/locked by vmci_setup_notify(). */ void vmci_ctx_unset_notify(struct vmci_ctx *context) { struct page *notify_page; spin_lock(&context->lock); notify_page = context->notify_page; context->notify = &ctx_dummy_notify; context->notify_page = NULL; spin_unlock(&context->lock); if (notify_page) { kunmap(notify_page); put_page(notify_page); } } /* * Add remote_cid to list of contexts current contexts wants * notifications from/about. */ int vmci_ctx_add_notification(u32 context_id, u32 remote_cid) { struct vmci_ctx *context; struct vmci_handle_list *notifier, *n; int result; bool exists = false; context = vmci_ctx_get(context_id); if (!context) return VMCI_ERROR_NOT_FOUND; if (VMCI_CONTEXT_IS_VM(context_id) && VMCI_CONTEXT_IS_VM(remote_cid)) { pr_devel("Context removed notifications for other VMs not supported (src=0x%x, remote=0x%x)\n", context_id, remote_cid); result = VMCI_ERROR_DST_UNREACHABLE; goto out; } if (context->priv_flags & VMCI_PRIVILEGE_FLAG_RESTRICTED) { result = VMCI_ERROR_NO_ACCESS; goto out; } notifier = kmalloc_obj(struct vmci_handle_list); if (!notifier) { result = VMCI_ERROR_NO_MEM; goto out; } INIT_LIST_HEAD(&notifier->node); notifier->handle = vmci_make_handle(remote_cid, VMCI_EVENT_HANDLER); spin_lock(&context->lock); if (context->n_notifiers < VMCI_MAX_CONTEXTS) { list_for_each_entry(n, &context->notifier_list, node) { if (vmci_handle_is_equal(n->handle, notifier->handle)) { exists = true; break; } } if (exists) { kfree(notifier); result = VMCI_ERROR_ALREADY_EXISTS; } else { list_add_tail_rcu(&notifier->node, &context->notifier_list); context->n_notifiers++; result = VMCI_SUCCESS; } } else { kfree(notifier); result = VMCI_ERROR_NO_MEM; } spin_unlock(&context->lock); out: vmci_ctx_put(context); return result; } /* * Remove remote_cid from current context's list of contexts it is * interested in getting notifications from/about. */ int vmci_ctx_remove_notification(u32 context_id, u32 remote_cid) { struct vmci_ctx *context; struct vmci_handle_list *notifier = NULL, *iter, *tmp; struct vmci_handle handle; context = vmci_ctx_get(context_id); if (!context) return VMCI_ERROR_NOT_FOUND; handle = vmci_make_handle(remote_cid, VMCI_EVENT_HANDLER); spin_lock(&context->lock); list_for_each_entry_safe(iter, tmp, &context->notifier_list, node) { if (vmci_handle_is_equal(iter->handle, handle)) { list_del_rcu(&iter->node); context->n_notifiers--; notifier = iter; break; } } spin_unlock(&context->lock); if (notifier) kvfree_rcu_mightsleep(notifier); vmci_ctx_put(context); return notifier ? VMCI_SUCCESS : VMCI_ERROR_NOT_FOUND; } static int vmci_ctx_get_chkpt_notifiers(struct vmci_ctx *context, u32 *buf_size, void **pbuf) { u32 *notifiers; size_t data_size; struct vmci_handle_list *entry; int i = 0; if (context->n_notifiers == 0) { *buf_size = 0; *pbuf = NULL; return VMCI_SUCCESS; } data_size = context->n_notifiers * sizeof(*notifiers); if (*buf_size < data_size) { *buf_size = data_size; return VMCI_ERROR_MORE_DATA; } notifiers = kmalloc(data_size, GFP_ATOMIC); /* FIXME: want GFP_KERNEL */ if (!notifiers) return VMCI_ERROR_NO_MEM; list_for_each_entry(entry, &context->notifier_list, node) notifiers[i++] = entry->handle.context; *buf_size = data_size; *pbuf = notifiers; return VMCI_SUCCESS; } static int vmci_ctx_get_chkpt_doorbells(struct vmci_ctx *context, u32 *buf_size, void **pbuf) { struct dbell_cpt_state *dbells; u32 i, n_doorbells; n_doorbells = vmci_handle_arr_get_size(context->doorbell_array); if (n_doorbells > 0) { size_t data_size = n_doorbells * sizeof(*dbells); if (*buf_size < data_size) { *buf_size = data_size; return VMCI_ERROR_MORE_DATA; } dbells = kzalloc(data_size, GFP_ATOMIC); if (!dbells) return VMCI_ERROR_NO_MEM; for (i = 0; i < n_doorbells; i++) dbells[i].handle = vmci_handle_arr_get_entry( context->doorbell_array, i); *buf_size = data_size; *pbuf = dbells; } else { *buf_size = 0; *pbuf = NULL; } return VMCI_SUCCESS; } /* * Get current context's checkpoint state of given type. */ int vmci_ctx_get_chkpt_state(u32 context_id, u32 cpt_type, u32 *buf_size, void **pbuf) { struct vmci_ctx *context; int result; context = vmci_ctx_get(context_id); if (!context) return VMCI_ERROR_NOT_FOUND; spin_lock(&context->lock); switch (cpt_type) { case VMCI_NOTIFICATION_CPT_STATE: result = vmci_ctx_get_chkpt_notifiers(context, buf_size, pbuf); break; case VMCI_WELLKNOWN_CPT_STATE: /* * For compatibility with VMX'en with VM to VM communication, we * always return zero wellknown handles. */ *buf_size = 0; *pbuf = NULL; result = VMCI_SUCCESS; break; case VMCI_DOORBELL_CPT_STATE: result = vmci_ctx_get_chkpt_doorbells(context, buf_size, pbuf); break; default: pr_devel("Invalid cpt state (type=%d)\n", cpt_type); result = VMCI_ERROR_INVALID_ARGS; break; } spin_unlock(&context->lock); vmci_ctx_put(context); return result; } /* * Set current context's checkpoint state of given type. */ int vmci_ctx_set_chkpt_state(u32 context_id, u32 cpt_type, u32 buf_size, void *cpt_buf) { u32 i; u32 current_id; int result = VMCI_SUCCESS; u32 num_ids = buf_size / sizeof(u32); if (cpt_type == VMCI_WELLKNOWN_CPT_STATE && num_ids > 0) { /* * We would end up here if VMX with VM to VM communication * attempts to restore a checkpoint with wellknown handles. */ pr_warn("Attempt to restore checkpoint with obsolete wellknown handles\n"); return VMCI_ERROR_OBSOLETE; } if (cpt_type != VMCI_NOTIFICATION_CPT_STATE) { pr_devel("Invalid cpt state (type=%d)\n", cpt_type); return VMCI_ERROR_INVALID_ARGS; } for (i = 0; i < num_ids && result == VMCI_SUCCESS; i++) { current_id = ((u32 *)cpt_buf)[i]; result = vmci_ctx_add_notification(context_id, current_id); if (result != VMCI_SUCCESS) break; } if (result != VMCI_SUCCESS) pr_devel("Failed to set cpt state (type=%d) (error=%d)\n", cpt_type, result); return result; } /* * Retrieves the specified context's pending notifications in the * form of a handle array. The handle arrays returned are the * actual data - not a copy and should not be modified by the * caller. They must be released using * vmci_ctx_rcv_notifications_release. */ int vmci_ctx_rcv_notifications_get(u32 context_id, struct vmci_handle_arr **db_handle_array, struct vmci_handle_arr **qp_handle_array) { struct vmci_ctx *context; int result = VMCI_SUCCESS; context = vmci_ctx_get(context_id); if (context == NULL) return VMCI_ERROR_NOT_FOUND; spin_lock(&context->lock); *db_handle_array = context->pending_doorbell_array; context->pending_doorbell_array = vmci_handle_arr_create(0, VMCI_MAX_GUEST_DOORBELL_COUNT); if (!context->pending_doorbell_array) { context->pending_doorbell_array = *db_handle_array; *db_handle_array = NULL; result = VMCI_ERROR_NO_MEM; } *qp_handle_array = NULL; spin_unlock(&context->lock); vmci_ctx_put(context); return result; } /* * Releases handle arrays with pending notifications previously * retrieved using vmci_ctx_rcv_notifications_get. If the * notifications were not successfully handed over to the guest, * success must be false. */ void vmci_ctx_rcv_notifications_release(u32 context_id, struct vmci_handle_arr *db_handle_array, struct vmci_handle_arr *qp_handle_array, bool success) { struct vmci_ctx *context = vmci_ctx_get(context_id); spin_lock(&context->lock); if (!success) { struct vmci_handle handle; /* * New notifications may have been added while we were not * holding the context lock, so we transfer any new pending * doorbell notifications to the old array, and reinstate the * old array. */ handle = vmci_handle_arr_remove_tail( context->pending_doorbell_array); while (!vmci_handle_is_invalid(handle)) { if (!vmci_handle_arr_has_entry(db_handle_array, handle)) { vmci_handle_arr_append_entry( &db_handle_array, handle); } handle = vmci_handle_arr_remove_tail( context->pending_doorbell_array); } vmci_handle_arr_destroy(context->pending_doorbell_array); context->pending_doorbell_array = db_handle_array; db_handle_array = NULL; } else { ctx_clear_notify_call(context); } spin_unlock(&context->lock); vmci_ctx_put(context); if (db_handle_array) vmci_handle_arr_destroy(db_handle_array); if (qp_handle_array) vmci_handle_arr_destroy(qp_handle_array); } /* * Registers that a new doorbell handle has been allocated by the * context. Only doorbell handles registered can be notified. */ int vmci_ctx_dbell_create(u32 context_id, struct vmci_handle handle) { struct vmci_ctx *context; int result; if (context_id == VMCI_INVALID_ID || vmci_handle_is_invalid(handle)) return VMCI_ERROR_INVALID_ARGS; context = vmci_ctx_get(context_id); if (context == NULL) return VMCI_ERROR_NOT_FOUND; spin_lock(&context->lock); if (!vmci_handle_arr_has_entry(context->doorbell_array, handle)) result = vmci_handle_arr_append_entry(&context->doorbell_array, handle); else result = VMCI_ERROR_DUPLICATE_ENTRY; spin_unlock(&context->lock); vmci_ctx_put(context); return result; } /* * Unregisters a doorbell handle that was previously registered * with vmci_ctx_dbell_create. */ int vmci_ctx_dbell_destroy(u32 context_id, struct vmci_handle handle) { struct vmci_ctx *context; struct vmci_handle removed_handle; if (context_id == VMCI_INVALID_ID || vmci_handle_is_invalid(handle)) return VMCI_ERROR_INVALID_ARGS; context = vmci_ctx_get(context_id); if (context == NULL) return VMCI_ERROR_NOT_FOUND; spin_lock(&context->lock); removed_handle = vmci_handle_arr_remove_entry(context->doorbell_array, handle); vmci_handle_arr_remove_entry(context->pending_doorbell_array, handle); spin_unlock(&context->lock); vmci_ctx_put(context); return vmci_handle_is_invalid(removed_handle) ? VMCI_ERROR_NOT_FOUND : VMCI_SUCCESS; } /* * Registers a notification of a doorbell handle initiated by the * specified source context. The notification of doorbells are * subject to the same isolation rules as datagram delivery. To * allow host side senders of notifications a finer granularity * of sender rights than those assigned to the sending context * itself, the host context is required to specify a different * set of privilege flags that will override the privileges of * the source context. */ int vmci_ctx_notify_dbell(u32 src_cid, struct vmci_handle handle, u32 src_priv_flags) { struct vmci_ctx *dst_context; int result; if (vmci_handle_is_invalid(handle)) return VMCI_ERROR_INVALID_ARGS; /* Get the target VM's VMCI context. */ dst_context = vmci_ctx_get(handle.context); if (!dst_context) { pr_devel("Invalid context (ID=0x%x)\n", handle.context); return VMCI_ERROR_NOT_FOUND; } if (src_cid != handle.context) { u32 dst_priv_flags; if (VMCI_CONTEXT_IS_VM(src_cid) && VMCI_CONTEXT_IS_VM(handle.context)) { pr_devel("Doorbell notification from VM to VM not supported (src=0x%x, dst=0x%x)\n", src_cid, handle.context); result = VMCI_ERROR_DST_UNREACHABLE; goto out; } result = vmci_dbell_get_priv_flags(handle, &dst_priv_flags); if (result < VMCI_SUCCESS) { pr_warn("Failed to get privilege flags for destination (handle=0x%x:0x%x)\n", handle.context, handle.resource); goto out; } if (src_cid != VMCI_HOST_CONTEXT_ID || src_priv_flags == VMCI_NO_PRIVILEGE_FLAGS) { src_priv_flags = vmci_context_get_priv_flags(src_cid); } if (vmci_deny_interaction(src_priv_flags, dst_priv_flags)) { result = VMCI_ERROR_NO_ACCESS; goto out; } } if (handle.context == VMCI_HOST_CONTEXT_ID) { result = vmci_dbell_host_context_notify(src_cid, handle); } else { spin_lock(&dst_context->lock); if (!vmci_handle_arr_has_entry(dst_context->doorbell_array, handle)) { result = VMCI_ERROR_NOT_FOUND; } else { if (!vmci_handle_arr_has_entry( dst_context->pending_doorbell_array, handle)) { result = vmci_handle_arr_append_entry( &dst_context->pending_doorbell_array, handle); if (result == VMCI_SUCCESS) { ctx_signal_notify(dst_context); wake_up(&dst_context->host_context.wait_queue); } } else { result = VMCI_SUCCESS; } } spin_unlock(&dst_context->lock); } out: vmci_ctx_put(dst_context); return result; } bool vmci_ctx_supports_host_qp(struct vmci_ctx *context) { return context && context->user_version >= VMCI_VERSION_HOSTQP; } /* * Registers that a new queue pair handle has been allocated by * the context. */ int vmci_ctx_qp_create(struct vmci_ctx *context, struct vmci_handle handle) { int result; if (context == NULL || vmci_handle_is_invalid(handle)) return VMCI_ERROR_INVALID_ARGS; if (!vmci_handle_arr_has_entry(context->queue_pair_array, handle)) result = vmci_handle_arr_append_entry( &context->queue_pair_array, handle); else result = VMCI_ERROR_DUPLICATE_ENTRY; return result; } /* * Unregisters a queue pair handle that was previously registered * with vmci_ctx_qp_create. */ int vmci_ctx_qp_destroy(struct vmci_ctx *context, struct vmci_handle handle) { struct vmci_handle hndl; if (context == NULL || vmci_handle_is_invalid(handle)) return VMCI_ERROR_INVALID_ARGS; hndl = vmci_handle_arr_remove_entry(context->queue_pair_array, handle); return vmci_handle_is_invalid(hndl) ? VMCI_ERROR_NOT_FOUND : VMCI_SUCCESS; } /* * Determines whether a given queue pair handle is registered * with the given context. */ bool vmci_ctx_qp_exists(struct vmci_ctx *context, struct vmci_handle handle) { if (context == NULL || vmci_handle_is_invalid(handle)) return false; return vmci_handle_arr_has_entry(context->queue_pair_array, handle); } /* * vmci_context_get_priv_flags() - Retrieve privilege flags. * @context_id: The context ID of the VMCI context. * * Retrieves privilege flags of the given VMCI context ID. */ u32 vmci_context_get_priv_flags(u32 context_id) { if (vmci_host_code_active()) { u32 flags; struct vmci_ctx *context; context = vmci_ctx_get(context_id); if (!context) return VMCI_LEAST_PRIVILEGE_FLAGS; flags = context->priv_flags; vmci_ctx_put(context); return flags; } return VMCI_NO_PRIVILEGE_FLAGS; } EXPORT_SYMBOL_GPL(vmci_context_get_priv_flags); /* * vmci_is_context_owner() - Determimnes if user is the context owner * @context_id: The context ID of the VMCI context. * @uid: The host user id (real kernel value). * * Determines whether a given UID is the owner of given VMCI context. */ bool vmci_is_context_owner(u32 context_id, kuid_t uid) { bool is_owner = false; if (vmci_host_code_active()) { struct vmci_ctx *context = vmci_ctx_get(context_id); if (context) { if (context->cred) is_owner = uid_eq(context->cred->uid, uid); vmci_ctx_put(context); } } return is_owner; } EXPORT_SYMBOL_GPL(vmci_is_context_owner);
18 18 18 20 14 14 14 12 12 12 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 12 18 18 18 18 18 5 5 5 5 5 5 18 18 18 18 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 2 1 18 18 14 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #include "bat_iv_ogm.h" #include "main.h" #include <linux/atomic.h> #include <linux/bitmap.h> #include <linux/bitops.h> #include <linux/bug.h> #include <linux/byteorder/generic.h> #include <linux/cache.h> #include <linux/container_of.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/init.h> #include <linux/jiffies.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/minmax.h> #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/pkt_sched.h> #include <linux/printk.h> #include <linux/random.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/string_choices.h> #include <linux/types.h> #include <linux/workqueue.h> #include <net/genetlink.h> #include <net/netlink.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bat_algo.h" #include "bitarray.h" #include "gateway_client.h" #include "hard-interface.h" #include "hash.h" #include "log.h" #include "netlink.h" #include "originator.h" #include "routing.h" #include "send.h" #include "translation-table.h" #include "tvlv.h" static void batadv_iv_send_outstanding_bat_ogm_packet(struct work_struct *work); /** * enum batadv_dup_status - duplicate status */ enum batadv_dup_status { /** @BATADV_NO_DUP: the packet is no duplicate */ BATADV_NO_DUP = 0, /** * @BATADV_ORIG_DUP: OGM is a duplicate in the originator (but not for * the neighbor) */ BATADV_ORIG_DUP, /** @BATADV_NEIGH_DUP: OGM is a duplicate for the neighbor */ BATADV_NEIGH_DUP, /** * @BATADV_PROTECTED: originator is currently protected (after reboot) */ BATADV_PROTECTED, }; /** * batadv_ring_buffer_set() - update the ring buffer with the given value * @lq_recv: pointer to the ring buffer * @lq_index: index to store the value at * @value: value to store in the ring buffer */ static void batadv_ring_buffer_set(u8 lq_recv[], u8 *lq_index, u8 value) { lq_recv[*lq_index] = value; *lq_index = (*lq_index + 1) % BATADV_TQ_GLOBAL_WINDOW_SIZE; } /** * batadv_ring_buffer_avg() - compute the average of all non-zero values stored * in the given ring buffer * @lq_recv: pointer to the ring buffer * * Return: computed average value. */ static u8 batadv_ring_buffer_avg(const u8 lq_recv[]) { const u8 *ptr; u16 count = 0; u16 i = 0; u16 sum = 0; ptr = lq_recv; while (i < BATADV_TQ_GLOBAL_WINDOW_SIZE) { if (*ptr != 0) { count++; sum += *ptr; } i++; ptr++; } if (count == 0) return 0; return (u8)(sum / count); } /** * batadv_iv_ogm_orig_get() - retrieve or create (if does not exist) an * originator * @bat_priv: the bat priv with all the mesh interface information * @addr: mac address of the originator * * Return: the originator object corresponding to the passed mac address or NULL * on failure. * If the object does not exist, it is created and initialised. */ static struct batadv_orig_node * batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr) { struct batadv_orig_node *orig_node; int hash_added; orig_node = batadv_orig_hash_find(bat_priv, addr); if (orig_node) return orig_node; orig_node = batadv_orig_node_new(bat_priv, addr); if (!orig_node) return NULL; spin_lock_init(&orig_node->bat_iv.ogm_cnt_lock); kref_get(&orig_node->refcount); hash_added = batadv_hash_add(bat_priv->orig_hash, batadv_compare_orig, batadv_choose_orig, orig_node, &orig_node->hash_entry); if (hash_added != 0) goto free_orig_node_hash; return orig_node; free_orig_node_hash: /* reference for batadv_hash_add */ batadv_orig_node_put(orig_node); /* reference from batadv_orig_node_new */ batadv_orig_node_put(orig_node); return NULL; } static struct batadv_neigh_node * batadv_iv_ogm_neigh_new(struct batadv_hard_iface *hard_iface, const u8 *neigh_addr, struct batadv_orig_node *orig_node, struct batadv_orig_node *orig_neigh) { struct batadv_neigh_node *neigh_node; neigh_node = batadv_neigh_node_get_or_create(orig_node, hard_iface, neigh_addr); if (!neigh_node) goto out; neigh_node->orig_node = orig_neigh; out: return neigh_node; } static int batadv_iv_ogm_iface_enable(struct batadv_hard_iface *hard_iface) { struct batadv_ogm_packet *batadv_ogm_packet; unsigned char *ogm_buff; u32 random_seqno; mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex); /* randomize initial seqno to avoid collision */ get_random_bytes(&random_seqno, sizeof(random_seqno)); atomic_set(&hard_iface->bat_iv.ogm_seqno, random_seqno); hard_iface->bat_iv.ogm_buff_len = BATADV_OGM_HLEN; ogm_buff = kmalloc(hard_iface->bat_iv.ogm_buff_len, GFP_ATOMIC); if (!ogm_buff) { mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); return -ENOMEM; } hard_iface->bat_iv.ogm_buff = ogm_buff; batadv_ogm_packet = (struct batadv_ogm_packet *)ogm_buff; batadv_ogm_packet->packet_type = BATADV_IV_OGM; batadv_ogm_packet->version = BATADV_COMPAT_VERSION; batadv_ogm_packet->ttl = 2; batadv_ogm_packet->flags = BATADV_NO_FLAGS; batadv_ogm_packet->reserved = 0; batadv_ogm_packet->tq = BATADV_TQ_MAX_VALUE; mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); return 0; } static void batadv_iv_ogm_iface_disable(struct batadv_hard_iface *hard_iface) { mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex); kfree(hard_iface->bat_iv.ogm_buff); hard_iface->bat_iv.ogm_buff = NULL; mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); } static void batadv_iv_ogm_iface_update_mac(struct batadv_hard_iface *hard_iface) { struct batadv_ogm_packet *batadv_ogm_packet; void *ogm_buff; mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex); ogm_buff = hard_iface->bat_iv.ogm_buff; if (!ogm_buff) goto unlock; batadv_ogm_packet = ogm_buff; ether_addr_copy(batadv_ogm_packet->orig, hard_iface->net_dev->dev_addr); ether_addr_copy(batadv_ogm_packet->prev_sender, hard_iface->net_dev->dev_addr); unlock: mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); } static void batadv_iv_ogm_primary_iface_set(struct batadv_hard_iface *hard_iface) { struct batadv_ogm_packet *batadv_ogm_packet; void *ogm_buff; mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex); ogm_buff = hard_iface->bat_iv.ogm_buff; if (!ogm_buff) goto unlock; batadv_ogm_packet = ogm_buff; batadv_ogm_packet->ttl = BATADV_TTL; unlock: mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); } /* when do we schedule our own ogm to be sent */ static unsigned long batadv_iv_ogm_emit_send_time(const struct batadv_priv *bat_priv) { unsigned int msecs; msecs = atomic_read(&bat_priv->orig_interval) - BATADV_JITTER; msecs += get_random_u32_below(2 * BATADV_JITTER); return jiffies + msecs_to_jiffies(msecs); } /* when do we schedule a ogm packet to be sent */ static unsigned long batadv_iv_ogm_fwd_send_time(void) { return jiffies + msecs_to_jiffies(get_random_u32_below(BATADV_JITTER / 2)); } /* apply hop penalty for a normal link */ static u8 batadv_hop_penalty(u8 tq, const struct batadv_priv *bat_priv) { int hop_penalty = atomic_read(&bat_priv->hop_penalty); int new_tq; new_tq = tq * (BATADV_TQ_MAX_VALUE - hop_penalty); new_tq /= BATADV_TQ_MAX_VALUE; return new_tq; } /** * batadv_iv_ogm_aggr_packet() - checks if there is another OGM attached * @buff_pos: current position in the skb * @packet_len: total length of the skb * @ogm_packet: potential OGM in buffer * * Return: true if there is enough space for another OGM, false otherwise. */ static bool batadv_iv_ogm_aggr_packet(int buff_pos, int packet_len, const struct batadv_ogm_packet *ogm_packet) { int next_buff_pos = 0; /* check if there is enough space for the header */ next_buff_pos += buff_pos + sizeof(*ogm_packet); if (next_buff_pos > packet_len) return false; /* check if there is enough space for the optional TVLV */ next_buff_pos += ntohs(ogm_packet->tvlv_len); return next_buff_pos <= packet_len; } /* send a batman ogm to a given interface */ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet, struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface); const char *fwd_str; u8 packet_num; s16 buff_pos; struct batadv_ogm_packet *batadv_ogm_packet; struct sk_buff *skb; u8 *packet_pos; if (hard_iface->if_status != BATADV_IF_ACTIVE) return; packet_num = 0; buff_pos = 0; packet_pos = forw_packet->skb->data; batadv_ogm_packet = (struct batadv_ogm_packet *)packet_pos; /* adjust all flags and log packets */ while (batadv_iv_ogm_aggr_packet(buff_pos, forw_packet->packet_len, batadv_ogm_packet)) { /* we might have aggregated direct link packets with an * ordinary base packet */ if (test_bit(packet_num, forw_packet->direct_link_flags) && forw_packet->if_incoming == hard_iface) batadv_ogm_packet->flags |= BATADV_DIRECTLINK; else batadv_ogm_packet->flags &= ~BATADV_DIRECTLINK; if (packet_num > 0 || !forw_packet->own) fwd_str = "Forwarding"; else fwd_str = "Sending own"; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "%s %spacket (originator %pM, seqno %u, TQ %d, TTL %d, IDF %s) on interface %s [%pM]\n", fwd_str, (packet_num > 0 ? "aggregated " : ""), batadv_ogm_packet->orig, ntohl(batadv_ogm_packet->seqno), batadv_ogm_packet->tq, batadv_ogm_packet->ttl, str_on_off(batadv_ogm_packet->flags & BATADV_DIRECTLINK), hard_iface->net_dev->name, hard_iface->net_dev->dev_addr); buff_pos += BATADV_OGM_HLEN; buff_pos += ntohs(batadv_ogm_packet->tvlv_len); packet_num++; packet_pos = forw_packet->skb->data + buff_pos; batadv_ogm_packet = (struct batadv_ogm_packet *)packet_pos; } /* create clone because function is called more than once */ skb = skb_clone(forw_packet->skb, GFP_ATOMIC); if (skb) { batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_TX); batadv_add_counter(bat_priv, BATADV_CNT_MGMT_TX_BYTES, skb->len + ETH_HLEN); batadv_send_broadcast_skb(skb, hard_iface); } } /* send a batman ogm packet */ static void batadv_iv_ogm_emit(struct batadv_forw_packet *forw_packet) { struct net_device *mesh_iface; if (!forw_packet->if_incoming) { pr_err("Error - can't forward packet: incoming iface not specified\n"); return; } mesh_iface = forw_packet->if_incoming->mesh_iface; if (WARN_ON(!forw_packet->if_outgoing)) return; if (forw_packet->if_outgoing->mesh_iface != mesh_iface) { pr_warn("%s: mesh interface switch for queued OGM\n", __func__); return; } if (forw_packet->if_incoming->if_status != BATADV_IF_ACTIVE) return; /* only for one specific outgoing interface */ batadv_iv_ogm_send_to_if(forw_packet, forw_packet->if_outgoing); } /** * batadv_iv_ogm_can_aggregate() - find out if an OGM can be aggregated on an * existing forward packet * @new_bat_ogm_packet: OGM packet to be aggregated * @bat_priv: the bat priv with all the mesh interface information * @packet_len: (total) length of the OGM * @send_time: timestamp (jiffies) when the packet is to be sent * @directlink: true if this is a direct link packet * @if_incoming: interface where the packet was received * @if_outgoing: interface for which the retransmission should be considered * @forw_packet: the forwarded packet which should be checked * * Return: true if new_packet can be aggregated with forw_packet */ static bool batadv_iv_ogm_can_aggregate(const struct batadv_ogm_packet *new_bat_ogm_packet, struct batadv_priv *bat_priv, int packet_len, unsigned long send_time, bool directlink, const struct batadv_hard_iface *if_incoming, const struct batadv_hard_iface *if_outgoing, const struct batadv_forw_packet *forw_packet) { struct batadv_ogm_packet *batadv_ogm_packet; unsigned int aggregated_bytes = forw_packet->packet_len + packet_len; struct batadv_hard_iface *primary_if = NULL; u8 packet_num = forw_packet->num_packets; bool res = false; unsigned long aggregation_end_time; unsigned int max_bytes; batadv_ogm_packet = (struct batadv_ogm_packet *)forw_packet->skb->data; aggregation_end_time = send_time; aggregation_end_time += msecs_to_jiffies(BATADV_MAX_AGGREGATION_MS); max_bytes = min_t(unsigned int, if_outgoing->net_dev->mtu, BATADV_MAX_AGGREGATION_BYTES); /* we can aggregate the current packet to this aggregated packet * if: * * - the send time is within our MAX_AGGREGATION_MS time * - the resulting packet won't be bigger than * MAX_AGGREGATION_BYTES and MTU of the outgoing interface * - the number of packets is lower than MAX_AGGREGATION_PACKETS * otherwise aggregation is not possible */ if (!time_before(send_time, forw_packet->send_time) || !time_after_eq(aggregation_end_time, forw_packet->send_time)) return false; if (aggregated_bytes > max_bytes) return false; if (skb_tailroom(forw_packet->skb) < packet_len) return false; if (packet_num >= BATADV_MAX_AGGREGATION_PACKETS) return false; /* packet is not leaving on the same interface. */ if (forw_packet->if_outgoing != if_outgoing) return false; /* check aggregation compatibility * -> direct link packets are broadcasted on * their interface only * -> aggregate packet if the current packet is * a "global" packet as well as the base * packet */ primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) return false; /* packets without direct link flag and high TTL * are flooded through the net */ if (!directlink && !(batadv_ogm_packet->flags & BATADV_DIRECTLINK) && batadv_ogm_packet->ttl != 1 && /* own packets originating non-primary * interfaces leave only that interface */ (!forw_packet->own || forw_packet->if_incoming == primary_if)) { res = true; goto out; } /* if the incoming packet is sent via this one * interface only - we still can aggregate */ if (directlink && new_bat_ogm_packet->ttl == 1 && forw_packet->if_incoming == if_incoming && /* packets from direct neighbors or * own secondary interface packets * (= secondary interface packets in general) */ (batadv_ogm_packet->flags & BATADV_DIRECTLINK || (forw_packet->own && forw_packet->if_incoming != primary_if))) { res = true; goto out; } out: batadv_hardif_put(primary_if); return res; } /** * batadv_iv_ogm_aggregate_new() - create a new aggregated packet and add this * packet to it. * @packet_buff: pointer to the OGM * @packet_len: (total) length of the OGM * @send_time: timestamp (jiffies) when the packet is to be sent * @direct_link: whether this OGM has direct link status * @if_incoming: interface where the packet was received * @if_outgoing: interface for which the retransmission should be considered * @own_packet: true if it is a self-generated ogm */ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, int packet_len, unsigned long send_time, bool direct_link, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing, int own_packet) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface); struct batadv_forw_packet *forw_packet_aggr; struct sk_buff *skb; unsigned char *skb_buff; unsigned int skb_size; atomic_t *queue_left = own_packet ? NULL : &bat_priv->batman_queue_left; if (atomic_read(&bat_priv->aggregated_ogms)) skb_size = max_t(unsigned int, BATADV_MAX_AGGREGATION_BYTES, packet_len); else skb_size = packet_len; skb_size += ETH_HLEN; skb = netdev_alloc_skb_ip_align(NULL, skb_size); if (!skb) return; forw_packet_aggr = batadv_forw_packet_alloc(if_incoming, if_outgoing, queue_left, bat_priv, skb); if (!forw_packet_aggr) { kfree_skb(skb); return; } forw_packet_aggr->skb->priority = TC_PRIO_CONTROL; skb_reserve(forw_packet_aggr->skb, ETH_HLEN); skb_buff = skb_put(forw_packet_aggr->skb, packet_len); forw_packet_aggr->packet_len = packet_len; memcpy(skb_buff, packet_buff, packet_len); forw_packet_aggr->own = own_packet; bitmap_zero(forw_packet_aggr->direct_link_flags, BATADV_MAX_AGGREGATION_PACKETS); forw_packet_aggr->send_time = send_time; /* save packet direct link flag status */ if (direct_link) set_bit(0, forw_packet_aggr->direct_link_flags); INIT_DELAYED_WORK(&forw_packet_aggr->delayed_work, batadv_iv_send_outstanding_bat_ogm_packet); batadv_forw_packet_ogmv1_queue(bat_priv, forw_packet_aggr, send_time); } /* aggregate a new packet into the existing ogm packet */ static void batadv_iv_ogm_aggregate(struct batadv_forw_packet *forw_packet_aggr, const unsigned char *packet_buff, int packet_len, bool direct_link) { skb_put_data(forw_packet_aggr->skb, packet_buff, packet_len); forw_packet_aggr->packet_len += packet_len; /* save packet direct link flag status */ if (direct_link) set_bit(forw_packet_aggr->num_packets, forw_packet_aggr->direct_link_flags); forw_packet_aggr->num_packets++; } /** * batadv_iv_ogm_queue_add() - queue up an OGM for transmission * @bat_priv: the bat priv with all the mesh interface information * @packet_buff: pointer to the OGM * @packet_len: (total) length of the OGM * @if_incoming: interface where the packet was received * @if_outgoing: interface for which the retransmission should be considered * @own_packet: true if it is a self-generated ogm * @send_time: timestamp (jiffies) when the packet is to be sent */ static void batadv_iv_ogm_queue_add(struct batadv_priv *bat_priv, unsigned char *packet_buff, int packet_len, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing, int own_packet, unsigned long send_time) { /* _aggr -> pointer to the packet we want to aggregate with * _pos -> pointer to the position in the queue */ struct batadv_forw_packet *forw_packet_aggr = NULL; struct batadv_forw_packet *forw_packet_pos = NULL; struct batadv_ogm_packet *batadv_ogm_packet; bool direct_link; unsigned long max_aggregation_jiffies; batadv_ogm_packet = (struct batadv_ogm_packet *)packet_buff; direct_link = !!(batadv_ogm_packet->flags & BATADV_DIRECTLINK); max_aggregation_jiffies = msecs_to_jiffies(BATADV_MAX_AGGREGATION_MS); /* find position for the packet in the forward queue */ spin_lock_bh(&bat_priv->forw_bat_list_lock); /* own packets are not to be aggregated */ if (atomic_read(&bat_priv->aggregated_ogms) && !own_packet) { hlist_for_each_entry(forw_packet_pos, &bat_priv->forw_bat_list, list) { if (batadv_iv_ogm_can_aggregate(batadv_ogm_packet, bat_priv, packet_len, send_time, direct_link, if_incoming, if_outgoing, forw_packet_pos)) { forw_packet_aggr = forw_packet_pos; break; } } } /* nothing to aggregate with - either aggregation disabled or no * suitable aggregation packet found */ if (!forw_packet_aggr) { /* the following section can run without the lock */ spin_unlock_bh(&bat_priv->forw_bat_list_lock); /* if we could not aggregate this packet with one of the others * we hold it back for a while, so that it might be aggregated * later on */ if (!own_packet && atomic_read(&bat_priv->aggregated_ogms)) send_time += max_aggregation_jiffies; batadv_iv_ogm_aggregate_new(packet_buff, packet_len, send_time, direct_link, if_incoming, if_outgoing, own_packet); } else { batadv_iv_ogm_aggregate(forw_packet_aggr, packet_buff, packet_len, direct_link); spin_unlock_bh(&bat_priv->forw_bat_list_lock); } } static void batadv_iv_ogm_forward(struct batadv_orig_node *orig_node, const struct ethhdr *ethhdr, struct batadv_ogm_packet *batadv_ogm_packet, bool is_single_hop_neigh, bool is_from_best_next_hop, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface); u16 tvlv_len; if (batadv_ogm_packet->ttl <= 1) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "ttl exceeded\n"); return; } if (!is_from_best_next_hop) { /* Mark the forwarded packet when it is not coming from our * best next hop. We still need to forward the packet for our * neighbor link quality detection to work in case the packet * originated from a single hop neighbor. Otherwise we can * simply drop the ogm. */ if (is_single_hop_neigh) batadv_ogm_packet->flags |= BATADV_NOT_BEST_NEXT_HOP; else return; } tvlv_len = ntohs(batadv_ogm_packet->tvlv_len); batadv_ogm_packet->ttl--; ether_addr_copy(batadv_ogm_packet->prev_sender, ethhdr->h_source); /* apply hop penalty */ batadv_ogm_packet->tq = batadv_hop_penalty(batadv_ogm_packet->tq, bat_priv); batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Forwarding packet: tq: %i, ttl: %i\n", batadv_ogm_packet->tq, batadv_ogm_packet->ttl); if (is_single_hop_neigh) batadv_ogm_packet->flags |= BATADV_DIRECTLINK; else batadv_ogm_packet->flags &= ~BATADV_DIRECTLINK; batadv_iv_ogm_queue_add(bat_priv, (unsigned char *)batadv_ogm_packet, BATADV_OGM_HLEN + tvlv_len, if_incoming, if_outgoing, 0, batadv_iv_ogm_fwd_send_time()); } /** * batadv_iv_ogm_slide_own_bcast_window() - bitshift own OGM broadcast windows * for the given interface * @hard_iface: the interface for which the windows have to be shifted */ static void batadv_iv_ogm_slide_own_bcast_window(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface); struct batadv_hashtable *hash = bat_priv->orig_hash; struct hlist_head *head; struct batadv_orig_node *orig_node; struct batadv_orig_ifinfo *orig_ifinfo; unsigned long *word; u32 i; u8 *w; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, head, hash_entry) { hlist_for_each_entry_rcu(orig_ifinfo, &orig_node->ifinfo_list, list) { if (orig_ifinfo->if_outgoing != hard_iface) continue; spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); word = orig_ifinfo->bat_iv.bcast_own; batadv_bit_get_packet(bat_priv, word, 1, 0); w = &orig_ifinfo->bat_iv.bcast_own_sum; *w = bitmap_weight(word, BATADV_TQ_LOCAL_WINDOW_SIZE); spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); } } rcu_read_unlock(); } } /** * batadv_iv_ogm_schedule_buff() - schedule submission of hardif ogm buffer * @hard_iface: interface whose ogm buffer should be transmitted */ static void batadv_iv_ogm_schedule_buff(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface); unsigned char **ogm_buff = &hard_iface->bat_iv.ogm_buff; struct batadv_ogm_packet *batadv_ogm_packet; struct batadv_hard_iface *primary_if, *tmp_hard_iface; int *ogm_buff_len = &hard_iface->bat_iv.ogm_buff_len; struct list_head *iter; u32 seqno; u16 tvlv_len = 0; unsigned long send_time; lockdep_assert_held(&hard_iface->bat_iv.ogm_buff_mutex); /* interface already disabled by batadv_iv_ogm_iface_disable */ if (!*ogm_buff) return; /* the interface gets activated here to avoid race conditions between * the moment of activating the interface in * hardif_activate_interface() where the originator mac is set and * outdated packets (especially uninitialized mac addresses) in the * packet queue */ if (hard_iface->if_status == BATADV_IF_TO_BE_ACTIVATED) hard_iface->if_status = BATADV_IF_ACTIVE; primary_if = batadv_primary_if_get_selected(bat_priv); if (hard_iface == primary_if) { /* tt changes have to be committed before the tvlv data is * appended as it may alter the tt tvlv container */ batadv_tt_local_commit_changes(bat_priv); tvlv_len = batadv_tvlv_container_ogm_append(bat_priv, ogm_buff, ogm_buff_len, BATADV_OGM_HLEN); } batadv_ogm_packet = (struct batadv_ogm_packet *)(*ogm_buff); batadv_ogm_packet->tvlv_len = htons(tvlv_len); /* change sequence number to network order */ seqno = (u32)atomic_read(&hard_iface->bat_iv.ogm_seqno); batadv_ogm_packet->seqno = htonl(seqno); atomic_inc(&hard_iface->bat_iv.ogm_seqno); batadv_iv_ogm_slide_own_bcast_window(hard_iface); send_time = batadv_iv_ogm_emit_send_time(bat_priv); if (hard_iface != primary_if) { /* OGMs from secondary interfaces are only scheduled on their * respective interfaces. */ batadv_iv_ogm_queue_add(bat_priv, *ogm_buff, *ogm_buff_len, hard_iface, hard_iface, 1, send_time); goto out; } /* OGMs from primary interfaces are scheduled on all * interfaces. */ rcu_read_lock(); netdev_for_each_lower_private_rcu(hard_iface->mesh_iface, tmp_hard_iface, iter) { if (!kref_get_unless_zero(&tmp_hard_iface->refcount)) continue; batadv_iv_ogm_queue_add(bat_priv, *ogm_buff, *ogm_buff_len, hard_iface, tmp_hard_iface, 1, send_time); batadv_hardif_put(tmp_hard_iface); } rcu_read_unlock(); out: batadv_hardif_put(primary_if); } static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) { if (hard_iface->if_status == BATADV_IF_NOT_IN_USE || hard_iface->if_status == BATADV_IF_TO_BE_REMOVED) return; mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex); batadv_iv_ogm_schedule_buff(hard_iface); mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); } /** * batadv_iv_orig_ifinfo_sum() - Get bcast_own sum for originator over interface * @orig_node: originator which reproadcasted the OGMs directly * @if_outgoing: interface which transmitted the original OGM and received the * direct rebroadcast * * Return: Number of replied (rebroadcasted) OGMs which were transmitted by * an originator and directly (without intermediate hop) received by a specific * interface */ static u8 batadv_iv_orig_ifinfo_sum(struct batadv_orig_node *orig_node, struct batadv_hard_iface *if_outgoing) { struct batadv_orig_ifinfo *orig_ifinfo; u8 sum; orig_ifinfo = batadv_orig_ifinfo_get(orig_node, if_outgoing); if (!orig_ifinfo) return 0; spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); sum = orig_ifinfo->bat_iv.bcast_own_sum; spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); batadv_orig_ifinfo_put(orig_ifinfo); return sum; } /** * batadv_iv_ogm_orig_update() - use OGM to update corresponding data in an * originator * @bat_priv: the bat priv with all the mesh interface information * @orig_node: the orig node who originally emitted the ogm packet * @orig_ifinfo: ifinfo for the outgoing interface of the orig_node * @ethhdr: Ethernet header of the OGM * @batadv_ogm_packet: the ogm packet * @if_incoming: interface where the packet was received * @if_outgoing: interface for which the retransmission should be considered * @dup_status: the duplicate status of this ogm packet. */ static void batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_orig_ifinfo *orig_ifinfo, const struct ethhdr *ethhdr, const struct batadv_ogm_packet *batadv_ogm_packet, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing, enum batadv_dup_status dup_status) { struct batadv_neigh_ifinfo *neigh_ifinfo = NULL; struct batadv_neigh_ifinfo *router_ifinfo = NULL; struct batadv_neigh_node *neigh_node = NULL; struct batadv_neigh_node *tmp_neigh_node = NULL; struct batadv_neigh_node *router = NULL; u8 sum_orig, sum_neigh; u8 *neigh_addr; u8 tq_avg; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "%s(): Searching and updating originator entry of received packet\n", __func__); rcu_read_lock(); hlist_for_each_entry_rcu(tmp_neigh_node, &orig_node->neigh_list, list) { neigh_addr = tmp_neigh_node->addr; if (batadv_compare_eth(neigh_addr, ethhdr->h_source) && tmp_neigh_node->if_incoming == if_incoming && kref_get_unless_zero(&tmp_neigh_node->refcount)) { if (WARN(neigh_node, "too many matching neigh_nodes")) batadv_neigh_node_put(neigh_node); neigh_node = tmp_neigh_node; continue; } if (dup_status != BATADV_NO_DUP) continue; /* only update the entry for this outgoing interface */ neigh_ifinfo = batadv_neigh_ifinfo_get(tmp_neigh_node, if_outgoing); if (!neigh_ifinfo) continue; spin_lock_bh(&tmp_neigh_node->ifinfo_lock); batadv_ring_buffer_set(neigh_ifinfo->bat_iv.tq_recv, &neigh_ifinfo->bat_iv.tq_index, 0); tq_avg = batadv_ring_buffer_avg(neigh_ifinfo->bat_iv.tq_recv); neigh_ifinfo->bat_iv.tq_avg = tq_avg; spin_unlock_bh(&tmp_neigh_node->ifinfo_lock); batadv_neigh_ifinfo_put(neigh_ifinfo); neigh_ifinfo = NULL; } if (!neigh_node) { struct batadv_orig_node *orig_tmp; orig_tmp = batadv_iv_ogm_orig_get(bat_priv, ethhdr->h_source); if (!orig_tmp) goto unlock; neigh_node = batadv_iv_ogm_neigh_new(if_incoming, ethhdr->h_source, orig_node, orig_tmp); batadv_orig_node_put(orig_tmp); if (!neigh_node) goto unlock; } else { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Updating existing last-hop neighbor of originator\n"); } rcu_read_unlock(); neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing); if (!neigh_ifinfo) goto out; neigh_node->last_seen = jiffies; spin_lock_bh(&neigh_node->ifinfo_lock); batadv_ring_buffer_set(neigh_ifinfo->bat_iv.tq_recv, &neigh_ifinfo->bat_iv.tq_index, batadv_ogm_packet->tq); tq_avg = batadv_ring_buffer_avg(neigh_ifinfo->bat_iv.tq_recv); neigh_ifinfo->bat_iv.tq_avg = tq_avg; spin_unlock_bh(&neigh_node->ifinfo_lock); if (dup_status == BATADV_NO_DUP) { orig_ifinfo->last_ttl = batadv_ogm_packet->ttl; neigh_ifinfo->last_ttl = batadv_ogm_packet->ttl; } /* if this neighbor already is our next hop there is nothing * to change */ router = batadv_orig_router_get(orig_node, if_outgoing); if (router == neigh_node) goto out; if (router) { router_ifinfo = batadv_neigh_ifinfo_get(router, if_outgoing); if (!router_ifinfo) goto out; /* if this neighbor does not offer a better TQ we won't * consider it */ if (router_ifinfo->bat_iv.tq_avg > neigh_ifinfo->bat_iv.tq_avg) goto out; } /* if the TQ is the same and the link not more symmetric we * won't consider it either */ if (router_ifinfo && neigh_ifinfo->bat_iv.tq_avg == router_ifinfo->bat_iv.tq_avg) { sum_orig = batadv_iv_orig_ifinfo_sum(router->orig_node, router->if_incoming); sum_neigh = batadv_iv_orig_ifinfo_sum(neigh_node->orig_node, neigh_node->if_incoming); if (sum_orig >= sum_neigh) goto out; } batadv_update_route(bat_priv, orig_node, if_outgoing, neigh_node); goto out; unlock: rcu_read_unlock(); out: batadv_neigh_node_put(neigh_node); batadv_neigh_node_put(router); batadv_neigh_ifinfo_put(neigh_ifinfo); batadv_neigh_ifinfo_put(router_ifinfo); } /** * batadv_iv_ogm_calc_tq() - calculate tq for current received ogm packet * @orig_node: the orig node who originally emitted the ogm packet * @orig_neigh_node: the orig node struct of the neighbor who sent the packet * @batadv_ogm_packet: the ogm packet * @if_incoming: interface where the packet was received * @if_outgoing: interface for which the retransmission should be considered * * Return: true if the link can be considered bidirectional, false otherwise */ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, struct batadv_orig_node *orig_neigh_node, struct batadv_ogm_packet *batadv_ogm_packet, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface); struct batadv_neigh_node *neigh_node = NULL, *tmp_neigh_node; struct batadv_neigh_ifinfo *neigh_ifinfo; u8 total_count; u8 orig_eq_count, neigh_rq_count, neigh_rq_inv, tq_own; unsigned int tq_iface_hop_penalty = BATADV_TQ_MAX_VALUE; unsigned int neigh_rq_inv_cube, neigh_rq_max_cube; unsigned int tq_asym_penalty, inv_asym_penalty; unsigned int combined_tq; bool ret = false; /* find corresponding one hop neighbor */ rcu_read_lock(); hlist_for_each_entry_rcu(tmp_neigh_node, &orig_neigh_node->neigh_list, list) { if (!batadv_compare_eth(tmp_neigh_node->addr, orig_neigh_node->orig)) continue; if (tmp_neigh_node->if_incoming != if_incoming) continue; if (!kref_get_unless_zero(&tmp_neigh_node->refcount)) continue; neigh_node = tmp_neigh_node; break; } rcu_read_unlock(); if (!neigh_node) neigh_node = batadv_iv_ogm_neigh_new(if_incoming, orig_neigh_node->orig, orig_neigh_node, orig_neigh_node); if (!neigh_node) goto out; /* if orig_node is direct neighbor update neigh_node last_seen */ if (orig_node == orig_neigh_node) neigh_node->last_seen = jiffies; orig_node->last_seen = jiffies; /* find packet count of corresponding one hop neighbor */ orig_eq_count = batadv_iv_orig_ifinfo_sum(orig_neigh_node, if_incoming); neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing); if (neigh_ifinfo) { neigh_rq_count = neigh_ifinfo->bat_iv.real_packet_count; batadv_neigh_ifinfo_put(neigh_ifinfo); } else { neigh_rq_count = 0; } /* pay attention to not get a value bigger than 100 % */ if (orig_eq_count > neigh_rq_count) total_count = neigh_rq_count; else total_count = orig_eq_count; /* if we have too few packets (too less data) we set tq_own to zero * if we receive too few packets it is not considered bidirectional */ if (total_count < BATADV_TQ_LOCAL_BIDRECT_SEND_MINIMUM || neigh_rq_count < BATADV_TQ_LOCAL_BIDRECT_RECV_MINIMUM) tq_own = 0; else /* neigh_node->real_packet_count is never zero as we * only purge old information when getting new * information */ tq_own = (BATADV_TQ_MAX_VALUE * total_count) / neigh_rq_count; /* 1 - ((1-x) ** 3), normalized to TQ_MAX_VALUE this does * affect the nearly-symmetric links only a little, but * punishes asymmetric links more. This will give a value * between 0 and TQ_MAX_VALUE */ neigh_rq_inv = BATADV_TQ_LOCAL_WINDOW_SIZE - neigh_rq_count; neigh_rq_inv_cube = neigh_rq_inv * neigh_rq_inv * neigh_rq_inv; neigh_rq_max_cube = BATADV_TQ_LOCAL_WINDOW_SIZE * BATADV_TQ_LOCAL_WINDOW_SIZE * BATADV_TQ_LOCAL_WINDOW_SIZE; inv_asym_penalty = BATADV_TQ_MAX_VALUE * neigh_rq_inv_cube; inv_asym_penalty /= neigh_rq_max_cube; tq_asym_penalty = BATADV_TQ_MAX_VALUE - inv_asym_penalty; tq_iface_hop_penalty -= atomic_read(&if_incoming->hop_penalty); /* penalize if the OGM is forwarded on the same interface. WiFi * interfaces and other half duplex devices suffer from throughput * drops as they can't send and receive at the same time. */ if (if_outgoing && if_incoming == if_outgoing && batadv_is_wifi_hardif(if_outgoing)) tq_iface_hop_penalty = batadv_hop_penalty(tq_iface_hop_penalty, bat_priv); combined_tq = batadv_ogm_packet->tq * tq_own * tq_asym_penalty * tq_iface_hop_penalty; combined_tq /= BATADV_TQ_MAX_VALUE * BATADV_TQ_MAX_VALUE * BATADV_TQ_MAX_VALUE; batadv_ogm_packet->tq = combined_tq; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "bidirectional: orig = %pM neigh = %pM => own_bcast = %2i, real recv = %2i, local tq: %3i, asym_penalty: %3i, iface_hop_penalty: %3i, total tq: %3i, if_incoming = %s, if_outgoing = %s\n", orig_node->orig, orig_neigh_node->orig, total_count, neigh_rq_count, tq_own, tq_asym_penalty, tq_iface_hop_penalty, batadv_ogm_packet->tq, if_incoming->net_dev->name, if_outgoing ? if_outgoing->net_dev->name : "DEFAULT"); /* if link has the minimum required transmission quality * consider it bidirectional */ if (batadv_ogm_packet->tq >= BATADV_TQ_TOTAL_BIDRECT_LIMIT) ret = true; out: batadv_neigh_node_put(neigh_node); return ret; } /** * batadv_iv_ogm_update_seqnos() - process a batman packet for all interfaces, * adjust the sequence number and find out whether it is a duplicate * @ethhdr: ethernet header of the packet * @batadv_ogm_packet: OGM packet to be considered * @if_incoming: interface on which the OGM packet was received * @if_outgoing: interface for which the retransmission should be considered * * Return: duplicate status as enum batadv_dup_status */ static enum batadv_dup_status batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, const struct batadv_ogm_packet *batadv_ogm_packet, const struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface); struct batadv_orig_node *orig_node; struct batadv_orig_ifinfo *orig_ifinfo = NULL; struct batadv_neigh_node *neigh_node; struct batadv_neigh_ifinfo *neigh_ifinfo; bool is_dup; s32 seq_diff; bool need_update = false; int set_mark; enum batadv_dup_status ret = BATADV_NO_DUP; u32 seqno = ntohl(batadv_ogm_packet->seqno); u8 *neigh_addr; u8 packet_count; unsigned long *bitmap; orig_node = batadv_iv_ogm_orig_get(bat_priv, batadv_ogm_packet->orig); if (!orig_node) return BATADV_NO_DUP; orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing); if (WARN_ON(!orig_ifinfo)) { batadv_orig_node_put(orig_node); return 0; } spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); seq_diff = seqno - orig_ifinfo->last_real_seqno; /* signalize caller that the packet is to be dropped. */ if (!hlist_empty(&orig_node->neigh_list) && batadv_window_protected(bat_priv, seq_diff, BATADV_TQ_LOCAL_WINDOW_SIZE, &orig_ifinfo->batman_seqno_reset, NULL)) { ret = BATADV_PROTECTED; goto out; } rcu_read_lock(); hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) { neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing); if (!neigh_ifinfo) continue; neigh_addr = neigh_node->addr; is_dup = batadv_test_bit(neigh_ifinfo->bat_iv.real_bits, orig_ifinfo->last_real_seqno, seqno); if (batadv_compare_eth(neigh_addr, ethhdr->h_source) && neigh_node->if_incoming == if_incoming) { set_mark = 1; if (is_dup) ret = BATADV_NEIGH_DUP; } else { set_mark = 0; if (is_dup && ret != BATADV_NEIGH_DUP) ret = BATADV_ORIG_DUP; } /* if the window moved, set the update flag. */ bitmap = neigh_ifinfo->bat_iv.real_bits; need_update |= batadv_bit_get_packet(bat_priv, bitmap, seq_diff, set_mark); packet_count = bitmap_weight(bitmap, BATADV_TQ_LOCAL_WINDOW_SIZE); neigh_ifinfo->bat_iv.real_packet_count = packet_count; batadv_neigh_ifinfo_put(neigh_ifinfo); } rcu_read_unlock(); if (need_update) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "%s updating last_seqno: old %u, new %u\n", if_outgoing ? if_outgoing->net_dev->name : "DEFAULT", orig_ifinfo->last_real_seqno, seqno); orig_ifinfo->last_real_seqno = seqno; } out: spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); batadv_orig_node_put(orig_node); batadv_orig_ifinfo_put(orig_ifinfo); return ret; } /** * batadv_iv_ogm_process_per_outif() - process a batman iv OGM for an outgoing * interface * @skb: the skb containing the OGM * @ogm_offset: offset from skb->data to start of ogm header * @orig_node: the (cached) orig node for the originator of this OGM * @if_incoming: the interface where this packet was received * @if_outgoing: the interface for which the packet should be considered */ static void batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, struct batadv_orig_node *orig_node, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface); struct batadv_hardif_neigh_node *hardif_neigh = NULL; struct batadv_neigh_node *router = NULL; struct batadv_neigh_node *router_router = NULL; struct batadv_orig_node *orig_neigh_node; struct batadv_orig_ifinfo *orig_ifinfo; struct batadv_neigh_node *orig_neigh_router = NULL; struct batadv_neigh_ifinfo *router_ifinfo = NULL; struct batadv_ogm_packet *ogm_packet; enum batadv_dup_status dup_status; bool is_from_best_next_hop = false; bool is_single_hop_neigh = false; bool sameseq, similar_ttl; struct sk_buff *skb_priv; struct ethhdr *ethhdr; u8 *prev_sender; bool is_bidirect; /* create a private copy of the skb, as some functions change tq value * and/or flags. */ skb_priv = skb_copy(skb, GFP_ATOMIC); if (!skb_priv) return; ethhdr = eth_hdr(skb_priv); ogm_packet = (struct batadv_ogm_packet *)(skb_priv->data + ogm_offset); dup_status = batadv_iv_ogm_update_seqnos(ethhdr, ogm_packet, if_incoming, if_outgoing); if (batadv_compare_eth(ethhdr->h_source, ogm_packet->orig)) is_single_hop_neigh = true; if (dup_status == BATADV_PROTECTED) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: packet within seqno protection time (sender: %pM)\n", ethhdr->h_source); goto out; } if (ogm_packet->tq == 0) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: originator packet with tq equal 0\n"); goto out; } if (is_single_hop_neigh) { hardif_neigh = batadv_hardif_neigh_get(if_incoming, ethhdr->h_source); if (hardif_neigh) hardif_neigh->last_seen = jiffies; } router = batadv_orig_router_get(orig_node, if_outgoing); if (router) { router_router = batadv_orig_router_get(router->orig_node, if_outgoing); router_ifinfo = batadv_neigh_ifinfo_get(router, if_outgoing); } if ((router_ifinfo && router_ifinfo->bat_iv.tq_avg != 0) && (batadv_compare_eth(router->addr, ethhdr->h_source))) is_from_best_next_hop = true; prev_sender = ogm_packet->prev_sender; /* avoid temporary routing loops */ if (router && router_router && (batadv_compare_eth(router->addr, prev_sender)) && !(batadv_compare_eth(ogm_packet->orig, prev_sender)) && (batadv_compare_eth(router->addr, router_router->addr))) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: ignoring all rebroadcast packets that may make me loop (sender: %pM)\n", ethhdr->h_source); goto out; } if (if_outgoing == BATADV_IF_DEFAULT) batadv_tvlv_ogm_receive(bat_priv, ogm_packet, orig_node); /* if sender is a direct neighbor the sender mac equals * originator mac */ if (is_single_hop_neigh) orig_neigh_node = orig_node; else orig_neigh_node = batadv_iv_ogm_orig_get(bat_priv, ethhdr->h_source); if (!orig_neigh_node) goto out; orig_neigh_router = batadv_orig_router_get(orig_neigh_node, if_outgoing); /* drop packet if sender is not a direct neighbor and if we * don't route towards it */ if (!is_single_hop_neigh && !orig_neigh_router) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: OGM via unknown neighbor!\n"); goto out_neigh; } is_bidirect = batadv_iv_ogm_calc_tq(orig_node, orig_neigh_node, ogm_packet, if_incoming, if_outgoing); /* update ranking if it is not a duplicate or has the same * seqno and similar ttl as the non-duplicate */ orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing); if (!orig_ifinfo) goto out_neigh; sameseq = orig_ifinfo->last_real_seqno == ntohl(ogm_packet->seqno); similar_ttl = (orig_ifinfo->last_ttl - 3) <= ogm_packet->ttl; if (is_bidirect && (dup_status == BATADV_NO_DUP || (sameseq && similar_ttl))) { batadv_iv_ogm_orig_update(bat_priv, orig_node, orig_ifinfo, ethhdr, ogm_packet, if_incoming, if_outgoing, dup_status); } batadv_orig_ifinfo_put(orig_ifinfo); /* only forward for specific interface, not for the default one. */ if (if_outgoing == BATADV_IF_DEFAULT) goto out_neigh; /* is single hop (direct) neighbor */ if (is_single_hop_neigh) { /* OGMs from secondary interfaces should only scheduled once * per interface where it has been received, not multiple times */ if (ogm_packet->ttl <= 2 && if_incoming != if_outgoing) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: OGM from secondary interface and wrong outgoing interface\n"); goto out_neigh; } /* mark direct link on incoming interface */ batadv_iv_ogm_forward(orig_node, ethhdr, ogm_packet, is_single_hop_neigh, is_from_best_next_hop, if_incoming, if_outgoing); batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Forwarding packet: rebroadcast neighbor packet with direct link flag\n"); goto out_neigh; } /* multihop originator */ if (!is_bidirect) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: not received via bidirectional link\n"); goto out_neigh; } if (dup_status == BATADV_NEIGH_DUP) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: duplicate packet received\n"); goto out_neigh; } batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Forwarding packet: rebroadcast originator packet\n"); batadv_iv_ogm_forward(orig_node, ethhdr, ogm_packet, is_single_hop_neigh, is_from_best_next_hop, if_incoming, if_outgoing); out_neigh: if (orig_neigh_node && !is_single_hop_neigh) batadv_orig_node_put(orig_neigh_node); out: batadv_neigh_ifinfo_put(router_ifinfo); batadv_neigh_node_put(router); batadv_neigh_node_put(router_router); batadv_neigh_node_put(orig_neigh_router); batadv_hardif_neigh_put(hardif_neigh); consume_skb(skb_priv); } /** * batadv_iv_ogm_process_reply() - Check OGM for direct reply and process it * @ogm_packet: rebroadcast OGM packet to process * @if_incoming: the interface where this packet was received * @orig_node: originator which reproadcasted the OGMs * @if_incoming_seqno: OGM sequence number when rebroadcast was received */ static void batadv_iv_ogm_process_reply(struct batadv_ogm_packet *ogm_packet, struct batadv_hard_iface *if_incoming, struct batadv_orig_node *orig_node, u32 if_incoming_seqno) { struct batadv_orig_ifinfo *orig_ifinfo; s32 bit_pos; u8 *weight; /* neighbor has to indicate direct link and it has to * come via the corresponding interface */ if (!(ogm_packet->flags & BATADV_DIRECTLINK)) return; if (!batadv_compare_eth(if_incoming->net_dev->dev_addr, ogm_packet->orig)) return; orig_ifinfo = batadv_orig_ifinfo_get(orig_node, if_incoming); if (!orig_ifinfo) return; /* save packet seqno for bidirectional check */ spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); bit_pos = if_incoming_seqno - 2; bit_pos -= ntohl(ogm_packet->seqno); batadv_set_bit(orig_ifinfo->bat_iv.bcast_own, bit_pos); weight = &orig_ifinfo->bat_iv.bcast_own_sum; *weight = bitmap_weight(orig_ifinfo->bat_iv.bcast_own, BATADV_TQ_LOCAL_WINDOW_SIZE); spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); batadv_orig_ifinfo_put(orig_ifinfo); } /** * batadv_iv_ogm_process() - process an incoming batman iv OGM * @skb: the skb containing the OGM * @ogm_offset: offset to the OGM which should be processed (for aggregates) * @if_incoming: the interface where this packet was received */ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset, struct batadv_hard_iface *if_incoming) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface); struct batadv_orig_node *orig_neigh_node, *orig_node; struct batadv_hard_iface *hard_iface; struct batadv_ogm_packet *ogm_packet; u32 if_incoming_seqno; bool has_directlink_flag; struct ethhdr *ethhdr; bool is_my_oldorig = false; bool is_my_addr = false; bool is_my_orig = false; struct list_head *iter; ogm_packet = (struct batadv_ogm_packet *)(skb->data + ogm_offset); ethhdr = eth_hdr(skb); /* Silently drop when the batman packet is actually not a * correct packet. * * This might happen if a packet is padded (e.g. Ethernet has a * minimum frame length of 64 byte) and the aggregation interprets * it as an additional length. * * TODO: A more sane solution would be to have a bit in the * batadv_ogm_packet to detect whether the packet is the last * packet in an aggregation. Here we expect that the padding * is always zero (or not 0x01) */ if (ogm_packet->packet_type != BATADV_IV_OGM) return; /* could be changed by schedule_own_packet() */ if_incoming_seqno = atomic_read(&if_incoming->bat_iv.ogm_seqno); if (ogm_packet->flags & BATADV_DIRECTLINK) has_directlink_flag = true; else has_directlink_flag = false; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Received BATMAN packet via NB: %pM, IF: %s [%pM] (from OG: %pM, via prev OG: %pM, seqno %u, tq %d, TTL %d, V %d, IDF %d)\n", ethhdr->h_source, if_incoming->net_dev->name, if_incoming->net_dev->dev_addr, ogm_packet->orig, ogm_packet->prev_sender, ntohl(ogm_packet->seqno), ogm_packet->tq, ogm_packet->ttl, ogm_packet->version, has_directlink_flag); rcu_read_lock(); netdev_for_each_lower_private_rcu(if_incoming->mesh_iface, hard_iface, iter) { if (hard_iface->if_status != BATADV_IF_ACTIVE) continue; if (batadv_compare_eth(ethhdr->h_source, hard_iface->net_dev->dev_addr)) is_my_addr = true; if (batadv_compare_eth(ogm_packet->orig, hard_iface->net_dev->dev_addr)) is_my_orig = true; if (batadv_compare_eth(ogm_packet->prev_sender, hard_iface->net_dev->dev_addr)) is_my_oldorig = true; } rcu_read_unlock(); if (is_my_addr) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: received my own broadcast (sender: %pM)\n", ethhdr->h_source); return; } if (is_my_orig) { orig_neigh_node = batadv_iv_ogm_orig_get(bat_priv, ethhdr->h_source); if (!orig_neigh_node) return; batadv_iv_ogm_process_reply(ogm_packet, if_incoming, orig_neigh_node, if_incoming_seqno); batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: originator packet from myself (via neighbor)\n"); batadv_orig_node_put(orig_neigh_node); return; } if (is_my_oldorig) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: ignoring all rebroadcast echos (sender: %pM)\n", ethhdr->h_source); return; } if (ogm_packet->flags & BATADV_NOT_BEST_NEXT_HOP) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: ignoring all packets not forwarded from the best next hop (sender: %pM)\n", ethhdr->h_source); return; } orig_node = batadv_iv_ogm_orig_get(bat_priv, ogm_packet->orig); if (!orig_node) return; batadv_iv_ogm_process_per_outif(skb, ogm_offset, orig_node, if_incoming, BATADV_IF_DEFAULT); rcu_read_lock(); netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) { if (hard_iface->if_status != BATADV_IF_ACTIVE) continue; if (!kref_get_unless_zero(&hard_iface->refcount)) continue; batadv_iv_ogm_process_per_outif(skb, ogm_offset, orig_node, if_incoming, hard_iface); batadv_hardif_put(hard_iface); } rcu_read_unlock(); batadv_orig_node_put(orig_node); } static void batadv_iv_send_outstanding_bat_ogm_packet(struct work_struct *work) { struct delayed_work *delayed_work; struct batadv_forw_packet *forw_packet; struct batadv_priv *bat_priv; bool dropped = false; delayed_work = to_delayed_work(work); forw_packet = container_of(delayed_work, struct batadv_forw_packet, delayed_work); bat_priv = netdev_priv(forw_packet->if_incoming->mesh_iface); if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) { dropped = true; goto out; } batadv_iv_ogm_emit(forw_packet); /* we have to have at least one packet in the queue to determine the * queues wake up time unless we are shutting down. * * only re-schedule if this is the "original" copy, e.g. the OGM of the * primary interface should only be rescheduled once per period, but * this function will be called for the forw_packet instances of the * other secondary interfaces as well. */ if (forw_packet->own && forw_packet->if_incoming == forw_packet->if_outgoing) batadv_iv_ogm_schedule(forw_packet->if_incoming); out: /* do we get something for free()? */ if (batadv_forw_packet_steal(forw_packet, &bat_priv->forw_bat_list_lock)) batadv_forw_packet_free(forw_packet, dropped); } static int batadv_iv_ogm_receive(struct sk_buff *skb, struct batadv_hard_iface *if_incoming) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface); struct batadv_ogm_packet *ogm_packet; u8 *packet_pos; int ogm_offset; bool res; int ret = NET_RX_DROP; res = batadv_check_management_packet(skb, if_incoming, BATADV_OGM_HLEN); if (!res) goto free_skb; /* did we receive a B.A.T.M.A.N. IV OGM packet on an interface * that does not have B.A.T.M.A.N. IV enabled ? */ if (bat_priv->algo_ops->iface.enable != batadv_iv_ogm_iface_enable) goto free_skb; batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_RX); batadv_add_counter(bat_priv, BATADV_CNT_MGMT_RX_BYTES, skb->len + ETH_HLEN); ogm_offset = 0; ogm_packet = (struct batadv_ogm_packet *)skb->data; /* unpack the aggregated packets and process them one by one */ while (batadv_iv_ogm_aggr_packet(ogm_offset, skb_headlen(skb), ogm_packet)) { batadv_iv_ogm_process(skb, ogm_offset, if_incoming); ogm_offset += BATADV_OGM_HLEN; ogm_offset += ntohs(ogm_packet->tvlv_len); packet_pos = skb->data + ogm_offset; ogm_packet = (struct batadv_ogm_packet *)packet_pos; } ret = NET_RX_SUCCESS; free_skb: if (ret == NET_RX_SUCCESS) consume_skb(skb); else kfree_skb(skb); return ret; } /** * batadv_iv_ogm_neigh_get_tq_avg() - Get the TQ average for a neighbour on a * given outgoing interface. * @neigh_node: Neighbour of interest * @if_outgoing: Outgoing interface of interest * @tq_avg: Pointer of where to store the TQ average * * Return: False if no average TQ available, otherwise true. */ static bool batadv_iv_ogm_neigh_get_tq_avg(struct batadv_neigh_node *neigh_node, struct batadv_hard_iface *if_outgoing, u8 *tq_avg) { struct batadv_neigh_ifinfo *n_ifinfo; n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); if (!n_ifinfo) return false; *tq_avg = n_ifinfo->bat_iv.tq_avg; batadv_neigh_ifinfo_put(n_ifinfo); return true; } /** * batadv_iv_ogm_orig_dump_subentry() - Dump an originator subentry into a * message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the mesh interface information * @if_outgoing: Limit dump to entries with this outgoing interface * @orig_node: Originator to dump * @neigh_node: Single hops neighbour * @best: Is the best originator * * Return: Error code, or 0 on success */ static int batadv_iv_ogm_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing, struct batadv_orig_node *orig_node, struct batadv_neigh_node *neigh_node, bool best) { void *hdr; u8 tq_avg; unsigned int last_seen_msecs; last_seen_msecs = jiffies_to_msecs(jiffies - orig_node->last_seen); if (!batadv_iv_ogm_neigh_get_tq_avg(neigh_node, if_outgoing, &tq_avg)) return 0; if (if_outgoing != BATADV_IF_DEFAULT && if_outgoing != neigh_node->if_incoming) return 0; hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_ORIGINATORS); if (!hdr) return -ENOBUFS; if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, orig_node->orig) || nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, neigh_node->addr) || nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, neigh_node->if_incoming->net_dev->name) || nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, neigh_node->if_incoming->net_dev->ifindex) || nla_put_u8(msg, BATADV_ATTR_TQ, tq_avg) || nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, last_seen_msecs)) goto nla_put_failure; if (best && nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /** * batadv_iv_ogm_orig_dump_entry() - Dump an originator entry into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the mesh interface information * @if_outgoing: Limit dump to entries with this outgoing interface * @orig_node: Originator to dump * @sub_s: Number of sub entries to skip * * This function assumes the caller holds rcu_read_lock(). * * Return: Error code, or 0 on success */ static int batadv_iv_ogm_orig_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing, struct batadv_orig_node *orig_node, int *sub_s) { struct batadv_neigh_node *neigh_node_best; struct batadv_neigh_node *neigh_node; int sub = 0; bool best; u8 tq_avg_best; neigh_node_best = batadv_orig_router_get(orig_node, if_outgoing); if (!neigh_node_best) goto out; if (!batadv_iv_ogm_neigh_get_tq_avg(neigh_node_best, if_outgoing, &tq_avg_best)) goto out; if (tq_avg_best == 0) goto out; hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) { if (sub++ < *sub_s) continue; best = (neigh_node == neigh_node_best); if (batadv_iv_ogm_orig_dump_subentry(msg, portid, seq, bat_priv, if_outgoing, orig_node, neigh_node, best)) { batadv_neigh_node_put(neigh_node_best); *sub_s = sub - 1; return -EMSGSIZE; } } out: batadv_neigh_node_put(neigh_node_best); *sub_s = 0; return 0; } /** * batadv_iv_ogm_orig_dump_bucket() - Dump an originator bucket into a * message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the mesh interface information * @if_outgoing: Limit dump to entries with this outgoing interface * @head: Bucket to be dumped * @idx_s: Number of entries to be skipped * @sub: Number of sub entries to be skipped * * Return: Error code, or 0 on success */ static int batadv_iv_ogm_orig_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing, struct hlist_head *head, int *idx_s, int *sub) { struct batadv_orig_node *orig_node; int idx = 0; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, head, hash_entry) { if (idx++ < *idx_s) continue; if (batadv_iv_ogm_orig_dump_entry(msg, portid, seq, bat_priv, if_outgoing, orig_node, sub)) { rcu_read_unlock(); *idx_s = idx - 1; return -EMSGSIZE; } } rcu_read_unlock(); *idx_s = 0; *sub = 0; return 0; } /** * batadv_iv_ogm_orig_dump() - Dump the originators into a message * @msg: Netlink message to dump into * @cb: Control block containing additional options * @bat_priv: The bat priv with all the mesh interface information * @if_outgoing: Limit dump to entries with this outgoing interface */ static void batadv_iv_ogm_orig_dump(struct sk_buff *msg, struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing) { struct batadv_hashtable *hash = bat_priv->orig_hash; struct hlist_head *head; int bucket = cb->args[0]; int idx = cb->args[1]; int sub = cb->args[2]; int portid = NETLINK_CB(cb->skb).portid; while (bucket < hash->size) { head = &hash->table[bucket]; if (batadv_iv_ogm_orig_dump_bucket(msg, portid, cb->nlh->nlmsg_seq, bat_priv, if_outgoing, head, &idx, &sub)) break; bucket++; } cb->args[0] = bucket; cb->args[1] = idx; cb->args[2] = sub; } /** * batadv_iv_ogm_neigh_diff() - calculate tq difference of two neighbors * @neigh1: the first neighbor object of the comparison * @if_outgoing1: outgoing interface for the first neighbor * @neigh2: the second neighbor object of the comparison * @if_outgoing2: outgoing interface for the second neighbor * @diff: pointer to integer receiving the calculated difference * * The content of *@diff is only valid when this function returns true. * It is less, equal to or greater than 0 if the metric via neigh1 is lower, * the same as or higher than the metric via neigh2 * * Return: true when the difference could be calculated, false otherwise */ static bool batadv_iv_ogm_neigh_diff(struct batadv_neigh_node *neigh1, struct batadv_hard_iface *if_outgoing1, struct batadv_neigh_node *neigh2, struct batadv_hard_iface *if_outgoing2, int *diff) { struct batadv_neigh_ifinfo *neigh1_ifinfo, *neigh2_ifinfo; u8 tq1, tq2; bool ret = true; neigh1_ifinfo = batadv_neigh_ifinfo_get(neigh1, if_outgoing1); neigh2_ifinfo = batadv_neigh_ifinfo_get(neigh2, if_outgoing2); if (!neigh1_ifinfo || !neigh2_ifinfo) { ret = false; goto out; } tq1 = neigh1_ifinfo->bat_iv.tq_avg; tq2 = neigh2_ifinfo->bat_iv.tq_avg; *diff = (int)tq1 - (int)tq2; out: batadv_neigh_ifinfo_put(neigh1_ifinfo); batadv_neigh_ifinfo_put(neigh2_ifinfo); return ret; } /** * batadv_iv_ogm_neigh_dump_neigh() - Dump a neighbour into a netlink message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @hardif_neigh: Neighbour to be dumped * * Return: Error code, or 0 on success */ static int batadv_iv_ogm_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_hardif_neigh_node *hardif_neigh) { void *hdr; unsigned int last_seen_msecs; last_seen_msecs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen); hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_NEIGHBORS); if (!hdr) return -ENOBUFS; if (nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, hardif_neigh->addr) || nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, hardif_neigh->if_incoming->net_dev->name) || nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, hardif_neigh->if_incoming->net_dev->ifindex) || nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, last_seen_msecs)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /** * batadv_iv_ogm_neigh_dump_hardif() - Dump the neighbours of a hard interface * into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the mesh interface information * @hard_iface: Hard interface to dump the neighbours for * @idx_s: Number of entries to skip * * This function assumes the caller holds rcu_read_lock(). * * Return: Error code, or 0 on success */ static int batadv_iv_ogm_neigh_dump_hardif(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct batadv_hard_iface *hard_iface, int *idx_s) { struct batadv_hardif_neigh_node *hardif_neigh; int idx = 0; hlist_for_each_entry_rcu(hardif_neigh, &hard_iface->neigh_list, list) { if (idx++ < *idx_s) continue; if (batadv_iv_ogm_neigh_dump_neigh(msg, portid, seq, hardif_neigh)) { *idx_s = idx - 1; return -EMSGSIZE; } } *idx_s = 0; return 0; } /** * batadv_iv_ogm_neigh_dump() - Dump the neighbours into a message * @msg: Netlink message to dump into * @cb: Control block containing additional options * @bat_priv: The bat priv with all the mesh interface information * @single_hardif: Limit dump to this hard interface */ static void batadv_iv_ogm_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_hard_iface *single_hardif) { struct batadv_hard_iface *hard_iface; struct list_head *iter; int i_hardif = 0; int i_hardif_s = cb->args[0]; int idx = cb->args[1]; int portid = NETLINK_CB(cb->skb).portid; rcu_read_lock(); if (single_hardif) { if (i_hardif_s == 0) { if (batadv_iv_ogm_neigh_dump_hardif(msg, portid, cb->nlh->nlmsg_seq, bat_priv, single_hardif, &idx) == 0) i_hardif++; } } else { netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) { if (i_hardif++ < i_hardif_s) continue; if (batadv_iv_ogm_neigh_dump_hardif(msg, portid, cb->nlh->nlmsg_seq, bat_priv, hard_iface, &idx)) { i_hardif--; break; } } } rcu_read_unlock(); cb->args[0] = i_hardif; cb->args[1] = idx; } /** * batadv_iv_ogm_neigh_cmp() - compare the metrics of two neighbors * @neigh1: the first neighbor object of the comparison * @if_outgoing1: outgoing interface for the first neighbor * @neigh2: the second neighbor object of the comparison * @if_outgoing2: outgoing interface for the second neighbor * * Return: a value less, equal to or greater than 0 if the metric via neigh1 is * lower, the same as or higher than the metric via neigh2 */ static int batadv_iv_ogm_neigh_cmp(struct batadv_neigh_node *neigh1, struct batadv_hard_iface *if_outgoing1, struct batadv_neigh_node *neigh2, struct batadv_hard_iface *if_outgoing2) { bool ret; int diff; ret = batadv_iv_ogm_neigh_diff(neigh1, if_outgoing1, neigh2, if_outgoing2, &diff); if (!ret) return 0; return diff; } /** * batadv_iv_ogm_neigh_is_sob() - check if neigh1 is similarly good or better * than neigh2 from the metric prospective * @neigh1: the first neighbor object of the comparison * @if_outgoing1: outgoing interface for the first neighbor * @neigh2: the second neighbor object of the comparison * @if_outgoing2: outgoing interface for the second neighbor * * Return: true if the metric via neigh1 is equally good or better than * the metric via neigh2, false otherwise. */ static bool batadv_iv_ogm_neigh_is_sob(struct batadv_neigh_node *neigh1, struct batadv_hard_iface *if_outgoing1, struct batadv_neigh_node *neigh2, struct batadv_hard_iface *if_outgoing2) { bool ret; int diff; ret = batadv_iv_ogm_neigh_diff(neigh1, if_outgoing1, neigh2, if_outgoing2, &diff); if (!ret) return false; ret = diff > -BATADV_TQ_SIMILARITY_THRESHOLD; return ret; } static void batadv_iv_iface_enabled(struct batadv_hard_iface *hard_iface) { /* begin scheduling originator messages on that interface */ batadv_iv_ogm_schedule(hard_iface); } /** * batadv_iv_init_sel_class() - initialize GW selection class * @bat_priv: the bat priv with all the mesh interface information */ static void batadv_iv_init_sel_class(struct batadv_priv *bat_priv) { /* set default TQ difference threshold to 20 */ atomic_set(&bat_priv->gw.sel_class, 20); } static struct batadv_gw_node * batadv_iv_gw_get_best_gw_node(struct batadv_priv *bat_priv) { struct batadv_neigh_node *router; struct batadv_neigh_ifinfo *router_ifinfo; struct batadv_gw_node *gw_node, *curr_gw = NULL; u64 max_gw_factor = 0; u64 tmp_gw_factor = 0; u8 max_tq = 0; u8 tq_avg; struct batadv_orig_node *orig_node; rcu_read_lock(); hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) { orig_node = gw_node->orig_node; router = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT); if (!router) continue; router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT); if (!router_ifinfo) goto next; if (!kref_get_unless_zero(&gw_node->refcount)) goto next; tq_avg = router_ifinfo->bat_iv.tq_avg; switch (atomic_read(&bat_priv->gw.sel_class)) { case 1: /* fast connection */ tmp_gw_factor = tq_avg * tq_avg; tmp_gw_factor *= gw_node->bandwidth_down; tmp_gw_factor *= 100 * 100; tmp_gw_factor >>= 18; if (tmp_gw_factor > max_gw_factor || (tmp_gw_factor == max_gw_factor && tq_avg > max_tq)) { batadv_gw_node_put(curr_gw); curr_gw = gw_node; kref_get(&curr_gw->refcount); } break; default: /* 2: stable connection (use best statistic) * 3: fast-switch (use best statistic but change as * soon as a better gateway appears) * XX: late-switch (use best statistic but change as * soon as a better gateway appears which has * $routing_class more tq points) */ if (tq_avg > max_tq) { batadv_gw_node_put(curr_gw); curr_gw = gw_node; kref_get(&curr_gw->refcount); } break; } if (tq_avg > max_tq) max_tq = tq_avg; if (tmp_gw_factor > max_gw_factor) max_gw_factor = tmp_gw_factor; batadv_gw_node_put(gw_node); next: batadv_neigh_node_put(router); batadv_neigh_ifinfo_put(router_ifinfo); } rcu_read_unlock(); return curr_gw; } static bool batadv_iv_gw_is_eligible(struct batadv_priv *bat_priv, struct batadv_orig_node *curr_gw_orig, struct batadv_orig_node *orig_node) { struct batadv_neigh_ifinfo *router_orig_ifinfo = NULL; struct batadv_neigh_ifinfo *router_gw_ifinfo = NULL; struct batadv_neigh_node *router_gw = NULL; struct batadv_neigh_node *router_orig = NULL; u8 gw_tq_avg, orig_tq_avg; bool ret = false; /* dynamic re-election is performed only on fast or late switch */ if (atomic_read(&bat_priv->gw.sel_class) <= 2) return false; router_gw = batadv_orig_router_get(curr_gw_orig, BATADV_IF_DEFAULT); if (!router_gw) { ret = true; goto out; } router_gw_ifinfo = batadv_neigh_ifinfo_get(router_gw, BATADV_IF_DEFAULT); if (!router_gw_ifinfo) { ret = true; goto out; } router_orig = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT); if (!router_orig) goto out; router_orig_ifinfo = batadv_neigh_ifinfo_get(router_orig, BATADV_IF_DEFAULT); if (!router_orig_ifinfo) goto out; gw_tq_avg = router_gw_ifinfo->bat_iv.tq_avg; orig_tq_avg = router_orig_ifinfo->bat_iv.tq_avg; /* the TQ value has to be better */ if (orig_tq_avg < gw_tq_avg) goto out; /* if the routing class is greater than 3 the value tells us how much * greater the TQ value of the new gateway must be */ if ((atomic_read(&bat_priv->gw.sel_class) > 3) && (orig_tq_avg - gw_tq_avg < atomic_read(&bat_priv->gw.sel_class))) goto out; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Restarting gateway selection: better gateway found (tq curr: %i, tq new: %i)\n", gw_tq_avg, orig_tq_avg); ret = true; out: batadv_neigh_ifinfo_put(router_gw_ifinfo); batadv_neigh_ifinfo_put(router_orig_ifinfo); batadv_neigh_node_put(router_gw); batadv_neigh_node_put(router_orig); return ret; } /** * batadv_iv_gw_dump_entry() - Dump a gateway into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @cb: Control block containing additional options * @bat_priv: The bat priv with all the mesh interface information * @gw_node: Gateway to be dumped * * Return: Error code, or 0 on success */ static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_gw_node *gw_node) { struct batadv_neigh_ifinfo *router_ifinfo = NULL; struct batadv_neigh_node *router; struct batadv_gw_node *curr_gw = NULL; int ret = 0; void *hdr; router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT); if (!router) goto out; router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT); if (!router_ifinfo) goto out; curr_gw = batadv_gw_get_selected_gw_node(bat_priv); hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_GATEWAYS); if (!hdr) { ret = -ENOBUFS; goto out; } genl_dump_check_consistent(cb, hdr); ret = -EMSGSIZE; if (curr_gw == gw_node) if (nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) { genlmsg_cancel(msg, hdr); goto out; } if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, gw_node->orig_node->orig) || nla_put_u8(msg, BATADV_ATTR_TQ, router_ifinfo->bat_iv.tq_avg) || nla_put(msg, BATADV_ATTR_ROUTER, ETH_ALEN, router->addr) || nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, router->if_incoming->net_dev->name) || nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, router->if_incoming->net_dev->ifindex) || nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_DOWN, gw_node->bandwidth_down) || nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_UP, gw_node->bandwidth_up)) { genlmsg_cancel(msg, hdr); goto out; } genlmsg_end(msg, hdr); ret = 0; out: batadv_gw_node_put(curr_gw); batadv_neigh_ifinfo_put(router_ifinfo); batadv_neigh_node_put(router); return ret; } /** * batadv_iv_gw_dump() - Dump gateways into a message * @msg: Netlink message to dump into * @cb: Control block containing additional options * @bat_priv: The bat priv with all the mesh interface information */ static void batadv_iv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb, struct batadv_priv *bat_priv) { int portid = NETLINK_CB(cb->skb).portid; struct batadv_gw_node *gw_node; int idx_skip = cb->args[0]; int idx = 0; spin_lock_bh(&bat_priv->gw.list_lock); cb->seq = bat_priv->gw.generation << 1 | 1; hlist_for_each_entry(gw_node, &bat_priv->gw.gateway_list, list) { if (idx++ < idx_skip) continue; if (batadv_iv_gw_dump_entry(msg, portid, cb, bat_priv, gw_node)) { idx_skip = idx - 1; goto unlock; } } idx_skip = idx; unlock: spin_unlock_bh(&bat_priv->gw.list_lock); cb->args[0] = idx_skip; } static struct batadv_algo_ops batadv_batman_iv __read_mostly = { .name = "BATMAN_IV", .iface = { .enable = batadv_iv_ogm_iface_enable, .enabled = batadv_iv_iface_enabled, .disable = batadv_iv_ogm_iface_disable, .update_mac = batadv_iv_ogm_iface_update_mac, .primary_set = batadv_iv_ogm_primary_iface_set, }, .neigh = { .cmp = batadv_iv_ogm_neigh_cmp, .is_similar_or_better = batadv_iv_ogm_neigh_is_sob, .dump = batadv_iv_ogm_neigh_dump, }, .orig = { .dump = batadv_iv_ogm_orig_dump, }, .gw = { .init_sel_class = batadv_iv_init_sel_class, .sel_class_max = BATADV_TQ_MAX_VALUE, .get_best_gw_node = batadv_iv_gw_get_best_gw_node, .is_eligible = batadv_iv_gw_is_eligible, .dump = batadv_iv_gw_dump, }, }; /** * batadv_iv_init() - B.A.T.M.A.N. IV initialization function * * Return: 0 on success or negative error number in case of failure */ int __init batadv_iv_init(void) { int ret; /* batman originator packet */ ret = batadv_recv_handler_register(BATADV_IV_OGM, batadv_iv_ogm_receive); if (ret < 0) goto out; ret = batadv_algo_register(&batadv_batman_iv); if (ret < 0) goto handler_unregister; goto out; handler_unregister: batadv_recv_handler_unregister(BATADV_IV_OGM); out: return ret; }
45 136 151 160 303 4 306 297 23 13 12 9 9 8 6 7 13 227 88 35 13 35 35 35 35 35 4 33 33 2 2 14 14 14 14 14 14 33 33 33 33 26 7 26 33 33 33 33 11 11 7 11 11 10 10 10 10 9 10 10 10 10 9 9 4 4 3 3 9 8 1 8 6 255 254 254 255 2 26 26 3 26 2 2 56 55 44 3 40 252 251 7 7 248 252 255 255 252 252 251 250 2 250 255 255 26 222 220 42 208 203 204 204 54 203 203 203 42 210 221 222 218 253 253 13 254 254 222 255 52 53 53 53 3 1 2 3 3 53 75 1 74 1 73 14 75 62 59 59 56 3 30 258 253 260 30 27 3 53 53 53 53 53 53 52 53 53 52 52 52 52 25 24 48 50 45 48 25 53 53 13 13 11 11 11 9 13 1 2 1 1 3 3 1 1 1 4 4 4 4 2 3 3 2 4 3 2 2 1 2 2 4 119 118 118 2 118 5 117 116 113 112 113 115 1 116 119 31 31 30 32 9 9 8 8 8 2 1 2 129 109 129 54 109 123 53 127 123 123 123 123 122 122 120 121 118 120 118 31 31 30 31 29 30 28 29 28 26 26 126 126 126 126 1 126 31 1 2 2 1 1 5 3 2 2 1 5 2 2 2 1 3 3 1 1 1 2 3 3 2 1 29 34 6 5 6 6 1 2 2 1 1 12 11 12 10 10 10 10 6 4 8 6 5 6 61 62 62 61 61 5 61 54 62 61 60 56 2 2 17 61 4 3 2 2 61 57 57 62 34 9 6 33 32 34 7 7 5 6 5 6 6 4 5 4 2 2 4 1 3 2 1 1 2 4 3 1 4 2 2 2 4 3 3 2 1 1 124 122 124 123 121 121 122 121 75 66 42 124 2 2 2 2 2 2 2 3 3 3 3 25 25 25 24 25 24 24 24 228 224 228 227 228 225 163 163 163 158 158 64 64 64 6 6 8 8 8 8 8 8 3 8 4 8 1 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 3 3 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 // SPDX-License-Identifier: GPL-2.0-or-later /* * ALSA sequencer Client Manager * Copyright (c) 1998-2001 by Frank van de Pol <fvdpol@coil.demon.nl> * Jaroslav Kysela <perex@perex.cz> * Takashi Iwai <tiwai@suse.de> */ #include <linux/init.h> #include <linux/export.h> #include <linux/slab.h> #include <sound/core.h> #include <sound/minors.h> #include <linux/kmod.h> #include <sound/seq_kernel.h> #include <sound/ump.h> #include "seq_clientmgr.h" #include "seq_memory.h" #include "seq_queue.h" #include "seq_timer.h" #include "seq_info.h" #include "seq_system.h" #include "seq_ump_convert.h" #include <sound/seq_device.h> #ifdef CONFIG_COMPAT #include <linux/compat.h> #endif /* Client Manager * this module handles the connections of userland and kernel clients * */ /* * There are four ranges of client numbers (last two shared): * 0..15: global clients * 16..127: statically allocated client numbers for cards 0..27 * 128..191: dynamically allocated client numbers for cards 28..31 * 128..191: dynamically allocated client numbers for applications */ /* number of kernel non-card clients */ #define SNDRV_SEQ_GLOBAL_CLIENTS 16 /* clients per cards, for static clients */ #define SNDRV_SEQ_CLIENTS_PER_CARD 4 /* dynamically allocated client numbers (both kernel drivers and user space) */ #define SNDRV_SEQ_DYNAMIC_CLIENTS_BEGIN 128 #define SNDRV_SEQ_LFLG_INPUT 0x0001 #define SNDRV_SEQ_LFLG_OUTPUT 0x0002 #define SNDRV_SEQ_LFLG_OPEN (SNDRV_SEQ_LFLG_INPUT|SNDRV_SEQ_LFLG_OUTPUT) static DEFINE_SPINLOCK(clients_lock); static DEFINE_MUTEX(register_mutex); /* * client table */ static char clienttablock[SNDRV_SEQ_MAX_CLIENTS]; static struct snd_seq_client *clienttab[SNDRV_SEQ_MAX_CLIENTS]; static struct snd_seq_usage client_usage; /* * prototypes */ static int bounce_error_event(struct snd_seq_client *client, struct snd_seq_event *event, int err, int atomic, int hop); static int snd_seq_deliver_single_event(struct snd_seq_client *client, struct snd_seq_event *event, int atomic, int hop); #if IS_ENABLED(CONFIG_SND_SEQ_UMP) static void free_ump_info(struct snd_seq_client *client); #endif /* */ static inline unsigned short snd_seq_file_flags(struct file *file) { switch (file->f_mode & (FMODE_READ | FMODE_WRITE)) { case FMODE_WRITE: return SNDRV_SEQ_LFLG_OUTPUT; case FMODE_READ: return SNDRV_SEQ_LFLG_INPUT; default: return SNDRV_SEQ_LFLG_OPEN; } } static inline int snd_seq_write_pool_allocated(struct snd_seq_client *client) { return snd_seq_total_cells(client->pool) > 0; } /* return pointer to client structure for specified id */ static struct snd_seq_client *clientptr(int clientid) { if (clientid < 0 || clientid >= SNDRV_SEQ_MAX_CLIENTS) { pr_debug("ALSA: seq: oops. Trying to get pointer to client %d\n", clientid); return NULL; } return clienttab[clientid]; } static struct snd_seq_client *client_use_ptr(int clientid, bool load_module) { struct snd_seq_client *client; if (clientid < 0 || clientid >= SNDRV_SEQ_MAX_CLIENTS) { pr_debug("ALSA: seq: oops. Trying to get pointer to client %d\n", clientid); return NULL; } scoped_guard(spinlock_irqsave, &clients_lock) { client = clientptr(clientid); if (client) return snd_seq_client_ref(client); if (clienttablock[clientid]) return NULL; } #ifdef CONFIG_MODULES if (load_module) { static DECLARE_BITMAP(client_requested, SNDRV_SEQ_GLOBAL_CLIENTS); static DECLARE_BITMAP(card_requested, SNDRV_CARDS); if (clientid < SNDRV_SEQ_GLOBAL_CLIENTS) { int idx; if (!test_and_set_bit(clientid, client_requested)) { for (idx = 0; idx < 15; idx++) { if (seq_client_load[idx] < 0) break; if (seq_client_load[idx] == clientid) { request_module("snd-seq-client-%i", clientid); break; } } } } else if (clientid < SNDRV_SEQ_DYNAMIC_CLIENTS_BEGIN) { int card = (clientid - SNDRV_SEQ_GLOBAL_CLIENTS) / SNDRV_SEQ_CLIENTS_PER_CARD; if (card < snd_ecards_limit) { if (!test_and_set_bit(card, card_requested)) snd_request_card(card); snd_seq_device_load_drivers(); } } scoped_guard(spinlock_irqsave, &clients_lock) { client = clientptr(clientid); if (client) return snd_seq_client_ref(client); } } #endif return NULL; } /* get snd_seq_client object for the given id quickly */ struct snd_seq_client *snd_seq_client_use_ptr(int clientid) { return client_use_ptr(clientid, false); } /* get snd_seq_client object for the given id; * if not found, retry after loading the modules */ static struct snd_seq_client *client_load_and_use_ptr(int clientid) { return client_use_ptr(clientid, IS_ENABLED(CONFIG_MODULES)); } static void usage_alloc(struct snd_seq_usage *res, int num) { res->cur += num; if (res->cur > res->peak) res->peak = res->cur; } static void usage_free(struct snd_seq_usage *res, int num) { res->cur -= num; } /* initialise data structures */ int __init client_init_data(void) { /* zap out the client table */ memset(&clienttablock, 0, sizeof(clienttablock)); memset(&clienttab, 0, sizeof(clienttab)); return 0; } static struct snd_seq_client *seq_create_client1(int client_index, int poolsize) { int c; struct snd_seq_client *client; /* init client data */ client = kzalloc(sizeof(*client), GFP_KERNEL); if (client == NULL) return NULL; client->pool = snd_seq_pool_new(poolsize); if (client->pool == NULL) { kfree(client); return NULL; } client->type = NO_CLIENT; snd_use_lock_init(&client->use_lock); rwlock_init(&client->ports_lock); mutex_init(&client->ports_mutex); INIT_LIST_HEAD(&client->ports_list_head); mutex_init(&client->ioctl_mutex); client->ump_endpoint_port = -1; /* find free slot in the client table */ scoped_guard(spinlock_irq, &clients_lock) { if (client_index < 0) { for (c = SNDRV_SEQ_DYNAMIC_CLIENTS_BEGIN; c < SNDRV_SEQ_MAX_CLIENTS; c++) { if (clienttab[c] || clienttablock[c]) continue; clienttab[client->number = c] = client; return client; } } else { if (clienttab[client_index] == NULL && !clienttablock[client_index]) { clienttab[client->number = client_index] = client; return client; } } } snd_seq_pool_delete(&client->pool); kfree(client); return NULL; /* no free slot found or busy, return failure code */ } static int seq_free_client1(struct snd_seq_client *client) { if (!client) return 0; scoped_guard(spinlock_irq, &clients_lock) { clienttablock[client->number] = 1; clienttab[client->number] = NULL; } snd_seq_delete_all_ports(client); snd_seq_queue_client_leave(client->number); snd_use_lock_sync(&client->use_lock); if (client->pool) snd_seq_pool_delete(&client->pool); scoped_guard(spinlock_irq, &clients_lock) { clienttablock[client->number] = 0; } return 0; } static void seq_free_client(struct snd_seq_client * client) { scoped_guard(mutex, &register_mutex) { switch (client->type) { case NO_CLIENT: pr_warn("ALSA: seq: Trying to free unused client %d\n", client->number); break; case USER_CLIENT: case KERNEL_CLIENT: seq_free_client1(client); usage_free(&client_usage, 1); break; default: pr_err("ALSA: seq: Trying to free client %d with undefined type = %d\n", client->number, client->type); } } snd_seq_system_client_ev_client_exit(client->number); } /* -------------------------------------------------------- */ /* create a user client */ static int snd_seq_open(struct inode *inode, struct file *file) { int c, mode; /* client id */ struct snd_seq_client *client; struct snd_seq_user_client *user; int err; err = stream_open(inode, file); if (err < 0) return err; scoped_guard(mutex, &register_mutex) { client = seq_create_client1(-1, SNDRV_SEQ_DEFAULT_EVENTS); if (!client) return -ENOMEM; /* failure code */ mode = snd_seq_file_flags(file); if (mode & SNDRV_SEQ_LFLG_INPUT) client->accept_input = 1; if (mode & SNDRV_SEQ_LFLG_OUTPUT) client->accept_output = 1; user = &client->data.user; user->fifo = NULL; user->fifo_pool_size = 0; if (mode & SNDRV_SEQ_LFLG_INPUT) { user->fifo_pool_size = SNDRV_SEQ_DEFAULT_CLIENT_EVENTS; user->fifo = snd_seq_fifo_new(user->fifo_pool_size); if (user->fifo == NULL) { seq_free_client1(client); kfree(client); return -ENOMEM; } } usage_alloc(&client_usage, 1); client->type = USER_CLIENT; } c = client->number; file->private_data = client; /* fill client data */ user->file = file; sprintf(client->name, "Client-%d", c); client->data.user.owner = get_pid(task_pid(current)); /* make others aware this new client */ snd_seq_system_client_ev_client_start(c); return 0; } /* delete a user client */ static int snd_seq_release(struct inode *inode, struct file *file) { struct snd_seq_client *client = file->private_data; if (client) { seq_free_client(client); if (client->data.user.fifo) snd_seq_fifo_delete(&client->data.user.fifo); #if IS_ENABLED(CONFIG_SND_SEQ_UMP) free_ump_info(client); #endif put_pid(client->data.user.owner); kfree(client); } return 0; } static bool event_is_compatible(const struct snd_seq_client *client, const struct snd_seq_event *ev) { if (snd_seq_ev_is_ump(ev) && !client->midi_version) return false; if (snd_seq_ev_is_ump(ev) && snd_seq_ev_is_variable(ev)) return false; return true; } /* handle client read() */ /* possible error values: * -ENXIO invalid client or file open mode * -ENOSPC FIFO overflow (the flag is cleared after this error report) * -EINVAL no enough user-space buffer to write the whole event * -EFAULT seg. fault during copy to user space */ static ssize_t snd_seq_read(struct file *file, char __user *buf, size_t count, loff_t *offset) { struct snd_seq_client *client = file->private_data; struct snd_seq_fifo *fifo; size_t aligned_size; int err; long result = 0; struct snd_seq_event_cell *cell; if (!(snd_seq_file_flags(file) & SNDRV_SEQ_LFLG_INPUT)) return -ENXIO; if (!access_ok(buf, count)) return -EFAULT; /* check client structures are in place */ if (snd_BUG_ON(!client)) return -ENXIO; if (!client->accept_input) return -ENXIO; fifo = client->data.user.fifo; if (!fifo) return -ENXIO; if (atomic_read(&fifo->overflow) > 0) { /* buffer overflow is detected */ snd_seq_fifo_clear(fifo); /* return error code */ return -ENOSPC; } cell = NULL; err = 0; guard(snd_seq_fifo)(fifo); if (IS_ENABLED(CONFIG_SND_SEQ_UMP) && client->midi_version > 0) aligned_size = sizeof(struct snd_seq_ump_event); else aligned_size = sizeof(struct snd_seq_event); /* while data available in queue */ while (count >= aligned_size) { int nonblock; nonblock = (file->f_flags & O_NONBLOCK) || result > 0; err = snd_seq_fifo_cell_out(fifo, &cell, nonblock); if (err < 0) break; if (!event_is_compatible(client, &cell->event)) { snd_seq_cell_free(cell); cell = NULL; continue; } if (snd_seq_ev_is_variable(&cell->event)) { struct snd_seq_ump_event tmpev; memcpy(&tmpev, &cell->event, aligned_size); tmpev.data.ext.len &= ~SNDRV_SEQ_EXT_MASK; if (copy_to_user(buf, &tmpev, aligned_size)) { err = -EFAULT; break; } count -= aligned_size; buf += aligned_size; err = snd_seq_expand_var_event(&cell->event, count, (char __force *)buf, 0, aligned_size); if (err < 0) break; result += err; count -= err; buf += err; } else { if (copy_to_user(buf, &cell->event, aligned_size)) { err = -EFAULT; break; } count -= aligned_size; buf += aligned_size; } snd_seq_cell_free(cell); cell = NULL; /* to be sure */ result += aligned_size; } if (err < 0) { if (cell) snd_seq_fifo_cell_putback(fifo, cell); if (err == -EAGAIN && result > 0) err = 0; } return (err < 0) ? err : result; } /* * check access permission to the port */ static int check_port_perm(struct snd_seq_client_port *port, unsigned int flags) { if ((port->capability & flags) != flags) return 0; return flags; } /* * check if the destination client is available, and return the pointer */ static struct snd_seq_client *get_event_dest_client(struct snd_seq_event *event) { struct snd_seq_client *dest __free(snd_seq_client) = snd_seq_client_use_ptr(event->dest.client); if (dest == NULL) return NULL; if (! dest->accept_input) return NULL; if (snd_seq_ev_is_ump(event)) return no_free_ptr(dest); /* ok - no filter checks */ if ((dest->filter & SNDRV_SEQ_FILTER_USE_EVENT) && ! test_bit(event->type, dest->event_filter)) return NULL; return no_free_ptr(dest); /* ok - accessible */ } /* * Return the error event. * * If the receiver client is a user client, the original event is * encapsulated in SNDRV_SEQ_EVENT_BOUNCE as variable length event. If * the original event is also variable length, the external data is * copied after the event record. * If the receiver client is a kernel client, the original event is * quoted in SNDRV_SEQ_EVENT_KERNEL_ERROR, since this requires no extra * kmalloc. */ static int bounce_error_event(struct snd_seq_client *client, struct snd_seq_event *event, int err, int atomic, int hop) { struct snd_seq_event bounce_ev; int result; if (client == NULL || ! (client->filter & SNDRV_SEQ_FILTER_BOUNCE) || ! client->accept_input) return 0; /* ignored */ /* set up quoted error */ memset(&bounce_ev, 0, sizeof(bounce_ev)); bounce_ev.type = SNDRV_SEQ_EVENT_KERNEL_ERROR; bounce_ev.flags = SNDRV_SEQ_EVENT_LENGTH_FIXED; bounce_ev.queue = SNDRV_SEQ_QUEUE_DIRECT; bounce_ev.source.client = SNDRV_SEQ_CLIENT_SYSTEM; bounce_ev.source.port = SNDRV_SEQ_PORT_SYSTEM_ANNOUNCE; bounce_ev.dest.client = client->number; bounce_ev.dest.port = event->source.port; bounce_ev.data.quote.origin = event->dest; bounce_ev.data.quote.event = event; bounce_ev.data.quote.value = -err; /* use positive value */ result = snd_seq_deliver_single_event(NULL, &bounce_ev, atomic, hop + 1); if (result < 0) { client->event_lost++; return result; } return result; } /* * rewrite the time-stamp of the event record with the curren time * of the given queue. * return non-zero if updated. */ static int update_timestamp_of_queue(struct snd_seq_event *event, int queue, int real_time) { struct snd_seq_queue *q __free(snd_seq_queue) = queueptr(queue); if (! q) return 0; event->queue = queue; event->flags &= ~SNDRV_SEQ_TIME_STAMP_MASK; if (real_time) { event->time.time = snd_seq_timer_get_cur_time(q->timer, true); event->flags |= SNDRV_SEQ_TIME_STAMP_REAL; } else { event->time.tick = snd_seq_timer_get_cur_tick(q->timer); event->flags |= SNDRV_SEQ_TIME_STAMP_TICK; } return 1; } /* deliver a single event; called from below and UMP converter */ int __snd_seq_deliver_single_event(struct snd_seq_client *dest, struct snd_seq_client_port *dest_port, struct snd_seq_event *event, int atomic, int hop) { switch (dest->type) { case USER_CLIENT: if (!dest->data.user.fifo) return 0; return snd_seq_fifo_event_in(dest->data.user.fifo, event); case KERNEL_CLIENT: if (!dest_port->event_input) return 0; return dest_port->event_input(event, snd_seq_ev_is_direct(event), dest_port->private_data, atomic, hop); } return 0; } /* deliver a single event; called from snd_seq_deliver_single_event() */ static int _snd_seq_deliver_single_event(struct snd_seq_client *client, struct snd_seq_event *event, int atomic, int hop) { struct snd_seq_client *dest __free(snd_seq_client) = get_event_dest_client(event); if (dest == NULL) return -ENOENT; struct snd_seq_client_port *dest_port __free(snd_seq_port) = snd_seq_port_use_ptr(dest, event->dest.port); if (dest_port == NULL) return -ENOENT; /* check permission */ if (!check_port_perm(dest_port, SNDRV_SEQ_PORT_CAP_WRITE)) return -EPERM; if (dest_port->timestamping) update_timestamp_of_queue(event, dest_port->time_queue, dest_port->time_real); #if IS_ENABLED(CONFIG_SND_SEQ_UMP) if (snd_seq_ev_is_ump(event)) { if (!(dest->filter & SNDRV_SEQ_FILTER_NO_CONVERT)) return snd_seq_deliver_from_ump(client, dest, dest_port, event, atomic, hop); else if (dest->type == USER_CLIENT && !snd_seq_client_is_ump(dest)) return 0; // drop the event } else if (snd_seq_client_is_ump(dest)) { if (!(dest->filter & SNDRV_SEQ_FILTER_NO_CONVERT)) return snd_seq_deliver_to_ump(client, dest, dest_port, event, atomic, hop); } #endif /* CONFIG_SND_SEQ_UMP */ return __snd_seq_deliver_single_event(dest, dest_port, event, atomic, hop); } /* * deliver an event to the specified destination. * if filter is non-zero, client filter bitmap is tested. * * RETURN VALUE: 0 : if succeeded * <0 : error */ static int snd_seq_deliver_single_event(struct snd_seq_client *client, struct snd_seq_event *event, int atomic, int hop) { int result = _snd_seq_deliver_single_event(client, event, atomic, hop); if (result < 0 && !snd_seq_ev_is_direct(event)) return bounce_error_event(client, event, result, atomic, hop); return result; } /* * send the event to all subscribers: */ static int __deliver_to_subscribers(struct snd_seq_client *client, struct snd_seq_event *event, int port, int atomic, int hop) { struct snd_seq_subscribers *subs; int err, result = 0, num_ev = 0; union __snd_seq_event event_saved; size_t saved_size; struct snd_seq_port_subs_info *grp; if (port < 0) return 0; struct snd_seq_client_port *src_port __free(snd_seq_port) = snd_seq_port_use_ptr(client, port); if (!src_port) return 0; /* save original event record */ saved_size = snd_seq_event_packet_size(event); memcpy(&event_saved, event, saved_size); grp = &src_port->c_src; /* lock list */ if (atomic) read_lock(&grp->list_lock); else down_read_nested(&grp->list_mutex, hop); list_for_each_entry(subs, &grp->list_head, src_list) { /* both ports ready? */ if (atomic_read(&subs->ref_count) != 2) continue; event->dest = subs->info.dest; if (subs->info.flags & SNDRV_SEQ_PORT_SUBS_TIMESTAMP) /* convert time according to flag with subscription */ update_timestamp_of_queue(event, subs->info.queue, subs->info.flags & SNDRV_SEQ_PORT_SUBS_TIME_REAL); err = snd_seq_deliver_single_event(client, event, atomic, hop); if (err < 0) { /* save first error that occurs and continue */ if (!result) result = err; continue; } num_ev++; /* restore original event record */ memcpy(event, &event_saved, saved_size); } if (atomic) read_unlock(&grp->list_lock); else up_read(&grp->list_mutex); memcpy(event, &event_saved, saved_size); return (result < 0) ? result : num_ev; } static int deliver_to_subscribers(struct snd_seq_client *client, struct snd_seq_event *event, int atomic, int hop) { int ret; #if IS_ENABLED(CONFIG_SND_SEQ_UMP) int ret2; #endif ret = __deliver_to_subscribers(client, event, event->source.port, atomic, hop); #if IS_ENABLED(CONFIG_SND_SEQ_UMP) if (!snd_seq_client_is_ump(client) || client->ump_endpoint_port < 0) return ret; /* If it's an event from EP port (and with a UMP group), * deliver to subscribers of the corresponding UMP group port, too. * Or, if it's from non-EP port, deliver to subscribers of EP port, too. */ if (event->source.port == client->ump_endpoint_port) ret2 = __deliver_to_subscribers(client, event, snd_seq_ump_group_port(event), atomic, hop); else ret2 = __deliver_to_subscribers(client, event, client->ump_endpoint_port, atomic, hop); if (ret2 < 0) return ret2; #endif return ret; } /* deliver an event to the destination port(s). * if the event is to subscribers or broadcast, the event is dispatched * to multiple targets. * * RETURN VALUE: n > 0 : the number of delivered events. * n == 0 : the event was not passed to any client. * n < 0 : error - event was not processed. */ static int snd_seq_deliver_event(struct snd_seq_client *client, struct snd_seq_event *event, int atomic, int hop) { int result; hop++; if (hop >= SNDRV_SEQ_MAX_HOPS) { pr_debug("ALSA: seq: too long delivery path (%d:%d->%d:%d)\n", event->source.client, event->source.port, event->dest.client, event->dest.port); return -EMLINK; } if (snd_seq_ev_is_variable(event) && snd_BUG_ON(atomic && (event->data.ext.len & SNDRV_SEQ_EXT_USRPTR))) return -EINVAL; if (event->queue == SNDRV_SEQ_ADDRESS_SUBSCRIBERS || event->dest.client == SNDRV_SEQ_ADDRESS_SUBSCRIBERS) result = deliver_to_subscribers(client, event, atomic, hop); else result = snd_seq_deliver_single_event(client, event, atomic, hop); return result; } /* * dispatch an event cell: * This function is called only from queue check routines in timer * interrupts or after enqueued. * The event cell shall be released or re-queued in this function. * * RETURN VALUE: n > 0 : the number of delivered events. * n == 0 : the event was not passed to any client. * n < 0 : error - event was not processed. */ int snd_seq_dispatch_event(struct snd_seq_event_cell *cell, int atomic, int hop) { int result; if (snd_BUG_ON(!cell)) return -EINVAL; struct snd_seq_client *client __free(snd_seq_client) = snd_seq_client_use_ptr(cell->event.source.client); if (client == NULL) { snd_seq_cell_free(cell); /* release this cell */ return -EINVAL; } if (!snd_seq_ev_is_ump(&cell->event) && cell->event.type == SNDRV_SEQ_EVENT_NOTE) { /* NOTE event: * the event cell is re-used as a NOTE-OFF event and * enqueued again. */ struct snd_seq_event tmpev, *ev; /* reserve this event to enqueue note-off later */ tmpev = cell->event; tmpev.type = SNDRV_SEQ_EVENT_NOTEON; result = snd_seq_deliver_event(client, &tmpev, atomic, hop); /* * This was originally a note event. We now re-use the * cell for the note-off event. */ ev = &cell->event; ev->type = SNDRV_SEQ_EVENT_NOTEOFF; ev->flags |= SNDRV_SEQ_PRIORITY_HIGH; /* add the duration time */ switch (ev->flags & SNDRV_SEQ_TIME_STAMP_MASK) { case SNDRV_SEQ_TIME_STAMP_TICK: cell->event.time.tick += ev->data.note.duration; break; case SNDRV_SEQ_TIME_STAMP_REAL: /* unit for duration is ms */ ev->time.time.tv_nsec += 1000000 * (ev->data.note.duration % 1000); ev->time.time.tv_sec += ev->data.note.duration / 1000 + ev->time.time.tv_nsec / 1000000000; ev->time.time.tv_nsec %= 1000000000; break; } ev->data.note.velocity = ev->data.note.off_velocity; /* Now queue this cell as the note off event */ if (snd_seq_enqueue_event(cell, atomic, hop) < 0) snd_seq_cell_free(cell); /* release this cell */ } else { /* Normal events: * event cell is freed after processing the event */ result = snd_seq_deliver_event(client, &cell->event, atomic, hop); snd_seq_cell_free(cell); } return result; } /* Allocate a cell from client pool and enqueue it to queue: * if pool is empty and blocking is TRUE, sleep until a new cell is * available. */ static int snd_seq_client_enqueue_event(struct snd_seq_client *client, struct snd_seq_event *event, struct file *file, int blocking, int atomic, int hop, struct mutex *mutexp) { struct snd_seq_event_cell *cell; int err; /* special queue values - force direct passing */ if (event->queue == SNDRV_SEQ_ADDRESS_SUBSCRIBERS) { event->dest.client = SNDRV_SEQ_ADDRESS_SUBSCRIBERS; event->queue = SNDRV_SEQ_QUEUE_DIRECT; } else if (event->dest.client == SNDRV_SEQ_ADDRESS_SUBSCRIBERS) { /* check presence of source port */ struct snd_seq_client_port *src_port __free(snd_seq_port) = snd_seq_port_use_ptr(client, event->source.port); if (!src_port) return -EINVAL; } /* direct event processing without enqueued */ if (snd_seq_ev_is_direct(event)) { if (!snd_seq_ev_is_ump(event) && event->type == SNDRV_SEQ_EVENT_NOTE) return -EINVAL; /* this event must be enqueued! */ return snd_seq_deliver_event(client, event, atomic, hop); } /* Not direct, normal queuing */ if (snd_seq_queue_is_used(event->queue, client->number) <= 0) return -EINVAL; /* invalid queue */ if (! snd_seq_write_pool_allocated(client)) return -ENXIO; /* queue is not allocated */ /* allocate an event cell */ err = snd_seq_event_dup(client->pool, event, &cell, !blocking || atomic, file, mutexp); if (err < 0) return err; /* we got a cell. enqueue it. */ err = snd_seq_enqueue_event(cell, atomic, hop); if (err < 0) { snd_seq_cell_free(cell); return err; } return 0; } /* * check validity of event type and data length. * return non-zero if invalid. */ static int check_event_type_and_length(struct snd_seq_event *ev) { switch (snd_seq_ev_length_type(ev)) { case SNDRV_SEQ_EVENT_LENGTH_FIXED: if (snd_seq_ev_is_variable_type(ev)) return -EINVAL; break; case SNDRV_SEQ_EVENT_LENGTH_VARIABLE: if (! snd_seq_ev_is_variable_type(ev) || (ev->data.ext.len & ~SNDRV_SEQ_EXT_MASK) >= SNDRV_SEQ_MAX_EVENT_LEN) return -EINVAL; break; case SNDRV_SEQ_EVENT_LENGTH_VARUSR: if (! snd_seq_ev_is_direct(ev)) return -EINVAL; break; } return 0; } /* handle write() */ /* possible error values: * -ENXIO invalid client or file open mode * -ENOMEM malloc failed * -EFAULT seg. fault during copy from user space * -EINVAL invalid event * -EAGAIN no space in output pool * -EINTR interrupts while sleep * -EMLINK too many hops * others depends on return value from driver callback */ static ssize_t snd_seq_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { struct snd_seq_client *client = file->private_data; int written = 0, len; int err, handled; union __snd_seq_event __event; struct snd_seq_event *ev = &__event.legacy; if (!(snd_seq_file_flags(file) & SNDRV_SEQ_LFLG_OUTPUT)) return -ENXIO; /* check client structures are in place */ if (snd_BUG_ON(!client)) return -ENXIO; if (!client->accept_output || client->pool == NULL) return -ENXIO; repeat: handled = 0; /* allocate the pool now if the pool is not allocated yet */ mutex_lock(&client->ioctl_mutex); if (client->pool->size > 0 && !snd_seq_write_pool_allocated(client)) { err = snd_seq_pool_init(client->pool); if (err < 0) goto out; } /* only process whole events */ err = -EINVAL; while (count >= sizeof(struct snd_seq_event)) { /* Read in the event header from the user */ len = sizeof(struct snd_seq_event); if (copy_from_user(ev, buf, len)) { err = -EFAULT; break; } /* read in the rest bytes for UMP events */ if (snd_seq_ev_is_ump(ev)) { if (count < sizeof(struct snd_seq_ump_event)) break; if (copy_from_user((char *)ev + len, buf + len, sizeof(struct snd_seq_ump_event) - len)) { err = -EFAULT; break; } len = sizeof(struct snd_seq_ump_event); } ev->source.client = client->number; /* fill in client number */ /* Check for extension data length */ if (check_event_type_and_length(ev)) { err = -EINVAL; break; } if (!event_is_compatible(client, ev)) { err = -EINVAL; break; } /* check for special events */ if (!snd_seq_ev_is_ump(ev)) { if (ev->type == SNDRV_SEQ_EVENT_NONE) goto __skip_event; else if (snd_seq_ev_is_reserved(ev)) { err = -EINVAL; break; } } if (snd_seq_ev_is_variable(ev)) { int extlen = ev->data.ext.len & ~SNDRV_SEQ_EXT_MASK; if ((size_t)(extlen + len) > count) { /* back out, will get an error this time or next */ err = -EINVAL; break; } /* set user space pointer */ ev->data.ext.len = extlen | SNDRV_SEQ_EXT_USRPTR; ev->data.ext.ptr = (char __force *)buf + len; len += extlen; /* increment data length */ } else { #ifdef CONFIG_COMPAT if (client->convert32 && snd_seq_ev_is_varusr(ev)) ev->data.ext.ptr = (void __force *)compat_ptr(ev->data.raw32.d[1]); #endif } /* ok, enqueue it */ err = snd_seq_client_enqueue_event(client, ev, file, !(file->f_flags & O_NONBLOCK), 0, 0, &client->ioctl_mutex); if (err < 0) break; handled++; __skip_event: /* Update pointers and counts */ count -= len; buf += len; written += len; /* let's have a coffee break if too many events are queued */ if (++handled >= 200) { mutex_unlock(&client->ioctl_mutex); goto repeat; } } out: mutex_unlock(&client->ioctl_mutex); return written ? written : err; } /* * handle polling */ static __poll_t snd_seq_poll(struct file *file, poll_table * wait) { struct snd_seq_client *client = file->private_data; __poll_t mask = 0; /* check client structures are in place */ if (snd_BUG_ON(!client)) return EPOLLERR; if ((snd_seq_file_flags(file) & SNDRV_SEQ_LFLG_INPUT) && client->data.user.fifo) { /* check if data is available in the outqueue */ if (snd_seq_fifo_poll_wait(client->data.user.fifo, file, wait)) mask |= EPOLLIN | EPOLLRDNORM; } if (snd_seq_file_flags(file) & SNDRV_SEQ_LFLG_OUTPUT) { /* check if data is available in the pool */ if (snd_seq_pool_poll_wait(client->pool, file, wait)) mask |= EPOLLOUT | EPOLLWRNORM; } return mask; } /*-----------------------------------------------------*/ static int snd_seq_ioctl_pversion(struct snd_seq_client *client, void *arg) { int *pversion = arg; *pversion = SNDRV_SEQ_VERSION; return 0; } static int snd_seq_ioctl_user_pversion(struct snd_seq_client *client, void *arg) { client->user_pversion = *(unsigned int *)arg; return 0; } static int snd_seq_ioctl_client_id(struct snd_seq_client *client, void *arg) { int *client_id = arg; *client_id = client->number; return 0; } /* SYSTEM_INFO ioctl() */ static int snd_seq_ioctl_system_info(struct snd_seq_client *client, void *arg) { struct snd_seq_system_info *info = arg; memset(info, 0, sizeof(*info)); /* fill the info fields */ info->queues = SNDRV_SEQ_MAX_QUEUES; info->clients = SNDRV_SEQ_MAX_CLIENTS; info->ports = SNDRV_SEQ_MAX_PORTS; info->channels = 256; /* fixed limit */ info->cur_clients = client_usage.cur; info->cur_queues = snd_seq_queue_get_cur_queues(); return 0; } /* RUNNING_MODE ioctl() */ static int snd_seq_ioctl_running_mode(struct snd_seq_client *client, void *arg) { struct snd_seq_running_info *info = arg; /* requested client number */ struct snd_seq_client *cptr __free(snd_seq_client) = client_load_and_use_ptr(info->client); if (cptr == NULL) return -ENOENT; /* don't change !!! */ #ifdef SNDRV_BIG_ENDIAN if (!info->big_endian) return -EINVAL; #else if (info->big_endian) return -EINVAL; #endif if (info->cpu_mode > sizeof(long)) return -EINVAL; cptr->convert32 = (info->cpu_mode < sizeof(long)); return 0; } /* CLIENT_INFO ioctl() */ static void get_client_info(struct snd_seq_client *cptr, struct snd_seq_client_info *info) { info->client = cptr->number; /* fill the info fields */ info->type = cptr->type; strscpy(info->name, cptr->name); info->filter = cptr->filter; info->event_lost = cptr->event_lost; memcpy(info->event_filter, cptr->event_filter, 32); info->group_filter = cptr->group_filter; info->num_ports = cptr->num_ports; if (cptr->type == USER_CLIENT) info->pid = pid_vnr(cptr->data.user.owner); else info->pid = -1; if (cptr->type == KERNEL_CLIENT) info->card = cptr->data.kernel.card ? cptr->data.kernel.card->number : -1; else info->card = -1; info->midi_version = cptr->midi_version; memset(info->reserved, 0, sizeof(info->reserved)); } static int snd_seq_ioctl_get_client_info(struct snd_seq_client *client, void *arg) { struct snd_seq_client_info *client_info = arg; /* requested client number */ struct snd_seq_client *cptr __free(snd_seq_client) = client_load_and_use_ptr(client_info->client); if (cptr == NULL) return -ENOENT; /* don't change !!! */ get_client_info(cptr, client_info); return 0; } /* CLIENT_INFO ioctl() */ static int snd_seq_ioctl_set_client_info(struct snd_seq_client *client, void *arg) { struct snd_seq_client_info *client_info = arg; /* it is not allowed to set the info fields for an another client */ if (client->number != client_info->client) return -EPERM; /* also client type must be set now */ if (client->type != client_info->type) return -EINVAL; if (client->user_pversion >= SNDRV_PROTOCOL_VERSION(1, 0, 3)) { /* check validity of midi_version field */ if (client_info->midi_version > SNDRV_SEQ_CLIENT_UMP_MIDI_2_0) return -EINVAL; /* check if UMP is supported in kernel */ if (!IS_ENABLED(CONFIG_SND_SEQ_UMP) && client_info->midi_version > 0) return -EINVAL; } /* fill the info fields */ if (client_info->name[0]) strscpy(client->name, client_info->name, sizeof(client->name)); client->filter = client_info->filter; client->event_lost = client_info->event_lost; if (client->user_pversion >= SNDRV_PROTOCOL_VERSION(1, 0, 3)) client->midi_version = client_info->midi_version; memcpy(client->event_filter, client_info->event_filter, 32); client->group_filter = client_info->group_filter; /* notify the change */ snd_seq_system_client_ev_client_change(client->number); return 0; } /* * CREATE PORT ioctl() */ static int snd_seq_ioctl_create_port(struct snd_seq_client *client, void *arg) { struct snd_seq_port_info *info = arg; struct snd_seq_client_port *port; struct snd_seq_port_callback *callback; int port_idx, err; /* it is not allowed to create the port for an another client */ if (info->addr.client != client->number) return -EPERM; if (client->type == USER_CLIENT && info->kernel) return -EINVAL; if ((info->capability & SNDRV_SEQ_PORT_CAP_UMP_ENDPOINT) && client->ump_endpoint_port >= 0) return -EBUSY; if (info->flags & SNDRV_SEQ_PORT_FLG_GIVEN_PORT) port_idx = info->addr.port; else port_idx = -1; if (port_idx >= SNDRV_SEQ_ADDRESS_UNKNOWN) return -EINVAL; err = snd_seq_create_port(client, port_idx, &port); if (err < 0) return err; if (client->type == KERNEL_CLIENT) { callback = info->kernel; if (callback) { if (callback->owner) port->owner = callback->owner; port->private_data = callback->private_data; port->private_free = callback->private_free; port->event_input = callback->event_input; port->c_src.open = callback->subscribe; port->c_src.close = callback->unsubscribe; port->c_dest.open = callback->use; port->c_dest.close = callback->unuse; } } info->addr = port->addr; snd_seq_set_port_info(port, info); if (info->capability & SNDRV_SEQ_PORT_CAP_UMP_ENDPOINT) client->ump_endpoint_port = port->addr.port; snd_seq_system_client_ev_port_start(port->addr.client, port->addr.port); snd_seq_port_unlock(port); return 0; } /* * DELETE PORT ioctl() */ static int snd_seq_ioctl_delete_port(struct snd_seq_client *client, void *arg) { struct snd_seq_port_info *info = arg; int err; /* it is not allowed to remove the port for an another client */ if (info->addr.client != client->number) return -EPERM; err = snd_seq_delete_port(client, info->addr.port); if (err >= 0) { if (client->ump_endpoint_port == info->addr.port) client->ump_endpoint_port = -1; snd_seq_system_client_ev_port_exit(client->number, info->addr.port); } return err; } /* * GET_PORT_INFO ioctl() (on any client) */ static int snd_seq_ioctl_get_port_info(struct snd_seq_client *client, void *arg) { struct snd_seq_port_info *info = arg; struct snd_seq_client *cptr __free(snd_seq_client) = client_load_and_use_ptr(info->addr.client); if (cptr == NULL) return -ENXIO; struct snd_seq_client_port *port __free(snd_seq_port) = snd_seq_port_use_ptr(cptr, info->addr.port); if (port == NULL) return -ENOENT; /* don't change */ /* get port info */ snd_seq_get_port_info(port, info); return 0; } /* * SET_PORT_INFO ioctl() (only ports on this/own client) */ static int snd_seq_ioctl_set_port_info(struct snd_seq_client *client, void *arg) { struct snd_seq_port_info *info = arg; if (info->addr.client != client->number) /* only set our own ports ! */ return -EPERM; struct snd_seq_client_port *port __free(snd_seq_port) = snd_seq_port_use_ptr(client, info->addr.port); if (port) { snd_seq_set_port_info(port, info); /* notify the change */ snd_seq_system_client_ev_port_change(info->addr.client, info->addr.port); } return 0; } /* * port subscription (connection) */ #define PERM_RD (SNDRV_SEQ_PORT_CAP_READ|SNDRV_SEQ_PORT_CAP_SUBS_READ) #define PERM_WR (SNDRV_SEQ_PORT_CAP_WRITE|SNDRV_SEQ_PORT_CAP_SUBS_WRITE) static int check_subscription_permission(struct snd_seq_client *client, struct snd_seq_client_port *sport, struct snd_seq_client_port *dport, struct snd_seq_port_subscribe *subs) { if (client->number != subs->sender.client && client->number != subs->dest.client) { /* connection by third client - check export permission */ if (check_port_perm(sport, SNDRV_SEQ_PORT_CAP_NO_EXPORT)) return -EPERM; if (check_port_perm(dport, SNDRV_SEQ_PORT_CAP_NO_EXPORT)) return -EPERM; } /* check read permission */ /* if sender or receiver is the subscribing client itself, * no permission check is necessary */ if (client->number != subs->sender.client) { if (! check_port_perm(sport, PERM_RD)) return -EPERM; } /* check write permission */ if (client->number != subs->dest.client) { if (! check_port_perm(dport, PERM_WR)) return -EPERM; } return 0; } /* * send an subscription notify event to user client: * client must be user client. */ int snd_seq_client_notify_subscription(int client, int port, struct snd_seq_port_subscribe *info, int evtype) { struct snd_seq_event event; memset(&event, 0, sizeof(event)); event.type = evtype; event.data.connect.dest = info->dest; event.data.connect.sender = info->sender; return snd_seq_system_notify(client, port, &event, false); /* non-atomic */ } /* * add to port's subscription list IOCTL interface */ static int snd_seq_ioctl_subscribe_port(struct snd_seq_client *client, void *arg) { struct snd_seq_port_subscribe *subs = arg; int result; struct snd_seq_client *receiver __free(snd_seq_client) = client_load_and_use_ptr(subs->dest.client); if (!receiver) return -EINVAL; struct snd_seq_client *sender __free(snd_seq_client) = client_load_and_use_ptr(subs->sender.client); if (!sender) return -EINVAL; struct snd_seq_client_port *sport __free(snd_seq_port) = snd_seq_port_use_ptr(sender, subs->sender.port); if (!sport) return -EINVAL; struct snd_seq_client_port *dport __free(snd_seq_port) = snd_seq_port_use_ptr(receiver, subs->dest.port); if (!dport) return -EINVAL; result = check_subscription_permission(client, sport, dport, subs); if (result < 0) return result; /* connect them */ result = snd_seq_port_connect(client, sender, sport, receiver, dport, subs); if (! result) /* broadcast announce */ snd_seq_client_notify_subscription(SNDRV_SEQ_ADDRESS_SUBSCRIBERS, 0, subs, SNDRV_SEQ_EVENT_PORT_SUBSCRIBED); return result; } /* * remove from port's subscription list */ static int snd_seq_ioctl_unsubscribe_port(struct snd_seq_client *client, void *arg) { struct snd_seq_port_subscribe *subs = arg; int result; struct snd_seq_client *receiver __free(snd_seq_client) = snd_seq_client_use_ptr(subs->dest.client); if (!receiver) return -ENXIO; struct snd_seq_client *sender __free(snd_seq_client) = snd_seq_client_use_ptr(subs->sender.client); if (!sender) return -ENXIO; struct snd_seq_client_port *sport __free(snd_seq_port) = snd_seq_port_use_ptr(sender, subs->sender.port); if (!sport) return -ENXIO; struct snd_seq_client_port *dport __free(snd_seq_port) = snd_seq_port_use_ptr(receiver, subs->dest.port); if (!dport) return -ENXIO; result = check_subscription_permission(client, sport, dport, subs); if (result < 0) return result; result = snd_seq_port_disconnect(client, sender, sport, receiver, dport, subs); if (! result) /* broadcast announce */ snd_seq_client_notify_subscription(SNDRV_SEQ_ADDRESS_SUBSCRIBERS, 0, subs, SNDRV_SEQ_EVENT_PORT_UNSUBSCRIBED); return result; } /* CREATE_QUEUE ioctl() */ static int snd_seq_ioctl_create_queue(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_info *info = arg; struct snd_seq_queue *q __free(snd_seq_queue) = snd_seq_queue_alloc(client->number, info->locked, info->flags); if (IS_ERR(q)) return PTR_ERR(q); info->queue = q->queue; info->locked = q->locked; info->owner = q->owner; /* set queue name */ if (!info->name[0]) snprintf(info->name, sizeof(info->name), "Queue-%d", q->queue); strscpy(q->name, info->name, sizeof(q->name)); return 0; } /* DELETE_QUEUE ioctl() */ static int snd_seq_ioctl_delete_queue(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_info *info = arg; return snd_seq_queue_delete(client->number, info->queue); } /* GET_QUEUE_INFO ioctl() */ static int snd_seq_ioctl_get_queue_info(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_info *info = arg; struct snd_seq_queue *q __free(snd_seq_queue) = queueptr(info->queue); if (q == NULL) return -EINVAL; memset(info, 0, sizeof(*info)); info->queue = q->queue; info->owner = q->owner; info->locked = q->locked; strscpy(info->name, q->name, sizeof(info->name)); return 0; } /* SET_QUEUE_INFO ioctl() */ static int snd_seq_ioctl_set_queue_info(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_info *info = arg; if (info->owner != client->number) return -EINVAL; /* change owner/locked permission */ if (snd_seq_queue_check_access(info->queue, client->number)) { if (snd_seq_queue_set_owner(info->queue, client->number, info->locked) < 0) return -EPERM; if (info->locked) snd_seq_queue_use(info->queue, client->number, 1); } else { return -EPERM; } struct snd_seq_queue *q __free(snd_seq_queue) = queueptr(info->queue); if (! q) return -EINVAL; if (q->owner != client->number) return -EPERM; strscpy(q->name, info->name, sizeof(q->name)); return 0; } /* GET_NAMED_QUEUE ioctl() */ static int snd_seq_ioctl_get_named_queue(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_info *info = arg; struct snd_seq_queue *q __free(snd_seq_queue) = snd_seq_queue_find_name(info->name); if (q == NULL) return -EINVAL; info->queue = q->queue; info->owner = q->owner; info->locked = q->locked; return 0; } /* GET_QUEUE_STATUS ioctl() */ static int snd_seq_ioctl_get_queue_status(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_status *status = arg; struct snd_seq_timer *tmr; struct snd_seq_queue *queue __free(snd_seq_queue) = queueptr(status->queue); if (queue == NULL) return -EINVAL; memset(status, 0, sizeof(*status)); status->queue = queue->queue; tmr = queue->timer; status->events = queue->tickq->cells + queue->timeq->cells; status->time = snd_seq_timer_get_cur_time(tmr, true); status->tick = snd_seq_timer_get_cur_tick(tmr); status->running = tmr->running; status->flags = queue->flags; return 0; } /* GET_QUEUE_TEMPO ioctl() */ static int snd_seq_ioctl_get_queue_tempo(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_tempo *tempo = arg; struct snd_seq_timer *tmr; struct snd_seq_queue *queue __free(snd_seq_queue) = queueptr(tempo->queue); if (queue == NULL) return -EINVAL; memset(tempo, 0, sizeof(*tempo)); tempo->queue = queue->queue; tmr = queue->timer; tempo->tempo = tmr->tempo; tempo->ppq = tmr->ppq; tempo->skew_value = tmr->skew; tempo->skew_base = tmr->skew_base; if (client->user_pversion >= SNDRV_PROTOCOL_VERSION(1, 0, 4)) tempo->tempo_base = tmr->tempo_base; return 0; } /* SET_QUEUE_TEMPO ioctl() */ int snd_seq_set_queue_tempo(int client, struct snd_seq_queue_tempo *tempo) { if (!snd_seq_queue_check_access(tempo->queue, client)) return -EPERM; return snd_seq_queue_timer_set_tempo(tempo->queue, client, tempo); } EXPORT_SYMBOL(snd_seq_set_queue_tempo); static int snd_seq_ioctl_set_queue_tempo(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_tempo *tempo = arg; int result; if (client->user_pversion < SNDRV_PROTOCOL_VERSION(1, 0, 4)) tempo->tempo_base = 0; result = snd_seq_set_queue_tempo(client->number, tempo); return result < 0 ? result : 0; } /* GET_QUEUE_TIMER ioctl() */ static int snd_seq_ioctl_get_queue_timer(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_timer *timer = arg; struct snd_seq_timer *tmr; struct snd_seq_queue *queue __free(snd_seq_queue) = queueptr(timer->queue); if (queue == NULL) return -EINVAL; guard(mutex)(&queue->timer_mutex); tmr = queue->timer; memset(timer, 0, sizeof(*timer)); timer->queue = queue->queue; timer->type = tmr->type; if (tmr->type == SNDRV_SEQ_TIMER_ALSA) { timer->u.alsa.id = tmr->alsa_id; timer->u.alsa.resolution = tmr->preferred_resolution; } return 0; } /* SET_QUEUE_TIMER ioctl() */ static int snd_seq_ioctl_set_queue_timer(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_timer *timer = arg; int result = 0; if (timer->type != SNDRV_SEQ_TIMER_ALSA) return -EINVAL; if (snd_seq_queue_check_access(timer->queue, client->number)) { struct snd_seq_timer *tmr; struct snd_seq_queue *q __free(snd_seq_queue) = queueptr(timer->queue); if (q == NULL) return -ENXIO; guard(mutex)(&q->timer_mutex); tmr = q->timer; snd_seq_queue_timer_close(timer->queue); tmr->type = timer->type; if (tmr->type == SNDRV_SEQ_TIMER_ALSA) { tmr->alsa_id = timer->u.alsa.id; tmr->preferred_resolution = timer->u.alsa.resolution; } result = snd_seq_queue_timer_open(timer->queue); } else { return -EPERM; } return result; } /* GET_QUEUE_CLIENT ioctl() */ static int snd_seq_ioctl_get_queue_client(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_client *info = arg; int used; used = snd_seq_queue_is_used(info->queue, client->number); if (used < 0) return -EINVAL; info->used = used; info->client = client->number; return 0; } /* SET_QUEUE_CLIENT ioctl() */ static int snd_seq_ioctl_set_queue_client(struct snd_seq_client *client, void *arg) { struct snd_seq_queue_client *info = arg; int err; if (info->used >= 0) { err = snd_seq_queue_use(info->queue, client->number, info->used); if (err < 0) return err; } return snd_seq_ioctl_get_queue_client(client, arg); } /* GET_CLIENT_POOL ioctl() */ static int snd_seq_ioctl_get_client_pool(struct snd_seq_client *client, void *arg) { struct snd_seq_client_pool *info = arg; struct snd_seq_client *cptr __free(snd_seq_client) = client_load_and_use_ptr(info->client); if (cptr == NULL) return -ENOENT; memset(info, 0, sizeof(*info)); info->client = cptr->number; info->output_pool = cptr->pool->size; info->output_room = cptr->pool->room; info->output_free = info->output_pool; info->output_free = snd_seq_unused_cells(cptr->pool); if (cptr->type == USER_CLIENT) { info->input_pool = cptr->data.user.fifo_pool_size; info->input_free = info->input_pool; info->input_free = snd_seq_fifo_unused_cells(cptr->data.user.fifo); } else { info->input_pool = 0; info->input_free = 0; } return 0; } /* SET_CLIENT_POOL ioctl() */ static int snd_seq_ioctl_set_client_pool(struct snd_seq_client *client, void *arg) { struct snd_seq_client_pool *info = arg; int rc; if (client->number != info->client) return -EINVAL; /* can't change other clients */ if (info->output_pool >= 1 && info->output_pool <= SNDRV_SEQ_MAX_EVENTS && (! snd_seq_write_pool_allocated(client) || info->output_pool != client->pool->size)) { if (snd_seq_write_pool_allocated(client)) { /* is the pool in use? */ if (atomic_read(&client->pool->counter)) return -EBUSY; /* remove all existing cells */ snd_seq_pool_mark_closing(client->pool); snd_seq_pool_done(client->pool); } client->pool->size = info->output_pool; rc = snd_seq_pool_init(client->pool); if (rc < 0) return rc; } if (client->type == USER_CLIENT && client->data.user.fifo != NULL && info->input_pool >= 1 && info->input_pool <= SNDRV_SEQ_MAX_CLIENT_EVENTS && info->input_pool != client->data.user.fifo_pool_size) { /* change pool size */ rc = snd_seq_fifo_resize(client->data.user.fifo, info->input_pool); if (rc < 0) return rc; client->data.user.fifo_pool_size = info->input_pool; } if (info->output_room >= 1 && info->output_room <= client->pool->size) { client->pool->room = info->output_room; } return snd_seq_ioctl_get_client_pool(client, arg); } /* REMOVE_EVENTS ioctl() */ static int snd_seq_ioctl_remove_events(struct snd_seq_client *client, void *arg) { struct snd_seq_remove_events *info = arg; /* * Input mostly not implemented XXX. */ if (info->remove_mode & SNDRV_SEQ_REMOVE_INPUT) { /* * No restrictions so for a user client we can clear * the whole fifo */ if (client->type == USER_CLIENT && client->data.user.fifo) snd_seq_fifo_clear(client->data.user.fifo); } if (info->remove_mode & SNDRV_SEQ_REMOVE_OUTPUT) snd_seq_queue_remove_cells(client->number, info); return 0; } /* * get subscription info */ static int snd_seq_ioctl_get_subscription(struct snd_seq_client *client, void *arg) { struct snd_seq_port_subscribe *subs = arg; struct snd_seq_client *sender __free(snd_seq_client) = client_load_and_use_ptr(subs->sender.client); if (!sender) return -EINVAL; struct snd_seq_client_port *sport __free(snd_seq_port) = snd_seq_port_use_ptr(sender, subs->sender.port); if (!sport) return -EINVAL; return snd_seq_port_get_subscription(&sport->c_src, &subs->dest, subs); } /* * get subscription info - check only its presence */ static int snd_seq_ioctl_query_subs(struct snd_seq_client *client, void *arg) { struct snd_seq_query_subs *subs = arg; struct snd_seq_port_subs_info *group; struct list_head *p; int i; struct snd_seq_client *cptr __free(snd_seq_client) = client_load_and_use_ptr(subs->root.client); if (!cptr) return -ENXIO; struct snd_seq_client_port *port __free(snd_seq_port) = snd_seq_port_use_ptr(cptr, subs->root.port); if (!port) return -ENXIO; switch (subs->type) { case SNDRV_SEQ_QUERY_SUBS_READ: group = &port->c_src; break; case SNDRV_SEQ_QUERY_SUBS_WRITE: group = &port->c_dest; break; default: return -ENXIO; } guard(rwsem_read)(&group->list_mutex); /* search for the subscriber */ subs->num_subs = group->count; i = 0; list_for_each(p, &group->list_head) { if (i++ == subs->index) { /* found! */ struct snd_seq_subscribers *s; if (subs->type == SNDRV_SEQ_QUERY_SUBS_READ) { s = list_entry(p, struct snd_seq_subscribers, src_list); subs->addr = s->info.dest; } else { s = list_entry(p, struct snd_seq_subscribers, dest_list); subs->addr = s->info.sender; } subs->flags = s->info.flags; subs->queue = s->info.queue; return 0; } } return -ENOENT; } /* * query next client */ static int snd_seq_ioctl_query_next_client(struct snd_seq_client *client, void *arg) { struct snd_seq_client_info *info = arg; /* search for next client */ if (info->client < INT_MAX) info->client++; if (info->client < 0) info->client = 0; for (; info->client < SNDRV_SEQ_MAX_CLIENTS; info->client++) { struct snd_seq_client *cptr __free(snd_seq_client) = client_load_and_use_ptr(info->client); if (cptr) { get_client_info(cptr, info); return 0; /* found */ } } return -ENOENT; } /* * query next port */ static int snd_seq_ioctl_query_next_port(struct snd_seq_client *client, void *arg) { struct snd_seq_port_info *info = arg; struct snd_seq_client *cptr __free(snd_seq_client) = client_load_and_use_ptr(info->addr.client); if (cptr == NULL) return -ENXIO; /* search for next port */ info->addr.port++; struct snd_seq_client_port *port __free(snd_seq_port) = snd_seq_port_query_nearest(cptr, info); if (port == NULL) return -ENOENT; /* get port info */ info->addr = port->addr; snd_seq_get_port_info(port, info); return 0; } #if IS_ENABLED(CONFIG_SND_SEQ_UMP) #define NUM_UMP_INFOS (SNDRV_UMP_MAX_BLOCKS + 1) static void free_ump_info(struct snd_seq_client *client) { int i; if (!client->ump_info) return; for (i = 0; i < NUM_UMP_INFOS; i++) kfree(client->ump_info[i]); kfree(client->ump_info); client->ump_info = NULL; } static void terminate_ump_info_strings(void *p, int type) { if (type == SNDRV_SEQ_CLIENT_UMP_INFO_ENDPOINT) { struct snd_ump_endpoint_info *ep = p; ep->name[sizeof(ep->name) - 1] = 0; } else { struct snd_ump_block_info *bp = p; bp->name[sizeof(bp->name) - 1] = 0; } } #ifdef CONFIG_SND_PROC_FS static void dump_ump_info(struct snd_info_buffer *buffer, struct snd_seq_client *client) { struct snd_ump_endpoint_info *ep; struct snd_ump_block_info *bp; int i; if (!client->ump_info) return; ep = client->ump_info[SNDRV_SEQ_CLIENT_UMP_INFO_ENDPOINT]; if (ep && *ep->name) snd_iprintf(buffer, " UMP Endpoint: \"%s\"\n", ep->name); for (i = 0; i < SNDRV_UMP_MAX_BLOCKS; i++) { bp = client->ump_info[i + 1]; if (bp && *bp->name) { snd_iprintf(buffer, " UMP Block %d: \"%s\" [%s]\n", i, bp->name, bp->active ? "Active" : "Inactive"); snd_iprintf(buffer, " Groups: %d-%d\n", bp->first_group + 1, bp->first_group + bp->num_groups); } } } #endif /* UMP-specific ioctls -- called directly without data copy */ static int snd_seq_ioctl_client_ump_info(struct snd_seq_client *caller, unsigned int cmd, unsigned long arg) { struct snd_seq_client_ump_info __user *argp = (struct snd_seq_client_ump_info __user *)arg; int client, type, err = 0; size_t size; void *p; if (get_user(client, &argp->client) || get_user(type, &argp->type)) return -EFAULT; if (cmd == SNDRV_SEQ_IOCTL_SET_CLIENT_UMP_INFO && caller->number != client) return -EPERM; if (type < 0 || type >= NUM_UMP_INFOS) return -EINVAL; if (type == SNDRV_SEQ_CLIENT_UMP_INFO_ENDPOINT) size = sizeof(struct snd_ump_endpoint_info); else size = sizeof(struct snd_ump_block_info); struct snd_seq_client *cptr __free(snd_seq_client) = client_load_and_use_ptr(client); if (!cptr) return -ENOENT; scoped_guard(mutex, &cptr->ioctl_mutex) { if (!cptr->midi_version) { err = -EBADFD; break; } if (cmd == SNDRV_SEQ_IOCTL_GET_CLIENT_UMP_INFO) { if (!cptr->ump_info) p = NULL; else p = cptr->ump_info[type]; if (!p) { err = -ENODEV; break; } if (copy_to_user(argp->info, p, size)) { err = -EFAULT; break; } } else { if (cptr->type != USER_CLIENT) { err = -EBADFD; break; } if (!cptr->ump_info) { cptr->ump_info = kcalloc(NUM_UMP_INFOS, sizeof(void *), GFP_KERNEL); if (!cptr->ump_info) { err = -ENOMEM; break; } } p = memdup_user(argp->info, size); if (IS_ERR(p)) { err = PTR_ERR(p); break; } kfree(cptr->ump_info[type]); terminate_ump_info_strings(p, type); cptr->ump_info[type] = p; } } if (!err && cmd == SNDRV_SEQ_IOCTL_SET_CLIENT_UMP_INFO) { if (type == SNDRV_SEQ_CLIENT_UMP_INFO_ENDPOINT) snd_seq_system_ump_notify(client, 0, SNDRV_SEQ_EVENT_UMP_EP_CHANGE, false); else snd_seq_system_ump_notify(client, type - 1, SNDRV_SEQ_EVENT_UMP_BLOCK_CHANGE, false); } return err; } #endif /* -------------------------------------------------------- */ static const struct ioctl_handler { unsigned int cmd; int (*func)(struct snd_seq_client *client, void *arg); } ioctl_handlers[] = { { SNDRV_SEQ_IOCTL_PVERSION, snd_seq_ioctl_pversion }, { SNDRV_SEQ_IOCTL_USER_PVERSION, snd_seq_ioctl_user_pversion }, { SNDRV_SEQ_IOCTL_CLIENT_ID, snd_seq_ioctl_client_id }, { SNDRV_SEQ_IOCTL_SYSTEM_INFO, snd_seq_ioctl_system_info }, { SNDRV_SEQ_IOCTL_RUNNING_MODE, snd_seq_ioctl_running_mode }, { SNDRV_SEQ_IOCTL_GET_CLIENT_INFO, snd_seq_ioctl_get_client_info }, { SNDRV_SEQ_IOCTL_SET_CLIENT_INFO, snd_seq_ioctl_set_client_info }, { SNDRV_SEQ_IOCTL_CREATE_PORT, snd_seq_ioctl_create_port }, { SNDRV_SEQ_IOCTL_DELETE_PORT, snd_seq_ioctl_delete_port }, { SNDRV_SEQ_IOCTL_GET_PORT_INFO, snd_seq_ioctl_get_port_info }, { SNDRV_SEQ_IOCTL_SET_PORT_INFO, snd_seq_ioctl_set_port_info }, { SNDRV_SEQ_IOCTL_SUBSCRIBE_PORT, snd_seq_ioctl_subscribe_port }, { SNDRV_SEQ_IOCTL_UNSUBSCRIBE_PORT, snd_seq_ioctl_unsubscribe_port }, { SNDRV_SEQ_IOCTL_CREATE_QUEUE, snd_seq_ioctl_create_queue }, { SNDRV_SEQ_IOCTL_DELETE_QUEUE, snd_seq_ioctl_delete_queue }, { SNDRV_SEQ_IOCTL_GET_QUEUE_INFO, snd_seq_ioctl_get_queue_info }, { SNDRV_SEQ_IOCTL_SET_QUEUE_INFO, snd_seq_ioctl_set_queue_info }, { SNDRV_SEQ_IOCTL_GET_NAMED_QUEUE, snd_seq_ioctl_get_named_queue }, { SNDRV_SEQ_IOCTL_GET_QUEUE_STATUS, snd_seq_ioctl_get_queue_status }, { SNDRV_SEQ_IOCTL_GET_QUEUE_TEMPO, snd_seq_ioctl_get_queue_tempo }, { SNDRV_SEQ_IOCTL_SET_QUEUE_TEMPO, snd_seq_ioctl_set_queue_tempo }, { SNDRV_SEQ_IOCTL_GET_QUEUE_TIMER, snd_seq_ioctl_get_queue_timer }, { SNDRV_SEQ_IOCTL_SET_QUEUE_TIMER, snd_seq_ioctl_set_queue_timer }, { SNDRV_SEQ_IOCTL_GET_QUEUE_CLIENT, snd_seq_ioctl_get_queue_client }, { SNDRV_SEQ_IOCTL_SET_QUEUE_CLIENT, snd_seq_ioctl_set_queue_client }, { SNDRV_SEQ_IOCTL_GET_CLIENT_POOL, snd_seq_ioctl_get_client_pool }, { SNDRV_SEQ_IOCTL_SET_CLIENT_POOL, snd_seq_ioctl_set_client_pool }, { SNDRV_SEQ_IOCTL_GET_SUBSCRIPTION, snd_seq_ioctl_get_subscription }, { SNDRV_SEQ_IOCTL_QUERY_NEXT_CLIENT, snd_seq_ioctl_query_next_client }, { SNDRV_SEQ_IOCTL_QUERY_NEXT_PORT, snd_seq_ioctl_query_next_port }, { SNDRV_SEQ_IOCTL_REMOVE_EVENTS, snd_seq_ioctl_remove_events }, { SNDRV_SEQ_IOCTL_QUERY_SUBS, snd_seq_ioctl_query_subs }, { 0, NULL }, }; static long snd_seq_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct snd_seq_client *client = file->private_data; /* To use kernel stack for ioctl data. */ union { int pversion; int client_id; struct snd_seq_system_info system_info; struct snd_seq_running_info running_info; struct snd_seq_client_info client_info; struct snd_seq_port_info port_info; struct snd_seq_port_subscribe port_subscribe; struct snd_seq_queue_info queue_info; struct snd_seq_queue_status queue_status; struct snd_seq_queue_tempo tempo; struct snd_seq_queue_timer queue_timer; struct snd_seq_queue_client queue_client; struct snd_seq_client_pool client_pool; struct snd_seq_remove_events remove_events; struct snd_seq_query_subs query_subs; } buf; const struct ioctl_handler *handler; unsigned long size; int err; if (snd_BUG_ON(!client)) return -ENXIO; #if IS_ENABLED(CONFIG_SND_SEQ_UMP) /* exception - handling large data */ switch (cmd) { case SNDRV_SEQ_IOCTL_GET_CLIENT_UMP_INFO: case SNDRV_SEQ_IOCTL_SET_CLIENT_UMP_INFO: return snd_seq_ioctl_client_ump_info(client, cmd, arg); } #endif for (handler = ioctl_handlers; handler->cmd > 0; ++handler) { if (handler->cmd == cmd) break; } if (handler->cmd == 0) return -ENOTTY; memset(&buf, 0, sizeof(buf)); /* * All of ioctl commands for ALSA sequencer get an argument of size * within 13 bits. We can safely pick up the size from the command. */ size = _IOC_SIZE(handler->cmd); if (handler->cmd & IOC_IN) { if (copy_from_user(&buf, (const void __user *)arg, size)) return -EFAULT; } scoped_guard(mutex, &client->ioctl_mutex) { err = handler->func(client, &buf); } if (err >= 0) { /* Some commands includes a bug in 'dir' field. */ if (handler->cmd == SNDRV_SEQ_IOCTL_SET_QUEUE_CLIENT || handler->cmd == SNDRV_SEQ_IOCTL_SET_CLIENT_POOL || (handler->cmd & IOC_OUT)) if (copy_to_user((void __user *)arg, &buf, size)) return -EFAULT; } return err; } #ifdef CONFIG_COMPAT #include "seq_compat.c" #else #define snd_seq_ioctl_compat NULL #endif /* -------------------------------------------------------- */ /* exported to kernel modules */ int snd_seq_create_kernel_client(struct snd_card *card, int client_index, const char *name_fmt, ...) { struct snd_seq_client *client; va_list args; if (snd_BUG_ON(in_interrupt())) return -EBUSY; if (card && client_index >= SNDRV_SEQ_CLIENTS_PER_CARD) return -EINVAL; if (card == NULL && client_index >= SNDRV_SEQ_GLOBAL_CLIENTS) return -EINVAL; scoped_guard(mutex, &register_mutex) { if (card) { client_index += SNDRV_SEQ_GLOBAL_CLIENTS + card->number * SNDRV_SEQ_CLIENTS_PER_CARD; if (client_index >= SNDRV_SEQ_DYNAMIC_CLIENTS_BEGIN) client_index = -1; } /* empty write queue as default */ client = seq_create_client1(client_index, 0); if (client == NULL) return -EBUSY; /* failure code */ usage_alloc(&client_usage, 1); client->accept_input = 1; client->accept_output = 1; client->data.kernel.card = card; client->user_pversion = SNDRV_SEQ_VERSION; va_start(args, name_fmt); vsnprintf(client->name, sizeof(client->name), name_fmt, args); va_end(args); client->type = KERNEL_CLIENT; } /* make others aware this new client */ snd_seq_system_client_ev_client_start(client->number); /* return client number to caller */ return client->number; } EXPORT_SYMBOL(snd_seq_create_kernel_client); /* exported to kernel modules */ int snd_seq_delete_kernel_client(int client) { struct snd_seq_client *ptr; if (snd_BUG_ON(in_interrupt())) return -EBUSY; ptr = clientptr(client); if (ptr == NULL) return -EINVAL; seq_free_client(ptr); kfree(ptr); return 0; } EXPORT_SYMBOL(snd_seq_delete_kernel_client); /* * exported, called by kernel clients to enqueue events (w/o blocking) * * RETURN VALUE: zero if succeed, negative if error */ int snd_seq_kernel_client_enqueue(int client, struct snd_seq_event *ev, struct file *file, bool blocking) { if (snd_BUG_ON(!ev)) return -EINVAL; if (!snd_seq_ev_is_ump(ev)) { if (ev->type == SNDRV_SEQ_EVENT_NONE) return 0; /* ignore this */ if (ev->type == SNDRV_SEQ_EVENT_KERNEL_ERROR) return -EINVAL; /* quoted events can't be enqueued */ } /* fill in client number */ ev->source.client = client; if (check_event_type_and_length(ev)) return -EINVAL; struct snd_seq_client *cptr __free(snd_seq_client) = client_load_and_use_ptr(client); if (cptr == NULL) return -EINVAL; if (!cptr->accept_output) { return -EPERM; } else { /* send it */ guard(mutex)(&cptr->ioctl_mutex); return snd_seq_client_enqueue_event(cptr, ev, file, blocking, false, 0, &cptr->ioctl_mutex); } } EXPORT_SYMBOL(snd_seq_kernel_client_enqueue); /* * exported, called by kernel clients to dispatch events directly to other * clients, bypassing the queues. Event time-stamp will be updated. * * RETURN VALUE: negative = delivery failed, * zero, or positive: the number of delivered events */ int snd_seq_kernel_client_dispatch(int client, struct snd_seq_event * ev, int atomic, int hop) { if (snd_BUG_ON(!ev)) return -EINVAL; /* fill in client number */ ev->queue = SNDRV_SEQ_QUEUE_DIRECT; ev->source.client = client; if (check_event_type_and_length(ev)) return -EINVAL; struct snd_seq_client *cptr __free(snd_seq_client) = snd_seq_client_use_ptr(client); if (cptr == NULL) return -EINVAL; if (!cptr->accept_output) return -EPERM; else return snd_seq_deliver_event(cptr, ev, atomic, hop); } EXPORT_SYMBOL(snd_seq_kernel_client_dispatch); static int call_seq_client_ctl(struct snd_seq_client *client, unsigned int cmd, void *arg) { const struct ioctl_handler *handler; for (handler = ioctl_handlers; handler->cmd > 0; ++handler) { if (handler->cmd == cmd) return handler->func(client, arg); } pr_debug("ALSA: seq unknown ioctl() 0x%x (type='%c', number=0x%02x)\n", cmd, _IOC_TYPE(cmd), _IOC_NR(cmd)); return -ENOTTY; } /** * snd_seq_kernel_client_ctl - operate a command for a client with data in * kernel space. * @clientid: A numerical ID for a client. * @cmd: An ioctl(2) command for ALSA sequencer operation. * @arg: A pointer to data in kernel space. * * Against its name, both kernel/application client can be handled by this * kernel API. A pointer of 'arg' argument should be in kernel space. * * Return: 0 at success. Negative error code at failure. */ int snd_seq_kernel_client_ctl(int clientid, unsigned int cmd, void *arg) { struct snd_seq_client *client; client = clientptr(clientid); if (client == NULL) return -ENXIO; return call_seq_client_ctl(client, cmd, arg); } EXPORT_SYMBOL(snd_seq_kernel_client_ctl); /* a similar like above but taking locks; used only from OSS sequencer layer */ int snd_seq_kernel_client_ioctl(int clientid, unsigned int cmd, void *arg) { struct snd_seq_client *client __free(snd_seq_client) = client_load_and_use_ptr(clientid); if (!client) return -ENXIO; guard(mutex)(&client->ioctl_mutex); return call_seq_client_ctl(client, cmd, arg); } EXPORT_SYMBOL_GPL(snd_seq_kernel_client_ioctl); /* exported (for OSS emulator) */ int snd_seq_kernel_client_write_poll(int clientid, struct file *file, poll_table *wait) { struct snd_seq_client *client; client = clientptr(clientid); if (client == NULL) return -ENXIO; if (snd_seq_pool_poll_wait(client->pool, file, wait)) return 1; return 0; } EXPORT_SYMBOL(snd_seq_kernel_client_write_poll); /* get a sequencer client object; for internal use from a kernel client */ struct snd_seq_client *snd_seq_kernel_client_get(int id) { return snd_seq_client_use_ptr(id); } EXPORT_SYMBOL_GPL(snd_seq_kernel_client_get); /* put a sequencer client object; for internal use from a kernel client */ void snd_seq_kernel_client_put(struct snd_seq_client *cptr) { if (cptr) snd_seq_client_unref(cptr); } EXPORT_SYMBOL_GPL(snd_seq_kernel_client_put); /*---------------------------------------------------------------------------*/ #ifdef CONFIG_SND_PROC_FS /* * /proc interface */ static void snd_seq_info_dump_subscribers(struct snd_info_buffer *buffer, struct snd_seq_port_subs_info *group, int is_src, char *msg) { struct list_head *p; struct snd_seq_subscribers *s; int count = 0; guard(rwsem_read)(&group->list_mutex); if (list_empty(&group->list_head)) return; snd_iprintf(buffer, msg); list_for_each(p, &group->list_head) { if (is_src) s = list_entry(p, struct snd_seq_subscribers, src_list); else s = list_entry(p, struct snd_seq_subscribers, dest_list); if (count++) snd_iprintf(buffer, ", "); snd_iprintf(buffer, "%d:%d", is_src ? s->info.dest.client : s->info.sender.client, is_src ? s->info.dest.port : s->info.sender.port); if (s->info.flags & SNDRV_SEQ_PORT_SUBS_TIMESTAMP) snd_iprintf(buffer, "[%c:%d]", ((s->info.flags & SNDRV_SEQ_PORT_SUBS_TIME_REAL) ? 'r' : 't'), s->info.queue); if (group->exclusive) snd_iprintf(buffer, "[ex]"); } snd_iprintf(buffer, "\n"); } #define FLAG_PERM_RD(perm) ((perm) & SNDRV_SEQ_PORT_CAP_READ ? ((perm) & SNDRV_SEQ_PORT_CAP_SUBS_READ ? 'R' : 'r') : '-') #define FLAG_PERM_WR(perm) ((perm) & SNDRV_SEQ_PORT_CAP_WRITE ? ((perm) & SNDRV_SEQ_PORT_CAP_SUBS_WRITE ? 'W' : 'w') : '-') #define FLAG_PERM_EX(perm) ((perm) & SNDRV_SEQ_PORT_CAP_NO_EXPORT ? '-' : 'e') #define FLAG_PERM_DUPLEX(perm) ((perm) & SNDRV_SEQ_PORT_CAP_DUPLEX ? 'X' : '-') static const char *port_direction_name(unsigned char dir) { static const char *names[4] = { "-", "In", "Out", "In/Out" }; if (dir > SNDRV_SEQ_PORT_DIR_BIDIRECTION) return "Invalid"; return names[dir]; } static void snd_seq_info_dump_ports(struct snd_info_buffer *buffer, struct snd_seq_client *client) { struct snd_seq_client_port *p; guard(mutex)(&client->ports_mutex); list_for_each_entry(p, &client->ports_list_head, list) { if (p->capability & SNDRV_SEQ_PORT_CAP_INACTIVE) continue; snd_iprintf(buffer, " Port %3d : \"%s\" (%c%c%c%c) [%s]", p->addr.port, p->name, FLAG_PERM_RD(p->capability), FLAG_PERM_WR(p->capability), FLAG_PERM_EX(p->capability), FLAG_PERM_DUPLEX(p->capability), port_direction_name(p->direction)); #if IS_ENABLED(CONFIG_SND_SEQ_UMP) if (snd_seq_client_is_midi2(client) && p->is_midi1) snd_iprintf(buffer, " [MIDI1]"); #endif snd_iprintf(buffer, "\n"); snd_seq_info_dump_subscribers(buffer, &p->c_src, 1, " Connecting To: "); snd_seq_info_dump_subscribers(buffer, &p->c_dest, 0, " Connected From: "); } } static const char *midi_version_string(unsigned int version) { switch (version) { case SNDRV_SEQ_CLIENT_LEGACY_MIDI: return "Legacy"; case SNDRV_SEQ_CLIENT_UMP_MIDI_1_0: return "UMP MIDI1"; case SNDRV_SEQ_CLIENT_UMP_MIDI_2_0: return "UMP MIDI2"; default: return "Unknown"; } } /* exported to seq_info.c */ void snd_seq_info_clients_read(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { int c; snd_iprintf(buffer, "Client info\n"); snd_iprintf(buffer, " cur clients : %d\n", client_usage.cur); snd_iprintf(buffer, " peak clients : %d\n", client_usage.peak); snd_iprintf(buffer, " max clients : %d\n", SNDRV_SEQ_MAX_CLIENTS); snd_iprintf(buffer, "\n"); /* list the client table */ for (c = 0; c < SNDRV_SEQ_MAX_CLIENTS; c++) { struct snd_seq_client *client __free(snd_seq_client) = client_load_and_use_ptr(c); if (client == NULL) continue; if (client->type == NO_CLIENT) continue; guard(mutex)(&client->ioctl_mutex); snd_iprintf(buffer, "Client %3d : \"%s\" [%s %s]\n", c, client->name, client->type == USER_CLIENT ? "User" : "Kernel", midi_version_string(client->midi_version)); #if IS_ENABLED(CONFIG_SND_SEQ_UMP) dump_ump_info(buffer, client); #endif snd_seq_info_dump_ports(buffer, client); if (snd_seq_write_pool_allocated(client)) { snd_iprintf(buffer, " Output pool :\n"); snd_seq_info_pool(buffer, client->pool, " "); } if (client->type == USER_CLIENT && client->data.user.fifo && client->data.user.fifo->pool) { snd_iprintf(buffer, " Input pool :\n"); snd_seq_info_pool(buffer, client->data.user.fifo->pool, " "); } } } #endif /* CONFIG_SND_PROC_FS */ /*---------------------------------------------------------------------------*/ /* * REGISTRATION PART */ static const struct file_operations snd_seq_f_ops = { .owner = THIS_MODULE, .read = snd_seq_read, .write = snd_seq_write, .open = snd_seq_open, .release = snd_seq_release, .poll = snd_seq_poll, .unlocked_ioctl = snd_seq_ioctl, .compat_ioctl = snd_seq_ioctl_compat, }; static struct device *seq_dev; /* * register sequencer device */ int __init snd_sequencer_device_init(void) { int err; err = snd_device_alloc(&seq_dev, NULL); if (err < 0) return err; dev_set_name(seq_dev, "seq"); scoped_guard(mutex, &register_mutex) { err = snd_register_device(SNDRV_DEVICE_TYPE_SEQUENCER, NULL, 0, &snd_seq_f_ops, NULL, seq_dev); } if (err < 0) { put_device(seq_dev); return err; } return 0; } /* * unregister sequencer device */ void snd_sequencer_device_done(void) { snd_unregister_device(seq_dev); put_device(seq_dev); }
20 20 20 20 16 9 7 6 6 20 4 5 5 5 5 1 1 5 4 4 4 5 5 5 5 5 4 4 4 2 1 1 1 5 5 4 4 5 2 7 6 7 4 4 2 2 6 5 4 3 2 1 2 2 15 14 15 12 4 11 1 10 10 8 2 1 2 3 3 1 41 40 8 4 4 4 4 36 35 41 41 96 97 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 // SPDX-License-Identifier: GPL-2.0-only /* * Crypto user configuration API. * * Copyright (C) 2011 secunet Security Networks AG * Copyright (C) 2011 Steffen Klassert <steffen.klassert@secunet.com> */ #include <linux/module.h> #include <linux/crypto.h> #include <linux/cryptouser.h> #include <linux/sched.h> #include <linux/security.h> #include <net/netlink.h> #include <net/net_namespace.h> #include <net/sock.h> #include <crypto/internal/skcipher.h> #include <crypto/internal/rng.h> #include <crypto/akcipher.h> #include <crypto/kpp.h> #include "internal.h" #define null_terminated(x) (strnlen(x, sizeof(x)) < sizeof(x)) static DEFINE_MUTEX(crypto_cfg_mutex); struct crypto_dump_info { struct sk_buff *in_skb; struct sk_buff *out_skb; u32 nlmsg_seq; u16 nlmsg_flags; }; static struct crypto_alg *crypto_alg_match(struct crypto_user_alg *p, int exact) { struct crypto_alg *q, *alg = NULL; down_read(&crypto_alg_sem); list_for_each_entry(q, &crypto_alg_list, cra_list) { int match = 0; if (crypto_is_larval(q)) continue; if ((q->cra_flags ^ p->cru_type) & p->cru_mask) continue; if (strlen(p->cru_driver_name)) match = !strcmp(q->cra_driver_name, p->cru_driver_name); else if (!exact) match = !strcmp(q->cra_name, p->cru_name); if (!match) continue; if (unlikely(!crypto_mod_get(q))) continue; alg = q; break; } up_read(&crypto_alg_sem); return alg; } static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_report_cipher rcipher; memset(&rcipher, 0, sizeof(rcipher)); strscpy(rcipher.type, "cipher", sizeof(rcipher.type)); rcipher.blocksize = alg->cra_blocksize; rcipher.min_keysize = alg->cra_cipher.cia_min_keysize; rcipher.max_keysize = alg->cra_cipher.cia_max_keysize; return nla_put(skb, CRYPTOCFGA_REPORT_CIPHER, sizeof(rcipher), &rcipher); } static int crypto_report_one(struct crypto_alg *alg, struct crypto_user_alg *ualg, struct sk_buff *skb) { memset(ualg, 0, sizeof(*ualg)); strscpy(ualg->cru_name, alg->cra_name, sizeof(ualg->cru_name)); strscpy(ualg->cru_driver_name, alg->cra_driver_name, sizeof(ualg->cru_driver_name)); strscpy(ualg->cru_module_name, module_name(alg->cra_module), sizeof(ualg->cru_module_name)); ualg->cru_type = 0; ualg->cru_mask = 0; ualg->cru_flags = alg->cra_flags; ualg->cru_refcnt = refcount_read(&alg->cra_refcnt); if (nla_put_u32(skb, CRYPTOCFGA_PRIORITY_VAL, alg->cra_priority)) goto nla_put_failure; if (alg->cra_flags & CRYPTO_ALG_LARVAL) { struct crypto_report_larval rl; memset(&rl, 0, sizeof(rl)); strscpy(rl.type, "larval", sizeof(rl.type)); if (nla_put(skb, CRYPTOCFGA_REPORT_LARVAL, sizeof(rl), &rl)) goto nla_put_failure; goto out; } if (alg->cra_type && alg->cra_type->report) { if (alg->cra_type->report(skb, alg)) goto nla_put_failure; goto out; } switch (alg->cra_flags & (CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_LARVAL)) { case CRYPTO_ALG_TYPE_CIPHER: if (crypto_report_cipher(skb, alg)) goto nla_put_failure; break; } out: return 0; nla_put_failure: return -EMSGSIZE; } static int crypto_report_alg(struct crypto_alg *alg, struct crypto_dump_info *info) { struct sk_buff *in_skb = info->in_skb; struct sk_buff *skb = info->out_skb; struct nlmsghdr *nlh; struct crypto_user_alg *ualg; int err = 0; nlh = nlmsg_put(skb, NETLINK_CB(in_skb).portid, info->nlmsg_seq, CRYPTO_MSG_GETALG, sizeof(*ualg), info->nlmsg_flags); if (!nlh) { err = -EMSGSIZE; goto out; } ualg = nlmsg_data(nlh); err = crypto_report_one(alg, ualg, skb); if (err) { nlmsg_cancel(skb, nlh); goto out; } nlmsg_end(skb, nlh); out: return err; } static int crypto_report(struct sk_buff *in_skb, struct nlmsghdr *in_nlh, struct nlattr **attrs) { struct net *net = sock_net(in_skb->sk); struct crypto_user_alg *p = nlmsg_data(in_nlh); struct crypto_alg *alg; struct sk_buff *skb; struct crypto_dump_info info; int err; if (!null_terminated(p->cru_name) || !null_terminated(p->cru_driver_name)) return -EINVAL; alg = crypto_alg_match(p, 0); if (!alg) return -ENOENT; err = -ENOMEM; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) goto drop_alg; info.in_skb = in_skb; info.out_skb = skb; info.nlmsg_seq = in_nlh->nlmsg_seq; info.nlmsg_flags = 0; err = crypto_report_alg(alg, &info); drop_alg: crypto_mod_put(alg); if (err) { kfree_skb(skb); return err; } return nlmsg_unicast(net->crypto_nlsk, skb, NETLINK_CB(in_skb).portid); } static int crypto_dump_report(struct sk_buff *skb, struct netlink_callback *cb) { const size_t start_pos = cb->args[0]; size_t pos = 0; struct crypto_dump_info info; struct crypto_alg *alg; int res; info.in_skb = cb->skb; info.out_skb = skb; info.nlmsg_seq = cb->nlh->nlmsg_seq; info.nlmsg_flags = NLM_F_MULTI; down_read(&crypto_alg_sem); list_for_each_entry(alg, &crypto_alg_list, cra_list) { if (pos >= start_pos) { res = crypto_report_alg(alg, &info); if (res == -EMSGSIZE) break; if (res) goto out; } pos++; } cb->args[0] = pos; res = skb->len; out: up_read(&crypto_alg_sem); return res; } static int crypto_dump_report_done(struct netlink_callback *cb) { return 0; } static int crypto_update_alg(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs) { struct crypto_alg *alg; struct crypto_user_alg *p = nlmsg_data(nlh); struct nlattr *priority = attrs[CRYPTOCFGA_PRIORITY_VAL]; LIST_HEAD(list); if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (!null_terminated(p->cru_name) || !null_terminated(p->cru_driver_name)) return -EINVAL; if (priority && !strlen(p->cru_driver_name)) return -EINVAL; alg = crypto_alg_match(p, 1); if (!alg) return -ENOENT; down_write(&crypto_alg_sem); crypto_remove_spawns(alg, &list, NULL); if (priority) alg->cra_priority = nla_get_u32(priority); up_write(&crypto_alg_sem); crypto_mod_put(alg); crypto_remove_final(&list); return 0; } static int crypto_del_alg(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs) { struct crypto_alg *alg; struct crypto_user_alg *p = nlmsg_data(nlh); int err; if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (!null_terminated(p->cru_name) || !null_terminated(p->cru_driver_name)) return -EINVAL; alg = crypto_alg_match(p, 1); if (!alg) return -ENOENT; /* We can not unregister core algorithms such as aes. * We would loose the reference in the crypto_alg_list to this algorithm * if we try to unregister. Unregistering such an algorithm without * removing the module is not possible, so we restrict to crypto * instances that are build from templates. */ err = -EINVAL; if (!(alg->cra_flags & CRYPTO_ALG_INSTANCE)) goto drop_alg; err = -EBUSY; if (refcount_read(&alg->cra_refcnt) > 2) goto drop_alg; crypto_unregister_instance((struct crypto_instance *)alg); err = 0; drop_alg: crypto_mod_put(alg); return err; } static int crypto_add_alg(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs) { int exact = 0; const char *name; struct crypto_alg *alg; struct crypto_user_alg *p = nlmsg_data(nlh); struct nlattr *priority = attrs[CRYPTOCFGA_PRIORITY_VAL]; if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (!null_terminated(p->cru_name) || !null_terminated(p->cru_driver_name)) return -EINVAL; if (strlen(p->cru_driver_name)) exact = 1; if (priority && !exact) return -EINVAL; alg = crypto_alg_match(p, exact); if (alg) { crypto_mod_put(alg); return -EEXIST; } if (strlen(p->cru_driver_name)) name = p->cru_driver_name; else name = p->cru_name; alg = crypto_alg_mod_lookup(name, p->cru_type, p->cru_mask); if (IS_ERR(alg)) return PTR_ERR(alg); down_write(&crypto_alg_sem); if (priority) alg->cra_priority = nla_get_u32(priority); up_write(&crypto_alg_sem); crypto_mod_put(alg); return 0; } static int crypto_del_rng(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs) { if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; return crypto_del_default_rng(); } static int crypto_reportstat(struct sk_buff *in_skb, struct nlmsghdr *in_nlh, struct nlattr **attrs) { /* No longer supported */ return -ENOTSUPP; } #define MSGSIZE(type) sizeof(struct type) static const int crypto_msg_min[CRYPTO_NR_MSGTYPES] = { [CRYPTO_MSG_NEWALG - CRYPTO_MSG_BASE] = MSGSIZE(crypto_user_alg), [CRYPTO_MSG_DELALG - CRYPTO_MSG_BASE] = MSGSIZE(crypto_user_alg), [CRYPTO_MSG_UPDATEALG - CRYPTO_MSG_BASE] = MSGSIZE(crypto_user_alg), [CRYPTO_MSG_GETALG - CRYPTO_MSG_BASE] = MSGSIZE(crypto_user_alg), [CRYPTO_MSG_DELRNG - CRYPTO_MSG_BASE] = 0, [CRYPTO_MSG_GETSTAT - CRYPTO_MSG_BASE] = MSGSIZE(crypto_user_alg), }; static const struct nla_policy crypto_policy[CRYPTOCFGA_MAX+1] = { [CRYPTOCFGA_PRIORITY_VAL] = { .type = NLA_U32}, }; #undef MSGSIZE static const struct crypto_link { int (*doit)(struct sk_buff *, struct nlmsghdr *, struct nlattr **); int (*dump)(struct sk_buff *, struct netlink_callback *); int (*done)(struct netlink_callback *); } crypto_dispatch[CRYPTO_NR_MSGTYPES] = { [CRYPTO_MSG_NEWALG - CRYPTO_MSG_BASE] = { .doit = crypto_add_alg}, [CRYPTO_MSG_DELALG - CRYPTO_MSG_BASE] = { .doit = crypto_del_alg}, [CRYPTO_MSG_UPDATEALG - CRYPTO_MSG_BASE] = { .doit = crypto_update_alg}, [CRYPTO_MSG_GETALG - CRYPTO_MSG_BASE] = { .doit = crypto_report, .dump = crypto_dump_report, .done = crypto_dump_report_done}, [CRYPTO_MSG_DELRNG - CRYPTO_MSG_BASE] = { .doit = crypto_del_rng }, [CRYPTO_MSG_GETSTAT - CRYPTO_MSG_BASE] = { .doit = crypto_reportstat}, }; static int crypto_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *attrs[CRYPTOCFGA_MAX+1]; const struct crypto_link *link; int type, err; type = nlh->nlmsg_type; if (type > CRYPTO_MSG_MAX) return -EINVAL; type -= CRYPTO_MSG_BASE; link = &crypto_dispatch[type]; if ((type == (CRYPTO_MSG_GETALG - CRYPTO_MSG_BASE) && (nlh->nlmsg_flags & NLM_F_DUMP))) { struct crypto_alg *alg; unsigned long dump_alloc = 0; if (link->dump == NULL) return -EINVAL; down_read(&crypto_alg_sem); list_for_each_entry(alg, &crypto_alg_list, cra_list) dump_alloc += CRYPTO_REPORT_MAXSIZE; up_read(&crypto_alg_sem); { struct netlink_dump_control c = { .dump = link->dump, .done = link->done, .min_dump_alloc = min(dump_alloc, 65535UL), }; err = netlink_dump_start(net->crypto_nlsk, skb, nlh, &c); } return err; } err = nlmsg_parse_deprecated(nlh, crypto_msg_min[type], attrs, CRYPTOCFGA_MAX, crypto_policy, extack); if (err < 0) return err; if (link->doit == NULL) return -EINVAL; return link->doit(skb, nlh, attrs); } static void crypto_netlink_rcv(struct sk_buff *skb) { mutex_lock(&crypto_cfg_mutex); netlink_rcv_skb(skb, &crypto_user_rcv_msg); mutex_unlock(&crypto_cfg_mutex); } static int __net_init crypto_netlink_init(struct net *net) { struct netlink_kernel_cfg cfg = { .input = crypto_netlink_rcv, }; net->crypto_nlsk = netlink_kernel_create(net, NETLINK_CRYPTO, &cfg); return net->crypto_nlsk == NULL ? -ENOMEM : 0; } static void __net_exit crypto_netlink_exit(struct net *net) { netlink_kernel_release(net->crypto_nlsk); net->crypto_nlsk = NULL; } static struct pernet_operations crypto_netlink_net_ops = { .init = crypto_netlink_init, .exit = crypto_netlink_exit, }; static int __init crypto_user_init(void) { return register_pernet_subsys(&crypto_netlink_net_ops); } static void __exit crypto_user_exit(void) { unregister_pernet_subsys(&crypto_netlink_net_ops); } module_init(crypto_user_init); module_exit(crypto_user_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>"); MODULE_DESCRIPTION("Crypto userspace configuration API"); MODULE_ALIAS("net-pf-16-proto-21");
1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2008 Intel Corp * Copyright (C) 2008 Zhang Rui <rui.zhang@intel.com> * Copyright (C) 2008 Sujith Thomas <sujith.thomas@intel.com> * Copyright 2022 Linaro Limited * * Thermal trips handling */ #include "thermal_core.h" static const char *trip_type_names[] = { [THERMAL_TRIP_ACTIVE] = "active", [THERMAL_TRIP_PASSIVE] = "passive", [THERMAL_TRIP_HOT] = "hot", [THERMAL_TRIP_CRITICAL] = "critical", }; const char *thermal_trip_type_name(enum thermal_trip_type trip_type) { if (trip_type < THERMAL_TRIP_ACTIVE || trip_type > THERMAL_TRIP_CRITICAL) return "unknown"; return trip_type_names[trip_type]; } int for_each_thermal_trip(struct thermal_zone_device *tz, int (*cb)(struct thermal_trip *, void *), void *data) { struct thermal_trip_desc *td; int ret; for_each_trip_desc(tz, td) { ret = cb(&td->trip, data); if (ret) return ret; } return 0; } EXPORT_SYMBOL_GPL(for_each_thermal_trip); int thermal_zone_for_each_trip(struct thermal_zone_device *tz, int (*cb)(struct thermal_trip *, void *), void *data) { guard(thermal_zone)(tz); return for_each_thermal_trip(tz, cb, data); } EXPORT_SYMBOL_GPL(thermal_zone_for_each_trip); void thermal_zone_set_trips(struct thermal_zone_device *tz, int low, int high) { int ret; lockdep_assert_held(&tz->lock); if (!tz->ops.set_trips) return; /* No need to change trip points */ if (tz->prev_low_trip == low && tz->prev_high_trip == high) return; tz->prev_low_trip = low; tz->prev_high_trip = high; dev_dbg(&tz->device, "new temperature boundaries: %d < x < %d\n", low, high); /* * Set a temperature window. When this window is left the driver * must inform the thermal core via thermal_zone_device_update. */ ret = tz->ops.set_trips(tz, low, high); if (ret) dev_err(&tz->device, "Failed to set trips: %d\n", ret); } int thermal_zone_trip_id(const struct thermal_zone_device *tz, const struct thermal_trip *trip) { /* * Assume the trip to be located within the bounds of the thermal * zone's trips[] table. */ return trip_to_trip_desc(trip) - tz->trips; }
268 267 205 63 265 268 16360 16368 235 137 135 137 1 69 69 69 43 42 43 19 19 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 // SPDX-License-Identifier: GPL-2.0 /* * SafeSetID Linux Security Module * * Author: Micah Morton <mortonm@chromium.org> * * Copyright (C) 2018 The Chromium OS Authors. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2, as * published by the Free Software Foundation. * */ #define pr_fmt(fmt) "SafeSetID: " fmt #include <linux/lsm_hooks.h> #include <linux/module.h> #include <linux/ptrace.h> #include <linux/sched/task_stack.h> #include <linux/security.h> #include <uapi/linux/lsm.h> #include "lsm.h" /* Flag indicating whether initialization completed */ int safesetid_initialized __initdata; struct setid_ruleset __rcu *safesetid_setuid_rules; struct setid_ruleset __rcu *safesetid_setgid_rules; /* Compute a decision for a transition from @src to @dst under @policy. */ enum sid_policy_type _setid_policy_lookup(struct setid_ruleset *policy, kid_t src, kid_t dst) { struct setid_rule *rule; enum sid_policy_type result = SIDPOL_DEFAULT; if (policy->type == UID) { hash_for_each_possible(policy->rules, rule, next, __kuid_val(src.uid)) { if (!uid_eq(rule->src_id.uid, src.uid)) continue; if (uid_eq(rule->dst_id.uid, dst.uid)) return SIDPOL_ALLOWED; result = SIDPOL_CONSTRAINED; } } else if (policy->type == GID) { hash_for_each_possible(policy->rules, rule, next, __kgid_val(src.gid)) { if (!gid_eq(rule->src_id.gid, src.gid)) continue; if (gid_eq(rule->dst_id.gid, dst.gid)){ return SIDPOL_ALLOWED; } result = SIDPOL_CONSTRAINED; } } else { /* Should not reach here, report the ID as contrainsted */ result = SIDPOL_CONSTRAINED; } return result; } /* * Compute a decision for a transition from @src to @dst under the active * policy. */ static enum sid_policy_type setid_policy_lookup(kid_t src, kid_t dst, enum setid_type new_type) { enum sid_policy_type result = SIDPOL_DEFAULT; struct setid_ruleset *pol; rcu_read_lock(); if (new_type == UID) pol = rcu_dereference(safesetid_setuid_rules); else if (new_type == GID) pol = rcu_dereference(safesetid_setgid_rules); else { /* Should not reach here */ result = SIDPOL_CONSTRAINED; rcu_read_unlock(); return result; } if (pol) { pol->type = new_type; result = _setid_policy_lookup(pol, src, dst); } rcu_read_unlock(); return result; } static int safesetid_security_capable(const struct cred *cred, struct user_namespace *ns, int cap, unsigned int opts) { /* We're only interested in CAP_SETUID and CAP_SETGID. */ if (cap != CAP_SETUID && cap != CAP_SETGID) return 0; /* * If CAP_SET{U/G}ID is currently used for a setid or setgroups syscall, we * want to let it go through here; the real security check happens later, in * the task_fix_set{u/g}id or task_fix_setgroups hooks. */ if ((opts & CAP_OPT_INSETID) != 0) return 0; switch (cap) { case CAP_SETUID: /* * If no policy applies to this task, allow the use of CAP_SETUID for * other purposes. */ if (setid_policy_lookup((kid_t){.uid = cred->uid}, INVALID_ID, UID) == SIDPOL_DEFAULT) return 0; /* * Reject use of CAP_SETUID for functionality other than calling * set*uid() (e.g. setting up userns uid mappings). */ pr_warn("Operation requires CAP_SETUID, which is not available to UID %u for operations besides approved set*uid transitions\n", __kuid_val(cred->uid)); return -EPERM; case CAP_SETGID: /* * If no policy applies to this task, allow the use of CAP_SETGID for * other purposes. */ if (setid_policy_lookup((kid_t){.gid = cred->gid}, INVALID_ID, GID) == SIDPOL_DEFAULT) return 0; /* * Reject use of CAP_SETUID for functionality other than calling * set*gid() (e.g. setting up userns gid mappings). */ pr_warn("Operation requires CAP_SETGID, which is not available to GID %u for operations besides approved set*gid transitions\n", __kgid_val(cred->gid)); return -EPERM; default: /* Error, the only capabilities were checking for is CAP_SETUID/GID */ return 0; } return 0; } /* * Check whether a caller with old credentials @old is allowed to switch to * credentials that contain @new_id. */ static bool id_permitted_for_cred(const struct cred *old, kid_t new_id, enum setid_type new_type) { bool permitted; /* If our old creds already had this ID in it, it's fine. */ if (new_type == UID) { if (uid_eq(new_id.uid, old->uid) || uid_eq(new_id.uid, old->euid) || uid_eq(new_id.uid, old->suid)) return true; } else if (new_type == GID){ if (gid_eq(new_id.gid, old->gid) || gid_eq(new_id.gid, old->egid) || gid_eq(new_id.gid, old->sgid)) return true; } else /* Error, new_type is an invalid type */ return false; /* * Transitions to new UIDs require a check against the policy of the old * RUID. */ permitted = setid_policy_lookup((kid_t){.uid = old->uid}, new_id, new_type) != SIDPOL_CONSTRAINED; if (!permitted) { if (new_type == UID) { pr_warn("UID transition ((%d,%d,%d) -> %d) blocked\n", __kuid_val(old->uid), __kuid_val(old->euid), __kuid_val(old->suid), __kuid_val(new_id.uid)); } else if (new_type == GID) { pr_warn("GID transition ((%d,%d,%d) -> %d) blocked\n", __kgid_val(old->gid), __kgid_val(old->egid), __kgid_val(old->sgid), __kgid_val(new_id.gid)); } else /* Error, new_type is an invalid type */ return false; } return permitted; } /* * Check whether there is either an exception for user under old cred struct to * set*uid to user under new cred struct, or the UID transition is allowed (by * Linux set*uid rules) even without CAP_SETUID. */ static int safesetid_task_fix_setuid(struct cred *new, const struct cred *old, int flags) { /* Do nothing if there are no setuid restrictions for our old RUID. */ if (setid_policy_lookup((kid_t){.uid = old->uid}, INVALID_ID, UID) == SIDPOL_DEFAULT) return 0; if (id_permitted_for_cred(old, (kid_t){.uid = new->uid}, UID) && id_permitted_for_cred(old, (kid_t){.uid = new->euid}, UID) && id_permitted_for_cred(old, (kid_t){.uid = new->suid}, UID) && id_permitted_for_cred(old, (kid_t){.uid = new->fsuid}, UID)) return 0; /* * Kill this process to avoid potential security vulnerabilities * that could arise from a missing allowlist entry preventing a * privileged process from dropping to a lesser-privileged one. */ force_sig(SIGKILL); return -EACCES; } static int safesetid_task_fix_setgid(struct cred *new, const struct cred *old, int flags) { /* Do nothing if there are no setgid restrictions for our old RGID. */ if (setid_policy_lookup((kid_t){.gid = old->gid}, INVALID_ID, GID) == SIDPOL_DEFAULT) return 0; if (id_permitted_for_cred(old, (kid_t){.gid = new->gid}, GID) && id_permitted_for_cred(old, (kid_t){.gid = new->egid}, GID) && id_permitted_for_cred(old, (kid_t){.gid = new->sgid}, GID) && id_permitted_for_cred(old, (kid_t){.gid = new->fsgid}, GID)) return 0; /* * Kill this process to avoid potential security vulnerabilities * that could arise from a missing allowlist entry preventing a * privileged process from dropping to a lesser-privileged one. */ force_sig(SIGKILL); return -EACCES; } static int safesetid_task_fix_setgroups(struct cred *new, const struct cred *old) { int i; /* Do nothing if there are no setgid restrictions for our old RGID. */ if (setid_policy_lookup((kid_t){.gid = old->gid}, INVALID_ID, GID) == SIDPOL_DEFAULT) return 0; get_group_info(new->group_info); for (i = 0; i < new->group_info->ngroups; i++) { if (!id_permitted_for_cred(old, (kid_t){.gid = new->group_info->gid[i]}, GID)) { put_group_info(new->group_info); /* * Kill this process to avoid potential security vulnerabilities * that could arise from a missing allowlist entry preventing a * privileged process from dropping to a lesser-privileged one. */ force_sig(SIGKILL); return -EACCES; } } put_group_info(new->group_info); return 0; } static const struct lsm_id safesetid_lsmid = { .name = "safesetid", .id = LSM_ID_SAFESETID, }; static struct security_hook_list safesetid_security_hooks[] = { LSM_HOOK_INIT(task_fix_setuid, safesetid_task_fix_setuid), LSM_HOOK_INIT(task_fix_setgid, safesetid_task_fix_setgid), LSM_HOOK_INIT(task_fix_setgroups, safesetid_task_fix_setgroups), LSM_HOOK_INIT(capable, safesetid_security_capable) }; static int __init safesetid_security_init(void) { security_add_hooks(safesetid_security_hooks, ARRAY_SIZE(safesetid_security_hooks), &safesetid_lsmid); /* Report that SafeSetID successfully initialized */ safesetid_initialized = 1; return 0; } DEFINE_LSM(safesetid_security_init) = { .id = &safesetid_lsmid, .init = safesetid_security_init, .initcall_fs = safesetid_init_securityfs, };
3 2 2 2 2 2 3 3 3 2 2 3 3 2 3 3 3 3 3 2 2 3 3 3 3 3 3 2 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * SM4 Cipher Algorithm, AES-NI/AVX optimized. * as specified in * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html * * Copyright (c) 2021, Alibaba Group. * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com> */ #include <asm/fpu/api.h> #include <linux/module.h> #include <linux/crypto.h> #include <linux/export.h> #include <linux/kernel.h> #include <crypto/internal/skcipher.h> #include <crypto/sm4.h> #include "sm4-avx.h" #define SM4_CRYPT8_BLOCK_SIZE (SM4_BLOCK_SIZE * 8) asmlinkage void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst, const u8 *src, int nblocks); asmlinkage void sm4_aesni_avx_crypt8(const u32 *rk, u8 *dst, const u8 *src, int nblocks); asmlinkage void sm4_aesni_avx_ctr_enc_blk8(const u32 *rk, u8 *dst, const u8 *src, u8 *iv); asmlinkage void sm4_aesni_avx_cbc_dec_blk8(const u32 *rk, u8 *dst, const u8 *src, u8 *iv); static int sm4_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int key_len) { struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); return sm4_expandkey(ctx, key, key_len); } static int ecb_do_crypt(struct skcipher_request *req, const u32 *rkey) { struct skcipher_walk walk; unsigned int nbytes; int err; err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes) > 0) { const u8 *src = walk.src.virt.addr; u8 *dst = walk.dst.virt.addr; kernel_fpu_begin(); while (nbytes >= SM4_CRYPT8_BLOCK_SIZE) { sm4_aesni_avx_crypt8(rkey, dst, src, 8); dst += SM4_CRYPT8_BLOCK_SIZE; src += SM4_CRYPT8_BLOCK_SIZE; nbytes -= SM4_CRYPT8_BLOCK_SIZE; } while (nbytes >= SM4_BLOCK_SIZE) { unsigned int nblocks = min(nbytes >> 4, 4u); sm4_aesni_avx_crypt4(rkey, dst, src, nblocks); dst += nblocks * SM4_BLOCK_SIZE; src += nblocks * SM4_BLOCK_SIZE; nbytes -= nblocks * SM4_BLOCK_SIZE; } kernel_fpu_end(); err = skcipher_walk_done(&walk, nbytes); } return err; } int sm4_avx_ecb_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); return ecb_do_crypt(req, ctx->rkey_enc); } EXPORT_SYMBOL_GPL(sm4_avx_ecb_encrypt); int sm4_avx_ecb_decrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); return ecb_do_crypt(req, ctx->rkey_dec); } EXPORT_SYMBOL_GPL(sm4_avx_ecb_decrypt); int sm4_cbc_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); struct skcipher_walk walk; unsigned int nbytes; int err; err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes) > 0) { const u8 *iv = walk.iv; const u8 *src = walk.src.virt.addr; u8 *dst = walk.dst.virt.addr; while (nbytes >= SM4_BLOCK_SIZE) { crypto_xor_cpy(dst, src, iv, SM4_BLOCK_SIZE); sm4_crypt_block(ctx->rkey_enc, dst, dst); iv = dst; src += SM4_BLOCK_SIZE; dst += SM4_BLOCK_SIZE; nbytes -= SM4_BLOCK_SIZE; } if (iv != walk.iv) memcpy(walk.iv, iv, SM4_BLOCK_SIZE); err = skcipher_walk_done(&walk, nbytes); } return err; } EXPORT_SYMBOL_GPL(sm4_cbc_encrypt); int sm4_avx_cbc_decrypt(struct skcipher_request *req, unsigned int bsize, sm4_crypt_func func) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); struct skcipher_walk walk; unsigned int nbytes; int err; err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes) > 0) { const u8 *src = walk.src.virt.addr; u8 *dst = walk.dst.virt.addr; kernel_fpu_begin(); while (nbytes >= bsize) { func(ctx->rkey_dec, dst, src, walk.iv); dst += bsize; src += bsize; nbytes -= bsize; } while (nbytes >= SM4_BLOCK_SIZE) { u8 keystream[SM4_BLOCK_SIZE * 8]; u8 iv[SM4_BLOCK_SIZE]; unsigned int nblocks = min(nbytes >> 4, 8u); int i; sm4_aesni_avx_crypt8(ctx->rkey_dec, keystream, src, nblocks); src += ((int)nblocks - 2) * SM4_BLOCK_SIZE; dst += (nblocks - 1) * SM4_BLOCK_SIZE; memcpy(iv, src + SM4_BLOCK_SIZE, SM4_BLOCK_SIZE); for (i = nblocks - 1; i > 0; i--) { crypto_xor_cpy(dst, src, &keystream[i * SM4_BLOCK_SIZE], SM4_BLOCK_SIZE); src -= SM4_BLOCK_SIZE; dst -= SM4_BLOCK_SIZE; } crypto_xor_cpy(dst, walk.iv, keystream, SM4_BLOCK_SIZE); memcpy(walk.iv, iv, SM4_BLOCK_SIZE); dst += nblocks * SM4_BLOCK_SIZE; src += (nblocks + 1) * SM4_BLOCK_SIZE; nbytes -= nblocks * SM4_BLOCK_SIZE; } kernel_fpu_end(); err = skcipher_walk_done(&walk, nbytes); } return err; } EXPORT_SYMBOL_GPL(sm4_avx_cbc_decrypt); static int cbc_decrypt(struct skcipher_request *req) { return sm4_avx_cbc_decrypt(req, SM4_CRYPT8_BLOCK_SIZE, sm4_aesni_avx_cbc_dec_blk8); } int sm4_avx_ctr_crypt(struct skcipher_request *req, unsigned int bsize, sm4_crypt_func func) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); struct skcipher_walk walk; unsigned int nbytes; int err; err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes) > 0) { const u8 *src = walk.src.virt.addr; u8 *dst = walk.dst.virt.addr; kernel_fpu_begin(); while (nbytes >= bsize) { func(ctx->rkey_enc, dst, src, walk.iv); dst += bsize; src += bsize; nbytes -= bsize; } while (nbytes >= SM4_BLOCK_SIZE) { u8 keystream[SM4_BLOCK_SIZE * 8]; unsigned int nblocks = min(nbytes >> 4, 8u); int i; for (i = 0; i < nblocks; i++) { memcpy(&keystream[i * SM4_BLOCK_SIZE], walk.iv, SM4_BLOCK_SIZE); crypto_inc(walk.iv, SM4_BLOCK_SIZE); } sm4_aesni_avx_crypt8(ctx->rkey_enc, keystream, keystream, nblocks); crypto_xor_cpy(dst, src, keystream, nblocks * SM4_BLOCK_SIZE); dst += nblocks * SM4_BLOCK_SIZE; src += nblocks * SM4_BLOCK_SIZE; nbytes -= nblocks * SM4_BLOCK_SIZE; } kernel_fpu_end(); /* tail */ if (walk.nbytes == walk.total && nbytes > 0) { u8 keystream[SM4_BLOCK_SIZE]; memcpy(keystream, walk.iv, SM4_BLOCK_SIZE); crypto_inc(walk.iv, SM4_BLOCK_SIZE); sm4_crypt_block(ctx->rkey_enc, keystream, keystream); crypto_xor_cpy(dst, src, keystream, nbytes); dst += nbytes; src += nbytes; nbytes = 0; } err = skcipher_walk_done(&walk, nbytes); } return err; } EXPORT_SYMBOL_GPL(sm4_avx_ctr_crypt); static int ctr_crypt(struct skcipher_request *req) { return sm4_avx_ctr_crypt(req, SM4_CRYPT8_BLOCK_SIZE, sm4_aesni_avx_ctr_enc_blk8); } static struct skcipher_alg sm4_aesni_avx_skciphers[] = { { .base = { .cra_name = "ecb(sm4)", .cra_driver_name = "ecb-sm4-aesni-avx", .cra_priority = 400, .cra_blocksize = SM4_BLOCK_SIZE, .cra_ctxsize = sizeof(struct sm4_ctx), .cra_module = THIS_MODULE, }, .min_keysize = SM4_KEY_SIZE, .max_keysize = SM4_KEY_SIZE, .walksize = 8 * SM4_BLOCK_SIZE, .setkey = sm4_skcipher_setkey, .encrypt = sm4_avx_ecb_encrypt, .decrypt = sm4_avx_ecb_decrypt, }, { .base = { .cra_name = "cbc(sm4)", .cra_driver_name = "cbc-sm4-aesni-avx", .cra_priority = 400, .cra_blocksize = SM4_BLOCK_SIZE, .cra_ctxsize = sizeof(struct sm4_ctx), .cra_module = THIS_MODULE, }, .min_keysize = SM4_KEY_SIZE, .max_keysize = SM4_KEY_SIZE, .ivsize = SM4_BLOCK_SIZE, .walksize = 8 * SM4_BLOCK_SIZE, .setkey = sm4_skcipher_setkey, .encrypt = sm4_cbc_encrypt, .decrypt = cbc_decrypt, }, { .base = { .cra_name = "ctr(sm4)", .cra_driver_name = "ctr-sm4-aesni-avx", .cra_priority = 400, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct sm4_ctx), .cra_module = THIS_MODULE, }, .min_keysize = SM4_KEY_SIZE, .max_keysize = SM4_KEY_SIZE, .ivsize = SM4_BLOCK_SIZE, .chunksize = SM4_BLOCK_SIZE, .walksize = 8 * SM4_BLOCK_SIZE, .setkey = sm4_skcipher_setkey, .encrypt = ctr_crypt, .decrypt = ctr_crypt, } }; static int __init sm4_init(void) { const char *feature_name; if (!boot_cpu_has(X86_FEATURE_AVX) || !boot_cpu_has(X86_FEATURE_AES) || !boot_cpu_has(X86_FEATURE_OSXSAVE)) { pr_info("AVX or AES-NI instructions are not detected.\n"); return -ENODEV; } if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, &feature_name)) { pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } return crypto_register_skciphers(sm4_aesni_avx_skciphers, ARRAY_SIZE(sm4_aesni_avx_skciphers)); } static void __exit sm4_exit(void) { crypto_unregister_skciphers(sm4_aesni_avx_skciphers, ARRAY_SIZE(sm4_aesni_avx_skciphers)); } module_init(sm4_init); module_exit(sm4_exit); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>"); MODULE_DESCRIPTION("SM4 Cipher Algorithm, AES-NI/AVX optimized"); MODULE_ALIAS_CRYPTO("sm4"); MODULE_ALIAS_CRYPTO("sm4-aesni-avx");
134 134 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2025 Zsolt Kajtar (soci@c64.rulez.org) */ #include <linux/export.h> #include <linux/module.h> #include <linux/fb.h> #include <linux/bitrev.h> #include <asm/types.h> #ifdef CONFIG_FB_SYS_REV_PIXELS_IN_BYTE #define FB_REV_PIXELS_IN_BYTE #endif #include "sysmem.h" #include "fb_imageblit.h" void sys_imageblit(struct fb_info *p, const struct fb_image *image) { if (!(p->flags & FBINFO_VIRTFB)) fb_warn_once(p, "%s: framebuffer is not in virtual address space.\n", __func__); fb_imageblit(p, image); } EXPORT_SYMBOL(sys_imageblit); MODULE_AUTHOR("Zsolt Kajtar <soci@c64.rulez.org>"); MODULE_DESCRIPTION("Virtual memory packed pixel framebuffer image draw"); MODULE_LICENSE("GPL");
32 32 32 12 32 32 32 30 11 30 30 30 30 30 30 30 30 29 11 11 11 10 11 11 11 11 11 3 10 11 11 11 11 11 10 11 11 10 11 1 10 8 3 2 1 9 11 9 9 1 11 3 11 10 11 11 3 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 10 12 12 12 11 1 12 4 12 3 3 3 3 3 3 3 3 3 3 3 3 9 9 9 8 8 8 8 3 8 8 8 4 9 7 9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 // SPDX-License-Identifier: GPL-2.0-only /* Network filesystem high-level (buffered) writeback. * * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * * To support network filesystems with local caching, we manage a situation * that can be envisioned like the following: * * +---+---+-----+-----+---+----------+ * Folios: | | | | | | | * +---+---+-----+-----+---+----------+ * * +------+------+ +----+----+ * Upload: | | |.....| | | * (Stream 0) +------+------+ +----+----+ * * +------+------+------+------+------+ * Cache: | | | | | | * (Stream 1) +------+------+------+------+------+ * * Where we have a sequence of folios of varying sizes that we need to overlay * with multiple parallel streams of I/O requests, where the I/O requests in a * stream may also be of various sizes (in cifs, for example, the sizes are * negotiated with the server; in something like ceph, they may represent the * sizes of storage objects). * * The sequence in each stream may contain gaps and noncontiguous subrequests * may be glued together into single vectored write RPCs. */ #include <linux/export.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/pagemap.h> #include "internal.h" /* * Kill all dirty folios in the event of an unrecoverable error, starting with * a locked folio we've already obtained from writeback_iter(). */ static void netfs_kill_dirty_pages(struct address_space *mapping, struct writeback_control *wbc, struct folio *folio) { int error = 0; do { enum netfs_folio_trace why = netfs_folio_trace_kill; struct netfs_group *group = NULL; struct netfs_folio *finfo = NULL; void *priv; priv = folio_detach_private(folio); if (priv) { finfo = __netfs_folio_info(priv); if (finfo) { /* Kill folio from streaming write. */ group = finfo->netfs_group; why = netfs_folio_trace_kill_s; } else { group = priv; if (group == NETFS_FOLIO_COPY_TO_CACHE) { /* Kill copy-to-cache folio */ why = netfs_folio_trace_kill_cc; group = NULL; } else { /* Kill folio with group */ why = netfs_folio_trace_kill_g; } } } trace_netfs_folio(folio, why); folio_start_writeback(folio); folio_unlock(folio); folio_end_writeback(folio); netfs_put_group(group); kfree(finfo); } while ((folio = writeback_iter(mapping, wbc, folio, &error))); } /* * Create a write request and set it up appropriately for the origin type. */ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, struct file *file, loff_t start, enum netfs_io_origin origin) { struct netfs_io_request *wreq; struct netfs_inode *ictx; bool is_cacheable = (origin == NETFS_WRITEBACK || origin == NETFS_WRITEBACK_SINGLE || origin == NETFS_WRITETHROUGH || origin == NETFS_PGPRIV2_COPY_TO_CACHE); wreq = netfs_alloc_request(mapping, file, start, 0, origin); if (IS_ERR(wreq)) return wreq; _enter("R=%x", wreq->debug_id); ictx = netfs_inode(wreq->inode); if (is_cacheable && netfs_is_cache_enabled(ictx)) fscache_begin_write_operation(&wreq->cache_resources, netfs_i_cookie(ictx)); if (rolling_buffer_init(&wreq->buffer, wreq->debug_id, ITER_SOURCE) < 0) goto nomem; wreq->cleaned_to = wreq->start; wreq->io_streams[0].stream_nr = 0; wreq->io_streams[0].source = NETFS_UPLOAD_TO_SERVER; wreq->io_streams[0].prepare_write = ictx->ops->prepare_write; wreq->io_streams[0].issue_write = ictx->ops->issue_write; wreq->io_streams[0].collected_to = start; wreq->io_streams[0].transferred = 0; wreq->io_streams[1].stream_nr = 1; wreq->io_streams[1].source = NETFS_WRITE_TO_CACHE; wreq->io_streams[1].collected_to = start; wreq->io_streams[1].transferred = 0; if (fscache_resources_valid(&wreq->cache_resources)) { wreq->io_streams[1].avail = true; wreq->io_streams[1].active = true; wreq->io_streams[1].prepare_write = wreq->cache_resources.ops->prepare_write_subreq; wreq->io_streams[1].issue_write = wreq->cache_resources.ops->issue_write; } return wreq; nomem: netfs_put_failed_request(wreq); return ERR_PTR(-ENOMEM); } /** * netfs_prepare_write_failed - Note write preparation failed * @subreq: The subrequest to mark * * Mark a subrequest to note that preparation for write failed. */ void netfs_prepare_write_failed(struct netfs_io_subrequest *subreq) { __set_bit(NETFS_SREQ_FAILED, &subreq->flags); trace_netfs_sreq(subreq, netfs_sreq_trace_prep_failed); } EXPORT_SYMBOL(netfs_prepare_write_failed); /* * Prepare a write subrequest. We need to allocate a new subrequest * if we don't have one. */ void netfs_prepare_write(struct netfs_io_request *wreq, struct netfs_io_stream *stream, loff_t start) { struct netfs_io_subrequest *subreq; struct iov_iter *wreq_iter = &wreq->buffer.iter; /* Make sure we don't point the iterator at a used-up folio_queue * struct being used as a placeholder to prevent the queue from * collapsing. In such a case, extend the queue. */ if (iov_iter_is_folioq(wreq_iter) && wreq_iter->folioq_slot >= folioq_nr_slots(wreq_iter->folioq)) rolling_buffer_make_space(&wreq->buffer); subreq = netfs_alloc_subrequest(wreq); subreq->source = stream->source; subreq->start = start; subreq->stream_nr = stream->stream_nr; subreq->io_iter = *wreq_iter; _enter("R=%x[%x]", wreq->debug_id, subreq->debug_index); trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); stream->sreq_max_len = UINT_MAX; stream->sreq_max_segs = INT_MAX; switch (stream->source) { case NETFS_UPLOAD_TO_SERVER: netfs_stat(&netfs_n_wh_upload); stream->sreq_max_len = wreq->wsize; break; case NETFS_WRITE_TO_CACHE: netfs_stat(&netfs_n_wh_write); break; default: WARN_ON_ONCE(1); break; } if (stream->prepare_write) stream->prepare_write(subreq); __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); /* We add to the end of the list whilst the collector may be walking * the list. The collector only goes nextwards and uses the lock to * remove entries off of the front. */ spin_lock(&wreq->lock); list_add_tail(&subreq->rreq_link, &stream->subrequests); if (list_is_first(&subreq->rreq_link, &stream->subrequests)) { if (!stream->active) { stream->collected_to = subreq->start; /* Write list pointers before active flag */ smp_store_release(&stream->active, true); } } spin_unlock(&wreq->lock); stream->construct = subreq; } /* * Set the I/O iterator for the filesystem/cache to use and dispatch the I/O * operation. The operation may be asynchronous and should call * netfs_write_subrequest_terminated() when complete. */ static void netfs_do_issue_write(struct netfs_io_stream *stream, struct netfs_io_subrequest *subreq) { struct netfs_io_request *wreq = subreq->rreq; _enter("R=%x[%x],%zx", wreq->debug_id, subreq->debug_index, subreq->len); if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) return netfs_write_subrequest_terminated(subreq, subreq->error); trace_netfs_sreq(subreq, netfs_sreq_trace_submit); stream->issue_write(subreq); } void netfs_reissue_write(struct netfs_io_stream *stream, struct netfs_io_subrequest *subreq, struct iov_iter *source) { size_t size = subreq->len - subreq->transferred; // TODO: Use encrypted buffer subreq->io_iter = *source; iov_iter_advance(source, size); iov_iter_truncate(&subreq->io_iter, size); subreq->retry_count++; subreq->error = 0; __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); netfs_stat(&netfs_n_wh_retry_write_subreq); netfs_do_issue_write(stream, subreq); } void netfs_issue_write(struct netfs_io_request *wreq, struct netfs_io_stream *stream) { struct netfs_io_subrequest *subreq = stream->construct; if (!subreq) return; stream->construct = NULL; subreq->io_iter.count = subreq->len; netfs_do_issue_write(stream, subreq); } /* * Add data to the write subrequest, dispatching each as we fill it up or if it * is discontiguous with the previous. We only fill one part at a time so that * we can avoid overrunning the credits obtained (cifs) and try to parallelise * content-crypto preparation with network writes. */ size_t netfs_advance_write(struct netfs_io_request *wreq, struct netfs_io_stream *stream, loff_t start, size_t len, bool to_eof) { struct netfs_io_subrequest *subreq = stream->construct; size_t part; if (!stream->avail) { _leave("no write"); return len; } _enter("R=%x[%x]", wreq->debug_id, subreq ? subreq->debug_index : 0); if (subreq && start != subreq->start + subreq->len) { netfs_issue_write(wreq, stream); subreq = NULL; } if (!stream->construct) netfs_prepare_write(wreq, stream, start); subreq = stream->construct; part = umin(stream->sreq_max_len - subreq->len, len); _debug("part %zx/%zx %zx/%zx", subreq->len, stream->sreq_max_len, part, len); subreq->len += part; subreq->nr_segs++; stream->submit_extendable_to -= part; if (subreq->len >= stream->sreq_max_len || subreq->nr_segs >= stream->sreq_max_segs || to_eof) { netfs_issue_write(wreq, stream); subreq = NULL; } return part; } /* * Write some of a pending folio data back to the server. */ static int netfs_write_folio(struct netfs_io_request *wreq, struct writeback_control *wbc, struct folio *folio) { struct netfs_io_stream *upload = &wreq->io_streams[0]; struct netfs_io_stream *cache = &wreq->io_streams[1]; struct netfs_io_stream *stream; struct netfs_group *fgroup; /* TODO: Use this with ceph */ struct netfs_folio *finfo; size_t iter_off = 0; size_t fsize = folio_size(folio), flen = fsize, foff = 0; loff_t fpos = folio_pos(folio), i_size; bool to_eof = false, streamw = false; bool debug = false; _enter(""); if (rolling_buffer_make_space(&wreq->buffer) < 0) return -ENOMEM; /* netfs_perform_write() may shift i_size around the page or from out * of the page to beyond it, but cannot move i_size into or through the * page since we have it locked. */ i_size = i_size_read(wreq->inode); if (fpos >= i_size) { /* mmap beyond eof. */ _debug("beyond eof"); folio_start_writeback(folio); folio_unlock(folio); wreq->nr_group_rel += netfs_folio_written_back(folio); netfs_put_group_many(wreq->group, wreq->nr_group_rel); wreq->nr_group_rel = 0; return 0; } if (fpos + fsize > wreq->i_size) wreq->i_size = i_size; fgroup = netfs_folio_group(folio); finfo = netfs_folio_info(folio); if (finfo) { foff = finfo->dirty_offset; flen = foff + finfo->dirty_len; streamw = true; } if (wreq->origin == NETFS_WRITETHROUGH) { to_eof = false; if (flen > i_size - fpos) flen = i_size - fpos; } else if (flen > i_size - fpos) { flen = i_size - fpos; if (!streamw) folio_zero_segment(folio, flen, fsize); to_eof = true; } else if (flen == i_size - fpos) { to_eof = true; } flen -= foff; _debug("folio %zx %zx %zx", foff, flen, fsize); /* Deal with discontinuities in the stream of dirty pages. These can * arise from a number of sources: * * (1) Intervening non-dirty pages from random-access writes, multiple * flushers writing back different parts simultaneously and manual * syncing. * * (2) Partially-written pages from write-streaming. * * (3) Pages that belong to a different write-back group (eg. Ceph * snapshots). * * (4) Actually-clean pages that were marked for write to the cache * when they were read. Note that these appear as a special * write-back group. */ if (fgroup == NETFS_FOLIO_COPY_TO_CACHE) { netfs_issue_write(wreq, upload); } else if (fgroup != wreq->group) { /* We can't write this page to the server yet. */ kdebug("wrong group"); folio_redirty_for_writepage(wbc, folio); folio_unlock(folio); netfs_issue_write(wreq, upload); netfs_issue_write(wreq, cache); return 0; } if (foff > 0) netfs_issue_write(wreq, upload); if (streamw) netfs_issue_write(wreq, cache); /* Flip the page to the writeback state and unlock. If we're called * from write-through, then the page has already been put into the wb * state. */ if (wreq->origin == NETFS_WRITEBACK) folio_start_writeback(folio); folio_unlock(folio); if (fgroup == NETFS_FOLIO_COPY_TO_CACHE) { if (!cache->avail) { trace_netfs_folio(folio, netfs_folio_trace_cancel_copy); netfs_issue_write(wreq, upload); netfs_folio_written_back(folio); return 0; } trace_netfs_folio(folio, netfs_folio_trace_store_copy); } else if (!upload->avail && !cache->avail) { trace_netfs_folio(folio, netfs_folio_trace_cancel_store); netfs_folio_written_back(folio); return 0; } else if (!upload->construct) { trace_netfs_folio(folio, netfs_folio_trace_store); } else { trace_netfs_folio(folio, netfs_folio_trace_store_plus); } /* Attach the folio to the rolling buffer. */ rolling_buffer_append(&wreq->buffer, folio, 0); /* Move the submission point forward to allow for write-streaming data * not starting at the front of the page. We don't do write-streaming * with the cache as the cache requires DIO alignment. * * Also skip uploading for data that's been read and just needs copying * to the cache. */ for (int s = 0; s < NR_IO_STREAMS; s++) { stream = &wreq->io_streams[s]; stream->submit_off = foff; stream->submit_len = flen; if (!stream->avail || (stream->source == NETFS_WRITE_TO_CACHE && streamw) || (stream->source == NETFS_UPLOAD_TO_SERVER && fgroup == NETFS_FOLIO_COPY_TO_CACHE)) { stream->submit_off = UINT_MAX; stream->submit_len = 0; } } /* Attach the folio to one or more subrequests. For a big folio, we * could end up with thousands of subrequests if the wsize is small - * but we might need to wait during the creation of subrequests for * network resources (eg. SMB credits). */ for (;;) { ssize_t part; size_t lowest_off = ULONG_MAX; int choose_s = -1; /* Always add to the lowest-submitted stream first. */ for (int s = 0; s < NR_IO_STREAMS; s++) { stream = &wreq->io_streams[s]; if (stream->submit_len > 0 && stream->submit_off < lowest_off) { lowest_off = stream->submit_off; choose_s = s; } } if (choose_s < 0) break; stream = &wreq->io_streams[choose_s]; /* Advance the iterator(s). */ if (stream->submit_off > iter_off) { rolling_buffer_advance(&wreq->buffer, stream->submit_off - iter_off); iter_off = stream->submit_off; } atomic64_set(&wreq->issued_to, fpos + stream->submit_off); stream->submit_extendable_to = fsize - stream->submit_off; part = netfs_advance_write(wreq, stream, fpos + stream->submit_off, stream->submit_len, to_eof); stream->submit_off += part; if (part > stream->submit_len) stream->submit_len = 0; else stream->submit_len -= part; if (part > 0) debug = true; } if (fsize > iter_off) rolling_buffer_advance(&wreq->buffer, fsize - iter_off); atomic64_set(&wreq->issued_to, fpos + fsize); if (!debug) kdebug("R=%x: No submit", wreq->debug_id); if (foff + flen < fsize) for (int s = 0; s < NR_IO_STREAMS; s++) netfs_issue_write(wreq, &wreq->io_streams[s]); _leave(" = 0"); return 0; } /* * End the issuing of writes, letting the collector know we're done. */ static void netfs_end_issue_write(struct netfs_io_request *wreq) { bool needs_poke = true; smp_wmb(); /* Write subreq lists before ALL_QUEUED. */ set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags); for (int s = 0; s < NR_IO_STREAMS; s++) { struct netfs_io_stream *stream = &wreq->io_streams[s]; if (!stream->active) continue; if (!list_empty(&stream->subrequests)) needs_poke = false; netfs_issue_write(wreq, stream); } if (needs_poke) netfs_wake_collector(wreq); } /* * Write some of the pending data back to the server */ int netfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct netfs_inode *ictx = netfs_inode(mapping->host); struct netfs_io_request *wreq = NULL; struct folio *folio; int error = 0; if (!mutex_trylock(&ictx->wb_lock)) { if (wbc->sync_mode == WB_SYNC_NONE) { netfs_stat(&netfs_n_wb_lock_skip); return 0; } netfs_stat(&netfs_n_wb_lock_wait); mutex_lock(&ictx->wb_lock); } /* Need the first folio to be able to set up the op. */ folio = writeback_iter(mapping, wbc, NULL, &error); if (!folio) goto out; wreq = netfs_create_write_req(mapping, NULL, folio_pos(folio), NETFS_WRITEBACK); if (IS_ERR(wreq)) { error = PTR_ERR(wreq); goto couldnt_start; } __set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &wreq->flags); trace_netfs_write(wreq, netfs_write_trace_writeback); netfs_stat(&netfs_n_wh_writepages); do { _debug("wbiter %lx %llx", folio->index, atomic64_read(&wreq->issued_to)); /* It appears we don't have to handle cyclic writeback wrapping. */ WARN_ON_ONCE(wreq && folio_pos(folio) < atomic64_read(&wreq->issued_to)); if (netfs_folio_group(folio) != NETFS_FOLIO_COPY_TO_CACHE && unlikely(!test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))) { set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags); wreq->netfs_ops->begin_writeback(wreq); } error = netfs_write_folio(wreq, wbc, folio); if (error < 0) break; } while ((folio = writeback_iter(mapping, wbc, folio, &error))); netfs_end_issue_write(wreq); mutex_unlock(&ictx->wb_lock); netfs_wake_collector(wreq); netfs_put_request(wreq, netfs_rreq_trace_put_return); _leave(" = %d", error); return error; couldnt_start: netfs_kill_dirty_pages(mapping, wbc, folio); out: mutex_unlock(&ictx->wb_lock); _leave(" = %d", error); return error; } EXPORT_SYMBOL(netfs_writepages); /* * Begin a write operation for writing through the pagecache. */ struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len) { struct netfs_io_request *wreq = NULL; struct netfs_inode *ictx = netfs_inode(file_inode(iocb->ki_filp)); mutex_lock(&ictx->wb_lock); wreq = netfs_create_write_req(iocb->ki_filp->f_mapping, iocb->ki_filp, iocb->ki_pos, NETFS_WRITETHROUGH); if (IS_ERR(wreq)) { mutex_unlock(&ictx->wb_lock); return wreq; } wreq->io_streams[0].avail = true; trace_netfs_write(wreq, netfs_write_trace_writethrough); return wreq; } /* * Advance the state of the write operation used when writing through the * pagecache. Data has been copied into the pagecache that we need to append * to the request. If we've added more than wsize then we need to create a new * subrequest. */ int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, struct folio *folio, size_t copied, bool to_page_end, struct folio **writethrough_cache) { _enter("R=%x ic=%zu ws=%u cp=%zu tp=%u", wreq->debug_id, wreq->buffer.iter.count, wreq->wsize, copied, to_page_end); if (!*writethrough_cache) { if (folio_test_dirty(folio)) /* Sigh. mmap. */ folio_clear_dirty_for_io(folio); /* We can make multiple writes to the folio... */ folio_start_writeback(folio); if (wreq->len == 0) trace_netfs_folio(folio, netfs_folio_trace_wthru); else trace_netfs_folio(folio, netfs_folio_trace_wthru_plus); *writethrough_cache = folio; } wreq->len += copied; if (!to_page_end) return 0; *writethrough_cache = NULL; return netfs_write_folio(wreq, wbc, folio); } /* * End a write operation used when writing through the pagecache. */ ssize_t netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, struct folio *writethrough_cache) { struct netfs_inode *ictx = netfs_inode(wreq->inode); ssize_t ret; _enter("R=%x", wreq->debug_id); if (writethrough_cache) netfs_write_folio(wreq, wbc, writethrough_cache); netfs_end_issue_write(wreq); mutex_unlock(&ictx->wb_lock); if (wreq->iocb) ret = -EIOCBQUEUED; else ret = netfs_wait_for_write(wreq); netfs_put_request(wreq, netfs_rreq_trace_put_return); return ret; } /* * Write some of a pending folio data back to the server and/or the cache. */ static int netfs_write_folio_single(struct netfs_io_request *wreq, struct folio *folio) { struct netfs_io_stream *upload = &wreq->io_streams[0]; struct netfs_io_stream *cache = &wreq->io_streams[1]; struct netfs_io_stream *stream; size_t iter_off = 0; size_t fsize = folio_size(folio), flen; loff_t fpos = folio_pos(folio); bool to_eof = false; bool no_debug = false; _enter(""); flen = folio_size(folio); if (flen > wreq->i_size - fpos) { flen = wreq->i_size - fpos; folio_zero_segment(folio, flen, fsize); to_eof = true; } else if (flen == wreq->i_size - fpos) { to_eof = true; } _debug("folio %zx/%zx", flen, fsize); if (!upload->avail && !cache->avail) { trace_netfs_folio(folio, netfs_folio_trace_cancel_store); return 0; } if (!upload->construct) trace_netfs_folio(folio, netfs_folio_trace_store); else trace_netfs_folio(folio, netfs_folio_trace_store_plus); /* Attach the folio to the rolling buffer. */ folio_get(folio); rolling_buffer_append(&wreq->buffer, folio, NETFS_ROLLBUF_PUT_MARK); /* Move the submission point forward to allow for write-streaming data * not starting at the front of the page. We don't do write-streaming * with the cache as the cache requires DIO alignment. * * Also skip uploading for data that's been read and just needs copying * to the cache. */ for (int s = 0; s < NR_IO_STREAMS; s++) { stream = &wreq->io_streams[s]; stream->submit_off = 0; stream->submit_len = flen; if (!stream->avail) { stream->submit_off = UINT_MAX; stream->submit_len = 0; } } /* Attach the folio to one or more subrequests. For a big folio, we * could end up with thousands of subrequests if the wsize is small - * but we might need to wait during the creation of subrequests for * network resources (eg. SMB credits). */ for (;;) { ssize_t part; size_t lowest_off = ULONG_MAX; int choose_s = -1; /* Always add to the lowest-submitted stream first. */ for (int s = 0; s < NR_IO_STREAMS; s++) { stream = &wreq->io_streams[s]; if (stream->submit_len > 0 && stream->submit_off < lowest_off) { lowest_off = stream->submit_off; choose_s = s; } } if (choose_s < 0) break; stream = &wreq->io_streams[choose_s]; /* Advance the iterator(s). */ if (stream->submit_off > iter_off) { rolling_buffer_advance(&wreq->buffer, stream->submit_off - iter_off); iter_off = stream->submit_off; } atomic64_set(&wreq->issued_to, fpos + stream->submit_off); stream->submit_extendable_to = fsize - stream->submit_off; part = netfs_advance_write(wreq, stream, fpos + stream->submit_off, stream->submit_len, to_eof); stream->submit_off += part; if (part > stream->submit_len) stream->submit_len = 0; else stream->submit_len -= part; if (part > 0) no_debug = true; } wreq->buffer.iter.iov_offset = 0; if (fsize > iter_off) rolling_buffer_advance(&wreq->buffer, fsize - iter_off); atomic64_set(&wreq->issued_to, fpos + fsize); if (!no_debug) kdebug("R=%x: No submit", wreq->debug_id); _leave(" = 0"); return 0; } /** * netfs_writeback_single - Write back a monolithic payload * @mapping: The mapping to write from * @wbc: Hints from the VM * @iter: Data to write, must be ITER_FOLIOQ. * * Write a monolithic, non-pagecache object back to the server and/or * the cache. */ int netfs_writeback_single(struct address_space *mapping, struct writeback_control *wbc, struct iov_iter *iter) { struct netfs_io_request *wreq; struct netfs_inode *ictx = netfs_inode(mapping->host); struct folio_queue *fq; size_t size = iov_iter_count(iter); int ret; if (WARN_ON_ONCE(!iov_iter_is_folioq(iter))) return -EIO; if (!mutex_trylock(&ictx->wb_lock)) { if (wbc->sync_mode == WB_SYNC_NONE) { netfs_stat(&netfs_n_wb_lock_skip); return 0; } netfs_stat(&netfs_n_wb_lock_wait); mutex_lock(&ictx->wb_lock); } wreq = netfs_create_write_req(mapping, NULL, 0, NETFS_WRITEBACK_SINGLE); if (IS_ERR(wreq)) { ret = PTR_ERR(wreq); goto couldnt_start; } __set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &wreq->flags); trace_netfs_write(wreq, netfs_write_trace_writeback_single); netfs_stat(&netfs_n_wh_writepages); if (__test_and_set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags)) wreq->netfs_ops->begin_writeback(wreq); for (fq = (struct folio_queue *)iter->folioq; fq; fq = fq->next) { for (int slot = 0; slot < folioq_count(fq); slot++) { struct folio *folio = folioq_folio(fq, slot); size_t part = umin(folioq_folio_size(fq, slot), size); _debug("wbiter %lx %llx", folio->index, atomic64_read(&wreq->issued_to)); ret = netfs_write_folio_single(wreq, folio); if (ret < 0) goto stop; size -= part; if (size <= 0) goto stop; } } stop: for (int s = 0; s < NR_IO_STREAMS; s++) netfs_issue_write(wreq, &wreq->io_streams[s]); smp_wmb(); /* Write lists before ALL_QUEUED. */ set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags); mutex_unlock(&ictx->wb_lock); netfs_wake_collector(wreq); netfs_put_request(wreq, netfs_rreq_trace_put_return); _leave(" = %d", ret); return ret; couldnt_start: mutex_unlock(&ictx->wb_lock); _leave(" = %d", ret); return ret; } EXPORT_SYMBOL(netfs_writeback_single);
2 22 6 13 1 13 22 22 5 6 5 4 4 2 4 3 6 23 5 23 2 22 11 1 1 11 23 22 7 6 7 3 3 3 3 1 1 1 1 46 37 3 1 3 46 46 25 48 25 48 4 46 48 42 6 6 2 4 15 6 15 13 2 2 2 2 2 1 1 2 2 2 1 1 1 1 7 6 5 4 1 3 3 1 3 1 1 1 3 3 7 55 53 51 55 51 50 49 49 6 49 1 33 29 28 27 29 27 26 27 26 25 25 24 24 1 23 1 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 // SPDX-License-Identifier: GPL-2.0-only /* * (C) 2012-2013 by Pablo Neira Ayuso <pablo@netfilter.org> * * This software has been sponsored by Sophos Astaro <http://www.sophos.com> */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nf_tables.h> #include <linux/netfilter/nf_tables_compat.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter_bridge/ebtables.h> #include <linux/netfilter_arp/arp_tables.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_log.h> /* Used for matches where *info is larger than X byte */ #define NFT_MATCH_LARGE_THRESH 192 struct nft_xt_match_priv { void *info; }; static int nft_compat_chain_validate_dependency(const struct nft_ctx *ctx, const char *tablename) { enum nft_chain_types type = NFT_CHAIN_T_DEFAULT; const struct nft_chain *chain = ctx->chain; const struct nft_base_chain *basechain; if (!tablename || !nft_is_base_chain(chain)) return 0; basechain = nft_base_chain(chain); if (strcmp(tablename, "nat") == 0) { if (ctx->family != NFPROTO_BRIDGE) type = NFT_CHAIN_T_NAT; if (basechain->type->type != type) return -EINVAL; } return 0; } union nft_entry { struct ipt_entry e4; struct ip6t_entry e6; struct ebt_entry ebt; struct arpt_entry arp; }; static inline void nft_compat_set_par(struct xt_action_param *par, const struct nft_pktinfo *pkt, const void *xt, const void *xt_info) { par->state = pkt->state; par->thoff = nft_thoff(pkt); par->fragoff = pkt->fragoff; par->target = xt; par->targinfo = xt_info; par->hotdrop = false; } static void nft_target_eval_xt(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { void *info = nft_expr_priv(expr); struct xt_target *target = expr->ops->data; struct sk_buff *skb = pkt->skb; struct xt_action_param xt; int ret; nft_compat_set_par(&xt, pkt, target, info); ret = target->target(skb, &xt); if (xt.hotdrop) ret = NF_DROP; switch (ret) { case XT_CONTINUE: regs->verdict.code = NFT_CONTINUE; break; default: regs->verdict.code = ret; break; } } static void nft_target_eval_bridge(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { void *info = nft_expr_priv(expr); struct xt_target *target = expr->ops->data; struct sk_buff *skb = pkt->skb; struct xt_action_param xt; int ret; nft_compat_set_par(&xt, pkt, target, info); ret = target->target(skb, &xt); if (xt.hotdrop) ret = NF_DROP; switch (ret) { case EBT_ACCEPT: regs->verdict.code = NF_ACCEPT; break; case EBT_DROP: regs->verdict.code = NF_DROP; break; case EBT_CONTINUE: regs->verdict.code = NFT_CONTINUE; break; case EBT_RETURN: regs->verdict.code = NFT_RETURN; break; default: regs->verdict.code = ret; break; } } static const struct nla_policy nft_target_policy[NFTA_TARGET_MAX + 1] = { [NFTA_TARGET_NAME] = { .type = NLA_NUL_STRING, .len = XT_EXTENSION_MAXNAMELEN, }, [NFTA_TARGET_REV] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_TARGET_INFO] = { .type = NLA_BINARY }, }; static void nft_target_set_tgchk_param(struct xt_tgchk_param *par, const struct nft_ctx *ctx, struct xt_target *target, void *info, union nft_entry *entry, u16 proto, bool inv) { par->net = ctx->net; par->table = ctx->table->name; switch (ctx->family) { case AF_INET: entry->e4.ip.proto = proto; entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0; break; case AF_INET6: if (proto) entry->e6.ipv6.flags |= IP6T_F_PROTO; entry->e6.ipv6.proto = proto; entry->e6.ipv6.invflags = inv ? IP6T_INV_PROTO : 0; break; case NFPROTO_BRIDGE: entry->ebt.ethproto = (__force __be16)proto; entry->ebt.invflags = inv ? EBT_IPROTO : 0; break; case NFPROTO_ARP: break; } par->entryinfo = entry; par->target = target; par->targinfo = info; if (nft_is_base_chain(ctx->chain)) { const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); const struct nf_hook_ops *ops = &basechain->ops; par->hook_mask = 1 << ops->hooknum; } else { par->hook_mask = 0; } par->family = ctx->family; par->nft_compat = true; } static void target_compat_from_user(struct xt_target *t, void *in, void *out) { int pad; memcpy(out, in, t->targetsize); pad = XT_ALIGN(t->targetsize) - t->targetsize; if (pad > 0) memset(out + t->targetsize, 0, pad); } static const struct nla_policy nft_rule_compat_policy[NFTA_RULE_COMPAT_MAX + 1] = { [NFTA_RULE_COMPAT_PROTO] = { .type = NLA_U32 }, [NFTA_RULE_COMPAT_FLAGS] = NLA_POLICY_MASK(NLA_BE32, NFT_RULE_COMPAT_F_MASK), }; static int nft_parse_compat(const struct nlattr *attr, u16 *proto, bool *inv) { struct nlattr *tb[NFTA_RULE_COMPAT_MAX+1]; u32 l4proto; u32 flags; int err; err = nla_parse_nested_deprecated(tb, NFTA_RULE_COMPAT_MAX, attr, nft_rule_compat_policy, NULL); if (err < 0) return err; if (!tb[NFTA_RULE_COMPAT_PROTO] || !tb[NFTA_RULE_COMPAT_FLAGS]) return -EINVAL; flags = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_FLAGS])); if (flags & NFT_RULE_COMPAT_F_UNUSED || flags & ~NFT_RULE_COMPAT_F_MASK) return -EINVAL; if (flags & NFT_RULE_COMPAT_F_INV) *inv = true; l4proto = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_PROTO])); if (l4proto > U16_MAX) return -EINVAL; *proto = l4proto; return 0; } static void nft_compat_wait_for_destructors(struct net *net) { /* xtables matches or targets can have side effects, e.g. * creation/destruction of /proc files. * The xt ->destroy functions are run asynchronously from * work queue. If we have pending invocations we thus * need to wait for those to finish. */ nf_tables_trans_destroy_flush_work(net); } static int nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { void *info = nft_expr_priv(expr); struct xt_target *target = expr->ops->data; struct xt_tgchk_param par; size_t size = XT_ALIGN(nla_len(tb[NFTA_TARGET_INFO])); u16 proto = 0; bool inv = false; union nft_entry e = {}; int ret; target_compat_from_user(target, nla_data(tb[NFTA_TARGET_INFO]), info); if (ctx->nla[NFTA_RULE_COMPAT]) { ret = nft_parse_compat(ctx->nla[NFTA_RULE_COMPAT], &proto, &inv); if (ret < 0) return ret; } nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv); nft_compat_wait_for_destructors(ctx->net); ret = xt_check_target(&par, size, proto, inv); if (ret < 0) { if (ret == -ENOENT) { const char *modname = NULL; if (strcmp(target->name, "LOG") == 0) modname = "nf_log_syslog"; else if (strcmp(target->name, "NFLOG") == 0) modname = "nfnetlink_log"; if (modname && nft_request_module(ctx->net, "%s", modname) == -EAGAIN) return -EAGAIN; } return ret; } /* The standard target cannot be used */ if (!target->target) return -EINVAL; return 0; } static void __nft_mt_tg_destroy(struct module *me, const struct nft_expr *expr) { module_put(me); kfree(expr->ops); } static void nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct xt_target *target = expr->ops->data; void *info = nft_expr_priv(expr); struct module *me = target->me; struct xt_tgdtor_param par; par.net = ctx->net; par.target = target; par.targinfo = info; par.family = ctx->family; if (par.target->destroy != NULL) par.target->destroy(&par); __nft_mt_tg_destroy(me, expr); } static int nft_extension_dump_info(struct sk_buff *skb, int attr, const void *info, unsigned int size, unsigned int user_size) { unsigned int info_size, aligned_size = XT_ALIGN(size); struct nlattr *nla; nla = nla_reserve(skb, attr, aligned_size); if (!nla) return -1; info_size = user_size ? : size; memcpy(nla_data(nla), info, info_size); memset(nla_data(nla) + info_size, 0, aligned_size - info_size); return 0; } static int nft_target_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct xt_target *target = expr->ops->data; void *info = nft_expr_priv(expr); if (nla_put_string(skb, NFTA_TARGET_NAME, target->name) || nla_put_be32(skb, NFTA_TARGET_REV, htonl(target->revision)) || nft_extension_dump_info(skb, NFTA_TARGET_INFO, info, target->targetsize, target->usersize)) goto nla_put_failure; return 0; nla_put_failure: return -1; } static int nft_target_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct xt_target *target = expr->ops->data; unsigned int hook_mask = 0; int ret; if (ctx->family != NFPROTO_IPV4 && ctx->family != NFPROTO_IPV6 && ctx->family != NFPROTO_INET && ctx->family != NFPROTO_BRIDGE && ctx->family != NFPROTO_ARP) return -EOPNOTSUPP; ret = nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING)); if (ret) return ret; if (nft_is_base_chain(ctx->chain)) { const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); const struct nf_hook_ops *ops = &basechain->ops; hook_mask = 1 << ops->hooknum; if (target->hooks && !(hook_mask & target->hooks)) return -EINVAL; ret = nft_compat_chain_validate_dependency(ctx, target->table); if (ret < 0) return ret; } return 0; } static void __nft_match_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt, void *info) { struct xt_match *match = expr->ops->data; struct sk_buff *skb = pkt->skb; struct xt_action_param xt; bool ret; nft_compat_set_par(&xt, pkt, match, info); ret = match->match(skb, &xt); if (xt.hotdrop) { regs->verdict.code = NF_DROP; return; } switch (ret ? 1 : 0) { case 1: regs->verdict.code = NFT_CONTINUE; break; case 0: regs->verdict.code = NFT_BREAK; break; } } static void nft_match_large_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_xt_match_priv *priv = nft_expr_priv(expr); __nft_match_eval(expr, regs, pkt, priv->info); } static void nft_match_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { __nft_match_eval(expr, regs, pkt, nft_expr_priv(expr)); } static const struct nla_policy nft_match_policy[NFTA_MATCH_MAX + 1] = { [NFTA_MATCH_NAME] = { .type = NLA_NUL_STRING, .len = XT_EXTENSION_MAXNAMELEN }, [NFTA_MATCH_REV] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_MATCH_INFO] = { .type = NLA_BINARY }, }; /* struct xt_mtchk_param and xt_tgchk_param look very similar */ static void nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx, struct xt_match *match, void *info, union nft_entry *entry, u16 proto, bool inv) { par->net = ctx->net; par->table = ctx->table->name; switch (ctx->family) { case AF_INET: entry->e4.ip.proto = proto; entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0; break; case AF_INET6: if (proto) entry->e6.ipv6.flags |= IP6T_F_PROTO; entry->e6.ipv6.proto = proto; entry->e6.ipv6.invflags = inv ? IP6T_INV_PROTO : 0; break; case NFPROTO_BRIDGE: entry->ebt.ethproto = (__force __be16)proto; entry->ebt.invflags = inv ? EBT_IPROTO : 0; break; case NFPROTO_ARP: break; } par->entryinfo = entry; par->match = match; par->matchinfo = info; if (nft_is_base_chain(ctx->chain)) { const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); const struct nf_hook_ops *ops = &basechain->ops; par->hook_mask = 1 << ops->hooknum; } else { par->hook_mask = 0; } par->family = ctx->family; par->nft_compat = true; } static void match_compat_from_user(struct xt_match *m, void *in, void *out) { int pad; memcpy(out, in, m->matchsize); pad = XT_ALIGN(m->matchsize) - m->matchsize; if (pad > 0) memset(out + m->matchsize, 0, pad); } static int __nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[], void *info) { struct xt_match *match = expr->ops->data; struct xt_mtchk_param par; size_t size = XT_ALIGN(nla_len(tb[NFTA_MATCH_INFO])); u16 proto = 0; bool inv = false; union nft_entry e = {}; int ret; match_compat_from_user(match, nla_data(tb[NFTA_MATCH_INFO]), info); if (ctx->nla[NFTA_RULE_COMPAT]) { ret = nft_parse_compat(ctx->nla[NFTA_RULE_COMPAT], &proto, &inv); if (ret < 0) return ret; } nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv); nft_compat_wait_for_destructors(ctx->net); return xt_check_match(&par, size, proto, inv); } static int nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { return __nft_match_init(ctx, expr, tb, nft_expr_priv(expr)); } static int nft_match_large_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_xt_match_priv *priv = nft_expr_priv(expr); struct xt_match *m = expr->ops->data; int ret; priv->info = kmalloc(XT_ALIGN(m->matchsize), GFP_KERNEL_ACCOUNT); if (!priv->info) return -ENOMEM; ret = __nft_match_init(ctx, expr, tb, priv->info); if (ret) kfree(priv->info); return ret; } static void __nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr, void *info) { struct xt_match *match = expr->ops->data; struct module *me = match->me; struct xt_mtdtor_param par; par.net = ctx->net; par.match = match; par.matchinfo = info; par.family = ctx->family; if (par.match->destroy != NULL) par.match->destroy(&par); __nft_mt_tg_destroy(me, expr); } static void nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { __nft_match_destroy(ctx, expr, nft_expr_priv(expr)); } static void nft_match_large_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_xt_match_priv *priv = nft_expr_priv(expr); __nft_match_destroy(ctx, expr, priv->info); kfree(priv->info); } static int __nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr, void *info) { struct xt_match *match = expr->ops->data; if (nla_put_string(skb, NFTA_MATCH_NAME, match->name) || nla_put_be32(skb, NFTA_MATCH_REV, htonl(match->revision)) || nft_extension_dump_info(skb, NFTA_MATCH_INFO, info, match->matchsize, match->usersize)) goto nla_put_failure; return 0; nla_put_failure: return -1; } static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { return __nft_match_dump(skb, expr, nft_expr_priv(expr)); } static int nft_match_large_dump(struct sk_buff *skb, const struct nft_expr *e, bool reset) { struct nft_xt_match_priv *priv = nft_expr_priv(e); return __nft_match_dump(skb, e, priv->info); } static int nft_match_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct xt_match *match = expr->ops->data; unsigned int hook_mask = 0; int ret; if (ctx->family != NFPROTO_IPV4 && ctx->family != NFPROTO_IPV6 && ctx->family != NFPROTO_INET && ctx->family != NFPROTO_BRIDGE && ctx->family != NFPROTO_ARP) return -EOPNOTSUPP; ret = nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING)); if (ret) return ret; if (nft_is_base_chain(ctx->chain)) { const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); const struct nf_hook_ops *ops = &basechain->ops; hook_mask = 1 << ops->hooknum; if (match->hooks && !(hook_mask & match->hooks)) return -EINVAL; ret = nft_compat_chain_validate_dependency(ctx, match->table); if (ret < 0) return ret; } return 0; } static int nfnl_compat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, int event, u16 family, const char *name, int rev, int target) { struct nlmsghdr *nlh; unsigned int flags = portid ? NLM_F_MULTI : 0; event = nfnl_msg_type(NFNL_SUBSYS_NFT_COMPAT, event); nlh = nfnl_msg_put(skb, portid, seq, event, flags, family, NFNETLINK_V0, 0); if (!nlh) goto nlmsg_failure; if (nla_put_string(skb, NFTA_COMPAT_NAME, name) || nla_put_be32(skb, NFTA_COMPAT_REV, htonl(rev)) || nla_put_be32(skb, NFTA_COMPAT_TYPE, htonl(target))) goto nla_put_failure; nlmsg_end(skb, nlh); return skb->len; nlmsg_failure: nla_put_failure: nlmsg_cancel(skb, nlh); return -1; } static int nfnl_compat_get_rcu(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const tb[]) { u8 family = info->nfmsg->nfgen_family; const char *name, *fmt; struct sk_buff *skb2; int ret = 0, target; u32 rev; if (tb[NFTA_COMPAT_NAME] == NULL || tb[NFTA_COMPAT_REV] == NULL || tb[NFTA_COMPAT_TYPE] == NULL) return -EINVAL; name = nla_data(tb[NFTA_COMPAT_NAME]); rev = ntohl(nla_get_be32(tb[NFTA_COMPAT_REV])); /* x_tables api checks for 'target == 1' to mean target, * everything else means 'match'. * In x_tables world, the number is set by kernel, not * userspace. */ target = nla_get_be32(tb[NFTA_COMPAT_TYPE]) == htonl(1); switch(family) { case AF_INET: fmt = "ipt_%s"; break; case AF_INET6: fmt = "ip6t_%s"; break; case NFPROTO_BRIDGE: fmt = "ebt_%s"; break; case NFPROTO_ARP: fmt = "arpt_%s"; break; default: pr_err("nft_compat: unsupported protocol %d\n", family); return -EINVAL; } if (!try_module_get(THIS_MODULE)) return -EINVAL; rcu_read_unlock(); try_then_request_module(xt_find_revision(family, name, rev, target, &ret), fmt, name); if (ret < 0) goto out_put; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (skb2 == NULL) { ret = -ENOMEM; goto out_put; } /* include the best revision for this extension in the message */ if (nfnl_compat_fill_info(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFNL_MSG_TYPE(info->nlh->nlmsg_type), NFNL_MSG_COMPAT_GET, family, name, ret, target) <= 0) { kfree_skb(skb2); goto out_put; } ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); out_put: rcu_read_lock(); module_put(THIS_MODULE); return ret; } static const struct nla_policy nfnl_compat_policy_get[NFTA_COMPAT_MAX+1] = { [NFTA_COMPAT_NAME] = { .type = NLA_NUL_STRING, .len = NFT_COMPAT_NAME_MAX-1 }, [NFTA_COMPAT_REV] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_COMPAT_TYPE] = { .type = NLA_U32 }, }; static const struct nfnl_callback nfnl_nft_compat_cb[NFNL_MSG_COMPAT_MAX] = { [NFNL_MSG_COMPAT_GET] = { .call = nfnl_compat_get_rcu, .type = NFNL_CB_RCU, .attr_count = NFTA_COMPAT_MAX, .policy = nfnl_compat_policy_get }, }; static const struct nfnetlink_subsystem nfnl_compat_subsys = { .name = "nft-compat", .subsys_id = NFNL_SUBSYS_NFT_COMPAT, .cb_count = NFNL_MSG_COMPAT_MAX, .cb = nfnl_nft_compat_cb, }; static struct nft_expr_type nft_match_type; static const struct nft_expr_ops * nft_match_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_expr_ops *ops; struct xt_match *match; unsigned int matchsize; char *mt_name; u32 rev, family; int err; if (tb[NFTA_MATCH_NAME] == NULL || tb[NFTA_MATCH_REV] == NULL || tb[NFTA_MATCH_INFO] == NULL) return ERR_PTR(-EINVAL); mt_name = nla_data(tb[NFTA_MATCH_NAME]); rev = ntohl(nla_get_be32(tb[NFTA_MATCH_REV])); family = ctx->family; match = xt_request_find_match(family, mt_name, rev); if (IS_ERR(match)) return ERR_PTR(-ENOENT); if (match->matchsize > nla_len(tb[NFTA_MATCH_INFO])) { err = -EINVAL; goto err; } ops = kzalloc_obj(struct nft_expr_ops, GFP_KERNEL_ACCOUNT); if (!ops) { err = -ENOMEM; goto err; } ops->type = &nft_match_type; ops->eval = nft_match_eval; ops->init = nft_match_init; ops->destroy = nft_match_destroy; ops->dump = nft_match_dump; ops->validate = nft_match_validate; ops->data = match; matchsize = NFT_EXPR_SIZE(XT_ALIGN(match->matchsize)); if (matchsize > NFT_MATCH_LARGE_THRESH) { matchsize = NFT_EXPR_SIZE(sizeof(struct nft_xt_match_priv)); ops->eval = nft_match_large_eval; ops->init = nft_match_large_init; ops->destroy = nft_match_large_destroy; ops->dump = nft_match_large_dump; } ops->size = matchsize; return ops; err: module_put(match->me); return ERR_PTR(err); } static void nft_match_release_ops(const struct nft_expr_ops *ops) { struct xt_match *match = ops->data; module_put(match->me); kfree(ops); } static struct nft_expr_type nft_match_type __read_mostly = { .name = "match", .select_ops = nft_match_select_ops, .release_ops = nft_match_release_ops, .policy = nft_match_policy, .maxattr = NFTA_MATCH_MAX, .owner = THIS_MODULE, }; static struct nft_expr_type nft_target_type; static const struct nft_expr_ops * nft_target_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_expr_ops *ops; struct xt_target *target; char *tg_name; u32 rev, family; int err; if (tb[NFTA_TARGET_NAME] == NULL || tb[NFTA_TARGET_REV] == NULL || tb[NFTA_TARGET_INFO] == NULL) return ERR_PTR(-EINVAL); tg_name = nla_data(tb[NFTA_TARGET_NAME]); rev = ntohl(nla_get_be32(tb[NFTA_TARGET_REV])); family = ctx->family; if (strcmp(tg_name, XT_ERROR_TARGET) == 0 || strcmp(tg_name, XT_STANDARD_TARGET) == 0 || strcmp(tg_name, "standard") == 0) return ERR_PTR(-EINVAL); target = xt_request_find_target(family, tg_name, rev); if (IS_ERR(target)) return ERR_PTR(-ENOENT); if (!target->target) { err = -EINVAL; goto err; } if (target->targetsize > nla_len(tb[NFTA_TARGET_INFO])) { err = -EINVAL; goto err; } ops = kzalloc_obj(struct nft_expr_ops, GFP_KERNEL_ACCOUNT); if (!ops) { err = -ENOMEM; goto err; } ops->type = &nft_target_type; ops->size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize)); ops->init = nft_target_init; ops->destroy = nft_target_destroy; ops->dump = nft_target_dump; ops->validate = nft_target_validate; ops->data = target; if (family == NFPROTO_BRIDGE) ops->eval = nft_target_eval_bridge; else ops->eval = nft_target_eval_xt; return ops; err: module_put(target->me); return ERR_PTR(err); } static void nft_target_release_ops(const struct nft_expr_ops *ops) { struct xt_target *target = ops->data; module_put(target->me); kfree(ops); } static struct nft_expr_type nft_target_type __read_mostly = { .name = "target", .select_ops = nft_target_select_ops, .release_ops = nft_target_release_ops, .policy = nft_target_policy, .maxattr = NFTA_TARGET_MAX, .owner = THIS_MODULE, }; static int __init nft_compat_module_init(void) { int ret; ret = nft_register_expr(&nft_match_type); if (ret < 0) return ret; ret = nft_register_expr(&nft_target_type); if (ret < 0) goto err_match; ret = nfnetlink_subsys_register(&nfnl_compat_subsys); if (ret < 0) { pr_err("nft_compat: cannot register with nfnetlink.\n"); goto err_target; } return ret; err_target: nft_unregister_expr(&nft_target_type); err_match: nft_unregister_expr(&nft_match_type); return ret; } static void __exit nft_compat_module_exit(void) { nfnetlink_subsys_unregister(&nfnl_compat_subsys); nft_unregister_expr(&nft_target_type); nft_unregister_expr(&nft_match_type); } MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFT_COMPAT); module_init(nft_compat_module_init); module_exit(nft_compat_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_ALIAS_NFT_EXPR("match"); MODULE_ALIAS_NFT_EXPR("target"); MODULE_DESCRIPTION("x_tables over nftables support");
104 99 36 36 29 24 30 142 146 22 30 30 22 23 30 107 104 10 91 53 21 33 30 16 33 33 29 33 16 33 33 32 4 32 15 9 15 17 32 32 33 33 17 17 17 7 17 17 17 17 17 1 8 8 8 8 7 6 8 8 8 7 7 7 7 6 5 1 1 13 13 12 12 13 84 84 82 84 84 30 1 1 30 30 28 12 12 12 12 28 28 28 24 8 6 25 12 12 12 12 24 25 16 1 27 27 30 3 30 30 30 8 13 37 24 8 12 42 107 107 107 63 107 6 93 94 81 91 31 42 60 51 59 59 59 50 59 57 57 57 17 24 24 24 24 19 19 43 4 44 45 31 1 55 42 70 46 17 19 19 26 18 18 18 11 18 12 12 12 12 12 11 11 10 10 11 12 10 18 19 15 16 15 5 19 19 19 10 8 7 7 7 2 15 15 15 15 15 11 11 11 11 1 11 11 1 10 11 11 11 10 8 11 11 2 15 15 2 3 3 3 2 3 3 3 3 5 5 5 5 24 24 24 24 24 24 24 12 21 21 16 5 16 16 16 16 1 16 16 16 16 6 6 3 3 15 5 5 5 4 3 3 3 2 2 3 23 7 7 10 24 22 24 24 24 3 23 6 31 30 30 29 28 26 6 6 6 6 5 39 1 38 19 6 6 6 2 1 6 35 35 14 39 40 40 40 40 40 35 33 31 32 31 31 26 26 25 26 26 25 6 1 6 6 6 5 6 6 23 8 7 23 24 20 20 10 20 10 19 34 35 35 35 34 40 20 68 68 68 68 68 28 16 16 12 16 8 16 15 16 48 48 48 48 48 48 48 2 48 48 48 48 11 8 48 14 14 14 14 14 14 8 14 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 // SPDX-License-Identifier: GPL-2.0-only /* * mm/userfaultfd.c * * Copyright (C) 2015 Red Hat, Inc. */ #include <linux/mm.h> #include <linux/sched/signal.h> #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/swap.h> #include <linux/leafops.h> #include <linux/userfaultfd_k.h> #include <linux/mmu_notifier.h> #include <linux/hugetlb.h> #include <linux/shmem_fs.h> #include <asm/tlbflush.h> #include <asm/tlb.h> #include "internal.h" #include "swap.h" static __always_inline bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end) { /* Make sure that the dst range is fully within dst_vma. */ if (dst_end > dst_vma->vm_end) return false; /* * Check the vma is registered in uffd, this is required to * enforce the VM_MAYWRITE check done at uffd registration * time. */ if (!dst_vma->vm_userfaultfd_ctx.ctx) return false; return true; } static __always_inline struct vm_area_struct *find_vma_and_prepare_anon(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma; mmap_assert_locked(mm); vma = vma_lookup(mm, addr); if (!vma) vma = ERR_PTR(-ENOENT); else if (!(vma->vm_flags & VM_SHARED) && unlikely(anon_vma_prepare(vma))) vma = ERR_PTR(-ENOMEM); return vma; } #ifdef CONFIG_PER_VMA_LOCK /* * uffd_lock_vma() - Lookup and lock vma corresponding to @address. * @mm: mm to search vma in. * @address: address that the vma should contain. * * Should be called without holding mmap_lock. * * Return: A locked vma containing @address, -ENOENT if no vma is found, or * -ENOMEM if anon_vma couldn't be allocated. */ static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm, unsigned long address) { struct vm_area_struct *vma; vma = lock_vma_under_rcu(mm, address); if (vma) { /* * We know we're going to need to use anon_vma, so check * that early. */ if (!(vma->vm_flags & VM_SHARED) && unlikely(!vma->anon_vma)) vma_end_read(vma); else return vma; } mmap_read_lock(mm); vma = find_vma_and_prepare_anon(mm, address); if (!IS_ERR(vma)) { bool locked = vma_start_read_locked(vma); if (!locked) vma = ERR_PTR(-EAGAIN); } mmap_read_unlock(mm); return vma; } static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long len) { struct vm_area_struct *dst_vma; dst_vma = uffd_lock_vma(dst_mm, dst_start); if (IS_ERR(dst_vma) || validate_dst_vma(dst_vma, dst_start + len)) return dst_vma; vma_end_read(dst_vma); return ERR_PTR(-ENOENT); } static void uffd_mfill_unlock(struct vm_area_struct *vma) { vma_end_read(vma); } #else static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long len) { struct vm_area_struct *dst_vma; mmap_read_lock(dst_mm); dst_vma = find_vma_and_prepare_anon(dst_mm, dst_start); if (IS_ERR(dst_vma)) goto out_unlock; if (validate_dst_vma(dst_vma, dst_start + len)) return dst_vma; dst_vma = ERR_PTR(-ENOENT); out_unlock: mmap_read_unlock(dst_mm); return dst_vma; } static void uffd_mfill_unlock(struct vm_area_struct *vma) { mmap_read_unlock(vma->vm_mm); } #endif /* Check if dst_addr is outside of file's size. Must be called with ptl held. */ static bool mfill_file_over_size(struct vm_area_struct *dst_vma, unsigned long dst_addr) { struct inode *inode; pgoff_t offset, max_off; if (!dst_vma->vm_file) return false; inode = dst_vma->vm_file->f_inode; offset = linear_page_index(dst_vma, dst_addr); max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); return offset >= max_off; } /* * Install PTEs, to map dst_addr (within dst_vma) to page. * * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem * and anon, and for both shared and private VMAs. */ int mfill_atomic_install_pte(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, struct page *page, bool newly_allocated, uffd_flags_t flags) { int ret; struct mm_struct *dst_mm = dst_vma->vm_mm; pte_t _dst_pte, *dst_pte; bool writable = dst_vma->vm_flags & VM_WRITE; bool vm_shared = dst_vma->vm_flags & VM_SHARED; spinlock_t *ptl; struct folio *folio = page_folio(page); bool page_in_cache = folio_mapping(folio); pte_t dst_ptep; _dst_pte = mk_pte(page, dst_vma->vm_page_prot); _dst_pte = pte_mkdirty(_dst_pte); if (page_in_cache && !vm_shared) writable = false; if (writable) _dst_pte = pte_mkwrite(_dst_pte, dst_vma); if (flags & MFILL_ATOMIC_WP) _dst_pte = pte_mkuffd_wp(_dst_pte); ret = -EAGAIN; dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); if (!dst_pte) goto out; if (mfill_file_over_size(dst_vma, dst_addr)) { ret = -EFAULT; goto out_unlock; } ret = -EEXIST; dst_ptep = ptep_get(dst_pte); /* * We are allowed to overwrite a UFFD pte marker: consider when both * MISSING|WP registered, we firstly wr-protect a none pte which has no * page cache page backing it, then access the page. */ if (!pte_none(dst_ptep) && !pte_is_uffd_marker(dst_ptep)) goto out_unlock; if (page_in_cache) { /* Usually, cache pages are already added to LRU */ if (newly_allocated) folio_add_lru(folio); folio_add_file_rmap_pte(folio, page, dst_vma); } else { folio_add_new_anon_rmap(folio, dst_vma, dst_addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, dst_vma); } /* * Must happen after rmap, as mm_counter() checks mapping (via * PageAnon()), which is set by __page_set_anon_rmap(). */ inc_mm_counter(dst_mm, mm_counter(folio)); set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); /* No need to invalidate - it was non-present before */ update_mmu_cache(dst_vma, dst_addr, dst_pte); ret = 0; out_unlock: pte_unmap_unlock(dst_pte, ptl); out: return ret; } static int mfill_atomic_pte_copy(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, uffd_flags_t flags, struct folio **foliop) { void *kaddr; int ret; struct folio *folio; if (!*foliop) { ret = -ENOMEM; folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma, dst_addr); if (!folio) goto out; kaddr = kmap_local_folio(folio, 0); /* * The read mmap_lock is held here. Despite the * mmap_lock being read recursive a deadlock is still * possible if a writer has taken a lock. For example: * * process A thread 1 takes read lock on own mmap_lock * process A thread 2 calls mmap, blocks taking write lock * process B thread 1 takes page fault, read lock on own mmap lock * process B thread 2 calls mmap, blocks taking write lock * process A thread 1 blocks taking read lock on process B * process B thread 1 blocks taking read lock on process A * * Disable page faults to prevent potential deadlock * and retry the copy outside the mmap_lock. */ pagefault_disable(); ret = copy_from_user(kaddr, (const void __user *) src_addr, PAGE_SIZE); pagefault_enable(); kunmap_local(kaddr); /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { ret = -ENOENT; *foliop = folio; /* don't free the page */ goto out; } flush_dcache_folio(folio); } else { folio = *foliop; *foliop = NULL; } /* * The memory barrier inside __folio_mark_uptodate makes sure that * preceding stores to the page contents become visible before * the set_pte_at() write. */ __folio_mark_uptodate(folio); ret = -ENOMEM; if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL)) goto out_release; ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, &folio->page, true, flags); if (ret) goto out_release; out: return ret; out_release: folio_put(folio); goto out; } static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr) { struct folio *folio; int ret = -ENOMEM; folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr); if (!folio) return ret; if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL)) goto out_put; /* * The memory barrier inside __folio_mark_uptodate makes sure that * zeroing out the folio become visible before mapping the page * using set_pte_at(). See do_anonymous_page(). */ __folio_mark_uptodate(folio); ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, &folio->page, true, 0); if (ret) goto out_put; return 0; out_put: folio_put(folio); return ret; } static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr) { pte_t _dst_pte, *dst_pte; spinlock_t *ptl; int ret; if (mm_forbids_zeropage(dst_vma->vm_mm)) return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr); _dst_pte = pte_mkspecial(pfn_pte(zero_pfn(dst_addr), dst_vma->vm_page_prot)); ret = -EAGAIN; dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl); if (!dst_pte) goto out; if (mfill_file_over_size(dst_vma, dst_addr)) { ret = -EFAULT; goto out_unlock; } ret = -EEXIST; if (!pte_none(ptep_get(dst_pte))) goto out_unlock; set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte); /* No need to invalidate - it was non-present before */ update_mmu_cache(dst_vma, dst_addr, dst_pte); ret = 0; out_unlock: pte_unmap_unlock(dst_pte, ptl); out: return ret; } /* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */ static int mfill_atomic_pte_continue(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, uffd_flags_t flags) { struct inode *inode = file_inode(dst_vma->vm_file); pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); struct folio *folio; struct page *page; int ret; ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC); /* Our caller expects us to return -EFAULT if we failed to find folio */ if (ret == -ENOENT) ret = -EFAULT; if (ret) goto out; if (!folio) { ret = -EFAULT; goto out; } page = folio_file_page(folio, pgoff); if (PageHWPoison(page)) { ret = -EIO; goto out_release; } ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, page, false, flags); if (ret) goto out_release; folio_unlock(folio); ret = 0; out: return ret; out_release: folio_unlock(folio); folio_put(folio); goto out; } /* Handles UFFDIO_POISON for all non-hugetlb VMAs. */ static int mfill_atomic_pte_poison(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, uffd_flags_t flags) { int ret; struct mm_struct *dst_mm = dst_vma->vm_mm; pte_t _dst_pte, *dst_pte; spinlock_t *ptl; _dst_pte = make_pte_marker(PTE_MARKER_POISONED); ret = -EAGAIN; dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); if (!dst_pte) goto out; if (mfill_file_over_size(dst_vma, dst_addr)) { ret = -EFAULT; goto out_unlock; } ret = -EEXIST; /* Refuse to overwrite any PTE, even a PTE marker (e.g. UFFD WP). */ if (!pte_none(ptep_get(dst_pte))) goto out_unlock; set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); /* No need to invalidate - it was non-present before */ update_mmu_cache(dst_vma, dst_addr, dst_pte); ret = 0; out_unlock: pte_unmap_unlock(dst_pte, ptl); out: return ret; } static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pgd = pgd_offset(mm, address); p4d = p4d_alloc(mm, pgd, address); if (!p4d) return NULL; pud = pud_alloc(mm, p4d, address); if (!pud) return NULL; /* * Note that we didn't run this because the pmd was * missing, the *pmd may be already established and in * turn it may also be a trans_huge_pmd. */ return pmd_alloc(mm, pud, address); } #ifdef CONFIG_HUGETLB_PAGE /* * mfill_atomic processing for HUGETLB vmas. Note that this routine is * called with either vma-lock or mmap_lock held, it will release the lock * before returning. */ static __always_inline ssize_t mfill_atomic_hugetlb( struct userfaultfd_ctx *ctx, struct vm_area_struct *dst_vma, unsigned long dst_start, unsigned long src_start, unsigned long len, uffd_flags_t flags) { struct mm_struct *dst_mm = dst_vma->vm_mm; ssize_t err; pte_t *dst_pte; unsigned long src_addr, dst_addr; long copied; struct folio *folio; unsigned long vma_hpagesize; pgoff_t idx; u32 hash; struct address_space *mapping; /* * There is no default zero huge page for all huge page sizes as * supported by hugetlb. A PMD_SIZE huge pages may exist as used * by THP. Since we can not reliably insert a zero page, this * feature is not supported. */ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) { up_read(&ctx->map_changing_lock); uffd_mfill_unlock(dst_vma); return -EINVAL; } src_addr = src_start; dst_addr = dst_start; copied = 0; folio = NULL; vma_hpagesize = vma_kernel_pagesize(dst_vma); /* * Validate alignment based on huge page size */ err = -EINVAL; if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1)) goto out_unlock; retry: /* * On routine entry dst_vma is set. If we had to drop mmap_lock and * retry, dst_vma will be set to NULL and we must lookup again. */ if (!dst_vma) { dst_vma = uffd_mfill_lock(dst_mm, dst_start, len); if (IS_ERR(dst_vma)) { err = PTR_ERR(dst_vma); goto out; } err = -ENOENT; if (!is_vm_hugetlb_page(dst_vma)) goto out_unlock_vma; err = -EINVAL; if (vma_hpagesize != vma_kernel_pagesize(dst_vma)) goto out_unlock_vma; /* * If memory mappings are changing because of non-cooperative * operation (e.g. mremap) running in parallel, bail out and * request the user to retry later */ down_read(&ctx->map_changing_lock); err = -EAGAIN; if (atomic_read(&ctx->mmap_changing)) goto out_unlock; } while (src_addr < src_start + len) { VM_WARN_ON_ONCE(dst_addr >= dst_start + len); /* * Serialize via vma_lock and hugetlb_fault_mutex. * vma_lock ensures the dst_pte remains valid even * in the case of shared pmds. fault mutex prevents * races with other faulting threads. */ idx = hugetlb_linear_page_index(dst_vma, dst_addr); mapping = dst_vma->vm_file->f_mapping; hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_lock(&hugetlb_fault_mutex_table[hash]); hugetlb_vma_lock_read(dst_vma); err = -ENOMEM; dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize); if (!dst_pte) { hugetlb_vma_unlock_read(dst_vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); goto out_unlock; } if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) { const pte_t ptep = huge_ptep_get(dst_mm, dst_addr, dst_pte); if (!huge_pte_none(ptep) && !pte_is_uffd_marker(ptep)) { err = -EEXIST; hugetlb_vma_unlock_read(dst_vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); goto out_unlock; } } err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr, src_addr, flags, &folio); hugetlb_vma_unlock_read(dst_vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); cond_resched(); if (unlikely(err == -ENOENT)) { up_read(&ctx->map_changing_lock); uffd_mfill_unlock(dst_vma); VM_WARN_ON_ONCE(!folio); err = copy_folio_from_user(folio, (const void __user *)src_addr, true); if (unlikely(err)) { err = -EFAULT; goto out; } dst_vma = NULL; goto retry; } else VM_WARN_ON_ONCE(folio); if (!err) { dst_addr += vma_hpagesize; src_addr += vma_hpagesize; copied += vma_hpagesize; if (fatal_signal_pending(current)) err = -EINTR; } if (err) break; } out_unlock: up_read(&ctx->map_changing_lock); out_unlock_vma: uffd_mfill_unlock(dst_vma); out: if (folio) folio_put(folio); VM_WARN_ON_ONCE(copied < 0); VM_WARN_ON_ONCE(err > 0); VM_WARN_ON_ONCE(!copied && !err); return copied ? copied : err; } #else /* !CONFIG_HUGETLB_PAGE */ /* fail at build time if gcc attempts to use this */ extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx, struct vm_area_struct *dst_vma, unsigned long dst_start, unsigned long src_start, unsigned long len, uffd_flags_t flags); #endif /* CONFIG_HUGETLB_PAGE */ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, uffd_flags_t flags, struct folio **foliop) { ssize_t err; if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) { return mfill_atomic_pte_continue(dst_pmd, dst_vma, dst_addr, flags); } else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) { return mfill_atomic_pte_poison(dst_pmd, dst_vma, dst_addr, flags); } /* * The normal page fault path for a shmem will invoke the * fault, fill the hole in the file and COW it right away. The * result generates plain anonymous memory. So when we are * asked to fill an hole in a MAP_PRIVATE shmem mapping, we'll * generate anonymous memory directly without actually filling * the hole. For the MAP_PRIVATE case the robustness check * only happens in the pagetable (to verify it's still none) * and not in the radix tree. */ if (!(dst_vma->vm_flags & VM_SHARED)) { if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) err = mfill_atomic_pte_copy(dst_pmd, dst_vma, dst_addr, src_addr, flags, foliop); else err = mfill_atomic_pte_zeropage(dst_pmd, dst_vma, dst_addr); } else { err = shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, src_addr, flags, foliop); } return err; } static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long src_start, unsigned long len, uffd_flags_t flags) { struct mm_struct *dst_mm = ctx->mm; struct vm_area_struct *dst_vma; ssize_t err; pmd_t *dst_pmd; unsigned long src_addr, dst_addr; long copied; struct folio *folio; /* * Sanitize the command parameters: */ VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK); VM_WARN_ON_ONCE(len & ~PAGE_MASK); /* Does the address range wrap, or is the span zero-sized? */ VM_WARN_ON_ONCE(src_start + len <= src_start); VM_WARN_ON_ONCE(dst_start + len <= dst_start); src_addr = src_start; dst_addr = dst_start; copied = 0; folio = NULL; retry: /* * Make sure the vma is not shared, that the dst range is * both valid and fully within a single existing vma. */ dst_vma = uffd_mfill_lock(dst_mm, dst_start, len); if (IS_ERR(dst_vma)) { err = PTR_ERR(dst_vma); goto out; } /* * If memory mappings are changing because of non-cooperative * operation (e.g. mremap) running in parallel, bail out and * request the user to retry later */ down_read(&ctx->map_changing_lock); err = -EAGAIN; if (atomic_read(&ctx->mmap_changing)) goto out_unlock; err = -EINVAL; /* * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but * it will overwrite vm_ops, so vma_is_anonymous must return false. */ if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) && dst_vma->vm_flags & VM_SHARED)) goto out_unlock; /* * validate 'mode' now that we know the dst_vma: don't allow * a wrprotect copy if the userfaultfd didn't register as WP. */ if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP)) goto out_unlock; /* * If this is a HUGETLB vma, pass off to appropriate routine */ if (is_vm_hugetlb_page(dst_vma)) return mfill_atomic_hugetlb(ctx, dst_vma, dst_start, src_start, len, flags); if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) goto out_unlock; if (!vma_is_shmem(dst_vma) && uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) goto out_unlock; while (src_addr < src_start + len) { pmd_t dst_pmdval; VM_WARN_ON_ONCE(dst_addr >= dst_start + len); dst_pmd = mm_alloc_pmd(dst_mm, dst_addr); if (unlikely(!dst_pmd)) { err = -ENOMEM; break; } dst_pmdval = pmdp_get_lockless(dst_pmd); if (unlikely(pmd_none(dst_pmdval)) && unlikely(__pte_alloc(dst_mm, dst_pmd))) { err = -ENOMEM; break; } dst_pmdval = pmdp_get_lockless(dst_pmd); /* * If the dst_pmd is THP don't override it and just be strict. * (This includes the case where the PMD used to be THP and * changed back to none after __pte_alloc().) */ if (unlikely(!pmd_present(dst_pmdval) || pmd_trans_huge(dst_pmdval))) { err = -EEXIST; break; } if (unlikely(pmd_bad(dst_pmdval))) { err = -EFAULT; break; } /* * For shmem mappings, khugepaged is allowed to remove page * tables under us; pte_offset_map_lock() will deal with that. */ err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, src_addr, flags, &folio); cond_resched(); if (unlikely(err == -ENOENT)) { void *kaddr; up_read(&ctx->map_changing_lock); uffd_mfill_unlock(dst_vma); VM_WARN_ON_ONCE(!folio); kaddr = kmap_local_folio(folio, 0); err = copy_from_user(kaddr, (const void __user *) src_addr, PAGE_SIZE); kunmap_local(kaddr); if (unlikely(err)) { err = -EFAULT; goto out; } flush_dcache_folio(folio); goto retry; } else VM_WARN_ON_ONCE(folio); if (!err) { dst_addr += PAGE_SIZE; src_addr += PAGE_SIZE; copied += PAGE_SIZE; if (fatal_signal_pending(current)) err = -EINTR; } if (err) break; } out_unlock: up_read(&ctx->map_changing_lock); uffd_mfill_unlock(dst_vma); out: if (folio) folio_put(folio); VM_WARN_ON_ONCE(copied < 0); VM_WARN_ON_ONCE(err > 0); VM_WARN_ON_ONCE(!copied && !err); return copied ? copied : err; } ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long src_start, unsigned long len, uffd_flags_t flags) { return mfill_atomic(ctx, dst_start, src_start, len, uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY)); } ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx, unsigned long start, unsigned long len) { return mfill_atomic(ctx, start, 0, len, uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE)); } ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start, unsigned long len, uffd_flags_t flags) { /* * A caller might reasonably assume that UFFDIO_CONTINUE contains an * smp_wmb() to ensure that any writes to the about-to-be-mapped page by * the thread doing the UFFDIO_CONTINUE are guaranteed to be visible to * subsequent loads from the page through the newly mapped address range. */ smp_wmb(); return mfill_atomic(ctx, start, 0, len, uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE)); } ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start, unsigned long len, uffd_flags_t flags) { return mfill_atomic(ctx, start, 0, len, uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON)); } long uffd_wp_range(struct vm_area_struct *dst_vma, unsigned long start, unsigned long len, bool enable_wp) { unsigned int mm_cp_flags; struct mmu_gather tlb; long ret; VM_WARN_ONCE(start < dst_vma->vm_start || start + len > dst_vma->vm_end, "The address range exceeds VMA boundary.\n"); if (enable_wp) mm_cp_flags = MM_CP_UFFD_WP; else mm_cp_flags = MM_CP_UFFD_WP_RESOLVE; /* * vma->vm_page_prot already reflects that uffd-wp is enabled for this * VMA (see userfaultfd_set_vm_flags()) and that all PTEs are supposed * to be write-protected as default whenever protection changes. * Try upgrading write permissions manually. */ if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma)) mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; tlb_gather_mmu(&tlb, dst_vma->vm_mm); ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags); tlb_finish_mmu(&tlb); return ret; } int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start, unsigned long len, bool enable_wp) { struct mm_struct *dst_mm = ctx->mm; unsigned long end = start + len; unsigned long _start, _end; struct vm_area_struct *dst_vma; unsigned long page_mask; long err; VMA_ITERATOR(vmi, dst_mm, start); /* * Sanitize the command parameters: */ VM_WARN_ON_ONCE(start & ~PAGE_MASK); VM_WARN_ON_ONCE(len & ~PAGE_MASK); /* Does the address range wrap, or is the span zero-sized? */ VM_WARN_ON_ONCE(start + len <= start); mmap_read_lock(dst_mm); /* * If memory mappings are changing because of non-cooperative * operation (e.g. mremap) running in parallel, bail out and * request the user to retry later */ down_read(&ctx->map_changing_lock); err = -EAGAIN; if (atomic_read(&ctx->mmap_changing)) goto out_unlock; err = -ENOENT; for_each_vma_range(vmi, dst_vma, end) { if (!userfaultfd_wp(dst_vma)) { err = -ENOENT; break; } if (is_vm_hugetlb_page(dst_vma)) { err = -EINVAL; page_mask = vma_kernel_pagesize(dst_vma) - 1; if ((start & page_mask) || (len & page_mask)) break; } _start = max(dst_vma->vm_start, start); _end = min(dst_vma->vm_end, end); err = uffd_wp_range(dst_vma, _start, _end - _start, enable_wp); /* Return 0 on success, <0 on failures */ if (err < 0) break; err = 0; } out_unlock: up_read(&ctx->map_changing_lock); mmap_read_unlock(dst_mm); return err; } void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2) __acquires(ptl1) __acquires(ptl2) { if (ptl1 > ptl2) swap(ptl1, ptl2); /* lock in virtual address order to avoid lock inversion */ spin_lock(ptl1); if (ptl1 != ptl2) spin_lock_nested(ptl2, SINGLE_DEPTH_NESTING); else __acquire(ptl2); } void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2) __releases(ptl1) __releases(ptl2) { spin_unlock(ptl1); if (ptl1 != ptl2) spin_unlock(ptl2); else __release(ptl2); } static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte, pte_t orig_dst_pte, pte_t orig_src_pte, pmd_t *dst_pmd, pmd_t dst_pmdval) { return pte_same(ptep_get(src_pte), orig_src_pte) && pte_same(ptep_get(dst_pte), orig_dst_pte) && pmd_same(dst_pmdval, pmdp_get_lockless(dst_pmd)); } /* * Checks if the two ptes and the corresponding folio are eligible for batched * move. If so, then returns pointer to the locked folio. Otherwise, returns NULL. * * NOTE: folio's reference is not required as the whole operation is within * PTL's critical section. */ static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma, unsigned long src_addr, pte_t *src_pte, pte_t *dst_pte) { pte_t orig_dst_pte, orig_src_pte; struct folio *folio; orig_dst_pte = ptep_get(dst_pte); if (!pte_none(orig_dst_pte)) return NULL; orig_src_pte = ptep_get(src_pte); if (!pte_present(orig_src_pte) || is_zero_pfn(pte_pfn(orig_src_pte))) return NULL; folio = vm_normal_folio(src_vma, src_addr, orig_src_pte); if (!folio || !folio_trylock(folio)) return NULL; if (!PageAnonExclusive(&folio->page) || folio_test_large(folio)) { folio_unlock(folio); return NULL; } return folio; } /* * Moves src folios to dst in a batch as long as they are not large, and can * successfully take the lock via folio_trylock(). */ static long move_present_ptes(struct mm_struct *mm, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, unsigned long dst_addr, unsigned long src_addr, pte_t *dst_pte, pte_t *src_pte, pte_t orig_dst_pte, pte_t orig_src_pte, pmd_t *dst_pmd, pmd_t dst_pmdval, spinlock_t *dst_ptl, spinlock_t *src_ptl, struct folio **first_src_folio, unsigned long len) { int err = 0; struct folio *src_folio = *first_src_folio; unsigned long src_start = src_addr; unsigned long src_end; len = pmd_addr_end(dst_addr, dst_addr + len) - dst_addr; src_end = pmd_addr_end(src_addr, src_addr + len); flush_cache_range(src_vma, src_addr, src_end); double_pt_lock(dst_ptl, src_ptl); if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval)) { err = -EAGAIN; goto out; } if (folio_test_large(src_folio) || folio_maybe_dma_pinned(src_folio) || !PageAnonExclusive(&src_folio->page)) { err = -EBUSY; goto out; } /* It's safe to drop the reference now as the page-table is holding one. */ folio_put(*first_src_folio); *first_src_folio = NULL; lazy_mmu_mode_enable(); while (true) { orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); /* Folio got pinned from under us. Put it back and fail the move. */ if (folio_maybe_dma_pinned(src_folio)) { set_pte_at(mm, src_addr, src_pte, orig_src_pte); err = -EBUSY; break; } folio_move_anon_rmap(src_folio, dst_vma); src_folio->index = linear_page_index(dst_vma, dst_addr); orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot); /* Set soft dirty bit so userspace can notice the pte was moved */ if (pgtable_supports_soft_dirty()) orig_dst_pte = pte_mksoft_dirty(orig_dst_pte); if (pte_dirty(orig_src_pte)) orig_dst_pte = pte_mkdirty(orig_dst_pte); orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma); set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte); src_addr += PAGE_SIZE; if (src_addr == src_end) break; dst_addr += PAGE_SIZE; dst_pte++; src_pte++; folio_unlock(src_folio); src_folio = check_ptes_for_batched_move(src_vma, src_addr, src_pte, dst_pte); if (!src_folio) break; } lazy_mmu_mode_disable(); if (src_addr > src_start) flush_tlb_range(src_vma, src_start, src_addr); if (src_folio) folio_unlock(src_folio); out: double_pt_unlock(dst_ptl, src_ptl); return src_addr > src_start ? src_addr - src_start : err; } static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, pte_t *dst_pte, pte_t *src_pte, pte_t orig_dst_pte, pte_t orig_src_pte, pmd_t *dst_pmd, pmd_t dst_pmdval, spinlock_t *dst_ptl, spinlock_t *src_ptl, struct folio *src_folio, struct swap_info_struct *si, swp_entry_t entry) { /* * Check if the folio still belongs to the target swap entry after * acquiring the lock. Folio can be freed in the swap cache while * not locked. */ if (src_folio && unlikely(!folio_test_swapcache(src_folio) || entry.val != src_folio->swap.val)) return -EAGAIN; double_pt_lock(dst_ptl, src_ptl); if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval)) { double_pt_unlock(dst_ptl, src_ptl); return -EAGAIN; } /* * The src_folio resides in the swapcache, requiring an update to its * index and mapping to align with the dst_vma, where a swap-in may * occur and hit the swapcache after moving the PTE. */ if (src_folio) { folio_move_anon_rmap(src_folio, dst_vma); src_folio->index = linear_page_index(dst_vma, dst_addr); } else { /* * Check if the swap entry is cached after acquiring the src_pte * lock. Otherwise, we might miss a newly loaded swap cache folio. * * We are trying to catch newly added swap cache, the only possible case is * when a folio is swapped in and out again staying in swap cache, using the * same entry before the PTE check above. The PTL is acquired and released * twice, each time after updating the swap table. So holding * the PTL here ensures we see the updated value. */ if (swap_cache_has_folio(entry)) { double_pt_unlock(dst_ptl, src_ptl); return -EAGAIN; } } orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); if (pgtable_supports_soft_dirty()) orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte); set_pte_at(mm, dst_addr, dst_pte, orig_src_pte); double_pt_unlock(dst_ptl, src_ptl); return PAGE_SIZE; } static int move_zeropage_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, unsigned long dst_addr, unsigned long src_addr, pte_t *dst_pte, pte_t *src_pte, pte_t orig_dst_pte, pte_t orig_src_pte, pmd_t *dst_pmd, pmd_t dst_pmdval, spinlock_t *dst_ptl, spinlock_t *src_ptl) { pte_t zero_pte; double_pt_lock(dst_ptl, src_ptl); if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval)) { double_pt_unlock(dst_ptl, src_ptl); return -EAGAIN; } zero_pte = pte_mkspecial(pfn_pte(zero_pfn(dst_addr), dst_vma->vm_page_prot)); ptep_clear_flush(src_vma, src_addr, src_pte); set_pte_at(mm, dst_addr, dst_pte, zero_pte); double_pt_unlock(dst_ptl, src_ptl); return PAGE_SIZE; } /* * The mmap_lock for reading is held by the caller. Just move the page(s) * from src_pmd to dst_pmd if possible, and return number of bytes moved. * On failure, an error code is returned. */ static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, unsigned long dst_addr, unsigned long src_addr, unsigned long len, __u64 mode) { struct swap_info_struct *si = NULL; pte_t orig_src_pte, orig_dst_pte; pte_t src_folio_pte; spinlock_t *src_ptl, *dst_ptl; pte_t *src_pte = NULL; pte_t *dst_pte = NULL; pmd_t dummy_pmdval; pmd_t dst_pmdval; struct folio *src_folio = NULL; struct mmu_notifier_range range; long ret = 0; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, src_addr, src_addr + len); mmu_notifier_invalidate_range_start(&range); retry: /* * Use the maywrite version to indicate that dst_pte will be modified, * since dst_pte needs to be none, the subsequent pte_same() check * cannot prevent the dst_pte page from being freed concurrently, so we * also need to obtain dst_pmdval and recheck pmd_same() later. */ dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dst_pmdval, &dst_ptl); /* Retry if a huge pmd materialized from under us */ if (unlikely(!dst_pte)) { ret = -EAGAIN; goto out; } /* * Unlike dst_pte, the subsequent pte_same() check can ensure the * stability of the src_pte page, so there is no need to get pmdval, * just pass a dummy variable to it. */ src_pte = pte_offset_map_rw_nolock(mm, src_pmd, src_addr, &dummy_pmdval, &src_ptl); /* * We held the mmap_lock for reading so MADV_DONTNEED * can zap transparent huge pages under us, or the * transparent huge page fault can establish new * transparent huge pages under us. */ if (unlikely(!src_pte)) { ret = -EAGAIN; goto out; } /* Sanity checks before the operation */ if (pmd_none(*dst_pmd) || pmd_none(*src_pmd) || pmd_trans_huge(*dst_pmd) || pmd_trans_huge(*src_pmd)) { ret = -EINVAL; goto out; } spin_lock(dst_ptl); orig_dst_pte = ptep_get(dst_pte); spin_unlock(dst_ptl); if (!pte_none(orig_dst_pte)) { ret = -EEXIST; goto out; } spin_lock(src_ptl); orig_src_pte = ptep_get(src_pte); spin_unlock(src_ptl); if (pte_none(orig_src_pte)) { if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) ret = -ENOENT; else /* nothing to do to move a hole */ ret = PAGE_SIZE; goto out; } /* If PTE changed after we locked the folio then start over */ if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) { ret = -EAGAIN; goto out; } if (pte_present(orig_src_pte)) { if (is_zero_pfn(pte_pfn(orig_src_pte))) { ret = move_zeropage_pte(mm, dst_vma, src_vma, dst_addr, src_addr, dst_pte, src_pte, orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval, dst_ptl, src_ptl); goto out; } /* * Pin and lock source folio. Since we are in RCU read section, * we can't block, so on contention have to unmap the ptes, * obtain the lock and retry. */ if (!src_folio) { struct folio *folio; bool locked; /* * Pin the page while holding the lock to be sure the * page isn't freed under us */ spin_lock(src_ptl); if (!pte_same(orig_src_pte, ptep_get(src_pte))) { spin_unlock(src_ptl); ret = -EAGAIN; goto out; } folio = vm_normal_folio(src_vma, src_addr, orig_src_pte); if (!folio || !PageAnonExclusive(&folio->page)) { spin_unlock(src_ptl); ret = -EBUSY; goto out; } locked = folio_trylock(folio); /* * We avoid waiting for folio lock with a raised * refcount for large folios because extra refcounts * will result in split_folio() failing later and * retrying. If multiple tasks are trying to move a * large folio we can end up livelocking. */ if (!locked && folio_test_large(folio)) { spin_unlock(src_ptl); ret = -EAGAIN; goto out; } folio_get(folio); src_folio = folio; src_folio_pte = orig_src_pte; spin_unlock(src_ptl); if (!locked) { pte_unmap(src_pte); pte_unmap(dst_pte); src_pte = dst_pte = NULL; /* now we can block and wait */ folio_lock(src_folio); goto retry; } if (WARN_ON_ONCE(!folio_test_anon(src_folio))) { ret = -EBUSY; goto out; } } /* at this point we have src_folio locked */ if (folio_test_large(src_folio)) { /* split_folio() can block */ pte_unmap(src_pte); pte_unmap(dst_pte); src_pte = dst_pte = NULL; ret = split_folio(src_folio); if (ret) goto out; /* have to reacquire the folio after it got split */ folio_unlock(src_folio); folio_put(src_folio); src_folio = NULL; goto retry; } ret = move_present_ptes(mm, dst_vma, src_vma, dst_addr, src_addr, dst_pte, src_pte, orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval, dst_ptl, src_ptl, &src_folio, len); } else { /* !pte_present() */ struct folio *folio = NULL; const softleaf_t entry = softleaf_from_pte(orig_src_pte); if (softleaf_is_migration(entry)) { pte_unmap(src_pte); pte_unmap(dst_pte); src_pte = dst_pte = NULL; migration_entry_wait(mm, src_pmd, src_addr); ret = -EAGAIN; goto out; } else if (!softleaf_is_swap(entry)) { ret = -EFAULT; goto out; } if (!pte_swp_exclusive(orig_src_pte)) { ret = -EBUSY; goto out; } si = get_swap_device(entry); if (unlikely(!si)) { ret = -EAGAIN; goto out; } /* * Verify the existence of the swapcache. If present, the folio's * index and mapping must be updated even when the PTE is a swap * entry. The anon_vma lock is not taken during this process since * the folio has already been unmapped, and the swap entry is * exclusive, preventing rmap walks. * * For large folios, return -EBUSY immediately, as split_folio() * also returns -EBUSY when attempting to split unmapped large * folios in the swapcache. This issue needs to be resolved * separately to allow proper handling. */ if (!src_folio) folio = swap_cache_get_folio(entry); if (folio) { if (folio_test_large(folio)) { ret = -EBUSY; folio_put(folio); goto out; } src_folio = folio; src_folio_pte = orig_src_pte; if (!folio_trylock(src_folio)) { pte_unmap(src_pte); pte_unmap(dst_pte); src_pte = dst_pte = NULL; put_swap_device(si); si = NULL; /* now we can block and wait */ folio_lock(src_folio); goto retry; } } ret = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte, orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval, dst_ptl, src_ptl, src_folio, si, entry); } out: if (src_folio) { folio_unlock(src_folio); folio_put(src_folio); } /* * Unmap in reverse order (LIFO) to maintain proper kmap_local * index ordering when CONFIG_HIGHPTE is enabled. We mapped dst_pte * first, then src_pte, so we must unmap src_pte first, then dst_pte. */ if (src_pte) pte_unmap(src_pte); if (dst_pte) pte_unmap(dst_pte); mmu_notifier_invalidate_range_end(&range); if (si) put_swap_device(si); return ret; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline bool move_splits_huge_pmd(unsigned long dst_addr, unsigned long src_addr, unsigned long src_end) { return (src_addr & ~HPAGE_PMD_MASK) || (dst_addr & ~HPAGE_PMD_MASK) || src_end - src_addr < HPAGE_PMD_SIZE; } #else static inline bool move_splits_huge_pmd(unsigned long dst_addr, unsigned long src_addr, unsigned long src_end) { /* This is unreachable anyway, just to avoid warnings when HPAGE_PMD_SIZE==0 */ return false; } #endif static inline bool vma_move_compatible(struct vm_area_struct *vma) { return !(vma->vm_flags & (VM_PFNMAP | VM_IO | VM_HUGETLB | VM_MIXEDMAP | VM_SHADOW_STACK)); } static int validate_move_areas(struct userfaultfd_ctx *ctx, struct vm_area_struct *src_vma, struct vm_area_struct *dst_vma) { /* Only allow moving if both have the same access and protection */ if ((src_vma->vm_flags & VM_ACCESS_FLAGS) != (dst_vma->vm_flags & VM_ACCESS_FLAGS) || pgprot_val(src_vma->vm_page_prot) != pgprot_val(dst_vma->vm_page_prot)) return -EINVAL; /* Only allow moving if both are mlocked or both aren't */ if ((src_vma->vm_flags & VM_LOCKED) != (dst_vma->vm_flags & VM_LOCKED)) return -EINVAL; /* * For now, we keep it simple and only move between writable VMAs. * Access flags are equal, therefore checking only the source is enough. */ if (!(src_vma->vm_flags & VM_WRITE)) return -EINVAL; /* Check if vma flags indicate content which can be moved */ if (!vma_move_compatible(src_vma) || !vma_move_compatible(dst_vma)) return -EINVAL; /* Ensure dst_vma is registered in uffd we are operating on */ if (!dst_vma->vm_userfaultfd_ctx.ctx || dst_vma->vm_userfaultfd_ctx.ctx != ctx) return -EINVAL; /* Only allow moving across anonymous vmas */ if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma)) return -EINVAL; return 0; } static __always_inline int find_vmas_mm_locked(struct mm_struct *mm, unsigned long dst_start, unsigned long src_start, struct vm_area_struct **dst_vmap, struct vm_area_struct **src_vmap) { struct vm_area_struct *vma; mmap_assert_locked(mm); vma = find_vma_and_prepare_anon(mm, dst_start); if (IS_ERR(vma)) return PTR_ERR(vma); *dst_vmap = vma; /* Skip finding src_vma if src_start is in dst_vma */ if (src_start >= vma->vm_start && src_start < vma->vm_end) goto out_success; vma = vma_lookup(mm, src_start); if (!vma) return -ENOENT; out_success: *src_vmap = vma; return 0; } #ifdef CONFIG_PER_VMA_LOCK static int uffd_move_lock(struct mm_struct *mm, unsigned long dst_start, unsigned long src_start, struct vm_area_struct **dst_vmap, struct vm_area_struct **src_vmap) { struct vm_area_struct *vma; int err; vma = uffd_lock_vma(mm, dst_start); if (IS_ERR(vma)) return PTR_ERR(vma); *dst_vmap = vma; /* * Skip finding src_vma if src_start is in dst_vma. This also ensures * that we don't lock the same vma twice. */ if (src_start >= vma->vm_start && src_start < vma->vm_end) { *src_vmap = vma; return 0; } /* * Using uffd_lock_vma() to get src_vma can lead to following deadlock: * * Thread1 Thread2 * ------- ------- * vma_start_read(dst_vma) * mmap_write_lock(mm) * vma_start_write(src_vma) * vma_start_read(src_vma) * mmap_read_lock(mm) * vma_start_write(dst_vma) */ *src_vmap = lock_vma_under_rcu(mm, src_start); if (likely(*src_vmap)) return 0; /* Undo any locking and retry in mmap_lock critical section */ vma_end_read(*dst_vmap); mmap_read_lock(mm); err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap); if (err) goto out; if (!vma_start_read_locked(*dst_vmap)) { err = -EAGAIN; goto out; } /* Nothing further to do if both vmas are locked. */ if (*dst_vmap == *src_vmap) goto out; if (!vma_start_read_locked_nested(*src_vmap, SINGLE_DEPTH_NESTING)) { /* Undo dst_vmap locking if src_vmap failed to lock */ vma_end_read(*dst_vmap); err = -EAGAIN; } out: mmap_read_unlock(mm); return err; } static void uffd_move_unlock(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { vma_end_read(src_vma); if (src_vma != dst_vma) vma_end_read(dst_vma); } #else static int uffd_move_lock(struct mm_struct *mm, unsigned long dst_start, unsigned long src_start, struct vm_area_struct **dst_vmap, struct vm_area_struct **src_vmap) { int err; mmap_read_lock(mm); err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap); if (err) mmap_read_unlock(mm); return err; } static void uffd_move_unlock(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { mmap_assert_locked(src_vma->vm_mm); mmap_read_unlock(dst_vma->vm_mm); } #endif /** * move_pages - move arbitrary anonymous pages of an existing vma * @ctx: pointer to the userfaultfd context * @dst_start: start of the destination virtual memory range * @src_start: start of the source virtual memory range * @len: length of the virtual memory range * @mode: flags from uffdio_move.mode * * It will either use the mmap_lock in read mode or per-vma locks * * move_pages() remaps arbitrary anonymous pages atomically in zero * copy. It only works on non shared anonymous pages because those can * be relocated without generating non linear anon_vmas in the rmap * code. * * It provides a zero copy mechanism to handle userspace page faults. * The source vma pages should have mapcount == 1, which can be * enforced by using madvise(MADV_DONTFORK) on src vma. * * The thread receiving the page during the userland page fault * will receive the faulting page in the source vma through the network, * storage or any other I/O device (MADV_DONTFORK in the source vma * avoids move_pages() to fail with -EBUSY if the process forks before * move_pages() is called), then it will call move_pages() to map the * page in the faulting address in the destination vma. * * This userfaultfd command works purely via pagetables, so it's the * most efficient way to move physical non shared anonymous pages * across different virtual addresses. Unlike mremap()/mmap()/munmap() * it does not create any new vmas. The mapping in the destination * address is atomic. * * It only works if the vma protection bits are identical from the * source and destination vma. * * It can remap non shared anonymous pages within the same vma too. * * If the source virtual memory range has any unmapped holes, or if * the destination virtual memory range is not a whole unmapped hole, * move_pages() will fail respectively with -ENOENT or -EEXIST. This * provides a very strict behavior to avoid any chance of memory * corruption going unnoticed if there are userland race conditions. * Only one thread should resolve the userland page fault at any given * time for any given faulting address. This means that if two threads * try to both call move_pages() on the same destination address at the * same time, the second thread will get an explicit error from this * command. * * The command retval will return "len" is successful. The command * however can be interrupted by fatal signals or errors. If * interrupted it will return the number of bytes successfully * remapped before the interruption if any, or the negative error if * none. It will never return zero. Either it will return an error or * an amount of bytes successfully moved. If the retval reports a * "short" remap, the move_pages() command should be repeated by * userland with src+retval, dst+reval, len-retval if it wants to know * about the error that interrupted it. * * The UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES flag can be specified to * prevent -ENOENT errors to materialize if there are holes in the * source virtual range that is being remapped. The holes will be * accounted as successfully remapped in the retval of the * command. This is mostly useful to remap hugepage naturally aligned * virtual regions without knowing if there are transparent hugepage * in the regions or not, but preventing the risk of having to split * the hugepmd during the remap. */ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long src_start, unsigned long len, __u64 mode) { struct mm_struct *mm = ctx->mm; struct vm_area_struct *src_vma, *dst_vma; unsigned long src_addr, dst_addr, src_end; pmd_t *src_pmd, *dst_pmd; long err = -EINVAL; ssize_t moved = 0; /* Sanitize the command parameters. */ VM_WARN_ON_ONCE(src_start & ~PAGE_MASK); VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK); VM_WARN_ON_ONCE(len & ~PAGE_MASK); /* Does the address range wrap, or is the span zero-sized? */ VM_WARN_ON_ONCE(src_start + len < src_start); VM_WARN_ON_ONCE(dst_start + len < dst_start); err = uffd_move_lock(mm, dst_start, src_start, &dst_vma, &src_vma); if (err) goto out; /* Re-check after taking map_changing_lock */ err = -EAGAIN; down_read(&ctx->map_changing_lock); if (likely(atomic_read(&ctx->mmap_changing))) goto out_unlock; /* * Make sure the vma is not shared, that the src and dst remap * ranges are both valid and fully within a single existing * vma. */ err = -EINVAL; if (src_vma->vm_flags & VM_SHARED) goto out_unlock; if (src_start + len > src_vma->vm_end) goto out_unlock; if (dst_vma->vm_flags & VM_SHARED) goto out_unlock; if (dst_start + len > dst_vma->vm_end) goto out_unlock; err = validate_move_areas(ctx, src_vma, dst_vma); if (err) goto out_unlock; for (src_addr = src_start, dst_addr = dst_start, src_end = src_start + len; src_addr < src_end;) { spinlock_t *ptl; pmd_t dst_pmdval; unsigned long step_size; /* * Below works because anonymous area would not have a * transparent huge PUD. If file-backed support is added, * that case would need to be handled here. */ src_pmd = mm_find_pmd(mm, src_addr); if (unlikely(!src_pmd)) { if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) { err = -ENOENT; break; } src_pmd = mm_alloc_pmd(mm, src_addr); if (unlikely(!src_pmd)) { err = -ENOMEM; break; } } dst_pmd = mm_alloc_pmd(mm, dst_addr); if (unlikely(!dst_pmd)) { err = -ENOMEM; break; } dst_pmdval = pmdp_get_lockless(dst_pmd); /* * If the dst_pmd is mapped as THP don't override it and just * be strict. If dst_pmd changes into TPH after this check, the * move_pages_huge_pmd() will detect the change and retry * while move_pages_pte() will detect the change and fail. */ if (unlikely(pmd_trans_huge(dst_pmdval))) { err = -EEXIST; break; } ptl = pmd_trans_huge_lock(src_pmd, src_vma); if (ptl) { /* Check if we can move the pmd without splitting it. */ if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) || !pmd_none(dst_pmdval)) { /* Can be a migration entry */ if (pmd_present(*src_pmd)) { struct folio *folio = pmd_folio(*src_pmd); if (!is_huge_zero_folio(folio) && !PageAnonExclusive(&folio->page)) { spin_unlock(ptl); err = -EBUSY; break; } } spin_unlock(ptl); split_huge_pmd(src_vma, src_pmd, src_addr); /* The folio will be split by move_pages_pte() */ continue; } err = move_pages_huge_pmd(mm, dst_pmd, src_pmd, dst_pmdval, dst_vma, src_vma, dst_addr, src_addr); step_size = HPAGE_PMD_SIZE; } else { long ret; if (pmd_none(*src_pmd)) { if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) { err = -ENOENT; break; } if (unlikely(__pte_alloc(mm, src_pmd))) { err = -ENOMEM; break; } } if (unlikely(pte_alloc(mm, dst_pmd))) { err = -ENOMEM; break; } ret = move_pages_ptes(mm, dst_pmd, src_pmd, dst_vma, src_vma, dst_addr, src_addr, src_end - src_addr, mode); if (ret < 0) err = ret; else step_size = ret; } cond_resched(); if (fatal_signal_pending(current)) { /* Do not override an error */ if (!err || err == -EAGAIN) err = -EINTR; break; } if (err) { if (err == -EAGAIN) continue; break; } /* Proceed to the next page */ dst_addr += step_size; src_addr += step_size; moved += step_size; } out_unlock: up_read(&ctx->map_changing_lock); uffd_move_unlock(dst_vma, src_vma); out: VM_WARN_ON_ONCE(moved < 0); VM_WARN_ON_ONCE(err > 0); VM_WARN_ON_ONCE(!moved && !err); return moved ? moved : err; } static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, vm_flags_t vm_flags) { const bool uffd_wp_changed = (vma->vm_flags ^ vm_flags) & VM_UFFD_WP; vm_flags_reset(vma, vm_flags); /* * For shared mappings, we want to enable writenotify while * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply * recalculate vma->vm_page_prot whenever userfaultfd-wp changes. */ if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed) vma_set_page_prot(vma); } static void userfaultfd_set_ctx(struct vm_area_struct *vma, struct userfaultfd_ctx *ctx, vm_flags_t vm_flags) { vma_start_write(vma); vma->vm_userfaultfd_ctx = (struct vm_userfaultfd_ctx){ctx}; userfaultfd_set_vm_flags(vma, (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags); } void userfaultfd_reset_ctx(struct vm_area_struct *vma) { userfaultfd_set_ctx(vma, NULL, 0); } struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end) { struct vm_area_struct *ret; bool give_up_on_oom = false; vma_flags_t new_vma_flags = vma->flags; vma_flags_clear_mask(&new_vma_flags, __VMA_UFFD_FLAGS); /* * If we are modifying only and not splitting, just give up on the merge * if OOM prevents us from merging successfully. */ if (start == vma->vm_start && end == vma->vm_end) give_up_on_oom = true; /* Reset ptes for the whole vma range if wr-protected */ if (userfaultfd_wp(vma)) uffd_wp_range(vma, start, end - start, false); ret = vma_modify_flags_uffd(vmi, prev, vma, start, end, &new_vma_flags, NULL_VM_UFFD_CTX, give_up_on_oom); /* * In the vma_merge() successful mprotect-like case 8: * the next vma was merged into the current one and * the current one has not been updated yet. */ if (!IS_ERR(ret)) userfaultfd_reset_ctx(ret); return ret; } /* Assumes mmap write lock taken, and mm_struct pinned. */ int userfaultfd_register_range(struct userfaultfd_ctx *ctx, struct vm_area_struct *vma, vm_flags_t vm_flags, unsigned long start, unsigned long end, bool wp_async) { vma_flags_t vma_flags = legacy_to_vma_flags(vm_flags); VMA_ITERATOR(vmi, ctx->mm, start); struct vm_area_struct *prev = vma_prev(&vmi); unsigned long vma_end; vma_flags_t new_vma_flags; if (vma->vm_start < start) prev = vma; for_each_vma_range(vmi, vma, end) { cond_resched(); VM_WARN_ON_ONCE(!vma_can_userfault(vma, vm_flags, wp_async)); VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx && vma->vm_userfaultfd_ctx.ctx != ctx); VM_WARN_ON_ONCE(!vma_test(vma, VMA_MAYWRITE_BIT)); /* * Nothing to do: this vma is already registered into this * userfaultfd and with the right tracking mode too. */ if (vma->vm_userfaultfd_ctx.ctx == ctx && vma_test_all_mask(vma, vma_flags)) goto skip; if (vma->vm_start > start) start = vma->vm_start; vma_end = min(end, vma->vm_end); new_vma_flags = vma->flags; vma_flags_clear_mask(&new_vma_flags, __VMA_UFFD_FLAGS); vma_flags_set_mask(&new_vma_flags, vma_flags); vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, &new_vma_flags, (struct vm_userfaultfd_ctx){ctx}, /* give_up_on_oom = */false); if (IS_ERR(vma)) return PTR_ERR(vma); /* * In the vma_merge() successful mprotect-like case 8: * the next vma was merged into the current one and * the current one has not been updated yet. */ userfaultfd_set_ctx(vma, ctx, vm_flags); if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma)) hugetlb_unshare_all_pmds(vma); skip: prev = vma; start = vma->vm_end; } return 0; } void userfaultfd_release_new(struct userfaultfd_ctx *ctx) { struct mm_struct *mm = ctx->mm; struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, 0); /* the various vma->vm_userfaultfd_ctx still points to it */ mmap_write_lock(mm); for_each_vma(vmi, vma) { if (vma->vm_userfaultfd_ctx.ctx == ctx) userfaultfd_reset_ctx(vma); } mmap_write_unlock(mm); } void userfaultfd_release_all(struct mm_struct *mm, struct userfaultfd_ctx *ctx) { struct vm_area_struct *vma, *prev; VMA_ITERATOR(vmi, mm, 0); if (!mmget_not_zero(mm)) return; /* * Flush page faults out of all CPUs. NOTE: all page faults * must be retried without returning VM_FAULT_SIGBUS if * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx * changes while handle_userfault released the mmap_lock. So * it's critical that released is set to true (above), before * taking the mmap_lock for writing. */ mmap_write_lock(mm); prev = NULL; for_each_vma(vmi, vma) { cond_resched(); VM_WARN_ON_ONCE(!!vma->vm_userfaultfd_ctx.ctx ^ !!(vma->vm_flags & __VM_UFFD_FLAGS)); if (vma->vm_userfaultfd_ctx.ctx != ctx) { prev = vma; continue; } vma = userfaultfd_clear_vma(&vmi, prev, vma, vma->vm_start, vma->vm_end); prev = vma; } mmap_write_unlock(mm); mmput(mm); }
11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 /* SPDX-License-Identifier: GPL-2.0+ */ #ifndef _VKMS_CONFIG_H_ #define _VKMS_CONFIG_H_ #include <linux/list.h> #include <linux/types.h> #include <linux/xarray.h> #include <drm/drm_connector.h> #include "vkms_drv.h" /** * struct vkms_config - General configuration for VKMS driver * * @dev_name: Name of the device * @planes: List of planes configured for the device * @crtcs: List of CRTCs configured for the device * @encoders: List of encoders configured for the device * @connectors: List of connectors configured for the device * @dev: Used to store the current VKMS device. Only set when the device is instantiated. */ struct vkms_config { const char *dev_name; struct list_head planes; struct list_head crtcs; struct list_head encoders; struct list_head connectors; struct vkms_device *dev; }; /** * struct vkms_config_plane * * @link: Link to the others planes in vkms_config * @config: The vkms_config this plane belongs to * @type: Type of the plane. The creator of configuration needs to ensures that * at least one primary plane is present. * @possible_crtcs: Array of CRTCs that can be used with this plane * @plane: Internal usage. This pointer should never be considered as valid. * It can be used to store a temporary reference to a VKMS plane during * device creation. This pointer is not managed by the configuration and * must be managed by other means. */ struct vkms_config_plane { struct list_head link; struct vkms_config *config; enum drm_plane_type type; struct xarray possible_crtcs; bool default_pipeline; /* Internal usage */ struct vkms_plane *plane; }; /** * struct vkms_config_crtc * * @link: Link to the others CRTCs in vkms_config * @config: The vkms_config this CRTC belongs to * @writeback: If true, a writeback buffer can be attached to the CRTC * @crtc: Internal usage. This pointer should never be considered as valid. * It can be used to store a temporary reference to a VKMS CRTC during * device creation. This pointer is not managed by the configuration and * must be managed by other means. */ struct vkms_config_crtc { struct list_head link; struct vkms_config *config; bool writeback; /* Internal usage */ struct vkms_output *crtc; }; /** * struct vkms_config_encoder * * @link: Link to the others encoders in vkms_config * @config: The vkms_config this CRTC belongs to * @possible_crtcs: Array of CRTCs that can be used with this encoder * @encoder: Internal usage. This pointer should never be considered as valid. * It can be used to store a temporary reference to a VKMS encoder * during device creation. This pointer is not managed by the * configuration and must be managed by other means. */ struct vkms_config_encoder { struct list_head link; struct vkms_config *config; struct xarray possible_crtcs; /* Internal usage */ struct drm_encoder *encoder; }; /** * struct vkms_config_connector * * @link: Link to the others connector in vkms_config * @config: The vkms_config this connector belongs to * @status: Status (connected, disconnected...) of the connector * @possible_encoders: Array of encoders that can be used with this connector * @connector: Internal usage. This pointer should never be considered as valid. * It can be used to store a temporary reference to a VKMS connector * during device creation. This pointer is not managed by the * configuration and must be managed by other means. */ struct vkms_config_connector { struct list_head link; struct vkms_config *config; enum drm_connector_status status; struct xarray possible_encoders; /* Internal usage */ struct vkms_connector *connector; }; /** * vkms_config_for_each_plane - Iterate over the vkms_config planes * @config: &struct vkms_config pointer * @plane_cfg: &struct vkms_config_plane pointer used as cursor */ #define vkms_config_for_each_plane(config, plane_cfg) \ list_for_each_entry((plane_cfg), &(config)->planes, link) /** * vkms_config_for_each_crtc - Iterate over the vkms_config CRTCs * @config: &struct vkms_config pointer * @crtc_cfg: &struct vkms_config_crtc pointer used as cursor */ #define vkms_config_for_each_crtc(config, crtc_cfg) \ list_for_each_entry((crtc_cfg), &(config)->crtcs, link) /** * vkms_config_for_each_encoder - Iterate over the vkms_config encoders * @config: &struct vkms_config pointer * @encoder_cfg: &struct vkms_config_encoder pointer used as cursor */ #define vkms_config_for_each_encoder(config, encoder_cfg) \ list_for_each_entry((encoder_cfg), &(config)->encoders, link) /** * vkms_config_for_each_connector - Iterate over the vkms_config connectors * @config: &struct vkms_config pointer * @connector_cfg: &struct vkms_config_connector pointer used as cursor */ #define vkms_config_for_each_connector(config, connector_cfg) \ list_for_each_entry((connector_cfg), &(config)->connectors, link) /** * vkms_config_plane_for_each_possible_crtc - Iterate over the vkms_config_plane * possible CRTCs * @plane_cfg: &struct vkms_config_plane pointer * @idx: Index of the cursor * @possible_crtc: &struct vkms_config_crtc pointer used as cursor */ #define vkms_config_plane_for_each_possible_crtc(plane_cfg, idx, possible_crtc) \ xa_for_each(&(plane_cfg)->possible_crtcs, idx, (possible_crtc)) /** * vkms_config_encoder_for_each_possible_crtc - Iterate over the * vkms_config_encoder possible CRTCs * @encoder_cfg: &struct vkms_config_encoder pointer * @idx: Index of the cursor * @possible_crtc: &struct vkms_config_crtc pointer used as cursor */ #define vkms_config_encoder_for_each_possible_crtc(encoder_cfg, idx, possible_crtc) \ xa_for_each(&(encoder_cfg)->possible_crtcs, idx, (possible_crtc)) /** * vkms_config_connector_for_each_possible_encoder - Iterate over the * vkms_config_connector possible encoders * @connector_cfg: &struct vkms_config_connector pointer * @idx: Index of the cursor * @possible_encoder: &struct vkms_config_encoder pointer used as cursor */ #define vkms_config_connector_for_each_possible_encoder(connector_cfg, idx, possible_encoder) \ xa_for_each(&(connector_cfg)->possible_encoders, idx, (possible_encoder)) /** * vkms_config_create() - Create a new VKMS configuration * @dev_name: Name of the device * * Returns: * The new vkms_config or an error. Call vkms_config_destroy() to free the * returned configuration. */ struct vkms_config *vkms_config_create(const char *dev_name); /** * vkms_config_default_create() - Create the configuration for the default device * @enable_cursor: Create or not a cursor plane * @enable_writeback: Create or not a writeback connector * @enable_overlay: Create or not overlay planes * * Returns: * The default vkms_config or an error. Call vkms_config_destroy() to free the * returned configuration. */ struct vkms_config *vkms_config_default_create(bool enable_cursor, bool enable_writeback, bool enable_overlay, bool enable_plane_pipeline); /** * vkms_config_destroy() - Free a VKMS configuration * @config: vkms_config to free */ void vkms_config_destroy(struct vkms_config *config); /** * vkms_config_get_device_name() - Return the name of the device * @config: Configuration to get the device name from * * Returns: * The device name. Only valid while @config is valid. */ static inline const char * vkms_config_get_device_name(struct vkms_config *config) { return config->dev_name; } /** * vkms_config_get_num_crtcs() - Return the number of CRTCs in the configuration * @config: Configuration to get the number of CRTCs from */ static inline size_t vkms_config_get_num_crtcs(struct vkms_config *config) { return list_count_nodes(&config->crtcs); } /** * vkms_config_is_valid() - Validate a configuration * @config: Configuration to validate * * Returns: * Whether the configuration is valid or not. * For example, a configuration without primary planes is not valid. */ bool vkms_config_is_valid(const struct vkms_config *config); /** * vkms_config_register_debugfs() - Register a debugfs file to show the device's * configuration * @vkms_device: Device to register */ void vkms_config_register_debugfs(struct vkms_device *vkms_device); /** * vkms_config_create_plane() - Add a new plane configuration * @config: Configuration to add the plane to * * Returns: * The new plane configuration or an error. Call vkms_config_destroy_plane() to * free the returned plane configuration. */ struct vkms_config_plane *vkms_config_create_plane(struct vkms_config *config); /** * vkms_config_destroy_plane() - Remove and free a plane configuration * @plane_cfg: Plane configuration to destroy */ void vkms_config_destroy_plane(struct vkms_config_plane *plane_cfg); /** * vkms_config_plane_type() - Return the plane type * @plane_cfg: Plane to get the type from */ static inline enum drm_plane_type vkms_config_plane_get_type(struct vkms_config_plane *plane_cfg) { return plane_cfg->type; } /** * vkms_config_plane_set_type() - Set the plane type * @plane_cfg: Plane to set the type to * @type: New plane type */ static inline void vkms_config_plane_set_type(struct vkms_config_plane *plane_cfg, enum drm_plane_type type) { plane_cfg->type = type; } /** * vkms_config_plane_get_default_pipeline() - Return if the plane will * be created with the default pipeline * @plane_cfg: Plane to get the information from */ static inline bool vkms_config_plane_get_default_pipeline(struct vkms_config_plane *plane_cfg) { return plane_cfg->default_pipeline; } /** * vkms_config_plane_set_default_pipeline() - Set if the plane will * be created with the default pipeline * @plane_cfg: Plane to configure the pipeline * @default_pipeline: New default pipeline value */ static inline void vkms_config_plane_set_default_pipeline(struct vkms_config_plane *plane_cfg, bool default_pipeline) { plane_cfg->default_pipeline = default_pipeline; } /** * vkms_config_plane_attach_crtc - Attach a plane to a CRTC * @plane_cfg: Plane to attach * @crtc_cfg: CRTC to attach @plane_cfg to */ int __must_check vkms_config_plane_attach_crtc(struct vkms_config_plane *plane_cfg, struct vkms_config_crtc *crtc_cfg); /** * vkms_config_plane_detach_crtc - Detach a plane from a CRTC * @plane_cfg: Plane to detach * @crtc_cfg: CRTC to detach @plane_cfg from */ void vkms_config_plane_detach_crtc(struct vkms_config_plane *plane_cfg, struct vkms_config_crtc *crtc_cfg); /** * vkms_config_create_crtc() - Add a new CRTC configuration * @config: Configuration to add the CRTC to * * Returns: * The new CRTC configuration or an error. Call vkms_config_destroy_crtc() to * free the returned CRTC configuration. */ struct vkms_config_crtc *vkms_config_create_crtc(struct vkms_config *config); /** * vkms_config_destroy_crtc() - Remove and free a CRTC configuration * @config: Configuration to remove the CRTC from * @crtc_cfg: CRTC configuration to destroy */ void vkms_config_destroy_crtc(struct vkms_config *config, struct vkms_config_crtc *crtc_cfg); /** * vkms_config_crtc_get_writeback() - If a writeback connector will be created * @crtc_cfg: CRTC with or without a writeback connector */ static inline bool vkms_config_crtc_get_writeback(struct vkms_config_crtc *crtc_cfg) { return crtc_cfg->writeback; } /** * vkms_config_crtc_set_writeback() - If a writeback connector will be created * @crtc_cfg: Target CRTC * @writeback: Enable or disable the writeback connector */ static inline void vkms_config_crtc_set_writeback(struct vkms_config_crtc *crtc_cfg, bool writeback) { crtc_cfg->writeback = writeback; } /** * vkms_config_crtc_primary_plane() - Return the primary plane for a CRTC * @config: Configuration containing the CRTC * @crtc_config: Target CRTC * * Note that, if multiple primary planes are found, the first one is returned. * In this case, the configuration will be invalid. See vkms_config_is_valid(). * * Returns: * The primary plane or NULL if none is assigned yet. */ struct vkms_config_plane *vkms_config_crtc_primary_plane(const struct vkms_config *config, struct vkms_config_crtc *crtc_cfg); /** * vkms_config_crtc_cursor_plane() - Return the cursor plane for a CRTC * @config: Configuration containing the CRTC * @crtc_config: Target CRTC * * Note that, if multiple cursor planes are found, the first one is returned. * In this case, the configuration will be invalid. See vkms_config_is_valid(). * * Returns: * The cursor plane or NULL if none is assigned yet. */ struct vkms_config_plane *vkms_config_crtc_cursor_plane(const struct vkms_config *config, struct vkms_config_crtc *crtc_cfg); /** * vkms_config_create_encoder() - Add a new encoder configuration * @config: Configuration to add the encoder to * * Returns: * The new encoder configuration or an error. Call vkms_config_destroy_encoder() * to free the returned encoder configuration. */ struct vkms_config_encoder *vkms_config_create_encoder(struct vkms_config *config); /** * vkms_config_destroy_encoder() - Remove and free a encoder configuration * @config: Configuration to remove the encoder from * @encoder_cfg: Encoder configuration to destroy */ void vkms_config_destroy_encoder(struct vkms_config *config, struct vkms_config_encoder *encoder_cfg); /** * vkms_config_encoder_attach_crtc - Attach a encoder to a CRTC * @encoder_cfg: Encoder to attach * @crtc_cfg: CRTC to attach @encoder_cfg to */ int __must_check vkms_config_encoder_attach_crtc(struct vkms_config_encoder *encoder_cfg, struct vkms_config_crtc *crtc_cfg); /** * vkms_config_encoder_detach_crtc - Detach a encoder from a CRTC * @encoder_cfg: Encoder to detach * @crtc_cfg: CRTC to detach @encoder_cfg from */ void vkms_config_encoder_detach_crtc(struct vkms_config_encoder *encoder_cfg, struct vkms_config_crtc *crtc_cfg); /** * vkms_config_create_connector() - Add a new connector configuration * @config: Configuration to add the connector to * * Returns: * The new connector configuration or an error. Call * vkms_config_destroy_connector() to free the returned connector configuration. */ struct vkms_config_connector *vkms_config_create_connector(struct vkms_config *config); /** * vkms_config_destroy_connector() - Remove and free a connector configuration * @connector_cfg: Connector configuration to destroy */ void vkms_config_destroy_connector(struct vkms_config_connector *connector_cfg); /** * vkms_config_connector_attach_encoder - Attach a connector to an encoder * @connector_cfg: Connector to attach * @encoder_cfg: Encoder to attach @connector_cfg to */ int __must_check vkms_config_connector_attach_encoder(struct vkms_config_connector *connector_cfg, struct vkms_config_encoder *encoder_cfg); /** * vkms_config_connector_detach_encoder - Detach a connector from an encoder * @connector_cfg: Connector to detach * @encoder_cfg: Encoder to detach @connector_cfg from */ void vkms_config_connector_detach_encoder(struct vkms_config_connector *connector_cfg, struct vkms_config_encoder *encoder_cfg); /** * vkms_config_connector_get_status() - Return the status of the connector * @connector_cfg: Connector to get the status from */ static inline enum drm_connector_status vkms_config_connector_get_status(struct vkms_config_connector *connector_cfg) { return connector_cfg->status; } /** * vkms_config_connector_set_status() - Set the status of the connector * @connector_cfg: Connector to set the status to * @status: New connector status */ static inline void vkms_config_connector_set_status(struct vkms_config_connector *connector_cfg, enum drm_connector_status status) { connector_cfg->status = status; } #endif /* _VKMS_CONFIG_H_ */
3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 // SPDX-License-Identifier: GPL-2.0 /* * NVMEM layout bus handling * * Copyright (C) 2023 Bootlin * Author: Miquel Raynal <miquel.raynal@bootlin.com */ #include <linux/device.h> #include <linux/dma-mapping.h> #include <linux/nvmem-consumer.h> #include <linux/nvmem-provider.h> #include <linux/of.h> #include <linux/of_device.h> #include <linux/of_irq.h> #include "internals.h" #define to_nvmem_layout_driver(drv) \ (container_of_const((drv), struct nvmem_layout_driver, driver)) #define to_nvmem_layout_device(_dev) \ container_of((_dev), struct nvmem_layout, dev) static int nvmem_layout_bus_match(struct device *dev, const struct device_driver *drv) { return of_driver_match_device(dev, drv); } static int nvmem_layout_bus_probe(struct device *dev) { struct nvmem_layout_driver *drv = to_nvmem_layout_driver(dev->driver); struct nvmem_layout *layout = to_nvmem_layout_device(dev); if (!drv->probe || !drv->remove) return -EINVAL; return drv->probe(layout); } static void nvmem_layout_bus_remove(struct device *dev) { struct nvmem_layout_driver *drv = to_nvmem_layout_driver(dev->driver); struct nvmem_layout *layout = to_nvmem_layout_device(dev); return drv->remove(layout); } static int nvmem_layout_bus_uevent(const struct device *dev, struct kobj_uevent_env *env) { int ret; ret = of_device_uevent_modalias(dev, env); if (ret != -ENODEV) return ret; return 0; } static const struct bus_type nvmem_layout_bus_type = { .name = "nvmem-layout", .match = nvmem_layout_bus_match, .probe = nvmem_layout_bus_probe, .remove = nvmem_layout_bus_remove, .uevent = nvmem_layout_bus_uevent, }; int __nvmem_layout_driver_register(struct nvmem_layout_driver *drv, struct module *owner) { drv->driver.bus = &nvmem_layout_bus_type; drv->driver.owner = owner; return driver_register(&drv->driver); } EXPORT_SYMBOL_GPL(__nvmem_layout_driver_register); void nvmem_layout_driver_unregister(struct nvmem_layout_driver *drv) { driver_unregister(&drv->driver); } EXPORT_SYMBOL_GPL(nvmem_layout_driver_unregister); static void nvmem_layout_release_device(struct device *dev) { struct nvmem_layout *layout = to_nvmem_layout_device(dev); of_node_put(layout->dev.of_node); kfree(layout); } static int nvmem_layout_create_device(struct nvmem_device *nvmem, struct device_node *np) { struct nvmem_layout *layout; struct device *dev; int ret; layout = kzalloc_obj(*layout); if (!layout) return -ENOMEM; /* Create a bidirectional link */ layout->nvmem = nvmem; nvmem->layout = layout; /* Device model registration */ dev = &layout->dev; device_initialize(dev); dev->parent = &nvmem->dev; dev->bus = &nvmem_layout_bus_type; dev->release = nvmem_layout_release_device; dev->coherent_dma_mask = DMA_BIT_MASK(32); dev->dma_mask = &dev->coherent_dma_mask; device_set_node(dev, of_fwnode_handle(of_node_get(np))); of_device_make_bus_id(dev); of_msi_configure(dev, dev->of_node); ret = device_add(dev); if (ret) { put_device(dev); return ret; } return 0; } static const struct of_device_id of_nvmem_layout_skip_table[] = { { .compatible = "fixed-layout", }, {} }; static int nvmem_layout_bus_populate(struct nvmem_device *nvmem, struct device_node *layout_dn) { int ret; /* Make sure it has a compatible property */ if (!of_property_present(layout_dn, "compatible")) { pr_debug("%s() - skipping %pOF, no compatible prop\n", __func__, layout_dn); return 0; } /* Fixed layouts are parsed manually somewhere else for now */ if (of_match_node(of_nvmem_layout_skip_table, layout_dn)) { pr_debug("%s() - skipping %pOF node\n", __func__, layout_dn); return 0; } if (of_node_check_flag(layout_dn, OF_POPULATED_BUS)) { pr_debug("%s() - skipping %pOF, already populated\n", __func__, layout_dn); return 0; } /* NVMEM layout buses expect only a single device representing the layout */ ret = nvmem_layout_create_device(nvmem, layout_dn); if (ret) return ret; of_node_set_flag(layout_dn, OF_POPULATED_BUS); return 0; } struct device_node *of_nvmem_layout_get_container(struct nvmem_device *nvmem) { return of_get_child_by_name(nvmem->dev.of_node, "nvmem-layout"); } EXPORT_SYMBOL_GPL(of_nvmem_layout_get_container); /* * Returns the number of devices populated, 0 if the operation was not relevant * for this nvmem device, an error code otherwise. */ int nvmem_populate_layout(struct nvmem_device *nvmem) { struct device_node *layout_dn; int ret; layout_dn = of_nvmem_layout_get_container(nvmem); if (!layout_dn) return 0; /* Populate the layout device */ device_links_supplier_sync_state_pause(); ret = nvmem_layout_bus_populate(nvmem, layout_dn); device_links_supplier_sync_state_resume(); of_node_put(layout_dn); return ret; } void nvmem_destroy_layout(struct nvmem_device *nvmem) { struct device *dev; if (!nvmem->layout) return; dev = &nvmem->layout->dev; of_node_clear_flag(dev->of_node, OF_POPULATED_BUS); device_unregister(dev); } int nvmem_layout_bus_register(void) { return bus_register(&nvmem_layout_bus_type); } void nvmem_layout_bus_unregister(void) { bus_unregister(&nvmem_layout_bus_type); }
3 5 6 4 6 3 3 3 2 2 3 1 3 3 1 1 1 2 2 3 2 2 2 2 2 2 2 2 3 2 11 11 11 11 9 3 3 3 3 3 7 18 1 18 16 12 11 11 11 11 11 11 11 11 11 11 11 5 11 11 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:ip,port,net type */ #include <linux/jhash.h> #include <linux/module.h> #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/random.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/netlink.h> #include <net/tcp.h> #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_getport.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 /* 1 SCTP and UDPLITE support added */ /* 2 Range as input support for IPv4 added */ /* 3 nomatch flag support added */ /* 4 Counters support added */ /* 5 Comments support added */ /* 6 Forceadd support added */ /* 7 skbinfo support added */ #define IPSET_TYPE_REV_MAX 8 /* bucketsize, initval support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:ip,port,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip,port,net"); /* Type specific function prefix */ #define HTYPE hash_ipportnet /* We squeeze the "nomatch" flag into cidr: we don't support cidr == 0 * However this way we have to store internally cidr - 1, * dancing back and forth. */ #define IP_SET_HASH_WITH_NETS_PACKED #define IP_SET_HASH_WITH_PROTO #define IP_SET_HASH_WITH_NETS /* IPv4 variant */ /* Member elements */ struct hash_ipportnet4_elem { __be32 ip; __be32 ip2; __be16 port; u8 cidr:7; u8 nomatch:1; u8 proto; }; /* Common functions */ static bool hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1, const struct hash_ipportnet4_elem *ip2, u32 *multi) { return ip1->ip == ip2->ip && ip1->ip2 == ip2->ip2 && ip1->cidr == ip2->cidr && ip1->port == ip2->port && ip1->proto == ip2->proto; } static int hash_ipportnet4_do_data_match(const struct hash_ipportnet4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } static void hash_ipportnet4_data_set_flags(struct hash_ipportnet4_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } static void hash_ipportnet4_data_reset_flags(struct hash_ipportnet4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } static void hash_ipportnet4_data_netmask(struct hash_ipportnet4_elem *elem, u8 cidr) { elem->ip2 &= ip_set_netmask(cidr); elem->cidr = cidr - 1; } static bool hash_ipportnet4_data_list(struct sk_buff *skb, const struct hash_ipportnet4_elem *data) { u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip2) || nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr + 1) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ipportnet4_data_next(struct hash_ipportnet4_elem *next, const struct hash_ipportnet4_elem *d) { next->ip = d->ip; next->port = d->port; next->ip2 = d->ip2; } #define MTYPE hash_ipportnet4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" static int hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_ipportnet4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportnet4_elem e = { .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (adt == IPSET_TEST) e.cidr = HOST_MASK - 1; if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2); e.ip2 &= ip_set_netmask(e.cidr + 1); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { struct hash_ipportnet4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportnet4_elem e = { .cidr = HOST_MASK - 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, p = 0, port, port_to; u32 ip2_from = 0, ip2_to = 0, ip2, i = 0; bool with_ports = false; u8 cidr; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from); if (ret) return ret; if (tb[IPSET_ATTR_CIDR2]) { cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; e.cidr = cidr - 1; } e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); with_ports = ip_set_proto_with_ports(e.proto); if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else { return -IPSET_ERR_MISSING_PROTO; } if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } with_ports = with_ports && tb[IPSET_ATTR_PORT_TO]; if (adt == IPSET_TEST || !(tb[IPSET_ATTR_CIDR] || tb[IPSET_ATTR_IP_TO] || with_ports || tb[IPSET_ATTR_IP2_TO])) { e.ip = htonl(ip); e.ip2 = htonl(ip2_from & ip_set_hostmask(e.cidr + 1)); ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 0 : ret; } ip_to = ip; if (tb[IPSET_ATTR_IP_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; if (ip > ip_to) swap(ip, ip_to); } else if (tb[IPSET_ATTR_CIDR]) { cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } port_to = port = ntohs(e.port); if (tb[IPSET_ATTR_PORT_TO]) { port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); } ip2_to = ip2_from; if (tb[IPSET_ATTR_IP2_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to); if (ret) return ret; if (ip2_from > ip2_to) swap(ip2_from, ip2_to); if (ip2_from + UINT_MAX == ip2_to) return -IPSET_ERR_HASH_RANGE; } else { ip_set_mask_from_to(ip2_from, ip2_to, e.cidr + 1); } if (retried) { ip = ntohl(h->next.ip); p = ntohs(h->next.port); ip2 = ntohl(h->next.ip2); } else { p = port; ip2 = ip2_from; } for (; ip <= ip_to; ip++) { e.ip = htonl(ip); for (; p <= port_to; p++) { e.port = htons(p); do { i++; e.ip2 = htonl(ip2); ip2 = ip_set_range_to_cidr(ip2, ip2_to, &cidr); e.cidr = cidr - 1; if (i > IPSET_MAX_RANGE) { hash_ipportnet4_data_next(&h->next, &e); return -ERANGE; } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } while (ip2++ < ip2_to); ip2 = ip2_from; } p = port; } return ret; } /* IPv6 variant */ struct hash_ipportnet6_elem { union nf_inet_addr ip; union nf_inet_addr ip2; __be16 port; u8 cidr:7; u8 nomatch:1; u8 proto; }; /* Common functions */ static bool hash_ipportnet6_data_equal(const struct hash_ipportnet6_elem *ip1, const struct hash_ipportnet6_elem *ip2, u32 *multi) { return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && ipv6_addr_equal(&ip1->ip2.in6, &ip2->ip2.in6) && ip1->cidr == ip2->cidr && ip1->port == ip2->port && ip1->proto == ip2->proto; } static int hash_ipportnet6_do_data_match(const struct hash_ipportnet6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } static void hash_ipportnet6_data_set_flags(struct hash_ipportnet6_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } static void hash_ipportnet6_data_reset_flags(struct hash_ipportnet6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } static void hash_ipportnet6_data_netmask(struct hash_ipportnet6_elem *elem, u8 cidr) { ip6_netmask(&elem->ip2, cidr); elem->cidr = cidr - 1; } static bool hash_ipportnet6_data_list(struct sk_buff *skb, const struct hash_ipportnet6_elem *data) { u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip2.in6) || nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr + 1) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ipportnet6_data_next(struct hash_ipportnet6_elem *next, const struct hash_ipportnet6_elem *d) { next->port = d->port; } #undef MTYPE #undef HOST_MASK #define MTYPE hash_ipportnet6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" static int hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_ipportnet6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportnet6_elem e = { .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (adt == IPSET_TEST) e.cidr = HOST_MASK - 1; if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2.in6); ip6_netmask(&e.ip2, e.cidr + 1); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct hash_ipportnet6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportnet6_elem e = { .cidr = HOST_MASK - 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 port, port_to; bool with_ports = false; u8 cidr; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; if (unlikely(tb[IPSET_ATTR_CIDR])) { cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (cidr != HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip2); if (ret) return ret; if (tb[IPSET_ATTR_CIDR2]) { cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; e.cidr = cidr - 1; } ip6_netmask(&e.ip2, e.cidr + 1); e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); with_ports = ip_set_proto_with_ports(e.proto); if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else { return -IPSET_ERR_MISSING_PROTO; } if (!(with_ports || e.proto == IPPROTO_ICMPV6)) e.port = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 0 : ret; } port = ntohs(e.port); port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); if (retried) port = ntohs(h->next.port); for (; port <= port_to; port++) { e.port = htons(port); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } return ret; } static struct ip_set_type hash_ipportnet_type __read_mostly = { .name = "hash:ip,port,net", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2 | IPSET_TYPE_NOMATCH, .dimension = IPSET_DIM_THREE, .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ipportnet_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_IP2] = { .type = NLA_NESTED }, [IPSET_ATTR_IP2_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_PORT] = { .type = NLA_U16 }, [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_CIDR2] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; static int __init hash_ipportnet_init(void) { return ip_set_type_register(&hash_ipportnet_type); } static void __exit hash_ipportnet_fini(void) { rcu_barrier(); ip_set_type_unregister(&hash_ipportnet_type); } module_init(hash_ipportnet_init); module_exit(hash_ipportnet_fini);
136 25 35 35 28 36 36 35 27 27 35 35 35 35 3 35 17 17 3 16 17 16 16 16 16 16 16 16 16 17 2 2 21 19 21 22 22 12 12 12 12 12 13 13 12 13 12 13 12 13 12 13 13 13 13 13 2 1 2 1 2 3 3 3 3 3 3 3 3 3 3 3 1 4 6 6 6 3 3 3 1 4 4 4 4 3 4 2 4 6 5 5 5 5 3 2 5 4 7 6 6 1 5 2 3 2 1 2 2 2 2 2 1 2 2 2 2 7 27 27 27 14 26 26 26 25 6 6 6 5 5 5 4 5 5 6 6 6 6 6 14 14 12 14 1 13 13 1 12 12 12 12 11 12 12 12 12 6 4 4 4 1 1 1 1 16 18 26 27 5 6 6 1 5 5 3 3 2 2 1 2 3 6 4 4 4 4 4 47 49 5 6 6 4 26 6 4 48 49 33 2 2 743 33 33 33 711 741 91 44 91 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 // SPDX-License-Identifier: GPL-2.0 /* * linux/ipc/sem.c * Copyright (C) 1992 Krishna Balasubramanian * Copyright (C) 1995 Eric Schenk, Bruno Haible * * /proc/sysvipc/sem support (c) 1999 Dragos Acostachioaie <dragos@iname.com> * * SMP-threaded, sysctl's added * (c) 1999 Manfred Spraul <manfred@colorfullife.com> * Enforced range limit on SEM_UNDO * (c) 2001 Red Hat Inc * Lockless wakeup * (c) 2003 Manfred Spraul <manfred@colorfullife.com> * (c) 2016 Davidlohr Bueso <dave@stgolabs.net> * Further wakeup optimizations, documentation * (c) 2010 Manfred Spraul <manfred@colorfullife.com> * * support for audit of ipc object properties and permission changes * Dustin Kirkland <dustin.kirkland@us.ibm.com> * * namespaces support * OpenVZ, SWsoft Inc. * Pavel Emelianov <xemul@openvz.org> * * Implementation notes: (May 2010) * This file implements System V semaphores. * * User space visible behavior: * - FIFO ordering for semop() operations (just FIFO, not starvation * protection) * - multiple semaphore operations that alter the same semaphore in * one semop() are handled. * - sem_ctime (time of last semctl()) is updated in the IPC_SET, SETVAL and * SETALL calls. * - two Linux specific semctl() commands: SEM_STAT, SEM_INFO. * - undo adjustments at process exit are limited to 0..SEMVMX. * - namespace are supported. * - SEMMSL, SEMMNS, SEMOPM and SEMMNI can be configured at runtime by writing * to /proc/sys/kernel/sem. * - statistics about the usage are reported in /proc/sysvipc/sem. * * Internals: * - scalability: * - all global variables are read-mostly. * - semop() calls and semctl(RMID) are synchronized by RCU. * - most operations do write operations (actually: spin_lock calls) to * the per-semaphore array structure. * Thus: Perfect SMP scaling between independent semaphore arrays. * If multiple semaphores in one array are used, then cache line * trashing on the semaphore array spinlock will limit the scaling. * - semncnt and semzcnt are calculated on demand in count_semcnt() * - the task that performs a successful semop() scans the list of all * sleeping tasks and completes any pending operations that can be fulfilled. * Semaphores are actively given to waiting tasks (necessary for FIFO). * (see update_queue()) * - To improve the scalability, the actual wake-up calls are performed after * dropping all locks. (see wake_up_sem_queue_prepare()) * - All work is done by the waker, the woken up task does not have to do * anything - not even acquiring a lock or dropping a refcount. * - A woken up task may not even touch the semaphore array anymore, it may * have been destroyed already by a semctl(RMID). * - UNDO values are stored in an array (one per process and per * semaphore array, lazily allocated). For backwards compatibility, multiple * modes for the UNDO variables are supported (per process, per thread) * (see copy_semundo, CLONE_SYSVSEM) * - There are two lists of the pending operations: a per-array list * and per-semaphore list (stored in the array). This allows to achieve FIFO * ordering without always scanning all pending operations. * The worst-case behavior is nevertheless O(N^2) for N wakeups. */ #include <linux/compat.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/init.h> #include <linux/proc_fs.h> #include <linux/time.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/audit.h> #include <linux/capability.h> #include <linux/seq_file.h> #include <linux/rwsem.h> #include <linux/nsproxy.h> #include <linux/ipc_namespace.h> #include <linux/sched/wake_q.h> #include <linux/nospec.h> #include <linux/rhashtable.h> #include <linux/uaccess.h> #include "util.h" /* One semaphore structure for each semaphore in the system. */ struct sem { int semval; /* current value */ /* * PID of the process that last modified the semaphore. For * Linux, specifically these are: * - semop * - semctl, via SETVAL and SETALL. * - at task exit when performing undo adjustments (see exit_sem). */ struct pid *sempid; spinlock_t lock; /* spinlock for fine-grained semtimedop */ struct list_head pending_alter; /* pending single-sop operations */ /* that alter the semaphore */ struct list_head pending_const; /* pending single-sop operations */ /* that do not alter the semaphore*/ time64_t sem_otime; /* candidate for sem_otime */ } ____cacheline_aligned_in_smp; /* One sem_array data structure for each set of semaphores in the system. */ struct sem_array { struct kern_ipc_perm sem_perm; /* permissions .. see ipc.h */ time64_t sem_ctime; /* create/last semctl() time */ struct list_head pending_alter; /* pending operations */ /* that alter the array */ struct list_head pending_const; /* pending complex operations */ /* that do not alter semvals */ struct list_head list_id; /* undo requests on this array */ int sem_nsems; /* no. of semaphores in array */ int complex_count; /* pending complex operations */ unsigned int use_global_lock;/* >0: global lock required */ struct sem sems[]; } __randomize_layout; /* One queue for each sleeping process in the system. */ struct sem_queue { struct list_head list; /* queue of pending operations */ struct task_struct *sleeper; /* this process */ struct sem_undo *undo; /* undo structure */ struct pid *pid; /* process id of requesting process */ int status; /* completion status of operation */ struct sembuf *sops; /* array of pending operations */ struct sembuf *blocking; /* the operation that blocked */ int nsops; /* number of operations */ bool alter; /* does *sops alter the array? */ bool dupsop; /* sops on more than one sem_num */ }; /* Each task has a list of undo requests. They are executed automatically * when the process exits. */ struct sem_undo { struct list_head list_proc; /* per-process list: * * all undos from one process * rcu protected */ struct rcu_head rcu; /* rcu struct for sem_undo */ struct sem_undo_list *ulp; /* back ptr to sem_undo_list */ struct list_head list_id; /* per semaphore array list: * all undos for one array */ int semid; /* semaphore set identifier */ short semadj[]; /* array of adjustments */ /* one per semaphore */ }; /* sem_undo_list controls shared access to the list of sem_undo structures * that may be shared among all a CLONE_SYSVSEM task group. */ struct sem_undo_list { refcount_t refcnt; spinlock_t lock; struct list_head list_proc; }; #define sem_ids(ns) ((ns)->ids[IPC_SEM_IDS]) static int newary(struct ipc_namespace *, struct ipc_params *); static void freeary(struct ipc_namespace *, struct kern_ipc_perm *); #ifdef CONFIG_PROC_FS static int sysvipc_sem_proc_show(struct seq_file *s, void *it); #endif #define SEMMSL_FAST 256 /* 512 bytes on stack */ #define SEMOPM_FAST 64 /* ~ 372 bytes on stack */ /* * Switching from the mode suitable for simple ops * to the mode for complex ops is costly. Therefore: * use some hysteresis */ #define USE_GLOBAL_LOCK_HYSTERESIS 10 /* * Locking: * a) global sem_lock() for read/write * sem_undo.id_next, * sem_array.complex_count, * sem_array.pending{_alter,_const}, * sem_array.sem_undo * * b) global or semaphore sem_lock() for read/write: * sem_array.sems[i].pending_{const,alter}: * * c) special: * sem_undo_list.list_proc: * * undo_list->lock for write * * rcu for read * use_global_lock: * * global sem_lock() for write * * either local or global sem_lock() for read. * * Memory ordering: * Most ordering is enforced by using spin_lock() and spin_unlock(). * * Exceptions: * 1) use_global_lock: (SEM_BARRIER_1) * Setting it from non-zero to 0 is a RELEASE, this is ensured by * using smp_store_release(): Immediately after setting it to 0, * a simple op can start. * Testing if it is non-zero is an ACQUIRE, this is ensured by using * smp_load_acquire(). * Setting it from 0 to non-zero must be ordered with regards to * this smp_load_acquire(), this is guaranteed because the smp_load_acquire() * is inside a spin_lock() and after a write from 0 to non-zero a * spin_lock()+spin_unlock() is done. * To prevent the compiler/cpu temporarily writing 0 to use_global_lock, * READ_ONCE()/WRITE_ONCE() is used. * * 2) queue.status: (SEM_BARRIER_2) * Initialization is done while holding sem_lock(), so no further barrier is * required. * Setting it to a result code is a RELEASE, this is ensured by both a * smp_store_release() (for case a) and while holding sem_lock() * (for case b). * The ACQUIRE when reading the result code without holding sem_lock() is * achieved by using READ_ONCE() + smp_acquire__after_ctrl_dep(). * (case a above). * Reading the result code while holding sem_lock() needs no further barriers, * the locks inside sem_lock() enforce ordering (case b above) * * 3) current->state: * current->state is set to TASK_INTERRUPTIBLE while holding sem_lock(). * The wakeup is handled using the wake_q infrastructure. wake_q wakeups may * happen immediately after calling wake_q_add. As wake_q_add_safe() is called * when holding sem_lock(), no further barriers are required. * * See also ipc/mqueue.c for more details on the covered races. */ #define sc_semmsl sem_ctls[0] #define sc_semmns sem_ctls[1] #define sc_semopm sem_ctls[2] #define sc_semmni sem_ctls[3] void sem_init_ns(struct ipc_namespace *ns) { ns->sc_semmsl = SEMMSL; ns->sc_semmns = SEMMNS; ns->sc_semopm = SEMOPM; ns->sc_semmni = SEMMNI; ns->used_sems = 0; ipc_init_ids(&ns->ids[IPC_SEM_IDS]); } #ifdef CONFIG_IPC_NS void sem_exit_ns(struct ipc_namespace *ns) { free_ipcs(ns, &sem_ids(ns), freeary); idr_destroy(&ns->ids[IPC_SEM_IDS].ipcs_idr); rhashtable_destroy(&ns->ids[IPC_SEM_IDS].key_ht); } #endif void __init sem_init(void) { sem_init_ns(&init_ipc_ns); ipc_init_proc_interface("sysvipc/sem", " key semid perms nsems uid gid cuid cgid otime ctime\n", IPC_SEM_IDS, sysvipc_sem_proc_show); } /** * unmerge_queues - unmerge queues, if possible. * @sma: semaphore array * * The function unmerges the wait queues if complex_count is 0. * It must be called prior to dropping the global semaphore array lock. */ static void unmerge_queues(struct sem_array *sma) { struct sem_queue *q, *tq; /* complex operations still around? */ if (sma->complex_count) return; /* * We will switch back to simple mode. * Move all pending operation back into the per-semaphore * queues. */ list_for_each_entry_safe(q, tq, &sma->pending_alter, list) { struct sem *curr; curr = &sma->sems[q->sops[0].sem_num]; list_add_tail(&q->list, &curr->pending_alter); } INIT_LIST_HEAD(&sma->pending_alter); } /** * merge_queues - merge single semop queues into global queue * @sma: semaphore array * * This function merges all per-semaphore queues into the global queue. * It is necessary to achieve FIFO ordering for the pending single-sop * operations when a multi-semop operation must sleep. * Only the alter operations must be moved, the const operations can stay. */ static void merge_queues(struct sem_array *sma) { int i; for (i = 0; i < sma->sem_nsems; i++) { struct sem *sem = &sma->sems[i]; list_splice_init(&sem->pending_alter, &sma->pending_alter); } } static void sem_rcu_free(struct rcu_head *head) { struct kern_ipc_perm *p = container_of(head, struct kern_ipc_perm, rcu); struct sem_array *sma = container_of(p, struct sem_array, sem_perm); security_sem_free(&sma->sem_perm); kvfree(sma); } /* * Enter the mode suitable for non-simple operations: * Caller must own sem_perm.lock. */ static void complexmode_enter(struct sem_array *sma) { int i; struct sem *sem; if (sma->use_global_lock > 0) { /* * We are already in global lock mode. * Nothing to do, just reset the * counter until we return to simple mode. */ WRITE_ONCE(sma->use_global_lock, USE_GLOBAL_LOCK_HYSTERESIS); return; } WRITE_ONCE(sma->use_global_lock, USE_GLOBAL_LOCK_HYSTERESIS); for (i = 0; i < sma->sem_nsems; i++) { sem = &sma->sems[i]; spin_lock(&sem->lock); spin_unlock(&sem->lock); } } /* * Try to leave the mode that disallows simple operations: * Caller must own sem_perm.lock. */ static void complexmode_tryleave(struct sem_array *sma) { if (sma->complex_count) { /* Complex ops are sleeping. * We must stay in complex mode */ return; } if (sma->use_global_lock == 1) { /* See SEM_BARRIER_1 for purpose/pairing */ smp_store_release(&sma->use_global_lock, 0); } else { WRITE_ONCE(sma->use_global_lock, sma->use_global_lock-1); } } #define SEM_GLOBAL_LOCK (-1) /* * If the request contains only one semaphore operation, and there are * no complex transactions pending, lock only the semaphore involved. * Otherwise, lock the entire semaphore array, since we either have * multiple semaphores in our own semops, or we need to look at * semaphores from other pending complex operations. */ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops, int nsops) { struct sem *sem; int idx; if (nsops != 1) { /* Complex operation - acquire a full lock */ ipc_lock_object(&sma->sem_perm); /* Prevent parallel simple ops */ complexmode_enter(sma); return SEM_GLOBAL_LOCK; } /* * Only one semaphore affected - try to optimize locking. * Optimized locking is possible if no complex operation * is either enqueued or processed right now. * * Both facts are tracked by use_global_mode. */ idx = array_index_nospec(sops->sem_num, sma->sem_nsems); sem = &sma->sems[idx]; /* * Initial check for use_global_lock. Just an optimization, * no locking, no memory barrier. */ if (!READ_ONCE(sma->use_global_lock)) { /* * It appears that no complex operation is around. * Acquire the per-semaphore lock. */ spin_lock(&sem->lock); /* see SEM_BARRIER_1 for purpose/pairing */ if (!smp_load_acquire(&sma->use_global_lock)) { /* fast path successful! */ return sops->sem_num; } spin_unlock(&sem->lock); } /* slow path: acquire the full lock */ ipc_lock_object(&sma->sem_perm); if (sma->use_global_lock == 0) { /* * The use_global_lock mode ended while we waited for * sma->sem_perm.lock. Thus we must switch to locking * with sem->lock. * Unlike in the fast path, there is no need to recheck * sma->use_global_lock after we have acquired sem->lock: * We own sma->sem_perm.lock, thus use_global_lock cannot * change. */ spin_lock(&sem->lock); ipc_unlock_object(&sma->sem_perm); return sops->sem_num; } else { /* * Not a false alarm, thus continue to use the global lock * mode. No need for complexmode_enter(), this was done by * the caller that has set use_global_mode to non-zero. */ return SEM_GLOBAL_LOCK; } } static inline void sem_unlock(struct sem_array *sma, int locknum) { if (locknum == SEM_GLOBAL_LOCK) { unmerge_queues(sma); complexmode_tryleave(sma); ipc_unlock_object(&sma->sem_perm); } else { struct sem *sem = &sma->sems[locknum]; spin_unlock(&sem->lock); } } /* * sem_lock_(check_) routines are called in the paths where the rwsem * is not held. * * The caller holds the RCU read lock. */ static inline struct sem_array *sem_obtain_object(struct ipc_namespace *ns, int id) { struct kern_ipc_perm *ipcp = ipc_obtain_object_idr(&sem_ids(ns), id); if (IS_ERR(ipcp)) return ERR_CAST(ipcp); return container_of(ipcp, struct sem_array, sem_perm); } static inline struct sem_array *sem_obtain_object_check(struct ipc_namespace *ns, int id) { struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&sem_ids(ns), id); if (IS_ERR(ipcp)) return ERR_CAST(ipcp); return container_of(ipcp, struct sem_array, sem_perm); } static inline void sem_lock_and_putref(struct sem_array *sma) { sem_lock(sma, NULL, -1); ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); } static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s) { ipc_rmid(&sem_ids(ns), &s->sem_perm); } static struct sem_array *sem_alloc(size_t nsems) { struct sem_array *sma; if (nsems > (INT_MAX - sizeof(*sma)) / sizeof(sma->sems[0])) return NULL; sma = kvzalloc_flex(*sma, sems, nsems, GFP_KERNEL_ACCOUNT); if (unlikely(!sma)) return NULL; return sma; } /** * newary - Create a new semaphore set * @ns: namespace * @params: ptr to the structure that contains key, semflg and nsems * * Called with sem_ids.rwsem held (as a writer) */ static int newary(struct ipc_namespace *ns, struct ipc_params *params) { int retval; struct sem_array *sma; key_t key = params->key; int nsems = params->u.nsems; int semflg = params->flg; int i; if (!nsems) return -EINVAL; if (ns->used_sems + nsems > ns->sc_semmns) return -ENOSPC; sma = sem_alloc(nsems); if (!sma) return -ENOMEM; sma->sem_perm.mode = (semflg & S_IRWXUGO); sma->sem_perm.key = key; sma->sem_perm.security = NULL; retval = security_sem_alloc(&sma->sem_perm); if (retval) { kvfree(sma); return retval; } for (i = 0; i < nsems; i++) { INIT_LIST_HEAD(&sma->sems[i].pending_alter); INIT_LIST_HEAD(&sma->sems[i].pending_const); spin_lock_init(&sma->sems[i].lock); } sma->complex_count = 0; sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS; INIT_LIST_HEAD(&sma->pending_alter); INIT_LIST_HEAD(&sma->pending_const); INIT_LIST_HEAD(&sma->list_id); sma->sem_nsems = nsems; sma->sem_ctime = ktime_get_real_seconds(); /* ipc_addid() locks sma upon success. */ retval = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni); if (retval < 0) { ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); return retval; } ns->used_sems += nsems; sem_unlock(sma, -1); rcu_read_unlock(); return sma->sem_perm.id; } /* * Called with sem_ids.rwsem and ipcp locked. */ static int sem_more_checks(struct kern_ipc_perm *ipcp, struct ipc_params *params) { struct sem_array *sma; sma = container_of(ipcp, struct sem_array, sem_perm); if (params->u.nsems > sma->sem_nsems) return -EINVAL; return 0; } long ksys_semget(key_t key, int nsems, int semflg) { struct ipc_namespace *ns; static const struct ipc_ops sem_ops = { .getnew = newary, .associate = security_sem_associate, .more_checks = sem_more_checks, }; struct ipc_params sem_params; ns = current->nsproxy->ipc_ns; if (nsems < 0 || nsems > ns->sc_semmsl) return -EINVAL; sem_params.key = key; sem_params.flg = semflg; sem_params.u.nsems = nsems; return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params); } SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg) { return ksys_semget(key, nsems, semflg); } /** * perform_atomic_semop[_slow] - Attempt to perform semaphore * operations on a given array. * @sma: semaphore array * @q: struct sem_queue that describes the operation * * Caller blocking are as follows, based the value * indicated by the semaphore operation (sem_op): * * (1) >0 never blocks. * (2) 0 (wait-for-zero operation): semval is non-zero. * (3) <0 attempting to decrement semval to a value smaller than zero. * * Returns 0 if the operation was possible. * Returns 1 if the operation is impossible, the caller must sleep. * Returns <0 for error codes. */ static int perform_atomic_semop_slow(struct sem_array *sma, struct sem_queue *q) { int result, sem_op, nsops; struct pid *pid; struct sembuf *sop; struct sem *curr; struct sembuf *sops; struct sem_undo *un; sops = q->sops; nsops = q->nsops; un = q->undo; for (sop = sops; sop < sops + nsops; sop++) { int idx = array_index_nospec(sop->sem_num, sma->sem_nsems); curr = &sma->sems[idx]; sem_op = sop->sem_op; result = curr->semval; if (!sem_op && result) goto would_block; result += sem_op; if (result < 0) goto would_block; if (result > SEMVMX) goto out_of_range; if (sop->sem_flg & SEM_UNDO) { int undo = un->semadj[sop->sem_num] - sem_op; /* Exceeding the undo range is an error. */ if (undo < (-SEMAEM - 1) || undo > SEMAEM) goto out_of_range; un->semadj[sop->sem_num] = undo; } curr->semval = result; } sop--; pid = q->pid; while (sop >= sops) { ipc_update_pid(&sma->sems[sop->sem_num].sempid, pid); sop--; } return 0; out_of_range: result = -ERANGE; goto undo; would_block: q->blocking = sop; if (sop->sem_flg & IPC_NOWAIT) result = -EAGAIN; else result = 1; undo: sop--; while (sop >= sops) { sem_op = sop->sem_op; sma->sems[sop->sem_num].semval -= sem_op; if (sop->sem_flg & SEM_UNDO) un->semadj[sop->sem_num] += sem_op; sop--; } return result; } static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q) { int result, sem_op, nsops; struct sembuf *sop; struct sem *curr; struct sembuf *sops; struct sem_undo *un; sops = q->sops; nsops = q->nsops; un = q->undo; if (unlikely(q->dupsop)) return perform_atomic_semop_slow(sma, q); /* * We scan the semaphore set twice, first to ensure that the entire * operation can succeed, therefore avoiding any pointless writes * to shared memory and having to undo such changes in order to block * until the operations can go through. */ for (sop = sops; sop < sops + nsops; sop++) { int idx = array_index_nospec(sop->sem_num, sma->sem_nsems); curr = &sma->sems[idx]; sem_op = sop->sem_op; result = curr->semval; if (!sem_op && result) goto would_block; /* wait-for-zero */ result += sem_op; if (result < 0) goto would_block; if (result > SEMVMX) return -ERANGE; if (sop->sem_flg & SEM_UNDO) { int undo = un->semadj[sop->sem_num] - sem_op; /* Exceeding the undo range is an error. */ if (undo < (-SEMAEM - 1) || undo > SEMAEM) return -ERANGE; } } for (sop = sops; sop < sops + nsops; sop++) { curr = &sma->sems[sop->sem_num]; sem_op = sop->sem_op; if (sop->sem_flg & SEM_UNDO) { int undo = un->semadj[sop->sem_num] - sem_op; un->semadj[sop->sem_num] = undo; } curr->semval += sem_op; ipc_update_pid(&curr->sempid, q->pid); } return 0; would_block: q->blocking = sop; return sop->sem_flg & IPC_NOWAIT ? -EAGAIN : 1; } static inline void wake_up_sem_queue_prepare(struct sem_queue *q, int error, struct wake_q_head *wake_q) { struct task_struct *sleeper; sleeper = get_task_struct(q->sleeper); /* see SEM_BARRIER_2 for purpose/pairing */ smp_store_release(&q->status, error); wake_q_add_safe(wake_q, sleeper); } static void unlink_queue(struct sem_array *sma, struct sem_queue *q) { list_del(&q->list); if (q->nsops > 1) sma->complex_count--; } /** check_restart(sma, q) * @sma: semaphore array * @q: the operation that just completed * * update_queue is O(N^2) when it restarts scanning the whole queue of * waiting operations. Therefore this function checks if the restart is * really necessary. It is called after a previously waiting operation * modified the array. * Note that wait-for-zero operations are handled without restart. */ static inline int check_restart(struct sem_array *sma, struct sem_queue *q) { /* pending complex alter operations are too difficult to analyse */ if (!list_empty(&sma->pending_alter)) return 1; /* we were a sleeping complex operation. Too difficult */ if (q->nsops > 1) return 1; /* It is impossible that someone waits for the new value: * - complex operations always restart. * - wait-for-zero are handled separately. * - q is a previously sleeping simple operation that * altered the array. It must be a decrement, because * simple increments never sleep. * - If there are older (higher priority) decrements * in the queue, then they have observed the original * semval value and couldn't proceed. The operation * decremented to value - thus they won't proceed either. */ return 0; } /** * wake_const_ops - wake up non-alter tasks * @sma: semaphore array. * @semnum: semaphore that was modified. * @wake_q: lockless wake-queue head. * * wake_const_ops must be called after a semaphore in a semaphore array * was set to 0. If complex const operations are pending, wake_const_ops must * be called with semnum = -1, as well as with the number of each modified * semaphore. * The tasks that must be woken up are added to @wake_q. The return code * is stored in q->pid. * The function returns 1 if at least one operation was completed successfully. */ static int wake_const_ops(struct sem_array *sma, int semnum, struct wake_q_head *wake_q) { struct sem_queue *q, *tmp; struct list_head *pending_list; int semop_completed = 0; if (semnum == -1) pending_list = &sma->pending_const; else pending_list = &sma->sems[semnum].pending_const; list_for_each_entry_safe(q, tmp, pending_list, list) { int error = perform_atomic_semop(sma, q); if (error > 0) continue; /* operation completed, remove from queue & wakeup */ unlink_queue(sma, q); wake_up_sem_queue_prepare(q, error, wake_q); if (error == 0) semop_completed = 1; } return semop_completed; } /** * do_smart_wakeup_zero - wakeup all wait for zero tasks * @sma: semaphore array * @sops: operations that were performed * @nsops: number of operations * @wake_q: lockless wake-queue head * * Checks all required queue for wait-for-zero operations, based * on the actual changes that were performed on the semaphore array. * The function returns 1 if at least one operation was completed successfully. */ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, int nsops, struct wake_q_head *wake_q) { int i; int semop_completed = 0; int got_zero = 0; /* first: the per-semaphore queues, if known */ if (sops) { for (i = 0; i < nsops; i++) { int num = sops[i].sem_num; if (sma->sems[num].semval == 0) { got_zero = 1; semop_completed |= wake_const_ops(sma, num, wake_q); } } } else { /* * No sops means modified semaphores not known. * Assume all were changed. */ for (i = 0; i < sma->sem_nsems; i++) { if (sma->sems[i].semval == 0) { got_zero = 1; semop_completed |= wake_const_ops(sma, i, wake_q); } } } /* * If one of the modified semaphores got 0, * then check the global queue, too. */ if (got_zero) semop_completed |= wake_const_ops(sma, -1, wake_q); return semop_completed; } /** * update_queue - look for tasks that can be completed. * @sma: semaphore array. * @semnum: semaphore that was modified. * @wake_q: lockless wake-queue head. * * update_queue must be called after a semaphore in a semaphore array * was modified. If multiple semaphores were modified, update_queue must * be called with semnum = -1, as well as with the number of each modified * semaphore. * The tasks that must be woken up are added to @wake_q. The return code * is stored in q->pid. * The function internally checks if const operations can now succeed. * * The function return 1 if at least one semop was completed successfully. */ static int update_queue(struct sem_array *sma, int semnum, struct wake_q_head *wake_q) { struct sem_queue *q, *tmp; struct list_head *pending_list; int semop_completed = 0; if (semnum == -1) pending_list = &sma->pending_alter; else pending_list = &sma->sems[semnum].pending_alter; again: list_for_each_entry_safe(q, tmp, pending_list, list) { int error, restart; /* If we are scanning the single sop, per-semaphore list of * one semaphore and that semaphore is 0, then it is not * necessary to scan further: simple increments * that affect only one entry succeed immediately and cannot * be in the per semaphore pending queue, and decrements * cannot be successful if the value is already 0. */ if (semnum != -1 && sma->sems[semnum].semval == 0) break; error = perform_atomic_semop(sma, q); /* Does q->sleeper still need to sleep? */ if (error > 0) continue; unlink_queue(sma, q); if (error) { restart = 0; } else { semop_completed = 1; do_smart_wakeup_zero(sma, q->sops, q->nsops, wake_q); restart = check_restart(sma, q); } wake_up_sem_queue_prepare(q, error, wake_q); if (restart) goto again; } return semop_completed; } /** * set_semotime - set sem_otime * @sma: semaphore array * @sops: operations that modified the array, may be NULL * * sem_otime is replicated to avoid cache line trashing. * This function sets one instance to the current time. */ static void set_semotime(struct sem_array *sma, struct sembuf *sops) { if (sops == NULL) { sma->sems[0].sem_otime = ktime_get_real_seconds(); } else { sma->sems[sops[0].sem_num].sem_otime = ktime_get_real_seconds(); } } /** * do_smart_update - optimized update_queue * @sma: semaphore array * @sops: operations that were performed * @nsops: number of operations * @otime: force setting otime * @wake_q: lockless wake-queue head * * do_smart_update() does the required calls to update_queue and wakeup_zero, * based on the actual changes that were performed on the semaphore array. * Note that the function does not do the actual wake-up: the caller is * responsible for calling wake_up_q(). * It is safe to perform this call after dropping all locks. */ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops, int otime, struct wake_q_head *wake_q) { int i; otime |= do_smart_wakeup_zero(sma, sops, nsops, wake_q); if (!list_empty(&sma->pending_alter)) { /* semaphore array uses the global queue - just process it. */ otime |= update_queue(sma, -1, wake_q); } else { if (!sops) { /* * No sops, thus the modified semaphores are not * known. Check all. */ for (i = 0; i < sma->sem_nsems; i++) otime |= update_queue(sma, i, wake_q); } else { /* * Check the semaphores that were increased: * - No complex ops, thus all sleeping ops are * decrease. * - if we decreased the value, then any sleeping * semaphore ops won't be able to run: If the * previous value was too small, then the new * value will be too small, too. */ for (i = 0; i < nsops; i++) { if (sops[i].sem_op > 0) { otime |= update_queue(sma, sops[i].sem_num, wake_q); } } } } if (otime) set_semotime(sma, sops); } /* * check_qop: Test if a queued operation sleeps on the semaphore semnum */ static int check_qop(struct sem_array *sma, int semnum, struct sem_queue *q, bool count_zero) { struct sembuf *sop = q->blocking; /* * Linux always (since 0.99.10) reported a task as sleeping on all * semaphores. This violates SUS, therefore it was changed to the * standard compliant behavior. * Give the administrators a chance to notice that an application * might misbehave because it relies on the Linux behavior. */ pr_info_once("semctl(GETNCNT/GETZCNT) is since 3.16 Single Unix Specification compliant.\n" "The task %s (%d) triggered the difference, watch for misbehavior.\n", current->comm, task_pid_nr(current)); if (sop->sem_num != semnum) return 0; if (count_zero && sop->sem_op == 0) return 1; if (!count_zero && sop->sem_op < 0) return 1; return 0; } /* The following counts are associated to each semaphore: * semncnt number of tasks waiting on semval being nonzero * semzcnt number of tasks waiting on semval being zero * * Per definition, a task waits only on the semaphore of the first semop * that cannot proceed, even if additional operation would block, too. */ static int count_semcnt(struct sem_array *sma, ushort semnum, bool count_zero) { struct list_head *l; struct sem_queue *q; int semcnt; semcnt = 0; /* First: check the simple operations. They are easy to evaluate */ if (count_zero) l = &sma->sems[semnum].pending_const; else l = &sma->sems[semnum].pending_alter; list_for_each_entry(q, l, list) { /* all task on a per-semaphore list sleep on exactly * that semaphore */ semcnt++; } /* Then: check the complex operations. */ list_for_each_entry(q, &sma->pending_alter, list) { semcnt += check_qop(sma, semnum, q, count_zero); } if (count_zero) { list_for_each_entry(q, &sma->pending_const, list) { semcnt += check_qop(sma, semnum, q, count_zero); } } return semcnt; } /* Free a semaphore set. freeary() is called with sem_ids.rwsem locked * as a writer and the spinlock for this semaphore set hold. sem_ids.rwsem * remains locked on exit. */ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) { struct sem_undo *un, *tu; struct sem_queue *q, *tq; struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm); int i; DEFINE_WAKE_Q(wake_q); /* Free the existing undo structures for this semaphore set. */ ipc_assert_locked_object(&sma->sem_perm); list_for_each_entry_safe(un, tu, &sma->list_id, list_id) { list_del(&un->list_id); spin_lock(&un->ulp->lock); un->semid = -1; list_del_rcu(&un->list_proc); spin_unlock(&un->ulp->lock); kvfree_rcu(un, rcu); } /* Wake up all pending processes and let them fail with EIDRM. */ list_for_each_entry_safe(q, tq, &sma->pending_const, list) { unlink_queue(sma, q); wake_up_sem_queue_prepare(q, -EIDRM, &wake_q); } list_for_each_entry_safe(q, tq, &sma->pending_alter, list) { unlink_queue(sma, q); wake_up_sem_queue_prepare(q, -EIDRM, &wake_q); } for (i = 0; i < sma->sem_nsems; i++) { struct sem *sem = &sma->sems[i]; list_for_each_entry_safe(q, tq, &sem->pending_const, list) { unlink_queue(sma, q); wake_up_sem_queue_prepare(q, -EIDRM, &wake_q); } list_for_each_entry_safe(q, tq, &sem->pending_alter, list) { unlink_queue(sma, q); wake_up_sem_queue_prepare(q, -EIDRM, &wake_q); } ipc_update_pid(&sem->sempid, NULL); } /* Remove the semaphore set from the IDR */ sem_rmid(ns, sma); sem_unlock(sma, -1); rcu_read_unlock(); wake_up_q(&wake_q); ns->used_sems -= sma->sem_nsems; ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); } static unsigned long copy_semid_to_user(void __user *buf, struct semid64_ds *in, int version) { switch (version) { case IPC_64: return copy_to_user(buf, in, sizeof(*in)); case IPC_OLD: { struct semid_ds out; memset(&out, 0, sizeof(out)); ipc64_perm_to_ipc_perm(&in->sem_perm, &out.sem_perm); out.sem_otime = in->sem_otime; out.sem_ctime = in->sem_ctime; out.sem_nsems = in->sem_nsems; return copy_to_user(buf, &out, sizeof(out)); } default: return -EINVAL; } } static time64_t get_semotime(struct sem_array *sma) { int i; time64_t res; res = sma->sems[0].sem_otime; for (i = 1; i < sma->sem_nsems; i++) { time64_t to = sma->sems[i].sem_otime; if (to > res) res = to; } return res; } static int semctl_stat(struct ipc_namespace *ns, int semid, int cmd, struct semid64_ds *semid64) { struct sem_array *sma; time64_t semotime; int err; memset(semid64, 0, sizeof(*semid64)); rcu_read_lock(); if (cmd == SEM_STAT || cmd == SEM_STAT_ANY) { sma = sem_obtain_object(ns, semid); if (IS_ERR(sma)) { err = PTR_ERR(sma); goto out_unlock; } } else { /* IPC_STAT */ sma = sem_obtain_object_check(ns, semid); if (IS_ERR(sma)) { err = PTR_ERR(sma); goto out_unlock; } } /* see comment for SHM_STAT_ANY */ if (cmd == SEM_STAT_ANY) audit_ipc_obj(&sma->sem_perm); else { err = -EACCES; if (ipcperms(ns, &sma->sem_perm, S_IRUGO)) goto out_unlock; } err = security_sem_semctl(&sma->sem_perm, cmd); if (err) goto out_unlock; ipc_lock_object(&sma->sem_perm); if (!ipc_valid_object(&sma->sem_perm)) { ipc_unlock_object(&sma->sem_perm); err = -EIDRM; goto out_unlock; } kernel_to_ipc64_perm(&sma->sem_perm, &semid64->sem_perm); semotime = get_semotime(sma); semid64->sem_otime = semotime; semid64->sem_ctime = sma->sem_ctime; #ifndef CONFIG_64BIT semid64->sem_otime_high = semotime >> 32; semid64->sem_ctime_high = sma->sem_ctime >> 32; #endif semid64->sem_nsems = sma->sem_nsems; if (cmd == IPC_STAT) { /* * As defined in SUS: * Return 0 on success */ err = 0; } else { /* * SEM_STAT and SEM_STAT_ANY (both Linux specific) * Return the full id, including the sequence number */ err = sma->sem_perm.id; } ipc_unlock_object(&sma->sem_perm); out_unlock: rcu_read_unlock(); return err; } static int semctl_info(struct ipc_namespace *ns, int semid, int cmd, void __user *p) { struct seminfo seminfo; int max_idx; int err; err = security_sem_semctl(NULL, cmd); if (err) return err; memset(&seminfo, 0, sizeof(seminfo)); seminfo.semmni = ns->sc_semmni; seminfo.semmns = ns->sc_semmns; seminfo.semmsl = ns->sc_semmsl; seminfo.semopm = ns->sc_semopm; seminfo.semvmx = SEMVMX; seminfo.semmnu = SEMMNU; seminfo.semmap = SEMMAP; seminfo.semume = SEMUME; down_read(&sem_ids(ns).rwsem); if (cmd == SEM_INFO) { seminfo.semusz = sem_ids(ns).in_use; seminfo.semaem = ns->used_sems; } else { seminfo.semusz = SEMUSZ; seminfo.semaem = SEMAEM; } max_idx = ipc_get_maxidx(&sem_ids(ns)); up_read(&sem_ids(ns).rwsem); if (copy_to_user(p, &seminfo, sizeof(struct seminfo))) return -EFAULT; return (max_idx < 0) ? 0 : max_idx; } static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum, int val) { struct sem_undo *un; struct sem_array *sma; struct sem *curr; int err; DEFINE_WAKE_Q(wake_q); if (val > SEMVMX || val < 0) return -ERANGE; rcu_read_lock(); sma = sem_obtain_object_check(ns, semid); if (IS_ERR(sma)) { rcu_read_unlock(); return PTR_ERR(sma); } if (semnum < 0 || semnum >= sma->sem_nsems) { rcu_read_unlock(); return -EINVAL; } if (ipcperms(ns, &sma->sem_perm, S_IWUGO)) { rcu_read_unlock(); return -EACCES; } err = security_sem_semctl(&sma->sem_perm, SETVAL); if (err) { rcu_read_unlock(); return -EACCES; } sem_lock(sma, NULL, -1); if (!ipc_valid_object(&sma->sem_perm)) { sem_unlock(sma, -1); rcu_read_unlock(); return -EIDRM; } semnum = array_index_nospec(semnum, sma->sem_nsems); curr = &sma->sems[semnum]; ipc_assert_locked_object(&sma->sem_perm); list_for_each_entry(un, &sma->list_id, list_id) un->semadj[semnum] = 0; curr->semval = val; ipc_update_pid(&curr->sempid, task_tgid(current)); sma->sem_ctime = ktime_get_real_seconds(); /* maybe some queued-up processes were waiting for this */ do_smart_update(sma, NULL, 0, 0, &wake_q); sem_unlock(sma, -1); rcu_read_unlock(); wake_up_q(&wake_q); return 0; } static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, int cmd, void __user *p) { struct sem_array *sma; struct sem *curr; int err, nsems; ushort fast_sem_io[SEMMSL_FAST]; ushort *sem_io = fast_sem_io; DEFINE_WAKE_Q(wake_q); rcu_read_lock(); sma = sem_obtain_object_check(ns, semid); if (IS_ERR(sma)) { rcu_read_unlock(); return PTR_ERR(sma); } nsems = sma->sem_nsems; err = -EACCES; if (ipcperms(ns, &sma->sem_perm, cmd == SETALL ? S_IWUGO : S_IRUGO)) goto out_rcu_wakeup; err = security_sem_semctl(&sma->sem_perm, cmd); if (err) goto out_rcu_wakeup; switch (cmd) { case GETALL: { ushort __user *array = p; int i; sem_lock(sma, NULL, -1); if (!ipc_valid_object(&sma->sem_perm)) { err = -EIDRM; goto out_unlock; } if (nsems > SEMMSL_FAST) { if (!ipc_rcu_getref(&sma->sem_perm)) { err = -EIDRM; goto out_unlock; } sem_unlock(sma, -1); rcu_read_unlock(); sem_io = kvmalloc_array(nsems, sizeof(ushort), GFP_KERNEL); if (sem_io == NULL) { ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); return -ENOMEM; } rcu_read_lock(); sem_lock_and_putref(sma); if (!ipc_valid_object(&sma->sem_perm)) { err = -EIDRM; goto out_unlock; } } for (i = 0; i < sma->sem_nsems; i++) sem_io[i] = sma->sems[i].semval; sem_unlock(sma, -1); rcu_read_unlock(); err = 0; if (copy_to_user(array, sem_io, nsems*sizeof(ushort))) err = -EFAULT; goto out_free; } case SETALL: { int i; struct sem_undo *un; if (!ipc_rcu_getref(&sma->sem_perm)) { err = -EIDRM; goto out_rcu_wakeup; } rcu_read_unlock(); if (nsems > SEMMSL_FAST) { sem_io = kvmalloc_array(nsems, sizeof(ushort), GFP_KERNEL); if (sem_io == NULL) { ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); return -ENOMEM; } } if (copy_from_user(sem_io, p, nsems*sizeof(ushort))) { ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); err = -EFAULT; goto out_free; } for (i = 0; i < nsems; i++) { if (sem_io[i] > SEMVMX) { ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); err = -ERANGE; goto out_free; } } rcu_read_lock(); sem_lock_and_putref(sma); if (!ipc_valid_object(&sma->sem_perm)) { err = -EIDRM; goto out_unlock; } for (i = 0; i < nsems; i++) { sma->sems[i].semval = sem_io[i]; ipc_update_pid(&sma->sems[i].sempid, task_tgid(current)); } ipc_assert_locked_object(&sma->sem_perm); list_for_each_entry(un, &sma->list_id, list_id) { for (i = 0; i < nsems; i++) un->semadj[i] = 0; } sma->sem_ctime = ktime_get_real_seconds(); /* maybe some queued-up processes were waiting for this */ do_smart_update(sma, NULL, 0, 0, &wake_q); err = 0; goto out_unlock; } /* GETVAL, GETPID, GETNCTN, GETZCNT: fall-through */ } err = -EINVAL; if (semnum < 0 || semnum >= nsems) goto out_rcu_wakeup; sem_lock(sma, NULL, -1); if (!ipc_valid_object(&sma->sem_perm)) { err = -EIDRM; goto out_unlock; } semnum = array_index_nospec(semnum, nsems); curr = &sma->sems[semnum]; switch (cmd) { case GETVAL: err = curr->semval; goto out_unlock; case GETPID: err = pid_vnr(curr->sempid); goto out_unlock; case GETNCNT: err = count_semcnt(sma, semnum, 0); goto out_unlock; case GETZCNT: err = count_semcnt(sma, semnum, 1); goto out_unlock; } out_unlock: sem_unlock(sma, -1); out_rcu_wakeup: rcu_read_unlock(); wake_up_q(&wake_q); out_free: if (sem_io != fast_sem_io) kvfree(sem_io); return err; } static inline unsigned long copy_semid_from_user(struct semid64_ds *out, void __user *buf, int version) { switch (version) { case IPC_64: if (copy_from_user(out, buf, sizeof(*out))) return -EFAULT; return 0; case IPC_OLD: { struct semid_ds tbuf_old; if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old))) return -EFAULT; out->sem_perm.uid = tbuf_old.sem_perm.uid; out->sem_perm.gid = tbuf_old.sem_perm.gid; out->sem_perm.mode = tbuf_old.sem_perm.mode; return 0; } default: return -EINVAL; } } /* * This function handles some semctl commands which require the rwsem * to be held in write mode. * NOTE: no locks must be held, the rwsem is taken inside this function. */ static int semctl_down(struct ipc_namespace *ns, int semid, int cmd, struct semid64_ds *semid64) { struct sem_array *sma; int err; struct kern_ipc_perm *ipcp; down_write(&sem_ids(ns).rwsem); rcu_read_lock(); ipcp = ipcctl_obtain_check(ns, &sem_ids(ns), semid, cmd, &semid64->sem_perm, 0); if (IS_ERR(ipcp)) { err = PTR_ERR(ipcp); goto out_unlock1; } sma = container_of(ipcp, struct sem_array, sem_perm); err = security_sem_semctl(&sma->sem_perm, cmd); if (err) goto out_unlock1; switch (cmd) { case IPC_RMID: sem_lock(sma, NULL, -1); /* freeary unlocks the ipc object and rcu */ freeary(ns, ipcp); goto out_up; case IPC_SET: sem_lock(sma, NULL, -1); err = ipc_update_perm(&semid64->sem_perm, ipcp); if (err) goto out_unlock0; sma->sem_ctime = ktime_get_real_seconds(); break; default: err = -EINVAL; goto out_unlock1; } out_unlock0: sem_unlock(sma, -1); out_unlock1: rcu_read_unlock(); out_up: up_write(&sem_ids(ns).rwsem); return err; } static long ksys_semctl(int semid, int semnum, int cmd, unsigned long arg, int version) { struct ipc_namespace *ns; void __user *p = (void __user *)arg; struct semid64_ds semid64; int err; if (semid < 0) return -EINVAL; ns = current->nsproxy->ipc_ns; switch (cmd) { case IPC_INFO: case SEM_INFO: return semctl_info(ns, semid, cmd, p); case IPC_STAT: case SEM_STAT: case SEM_STAT_ANY: err = semctl_stat(ns, semid, cmd, &semid64); if (err < 0) return err; if (copy_semid_to_user(p, &semid64, version)) err = -EFAULT; return err; case GETALL: case GETVAL: case GETPID: case GETNCNT: case GETZCNT: case SETALL: return semctl_main(ns, semid, semnum, cmd, p); case SETVAL: { int val; #if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN) /* big-endian 64bit */ val = arg >> 32; #else /* 32bit or little-endian 64bit */ val = arg; #endif return semctl_setval(ns, semid, semnum, val); } case IPC_SET: if (copy_semid_from_user(&semid64, p, version)) return -EFAULT; fallthrough; case IPC_RMID: return semctl_down(ns, semid, cmd, &semid64); default: return -EINVAL; } } SYSCALL_DEFINE4(semctl, int, semid, int, semnum, int, cmd, unsigned long, arg) { return ksys_semctl(semid, semnum, cmd, arg, IPC_64); } #ifdef CONFIG_ARCH_WANT_IPC_PARSE_VERSION long ksys_old_semctl(int semid, int semnum, int cmd, unsigned long arg) { int version = ipc_parse_version(&cmd); return ksys_semctl(semid, semnum, cmd, arg, version); } SYSCALL_DEFINE4(old_semctl, int, semid, int, semnum, int, cmd, unsigned long, arg) { return ksys_old_semctl(semid, semnum, cmd, arg); } #endif #ifdef CONFIG_COMPAT struct compat_semid_ds { struct compat_ipc_perm sem_perm; old_time32_t sem_otime; old_time32_t sem_ctime; compat_uptr_t sem_base; compat_uptr_t sem_pending; compat_uptr_t sem_pending_last; compat_uptr_t undo; unsigned short sem_nsems; }; static int copy_compat_semid_from_user(struct semid64_ds *out, void __user *buf, int version) { memset(out, 0, sizeof(*out)); if (version == IPC_64) { struct compat_semid64_ds __user *p = buf; return get_compat_ipc64_perm(&out->sem_perm, &p->sem_perm); } else { struct compat_semid_ds __user *p = buf; return get_compat_ipc_perm(&out->sem_perm, &p->sem_perm); } } static int copy_compat_semid_to_user(void __user *buf, struct semid64_ds *in, int version) { if (version == IPC_64) { struct compat_semid64_ds v; memset(&v, 0, sizeof(v)); to_compat_ipc64_perm(&v.sem_perm, &in->sem_perm); v.sem_otime = lower_32_bits(in->sem_otime); v.sem_otime_high = upper_32_bits(in->sem_otime); v.sem_ctime = lower_32_bits(in->sem_ctime); v.sem_ctime_high = upper_32_bits(in->sem_ctime); v.sem_nsems = in->sem_nsems; return copy_to_user(buf, &v, sizeof(v)); } else { struct compat_semid_ds v; memset(&v, 0, sizeof(v)); to_compat_ipc_perm(&v.sem_perm, &in->sem_perm); v.sem_otime = in->sem_otime; v.sem_ctime = in->sem_ctime; v.sem_nsems = in->sem_nsems; return copy_to_user(buf, &v, sizeof(v)); } } static long compat_ksys_semctl(int semid, int semnum, int cmd, int arg, int version) { void __user *p = compat_ptr(arg); struct ipc_namespace *ns; struct semid64_ds semid64; int err; ns = current->nsproxy->ipc_ns; if (semid < 0) return -EINVAL; switch (cmd & (~IPC_64)) { case IPC_INFO: case SEM_INFO: return semctl_info(ns, semid, cmd, p); case IPC_STAT: case SEM_STAT: case SEM_STAT_ANY: err = semctl_stat(ns, semid, cmd, &semid64); if (err < 0) return err; if (copy_compat_semid_to_user(p, &semid64, version)) err = -EFAULT; return err; case GETVAL: case GETPID: case GETNCNT: case GETZCNT: case GETALL: case SETALL: return semctl_main(ns, semid, semnum, cmd, p); case SETVAL: return semctl_setval(ns, semid, semnum, arg); case IPC_SET: if (copy_compat_semid_from_user(&semid64, p, version)) return -EFAULT; fallthrough; case IPC_RMID: return semctl_down(ns, semid, cmd, &semid64); default: return -EINVAL; } } COMPAT_SYSCALL_DEFINE4(semctl, int, semid, int, semnum, int, cmd, int, arg) { return compat_ksys_semctl(semid, semnum, cmd, arg, IPC_64); } #ifdef CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION long compat_ksys_old_semctl(int semid, int semnum, int cmd, int arg) { int version = compat_ipc_parse_version(&cmd); return compat_ksys_semctl(semid, semnum, cmd, arg, version); } COMPAT_SYSCALL_DEFINE4(old_semctl, int, semid, int, semnum, int, cmd, int, arg) { return compat_ksys_old_semctl(semid, semnum, cmd, arg); } #endif #endif /* If the task doesn't already have a undo_list, then allocate one * here. We guarantee there is only one thread using this undo list, * and current is THE ONE * * If this allocation and assignment succeeds, but later * portions of this code fail, there is no need to free the sem_undo_list. * Just let it stay associated with the task, and it'll be freed later * at exit time. * * This can block, so callers must hold no locks. */ static inline int get_undo_list(struct sem_undo_list **undo_listp) { struct sem_undo_list *undo_list; undo_list = current->sysvsem.undo_list; if (!undo_list) { undo_list = kzalloc_obj(*undo_list, GFP_KERNEL_ACCOUNT); if (undo_list == NULL) return -ENOMEM; spin_lock_init(&undo_list->lock); refcount_set(&undo_list->refcnt, 1); INIT_LIST_HEAD(&undo_list->list_proc); current->sysvsem.undo_list = undo_list; } *undo_listp = undo_list; return 0; } static struct sem_undo *__lookup_undo(struct sem_undo_list *ulp, int semid) { struct sem_undo *un; list_for_each_entry_rcu(un, &ulp->list_proc, list_proc, spin_is_locked(&ulp->lock)) { if (un->semid == semid) return un; } return NULL; } static struct sem_undo *lookup_undo(struct sem_undo_list *ulp, int semid) { struct sem_undo *un; assert_spin_locked(&ulp->lock); un = __lookup_undo(ulp, semid); if (un) { list_del_rcu(&un->list_proc); list_add_rcu(&un->list_proc, &ulp->list_proc); } return un; } /** * find_alloc_undo - lookup (and if not present create) undo array * @ns: namespace * @semid: semaphore array id * * The function looks up (and if not present creates) the undo structure. * The size of the undo structure depends on the size of the semaphore * array, thus the alloc path is not that straightforward. * Lifetime-rules: sem_undo is rcu-protected, on success, the function * performs a rcu_read_lock(). */ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) { struct sem_array *sma; struct sem_undo_list *ulp; struct sem_undo *un, *new; int nsems, error; error = get_undo_list(&ulp); if (error) return ERR_PTR(error); rcu_read_lock(); spin_lock(&ulp->lock); un = lookup_undo(ulp, semid); spin_unlock(&ulp->lock); if (likely(un != NULL)) goto out; /* no undo structure around - allocate one. */ /* step 1: figure out the size of the semaphore array */ sma = sem_obtain_object_check(ns, semid); if (IS_ERR(sma)) { rcu_read_unlock(); return ERR_CAST(sma); } nsems = sma->sem_nsems; if (!ipc_rcu_getref(&sma->sem_perm)) { rcu_read_unlock(); un = ERR_PTR(-EIDRM); goto out; } rcu_read_unlock(); /* step 2: allocate new undo structure */ new = kvzalloc_flex(*new, semadj, nsems, GFP_KERNEL_ACCOUNT); if (!new) { ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); return ERR_PTR(-ENOMEM); } /* step 3: Acquire the lock on semaphore array */ rcu_read_lock(); sem_lock_and_putref(sma); if (!ipc_valid_object(&sma->sem_perm)) { sem_unlock(sma, -1); rcu_read_unlock(); kvfree(new); un = ERR_PTR(-EIDRM); goto out; } spin_lock(&ulp->lock); /* * step 4: check for races: did someone else allocate the undo struct? */ un = lookup_undo(ulp, semid); if (un) { spin_unlock(&ulp->lock); kvfree(new); goto success; } /* step 5: initialize & link new undo structure */ new->ulp = ulp; new->semid = semid; assert_spin_locked(&ulp->lock); list_add_rcu(&new->list_proc, &ulp->list_proc); ipc_assert_locked_object(&sma->sem_perm); list_add(&new->list_id, &sma->list_id); un = new; spin_unlock(&ulp->lock); success: sem_unlock(sma, -1); out: return un; } long __do_semtimedop(int semid, struct sembuf *sops, unsigned nsops, const struct timespec64 *timeout, struct ipc_namespace *ns) { int error = -EINVAL; struct sem_array *sma; struct sembuf *sop; struct sem_undo *un; int max, locknum; bool undos = false, alter = false, dupsop = false; struct sem_queue queue; unsigned long dup = 0; ktime_t expires, *exp = NULL; bool timed_out = false; if (nsops < 1 || semid < 0) return -EINVAL; if (nsops > ns->sc_semopm) return -E2BIG; if (timeout) { if (!timespec64_valid(timeout)) return -EINVAL; expires = ktime_add_safe(ktime_get(), timespec64_to_ktime(*timeout)); exp = &expires; } max = 0; for (sop = sops; sop < sops + nsops; sop++) { unsigned long mask = 1ULL << ((sop->sem_num) % BITS_PER_LONG); if (sop->sem_num >= max) max = sop->sem_num; if (sop->sem_flg & SEM_UNDO) undos = true; if (dup & mask) { /* * There was a previous alter access that appears * to have accessed the same semaphore, thus use * the dupsop logic. "appears", because the detection * can only check % BITS_PER_LONG. */ dupsop = true; } if (sop->sem_op != 0) { alter = true; dup |= mask; } } if (undos) { /* On success, find_alloc_undo takes the rcu_read_lock */ un = find_alloc_undo(ns, semid); if (IS_ERR(un)) { error = PTR_ERR(un); goto out; } } else { un = NULL; rcu_read_lock(); } sma = sem_obtain_object_check(ns, semid); if (IS_ERR(sma)) { rcu_read_unlock(); error = PTR_ERR(sma); goto out; } error = -EFBIG; if (max >= sma->sem_nsems) { rcu_read_unlock(); goto out; } error = -EACCES; if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO)) { rcu_read_unlock(); goto out; } error = security_sem_semop(&sma->sem_perm, sops, nsops, alter); if (error) { rcu_read_unlock(); goto out; } error = -EIDRM; locknum = sem_lock(sma, sops, nsops); /* * We eventually might perform the following check in a lockless * fashion, considering ipc_valid_object() locking constraints. * If nsops == 1 and there is no contention for sem_perm.lock, then * only a per-semaphore lock is held and it's OK to proceed with the * check below. More details on the fine grained locking scheme * entangled here and why it's RMID race safe on comments at sem_lock() */ if (!ipc_valid_object(&sma->sem_perm)) goto out_unlock; /* * semid identifiers are not unique - find_alloc_undo may have * allocated an undo structure, it was invalidated by an RMID * and now a new array with received the same id. Check and fail. * This case can be detected checking un->semid. The existence of * "un" itself is guaranteed by rcu. */ if (un && un->semid == -1) goto out_unlock; queue.sops = sops; queue.nsops = nsops; queue.undo = un; queue.pid = task_tgid(current); queue.alter = alter; queue.dupsop = dupsop; error = perform_atomic_semop(sma, &queue); if (error == 0) { /* non-blocking successful path */ DEFINE_WAKE_Q(wake_q); /* * If the operation was successful, then do * the required updates. */ if (alter) do_smart_update(sma, sops, nsops, 1, &wake_q); else set_semotime(sma, sops); sem_unlock(sma, locknum); rcu_read_unlock(); wake_up_q(&wake_q); goto out; } if (error < 0) /* non-blocking error path */ goto out_unlock; /* * We need to sleep on this operation, so we put the current * task into the pending queue and go to sleep. */ if (nsops == 1) { struct sem *curr; int idx = array_index_nospec(sops->sem_num, sma->sem_nsems); curr = &sma->sems[idx]; if (alter) { if (sma->complex_count) { list_add_tail(&queue.list, &sma->pending_alter); } else { list_add_tail(&queue.list, &curr->pending_alter); } } else { list_add_tail(&queue.list, &curr->pending_const); } } else { if (!sma->complex_count) merge_queues(sma); if (alter) list_add_tail(&queue.list, &sma->pending_alter); else list_add_tail(&queue.list, &sma->pending_const); sma->complex_count++; } do { /* memory ordering ensured by the lock in sem_lock() */ WRITE_ONCE(queue.status, -EINTR); queue.sleeper = current; /* memory ordering is ensured by the lock in sem_lock() */ __set_current_state(TASK_INTERRUPTIBLE); sem_unlock(sma, locknum); rcu_read_unlock(); timed_out = !schedule_hrtimeout_range(exp, current->timer_slack_ns, HRTIMER_MODE_ABS); /* * fastpath: the semop has completed, either successfully or * not, from the syscall pov, is quite irrelevant to us at this * point; we're done. * * We _do_ care, nonetheless, about being awoken by a signal or * spuriously. The queue.status is checked again in the * slowpath (aka after taking sem_lock), such that we can detect * scenarios where we were awakened externally, during the * window between wake_q_add() and wake_up_q(). */ rcu_read_lock(); error = READ_ONCE(queue.status); if (error != -EINTR) { /* see SEM_BARRIER_2 for purpose/pairing */ smp_acquire__after_ctrl_dep(); rcu_read_unlock(); goto out; } locknum = sem_lock(sma, sops, nsops); if (!ipc_valid_object(&sma->sem_perm)) goto out_unlock; /* * No necessity for any barrier: We are protect by sem_lock() */ error = READ_ONCE(queue.status); /* * If queue.status != -EINTR we are woken up by another process. * Leave without unlink_queue(), but with sem_unlock(). */ if (error != -EINTR) goto out_unlock; /* * If an interrupt occurred we have to clean up the queue. */ if (timed_out) error = -EAGAIN; } while (error == -EINTR && !signal_pending(current)); /* spurious */ unlink_queue(sma, &queue); out_unlock: sem_unlock(sma, locknum); rcu_read_unlock(); out: return error; } static long do_semtimedop(int semid, struct sembuf __user *tsops, unsigned nsops, const struct timespec64 *timeout) { struct sembuf fast_sops[SEMOPM_FAST]; struct sembuf *sops = fast_sops; struct ipc_namespace *ns; int ret; ns = current->nsproxy->ipc_ns; if (nsops > ns->sc_semopm) return -E2BIG; if (nsops < 1) return -EINVAL; if (nsops > SEMOPM_FAST) { sops = kvmalloc_objs(*sops, nsops); if (sops == NULL) return -ENOMEM; } if (copy_from_user(sops, tsops, nsops * sizeof(*tsops))) { ret = -EFAULT; goto out_free; } ret = __do_semtimedop(semid, sops, nsops, timeout, ns); out_free: if (sops != fast_sops) kvfree(sops); return ret; } long ksys_semtimedop(int semid, struct sembuf __user *tsops, unsigned int nsops, const struct __kernel_timespec __user *timeout) { if (timeout) { struct timespec64 ts; if (get_timespec64(&ts, timeout)) return -EFAULT; return do_semtimedop(semid, tsops, nsops, &ts); } return do_semtimedop(semid, tsops, nsops, NULL); } SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, unsigned int, nsops, const struct __kernel_timespec __user *, timeout) { return ksys_semtimedop(semid, tsops, nsops, timeout); } #ifdef CONFIG_COMPAT_32BIT_TIME long compat_ksys_semtimedop(int semid, struct sembuf __user *tsems, unsigned int nsops, const struct old_timespec32 __user *timeout) { if (timeout) { struct timespec64 ts; if (get_old_timespec32(&ts, timeout)) return -EFAULT; return do_semtimedop(semid, tsems, nsops, &ts); } return do_semtimedop(semid, tsems, nsops, NULL); } SYSCALL_DEFINE4(semtimedop_time32, int, semid, struct sembuf __user *, tsems, unsigned int, nsops, const struct old_timespec32 __user *, timeout) { return compat_ksys_semtimedop(semid, tsems, nsops, timeout); } #endif SYSCALL_DEFINE3(semop, int, semid, struct sembuf __user *, tsops, unsigned, nsops) { return do_semtimedop(semid, tsops, nsops, NULL); } /* If CLONE_SYSVSEM is set, establish sharing of SEM_UNDO state between * parent and child tasks. */ int copy_semundo(u64 clone_flags, struct task_struct *tsk) { struct sem_undo_list *undo_list; int error; if (clone_flags & CLONE_SYSVSEM) { error = get_undo_list(&undo_list); if (error) return error; refcount_inc(&undo_list->refcnt); tsk->sysvsem.undo_list = undo_list; } else tsk->sysvsem.undo_list = NULL; return 0; } /* * add semadj values to semaphores, free undo structures. * undo structures are not freed when semaphore arrays are destroyed * so some of them may be out of date. * IMPLEMENTATION NOTE: There is some confusion over whether the * set of adjustments that needs to be done should be done in an atomic * manner or not. That is, if we are attempting to decrement the semval * should we queue up and wait until we can do so legally? * The original implementation attempted to do this (queue and wait). * The current implementation does not do so. The POSIX standard * and SVID should be consulted to determine what behavior is mandated. */ void exit_sem(struct task_struct *tsk) { struct sem_undo_list *ulp; ulp = tsk->sysvsem.undo_list; if (!ulp) return; tsk->sysvsem.undo_list = NULL; if (!refcount_dec_and_test(&ulp->refcnt)) return; for (;;) { struct sem_array *sma; struct sem_undo *un; int semid, i; DEFINE_WAKE_Q(wake_q); cond_resched(); rcu_read_lock(); un = list_entry_rcu(ulp->list_proc.next, struct sem_undo, list_proc); if (&un->list_proc == &ulp->list_proc) { /* * We must wait for freeary() before freeing this ulp, * in case we raced with last sem_undo. There is a small * possibility where we exit while freeary() didn't * finish unlocking sem_undo_list. */ spin_lock(&ulp->lock); spin_unlock(&ulp->lock); rcu_read_unlock(); break; } spin_lock(&ulp->lock); semid = un->semid; spin_unlock(&ulp->lock); /* exit_sem raced with IPC_RMID, nothing to do */ if (semid == -1) { rcu_read_unlock(); continue; } sma = sem_obtain_object_check(tsk->nsproxy->ipc_ns, semid); /* exit_sem raced with IPC_RMID, nothing to do */ if (IS_ERR(sma)) { rcu_read_unlock(); continue; } sem_lock(sma, NULL, -1); /* exit_sem raced with IPC_RMID, nothing to do */ if (!ipc_valid_object(&sma->sem_perm)) { sem_unlock(sma, -1); rcu_read_unlock(); continue; } un = __lookup_undo(ulp, semid); if (un == NULL) { /* exit_sem raced with IPC_RMID+semget() that created * exactly the same semid. Nothing to do. */ sem_unlock(sma, -1); rcu_read_unlock(); continue; } /* remove un from the linked lists */ ipc_assert_locked_object(&sma->sem_perm); list_del(&un->list_id); spin_lock(&ulp->lock); list_del_rcu(&un->list_proc); spin_unlock(&ulp->lock); /* perform adjustments registered in un */ for (i = 0; i < sma->sem_nsems; i++) { struct sem *semaphore = &sma->sems[i]; if (un->semadj[i]) { semaphore->semval += un->semadj[i]; /* * Range checks of the new semaphore value, * not defined by sus: * - Some unices ignore the undo entirely * (e.g. HP UX 11i 11.22, Tru64 V5.1) * - some cap the value (e.g. FreeBSD caps * at 0, but doesn't enforce SEMVMX) * * Linux caps the semaphore value, both at 0 * and at SEMVMX. * * Manfred <manfred@colorfullife.com> */ if (semaphore->semval < 0) semaphore->semval = 0; if (semaphore->semval > SEMVMX) semaphore->semval = SEMVMX; ipc_update_pid(&semaphore->sempid, task_tgid(current)); } } /* maybe some queued-up processes were waiting for this */ do_smart_update(sma, NULL, 0, 1, &wake_q); sem_unlock(sma, -1); rcu_read_unlock(); wake_up_q(&wake_q); kvfree_rcu(un, rcu); } kfree(ulp); } #ifdef CONFIG_PROC_FS static int sysvipc_sem_proc_show(struct seq_file *s, void *it) { struct user_namespace *user_ns = seq_user_ns(s); struct kern_ipc_perm *ipcp = it; struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm); time64_t sem_otime; /* * The proc interface isn't aware of sem_lock(), it calls * ipc_lock_object(), i.e. spin_lock(&sma->sem_perm.lock). * (in sysvipc_find_ipc) * In order to stay compatible with sem_lock(), we must * enter / leave complex_mode. */ complexmode_enter(sma); sem_otime = get_semotime(sma); seq_printf(s, "%10d %10d %4o %10u %5u %5u %5u %5u %10llu %10llu\n", sma->sem_perm.key, sma->sem_perm.id, sma->sem_perm.mode, sma->sem_nsems, from_kuid_munged(user_ns, sma->sem_perm.uid), from_kgid_munged(user_ns, sma->sem_perm.gid), from_kuid_munged(user_ns, sma->sem_perm.cuid), from_kgid_munged(user_ns, sma->sem_perm.cgid), sem_otime, sma->sem_ctime); complexmode_tryleave(sma); return 0; } #endif
85 365 409 13 1661 220 334 473 71 290 614 611 236 9 9 1547 2685 5036 4674 4671 4205 4218 4205 851 854 851 2939 267 270 6 6 6 25 25 25 2731 505 4312 2811 90 2047 62 1495 264 14 46 537 5 10 186 2 129 916 306 1211 168 1746 3013 8670 362 10022 6911 5676 275 68 78 4519 270 65 413 76 1629 1625 905 904 1274 986 1274 986 1292 8 31 10 2326 2144 2326 2144 637 93 1932 1148 1932 1148 466 1993 471 437 24 467 383 414 2822 4284 4045 331 4076 1 1 226 607 924 410 96 1891 707 25 1 40 1 26 1 26 3602 283 229 148 24 4020 763 1463 153 365 5 12 8 56 90 50 5 40 16 28 4 962 793 30 30 28 29 970 553 149 36 214 489 489 174 174 3500 117 36 1799 9 31 2647 139 36 36 36 32 6910 1975 822 821 522 2738 319 23 9 1622 1837 27 121 7710 342 7703 314 150 147 143 19 140 3 2 136 2 134 36 276 4 4 135 282 10 278 227 2 42 211 221 204 163 212 202 16 13 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_FS_H #define _LINUX_FS_H #include <linux/fs/super.h> #include <linux/vfsdebug.h> #include <linux/linkage.h> #include <linux/wait_bit.h> #include <linux/kdev_t.h> #include <linux/dcache.h> #include <linux/path.h> #include <linux/stat.h> #include <linux/cache.h> #include <linux/list.h> #include <linux/llist.h> #include <linux/radix-tree.h> #include <linux/xarray.h> #include <linux/rbtree.h> #include <linux/init.h> #include <linux/pid.h> #include <linux/bug.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/mm_types.h> #include <linux/capability.h> #include <linux/semaphore.h> #include <linux/fcntl.h> #include <linux/rculist_bl.h> #include <linux/atomic.h> #include <linux/shrinker.h> #include <linux/migrate_mode.h> #include <linux/uidgid.h> #include <linux/lockdep.h> #include <linux/percpu-rwsem.h> #include <linux/workqueue.h> #include <linux/delayed_call.h> #include <linux/uuid.h> #include <linux/errseq.h> #include <linux/ioprio.h> #include <linux/build_bug.h> #include <linux/stddef.h> #include <linux/mount.h> #include <linux/cred.h> #include <linux/mnt_idmapping.h> #include <linux/slab.h> #include <linux/maple_tree.h> #include <linux/rw_hint.h> #include <linux/file_ref.h> #include <linux/unicode.h> #include <asm/byteorder.h> #include <uapi/linux/fs.h> struct bdi_writeback; struct bio; struct io_comp_batch; struct fiemap_extent_info; struct kiocb; struct kobject; struct pipe_inode_info; struct poll_table_struct; struct kstatfs; struct vm_area_struct; struct vfsmount; struct cred; struct swap_info_struct; struct seq_file; struct iov_iter; struct fsnotify_mark_connector; struct fs_context; struct fs_parameter_spec; struct file_kattr; struct iomap_ops; struct delegated_inode; extern void __init inode_init(void); extern void __init inode_init_early(void); extern void __init files_init(void); extern void __init files_maxfiles_init(void); extern unsigned long get_max_files(void); extern unsigned int sysctl_nr_open; typedef __kernel_rwf_t rwf_t; struct buffer_head; typedef int (get_block_t)(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, ssize_t bytes, void *private); #define MAY_EXEC 0x00000001 #define MAY_WRITE 0x00000002 #define MAY_READ 0x00000004 #define MAY_APPEND 0x00000008 #define MAY_ACCESS 0x00000010 #define MAY_OPEN 0x00000020 #define MAY_CHDIR 0x00000040 /* called from RCU mode, don't block */ #define MAY_NOT_BLOCK 0x00000080 /* * flags in file.f_mode. Note that FMODE_READ and FMODE_WRITE must correspond * to O_WRONLY and O_RDWR via the strange trick in do_dentry_open() */ /* file is open for reading */ #define FMODE_READ ((__force fmode_t)(1 << 0)) /* file is open for writing */ #define FMODE_WRITE ((__force fmode_t)(1 << 1)) /* file is seekable */ #define FMODE_LSEEK ((__force fmode_t)(1 << 2)) /* file can be accessed using pread */ #define FMODE_PREAD ((__force fmode_t)(1 << 3)) /* file can be accessed using pwrite */ #define FMODE_PWRITE ((__force fmode_t)(1 << 4)) /* File is opened for execution with sys_execve / sys_uselib */ #define FMODE_EXEC ((__force fmode_t)(1 << 5)) /* File writes are restricted (block device specific) */ #define FMODE_WRITE_RESTRICTED ((__force fmode_t)(1 << 6)) /* File supports atomic writes */ #define FMODE_CAN_ATOMIC_WRITE ((__force fmode_t)(1 << 7)) /* FMODE_* bit 8 */ /* 32bit hashes as llseek() offset (for directories) */ #define FMODE_32BITHASH ((__force fmode_t)(1 << 9)) /* 64bit hashes as llseek() offset (for directories) */ #define FMODE_64BITHASH ((__force fmode_t)(1 << 10)) /* * Don't update ctime and mtime. * * Currently a special hack for the XFS open_by_handle ioctl, but we'll * hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon. */ #define FMODE_NOCMTIME ((__force fmode_t)(1 << 11)) /* Expect random access pattern */ #define FMODE_RANDOM ((__force fmode_t)(1 << 12)) /* Supports IOCB_HAS_METADATA */ #define FMODE_HAS_METADATA ((__force fmode_t)(1 << 13)) /* File is opened with O_PATH; almost nothing can be done with it */ #define FMODE_PATH ((__force fmode_t)(1 << 14)) /* File needs atomic accesses to f_pos */ #define FMODE_ATOMIC_POS ((__force fmode_t)(1 << 15)) /* Write access to underlying fs */ #define FMODE_WRITER ((__force fmode_t)(1 << 16)) /* Has read method(s) */ #define FMODE_CAN_READ ((__force fmode_t)(1 << 17)) /* Has write method(s) */ #define FMODE_CAN_WRITE ((__force fmode_t)(1 << 18)) #define FMODE_OPENED ((__force fmode_t)(1 << 19)) #define FMODE_CREATED ((__force fmode_t)(1 << 20)) /* File is stream-like */ #define FMODE_STREAM ((__force fmode_t)(1 << 21)) /* File supports DIRECT IO */ #define FMODE_CAN_ODIRECT ((__force fmode_t)(1 << 22)) #define FMODE_NOREUSE ((__force fmode_t)(1 << 23)) /* File is embedded in backing_file object */ #define FMODE_BACKING ((__force fmode_t)(1 << 24)) /* * Together with FMODE_NONOTIFY_PERM defines which fsnotify events shouldn't be * generated (see below) */ #define FMODE_NONOTIFY ((__force fmode_t)(1 << 25)) /* * Together with FMODE_NONOTIFY defines which fsnotify events shouldn't be * generated (see below) */ #define FMODE_NONOTIFY_PERM ((__force fmode_t)(1 << 26)) /* File is capable of returning -EAGAIN if I/O will block */ #define FMODE_NOWAIT ((__force fmode_t)(1 << 27)) /* File represents mount that needs unmounting */ #define FMODE_NEED_UNMOUNT ((__force fmode_t)(1 << 28)) /* File does not contribute to nr_files count */ #define FMODE_NOACCOUNT ((__force fmode_t)(1 << 29)) /* * The two FMODE_NONOTIFY* define which fsnotify events should not be generated * for an open file. These are the possible values of * (f->f_mode & FMODE_FSNOTIFY_MASK) and their meaning: * * FMODE_NONOTIFY - suppress all (incl. non-permission) events. * FMODE_NONOTIFY_PERM - suppress permission (incl. pre-content) events. * FMODE_NONOTIFY | FMODE_NONOTIFY_PERM - suppress only FAN_ACCESS_PERM. */ #define FMODE_FSNOTIFY_MASK \ (FMODE_NONOTIFY | FMODE_NONOTIFY_PERM) #define FMODE_FSNOTIFY_NONE(mode) \ ((mode & FMODE_FSNOTIFY_MASK) == FMODE_NONOTIFY) #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS #define FMODE_FSNOTIFY_HSM(mode) \ ((mode & FMODE_FSNOTIFY_MASK) == 0 || \ (mode & FMODE_FSNOTIFY_MASK) == (FMODE_NONOTIFY | FMODE_NONOTIFY_PERM)) #define FMODE_FSNOTIFY_ACCESS_PERM(mode) \ ((mode & FMODE_FSNOTIFY_MASK) == 0) #else #define FMODE_FSNOTIFY_ACCESS_PERM(mode) 0 #define FMODE_FSNOTIFY_HSM(mode) 0 #endif /* * Attribute flags. These should be or-ed together to figure out what * has been changed! */ #define ATTR_MODE (1 << 0) #define ATTR_UID (1 << 1) #define ATTR_GID (1 << 2) #define ATTR_SIZE (1 << 3) #define ATTR_ATIME (1 << 4) #define ATTR_MTIME (1 << 5) #define ATTR_CTIME (1 << 6) #define ATTR_ATIME_SET (1 << 7) #define ATTR_MTIME_SET (1 << 8) #define ATTR_FORCE (1 << 9) /* Not a change, but a change it */ #define ATTR_CTIME_SET (1 << 10) #define ATTR_KILL_SUID (1 << 11) #define ATTR_KILL_SGID (1 << 12) #define ATTR_FILE (1 << 13) #define ATTR_KILL_PRIV (1 << 14) #define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */ #define ATTR_TIMES_SET (1 << 16) #define ATTR_TOUCH (1 << 17) #define ATTR_DELEG (1 << 18) /* Delegated attrs. Don't break write delegations */ /* * Whiteout is represented by a char device. The following constants define the * mode and device number to use. */ #define WHITEOUT_MODE 0 #define WHITEOUT_DEV 0 /* * This is the Inode Attributes structure, used for notify_change(). It * uses the above definitions as flags, to know which values have changed. * Also, in this manner, a Filesystem can look at only the values it cares * about. Basically, these are the attributes that the VFS layer can * request to change from the FS layer. * * Derek Atkins <warlord@MIT.EDU> 94-10-20 */ struct iattr { unsigned int ia_valid; umode_t ia_mode; /* * The two anonymous unions wrap structures with the same member. * * Filesystems raising FS_ALLOW_IDMAP need to use ia_vfs{g,u}id which * are a dedicated type requiring the filesystem to use the dedicated * helpers. Other filesystem can continue to use ia_{g,u}id until they * have been ported. * * They always contain the same value. In other words FS_ALLOW_IDMAP * pass down the same value on idmapped mounts as they would on regular * mounts. */ union { kuid_t ia_uid; vfsuid_t ia_vfsuid; }; union { kgid_t ia_gid; vfsgid_t ia_vfsgid; }; loff_t ia_size; struct timespec64 ia_atime; struct timespec64 ia_mtime; struct timespec64 ia_ctime; /* * Not an attribute, but an auxiliary info for filesystems wanting to * implement an ftruncate() like method. NOTE: filesystem should * check for (ia_valid & ATTR_FILE), and not for (ia_file != NULL). */ struct file *ia_file; }; /* * Maximum number of layers of fs stack. Needs to be limited to * prevent kernel stack overflow */ #define FILESYSTEM_MAX_STACK_DEPTH 2 /** * enum positive_aop_returns - aop return codes with specific semantics * * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has * completed, that the page is still locked, and * should be considered active. The VM uses this hint * to return the page to the active list -- it won't * be a candidate for writeback again in the near * future. Other callers must be careful to unlock * the page if they get this return. Returned by * writepage(); * * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has * unlocked it and the page might have been truncated. * The caller should back up to acquiring a new page and * trying again. The aop will be taking reasonable * precautions not to livelock. If the caller held a page * reference, it should drop it before retrying. Returned * by read_folio(). * * address_space_operation functions return these large constants to indicate * special semantics to the caller. These are much larger than the bytes in a * page to allow for functions that return the number of bytes operated on in a * given page. */ enum positive_aop_returns { AOP_WRITEPAGE_ACTIVATE = 0x80000, AOP_TRUNCATED_PAGE = 0x80001, }; /* * oh the beauties of C type declarations. */ struct page; struct address_space; struct writeback_control; struct readahead_control; /* Match RWF_* bits to IOCB bits */ #define IOCB_HIPRI (__force int) RWF_HIPRI #define IOCB_DSYNC (__force int) RWF_DSYNC #define IOCB_SYNC (__force int) RWF_SYNC #define IOCB_NOWAIT (__force int) RWF_NOWAIT #define IOCB_APPEND (__force int) RWF_APPEND #define IOCB_ATOMIC (__force int) RWF_ATOMIC #define IOCB_DONTCACHE (__force int) RWF_DONTCACHE #define IOCB_NOSIGNAL (__force int) RWF_NOSIGNAL /* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) #define IOCB_DIRECT (1 << 17) #define IOCB_WRITE (1 << 18) /* iocb->ki_waitq is valid */ #define IOCB_WAITQ (1 << 19) #define IOCB_NOIO (1 << 20) /* can use bio alloc cache */ #define IOCB_ALLOC_CACHE (1 << 21) /* kiocb is a read or write operation submitted by fs/aio.c. */ #define IOCB_AIO_RW (1 << 22) #define IOCB_HAS_METADATA (1 << 23) /* for use in trace events */ #define TRACE_IOCB_STRINGS \ { IOCB_HIPRI, "HIPRI" }, \ { IOCB_DSYNC, "DSYNC" }, \ { IOCB_SYNC, "SYNC" }, \ { IOCB_NOWAIT, "NOWAIT" }, \ { IOCB_APPEND, "APPEND" }, \ { IOCB_ATOMIC, "ATOMIC" }, \ { IOCB_DONTCACHE, "DONTCACHE" }, \ { IOCB_EVENTFD, "EVENTFD"}, \ { IOCB_DIRECT, "DIRECT" }, \ { IOCB_WRITE, "WRITE" }, \ { IOCB_WAITQ, "WAITQ" }, \ { IOCB_NOIO, "NOIO" }, \ { IOCB_ALLOC_CACHE, "ALLOC_CACHE" }, \ { IOCB_AIO_RW, "AIO_RW" }, \ { IOCB_HAS_METADATA, "AIO_HAS_METADATA" } struct kiocb { struct file *ki_filp; loff_t ki_pos; void (*ki_complete)(struct kiocb *iocb, long ret); void *private; int ki_flags; u16 ki_ioprio; /* See linux/ioprio.h */ u8 ki_write_stream; /* * Only used for async buffered reads, where it denotes the page * waitqueue associated with completing the read. * Valid IFF IOCB_WAITQ is set. */ struct wait_page_queue *ki_waitq; }; static inline bool is_sync_kiocb(struct kiocb *kiocb) { return kiocb->ki_complete == NULL; } struct address_space_operations { int (*read_folio)(struct file *, struct folio *); /* Write back some dirty pages from this mapping. */ int (*writepages)(struct address_space *, struct writeback_control *); /* Mark a folio dirty. Return true if this dirtied it */ bool (*dirty_folio)(struct address_space *, struct folio *); void (*readahead)(struct readahead_control *); int (*write_begin)(const struct kiocb *, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata); int (*write_end)(const struct kiocb *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata); /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ sector_t (*bmap)(struct address_space *, sector_t); void (*invalidate_folio) (struct folio *, size_t offset, size_t len); bool (*release_folio)(struct folio *, gfp_t); void (*free_folio)(struct folio *folio); ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); /* * migrate the contents of a folio to the specified target. If * migrate_mode is MIGRATE_ASYNC, it must not block. */ int (*migrate_folio)(struct address_space *, struct folio *dst, struct folio *src, enum migrate_mode); int (*launder_folio)(struct folio *); bool (*is_partially_uptodate) (struct folio *, size_t from, size_t count); void (*is_dirty_writeback) (struct folio *, bool *dirty, bool *wb); int (*error_remove_folio)(struct address_space *, struct folio *); /* swapfile support */ int (*swap_activate)(struct swap_info_struct *sis, struct file *file, sector_t *span); void (*swap_deactivate)(struct file *file); int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter); }; extern const struct address_space_operations empty_aops; /* Structure for tracking metadata buffer heads associated with the mapping */ struct mapping_metadata_bhs { struct address_space *mapping; /* Mapping bhs are associated with */ spinlock_t lock; /* Lock protecting bh list */ struct list_head list; /* The list of bhs (b_assoc_buffers) */ }; /** * struct address_space - Contents of a cacheable, mappable object. * @host: Owner, either the inode or the block_device. * @i_pages: Cached pages. * @invalidate_lock: Guards coherency between page cache contents and * file offset->disk block mappings in the filesystem during invalidates. * It is also used to block modification of page cache contents through * memory mappings. * @gfp_mask: Memory allocation flags to use for allocating pages. * @i_mmap_writable: Number of VM_SHARED, VM_MAYWRITE mappings. * @nr_thps: Number of THPs in the pagecache (non-shmem only). * @i_mmap: Tree of private and shared mappings. * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable. * @nrpages: Number of page entries, protected by the i_pages lock. * @writeback_index: Writeback starts here. * @a_ops: Methods. * @flags: Error bits and flags (AS_*). * @wb_err: The most recent error which has occurred. * @i_private_lock: For use by the owner of the address_space. */ struct address_space { struct inode *host; struct xarray i_pages; struct rw_semaphore invalidate_lock; gfp_t gfp_mask; atomic_t i_mmap_writable; #ifdef CONFIG_READ_ONLY_THP_FOR_FS /* number of thp, only for non-shmem files */ atomic_t nr_thps; #endif struct rb_root_cached i_mmap; unsigned long nrpages; pgoff_t writeback_index; const struct address_space_operations *a_ops; unsigned long flags; errseq_t wb_err; spinlock_t i_private_lock; struct rw_semaphore i_mmap_rwsem; } __attribute__((aligned(sizeof(long)))) __randomize_layout; /* * On most architectures that alignment is already the case; but * must be enforced here for CRIS, to let the least significant bit * of struct folio's "mapping" pointer be used for FOLIO_MAPPING_ANON. */ /* XArray tags, for tagging dirty and writeback pages in the pagecache. */ #define PAGECACHE_TAG_DIRTY XA_MARK_0 #define PAGECACHE_TAG_WRITEBACK XA_MARK_1 #define PAGECACHE_TAG_TOWRITE XA_MARK_2 /* * Returns true if any of the pages in the mapping are marked with the tag. */ static inline bool mapping_tagged(const struct address_space *mapping, xa_mark_t tag) { return xa_marked(&mapping->i_pages, tag); } static inline void i_mmap_lock_write(struct address_space *mapping) { down_write(&mapping->i_mmap_rwsem); } static inline int i_mmap_trylock_write(struct address_space *mapping) { return down_write_trylock(&mapping->i_mmap_rwsem); } static inline void i_mmap_unlock_write(struct address_space *mapping) { up_write(&mapping->i_mmap_rwsem); } static inline int i_mmap_trylock_read(struct address_space *mapping) { return down_read_trylock(&mapping->i_mmap_rwsem); } static inline void i_mmap_lock_read(struct address_space *mapping) { down_read(&mapping->i_mmap_rwsem); } static inline void i_mmap_unlock_read(struct address_space *mapping) { up_read(&mapping->i_mmap_rwsem); } static inline void i_mmap_assert_locked(struct address_space *mapping) { lockdep_assert_held(&mapping->i_mmap_rwsem); } static inline void i_mmap_assert_write_locked(struct address_space *mapping) { lockdep_assert_held_write(&mapping->i_mmap_rwsem); } /* * Might pages of this file be mapped into userspace? */ static inline int mapping_mapped(const struct address_space *mapping) { return !RB_EMPTY_ROOT(&mapping->i_mmap.rb_root); } /* * Might pages of this file have been modified in userspace? * Note that i_mmap_writable counts all VM_SHARED, VM_MAYWRITE vmas: do_mmap * marks vma as VM_SHARED if it is shared, and the file was opened for * writing i.e. vma may be mprotected writable even if now readonly. * * If i_mmap_writable is negative, no new writable mappings are allowed. You * can only deny writable mappings, if none exists right now. */ static inline int mapping_writably_mapped(const struct address_space *mapping) { return atomic_read(&mapping->i_mmap_writable) > 0; } static inline int mapping_map_writable(struct address_space *mapping) { return atomic_inc_unless_negative(&mapping->i_mmap_writable) ? 0 : -EPERM; } static inline void mapping_unmap_writable(struct address_space *mapping) { atomic_dec(&mapping->i_mmap_writable); } static inline int mapping_deny_writable(struct address_space *mapping) { return atomic_dec_unless_positive(&mapping->i_mmap_writable) ? 0 : -EBUSY; } static inline void mapping_allow_writable(struct address_space *mapping) { atomic_inc(&mapping->i_mmap_writable); } /* * Use sequence counter to get consistent i_size on 32-bit processors. */ #if BITS_PER_LONG==32 && defined(CONFIG_SMP) #include <linux/seqlock.h> #define __NEED_I_SIZE_ORDERED #define i_size_ordered_init(inode) seqcount_init(&inode->i_size_seqcount) #else #define i_size_ordered_init(inode) do { } while (0) #endif struct posix_acl; #define ACL_NOT_CACHED ((void *)(-1)) /* * ACL_DONT_CACHE is for stacked filesystems, that rely on underlying fs to * cache the ACL. This also means that ->get_inode_acl() can be called in RCU * mode with the LOOKUP_RCU flag. */ #define ACL_DONT_CACHE ((void *)(-3)) static inline struct posix_acl * uncached_acl_sentinel(struct task_struct *task) { return (void *)task + 1; } static inline bool is_uncached_acl(struct posix_acl *acl) { return (long)acl & 1; } #define IOP_FASTPERM 0x0001 #define IOP_LOOKUP 0x0002 #define IOP_NOFOLLOW 0x0004 #define IOP_XATTR 0x0008 #define IOP_DEFAULT_READLINK 0x0010 #define IOP_MGTIME 0x0020 #define IOP_CACHED_LINK 0x0040 #define IOP_FASTPERM_MAY_EXEC 0x0080 #define IOP_FLCTX 0x0100 /* * Inode state bits. Protected by inode->i_lock * * Four bits determine the dirty state of the inode: I_DIRTY_SYNC, * I_DIRTY_DATASYNC, I_DIRTY_PAGES, and I_DIRTY_TIME. * * Four bits define the lifetime of an inode. Initially, inodes are I_NEW, * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at * various stages of removing an inode. * * Two bits are used for locking and completion notification, I_NEW and I_SYNC. * * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on * fdatasync() (unless I_DIRTY_DATASYNC is also set). * Timestamp updates are the usual cause. * I_DIRTY_DATASYNC Data-related inode changes pending. We keep track of * these changes separately from I_DIRTY_SYNC so that we * don't have to write inode on fdatasync() when only * e.g. the timestamps have changed. * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean. * I_DIRTY_TIME The inode itself has dirty timestamps, and the * lazytime mount option is enabled. We keep track of this * separately from I_DIRTY_SYNC in order to implement * lazytime. This gets cleared if I_DIRTY_INODE * (I_DIRTY_SYNC and/or I_DIRTY_DATASYNC) gets set. But * I_DIRTY_TIME can still be set if I_DIRTY_SYNC is already * in place because writeback might already be in progress * and we don't want to lose the time update * I_NEW Serves as both a mutex and completion notification. * New inodes set I_NEW. If two processes both create * the same inode, one of them will release its inode and * wait for I_NEW to be released before returning. * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can * also cause waiting on I_NEW, without I_NEW actually * being set. find_inode() uses this to prevent returning * nearly-dead inodes. * I_WILL_FREE Must be set when calling write_inode_now() if i_count * is zero. I_FREEING must be set when I_WILL_FREE is * cleared. * I_FREEING Set when inode is about to be freed but still has dirty * pages or buffers attached or the inode itself is still * dirty. * I_CLEAR Added by clear_inode(). In this state the inode is * clean and can be destroyed. Inode keeps I_FREEING. * * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are * prohibited for many purposes. iget() must wait for * the inode to be completely released, then create it * anew. Other functions will just ignore such inodes, * if appropriate. I_NEW is used for waiting. * * I_SYNC Writeback of inode is running. The bit is set during * data writeback, and cleared with a wakeup on the bit * address once it is done. The bit is also used to pin * the inode in memory for flusher thread. * * I_REFERENCED Marks the inode as recently references on the LRU list. * * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to * synchronize competing switching instances and to tell * wb stat updates to grab the i_pages lock. See * inode_switch_wbs_work_fn() for details. * * I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper * and work dirs among overlayfs mounts. * * I_CREATING New object's inode in the middle of setting up. * * I_DONTCACHE Evict inode as soon as it is not used anymore. * * I_SYNC_QUEUED Inode is queued in b_io or b_more_io writeback lists. * Used to detect that mark_inode_dirty() should not move * inode between dirty lists. * * I_PINNING_FSCACHE_WB Inode is pinning an fscache object for writeback. * * I_LRU_ISOLATING Inode is pinned being isolated from LRU without holding * i_count. * * Q: What is the difference between I_WILL_FREE and I_FREEING? * * __I_{SYNC,NEW,LRU_ISOLATING} are used to derive unique addresses to wait * upon. There's one free address left. */ enum inode_state_bits { __I_NEW = 0U, __I_SYNC = 1U, __I_LRU_ISOLATING = 2U /* reserved wait address bit 3 */ }; enum inode_state_flags_enum { I_NEW = (1U << __I_NEW), I_SYNC = (1U << __I_SYNC), I_LRU_ISOLATING = (1U << __I_LRU_ISOLATING), /* reserved flag bit 3 */ I_DIRTY_SYNC = (1U << 4), I_DIRTY_DATASYNC = (1U << 5), I_DIRTY_PAGES = (1U << 6), I_WILL_FREE = (1U << 7), I_FREEING = (1U << 8), I_CLEAR = (1U << 9), I_REFERENCED = (1U << 10), I_LINKABLE = (1U << 11), I_DIRTY_TIME = (1U << 12), I_WB_SWITCH = (1U << 13), I_OVL_INUSE = (1U << 14), I_CREATING = (1U << 15), I_DONTCACHE = (1U << 16), I_SYNC_QUEUED = (1U << 17), I_PINNING_NETFS_WB = (1U << 18) }; #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) #define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME) /* * Use inode_state_read() & friends to access. */ struct inode_state_flags { enum inode_state_flags_enum __state; }; /* * Keep mostly read-only and often accessed (especially for * the RCU path lookup and 'stat' data) fields at the beginning * of the 'struct inode' */ struct inode { umode_t i_mode; unsigned short i_opflags; unsigned int i_flags; #ifdef CONFIG_FS_POSIX_ACL struct posix_acl *i_acl; struct posix_acl *i_default_acl; #endif kuid_t i_uid; kgid_t i_gid; const struct inode_operations *i_op; struct super_block *i_sb; struct address_space *i_mapping; #ifdef CONFIG_SECURITY void *i_security; #endif /* Stat data, not accessed from path walking */ u64 i_ino; /* * Filesystems may only read i_nlink directly. They shall use the * following functions for modification: * * (set|clear|inc|drop)_nlink * inode_(inc|dec)_link_count */ union { const unsigned int i_nlink; unsigned int __i_nlink; }; dev_t i_rdev; loff_t i_size; time64_t i_atime_sec; time64_t i_mtime_sec; time64_t i_ctime_sec; u32 i_atime_nsec; u32 i_mtime_nsec; u32 i_ctime_nsec; u32 i_generation; spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ unsigned short i_bytes; u8 i_blkbits; enum rw_hint i_write_hint; blkcnt_t i_blocks; #ifdef __NEED_I_SIZE_ORDERED seqcount_t i_size_seqcount; #endif /* Misc */ struct inode_state_flags i_state; /* 32-bit hole */ struct rw_semaphore i_rwsem; unsigned long dirtied_when; /* jiffies of first dirtying */ unsigned long dirtied_time_when; struct hlist_node i_hash; struct list_head i_io_list; /* backing dev IO list */ #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *i_wb; /* the associated cgroup wb */ /* foreign inode detection, see wbc_detach_inode() */ int i_wb_frn_winner; u16 i_wb_frn_avg_time; u16 i_wb_frn_history; #endif struct list_head i_lru; /* inode LRU list */ struct list_head i_sb_list; struct list_head i_wb_list; /* backing dev writeback list */ union { struct hlist_head i_dentry; struct rcu_head i_rcu; }; atomic64_t i_version; atomic64_t i_sequence; /* see futex */ atomic_t i_count; atomic_t i_dio_count; atomic_t i_writecount; #if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING) atomic_t i_readcount; /* struct files open RO */ #endif union { const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ void (*free_inode)(struct inode *); }; struct file_lock_context *i_flctx; struct address_space i_data; union { struct list_head i_devices; int i_linklen; }; union { struct pipe_inode_info *i_pipe; struct cdev *i_cdev; char *i_link; unsigned i_dir_seq; }; #ifdef CONFIG_FSNOTIFY __u32 i_fsnotify_mask; /* all events this inode cares about */ /* 32-bit hole reserved for expanding i_fsnotify_mask */ struct fsnotify_mark_connector __rcu *i_fsnotify_marks; #endif void *i_private; /* fs or device private pointer */ } __randomize_layout; /* * i_state handling * * We hide all of it behind helpers so that we can validate consumers. */ static inline enum inode_state_flags_enum inode_state_read_once(struct inode *inode) { return READ_ONCE(inode->i_state.__state); } static inline enum inode_state_flags_enum inode_state_read(struct inode *inode) { lockdep_assert_held(&inode->i_lock); return inode->i_state.__state; } static inline void inode_state_set_raw(struct inode *inode, enum inode_state_flags_enum flags) { WRITE_ONCE(inode->i_state.__state, inode->i_state.__state | flags); } static inline void inode_state_set(struct inode *inode, enum inode_state_flags_enum flags) { lockdep_assert_held(&inode->i_lock); inode_state_set_raw(inode, flags); } static inline void inode_state_clear_raw(struct inode *inode, enum inode_state_flags_enum flags) { WRITE_ONCE(inode->i_state.__state, inode->i_state.__state & ~flags); } static inline void inode_state_clear(struct inode *inode, enum inode_state_flags_enum flags) { lockdep_assert_held(&inode->i_lock); inode_state_clear_raw(inode, flags); } static inline void inode_state_assign_raw(struct inode *inode, enum inode_state_flags_enum flags) { WRITE_ONCE(inode->i_state.__state, flags); } static inline void inode_state_assign(struct inode *inode, enum inode_state_flags_enum flags) { lockdep_assert_held(&inode->i_lock); inode_state_assign_raw(inode, flags); } static inline void inode_state_replace_raw(struct inode *inode, enum inode_state_flags_enum clearflags, enum inode_state_flags_enum setflags) { enum inode_state_flags_enum flags; flags = inode->i_state.__state; flags &= ~clearflags; flags |= setflags; inode_state_assign_raw(inode, flags); } static inline void inode_state_replace(struct inode *inode, enum inode_state_flags_enum clearflags, enum inode_state_flags_enum setflags) { lockdep_assert_held(&inode->i_lock); inode_state_replace_raw(inode, clearflags, setflags); } static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen) { VFS_WARN_ON_INODE(strlen(link) != linklen, inode); VFS_WARN_ON_INODE(inode->i_opflags & IOP_CACHED_LINK, inode); inode->i_link = link; inode->i_linklen = linklen; inode->i_opflags |= IOP_CACHED_LINK; } /* * Get bit address from inode->i_state to use with wait_var_event() * infrastructre. */ #define inode_state_wait_address(inode, bit) ((char *)&(inode)->i_state + (bit)) struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe, struct inode *inode, u32 bit); static inline void inode_wake_up_bit(struct inode *inode, u32 bit) { /* Caller is responsible for correct memory barriers. */ wake_up_var(inode_state_wait_address(inode, bit)); } struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode); static inline unsigned int i_blocksize(const struct inode *node) { return (1 << node->i_blkbits); } static inline int inode_unhashed(struct inode *inode) { return hlist_unhashed(&inode->i_hash); } /* * __mark_inode_dirty expects inodes to be hashed. Since we don't * want special inodes in the fileset inode space, we make them * appear hashed, but do not put on any lists. hlist_del() * will work fine and require no locking. */ static inline void inode_fake_hash(struct inode *inode) { hlist_add_fake(&inode->i_hash); } void wait_on_new_inode(struct inode *inode); /* * inode->i_rwsem nesting subclasses for the lock validator: * * 0: the object of the current VFS operation * 1: parent * 2: child/target * 3: xattr * 4: second non-directory * 5: second parent (when locking independent directories in rename) * * I_MUTEX_NONDIR2 is for certain operations (such as rename) which lock two * non-directories at once. * * The locking order between these classes is * parent[2] -> child -> grandchild -> normal -> xattr -> second non-directory */ enum inode_i_mutex_lock_class { I_MUTEX_NORMAL, I_MUTEX_PARENT, I_MUTEX_CHILD, I_MUTEX_XATTR, I_MUTEX_NONDIR2, I_MUTEX_PARENT2, }; static inline void inode_lock(struct inode *inode) { down_write(&inode->i_rwsem); } static inline __must_check int inode_lock_killable(struct inode *inode) { return down_write_killable(&inode->i_rwsem); } static inline void inode_unlock(struct inode *inode) { up_write(&inode->i_rwsem); } static inline void inode_lock_shared(struct inode *inode) { down_read(&inode->i_rwsem); } static inline __must_check int inode_lock_shared_killable(struct inode *inode) { return down_read_killable(&inode->i_rwsem); } static inline void inode_unlock_shared(struct inode *inode) { up_read(&inode->i_rwsem); } static inline int inode_trylock(struct inode *inode) { return down_write_trylock(&inode->i_rwsem); } static inline int inode_trylock_shared(struct inode *inode) { return down_read_trylock(&inode->i_rwsem); } static inline int inode_is_locked(struct inode *inode) { return rwsem_is_locked(&inode->i_rwsem); } static inline void inode_lock_nested(struct inode *inode, unsigned subclass) { down_write_nested(&inode->i_rwsem, subclass); } static inline void inode_lock_shared_nested(struct inode *inode, unsigned subclass) { down_read_nested(&inode->i_rwsem, subclass); } static inline void filemap_invalidate_lock(struct address_space *mapping) { down_write(&mapping->invalidate_lock); } static inline void filemap_invalidate_unlock(struct address_space *mapping) { up_write(&mapping->invalidate_lock); } static inline void filemap_invalidate_lock_shared(struct address_space *mapping) { down_read(&mapping->invalidate_lock); } static inline int filemap_invalidate_trylock_shared( struct address_space *mapping) { return down_read_trylock(&mapping->invalidate_lock); } static inline void filemap_invalidate_unlock_shared( struct address_space *mapping) { up_read(&mapping->invalidate_lock); } void lock_two_nondirectories(struct inode *, struct inode*); void unlock_two_nondirectories(struct inode *, struct inode*); void filemap_invalidate_lock_two(struct address_space *mapping1, struct address_space *mapping2); void filemap_invalidate_unlock_two(struct address_space *mapping1, struct address_space *mapping2); /* * NOTE: in a 32bit arch with a preemptable kernel and * an UP compile the i_size_read/write must be atomic * with respect to the local cpu (unlike with preempt disabled), * but they don't need to be atomic with respect to other cpus like in * true SMP (so they need either to either locally disable irq around * the read or for example on x86 they can be still implemented as a * cmpxchg8b without the need of the lock prefix). For SMP compiles * and 64bit archs it makes no difference if preempt is enabled or not. */ static inline loff_t i_size_read(const struct inode *inode) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) loff_t i_size; unsigned int seq; do { seq = read_seqcount_begin(&inode->i_size_seqcount); i_size = inode->i_size; } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); return i_size; #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) loff_t i_size; preempt_disable(); i_size = inode->i_size; preempt_enable(); return i_size; #else /* Pairs with smp_store_release() in i_size_write() */ return smp_load_acquire(&inode->i_size); #endif } /* * NOTE: unlike i_size_read(), i_size_write() does need locking around it * (normally i_rwsem), otherwise on 32bit/SMP an update of i_size_seqcount * can be lost, resulting in subsequent i_size_read() calls spinning forever. */ static inline void i_size_write(struct inode *inode, loff_t i_size) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) preempt_disable(); write_seqcount_begin(&inode->i_size_seqcount); inode->i_size = i_size; write_seqcount_end(&inode->i_size_seqcount); preempt_enable(); #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) preempt_disable(); inode->i_size = i_size; preempt_enable(); #else /* * Pairs with smp_load_acquire() in i_size_read() to ensure * changes related to inode size (such as page contents) are * visible before we see the changed inode size. */ smp_store_release(&inode->i_size, i_size); #endif } static inline unsigned iminor(const struct inode *inode) { return MINOR(inode->i_rdev); } static inline unsigned imajor(const struct inode *inode) { return MAJOR(inode->i_rdev); } struct fown_struct { struct file *file; /* backpointer for security modules */ rwlock_t lock; /* protects pid, uid, euid fields */ struct pid *pid; /* pid or -pgrp where SIGIO should be sent */ enum pid_type pid_type; /* Kind of process group SIGIO should be sent to */ kuid_t uid, euid; /* uid/euid of process setting the owner */ int signum; /* posix.1b rt signal to be delivered on IO */ }; /** * struct file_ra_state - Track a file's readahead state. * @start: Where the most recent readahead started. * @size: Number of pages read in the most recent readahead. * @async_size: Numer of pages that were/are not needed immediately * and so were/are genuinely "ahead". Start next readahead when * the first of these pages is accessed. * @ra_pages: Maximum size of a readahead request, copied from the bdi. * @order: Preferred folio order used for most recent readahead. * @mmap_miss: How many mmap accesses missed in the page cache. * @prev_pos: The last byte in the most recent read request. * * When this structure is passed to ->readahead(), the "most recent" * readahead means the current readahead. */ struct file_ra_state { pgoff_t start; unsigned int size; unsigned int async_size; unsigned int ra_pages; unsigned short order; unsigned short mmap_miss; loff_t prev_pos; }; /* * Check if @index falls in the readahead windows. */ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) { return (index >= ra->start && index < ra->start + ra->size); } /** * struct file - Represents a file * @f_lock: Protects f_ep, f_flags. Must not be taken from IRQ context. * @f_mode: FMODE_* flags often used in hotpaths * @f_op: file operations * @f_mapping: Contents of a cacheable, mappable object. * @private_data: filesystem or driver specific data * @f_inode: cached inode * @f_flags: file flags * @f_iocb_flags: iocb flags * @f_cred: stashed credentials of creator/opener * @f_owner: file owner * @f_path: path of the file * @__f_path: writable alias for @f_path; *ONLY* for core VFS and only before * the file gets open * @f_pos_lock: lock protecting file position * @f_pipe: specific to pipes * @f_pos: file position * @f_security: LSM security context of this file * @f_wb_err: writeback error * @f_sb_err: per sb writeback errors * @f_ep: link of all epoll hooks for this file * @f_task_work: task work entry point * @f_llist: work queue entrypoint * @f_ra: file's readahead state * @f_freeptr: Pointer used by SLAB_TYPESAFE_BY_RCU file cache (don't touch.) * @f_ref: reference count */ struct file { spinlock_t f_lock; fmode_t f_mode; const struct file_operations *f_op; struct address_space *f_mapping; void *private_data; struct inode *f_inode; unsigned int f_flags; unsigned int f_iocb_flags; const struct cred *f_cred; struct fown_struct *f_owner; /* --- cacheline 1 boundary (64 bytes) --- */ union { const struct path f_path; struct path __f_path; }; union { /* regular files (with FMODE_ATOMIC_POS) and directories */ struct mutex f_pos_lock; /* pipes */ u64 f_pipe; }; loff_t f_pos; #ifdef CONFIG_SECURITY void *f_security; #endif /* --- cacheline 2 boundary (128 bytes) --- */ errseq_t f_wb_err; errseq_t f_sb_err; #ifdef CONFIG_EPOLL struct hlist_head *f_ep; #endif union { struct callback_head f_task_work; struct llist_node f_llist; struct file_ra_state f_ra; freeptr_t f_freeptr; }; file_ref_t f_ref; /* --- cacheline 3 boundary (192 bytes) --- */ } __randomize_layout __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ struct file_handle { __u32 handle_bytes; int handle_type; /* file identifier */ unsigned char f_handle[] __counted_by(handle_bytes); }; static inline struct file *get_file(struct file *f) { file_ref_inc(&f->f_ref); return f; } struct file *get_file_rcu(struct file __rcu **f); struct file *get_file_active(struct file **f); #define file_count(f) file_ref_read(&(f)->f_ref) #define MAX_NON_LFS ((1UL<<31) - 1) /* Page cache limit. The filesystems should put that into their s_maxbytes limits, otherwise bad things can happen in VM. */ #if BITS_PER_LONG==32 #define MAX_LFS_FILESIZE ((loff_t)ULONG_MAX << PAGE_SHIFT) #elif BITS_PER_LONG==64 #define MAX_LFS_FILESIZE ((loff_t)LLONG_MAX) #endif /* legacy typedef, should eventually be removed */ typedef void *fl_owner_t; struct file_lock; struct file_lease; /* The following constant reflects the upper bound of the file/locking space */ #ifndef OFFSET_MAX #define OFFSET_MAX type_max(loff_t) #define OFFT_OFFSET_MAX type_max(off_t) #endif int file_f_owner_allocate(struct file *file); static inline struct fown_struct *file_f_owner(const struct file *file) { return READ_ONCE(file->f_owner); } extern void send_sigio(struct fown_struct *fown, int fd, int band); static inline struct inode *file_inode(const struct file *f) { return f->f_inode; } /* * file_dentry() is a relic from the days that overlayfs was using files with a * "fake" path, meaning, f_path on overlayfs and f_inode on underlying fs. * In those days, file_dentry() was needed to get the underlying fs dentry that * matches f_inode. * Files with "fake" path should not exist nowadays, so use an assertion to make * sure that file_dentry() was not papering over filesystem bugs. */ static inline struct dentry *file_dentry(const struct file *file) { struct dentry *dentry = file->f_path.dentry; WARN_ON_ONCE(d_inode(dentry) != file_inode(file)); return dentry; } struct fasync_struct { rwlock_t fa_lock; int magic; int fa_fd; struct fasync_struct *fa_next; /* singly linked list */ struct file *fa_file; struct rcu_head fa_rcu; }; #define FASYNC_MAGIC 0x4601 /* SMP safe fasync helpers: */ extern int fasync_helper(int, struct file *, int, struct fasync_struct **); extern struct fasync_struct *fasync_insert_entry(int, struct file *, struct fasync_struct **, struct fasync_struct *); extern int fasync_remove_entry(struct file *, struct fasync_struct **); extern struct fasync_struct *fasync_alloc(void); extern void fasync_free(struct fasync_struct *); /* can be called from interrupts */ extern void kill_fasync(struct fasync_struct **, int, int); extern void __f_setown(struct file *filp, struct pid *, enum pid_type, int force); extern int f_setown(struct file *filp, int who, int force); extern void f_delown(struct file *filp); extern pid_t f_getown(struct file *filp); extern int send_sigurg(struct file *file); /* * Umount options */ #define MNT_FORCE 0x00000001 /* Attempt to forcibily umount */ #define MNT_DETACH 0x00000002 /* Just detach from the tree */ #define MNT_EXPIRE 0x00000004 /* Mark for expiry */ #define UMOUNT_NOFOLLOW 0x00000008 /* Don't follow symlink on umount */ #define UMOUNT_UNUSED 0x80000000 /* Flag guaranteed to be unused */ static inline struct user_namespace *i_user_ns(const struct inode *inode) { return inode->i_sb->s_user_ns; } /* Helper functions so that in most cases filesystems will * not need to deal directly with kuid_t and kgid_t and can * instead deal with the raw numeric values that are stored * in the filesystem. */ static inline uid_t i_uid_read(const struct inode *inode) { return from_kuid(i_user_ns(inode), inode->i_uid); } static inline gid_t i_gid_read(const struct inode *inode) { return from_kgid(i_user_ns(inode), inode->i_gid); } static inline void i_uid_write(struct inode *inode, uid_t uid) { inode->i_uid = make_kuid(i_user_ns(inode), uid); } static inline void i_gid_write(struct inode *inode, gid_t gid) { inode->i_gid = make_kgid(i_user_ns(inode), gid); } /** * i_uid_into_vfsuid - map an inode's i_uid down according to an idmapping * @idmap: idmap of the mount the inode was found from * @inode: inode to map * * Return: whe inode's i_uid mapped down according to @idmap. * If the inode's i_uid has no mapping INVALID_VFSUID is returned. */ static inline vfsuid_t i_uid_into_vfsuid(struct mnt_idmap *idmap, const struct inode *inode) { return make_vfsuid(idmap, i_user_ns(inode), inode->i_uid); } /** * i_uid_needs_update - check whether inode's i_uid needs to be updated * @idmap: idmap of the mount the inode was found from * @attr: the new attributes of @inode * @inode: the inode to update * * Check whether the $inode's i_uid field needs to be updated taking idmapped * mounts into account if the filesystem supports it. * * Return: true if @inode's i_uid field needs to be updated, false if not. */ static inline bool i_uid_needs_update(struct mnt_idmap *idmap, const struct iattr *attr, const struct inode *inode) { return ((attr->ia_valid & ATTR_UID) && !vfsuid_eq(attr->ia_vfsuid, i_uid_into_vfsuid(idmap, inode))); } /** * i_uid_update - update @inode's i_uid field * @idmap: idmap of the mount the inode was found from * @attr: the new attributes of @inode * @inode: the inode to update * * Safely update @inode's i_uid field translating the vfsuid of any idmapped * mount into the filesystem kuid. */ static inline void i_uid_update(struct mnt_idmap *idmap, const struct iattr *attr, struct inode *inode) { if (attr->ia_valid & ATTR_UID) inode->i_uid = from_vfsuid(idmap, i_user_ns(inode), attr->ia_vfsuid); } /** * i_gid_into_vfsgid - map an inode's i_gid down according to an idmapping * @idmap: idmap of the mount the inode was found from * @inode: inode to map * * Return: the inode's i_gid mapped down according to @idmap. * If the inode's i_gid has no mapping INVALID_VFSGID is returned. */ static inline vfsgid_t i_gid_into_vfsgid(struct mnt_idmap *idmap, const struct inode *inode) { return make_vfsgid(idmap, i_user_ns(inode), inode->i_gid); } /** * i_gid_needs_update - check whether inode's i_gid needs to be updated * @idmap: idmap of the mount the inode was found from * @attr: the new attributes of @inode * @inode: the inode to update * * Check whether the $inode's i_gid field needs to be updated taking idmapped * mounts into account if the filesystem supports it. * * Return: true if @inode's i_gid field needs to be updated, false if not. */ static inline bool i_gid_needs_update(struct mnt_idmap *idmap, const struct iattr *attr, const struct inode *inode) { return ((attr->ia_valid & ATTR_GID) && !vfsgid_eq(attr->ia_vfsgid, i_gid_into_vfsgid(idmap, inode))); } /** * i_gid_update - update @inode's i_gid field * @idmap: idmap of the mount the inode was found from * @attr: the new attributes of @inode * @inode: the inode to update * * Safely update @inode's i_gid field translating the vfsgid of any idmapped * mount into the filesystem kgid. */ static inline void i_gid_update(struct mnt_idmap *idmap, const struct iattr *attr, struct inode *inode) { if (attr->ia_valid & ATTR_GID) inode->i_gid = from_vfsgid(idmap, i_user_ns(inode), attr->ia_vfsgid); } /** * inode_fsuid_set - initialize inode's i_uid field with callers fsuid * @inode: inode to initialize * @idmap: idmap of the mount the inode was found from * * Initialize the i_uid field of @inode. If the inode was found/created via * an idmapped mount map the caller's fsuid according to @idmap. */ static inline void inode_fsuid_set(struct inode *inode, struct mnt_idmap *idmap) { inode->i_uid = mapped_fsuid(idmap, i_user_ns(inode)); } /** * inode_fsgid_set - initialize inode's i_gid field with callers fsgid * @inode: inode to initialize * @idmap: idmap of the mount the inode was found from * * Initialize the i_gid field of @inode. If the inode was found/created via * an idmapped mount map the caller's fsgid according to @idmap. */ static inline void inode_fsgid_set(struct inode *inode, struct mnt_idmap *idmap) { inode->i_gid = mapped_fsgid(idmap, i_user_ns(inode)); } /** * fsuidgid_has_mapping() - check whether caller's fsuid/fsgid is mapped * @sb: the superblock we want a mapping in * @idmap: idmap of the relevant mount * * Check whether the caller's fsuid and fsgid have a valid mapping in the * s_user_ns of the superblock @sb. If the caller is on an idmapped mount map * the caller's fsuid and fsgid according to the @idmap first. * * Return: true if fsuid and fsgid is mapped, false if not. */ static inline bool fsuidgid_has_mapping(struct super_block *sb, struct mnt_idmap *idmap) { struct user_namespace *fs_userns = sb->s_user_ns; kuid_t kuid; kgid_t kgid; kuid = mapped_fsuid(idmap, fs_userns); if (!uid_valid(kuid)) return false; kgid = mapped_fsgid(idmap, fs_userns); if (!gid_valid(kgid)) return false; return kuid_has_mapping(fs_userns, kuid) && kgid_has_mapping(fs_userns, kgid); } struct timespec64 current_time(struct inode *inode); struct timespec64 inode_set_ctime_current(struct inode *inode); struct timespec64 inode_set_ctime_deleg(struct inode *inode, struct timespec64 update); static inline time64_t inode_get_atime_sec(const struct inode *inode) { return inode->i_atime_sec; } static inline long inode_get_atime_nsec(const struct inode *inode) { return inode->i_atime_nsec; } static inline struct timespec64 inode_get_atime(const struct inode *inode) { struct timespec64 ts = { .tv_sec = inode_get_atime_sec(inode), .tv_nsec = inode_get_atime_nsec(inode) }; return ts; } static inline struct timespec64 inode_set_atime_to_ts(struct inode *inode, struct timespec64 ts) { inode->i_atime_sec = ts.tv_sec; inode->i_atime_nsec = ts.tv_nsec; return ts; } static inline struct timespec64 inode_set_atime(struct inode *inode, time64_t sec, long nsec) { struct timespec64 ts = { .tv_sec = sec, .tv_nsec = nsec }; return inode_set_atime_to_ts(inode, ts); } static inline time64_t inode_get_mtime_sec(const struct inode *inode) { return inode->i_mtime_sec; } static inline long inode_get_mtime_nsec(const struct inode *inode) { return inode->i_mtime_nsec; } static inline struct timespec64 inode_get_mtime(const struct inode *inode) { struct timespec64 ts = { .tv_sec = inode_get_mtime_sec(inode), .tv_nsec = inode_get_mtime_nsec(inode) }; return ts; } static inline struct timespec64 inode_set_mtime_to_ts(struct inode *inode, struct timespec64 ts) { inode->i_mtime_sec = ts.tv_sec; inode->i_mtime_nsec = ts.tv_nsec; return ts; } static inline struct timespec64 inode_set_mtime(struct inode *inode, time64_t sec, long nsec) { struct timespec64 ts = { .tv_sec = sec, .tv_nsec = nsec }; return inode_set_mtime_to_ts(inode, ts); } /* * Multigrain timestamps * * Conditionally use fine-grained ctime and mtime timestamps when there * are users actively observing them via getattr. The primary use-case * for this is NFS clients that use the ctime to distinguish between * different states of the file, and that are often fooled by multiple * operations that occur in the same coarse-grained timer tick. */ #define I_CTIME_QUERIED ((u32)BIT(31)) static inline time64_t inode_get_ctime_sec(const struct inode *inode) { return inode->i_ctime_sec; } static inline long inode_get_ctime_nsec(const struct inode *inode) { return inode->i_ctime_nsec & ~I_CTIME_QUERIED; } static inline struct timespec64 inode_get_ctime(const struct inode *inode) { struct timespec64 ts = { .tv_sec = inode_get_ctime_sec(inode), .tv_nsec = inode_get_ctime_nsec(inode) }; return ts; } struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts); /** * inode_set_ctime - set the ctime in the inode * @inode: inode in which to set the ctime * @sec: tv_sec value to set * @nsec: tv_nsec value to set * * Set the ctime in @inode to { @sec, @nsec } */ static inline struct timespec64 inode_set_ctime(struct inode *inode, time64_t sec, long nsec) { struct timespec64 ts = { .tv_sec = sec, .tv_nsec = nsec }; return inode_set_ctime_to_ts(inode, ts); } struct timespec64 simple_inode_init_ts(struct inode *inode); static inline int inode_time_dirty_flag(struct inode *inode) { if (inode->i_sb->s_flags & SB_LAZYTIME) return I_DIRTY_TIME; return I_DIRTY_SYNC; } /* * Snapshotting support. */ /** * file_write_started - check if SB_FREEZE_WRITE is held * @file: the file we write to * * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. * May be false positive with !S_ISREG, because file_start_write() has * no effect on !S_ISREG. */ static inline bool file_write_started(const struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return true; return sb_write_started(file_inode(file)->i_sb); } /** * file_write_not_started - check if SB_FREEZE_WRITE is not held * @file: the file we write to * * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. * May be false positive with !S_ISREG, because file_start_write() has * no effect on !S_ISREG. */ static inline bool file_write_not_started(const struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return true; return sb_write_not_started(file_inode(file)->i_sb); } bool inode_owner_or_capable(struct mnt_idmap *idmap, const struct inode *inode); /* * VFS helper functions.. */ int vfs_create(struct mnt_idmap *, struct dentry *, umode_t, struct delegated_inode *); struct dentry *vfs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, struct delegated_inode *); int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, dev_t, struct delegated_inode *); int vfs_symlink(struct mnt_idmap *, struct inode *, struct dentry *, const char *, struct delegated_inode *); int vfs_link(struct dentry *, struct mnt_idmap *, struct inode *, struct dentry *, struct delegated_inode *); int vfs_rmdir(struct mnt_idmap *, struct inode *, struct dentry *, struct delegated_inode *); int vfs_unlink(struct mnt_idmap *, struct inode *, struct dentry *, struct delegated_inode *); /** * struct renamedata - contains all information required for renaming * @mnt_idmap: idmap of the mount in which the rename is happening. * @old_parent: parent of source * @old_dentry: source * @new_parent: parent of destination * @new_dentry: destination * @delegated_inode: returns an inode needing a delegation break * @flags: rename flags */ struct renamedata { struct mnt_idmap *mnt_idmap; struct dentry *old_parent; struct dentry *old_dentry; struct dentry *new_parent; struct dentry *new_dentry; struct delegated_inode *delegated_inode; unsigned int flags; } __randomize_layout; int vfs_rename(struct renamedata *); static inline int vfs_whiteout(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry) { return vfs_mknod(idmap, dir, dentry, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV, NULL); } struct file *kernel_tmpfile_open(struct mnt_idmap *idmap, const struct path *parentpath, umode_t mode, int open_flag, const struct cred *cred); struct file *kernel_file_open(const struct path *path, int flags, const struct cred *cred); int vfs_mkobj(struct dentry *, umode_t, int (*f)(struct dentry *, umode_t, void *), void *); int vfs_fchown(struct file *file, uid_t user, gid_t group); int vfs_fchmod(struct file *file, umode_t mode); int vfs_utimes(const struct path *path, struct timespec64 *times); #ifdef CONFIG_COMPAT extern long compat_ptr_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #else #define compat_ptr_ioctl NULL #endif /* * VFS file helper functions. */ void inode_init_owner(struct mnt_idmap *idmap, struct inode *inode, const struct inode *dir, umode_t mode); extern bool may_open_dev(const struct path *path); umode_t mode_strip_sgid(struct mnt_idmap *idmap, const struct inode *dir, umode_t mode); bool in_group_or_capable(struct mnt_idmap *idmap, const struct inode *inode, vfsgid_t vfsgid); /* * This is the "filldir" function type, used by readdir() to let * the kernel specify what kind of dirent layout it wants to have. * This allows the kernel to read directories into kernel space or * to have different dirent layouts depending on the binary type. * Return 'true' to keep going and 'false' if there are no more entries. */ struct dir_context; typedef bool (*filldir_t)(struct dir_context *, const char *, int, loff_t, u64, unsigned); struct dir_context { filldir_t actor; loff_t pos; /* * Filesystems MUST NOT MODIFY count, but may use as a hint: * 0 unknown * > 0 space in buffer (assume at least one entry) * INT_MAX unlimited */ int count; /* @actor supports these flags in d_type high bits */ unsigned int dt_flags_mask; }; /* If OR-ed with d_type, pending signals are not checked */ #define FILLDIR_FLAG_NOINTR 0x1000 /* * These flags let !MMU mmap() govern direct device mapping vs immediate * copying more easily for MAP_PRIVATE, especially for ROM filesystems. * * NOMMU_MAP_COPY: Copy can be mapped (MAP_PRIVATE) * NOMMU_MAP_DIRECT: Can be mapped directly (MAP_SHARED) * NOMMU_MAP_READ: Can be mapped for reading * NOMMU_MAP_WRITE: Can be mapped for writing * NOMMU_MAP_EXEC: Can be mapped for execution */ #define NOMMU_MAP_COPY 0x00000001 #define NOMMU_MAP_DIRECT 0x00000008 #define NOMMU_MAP_READ VM_MAYREAD #define NOMMU_MAP_WRITE VM_MAYWRITE #define NOMMU_MAP_EXEC VM_MAYEXEC #define NOMMU_VMFLAGS \ (NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC) /* * These flags control the behavior of the remap_file_range function pointer. * If it is called with len == 0 that means "remap to end of source file". * See Documentation/filesystems/vfs.rst for more details about this call. * * REMAP_FILE_DEDUP: only remap if contents identical (i.e. deduplicate) * REMAP_FILE_CAN_SHORTEN: caller can handle a shortened request */ #define REMAP_FILE_DEDUP (1 << 0) #define REMAP_FILE_CAN_SHORTEN (1 << 1) /* * These flags signal that the caller is ok with altering various aspects of * the behavior of the remap operation. The changes must be made by the * implementation; the vfs remap helper functions can take advantage of them. * Flags in this category exist to preserve the quirky behavior of the hoisted * btrfs clone/dedupe ioctls. */ #define REMAP_FILE_ADVISORY (REMAP_FILE_CAN_SHORTEN) /* * These flags control the behavior of vfs_copy_file_range(). * They are not available to the user via syscall. * * COPY_FILE_SPLICE: call splice direct instead of fs clone/copy ops */ #define COPY_FILE_SPLICE (1 << 0) struct io_uring_cmd; struct offset_ctx; typedef unsigned int __bitwise fop_flags_t; struct file_operations { struct module *owner; fop_flags_t fop_flags; loff_t (*llseek) (struct file *, loff_t, int); ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); int (*iopoll)(struct kiocb *kiocb, struct io_comp_batch *, unsigned int flags); int (*iterate_shared) (struct file *, struct dir_context *); __poll_t (*poll) (struct file *, struct poll_table_struct *); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); int (*open) (struct inode *, struct file *); int (*flush) (struct file *, fl_owner_t id); int (*release) (struct inode *, struct file *); int (*fsync) (struct file *, loff_t, loff_t, int datasync); int (*fasync) (int, struct file *, int); int (*lock) (struct file *, int, struct file_lock *); unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); int (*check_flags)(int); int (*flock) (struct file *, int, struct file_lock *); ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); void (*splice_eof)(struct file *file); int (*setlease)(struct file *, int, struct file_lease **, void **); long (*fallocate)(struct file *file, int mode, loff_t offset, loff_t len); void (*show_fdinfo)(struct seq_file *m, struct file *f); #ifndef CONFIG_MMU unsigned (*mmap_capabilities)(struct file *); #endif ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int); loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags); int (*fadvise)(struct file *, loff_t, loff_t, int); int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags); int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *, unsigned int poll_flags); int (*mmap_prepare)(struct vm_area_desc *); } __randomize_layout; /* Supports async buffered reads */ #define FOP_BUFFER_RASYNC ((__force fop_flags_t)(1 << 0)) /* Supports async buffered writes */ #define FOP_BUFFER_WASYNC ((__force fop_flags_t)(1 << 1)) /* Supports synchronous page faults for mappings */ #define FOP_MMAP_SYNC ((__force fop_flags_t)(1 << 2)) /* Supports non-exclusive O_DIRECT writes from multiple threads */ #define FOP_DIO_PARALLEL_WRITE ((__force fop_flags_t)(1 << 3)) /* Contains huge pages */ #define FOP_HUGE_PAGES ((__force fop_flags_t)(1 << 4)) /* Treat loff_t as unsigned (e.g., /dev/mem) */ #define FOP_UNSIGNED_OFFSET ((__force fop_flags_t)(1 << 5)) /* Supports asynchronous lock callbacks */ #define FOP_ASYNC_LOCK ((__force fop_flags_t)(1 << 6)) /* File system supports uncached read/write buffered IO */ #define FOP_DONTCACHE ((__force fop_flags_t)(1 << 7)) /* Wrap a directory iterator that needs exclusive inode access */ int wrap_directory_iterator(struct file *, struct dir_context *, int (*) (struct file *, struct dir_context *)); #define WRAP_DIR_ITER(x) \ static int shared_##x(struct file *file , struct dir_context *ctx) \ { return wrap_directory_iterator(file, ctx, x); } enum fs_update_time { FS_UPD_ATIME, FS_UPD_CMTIME, }; struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *); int (*permission) (struct mnt_idmap *, struct inode *, int); struct posix_acl * (*get_inode_acl)(struct inode *, int, bool); int (*readlink) (struct dentry *, char __user *,int); int (*create) (struct mnt_idmap *, struct inode *,struct dentry *, umode_t, bool); int (*link) (struct dentry *,struct inode *,struct dentry *); int (*unlink) (struct inode *,struct dentry *); int (*symlink) (struct mnt_idmap *, struct inode *,struct dentry *, const char *); struct dentry *(*mkdir) (struct mnt_idmap *, struct inode *, struct dentry *, umode_t); int (*rmdir) (struct inode *,struct dentry *); int (*mknod) (struct mnt_idmap *, struct inode *,struct dentry *, umode_t,dev_t); int (*rename) (struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); int (*setattr) (struct mnt_idmap *, struct dentry *, struct iattr *); int (*getattr) (struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); int (*update_time)(struct inode *inode, enum fs_update_time type, unsigned int flags); void (*sync_lazytime)(struct inode *inode); int (*atomic_open)(struct inode *, struct dentry *, struct file *, unsigned open_flag, umode_t create_mode); int (*tmpfile) (struct mnt_idmap *, struct inode *, struct file *, umode_t); struct posix_acl *(*get_acl)(struct mnt_idmap *, struct dentry *, int); int (*set_acl)(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); int (*fileattr_set)(struct mnt_idmap *idmap, struct dentry *dentry, struct file_kattr *fa); int (*fileattr_get)(struct dentry *dentry, struct file_kattr *fa); struct offset_ctx *(*get_offset_ctx)(struct inode *inode); } ____cacheline_aligned; /* Did the driver provide valid mmap hook configuration? */ static inline bool can_mmap_file(struct file *file) { bool has_mmap = file->f_op->mmap; bool has_mmap_prepare = file->f_op->mmap_prepare; /* Hooks are mutually exclusive. */ if (WARN_ON_ONCE(has_mmap && has_mmap_prepare)) return false; if (!has_mmap && !has_mmap_prepare) return false; return true; } void compat_set_desc_from_vma(struct vm_area_desc *desc, const struct file *file, const struct vm_area_struct *vma); int __compat_vma_mmap(struct vm_area_desc *desc, struct vm_area_struct *vma); int compat_vma_mmap(struct file *file, struct vm_area_struct *vma); int __vma_check_mmap_hook(struct vm_area_struct *vma); static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma) { int err; if (file->f_op->mmap_prepare) return compat_vma_mmap(file, vma); err = file->f_op->mmap(file, vma); if (err) return err; return __vma_check_mmap_hook(vma); } static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc) { return file->f_op->mmap_prepare(desc); } extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, loff_t, size_t, unsigned int); int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write); int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags, const struct iomap_ops *dax_read_ops); int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *count, unsigned int remap_flags); extern loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags); extern int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same); extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, struct file *dst_file, loff_t dst_pos, loff_t len, unsigned int remap_flags); /* * Inode flags - they have no relation to superblock flags now */ #define S_SYNC (1 << 0) /* Writes are synced at once */ #define S_NOATIME (1 << 1) /* Do not update access times */ #define S_APPEND (1 << 2) /* Append-only file */ #define S_IMMUTABLE (1 << 3) /* Immutable file */ #define S_DEAD (1 << 4) /* removed, but still open directory */ #define S_NOQUOTA (1 << 5) /* Inode is not counted to quota */ #define S_DIRSYNC (1 << 6) /* Directory modifications are synchronous */ #define S_NOCMTIME (1 << 7) /* Do not update file c/mtime */ #define S_SWAPFILE (1 << 8) /* Do not truncate: swapon got its bmaps */ #define S_PRIVATE (1 << 9) /* Inode is fs-internal */ #define S_IMA (1 << 10) /* Inode has an associated IMA struct */ #define S_AUTOMOUNT (1 << 11) /* Automount/referral quasi-directory */ #define S_NOSEC (1 << 12) /* no suid or xattr security attributes */ #ifdef CONFIG_FS_DAX #define S_DAX (1 << 13) /* Direct Access, avoiding the page cache */ #else #define S_DAX 0 /* Make all the DAX code disappear */ #endif #define S_ENCRYPTED (1 << 14) /* Encrypted file (using fs/crypto/) */ #define S_CASEFOLD (1 << 15) /* Casefolded file */ #define S_VERITY (1 << 16) /* Verity file (using fs/verity/) */ #define S_KERNEL_FILE (1 << 17) /* File is in use by the kernel (eg. fs/cachefiles) */ #define S_ANON_INODE (1 << 19) /* Inode is an anonymous inode */ /* * Note that nosuid etc flags are inode-specific: setting some file-system * flags just means all the inodes inherit those flags by default. It might be * possible to override it selectively if you really wanted to with some * ioctl() that is not currently implemented. * * Exception: SB_RDONLY is always applied to the entire file system. * * Unfortunately, it is possible to change a filesystems flags with it mounted * with files in use. This means that all of the inodes will not have their * i_flags updated. Hence, i_flags no longer inherit the superblock mount * flags, so these have to be checked separately. -- rmk@arm.uk.linux.org */ #define __IS_FLG(inode, flg) ((inode)->i_sb->s_flags & (flg)) #define IS_RDONLY(inode) sb_rdonly((inode)->i_sb) #define IS_SYNC(inode) (__IS_FLG(inode, SB_SYNCHRONOUS) || \ ((inode)->i_flags & S_SYNC)) #define IS_DIRSYNC(inode) (__IS_FLG(inode, SB_SYNCHRONOUS|SB_DIRSYNC) || \ ((inode)->i_flags & (S_SYNC|S_DIRSYNC))) #define IS_MANDLOCK(inode) __IS_FLG(inode, SB_MANDLOCK) #define IS_NOATIME(inode) __IS_FLG(inode, SB_RDONLY|SB_NOATIME) #define IS_I_VERSION(inode) __IS_FLG(inode, SB_I_VERSION) #define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA) #define IS_APPEND(inode) ((inode)->i_flags & S_APPEND) #define IS_IMMUTABLE(inode) ((inode)->i_flags & S_IMMUTABLE) #ifdef CONFIG_FS_POSIX_ACL #define IS_POSIXACL(inode) __IS_FLG(inode, SB_POSIXACL) #else #define IS_POSIXACL(inode) 0 #endif #define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD) #define IS_NOCMTIME(inode) ((inode)->i_flags & S_NOCMTIME) #ifdef CONFIG_SWAP #define IS_SWAPFILE(inode) ((inode)->i_flags & S_SWAPFILE) #else #define IS_SWAPFILE(inode) ((void)(inode), 0U) #endif #define IS_PRIVATE(inode) ((inode)->i_flags & S_PRIVATE) #define IS_IMA(inode) ((inode)->i_flags & S_IMA) #define IS_AUTOMOUNT(inode) ((inode)->i_flags & S_AUTOMOUNT) #define IS_NOSEC(inode) ((inode)->i_flags & S_NOSEC) #define IS_DAX(inode) ((inode)->i_flags & S_DAX) #define IS_ENCRYPTED(inode) ((inode)->i_flags & S_ENCRYPTED) #define IS_CASEFOLDED(inode) ((inode)->i_flags & S_CASEFOLD) #define IS_VERITY(inode) ((inode)->i_flags & S_VERITY) #define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \ (inode)->i_rdev == WHITEOUT_DEV) #define IS_ANON_FILE(inode) ((inode)->i_flags & S_ANON_INODE) static inline bool HAS_UNMAPPED_ID(struct mnt_idmap *idmap, struct inode *inode) { return !vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) || !vfsgid_valid(i_gid_into_vfsgid(idmap, inode)); } static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) { *kiocb = (struct kiocb) { .ki_filp = filp, .ki_flags = filp->f_iocb_flags, .ki_ioprio = get_current_ioprio(), }; } static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, struct file *filp) { *kiocb = (struct kiocb) { .ki_filp = filp, .ki_flags = kiocb_src->ki_flags, .ki_ioprio = kiocb_src->ki_ioprio, .ki_pos = kiocb_src->ki_pos, }; } extern void __mark_inode_dirty(struct inode *, int); static inline void mark_inode_dirty(struct inode *inode) { __mark_inode_dirty(inode, I_DIRTY); } static inline void mark_inode_dirty_sync(struct inode *inode) { __mark_inode_dirty(inode, I_DIRTY_SYNC); } static inline int icount_read(const struct inode *inode) { return atomic_read(&inode->i_count); } /* * Returns true if the given inode itself only has dirty timestamps (its pages * may still be dirty) and isn't currently being allocated or freed. * Filesystems should call this if when writing an inode when lazytime is * enabled, they want to opportunistically write the timestamps of other inodes * located very nearby on-disk, e.g. in the same inode block. This returns true * if the given inode is in need of such an opportunistic update. Requires * i_lock, or at least later re-checking under i_lock. */ static inline bool inode_is_dirtytime_only(struct inode *inode) { return (inode_state_read_once(inode) & (I_DIRTY_TIME | I_NEW | I_FREEING | I_WILL_FREE)) == I_DIRTY_TIME; } extern void inc_nlink(struct inode *inode); extern void drop_nlink(struct inode *inode); extern void clear_nlink(struct inode *inode); extern void set_nlink(struct inode *inode, unsigned int nlink); static inline void inode_inc_link_count(struct inode *inode) { inc_nlink(inode); mark_inode_dirty(inode); } static inline void inode_dec_link_count(struct inode *inode) { drop_nlink(inode); mark_inode_dirty(inode); } extern bool atime_needs_update(const struct path *, struct inode *); extern void touch_atime(const struct path *); static inline void file_accessed(struct file *file) { if (!(file->f_flags & O_NOATIME)) touch_atime(&file->f_path); } extern int file_modified(struct file *file); int kiocb_modified(struct kiocb *iocb); int sync_inode_metadata(struct inode *inode, int wait); struct file_system_type { const char *name; int fs_flags; #define FS_REQUIRES_DEV 1 #define FS_BINARY_MOUNTDATA 2 #define FS_HAS_SUBTYPE 4 #define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ #define FS_DISALLOW_NOTIFY_PERM 16 /* Disable fanotify permission events */ #define FS_ALLOW_IDMAP 32 /* FS has been updated to handle vfs idmappings. */ #define FS_MGTIME 64 /* FS uses multigrain timestamps */ #define FS_LBS 128 /* FS supports LBS */ #define FS_POWER_FREEZE 256 /* Always freeze on suspend/hibernate */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ int (*init_fs_context)(struct fs_context *); const struct fs_parameter_spec *parameters; void (*kill_sb) (struct super_block *); struct module *owner; struct file_system_type * next; struct hlist_head fs_supers; struct lock_class_key s_lock_key; struct lock_class_key s_umount_key; struct lock_class_key s_vfs_rename_key; struct lock_class_key s_writers_key[SB_FREEZE_LEVELS]; struct lock_class_key i_lock_key; struct lock_class_key i_mutex_key; struct lock_class_key invalidate_lock_key; struct lock_class_key i_mutex_dir_key; }; #define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME) /** * is_mgtime: is this inode using multigrain timestamps * @inode: inode to test for multigrain timestamps * * Return true if the inode uses multigrain timestamps, false otherwise. */ static inline bool is_mgtime(const struct inode *inode) { return inode->i_opflags & IOP_MGTIME; } extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path); void retire_super(struct super_block *sb); void generic_shutdown_super(struct super_block *sb); void kill_block_super(struct super_block *sb); void kill_anon_super(struct super_block *sb); void deactivate_super(struct super_block *sb); void deactivate_locked_super(struct super_block *sb); int set_anon_super(struct super_block *s, void *data); int set_anon_super_fc(struct super_block *s, struct fs_context *fc); int get_anon_bdev(dev_t *); void free_anon_bdev(dev_t); struct super_block *sget_fc(struct fs_context *fc, int (*test)(struct super_block *, struct fs_context *), int (*set)(struct super_block *, struct fs_context *)); struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), int flags, void *data); struct super_block *sget_dev(struct fs_context *fc, dev_t dev); /* Alas, no aliases. Too much hassle with bringing module.h everywhere */ #define fops_get(fops) ({ \ const struct file_operations *_fops = (fops); \ (((_fops) && try_module_get((_fops)->owner) ? (_fops) : NULL)); \ }) #define fops_put(fops) ({ \ const struct file_operations *_fops = (fops); \ if (_fops) \ module_put((_fops)->owner); \ }) /* * This one is to be used *ONLY* from ->open() instances. * fops must be non-NULL, pinned down *and* module dependencies * should be sufficient to pin the caller down as well. */ #define replace_fops(f, fops) \ do { \ struct file *__file = (f); \ fops_put(__file->f_op); \ BUG_ON(!(__file->f_op = (fops))); \ } while(0) extern int register_filesystem(struct file_system_type *); extern int unregister_filesystem(struct file_system_type *); extern int vfs_statfs(const struct path *, struct kstatfs *); extern int user_statfs(const char __user *, struct kstatfs *); extern int fd_statfs(int, struct kstatfs *); extern __printf(2, 3) int super_setup_bdi_name(struct super_block *sb, char *fmt, ...); extern int super_setup_bdi(struct super_block *sb); static inline void super_set_uuid(struct super_block *sb, const u8 *uuid, unsigned len) { if (WARN_ON(len > sizeof(sb->s_uuid))) len = sizeof(sb->s_uuid); sb->s_uuid_len = len; memcpy(&sb->s_uuid, uuid, len); } /* set sb sysfs name based on sb->s_bdev */ static inline void super_set_sysfs_name_bdev(struct super_block *sb) { snprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), "%pg", sb->s_bdev); } /* set sb sysfs name based on sb->s_uuid */ static inline void super_set_sysfs_name_uuid(struct super_block *sb) { WARN_ON(sb->s_uuid_len != sizeof(sb->s_uuid)); snprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), "%pU", sb->s_uuid.b); } /* set sb sysfs name based on sb->s_id */ static inline void super_set_sysfs_name_id(struct super_block *sb) { strscpy(sb->s_sysfs_name, sb->s_id, sizeof(sb->s_sysfs_name)); } /* try to use something standard before you use this */ __printf(2, 3) static inline void super_set_sysfs_name_generic(struct super_block *sb, const char *fmt, ...) { va_list args; va_start(args, fmt); vsnprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), fmt, args); va_end(args); } extern void ihold(struct inode * inode); extern void iput(struct inode *); void iput_not_last(struct inode *); int inode_update_time(struct inode *inode, enum fs_update_time type, unsigned int flags); int generic_update_time(struct inode *inode, enum fs_update_time type, unsigned int flags); /* /sys/fs */ extern struct kobject *fs_kobj; #define MAX_RW_COUNT (INT_MAX & PAGE_MASK) /* fs/open.c */ struct audit_names; struct __filename_head { const char *name; /* pointer to actual string */ int refcnt; struct audit_names *aname; }; #define EMBEDDED_NAME_MAX (192 - sizeof(struct __filename_head)) struct filename { struct __filename_head; const char iname[EMBEDDED_NAME_MAX]; }; static_assert(offsetof(struct filename, iname) % sizeof(long) == 0); static_assert(sizeof(struct filename) % 64 == 0); static inline struct mnt_idmap *file_mnt_idmap(const struct file *file) { return mnt_idmap(file->f_path.mnt); } /** * is_idmapped_mnt - check whether a mount is mapped * @mnt: the mount to check * * If @mnt has an non @nop_mnt_idmap attached to it then @mnt is mapped. * * Return: true if mount is mapped, false if not. */ static inline bool is_idmapped_mnt(const struct vfsmount *mnt) { return mnt_idmap(mnt) != &nop_mnt_idmap; } int vfs_truncate(const struct path *, loff_t); int do_truncate(struct mnt_idmap *, struct dentry *, loff_t start, unsigned int time_attrs, struct file *filp); extern int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len); int do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode); extern struct file *file_open_name(struct filename *, int, umode_t); extern struct file *filp_open(const char *, int, umode_t); extern struct file *file_open_root(const struct path *, const char *, int, umode_t); static inline struct file *file_open_root_mnt(struct vfsmount *mnt, const char *name, int flags, umode_t mode) { return file_open_root(&(struct path){.mnt = mnt, .dentry = mnt->mnt_root}, name, flags, mode); } struct file *dentry_open(const struct path *path, int flags, const struct cred *creds); struct file *dentry_open_nonotify(const struct path *path, int flags, const struct cred *cred); struct file *dentry_create(struct path *path, int flags, umode_t mode, const struct cred *cred); const struct path *backing_file_user_path(const struct file *f); #ifdef CONFIG_SECURITY void *backing_file_security(const struct file *f); void backing_file_set_security(struct file *f, void *security); #else static inline void *backing_file_security(const struct file *f) { return NULL; } static inline void backing_file_set_security(struct file *f, void *security) { } #endif /* CONFIG_SECURITY */ /* * When mmapping a file on a stackable filesystem (e.g., overlayfs), the file * stored in ->vm_file is a backing file whose f_inode is on the underlying * filesystem. When the mapped file path and inode number are displayed to * user (e.g. via /proc/<pid>/maps), these helpers should be used to get the * path and inode number to display to the user, which is the path of the fd * that user has requested to map and the inode number that would be returned * by fstat() on that same fd. */ /* Get the path to display in /proc/<pid>/maps */ static inline const struct path *file_user_path(const struct file *f) { if (unlikely(f->f_mode & FMODE_BACKING)) return backing_file_user_path(f); return &f->f_path; } /* Get the inode whose inode number to display in /proc/<pid>/maps */ static inline const struct inode *file_user_inode(const struct file *f) { if (unlikely(f->f_mode & FMODE_BACKING)) return d_inode(backing_file_user_path(f)->dentry); return file_inode(f); } static inline struct file *file_clone_open(struct file *file) { return dentry_open(&file->f_path, file->f_flags, file->f_cred); } extern int filp_close(struct file *, fl_owner_t id); extern struct filename *getname_flags(const char __user *, int); extern struct filename *getname_uflags(const char __user *, int); static inline struct filename *getname(const char __user *name) { return getname_flags(name, 0); } extern struct filename *getname_kernel(const char *); extern struct filename *__getname_maybe_null(const char __user *); static inline struct filename *getname_maybe_null(const char __user *name, int flags) { if (!(flags & AT_EMPTY_PATH)) return getname(name); if (!name) return NULL; return __getname_maybe_null(name); } extern void putname(struct filename *name); DEFINE_FREE(putname, struct filename *, if (!IS_ERR_OR_NULL(_T)) putname(_T)) struct delayed_filename { struct filename *__incomplete_filename; // don't touch }; #define INIT_DELAYED_FILENAME(ptr) \ ((void)(*(ptr) = (struct delayed_filename){})) int delayed_getname(struct delayed_filename *, const char __user *); int delayed_getname_uflags(struct delayed_filename *v, const char __user *, int); void dismiss_delayed_filename(struct delayed_filename *); int putname_to_delayed(struct delayed_filename *, struct filename *); struct filename *complete_getname(struct delayed_filename *); DEFINE_CLASS(filename, struct filename *, putname(_T), getname(p), const char __user *p) EXTEND_CLASS(filename, _kernel, getname_kernel(p), const char *p) EXTEND_CLASS(filename, _flags, getname_flags(p, f), const char __user *p, unsigned int f) EXTEND_CLASS(filename, _uflags, getname_uflags(p, f), const char __user *p, unsigned int f) EXTEND_CLASS(filename, _maybe_null, getname_maybe_null(p, f), const char __user *p, unsigned int f) EXTEND_CLASS(filename, _complete_delayed, complete_getname(p), struct delayed_filename *p) extern int finish_open(struct file *file, struct dentry *dentry, int (*open)(struct inode *, struct file *)); extern int finish_no_open(struct file *file, struct dentry *dentry); /* Helper for the simple case when original dentry is used */ static inline int finish_open_simple(struct file *file, int error) { if (error) return error; return finish_open(file, file->f_path.dentry, NULL); } /* fs/dcache.c */ extern void __init vfs_caches_init_early(void); extern void __init vfs_caches_init(void); #define __getname() kmalloc(PATH_MAX, GFP_KERNEL) #define __putname(name) kfree(name) void emergency_thaw_all(void); extern int sync_filesystem(struct super_block *); extern const struct file_operations def_blk_fops; extern const struct file_operations def_chr_fops; /* fs/char_dev.c */ #define CHRDEV_MAJOR_MAX 512 /* Marks the bottom of the first segment of free char majors */ #define CHRDEV_MAJOR_DYN_END 234 /* Marks the top and bottom of the second segment of free char majors */ #define CHRDEV_MAJOR_DYN_EXT_START 511 #define CHRDEV_MAJOR_DYN_EXT_END 384 extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *); extern int register_chrdev_region(dev_t, unsigned, const char *); extern int __register_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name, const struct file_operations *fops); extern void __unregister_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name); extern void unregister_chrdev_region(dev_t, unsigned); extern void chrdev_show(struct seq_file *,off_t); static inline int register_chrdev(unsigned int major, const char *name, const struct file_operations *fops) { return __register_chrdev(major, 0, 256, name, fops); } static inline void unregister_chrdev(unsigned int major, const char *name) { __unregister_chrdev(major, 0, 256, name); } extern void init_special_inode(struct inode *, umode_t, dev_t); /* Invalid inode operations -- fs/bad_inode.c */ extern void make_bad_inode(struct inode *); extern bool is_bad_inode(struct inode *); extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart, loff_t lend); extern int __must_check file_check_and_advance_wb_err(struct file *file); extern int __must_check file_write_and_wait_range(struct file *file, loff_t start, loff_t end); int filemap_flush_range(struct address_space *mapping, loff_t start, loff_t end); static inline int file_write_and_wait(struct file *file) { return file_write_and_wait_range(file, 0, LLONG_MAX); } extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync); extern int vfs_fsync(struct file *file, int datasync); extern int sync_file_range(struct file *file, loff_t offset, loff_t nbytes, unsigned int flags); static inline bool iocb_is_dsync(const struct kiocb *iocb) { return (iocb->ki_flags & IOCB_DSYNC) || IS_SYNC(iocb->ki_filp->f_mapping->host); } /* * Sync the bytes written if this was a synchronous write. Expect ki_pos * to already be updated for the write, and will return either the amount * of bytes passed in, or an error if syncing the file failed. */ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count) { if (iocb_is_dsync(iocb)) { int ret = vfs_fsync_range(iocb->ki_filp, iocb->ki_pos - count, iocb->ki_pos - 1, (iocb->ki_flags & IOCB_SYNC) ? 0 : 1); if (ret) return ret; } else if (iocb->ki_flags & IOCB_DONTCACHE) { struct address_space *mapping = iocb->ki_filp->f_mapping; filemap_flush_range(mapping, iocb->ki_pos - count, iocb->ki_pos - 1); } return count; } extern void emergency_sync(void); extern void emergency_remount(void); #ifdef CONFIG_BLOCK extern int bmap(struct inode *inode, sector_t *block); #else static inline int bmap(struct inode *inode, sector_t *block) { return -EINVAL; } #endif int notify_change(struct mnt_idmap *, struct dentry *, struct iattr *, struct delegated_inode *); int inode_permission(struct mnt_idmap *, struct inode *, int); int generic_permission(struct mnt_idmap *, struct inode *, int); static inline int file_permission(struct file *file, int mask) { return inode_permission(file_mnt_idmap(file), file_inode(file), mask); } static inline int path_permission(const struct path *path, int mask) { return inode_permission(mnt_idmap(path->mnt), d_inode(path->dentry), mask); } int __check_sticky(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode); int may_delete_dentry(struct mnt_idmap *idmap, struct inode *dir, struct dentry *victim, bool isdir); int may_create_dentry(struct mnt_idmap *idmap, struct inode *dir, struct dentry *child); static inline bool execute_ok(struct inode *inode) { return (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode); } static inline bool inode_wrong_type(const struct inode *inode, umode_t mode) { return (inode->i_mode ^ mode) & S_IFMT; } /** * file_start_write - get write access to a superblock for regular file io * @file: the file we want to write to * * This is a variant of sb_start_write() which is a noop on non-regular file. * Should be matched with a call to file_end_write(). */ static inline void file_start_write(struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return; sb_start_write(file_inode(file)->i_sb); } static inline bool file_start_write_trylock(struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return true; return sb_start_write_trylock(file_inode(file)->i_sb); } /** * file_end_write - drop write access to a superblock of a regular file * @file: the file we wrote to * * Should be matched with a call to file_start_write(). */ static inline void file_end_write(struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return; sb_end_write(file_inode(file)->i_sb); } /** * kiocb_start_write - get write access to a superblock for async file io * @iocb: the io context we want to submit the write with * * This is a variant of sb_start_write() for async io submission. * Should be matched with a call to kiocb_end_write(). */ static inline void kiocb_start_write(struct kiocb *iocb) { struct inode *inode = file_inode(iocb->ki_filp); sb_start_write(inode->i_sb); /* * Fool lockdep by telling it the lock got released so that it * doesn't complain about the held lock when we return to userspace. */ __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE); } /** * kiocb_end_write - drop write access to a superblock after async file io * @iocb: the io context we sumbitted the write with * * Should be matched with a call to kiocb_start_write(). */ static inline void kiocb_end_write(struct kiocb *iocb) { struct inode *inode = file_inode(iocb->ki_filp); /* * Tell lockdep we inherited freeze protection from submission thread. */ __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); sb_end_write(inode->i_sb); } /* * This is used for regular files where some users -- especially the * currently executed binary in a process, previously handled via * VM_DENYWRITE -- cannot handle concurrent write (and maybe mmap * read-write shared) accesses. * * get_write_access() gets write permission for a file. * put_write_access() releases this write permission. * deny_write_access() denies write access to a file. * allow_write_access() re-enables write access to a file. * * The i_writecount field of an inode can have the following values: * 0: no write access, no denied write access * < 0: (-i_writecount) users that denied write access to the file. * > 0: (i_writecount) users that have write access to the file. * * Normally we operate on that counter with atomic_{inc,dec} and it's safe * except for the cases where we don't hold i_writecount yet. Then we need to * use {get,deny}_write_access() - these functions check the sign and refuse * to do the change if sign is wrong. */ static inline int get_write_access(struct inode *inode) { return atomic_inc_unless_negative(&inode->i_writecount) ? 0 : -ETXTBSY; } static inline int deny_write_access(struct file *file) { struct inode *inode = file_inode(file); return atomic_dec_unless_positive(&inode->i_writecount) ? 0 : -ETXTBSY; } static inline void put_write_access(struct inode * inode) { atomic_dec(&inode->i_writecount); } static inline void allow_write_access(struct file *file) { if (file) atomic_inc(&file_inode(file)->i_writecount); } /* * Do not prevent write to executable file when watched by pre-content events. * * Note that FMODE_FSNOTIFY_HSM mode is set depending on pre-content watches at * the time of file open and remains constant for entire lifetime of the file, * so if pre-content watches are added post execution or removed before the end * of the execution, it will not cause i_writecount reference leak. */ static inline int exe_file_deny_write_access(struct file *exe_file) { if (unlikely(FMODE_FSNOTIFY_HSM(exe_file->f_mode))) return 0; return deny_write_access(exe_file); } static inline void exe_file_allow_write_access(struct file *exe_file) { if (unlikely(!exe_file || FMODE_FSNOTIFY_HSM(exe_file->f_mode))) return; allow_write_access(exe_file); } static inline void file_set_fsnotify_mode(struct file *file, fmode_t mode) { file->f_mode &= ~FMODE_FSNOTIFY_MASK; file->f_mode |= mode; } static inline bool inode_is_open_for_write(const struct inode *inode) { return atomic_read(&inode->i_writecount) > 0; } #if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING) static inline void i_readcount_dec(struct inode *inode) { BUG_ON(atomic_dec_return(&inode->i_readcount) < 0); } static inline void i_readcount_inc(struct inode *inode) { atomic_inc(&inode->i_readcount); } #else static inline void i_readcount_dec(struct inode *inode) { return; } static inline void i_readcount_inc(struct inode *inode) { return; } #endif extern int do_pipe_flags(int *, int); extern ssize_t kernel_read(struct file *, void *, size_t, loff_t *); ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos); extern ssize_t kernel_write(struct file *, const void *, size_t, loff_t *); extern ssize_t __kernel_write(struct file *, const void *, size_t, loff_t *); extern struct file * open_exec(const char *); /* fs/dcache.c -- generic fs support functions */ extern bool is_subdir(struct dentry *, struct dentry *); extern bool path_is_under(const struct path *, const struct path *); u64 vfsmount_to_propagation_flags(struct vfsmount *mnt); extern char *file_path(struct file *, char *, int); static inline bool name_is_dot(const char *name, size_t len) { return unlikely(len == 1 && name[0] == '.'); } static inline bool name_is_dotdot(const char *name, size_t len) { return unlikely(len == 2 && name[0] == '.' && name[1] == '.'); } /** * name_is_dot_dotdot - returns true only if @name is "." or ".." * @name: file name to check * @len: length of file name, in bytes */ static inline bool name_is_dot_dotdot(const char *name, size_t len) { return len && unlikely(name[0] == '.') && (len == 1 || (len == 2 && name[1] == '.')); } /** * name_contains_dotdot - check if a file name contains ".." path components * @name: File path string to check * Search for ".." surrounded by either '/' or start/end of string. */ static inline bool name_contains_dotdot(const char *name) { size_t name_len; name_len = strlen(name); return strcmp(name, "..") == 0 || strncmp(name, "../", 3) == 0 || strstr(name, "/../") != NULL || (name_len >= 3 && strcmp(name + name_len - 3, "/..") == 0); } #include <linux/err.h> /* needed for stackable file system support */ loff_t default_llseek(struct file *file, loff_t offset, int whence); loff_t vfs_llseek(struct file *file, loff_t offset, int whence); int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp); static inline int inode_init_always(struct super_block *sb, struct inode *inode) { return inode_init_always_gfp(sb, inode, GFP_NOFS); } void inode_init_once(struct inode *inode); void address_space_init_once(struct address_space *mapping); struct inode *igrab(struct inode *inode); ino_t iunique(struct super_block *sb, ino_t max_reserved); int inode_needs_sync(struct inode *inode); int inode_just_drop(struct inode *inode); static inline int inode_generic_drop(struct inode *inode) { return !inode->i_nlink || inode_unhashed(inode); } void d_mark_dontcache(struct inode *inode); struct inode *ilookup5_nowait(struct super_block *sb, u64 hashval, int (*test)(struct inode *, void *), void *data, bool *isnew); struct inode *ilookup5(struct super_block *sb, u64 hashval, int (*test)(struct inode *, void *), void *data); struct inode *ilookup(struct super_block *sb, u64 ino); struct inode *inode_insert5(struct inode *inode, u64 hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data); struct inode *iget5_locked(struct super_block *sb, u64 hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data); struct inode *iget5_locked_rcu(struct super_block *sb, u64 hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data); struct inode *iget_locked(struct super_block *sb, u64 ino); struct inode *find_inode_nowait(struct super_block *sb, u64 hashval, int (*match)(struct inode *, u64, void *), void *data); struct inode *find_inode_rcu(struct super_block *sb, u64 hashval, int (*test)(struct inode *, void *), void *data); struct inode *find_inode_by_ino_rcu(struct super_block *sb, u64 ino); int insert_inode_locked4(struct inode *inode, u64 hashval, int (*test)(struct inode *, void *), void *data); int insert_inode_locked(struct inode *inode); #ifdef CONFIG_DEBUG_LOCK_ALLOC void lockdep_annotate_inode_mutex_key(struct inode *inode); #else static inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { }; #endif void unlock_new_inode(struct inode *inode); void discard_new_inode(struct inode *inode); unsigned int get_next_ino(void); void evict_inodes(struct super_block *sb); void dump_mapping(const struct address_space *); /* * Userspace may rely on the inode number being non-zero. For example, glibc * simply ignores files with zero i_ino in unlink() and other places. * * As an additional complication, if userspace was compiled with * _FILE_OFFSET_BITS=32 on a 64-bit kernel we'll only end up reading out the * lower 32 bits, so we need to check that those aren't zero explicitly. With * _FILE_OFFSET_BITS=64, this may cause some harmless false-negatives, but * better safe than sorry. */ static inline bool is_zero_ino(ino_t ino) { return (u32)ino == 0; } static inline void __iget(struct inode *inode) { lockdep_assert_held(&inode->i_lock); atomic_inc(&inode->i_count); } extern void iget_failed(struct inode *); extern void clear_inode(struct inode *); extern void __destroy_inode(struct inode *); struct inode *alloc_inode(struct super_block *sb); static inline struct inode *new_inode_pseudo(struct super_block *sb) { return alloc_inode(sb); } extern struct inode *new_inode(struct super_block *sb); extern void free_inode_nonrcu(struct inode *inode); extern int setattr_should_drop_suidgid(struct mnt_idmap *, struct inode *); extern int file_remove_privs(struct file *); int setattr_should_drop_sgid(struct mnt_idmap *idmap, const struct inode *inode); /* * This must be used for allocating filesystems specific inodes to set * up the inode reclaim context correctly. */ #define alloc_inode_sb(_sb, _cache, _gfp) kmem_cache_alloc_lru(_cache, &_sb->s_inode_lru, _gfp) void __insert_inode_hash(struct inode *inode, u64 hashval); static inline void insert_inode_hash(struct inode *inode) { __insert_inode_hash(inode, inode->i_ino); } void __remove_inode_hash(struct inode *inode); static inline void remove_inode_hash(struct inode *inode) { if (!inode_unhashed(inode) && !hlist_fake(&inode->i_hash)) __remove_inode_hash(inode); } void inode_sb_list_add(struct inode *inode); void inode_lru_list_add(struct inode *inode); int generic_file_mmap(struct file *, struct vm_area_struct *); int generic_file_mmap_prepare(struct vm_area_desc *desc); int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc); extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *); int generic_write_checks_count(struct kiocb *iocb, loff_t *count); extern int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count); extern int generic_file_rw_checks(struct file *file_in, struct file *file_out); ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *to, ssize_t already_read); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *); ssize_t generic_perform_write(struct kiocb *, struct iov_iter *); ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter, ssize_t direct_written, ssize_t buffered_written); ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags); ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags); ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, struct iov_iter *iter); ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, struct iov_iter *iter); /* fs/splice.c */ ssize_t filemap_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); ssize_t copy_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); extern ssize_t iter_file_splice_write(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); extern void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); extern loff_t noop_llseek(struct file *file, loff_t offset, int whence); extern loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize); extern loff_t generic_file_llseek(struct file *file, loff_t offset, int whence); extern loff_t generic_file_llseek_size(struct file *file, loff_t offset, int whence, loff_t maxsize, loff_t eof); loff_t generic_llseek_cookie(struct file *file, loff_t offset, int whence, u64 *cookie); extern loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size); extern loff_t no_seek_end_llseek_size(struct file *, loff_t, int, loff_t); extern loff_t no_seek_end_llseek(struct file *, loff_t, int); int rw_verify_area(int, struct file *, const loff_t *, size_t); extern int generic_file_open(struct inode * inode, struct file * filp); extern int nonseekable_open(struct inode * inode, struct file * filp); extern int stream_open(struct inode * inode, struct file * filp); #ifdef CONFIG_BLOCK typedef void (dio_submit_t)(struct bio *bio, struct inode *inode, loff_t file_offset); enum { /* need locking between buffered and direct access */ DIO_LOCKING = 0x01, /* filesystem does not support filling holes */ DIO_SKIP_HOLES = 0x02, }; ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct block_device *bdev, struct iov_iter *iter, get_block_t get_block, dio_iodone_t end_io, int flags); static inline ssize_t blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct iov_iter *iter, get_block_t get_block) { return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, get_block, NULL, DIO_LOCKING | DIO_SKIP_HOLES); } #endif bool inode_dio_finished(const struct inode *inode); void inode_dio_wait(struct inode *inode); void inode_dio_wait_interruptible(struct inode *inode); /** * inode_dio_begin - signal start of a direct I/O requests * @inode: inode the direct I/O happens on * * This is called once we've finished processing a direct I/O request, * and is used to wake up callers waiting for direct I/O to be quiesced. */ static inline void inode_dio_begin(struct inode *inode) { atomic_inc(&inode->i_dio_count); } /** * inode_dio_end - signal finish of a direct I/O requests * @inode: inode the direct I/O happens on * * This is called once we've finished processing a direct I/O request, * and is used to wake up callers waiting for direct I/O to be quiesced. */ static inline void inode_dio_end(struct inode *inode) { if (atomic_dec_and_test(&inode->i_dio_count)) wake_up_var(&inode->i_dio_count); } extern void inode_set_flags(struct inode *inode, unsigned int flags, unsigned int mask); extern const struct file_operations generic_ro_fops; #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) extern int readlink_copy(char __user *, int, const char *, int); extern int page_readlink(struct dentry *, char __user *, int); extern const char *page_get_link_raw(struct dentry *, struct inode *, struct delayed_call *); extern const char *page_get_link(struct dentry *, struct inode *, struct delayed_call *); extern void page_put_link(void *); extern int page_symlink(struct inode *inode, const char *symname, int len); extern const struct inode_operations page_symlink_inode_operations; extern void kfree_link(void *); void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode); void generic_fillattr(struct mnt_idmap *, u32, struct inode *, struct kstat *); void generic_fill_statx_attr(struct inode *inode, struct kstat *stat); void generic_fill_statx_atomic_writes(struct kstat *stat, unsigned int unit_min, unsigned int unit_max, unsigned int unit_max_opt); extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int); extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int); void __inode_add_bytes(struct inode *inode, loff_t bytes); void inode_add_bytes(struct inode *inode, loff_t bytes); void __inode_sub_bytes(struct inode *inode, loff_t bytes); void inode_sub_bytes(struct inode *inode, loff_t bytes); static inline loff_t __inode_get_bytes(struct inode *inode) { return (((loff_t)inode->i_blocks) << 9) + inode->i_bytes; } loff_t inode_get_bytes(struct inode *inode); void inode_set_bytes(struct inode *inode, loff_t bytes); const char *simple_get_link(struct dentry *, struct inode *, struct delayed_call *); extern const struct inode_operations simple_symlink_inode_operations; extern int iterate_dir(struct file *, struct dir_context *); int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, int flags); int vfs_fstat(int fd, struct kstat *stat); static inline int vfs_stat(const char __user *filename, struct kstat *stat) { return vfs_fstatat(AT_FDCWD, filename, stat, 0); } static inline int vfs_lstat(const char __user *name, struct kstat *stat) { return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW); } extern const char *vfs_get_link(struct dentry *, struct delayed_call *); extern int vfs_readlink(struct dentry *, char __user *, int); extern struct file_system_type *get_filesystem(struct file_system_type *fs); extern void put_filesystem(struct file_system_type *fs); extern struct file_system_type *get_fs_type(const char *name); extern void drop_super(struct super_block *sb); extern void drop_super_exclusive(struct super_block *sb); extern void iterate_supers(void (*f)(struct super_block *, void *), void *arg); extern void iterate_supers_type(struct file_system_type *, void (*)(struct super_block *, void *), void *); void filesystems_freeze(bool freeze_all); void filesystems_thaw(void); void end_dirop(struct dentry *de); extern int dcache_dir_open(struct inode *, struct file *); extern int dcache_dir_close(struct inode *, struct file *); extern loff_t dcache_dir_lseek(struct file *, loff_t, int); extern int dcache_readdir(struct file *, struct dir_context *); extern int simple_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern int simple_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern int simple_statfs(struct dentry *, struct kstatfs *); extern int simple_open(struct inode *inode, struct file *file); extern int simple_link(struct dentry *, struct inode *, struct dentry *); extern int simple_unlink(struct inode *, struct dentry *); extern int simple_rmdir(struct inode *, struct dentry *); extern void __simple_unlink(struct inode *, struct dentry *); extern void __simple_rmdir(struct inode *, struct dentry *); void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); extern int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); extern int simple_rename(struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); extern void simple_recursive_removal(struct dentry *, void (*callback)(struct dentry *)); extern void simple_remove_by_name(struct dentry *, const char *, void (*callback)(struct dentry *)); extern void locked_recursive_removal(struct dentry *, void (*callback)(struct dentry *)); extern int noop_fsync(struct file *, loff_t, loff_t, int); extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter); extern int simple_empty(struct dentry *); extern int simple_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata); extern const struct address_space_operations ram_aops; extern int always_delete_dentry(const struct dentry *); extern struct inode *alloc_anon_inode(struct super_block *); struct inode *anon_inode_make_secure_inode(struct super_block *sb, const char *name, const struct inode *context_inode); extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags); extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *); extern const struct file_operations simple_dir_operations; extern const struct inode_operations simple_dir_inode_operations; extern void make_empty_dir_inode(struct inode *inode); extern bool is_empty_dir_inode(struct inode *inode); struct tree_descr { const char *name; const struct file_operations *ops; int mode; }; struct dentry *d_alloc_name(struct dentry *, const char *); extern int simple_fill_super(struct super_block *, unsigned long, const struct tree_descr *); extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count); extern void simple_release_fs(struct vfsmount **mount, int *count); struct dentry *simple_start_creating(struct dentry *, const char *); void simple_done_creating(struct dentry *); extern ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, const void *from, size_t available); extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, const void __user *from, size_t count); struct offset_ctx { struct maple_tree mt; unsigned long next_offset; }; void simple_offset_init(struct offset_ctx *octx); int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry); void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry); void simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); int simple_offset_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); void simple_offset_destroy(struct offset_ctx *octx); extern const struct file_operations simple_offset_dir_operations; extern int simple_fsync_noflush(struct file *, loff_t, loff_t, int); extern int simple_fsync(struct file *, loff_t, loff_t, int); extern int generic_check_addressable(unsigned, u64); extern void generic_set_sb_d_ops(struct super_block *sb); extern int generic_ci_match(const struct inode *parent, const struct qstr *name, const struct qstr *folded_name, const u8 *de_name, u32 de_name_len); #if IS_ENABLED(CONFIG_UNICODE) int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str); int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); /** * generic_ci_validate_strict_name - Check if a given name is suitable * for a directory * * This functions checks if the proposed filename is valid for the * parent directory. That means that only valid UTF-8 filenames will be * accepted for casefold directories from filesystems created with the * strict encoding flag. That also means that any name will be * accepted for directories that doesn't have casefold enabled, or * aren't being strict with the encoding. * * @dir: inode of the directory where the new file will be created * @name: name of the new file * * Return: * * True: if the filename is suitable for this directory. It can be * true if a given name is not suitable for a strict encoding * directory, but the directory being used isn't strict * * False if the filename isn't suitable for this directory. This only * happens when a directory is casefolded and the filesystem is strict * about its encoding. */ static inline bool generic_ci_validate_strict_name(struct inode *dir, const struct qstr *name) { if (!IS_CASEFOLDED(dir) || !sb_has_strict_encoding(dir->i_sb)) return true; /* * A casefold dir must have a encoding set, unless the filesystem * is corrupted */ if (WARN_ON_ONCE(!dir->i_sb->s_encoding)) return true; return !utf8_validate(dir->i_sb->s_encoding, name); } #else static inline bool generic_ci_validate_strict_name(struct inode *dir, const struct qstr *name) { return true; } #endif int may_setattr(struct mnt_idmap *idmap, struct inode *inode, unsigned int ia_valid); int setattr_prepare(struct mnt_idmap *, struct dentry *, struct iattr *); extern int inode_newsize_ok(const struct inode *, loff_t offset); void setattr_copy(struct mnt_idmap *, struct inode *inode, const struct iattr *attr); extern int file_update_time(struct file *file); static inline bool file_is_dax(const struct file *file) { return file && IS_DAX(file->f_mapping->host); } static inline bool vma_is_dax(const struct vm_area_struct *vma) { return file_is_dax(vma->vm_file); } static inline bool vma_is_fsdax(struct vm_area_struct *vma) { struct inode *inode; if (!IS_ENABLED(CONFIG_FS_DAX) || !vma->vm_file) return false; if (!vma_is_dax(vma)) return false; inode = file_inode(vma->vm_file); if (S_ISCHR(inode->i_mode)) return false; /* device-dax */ return true; } static inline int iocb_flags(struct file *file) { int res = 0; if (file->f_flags & O_APPEND) res |= IOCB_APPEND; if (file->f_flags & O_DIRECT) res |= IOCB_DIRECT; if (file->f_flags & O_DSYNC) res |= IOCB_DSYNC; if (file->f_flags & __O_SYNC) res |= IOCB_SYNC; return res; } static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags, int rw_type) { int kiocb_flags = 0; /* make sure there's no overlap between RWF and private IOCB flags */ BUILD_BUG_ON((__force int) RWF_SUPPORTED & IOCB_EVENTFD); if (!flags) return 0; if (unlikely(flags & ~RWF_SUPPORTED)) return -EOPNOTSUPP; if (unlikely((flags & RWF_APPEND) && (flags & RWF_NOAPPEND))) return -EINVAL; if (flags & RWF_NOWAIT) { if (!(ki->ki_filp->f_mode & FMODE_NOWAIT)) return -EOPNOTSUPP; } if (flags & RWF_ATOMIC) { if (rw_type != WRITE) return -EOPNOTSUPP; if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE)) return -EOPNOTSUPP; } if (flags & RWF_DONTCACHE) { /* file system must support it */ if (!(ki->ki_filp->f_op->fop_flags & FOP_DONTCACHE)) return -EOPNOTSUPP; /* DAX mappings not supported */ if (IS_DAX(ki->ki_filp->f_mapping->host)) return -EOPNOTSUPP; } kiocb_flags |= (__force int) (flags & RWF_SUPPORTED); if (flags & RWF_SYNC) kiocb_flags |= IOCB_DSYNC; if ((flags & RWF_NOAPPEND) && (ki->ki_flags & IOCB_APPEND)) { if (IS_APPEND(file_inode(ki->ki_filp))) return -EPERM; ki->ki_flags &= ~IOCB_APPEND; } ki->ki_flags |= kiocb_flags; return 0; } /* Transaction based IO helpers */ /* * An argresp is stored in an allocated page and holds the * size of the argument or response, along with its content */ struct simple_transaction_argresp { ssize_t size; char data[]; }; #define SIMPLE_TRANSACTION_LIMIT (PAGE_SIZE - sizeof(struct simple_transaction_argresp)) char *simple_transaction_get(struct file *file, const char __user *buf, size_t size); ssize_t simple_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos); int simple_transaction_release(struct inode *inode, struct file *file); void simple_transaction_set(struct file *file, size_t n); /* * simple attribute files * * These attributes behave similar to those in sysfs: * * Writing to an attribute immediately sets a value, an open file can be * written to multiple times. * * Reading from an attribute creates a buffer from the value that might get * read with multiple read calls. When the attribute has been read * completely, no further read calls are possible until the file is opened * again. * * All attributes contain a text representation of a numeric value * that are accessed with the get() and set() functions. */ #define DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, __is_signed) \ static int __fops ## _open(struct inode *inode, struct file *file) \ { \ __simple_attr_check_format(__fmt, 0ull); \ return simple_attr_open(inode, file, __get, __set, __fmt); \ } \ static const struct file_operations __fops = { \ .owner = THIS_MODULE, \ .open = __fops ## _open, \ .release = simple_attr_release, \ .read = simple_attr_read, \ .write = (__is_signed) ? simple_attr_write_signed : simple_attr_write, \ .llseek = generic_file_llseek, \ } #define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt) \ DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, false) #define DEFINE_SIMPLE_ATTRIBUTE_SIGNED(__fops, __get, __set, __fmt) \ DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, true) static inline __printf(1, 2) void __simple_attr_check_format(const char *fmt, ...) { /* don't do anything, just let the compiler check the arguments; */ } int simple_attr_open(struct inode *inode, struct file *file, int (*get)(void *, u64 *), int (*set)(void *, u64), const char *fmt); int simple_attr_release(struct inode *inode, struct file *file); ssize_t simple_attr_read(struct file *file, char __user *buf, size_t len, loff_t *ppos); ssize_t simple_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos); ssize_t simple_attr_write_signed(struct file *file, const char __user *buf, size_t len, loff_t *ppos); int __init list_bdev_fs_names(char *buf, size_t size); #define __FMODE_EXEC ((__force int) FMODE_EXEC) #define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE]) #define OPEN_FMODE(flag) ((__force fmode_t)((flag + 1) & O_ACCMODE)) static inline bool is_sxid(umode_t mode) { return mode & (S_ISUID | S_ISGID); } static inline int check_sticky(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode) { if (!(dir->i_mode & S_ISVTX)) return 0; return __check_sticky(idmap, dir, inode); } static inline void inode_has_no_xattr(struct inode *inode) { if (!is_sxid(inode->i_mode) && (inode->i_sb->s_flags & SB_NOSEC)) inode->i_flags |= S_NOSEC; } static inline bool is_root_inode(struct inode *inode) { return inode == inode->i_sb->s_root->d_inode; } static inline bool dir_emit(struct dir_context *ctx, const char *name, int namelen, u64 ino, unsigned type) { unsigned int dt_mask = S_DT_MASK | ctx->dt_flags_mask; return ctx->actor(ctx, name, namelen, ctx->pos, ino, type & dt_mask); } static inline bool dir_emit_dot(struct file *file, struct dir_context *ctx) { return ctx->actor(ctx, ".", 1, ctx->pos, file->f_path.dentry->d_inode->i_ino, DT_DIR); } static inline bool dir_emit_dotdot(struct file *file, struct dir_context *ctx) { return ctx->actor(ctx, "..", 2, ctx->pos, d_parent_ino(file->f_path.dentry), DT_DIR); } static inline bool dir_emit_dots(struct file *file, struct dir_context *ctx) { if (ctx->pos == 0) { if (!dir_emit_dot(file, ctx)) return false; ctx->pos = 1; } if (ctx->pos == 1) { if (!dir_emit_dotdot(file, ctx)) return false; ctx->pos = 2; } return true; } static inline bool dir_relax(struct inode *inode) { inode_unlock(inode); inode_lock(inode); return !IS_DEADDIR(inode); } static inline bool dir_relax_shared(struct inode *inode) { inode_unlock_shared(inode); inode_lock_shared(inode); return !IS_DEADDIR(inode); } extern bool path_noexec(const struct path *path); extern void inode_nohighmem(struct inode *inode); /* mm/fadvise.c */ extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice); extern int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice); static inline bool vfs_empty_path(int dfd, const char __user *path) { char c; if (dfd < 0) return false; /* We now allow NULL to be used for empty path. */ if (!path) return true; if (unlikely(get_user(c, path))) return false; return !c; } int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter); static inline bool extensible_ioctl_valid(unsigned int cmd_a, unsigned int cmd_b, size_t min_size) { if (_IOC_DIR(cmd_a) != _IOC_DIR(cmd_b)) return false; if (_IOC_TYPE(cmd_a) != _IOC_TYPE(cmd_b)) return false; if (_IOC_NR(cmd_a) != _IOC_NR(cmd_b)) return false; if (_IOC_SIZE(cmd_a) < min_size) return false; return true; } #endif /* _LINUX_FS_H */
2 57 5 58 1 1 1 16 2 2 2 6 6 15 26 1 25 1 1 2 1 28 1 27 35 10 25 25 25 1 10 7 10 10 25 9 29 46 11 19 2 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 /* SPDX-License-Identifier: GPL-2.0-only */ /* * v4l2-tpg.h - Test Pattern Generator * * Copyright 2014 Cisco Systems, Inc. and/or its affiliates. All rights reserved. */ #ifndef _V4L2_TPG_H_ #define _V4L2_TPG_H_ #include <linux/types.h> #include <linux/errno.h> #include <linux/random.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/videodev2.h> struct tpg_rbg_color8 { unsigned char r, g, b; }; struct tpg_rbg_color16 { __u16 r, g, b; }; enum tpg_color { TPG_COLOR_CSC_WHITE, TPG_COLOR_CSC_YELLOW, TPG_COLOR_CSC_CYAN, TPG_COLOR_CSC_GREEN, TPG_COLOR_CSC_MAGENTA, TPG_COLOR_CSC_RED, TPG_COLOR_CSC_BLUE, TPG_COLOR_CSC_BLACK, TPG_COLOR_75_YELLOW, TPG_COLOR_75_CYAN, TPG_COLOR_75_GREEN, TPG_COLOR_75_MAGENTA, TPG_COLOR_75_RED, TPG_COLOR_75_BLUE, TPG_COLOR_100_WHITE, TPG_COLOR_100_YELLOW, TPG_COLOR_100_CYAN, TPG_COLOR_100_GREEN, TPG_COLOR_100_MAGENTA, TPG_COLOR_100_RED, TPG_COLOR_100_BLUE, TPG_COLOR_100_BLACK, TPG_COLOR_TEXTFG, TPG_COLOR_TEXTBG, TPG_COLOR_RANDOM, TPG_COLOR_RAMP, TPG_COLOR_MAX = TPG_COLOR_RAMP + 256 }; extern const struct tpg_rbg_color8 tpg_colors[TPG_COLOR_MAX]; extern const unsigned short tpg_rec709_to_linear[255 * 16 + 1]; extern const unsigned short tpg_linear_to_rec709[255 * 16 + 1]; extern const struct tpg_rbg_color16 tpg_csc_colors[V4L2_COLORSPACE_DCI_P3 + 1] [V4L2_XFER_FUNC_SMPTE2084 + 1] [TPG_COLOR_CSC_BLACK + 1]; enum tpg_pattern { TPG_PAT_75_COLORBAR, TPG_PAT_100_COLORBAR, TPG_PAT_CSC_COLORBAR, TPG_PAT_100_HCOLORBAR, TPG_PAT_100_COLORSQUARES, TPG_PAT_BLACK, TPG_PAT_WHITE, TPG_PAT_RED, TPG_PAT_GREEN, TPG_PAT_BLUE, TPG_PAT_CHECKERS_16X16, TPG_PAT_CHECKERS_2X2, TPG_PAT_CHECKERS_1X1, TPG_PAT_COLOR_CHECKERS_2X2, TPG_PAT_COLOR_CHECKERS_1X1, TPG_PAT_ALTERNATING_HLINES, TPG_PAT_ALTERNATING_VLINES, TPG_PAT_CROSS_1_PIXEL, TPG_PAT_CROSS_2_PIXELS, TPG_PAT_CROSS_10_PIXELS, TPG_PAT_GRAY_RAMP, /* Must be the last pattern */ TPG_PAT_NOISE, }; extern const char * const tpg_pattern_strings[]; enum tpg_quality { TPG_QUAL_COLOR, TPG_QUAL_GRAY, TPG_QUAL_NOISE }; enum tpg_video_aspect { TPG_VIDEO_ASPECT_IMAGE, TPG_VIDEO_ASPECT_4X3, TPG_VIDEO_ASPECT_14X9_CENTRE, TPG_VIDEO_ASPECT_16X9_CENTRE, TPG_VIDEO_ASPECT_16X9_ANAMORPHIC, }; enum tpg_pixel_aspect { TPG_PIXEL_ASPECT_SQUARE, TPG_PIXEL_ASPECT_NTSC, TPG_PIXEL_ASPECT_PAL, }; enum tpg_move_mode { TPG_MOVE_NEG_FAST, TPG_MOVE_NEG, TPG_MOVE_NEG_SLOW, TPG_MOVE_NONE, TPG_MOVE_POS_SLOW, TPG_MOVE_POS, TPG_MOVE_POS_FAST, }; enum tgp_color_enc { TGP_COLOR_ENC_RGB, TGP_COLOR_ENC_YCBCR, TGP_COLOR_ENC_HSV, TGP_COLOR_ENC_LUMA, }; extern const char * const tpg_aspect_strings[]; #define TPG_MAX_PLANES 3 #define TPG_MAX_PAT_LINES 8 struct tpg_data { /* Source frame size */ unsigned src_width, src_height; /* Buffer height */ unsigned buf_height; /* Scaled output frame size */ unsigned scaled_width; u32 field; bool field_alternate; /* crop coordinates are frame-based */ struct v4l2_rect crop; /* compose coordinates are format-based */ struct v4l2_rect compose; /* border and square coordinates are frame-based */ struct v4l2_rect border; struct v4l2_rect square; /* Color-related fields */ enum tpg_quality qual; unsigned qual_offset; u8 alpha_component; bool alpha_red_only; u8 brightness; u8 contrast; u8 saturation; s16 hue; u32 fourcc; enum tgp_color_enc color_enc; u32 colorspace; u32 xfer_func; u32 ycbcr_enc; u32 hsv_enc; /* * Stores the actual transfer function, i.e. will never be * V4L2_XFER_FUNC_DEFAULT. */ u32 real_xfer_func; /* * Stores the actual Y'CbCr encoding, i.e. will never be * V4L2_YCBCR_ENC_DEFAULT. */ u32 real_hsv_enc; u32 real_ycbcr_enc; u32 quantization; /* * Stores the actual quantization, i.e. will never be * V4L2_QUANTIZATION_DEFAULT. */ u32 real_quantization; enum tpg_video_aspect vid_aspect; enum tpg_pixel_aspect pix_aspect; unsigned rgb_range; unsigned real_rgb_range; unsigned buffers; unsigned planes; bool interleaved; u8 vdownsampling[TPG_MAX_PLANES]; u8 hdownsampling[TPG_MAX_PLANES]; /* * horizontal positions must be ANDed with this value to enforce * correct boundaries for packed YUYV values. */ unsigned hmask[TPG_MAX_PLANES]; /* Used to store the colors in native format, either RGB or YUV */ u8 colors[TPG_COLOR_MAX][3]; u8 textfg[TPG_MAX_PLANES][8], textbg[TPG_MAX_PLANES][8]; /* size in bytes for two pixels in each plane */ unsigned twopixelsize[TPG_MAX_PLANES]; unsigned bytesperline[TPG_MAX_PLANES]; /* Configuration */ enum tpg_pattern pattern; bool hflip; bool vflip; unsigned perc_fill; bool perc_fill_blank; bool show_border; bool show_square; bool insert_sav; bool insert_eav; bool insert_hdmi_video_guard_band; /* Test pattern movement */ enum tpg_move_mode mv_hor_mode; int mv_hor_count; int mv_hor_step; enum tpg_move_mode mv_vert_mode; int mv_vert_count; int mv_vert_step; bool recalc_colors; bool recalc_lines; bool recalc_square_border; /* Used to store TPG_MAX_PAT_LINES lines, each with up to two planes */ unsigned max_line_width; u8 *lines[TPG_MAX_PAT_LINES][TPG_MAX_PLANES]; u8 *downsampled_lines[TPG_MAX_PAT_LINES][TPG_MAX_PLANES]; u8 *random_line[TPG_MAX_PLANES]; u8 *contrast_line[TPG_MAX_PLANES]; u8 *black_line[TPG_MAX_PLANES]; }; void tpg_init(struct tpg_data *tpg, unsigned w, unsigned h); int tpg_alloc(struct tpg_data *tpg, unsigned max_w); void tpg_free(struct tpg_data *tpg); void tpg_reset_source(struct tpg_data *tpg, unsigned width, unsigned height, u32 field); void tpg_log_status(struct tpg_data *tpg); void tpg_set_font(const u8 *f); void tpg_gen_text(const struct tpg_data *tpg, u8 *basep[TPG_MAX_PLANES][2], int y, int x, const char *text); void tpg_calc_text_basep(struct tpg_data *tpg, u8 *basep[TPG_MAX_PLANES][2], unsigned p, u8 *vbuf); unsigned tpg_g_interleaved_plane(const struct tpg_data *tpg, unsigned buf_line); void tpg_fill_plane_buffer(struct tpg_data *tpg, v4l2_std_id std, unsigned p, u8 *vbuf); void tpg_fillbuffer(struct tpg_data *tpg, v4l2_std_id std, unsigned p, u8 *vbuf); bool tpg_s_fourcc(struct tpg_data *tpg, u32 fourcc); void tpg_s_crop_compose(struct tpg_data *tpg, const struct v4l2_rect *crop, const struct v4l2_rect *compose); const char *tpg_g_color_order(const struct tpg_data *tpg); static inline void tpg_s_pattern(struct tpg_data *tpg, enum tpg_pattern pattern) { if (tpg->pattern == pattern) return; tpg->pattern = pattern; tpg->recalc_colors = true; } static inline void tpg_s_quality(struct tpg_data *tpg, enum tpg_quality qual, unsigned qual_offset) { if (tpg->qual == qual && tpg->qual_offset == qual_offset) return; tpg->qual = qual; tpg->qual_offset = qual_offset; tpg->recalc_colors = true; } static inline enum tpg_quality tpg_g_quality(const struct tpg_data *tpg) { return tpg->qual; } static inline void tpg_s_alpha_component(struct tpg_data *tpg, u8 alpha_component) { if (tpg->alpha_component == alpha_component) return; tpg->alpha_component = alpha_component; tpg->recalc_colors = true; } static inline void tpg_s_alpha_mode(struct tpg_data *tpg, bool red_only) { if (tpg->alpha_red_only == red_only) return; tpg->alpha_red_only = red_only; tpg->recalc_colors = true; } static inline void tpg_s_brightness(struct tpg_data *tpg, u8 brightness) { if (tpg->brightness == brightness) return; tpg->brightness = brightness; tpg->recalc_colors = true; } static inline void tpg_s_contrast(struct tpg_data *tpg, u8 contrast) { if (tpg->contrast == contrast) return; tpg->contrast = contrast; tpg->recalc_colors = true; } static inline void tpg_s_saturation(struct tpg_data *tpg, u8 saturation) { if (tpg->saturation == saturation) return; tpg->saturation = saturation; tpg->recalc_colors = true; } static inline void tpg_s_hue(struct tpg_data *tpg, s16 hue) { hue = clamp_t(s16, hue, -128, 128); if (tpg->hue == hue) return; tpg->hue = hue; tpg->recalc_colors = true; } static inline void tpg_s_rgb_range(struct tpg_data *tpg, unsigned rgb_range) { if (tpg->rgb_range == rgb_range) return; tpg->rgb_range = rgb_range; tpg->recalc_colors = true; } static inline void tpg_s_real_rgb_range(struct tpg_data *tpg, unsigned rgb_range) { if (tpg->real_rgb_range == rgb_range) return; tpg->real_rgb_range = rgb_range; tpg->recalc_colors = true; } static inline void tpg_s_colorspace(struct tpg_data *tpg, u32 colorspace) { if (tpg->colorspace == colorspace) return; tpg->colorspace = colorspace; tpg->recalc_colors = true; } static inline u32 tpg_g_colorspace(const struct tpg_data *tpg) { return tpg->colorspace; } static inline void tpg_s_ycbcr_enc(struct tpg_data *tpg, u32 ycbcr_enc) { if (tpg->ycbcr_enc == ycbcr_enc) return; tpg->ycbcr_enc = ycbcr_enc; tpg->recalc_colors = true; } static inline u32 tpg_g_ycbcr_enc(const struct tpg_data *tpg) { return tpg->ycbcr_enc; } static inline void tpg_s_hsv_enc(struct tpg_data *tpg, u32 hsv_enc) { if (tpg->hsv_enc == hsv_enc) return; tpg->hsv_enc = hsv_enc; tpg->recalc_colors = true; } static inline u32 tpg_g_hsv_enc(const struct tpg_data *tpg) { return tpg->hsv_enc; } static inline void tpg_s_xfer_func(struct tpg_data *tpg, u32 xfer_func) { if (tpg->xfer_func == xfer_func) return; tpg->xfer_func = xfer_func; tpg->recalc_colors = true; } static inline u32 tpg_g_xfer_func(const struct tpg_data *tpg) { return tpg->xfer_func; } static inline void tpg_s_quantization(struct tpg_data *tpg, u32 quantization) { if (tpg->quantization == quantization) return; tpg->quantization = quantization; tpg->recalc_colors = true; } static inline u32 tpg_g_quantization(const struct tpg_data *tpg) { return tpg->quantization; } static inline unsigned tpg_g_buffers(const struct tpg_data *tpg) { return tpg->buffers; } static inline unsigned tpg_g_planes(const struct tpg_data *tpg) { return tpg->interleaved ? 1 : tpg->planes; } static inline bool tpg_g_interleaved(const struct tpg_data *tpg) { return tpg->interleaved; } static inline unsigned tpg_g_twopixelsize(const struct tpg_data *tpg, unsigned plane) { return tpg->twopixelsize[plane]; } static inline unsigned tpg_hdiv(const struct tpg_data *tpg, unsigned plane, unsigned x) { return ((x / tpg->hdownsampling[plane]) & tpg->hmask[plane]) * tpg->twopixelsize[plane] / 2; } static inline unsigned tpg_hscale(const struct tpg_data *tpg, unsigned x) { return (x * tpg->scaled_width) / tpg->src_width; } static inline unsigned tpg_hscale_div(const struct tpg_data *tpg, unsigned plane, unsigned x) { return tpg_hdiv(tpg, plane, tpg_hscale(tpg, x)); } static inline unsigned tpg_g_bytesperline(const struct tpg_data *tpg, unsigned plane) { return tpg->bytesperline[plane]; } static inline void tpg_s_bytesperline(struct tpg_data *tpg, unsigned plane, unsigned bpl) { unsigned p; if (tpg->buffers > 1) { tpg->bytesperline[plane] = bpl; return; } for (p = 0; p < tpg_g_planes(tpg); p++) { unsigned plane_w = bpl * tpg->twopixelsize[p] / tpg->twopixelsize[0]; tpg->bytesperline[p] = plane_w / tpg->hdownsampling[p]; } if (tpg_g_interleaved(tpg)) tpg->bytesperline[1] = tpg->bytesperline[0]; } static inline unsigned tpg_g_line_width(const struct tpg_data *tpg, unsigned plane) { unsigned w = 0; unsigned p; if (tpg->buffers > 1) return tpg_g_bytesperline(tpg, plane); for (p = 0; p < tpg_g_planes(tpg); p++) { unsigned plane_w = tpg_g_bytesperline(tpg, p); w += plane_w / tpg->vdownsampling[p]; } return w; } static inline unsigned tpg_calc_line_width(const struct tpg_data *tpg, unsigned plane, unsigned bpl) { unsigned w = 0; unsigned p; if (tpg->buffers > 1) return bpl; for (p = 0; p < tpg_g_planes(tpg); p++) { unsigned plane_w = bpl * tpg->twopixelsize[p] / tpg->twopixelsize[0]; plane_w /= tpg->hdownsampling[p]; w += plane_w / tpg->vdownsampling[p]; } return w; } static inline unsigned tpg_calc_plane_size(const struct tpg_data *tpg, unsigned plane) { if (plane >= tpg_g_planes(tpg)) return 0; return tpg_g_bytesperline(tpg, plane) * tpg->buf_height / tpg->vdownsampling[plane]; } static inline void tpg_s_buf_height(struct tpg_data *tpg, unsigned h) { tpg->buf_height = h; } static inline void tpg_s_field(struct tpg_data *tpg, unsigned field, bool alternate) { tpg->field = field; tpg->field_alternate = alternate; } static inline void tpg_s_perc_fill(struct tpg_data *tpg, unsigned perc_fill) { tpg->perc_fill = perc_fill; } static inline unsigned tpg_g_perc_fill(const struct tpg_data *tpg) { return tpg->perc_fill; } static inline void tpg_s_perc_fill_blank(struct tpg_data *tpg, bool perc_fill_blank) { tpg->perc_fill_blank = perc_fill_blank; } static inline void tpg_s_video_aspect(struct tpg_data *tpg, enum tpg_video_aspect vid_aspect) { if (tpg->vid_aspect == vid_aspect) return; tpg->vid_aspect = vid_aspect; tpg->recalc_square_border = true; } static inline enum tpg_video_aspect tpg_g_video_aspect(const struct tpg_data *tpg) { return tpg->vid_aspect; } static inline void tpg_s_pixel_aspect(struct tpg_data *tpg, enum tpg_pixel_aspect pix_aspect) { if (tpg->pix_aspect == pix_aspect) return; tpg->pix_aspect = pix_aspect; tpg->recalc_square_border = true; } static inline void tpg_s_show_border(struct tpg_data *tpg, bool show_border) { tpg->show_border = show_border; } static inline void tpg_s_show_square(struct tpg_data *tpg, bool show_square) { tpg->show_square = show_square; } static inline void tpg_s_insert_sav(struct tpg_data *tpg, bool insert_sav) { tpg->insert_sav = insert_sav; } static inline void tpg_s_insert_eav(struct tpg_data *tpg, bool insert_eav) { tpg->insert_eav = insert_eav; } /* * This inserts 4 pixels of the RGB color 0xab55ab at the left hand side of the * image. This is only done for 3 or 4 byte RGB pixel formats. This pixel value * equals the Video Guard Band value as defined by HDMI (see section 5.2.2.1 * in the HDMI 1.3 Specification) that preceeds the first actual pixel. If the * HDMI receiver doesn't handle this correctly, then it might keep skipping * these Video Guard Band patterns and end up with a shorter video line. So this * is a nice pattern to test with. */ static inline void tpg_s_insert_hdmi_video_guard_band(struct tpg_data *tpg, bool insert_hdmi_video_guard_band) { tpg->insert_hdmi_video_guard_band = insert_hdmi_video_guard_band; } void tpg_update_mv_step(struct tpg_data *tpg); static inline void tpg_s_mv_hor_mode(struct tpg_data *tpg, enum tpg_move_mode mv_hor_mode) { tpg->mv_hor_mode = mv_hor_mode; tpg_update_mv_step(tpg); } static inline void tpg_s_mv_vert_mode(struct tpg_data *tpg, enum tpg_move_mode mv_vert_mode) { tpg->mv_vert_mode = mv_vert_mode; tpg_update_mv_step(tpg); } static inline void tpg_init_mv_count(struct tpg_data *tpg) { tpg->mv_hor_count = tpg->mv_vert_count = 0; } static inline void tpg_update_mv_count(struct tpg_data *tpg, bool frame_is_field) { tpg->mv_hor_count += tpg->mv_hor_step * (frame_is_field ? 1 : 2); tpg->mv_vert_count += tpg->mv_vert_step * (frame_is_field ? 1 : 2); } static inline void tpg_s_hflip(struct tpg_data *tpg, bool hflip) { if (tpg->hflip == hflip) return; tpg->hflip = hflip; tpg_update_mv_step(tpg); tpg->recalc_lines = true; } static inline bool tpg_g_hflip(const struct tpg_data *tpg) { return tpg->hflip; } static inline void tpg_s_vflip(struct tpg_data *tpg, bool vflip) { tpg->vflip = vflip; } static inline bool tpg_g_vflip(const struct tpg_data *tpg) { return tpg->vflip; } static inline bool tpg_pattern_is_static(const struct tpg_data *tpg) { return tpg->pattern != TPG_PAT_NOISE && tpg->mv_hor_mode == TPG_MOVE_NONE && tpg->mv_vert_mode == TPG_MOVE_NONE; } #endif
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C)2006 USAGI/WIDE Project * * Author: * Masahide NAKAMURA @USAGI <masahide.nakamura.cz@hitachi.com> * * Based on net/netfilter/xt_tcpudp.c */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/module.h> #include <net/ip.h> #include <linux/ipv6.h> #include <net/ipv6.h> #include <net/mip6.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv6/ip6t_mh.h> MODULE_DESCRIPTION("Xtables: IPv6 Mobility Header match"); MODULE_LICENSE("GPL"); /* Returns 1 if the type is matched by the range, 0 otherwise */ static inline bool type_match(u_int8_t min, u_int8_t max, u_int8_t type, bool invert) { return (type >= min && type <= max) ^ invert; } static bool mh_mt6(const struct sk_buff *skb, struct xt_action_param *par) { struct ip6_mh _mh; const struct ip6_mh *mh; const struct ip6t_mh *mhinfo = par->matchinfo; /* Must not be a fragment. */ if (par->fragoff != 0) return false; mh = skb_header_pointer(skb, par->thoff, sizeof(_mh), &_mh); if (mh == NULL) { /* We've been asked to examine this packet, and we can't. Hence, no choice but to drop. */ pr_debug("Dropping evil MH tinygram.\n"); par->hotdrop = true; return false; } if (mh->ip6mh_proto != IPPROTO_NONE) { pr_debug("Dropping invalid MH Payload Proto: %u\n", mh->ip6mh_proto); par->hotdrop = true; return false; } return type_match(mhinfo->types[0], mhinfo->types[1], mh->ip6mh_type, !!(mhinfo->invflags & IP6T_MH_INV_TYPE)); } static int mh_mt6_check(const struct xt_mtchk_param *par) { const struct ip6t_mh *mhinfo = par->matchinfo; /* Must specify no unknown invflags */ return (mhinfo->invflags & ~IP6T_MH_INV_MASK) ? -EINVAL : 0; } static struct xt_match mh_mt6_reg __read_mostly = { .name = "mh", .family = NFPROTO_IPV6, .checkentry = mh_mt6_check, .match = mh_mt6, .matchsize = sizeof(struct ip6t_mh), .proto = IPPROTO_MH, .me = THIS_MODULE, }; static int __init mh_mt6_init(void) { return xt_register_match(&mh_mt6_reg); } static void __exit mh_mt6_exit(void) { xt_unregister_match(&mh_mt6_reg); } module_init(mh_mt6_init); module_exit(mh_mt6_exit);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 /* SPDX-License-Identifier: GPL-2.0 */ /* Interface for implementing AF_XDP zero-copy support in drivers. * Copyright(c) 2020 Intel Corporation. */ #ifndef _LINUX_XDP_SOCK_DRV_H #define _LINUX_XDP_SOCK_DRV_H #include <net/xdp_sock.h> #include <net/xsk_buff_pool.h> #define XDP_UMEM_MIN_CHUNK_SHIFT 11 #define XDP_UMEM_MIN_CHUNK_SIZE (1 << XDP_UMEM_MIN_CHUNK_SHIFT) #define NETDEV_XDP_ACT_XSK (NETDEV_XDP_ACT_BASIC | \ NETDEV_XDP_ACT_REDIRECT | \ NETDEV_XDP_ACT_XSK_ZEROCOPY) struct xsk_cb_desc { void *src; u8 off; u8 bytes; }; #ifdef CONFIG_XDP_SOCKETS void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries); bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc); u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max); void xsk_tx_release(struct xsk_buff_pool *pool); struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, u16 queue_id); void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool); void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool); void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool); void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool); bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool); static inline u32 xsk_pool_get_headroom(struct xsk_buff_pool *pool) { return XDP_PACKET_HEADROOM + pool->headroom; } static inline u32 xsk_pool_get_tailroom(bool mbuf) { return mbuf ? SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) : 0; } static inline u32 xsk_pool_get_chunk_size(struct xsk_buff_pool *pool) { return pool->chunk_size; } static inline u32 __xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool) { return xsk_pool_get_chunk_size(pool) - xsk_pool_get_headroom(pool); } static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool) { u32 frame_size = __xsk_pool_get_rx_frame_size(pool); struct xdp_umem *umem = pool->umem; bool mbuf; /* Reserve tailroom only for zero-copy pools that opted into * multi-buffer. The reserved area is used for skb_shared_info, * matching the XDP core's xdp_data_hard_end() layout. */ mbuf = pool->dev && (umem->flags & XDP_UMEM_SG_FLAG); frame_size -= xsk_pool_get_tailroom(mbuf); return ALIGN_DOWN(frame_size, 128); } static inline u32 xsk_pool_get_rx_frag_step(struct xsk_buff_pool *pool) { return pool->unaligned ? 0 : xsk_pool_get_chunk_size(pool); } static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq) { xp_set_rxq_info(pool, rxq); } static inline void xsk_pool_fill_cb(struct xsk_buff_pool *pool, struct xsk_cb_desc *desc) { xp_fill_cb(pool, desc); } static inline void xsk_pool_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) { xp_dma_unmap(pool, attrs); } static inline int xsk_pool_dma_map(struct xsk_buff_pool *pool, struct device *dev, unsigned long attrs) { struct xdp_umem *umem = pool->umem; return xp_dma_map(pool, dev, attrs, umem->pgs, umem->npgs); } static inline dma_addr_t xsk_buff_xdp_get_dma(struct xdp_buff *xdp) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); return xp_get_dma(xskb); } static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); return xp_get_frame_dma(xskb); } static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool) { return xp_alloc(pool); } static inline bool xsk_is_eop_desc(const struct xdp_desc *desc) { return !xp_mb_desc(desc); } /* Returns as many entries as possible up to max. 0 <= N <= max. */ static inline u32 xsk_buff_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max) { return xp_alloc_batch(pool, xdp, max); } static inline bool xsk_buff_can_alloc(struct xsk_buff_pool *pool, u32 count) { return xp_can_alloc(pool, count); } static inline void xsk_buff_free(struct xdp_buff *xdp) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); struct list_head *xskb_list = &xskb->pool->xskb_list; struct xdp_buff_xsk *pos, *tmp; if (likely(!xdp_buff_has_frags(xdp))) goto out; list_for_each_entry_safe(pos, tmp, xskb_list, list_node) { list_del_init(&pos->list_node); xp_free(pos); } xdp_get_shared_info_from_buff(xdp)->nr_frags = 0; out: xp_free(xskb); } static inline bool xsk_buff_add_frag(struct xdp_buff *head, struct xdp_buff *xdp) { const void *data = xdp->data; struct xdp_buff_xsk *frag; if (!__xdp_buff_add_frag(head, virt_to_netmem(data), offset_in_page(data), xdp->data_end - data, xdp->frame_sz, false)) return false; frag = container_of(xdp, struct xdp_buff_xsk, xdp); list_add_tail(&frag->list_node, &frag->pool->xskb_list); return true; } static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first) { struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); struct xdp_buff *ret = NULL; struct xdp_buff_xsk *frag; frag = list_first_entry_or_null(&xskb->pool->xskb_list, struct xdp_buff_xsk, list_node); if (frag) { list_del_init(&frag->list_node); ret = &frag->xdp; } return ret; } static inline void xsk_buff_del_frag(struct xdp_buff *xdp) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); list_del_init(&xskb->list_node); } static inline struct xdp_buff *xsk_buff_get_head(struct xdp_buff *first) { struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); struct xdp_buff_xsk *frag; frag = list_first_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk, list_node); return &frag->xdp; } static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) { struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); struct xdp_buff_xsk *frag; frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk, list_node); return &frag->xdp; } static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) { xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM; xdp->data_meta = xdp->data; xdp->data_end = xdp->data + size; xdp->flags = 0; } static inline dma_addr_t xsk_buff_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) { return xp_raw_get_dma(pool, addr); } static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) { return xp_raw_get_data(pool, addr); } /** * xsk_buff_raw_get_ctx - get &xdp_desc context * @pool: XSk buff pool desc address belongs to * @addr: desc address (from userspace) * * Wrapper for xp_raw_get_ctx() to be used in drivers, see its kdoc for * details. * * Return: new &xdp_desc_ctx struct containing desc's DMA address and metadata * pointer, if it is present and valid (initialized to %NULL otherwise). */ static inline struct xdp_desc_ctx xsk_buff_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr) { return xp_raw_get_ctx(pool, addr); } #define XDP_TXMD_FLAGS_VALID ( \ XDP_TXMD_FLAGS_TIMESTAMP | \ XDP_TXMD_FLAGS_CHECKSUM | \ XDP_TXMD_FLAGS_LAUNCH_TIME | \ 0) static inline bool xsk_buff_valid_tx_metadata(const struct xsk_tx_metadata *meta) { return !(meta->flags & ~XDP_TXMD_FLAGS_VALID); } static inline struct xsk_tx_metadata * __xsk_buff_get_metadata(const struct xsk_buff_pool *pool, void *data) { struct xsk_tx_metadata *meta; if (!pool->tx_metadata_len) return NULL; meta = data - pool->tx_metadata_len; if (unlikely(!xsk_buff_valid_tx_metadata(meta))) return NULL; /* no way to signal the error to the user */ return meta; } static inline struct xsk_tx_metadata * xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) { return __xsk_buff_get_metadata(pool, xp_raw_get_data(pool, addr)); } static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); xp_dma_sync_for_cpu(xskb); } static inline void xsk_buff_raw_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, size_t size) { xp_dma_sync_for_device(pool, dma, size); } #else static inline void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries) { } static inline bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { return false; } static inline u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max) { return 0; } static inline void xsk_tx_release(struct xsk_buff_pool *pool) { } static inline struct xsk_buff_pool * xsk_get_pool_from_qid(struct net_device *dev, u16 queue_id) { return NULL; } static inline void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool) { } static inline void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool) { } static inline void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool) { } static inline void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool) { } static inline bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool) { return false; } static inline u32 xsk_pool_get_headroom(struct xsk_buff_pool *pool) { return 0; } static inline u32 xsk_pool_get_chunk_size(struct xsk_buff_pool *pool) { return 0; } static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool) { return 0; } static inline u32 xsk_pool_get_rx_frag_step(struct xsk_buff_pool *pool) { return 0; } static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq) { } static inline void xsk_pool_fill_cb(struct xsk_buff_pool *pool, struct xsk_cb_desc *desc) { } static inline void xsk_pool_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) { } static inline int xsk_pool_dma_map(struct xsk_buff_pool *pool, struct device *dev, unsigned long attrs) { return 0; } static inline dma_addr_t xsk_buff_xdp_get_dma(struct xdp_buff *xdp) { return 0; } static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp) { return 0; } static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool) { return NULL; } static inline bool xsk_is_eop_desc(const struct xdp_desc *desc) { return false; } static inline u32 xsk_buff_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max) { return 0; } static inline bool xsk_buff_can_alloc(struct xsk_buff_pool *pool, u32 count) { return false; } static inline void xsk_buff_free(struct xdp_buff *xdp) { } static inline bool xsk_buff_add_frag(struct xdp_buff *head, struct xdp_buff *xdp) { return false; } static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first) { return NULL; } static inline void xsk_buff_del_frag(struct xdp_buff *xdp) { } static inline struct xdp_buff *xsk_buff_get_head(struct xdp_buff *first) { return NULL; } static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) { return NULL; } static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) { } static inline dma_addr_t xsk_buff_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) { return 0; } static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) { return NULL; } static inline struct xdp_desc_ctx xsk_buff_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr) { return (struct xdp_desc_ctx){ }; } static inline bool xsk_buff_valid_tx_metadata(struct xsk_tx_metadata *meta) { return false; } static inline struct xsk_tx_metadata * __xsk_buff_get_metadata(const struct xsk_buff_pool *pool, void *data) { return NULL; } static inline struct xsk_tx_metadata * xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) { return NULL; } static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) { } static inline void xsk_buff_raw_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, size_t size) { } #endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_DRV_H */
24159 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 /* SPDX-License-Identifier: GPL-2.0 */ /* linux/include/linux/clockchips.h * * This file contains the structure definitions for clockchips. * * If you are not a clockchip, or the time of day code, you should * not be including this file! */ #ifndef _LINUX_CLOCKCHIPS_H #define _LINUX_CLOCKCHIPS_H #ifdef CONFIG_GENERIC_CLOCKEVENTS # include <linux/clocksource.h> # include <linux/cpumask_types.h> # include <linux/ktime.h> # include <linux/notifier.h> struct clock_event_device; struct module; /* * Possible states of a clock event device. * * DETACHED: Device is not used by clockevents core. Initial state or can be * reached from SHUTDOWN. * SHUTDOWN: Device is powered-off. Can be reached from PERIODIC or ONESHOT. * PERIODIC: Device is programmed to generate events periodically. Can be * reached from DETACHED or SHUTDOWN. * ONESHOT: Device is programmed to generate event only once. Can be reached * from DETACHED or SHUTDOWN. * ONESHOT_STOPPED: Device was programmed in ONESHOT mode and is temporarily * stopped. */ enum clock_event_state { CLOCK_EVT_STATE_DETACHED, CLOCK_EVT_STATE_SHUTDOWN, CLOCK_EVT_STATE_PERIODIC, CLOCK_EVT_STATE_ONESHOT, CLOCK_EVT_STATE_ONESHOT_STOPPED, }; /* * Clock event features */ # define CLOCK_EVT_FEAT_PERIODIC 0x000001 # define CLOCK_EVT_FEAT_ONESHOT 0x000002 # define CLOCK_EVT_FEAT_CLOCKSOURCE_COUPLED 0x000004 /* * x86(64) specific (mis)features: * * - Clockevent source stops in C3 State and needs broadcast support. * - Local APIC timer is used as a dummy device. */ # define CLOCK_EVT_FEAT_C3STOP 0x000008 # define CLOCK_EVT_FEAT_DUMMY 0x000010 /* * Core shall set the interrupt affinity dynamically in broadcast mode */ # define CLOCK_EVT_FEAT_DYNIRQ 0x000020 # define CLOCK_EVT_FEAT_PERCPU 0x000040 /* * Clockevent device is based on a hrtimer for broadcast */ # define CLOCK_EVT_FEAT_HRTIMER 0x000080 /** * struct clock_event_device - clock event device descriptor * @event_handler: Assigned by the framework to be called by the low * level handler of the event source * @set_next_event: set next event function using a clocksource delta * @set_next_ktime: set next event function using a direct ktime value * @set_next_coupled: set next event function for clocksource coupled mode * @next_event: local storage for the next event in oneshot mode * @max_delta_ns: maximum delta value in ns * @min_delta_ns: minimum delta value in ns * @mult: nanosecond to cycles multiplier * @shift: nanoseconds to cycles divisor (power of two) * @state_use_accessors:current state of the device, assigned by the core code * @features: features * @cs_id: Clocksource ID to denote the clocksource for coupled mode * @next_event_forced: True if the last programming was a forced event * @retries: number of forced programming retries * @set_state_periodic: switch state to periodic * @set_state_oneshot: switch state to oneshot * @set_state_oneshot_stopped: switch state to oneshot_stopped * @set_state_shutdown: switch state to shutdown * @tick_resume: resume clkevt device * @broadcast: function to broadcast events * @min_delta_ticks: minimum delta value in ticks stored for reconfiguration * @max_delta_ticks: maximum delta value in ticks stored for reconfiguration * @name: ptr to clock event name * @rating: variable to rate clock event devices * @irq: IRQ number (only for non CPU local devices) * @bound_on: Bound on CPU * @cpumask: cpumask to indicate for which CPUs this device works * @list: list head for the management code * @owner: module reference */ struct clock_event_device { void (*event_handler)(struct clock_event_device *); int (*set_next_event)(unsigned long evt, struct clock_event_device *); int (*set_next_ktime)(ktime_t expires, struct clock_event_device *); void (*set_next_coupled)(u64 cycles, struct clock_event_device *); ktime_t next_event; u64 max_delta_ns; u64 min_delta_ns; u32 mult; u32 shift; enum clock_event_state state_use_accessors; unsigned int features; enum clocksource_ids cs_id; unsigned int next_event_forced; unsigned long retries; int (*set_state_periodic)(struct clock_event_device *); int (*set_state_oneshot)(struct clock_event_device *); int (*set_state_oneshot_stopped)(struct clock_event_device *); int (*set_state_shutdown)(struct clock_event_device *); int (*tick_resume)(struct clock_event_device *); void (*broadcast)(const struct cpumask *mask); void (*suspend)(struct clock_event_device *); void (*resume)(struct clock_event_device *); unsigned long min_delta_ticks; unsigned long max_delta_ticks; const char *name; int rating; int irq; int bound_on; const struct cpumask *cpumask; struct list_head list; struct module *owner; } ____cacheline_aligned; /* Helpers to verify state of a clockevent device */ static inline bool clockevent_state_detached(struct clock_event_device *dev) { return dev->state_use_accessors == CLOCK_EVT_STATE_DETACHED; } static inline bool clockevent_state_shutdown(struct clock_event_device *dev) { return dev->state_use_accessors == CLOCK_EVT_STATE_SHUTDOWN; } static inline bool clockevent_state_periodic(struct clock_event_device *dev) { return dev->state_use_accessors == CLOCK_EVT_STATE_PERIODIC; } static inline bool clockevent_state_oneshot(struct clock_event_device *dev) { return dev->state_use_accessors == CLOCK_EVT_STATE_ONESHOT; } static inline bool clockevent_state_oneshot_stopped(struct clock_event_device *dev) { return dev->state_use_accessors == CLOCK_EVT_STATE_ONESHOT_STOPPED; } /* * Calculate a multiplication factor for scaled math, which is used to convert * nanoseconds based values to clock ticks: * * clock_ticks = (nanoseconds * factor) >> shift. * * div_sc is the rearranged equation to calculate a factor from a given clock * ticks / nanoseconds ratio: * * factor = (clock_ticks << shift) / nanoseconds */ static inline unsigned long div_sc(unsigned long ticks, unsigned long nsec, int shift) { u64 tmp = ((u64)ticks) << shift; do_div(tmp, nsec); return (unsigned long) tmp; } /* Clock event layer functions */ extern u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt); extern void clockevents_register_device(struct clock_event_device *dev); extern int clockevents_unbind_device(struct clock_event_device *ced, int cpu); extern void clockevents_config_and_register(struct clock_event_device *dev, u32 freq, unsigned long min_delta, unsigned long max_delta); extern int clockevents_update_freq(struct clock_event_device *ce, u32 freq); static inline void clockevents_calc_mult_shift(struct clock_event_device *ce, u32 freq, u32 maxsec) { return clocks_calc_mult_shift(&ce->mult, &ce->shift, NSEC_PER_SEC, freq, maxsec); } extern void clockevents_suspend(void); extern void clockevents_resume(void); # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST # ifdef CONFIG_ARCH_HAS_TICK_BROADCAST extern void tick_broadcast(const struct cpumask *mask); # else # define tick_broadcast NULL # endif extern int tick_receive_broadcast(void); # endif # if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) extern void tick_setup_hrtimer_broadcast(void); extern int tick_check_broadcast_expired(void); # else static __always_inline int tick_check_broadcast_expired(void) { return 0; } static inline void tick_setup_hrtimer_broadcast(void) { } # endif #else /* !CONFIG_GENERIC_CLOCKEVENTS: */ static inline void clockevents_suspend(void) { } static inline void clockevents_resume(void) { } static __always_inline int tick_check_broadcast_expired(void) { return 0; } static inline void tick_setup_hrtimer_broadcast(void) { } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ #endif /* _LINUX_CLOCKCHIPS_H */
7430 7267 7247 4167 324 279 37 36 115 126 7455 7455 7458 7456 7482 7449 7459 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 // SPDX-License-Identifier: GPL-2.0 #include <linux/compiler.h> #include <linux/export.h> #include <linux/fault-inject-usercopy.h> #include <linux/kasan-checks.h> #include <linux/thread_info.h> #include <linux/uaccess.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/mm.h> #include <asm/byteorder.h> #include <asm/word-at-a-time.h> #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS #define IS_UNALIGNED(src, dst) 0 #else #define IS_UNALIGNED(src, dst) \ (((long) dst | (long) src) & (sizeof(long) - 1)) #endif /* * Do a strncpy, return length of string without final '\0'. * 'count' is the user-supplied count (return 'count' if we * hit it), 'max' is the address space maximum (and we return * -EFAULT if we hit it). */ static __always_inline long do_strncpy_from_user(char *dst, const char __user *src, unsigned long count, unsigned long max) { const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; unsigned long res = 0; if (IS_UNALIGNED(src, dst)) goto byte_at_a_time; while (max >= sizeof(unsigned long)) { unsigned long c, data, mask; /* Fall back to byte-at-a-time if we get a page fault */ unsafe_get_user(c, (unsigned long __user *)(src+res), byte_at_a_time); /* * Note that we mask out the bytes following the NUL. This is * important to do because string oblivious code may read past * the NUL. For those routines, we don't want to give them * potentially random bytes after the NUL in `src`. * * One example of such code is BPF map keys. BPF treats map keys * as an opaque set of bytes. Without the post-NUL mask, any BPF * maps keyed by strings returned from strncpy_from_user() may * have multiple entries for semantically identical strings. */ if (has_zero(c, &data, &constants)) { data = prep_zero_mask(c, data, &constants); data = create_zero_mask(data); mask = zero_bytemask(data); *(unsigned long *)(dst+res) = c & mask; return res + find_zero(data); } *(unsigned long *)(dst+res) = c; res += sizeof(unsigned long); max -= sizeof(unsigned long); } byte_at_a_time: while (max) { char c; unsafe_get_user(c,src+res, efault); dst[res] = c; if (!c) return res; res++; max--; } /* * Uhhuh. We hit 'max'. But was that the user-specified maximum * too? If so, that's ok - we got as much as the user asked for. */ if (res >= count) return res; /* * Nope: we hit the address space limit, and we still had more * characters the caller would have wanted. That's an EFAULT. */ efault: return -EFAULT; } /** * strncpy_from_user: - Copy a NUL terminated string from userspace. * @dst: Destination address, in kernel space. This buffer must be at * least @count bytes long. * @src: Source address, in user space. * @count: Maximum number of bytes to copy, including the trailing NUL. * * Copies a NUL-terminated string from userspace to kernel space. * * On success, returns the length of the string (not including the trailing * NUL). * * If access to userspace fails, returns -EFAULT (some data may have been * copied). * * If @count is smaller than the length of the string, copies @count bytes * and returns @count. */ long strncpy_from_user(char *dst, const char __user *src, long count) { unsigned long max_addr, src_addr; might_fault(); if (should_fail_usercopy()) return -EFAULT; if (unlikely(count <= 0)) return 0; kasan_check_write(dst, count); check_object_size(dst, count, false); if (can_do_masked_user_access()) { long retval; src = masked_user_read_access_begin(src); retval = do_strncpy_from_user(dst, src, count, count); user_read_access_end(); return retval; } max_addr = TASK_SIZE_MAX; src_addr = (unsigned long)untagged_addr(src); if (likely(src_addr < max_addr)) { unsigned long max = max_addr - src_addr; long retval; /* * Truncate 'max' to the user-specified limit, so that * we only have one limit we need to check in the loop */ if (max > count) max = count; if (user_read_access_begin(src, max)) { retval = do_strncpy_from_user(dst, src, count, max); user_read_access_end(); return retval; } } return -EFAULT; } EXPORT_SYMBOL(strncpy_from_user);
227 108 54 125 125 125 125 125 124 45 351 86 3 3 3 2 3 2 2 1 1 1 85 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_GRE_H #define __LINUX_GRE_H #include <linux/skbuff.h> #include <net/ip_tunnels.h> struct gre_base_hdr { __be16 flags; __be16 protocol; } __packed; struct gre_full_hdr { struct gre_base_hdr fixed_header; __be16 csum; __be16 reserved1; __be32 key; __be32 seq; } __packed; #define GRE_HEADER_SECTION 4 #define GREPROTO_CISCO 0 #define GREPROTO_PPTP 1 #define GREPROTO_MAX 2 #define GRE_IP_PROTO_MAX 2 struct gre_protocol { int (*handler)(struct sk_buff *skb); void (*err_handler)(struct sk_buff *skb, u32 info); }; int gre_add_protocol(const struct gre_protocol *proto, u8 version); int gre_del_protocol(const struct gre_protocol *proto, u8 version); struct net_device *gretap_fb_dev_create(struct net *net, const char *name, u8 name_assign_type); int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, bool *csum_err, __be16 proto, int nhs); static inline bool netif_is_gretap(const struct net_device *dev) { return dev->rtnl_link_ops && !strcmp(dev->rtnl_link_ops->kind, "gretap"); } static inline bool netif_is_ip6gretap(const struct net_device *dev) { return dev->rtnl_link_ops && !strcmp(dev->rtnl_link_ops->kind, "ip6gretap"); } static inline int gre_calc_hlen(const unsigned long *o_flags) { int addend = 4; if (test_bit(IP_TUNNEL_CSUM_BIT, o_flags)) addend += 4; if (test_bit(IP_TUNNEL_KEY_BIT, o_flags)) addend += 4; if (test_bit(IP_TUNNEL_SEQ_BIT, o_flags)) addend += 4; return addend; } static inline void gre_flags_to_tnl_flags(unsigned long *dst, __be16 flags) { IP_TUNNEL_DECLARE_FLAGS(res) = { }; __assign_bit(IP_TUNNEL_CSUM_BIT, res, flags & GRE_CSUM); __assign_bit(IP_TUNNEL_ROUTING_BIT, res, flags & GRE_ROUTING); __assign_bit(IP_TUNNEL_KEY_BIT, res, flags & GRE_KEY); __assign_bit(IP_TUNNEL_SEQ_BIT, res, flags & GRE_SEQ); __assign_bit(IP_TUNNEL_STRICT_BIT, res, flags & GRE_STRICT); __assign_bit(IP_TUNNEL_REC_BIT, res, flags & GRE_REC); __assign_bit(IP_TUNNEL_VERSION_BIT, res, flags & GRE_VERSION); ip_tunnel_flags_copy(dst, res); } static inline __be16 gre_tnl_flags_to_gre_flags(const unsigned long *tflags) { __be16 flags = 0; if (test_bit(IP_TUNNEL_CSUM_BIT, tflags)) flags |= GRE_CSUM; if (test_bit(IP_TUNNEL_ROUTING_BIT, tflags)) flags |= GRE_ROUTING; if (test_bit(IP_TUNNEL_KEY_BIT, tflags)) flags |= GRE_KEY; if (test_bit(IP_TUNNEL_SEQ_BIT, tflags)) flags |= GRE_SEQ; if (test_bit(IP_TUNNEL_STRICT_BIT, tflags)) flags |= GRE_STRICT; if (test_bit(IP_TUNNEL_REC_BIT, tflags)) flags |= GRE_REC; if (test_bit(IP_TUNNEL_VERSION_BIT, tflags)) flags |= GRE_VERSION; return flags; } static inline void gre_build_header(struct sk_buff *skb, int hdr_len, const unsigned long *flags, __be16 proto, __be32 key, __be32 seq) { IP_TUNNEL_DECLARE_FLAGS(cond) = { }; struct gre_base_hdr *greh; skb_push(skb, hdr_len); skb_set_inner_protocol(skb, proto); skb_reset_transport_header(skb); greh = (struct gre_base_hdr *)skb->data; greh->flags = gre_tnl_flags_to_gre_flags(flags); greh->protocol = proto; __set_bit(IP_TUNNEL_KEY_BIT, cond); __set_bit(IP_TUNNEL_CSUM_BIT, cond); __set_bit(IP_TUNNEL_SEQ_BIT, cond); if (ip_tunnel_flags_intersect(flags, cond)) { __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4); if (test_bit(IP_TUNNEL_SEQ_BIT, flags)) { *ptr = seq; ptr--; } if (test_bit(IP_TUNNEL_KEY_BIT, flags)) { *ptr = key; ptr--; } if (test_bit(IP_TUNNEL_CSUM_BIT, flags) && !(skb_shinfo(skb)->gso_type & (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) { *ptr = 0; if (skb->ip_summed == CHECKSUM_PARTIAL) { *(__sum16 *)ptr = csum_fold(lco_csum(skb)); } else { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = sizeof(*greh); } } } } #endif
8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 /* SPDX-License-Identifier: GPL-2.0 */ /* * BSD Process Accounting for Linux - Definitions * * Author: Marco van Wieringen (mvw@planets.elm.net) * * This header file contains the definitions needed to implement * BSD-style process accounting. The kernel accounting code and all * user-level programs that try to do something useful with the * process accounting log must include this file. * * Copyright (C) 1995 - 1997 Marco van Wieringen - ELM Consultancy B.V. * */ #ifndef _LINUX_ACCT_H #define _LINUX_ACCT_H #include <uapi/linux/acct.h> #ifdef CONFIG_BSD_PROCESS_ACCT struct pid_namespace; extern void acct_collect(long exitcode, int group_dead); extern void acct_process(void); extern void acct_exit_ns(struct pid_namespace *); #else #define acct_collect(x,y) do { } while (0) #define acct_process() do { } while (0) #define acct_exit_ns(ns) do { } while (0) #endif /* * ACCT_VERSION numbers as yet defined: * 0: old format (until 2.6.7) with 16 bit uid/gid * 1: extended variant (binary compatible on M68K) * 2: extended variant (binary compatible on everything except M68K) * 3: new binary incompatible format (64 bytes) * 4: new binary incompatible format (128 bytes) * 5: new binary incompatible format (128 bytes, second half) * */ #undef ACCT_VERSION #undef AHZ #ifdef CONFIG_BSD_PROCESS_ACCT_V3 #define ACCT_VERSION 3 #define AHZ 100 typedef struct acct_v3 acct_t; #else #ifdef CONFIG_M68K #define ACCT_VERSION 1 #else #define ACCT_VERSION 2 #endif #define AHZ (USER_HZ) typedef struct acct acct_t; #endif #include <linux/jiffies.h> /* * Yet another set of HZ to *HZ helper functions. * See <linux/jiffies.h> for the original. */ static inline u32 jiffies_to_AHZ(unsigned long x) { #if (TICK_NSEC % (NSEC_PER_SEC / AHZ)) == 0 # if HZ < AHZ return x * (AHZ / HZ); # else return x / (HZ / AHZ); # endif #else u64 tmp = (u64)x * TICK_NSEC; do_div(tmp, (NSEC_PER_SEC / AHZ)); return (long)tmp; #endif } static inline u64 nsec_to_AHZ(u64 x) { #if (NSEC_PER_SEC % AHZ) == 0 do_div(x, (NSEC_PER_SEC / AHZ)); #elif (AHZ % 512) == 0 x *= AHZ/512; do_div(x, (NSEC_PER_SEC / 512)); #else /* * max relative error 5.7e-8 (1.8s per year) for AHZ <= 1024, * overflow after 64.99 years. * exact for AHZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... */ x *= 9; do_div(x, (unsigned long)((9ull * NSEC_PER_SEC + (AHZ/2)) / AHZ)); #endif return x; } #endif /* _LINUX_ACCT_H */
227 48 41 9 61 23 23 12 6 6 4 61 23 15 14 15 9 9 61 57 57 14 14 12 12 61 57 21 56 56 54 54 61 15 15 14 15 5 165 47 166 163 166 70 16 159 16 2 19 72 47 47 35 29 26 26 72 92 92 91 79 79 79 91 91 91 3 145 42 21 32 29 23 135 5 23 145 145 13 12 12 12 15 8 16 2 14 14 14 14 1 8 13 3 3 1 12 6 6 6 12 7 12 16 4 3 4 27 14 21 27 34 34 7 34 29 7 5 34 21 34 13 8 1 21 6 6 6 6 2 34 1 1 1 1 92 45 12 91 48 17 31 31 9 9 22 20 20 20 20 2 20 20 20 45 2 12 8 4 2 53 3 53 16 16 53 22 9 40 9 40 7 34 16 16 34 3 32 32 32 19 8 11 2 9 32 32 32 32 32 20 70 9 9 9 9 9 7 3 1 9 25 1 1 1 21 19 7 19 17 16 17 14 19 14 13 13 7 4 2 3 2 3 5 5 4 4 2 1 2 25 50 47 47 47 47 49 20 21 21 21 21 47 45 48 79 38 79 76 76 79 10 8 7 7 8 346 355 79 79 79 78 79 79 79 79 79 79 79 63 17 47 55 53 25 53 2 1 2 53 64 10 6 1 5 5 4 5 4 10 143 127 143 143 143 16 10 10 10 10 159 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2013 Nicira, Inc. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/capability.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/in.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/in6.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/netfilter_ipv4.h> #include <linux/etherdevice.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/rculist.h> #include <linux/err.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/protocol.h> #include <net/ip_tunnels.h> #include <net/arp.h> #include <net/checksum.h> #include <net/dsfield.h> #include <net/inet_ecn.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/netdev_lock.h> #include <net/rtnetlink.h> #include <net/udp.h> #include <net/dst_metadata.h> #include <net/inet_dscp.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> #endif static unsigned int ip_tunnel_hash(__be32 key, __be32 remote) { return hash_32((__force u32)key ^ (__force u32)remote, IP_TNL_HASH_BITS); } static bool ip_tunnel_key_match(const struct ip_tunnel_parm_kern *p, const unsigned long *flags, __be32 key) { if (!test_bit(IP_TUNNEL_KEY_BIT, flags)) return !test_bit(IP_TUNNEL_KEY_BIT, p->i_flags); return test_bit(IP_TUNNEL_KEY_BIT, p->i_flags) && p->i_key == key; } /* Fallback tunnel: no source, no destination, no key, no options Tunnel hash table: We require exact key match i.e. if a key is present in packet it will match only tunnel with the same key; if it is not present, it will match only keyless tunnel. All keysless packets, if not matched configured keyless tunnels will match fallback tunnel. Given src, dst and key, find appropriate for input tunnel. */ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, int link, const unsigned long *flags, __be32 remote, __be32 local, __be32 key) { struct ip_tunnel *t, *cand = NULL; struct hlist_head *head; struct net_device *ndev; unsigned int hash; hash = ip_tunnel_hash(key, remote); head = &itn->tunnels[hash]; hlist_for_each_entry_rcu(t, head, hash_node) { if (local != t->parms.iph.saddr || remote != t->parms.iph.daddr || !(t->dev->flags & IFF_UP)) continue; if (!ip_tunnel_key_match(&t->parms, flags, key)) continue; if (READ_ONCE(t->parms.link) == link) return t; cand = t; } hlist_for_each_entry_rcu(t, head, hash_node) { if (remote != t->parms.iph.daddr || t->parms.iph.saddr != 0 || !(t->dev->flags & IFF_UP)) continue; if (!ip_tunnel_key_match(&t->parms, flags, key)) continue; if (READ_ONCE(t->parms.link) == link) return t; if (!cand) cand = t; } hash = ip_tunnel_hash(key, 0); head = &itn->tunnels[hash]; hlist_for_each_entry_rcu(t, head, hash_node) { if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) && (local != t->parms.iph.daddr || !ipv4_is_multicast(local))) continue; if (!(t->dev->flags & IFF_UP)) continue; if (!ip_tunnel_key_match(&t->parms, flags, key)) continue; if (READ_ONCE(t->parms.link) == link) return t; if (!cand) cand = t; } hlist_for_each_entry_rcu(t, head, hash_node) { if ((!test_bit(IP_TUNNEL_NO_KEY_BIT, flags) && t->parms.i_key != key) || t->parms.iph.saddr != 0 || t->parms.iph.daddr != 0 || !(t->dev->flags & IFF_UP)) continue; if (READ_ONCE(t->parms.link) == link) return t; if (!cand) cand = t; } if (cand) return cand; t = rcu_dereference(itn->collect_md_tun); if (t && t->dev->flags & IFF_UP) return t; ndev = READ_ONCE(itn->fb_tunnel_dev); if (ndev && ndev->flags & IFF_UP) return netdev_priv(ndev); return NULL; } EXPORT_SYMBOL_GPL(ip_tunnel_lookup); static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn, struct ip_tunnel_parm_kern *parms) { unsigned int h; __be32 remote; __be32 i_key = parms->i_key; if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr)) remote = parms->iph.daddr; else remote = 0; if (!test_bit(IP_TUNNEL_KEY_BIT, parms->i_flags) && test_bit(IP_TUNNEL_VTI_BIT, parms->i_flags)) i_key = 0; h = ip_tunnel_hash(i_key, remote); return &itn->tunnels[h]; } static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t) { struct hlist_head *head = ip_bucket(itn, &t->parms); if (t->collect_md) rcu_assign_pointer(itn->collect_md_tun, t); hlist_add_head_rcu(&t->hash_node, head); } static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t) { if (t->collect_md) rcu_assign_pointer(itn->collect_md_tun, NULL); hlist_del_init_rcu(&t->hash_node); } static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn, struct ip_tunnel_parm_kern *parms, int type) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; IP_TUNNEL_DECLARE_FLAGS(flags); __be32 key = parms->i_key; int link = parms->link; struct ip_tunnel *t = NULL; struct hlist_head *head = ip_bucket(itn, parms); ip_tunnel_flags_copy(flags, parms->i_flags); hlist_for_each_entry_rcu(t, head, hash_node, lockdep_rtnl_is_held()) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && link == READ_ONCE(t->parms.link) && type == t->dev->type && ip_tunnel_key_match(&t->parms, flags, key)) break; } return t; } static struct net_device *__ip_tunnel_create(struct net *net, const struct rtnl_link_ops *ops, struct ip_tunnel_parm_kern *parms) { int err; struct ip_tunnel *tunnel; struct net_device *dev; char name[IFNAMSIZ]; err = -E2BIG; if (parms->name[0]) { if (!dev_valid_name(parms->name)) goto failed; strscpy(name, parms->name); } else { if (strlen(ops->kind) > (IFNAMSIZ - 3)) goto failed; strscpy(name, ops->kind); strcat(name, "%d"); } ASSERT_RTNL(); dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup); if (!dev) { err = -ENOMEM; goto failed; } dev_net_set(dev, net); dev->rtnl_link_ops = ops; tunnel = netdev_priv(dev); tunnel->parms = *parms; tunnel->net = net; err = register_netdevice(dev); if (err) goto failed_free; return dev; failed_free: free_netdev(dev); failed: return ERR_PTR(err); } static int ip_tunnel_bind_dev(struct net_device *dev) { struct net_device *tdev = NULL; struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *iph; int hlen = LL_MAX_HEADER; int mtu = ETH_DATA_LEN; int t_hlen = tunnel->hlen + sizeof(struct iphdr); iph = &tunnel->parms.iph; /* Guess output device to choose reasonable mtu and needed_headroom */ if (iph->daddr) { struct flowi4 fl4; struct rtable *rt; ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr, iph->saddr, tunnel->parms.o_key, iph->tos & INET_DSCP_MASK, tunnel->net, tunnel->parms.link, tunnel->fwmark, 0, 0); rt = ip_route_output_key(tunnel->net, &fl4); if (!IS_ERR(rt)) { tdev = rt->dst.dev; ip_rt_put(rt); } if (dev->type != ARPHRD_ETHER) dev->flags |= IFF_POINTOPOINT; dst_cache_reset(&tunnel->dst_cache); } if (!tdev && tunnel->parms.link) tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link); if (tdev) { hlen = tdev->hard_header_len + tdev->needed_headroom; mtu = min(tdev->mtu, IP_MAX_MTU); } dev->needed_headroom = t_hlen + hlen; mtu -= t_hlen + (dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0); if (mtu < IPV4_MIN_MTU) mtu = IPV4_MIN_MTU; return mtu; } static struct ip_tunnel *ip_tunnel_create(struct net *net, struct ip_tunnel_net *itn, struct ip_tunnel_parm_kern *parms) { struct ip_tunnel *nt; struct net_device *dev; int t_hlen; int mtu; int err; dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms); if (IS_ERR(dev)) return ERR_CAST(dev); mtu = ip_tunnel_bind_dev(dev); err = dev_set_mtu(dev, mtu); if (err) goto err_dev_set_mtu; nt = netdev_priv(dev); t_hlen = nt->hlen + sizeof(struct iphdr); dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = IP_MAX_MTU - t_hlen; if (dev->type == ARPHRD_ETHER) dev->max_mtu -= dev->hard_header_len; ip_tunnel_add(itn, nt); return nt; err_dev_set_mtu: unregister_netdevice(dev); return ERR_PTR(err); } void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info) { const struct iphdr *iph = ip_hdr(skb); const struct udphdr *udph; if (iph->protocol != IPPROTO_UDP) return; udph = (struct udphdr *)((__u8 *)iph + (iph->ihl << 2)); info->encap.sport = udph->source; info->encap.dport = udph->dest; } EXPORT_SYMBOL(ip_tunnel_md_udp_encap); int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, bool log_ecn_error) { const struct iphdr *iph = ip_hdr(skb); int nh, err; #ifdef CONFIG_NET_IPGRE_BROADCAST if (ipv4_is_multicast(iph->daddr)) { DEV_STATS_INC(tunnel->dev, multicast); skb->pkt_type = PACKET_BROADCAST; } #endif if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.i_flags) != test_bit(IP_TUNNEL_CSUM_BIT, tpi->flags)) { DEV_STATS_INC(tunnel->dev, rx_crc_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.i_flags)) { if (!test_bit(IP_TUNNEL_SEQ_BIT, tpi->flags) || (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) { DEV_STATS_INC(tunnel->dev, rx_fifo_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } tunnel->i_seqno = ntohl(tpi->seq) + 1; } /* Save offset of outer header relative to skb->head, * because we are going to reset the network header to the inner header * and might change skb->head. */ nh = skb_network_header(skb) - skb->head; skb_set_network_header(skb, (tunnel->dev->type == ARPHRD_ETHER) ? ETH_HLEN : 0); if (!pskb_inet_may_pull(skb)) { DEV_STATS_INC(tunnel->dev, rx_length_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } iph = (struct iphdr *)(skb->head + nh); err = IP_ECN_decapsulate(iph, skb); if (unlikely(err)) { if (log_ecn_error) net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", &iph->saddr, iph->tos); if (err > 1) { DEV_STATS_INC(tunnel->dev, rx_frame_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } } dev_sw_netstats_rx_add(tunnel->dev, skb->len); skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev))); if (tunnel->dev->type == ARPHRD_ETHER) { skb->protocol = eth_type_trans(skb, tunnel->dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); } else { skb->dev = tunnel->dev; } if (tun_dst) skb_dst_set(skb, (struct dst_entry *)tun_dst); gro_cells_receive(&tunnel->gro_cells, skb); return 0; drop: if (tun_dst) dst_release((struct dst_entry *)tun_dst); kfree_skb(skb); return 0; } EXPORT_SYMBOL_GPL(ip_tunnel_rcv); int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops, unsigned int num) { if (num >= MAX_IPTUN_ENCAP_OPS) return -ERANGE; return !cmpxchg((const struct ip_tunnel_encap_ops **) &iptun_encaps[num], NULL, ops) ? 0 : -1; } EXPORT_SYMBOL(ip_tunnel_encap_add_ops); int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops, unsigned int num) { int ret; if (num >= MAX_IPTUN_ENCAP_OPS) return -ERANGE; ret = (cmpxchg((const struct ip_tunnel_encap_ops **) &iptun_encaps[num], ops, NULL) == ops) ? 0 : -1; synchronize_net(); return ret; } EXPORT_SYMBOL(ip_tunnel_encap_del_ops); int ip_tunnel_encap_setup(struct ip_tunnel *t, struct ip_tunnel_encap *ipencap) { int hlen; memset(&t->encap, 0, sizeof(t->encap)); hlen = ip_encap_hlen(ipencap); if (hlen < 0) return hlen; t->encap.type = ipencap->type; t->encap.sport = ipencap->sport; t->encap.dport = ipencap->dport; t->encap.flags = ipencap->flags; t->encap_hlen = hlen; t->hlen = t->encap_hlen + t->tun_hlen; return 0; } EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup); static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, struct rtable *rt, __be16 df, const struct iphdr *inner_iph, int tunnel_hlen, __be32 dst, bool md) { struct ip_tunnel *tunnel = netdev_priv(dev); int pkt_size; int mtu; tunnel_hlen = md ? tunnel_hlen : tunnel->hlen; pkt_size = skb->len - tunnel_hlen; pkt_size -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0; if (df) { mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen); mtu -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0; } else { mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; } if (skb_valid_dst(skb)) skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->protocol == htons(ETH_P_IP)) { if (!skb_is_gso(skb) && (inner_iph->frag_off & htons(IP_DF)) && mtu < pkt_size) { icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); return -E2BIG; } } #if IS_ENABLED(CONFIG_IPV6) else if (skb->protocol == htons(ETH_P_IPV6)) { struct rt6_info *rt6; __be32 daddr; rt6 = skb_valid_dst(skb) ? dst_rt6_info(skb_dst(skb)) : NULL; daddr = md ? dst : tunnel->parms.iph.daddr; if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) { if ((daddr && !ipv4_is_multicast(daddr)) || rt6->rt6i_dst.plen == 128) { rt6->rt6i_flags |= RTF_MODIFIED; dst_metric_set(skb_dst(skb), RTAX_MTU, mtu); } } if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU && mtu < pkt_size) { icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); return -E2BIG; } } #endif return 0; } void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto, int tunnel_hlen) { struct ip_tunnel *tunnel = netdev_priv(dev); u32 headroom = sizeof(struct iphdr); struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; const struct iphdr *inner_iph; struct rtable *rt = NULL; struct flowi4 fl4; __be16 df = 0; u8 tos, ttl; bool use_cache; tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || ip_tunnel_info_af(tun_info) != AF_INET)) goto tx_error; key = &tun_info->key; memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); inner_iph = (const struct iphdr *)skb_inner_network_header(skb); tos = key->tos; if (tos == 1) { if (skb->protocol == htons(ETH_P_IP)) tos = inner_iph->tos; else if (skb->protocol == htons(ETH_P_IPV6)) tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); } ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, tunnel_id_to_key32(key->tun_id), tos & INET_DSCP_MASK, tunnel->net, 0, skb->mark, skb_get_hash(skb), key->flow_flags); if (!tunnel_hlen) tunnel_hlen = ip_encap_hlen(&tun_info->encap); if (ip_tunnel_encap(skb, &tun_info->encap, &proto, &fl4) < 0) goto tx_error; use_cache = ip_tunnel_dst_cache_usable(skb, tun_info); if (use_cache) rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr); if (!rt) { rt = ip_route_output_key(tunnel->net, &fl4); if (IS_ERR(rt)) { DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error; } if (use_cache) dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst, fl4.saddr); } if (rt->dst.dev == dev) { ip_rt_put(rt); DEV_STATS_INC(dev, collisions); goto tx_error; } if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags)) df = htons(IP_DF); if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen, key->u.ipv4.dst, true)) { ip_rt_put(rt); goto tx_error; } tos = ip_tunnel_ecn_encap(tos, inner_iph, skb); ttl = key->ttl; if (ttl == 0) { if (skb->protocol == htons(ETH_P_IP)) ttl = inner_iph->ttl; else if (skb->protocol == htons(ETH_P_IPV6)) ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit; else ttl = ip4_dst_hoplimit(&rt->dst); } headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len; if (skb_cow_head(skb, headroom)) { ip_rt_put(rt); goto tx_dropped; } ip_tunnel_adj_headroom(dev, headroom); iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)), 0); return; tx_error: DEV_STATS_INC(dev, tx_errors); goto kfree; tx_dropped: DEV_STATS_INC(dev, tx_dropped); kfree: kfree_skb(skb); } EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit); void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, u8 protocol) { struct ip_tunnel *tunnel = netdev_priv(dev); struct ip_tunnel_info *tun_info = NULL; const struct iphdr *inner_iph; unsigned int max_headroom; /* The extra header space needed */ struct rtable *rt = NULL; /* Route to the other host */ __be16 payload_protocol; bool use_cache = false; struct flowi4 fl4; bool md = false; bool connected; u8 tos, ttl; __be32 dst; __be16 df; inner_iph = (const struct iphdr *)skb_inner_network_header(skb); connected = (tunnel->parms.iph.daddr != 0); payload_protocol = skb_protocol(skb, true); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); dst = tnl_params->daddr; if (dst == 0) { /* NBMA tunnel */ if (!skb_dst(skb)) { DEV_STATS_INC(dev, tx_fifo_errors); goto tx_error; } tun_info = skb_tunnel_info(skb); if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) && ip_tunnel_info_af(tun_info) == AF_INET && tun_info->key.u.ipv4.dst) { dst = tun_info->key.u.ipv4.dst; md = true; connected = true; } else if (payload_protocol == htons(ETH_P_IP)) { rt = skb_rtable(skb); dst = rt_nexthop(rt, inner_iph->daddr); } #if IS_ENABLED(CONFIG_IPV6) else if (payload_protocol == htons(ETH_P_IPV6)) { const struct in6_addr *addr6; struct neighbour *neigh; bool do_tx_error_icmp; int addr_type; neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr); if (!neigh) goto tx_error; addr6 = (const struct in6_addr *)&neigh->primary_key; addr_type = ipv6_addr_type(addr6); if (addr_type == IPV6_ADDR_ANY) { addr6 = &ipv6_hdr(skb)->daddr; addr_type = ipv6_addr_type(addr6); } if ((addr_type & IPV6_ADDR_COMPATv4) == 0) do_tx_error_icmp = true; else { do_tx_error_icmp = false; dst = addr6->s6_addr32[3]; } neigh_release(neigh); if (do_tx_error_icmp) goto tx_error_icmp; } #endif else goto tx_error; if (!md) connected = false; } tos = tnl_params->tos; if (tos & 0x1) { tos &= ~0x1; if (payload_protocol == htons(ETH_P_IP)) { tos = inner_iph->tos; connected = false; } else if (payload_protocol == htons(ETH_P_IPV6)) { tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); connected = false; } } ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr, tunnel->parms.o_key, tos & INET_DSCP_MASK, tunnel->net, READ_ONCE(tunnel->parms.link), tunnel->fwmark, skb_get_hash(skb), 0); if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0) goto tx_error; if (connected && md) { use_cache = ip_tunnel_dst_cache_usable(skb, tun_info); if (use_cache) rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr); } else { rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) : NULL; } if (!rt) { rt = ip_route_output_key(tunnel->net, &fl4); if (IS_ERR(rt)) { DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error; } if (use_cache) dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst, fl4.saddr); else if (!md && connected) dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst, fl4.saddr); } if (rt->dst.dev == dev) { ip_rt_put(rt); DEV_STATS_INC(dev, collisions); goto tx_error; } df = tnl_params->frag_off; if (payload_protocol == htons(ETH_P_IP) && !tunnel->ignore_df) df |= (inner_iph->frag_off & htons(IP_DF)); if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) { ip_rt_put(rt); goto tx_error; } if (tunnel->err_count > 0) { if (time_before(jiffies, tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { tunnel->err_count--; dst_link_failure(skb); } else tunnel->err_count = 0; } tos = ip_tunnel_ecn_encap(tos, inner_iph, skb); ttl = tnl_params->ttl; if (ttl == 0) { if (payload_protocol == htons(ETH_P_IP)) ttl = inner_iph->ttl; #if IS_ENABLED(CONFIG_IPV6) else if (payload_protocol == htons(ETH_P_IPV6)) ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit; #endif else ttl = ip4_dst_hoplimit(&rt->dst); } max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr) + rt->dst.header_len + ip_encap_hlen(&tunnel->encap); if (skb_cow_head(skb, max_headroom)) { ip_rt_put(rt); DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return; } ip_tunnel_adj_headroom(dev, max_headroom); iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)), 0); return; #if IS_ENABLED(CONFIG_IPV6) tx_error_icmp: dst_link_failure(skb); #endif tx_error: DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); } EXPORT_SYMBOL_GPL(ip_tunnel_xmit); static void ip_tunnel_update(struct ip_tunnel_net *itn, struct ip_tunnel *t, struct net_device *dev, struct ip_tunnel_parm_kern *p, bool set_mtu, __u32 fwmark) { ip_tunnel_del(itn, t); t->parms.iph.saddr = p->iph.saddr; t->parms.iph.daddr = p->iph.daddr; t->parms.i_key = p->i_key; t->parms.o_key = p->o_key; if (dev->type != ARPHRD_ETHER) { __dev_addr_set(dev, &p->iph.saddr, 4); memcpy(dev->broadcast, &p->iph.daddr, 4); } ip_tunnel_add(itn, t); t->parms.iph.ttl = p->iph.ttl; t->parms.iph.tos = p->iph.tos; t->parms.iph.frag_off = p->iph.frag_off; if (t->parms.link != p->link || t->fwmark != fwmark) { int mtu; WRITE_ONCE(t->parms.link, p->link); t->fwmark = fwmark; mtu = ip_tunnel_bind_dev(dev); if (set_mtu) WRITE_ONCE(dev->mtu, mtu); } dst_cache_reset(&t->dst_cache); netdev_state_change(dev); } int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd) { int err = 0; struct ip_tunnel *t = netdev_priv(dev); struct net *net = t->net; struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id); switch (cmd) { case SIOCGETTUNNEL: if (dev == itn->fb_tunnel_dev) { t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); if (!t) t = netdev_priv(dev); } memcpy(p, &t->parms, sizeof(*p)); break; case SIOCADDTUNNEL: case SIOCCHGTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto done; if (p->iph.ttl) p->iph.frag_off |= htons(IP_DF); if (!test_bit(IP_TUNNEL_VTI_BIT, p->i_flags)) { if (!test_bit(IP_TUNNEL_KEY_BIT, p->i_flags)) p->i_key = 0; if (!test_bit(IP_TUNNEL_KEY_BIT, p->o_flags)) p->o_key = 0; } t = ip_tunnel_find(itn, p, itn->type); if (cmd == SIOCADDTUNNEL) { if (!t) { t = ip_tunnel_create(net, itn, p); err = PTR_ERR_OR_ZERO(t); break; } err = -EEXIST; break; } if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { if (t) { if (t->dev != dev) { err = -EEXIST; break; } } else { unsigned int nflags = 0; if (ipv4_is_multicast(p->iph.daddr)) nflags = IFF_BROADCAST; else if (p->iph.daddr) nflags = IFF_POINTOPOINT; if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) { err = -EINVAL; break; } t = netdev_priv(dev); } } if (t) { err = 0; ip_tunnel_update(itn, t, dev, p, true, 0); } else { err = -ENOENT; } break; case SIOCDELTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto done; if (dev == itn->fb_tunnel_dev) { err = -ENOENT; t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); if (!t) goto done; err = -EPERM; if (t == netdev_priv(itn->fb_tunnel_dev)) goto done; dev = t->dev; } unregister_netdevice(dev); err = 0; break; default: err = -EINVAL; } done: return err; } EXPORT_SYMBOL_GPL(ip_tunnel_ctl); bool ip_tunnel_parm_from_user(struct ip_tunnel_parm_kern *kp, const void __user *data) { struct ip_tunnel_parm p; if (copy_from_user(&p, data, sizeof(p))) return false; strscpy(kp->name, p.name); kp->link = p.link; ip_tunnel_flags_from_be16(kp->i_flags, p.i_flags); ip_tunnel_flags_from_be16(kp->o_flags, p.o_flags); kp->i_key = p.i_key; kp->o_key = p.o_key; memcpy(&kp->iph, &p.iph, min(sizeof(kp->iph), sizeof(p.iph))); return true; } EXPORT_SYMBOL_GPL(ip_tunnel_parm_from_user); bool ip_tunnel_parm_to_user(void __user *data, struct ip_tunnel_parm_kern *kp) { struct ip_tunnel_parm p; if (!ip_tunnel_flags_is_be16_compat(kp->i_flags) || !ip_tunnel_flags_is_be16_compat(kp->o_flags)) return false; memset(&p, 0, sizeof(p)); strscpy(p.name, kp->name); p.link = kp->link; p.i_flags = ip_tunnel_flags_to_be16(kp->i_flags); p.o_flags = ip_tunnel_flags_to_be16(kp->o_flags); p.i_key = kp->i_key; p.o_key = kp->o_key; memcpy(&p.iph, &kp->iph, min(sizeof(p.iph), sizeof(kp->iph))); return !copy_to_user(data, &p, sizeof(p)); } EXPORT_SYMBOL_GPL(ip_tunnel_parm_to_user); int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd) { struct ip_tunnel_parm_kern p; int err; if (!ip_tunnel_parm_from_user(&p, data)) return -EFAULT; err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd); if (!err && !ip_tunnel_parm_to_user(data, &p)) return -EFAULT; return err; } EXPORT_SYMBOL_GPL(ip_tunnel_siocdevprivate); int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict) { struct ip_tunnel *tunnel = netdev_priv(dev); int t_hlen = tunnel->hlen + sizeof(struct iphdr); int max_mtu = IP_MAX_MTU - t_hlen; if (dev->type == ARPHRD_ETHER) max_mtu -= dev->hard_header_len; if (new_mtu < ETH_MIN_MTU) return -EINVAL; if (new_mtu > max_mtu) { if (strict) return -EINVAL; new_mtu = max_mtu; } WRITE_ONCE(dev->mtu, new_mtu); return 0; } EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu); int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu) { return __ip_tunnel_change_mtu(dev, new_mtu, true); } EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu); static void ip_tunnel_dev_free(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); gro_cells_destroy(&tunnel->gro_cells); dst_cache_destroy(&tunnel->dst_cache); } void ip_tunnel_dellink(struct net_device *dev, struct list_head *head) { struct ip_tunnel *tunnel = netdev_priv(dev); struct ip_tunnel_net *itn; itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id); if (itn->fb_tunnel_dev != dev) { ip_tunnel_del(itn, netdev_priv(dev)); unregister_netdevice_queue(dev, head); } } EXPORT_SYMBOL_GPL(ip_tunnel_dellink); struct net *ip_tunnel_get_link_net(const struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); return READ_ONCE(tunnel->net); } EXPORT_SYMBOL(ip_tunnel_get_link_net); int ip_tunnel_get_iflink(const struct net_device *dev) { const struct ip_tunnel *tunnel = netdev_priv(dev); return READ_ONCE(tunnel->parms.link); } EXPORT_SYMBOL(ip_tunnel_get_iflink); int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname) { struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id); struct ip_tunnel_parm_kern parms; unsigned int i; itn->rtnl_link_ops = ops; for (i = 0; i < IP_TNL_HASH_SIZE; i++) INIT_HLIST_HEAD(&itn->tunnels[i]); if (!ops || !net_has_fallback_tunnels(net)) { struct ip_tunnel_net *it_init_net; it_init_net = net_generic(&init_net, ip_tnl_net_id); itn->type = it_init_net->type; itn->fb_tunnel_dev = NULL; return 0; } memset(&parms, 0, sizeof(parms)); if (devname) strscpy(parms.name, devname, IFNAMSIZ); rtnl_lock(); itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms); /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. */ if (!IS_ERR(itn->fb_tunnel_dev)) { itn->fb_tunnel_dev->netns_immutable = true; itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev); ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev)); itn->type = itn->fb_tunnel_dev->type; } rtnl_unlock(); return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev); } EXPORT_SYMBOL_GPL(ip_tunnel_init_net); void ip_tunnel_delete_net(struct net *net, unsigned int id, struct rtnl_link_ops *ops, struct list_head *head) { struct ip_tunnel_net *itn = net_generic(net, id); struct net_device *dev, *aux; int h; ASSERT_RTNL_NET(net); for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == ops) unregister_netdevice_queue(dev, head); for (h = 0; h < IP_TNL_HASH_SIZE; h++) { struct ip_tunnel *t; struct hlist_node *n; struct hlist_head *thead = &itn->tunnels[h]; hlist_for_each_entry_safe(t, n, thead, hash_node) /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, head); } } EXPORT_SYMBOL_GPL(ip_tunnel_delete_net); int ip_tunnel_newlink(struct net *net, struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm_kern *p, __u32 fwmark) { struct ip_tunnel *nt; struct ip_tunnel_net *itn; int mtu; int err; nt = netdev_priv(dev); itn = net_generic(net, nt->ip_tnl_net_id); if (nt->collect_md) { if (rtnl_dereference(itn->collect_md_tun)) return -EEXIST; } else { if (ip_tunnel_find(itn, p, dev->type)) return -EEXIST; } nt->net = net; nt->parms = *p; nt->fwmark = fwmark; err = register_netdevice(dev); if (err) goto err_register_netdevice; if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS]) eth_hw_addr_random(dev); mtu = ip_tunnel_bind_dev(dev); if (tb[IFLA_MTU]) { unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr)); if (dev->type == ARPHRD_ETHER) max -= dev->hard_header_len; mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max); } err = dev_set_mtu(dev, mtu); if (err) goto err_dev_set_mtu; ip_tunnel_add(itn, nt); return 0; err_dev_set_mtu: unregister_netdevice(dev); err_register_netdevice: return err; } EXPORT_SYMBOL_GPL(ip_tunnel_newlink); int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm_kern *p, __u32 fwmark) { struct ip_tunnel *t; struct ip_tunnel *tunnel = netdev_priv(dev); struct net *net = tunnel->net; struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id); if (dev == itn->fb_tunnel_dev) return -EINVAL; t = ip_tunnel_find(itn, p, dev->type); if (t) { if (t->dev != dev) return -EEXIST; } else { t = tunnel; if (dev->type != ARPHRD_ETHER) { unsigned int nflags = 0; if (ipv4_is_multicast(p->iph.daddr)) nflags = IFF_BROADCAST; else if (p->iph.daddr) nflags = IFF_POINTOPOINT; if ((dev->flags ^ nflags) & (IFF_POINTOPOINT | IFF_BROADCAST)) return -EINVAL; } } ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark); return 0; } EXPORT_SYMBOL_GPL(ip_tunnel_changelink); int __ip_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; int err; dev->needs_free_netdev = true; dev->priv_destructor = ip_tunnel_dev_free; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); if (err) return err; err = gro_cells_init(&tunnel->gro_cells, dev); if (err) { dst_cache_destroy(&tunnel->dst_cache); return err; } tunnel->dev = dev; strscpy(tunnel->parms.name, dev->name); iph->version = 4; iph->ihl = 5; if (tunnel->collect_md) netif_keep_dst(dev); return 0; } EXPORT_SYMBOL_GPL(__ip_tunnel_init); void ip_tunnel_uninit(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct net *net = tunnel->net; struct ip_tunnel_net *itn; itn = net_generic(net, tunnel->ip_tnl_net_id); ip_tunnel_del(itn, netdev_priv(dev)); if (itn->fb_tunnel_dev == dev) WRITE_ONCE(itn->fb_tunnel_dev, NULL); dst_cache_reset(&tunnel->dst_cache); } EXPORT_SYMBOL_GPL(ip_tunnel_uninit); /* Do least required initialization, rest of init is done in tunnel_init call */ void ip_tunnel_setup(struct net_device *dev, unsigned int net_id) { struct ip_tunnel *tunnel = netdev_priv(dev); tunnel->ip_tnl_net_id = net_id; } EXPORT_SYMBOL_GPL(ip_tunnel_setup); MODULE_DESCRIPTION("IPv4 tunnel implementation library"); MODULE_LICENSE("GPL");
4 3 3 4 1 4 4 4 3 3 2 4 5 5 1 1 1 1 4 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 // SPDX-License-Identifier: GPL-2.0-or-later /* * sch_plug.c Queue traffic until an explicit release command * * There are two ways to use this qdisc: * 1. A simple "instantaneous" plug/unplug operation, by issuing an alternating * sequence of TCQ_PLUG_BUFFER & TCQ_PLUG_RELEASE_INDEFINITE commands. * * 2. For network output buffering (a.k.a output commit) functionality. * Output commit property is commonly used by applications using checkpoint * based fault-tolerance to ensure that the checkpoint from which a system * is being restored is consistent w.r.t outside world. * * Consider for e.g. Remus - a Virtual Machine checkpointing system, * wherein a VM is checkpointed, say every 50ms. The checkpoint is replicated * asynchronously to the backup host, while the VM continues executing the * next epoch speculatively. * * The following is a typical sequence of output buffer operations: * 1.At epoch i, start_buffer(i) * 2. At end of epoch i (i.e. after 50ms): * 2.1 Stop VM and take checkpoint(i). * 2.2 start_buffer(i+1) and Resume VM * 3. While speculatively executing epoch(i+1), asynchronously replicate * checkpoint(i) to backup host. * 4. When checkpoint_ack(i) is received from backup, release_buffer(i) * Thus, this Qdisc would receive the following sequence of commands: * TCQ_PLUG_BUFFER (epoch i) * .. TCQ_PLUG_BUFFER (epoch i+1) * ....TCQ_PLUG_RELEASE_ONE (epoch i) * ......TCQ_PLUG_BUFFER (epoch i+2) * ........ */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/pkt_sched.h> /* * State of the queue, when used for network output buffering: * * plug(i+1) plug(i) head * ------------------+--------------------+----------------> * | | * | | * pkts_current_epoch| pkts_last_epoch |pkts_to_release * ----------------->|<--------+--------->|+---------------> * v v * */ struct plug_sched_data { /* If true, the dequeue function releases all packets * from head to end of the queue. The queue turns into * a pass-through queue for newly arriving packets. */ bool unplug_indefinite; bool throttled; /* Queue Limit in bytes */ u32 limit; /* Number of packets (output) from the current speculatively * executing epoch. */ u32 pkts_current_epoch; /* Number of packets corresponding to the recently finished * epoch. These will be released when we receive a * TCQ_PLUG_RELEASE_ONE command. This command is typically * issued after committing a checkpoint at the target. */ u32 pkts_last_epoch; /* * Number of packets from the head of the queue, that can * be released (committed checkpoint). */ u32 pkts_to_release; }; static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct plug_sched_data *q = qdisc_priv(sch); if (likely(sch->qstats.backlog + skb->len <= q->limit)) { if (!q->unplug_indefinite) q->pkts_current_epoch++; return qdisc_enqueue_tail(skb, sch); } return qdisc_drop(skb, sch, to_free); } static struct sk_buff *plug_dequeue(struct Qdisc *sch) { struct plug_sched_data *q = qdisc_priv(sch); if (q->throttled) return NULL; if (!q->unplug_indefinite) { if (!q->pkts_to_release) { /* No more packets to dequeue. Block the queue * and wait for the next release command. */ q->throttled = true; return NULL; } q->pkts_to_release--; } return qdisc_dequeue_head(sch); } static int plug_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct plug_sched_data *q = qdisc_priv(sch); q->pkts_current_epoch = 0; q->pkts_last_epoch = 0; q->pkts_to_release = 0; q->unplug_indefinite = false; if (opt == NULL) { q->limit = qdisc_dev(sch)->tx_queue_len * psched_mtu(qdisc_dev(sch)); } else { struct tc_plug_qopt *ctl = nla_data(opt); if (nla_len(opt) < sizeof(*ctl)) return -EINVAL; q->limit = ctl->limit; } q->throttled = true; return 0; } /* Receives 4 types of messages: * TCQ_PLUG_BUFFER: Inset a plug into the queue and * buffer any incoming packets * TCQ_PLUG_RELEASE_ONE: Dequeue packets from queue head * to beginning of the next plug. * TCQ_PLUG_RELEASE_INDEFINITE: Dequeue all packets from queue. * Stop buffering packets until the next TCQ_PLUG_BUFFER * command is received (just act as a pass-thru queue). * TCQ_PLUG_LIMIT: Increase/decrease queue size */ static int plug_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct plug_sched_data *q = qdisc_priv(sch); struct tc_plug_qopt *msg; msg = nla_data(opt); if (nla_len(opt) < sizeof(*msg)) return -EINVAL; switch (msg->action) { case TCQ_PLUG_BUFFER: /* Save size of the current buffer */ q->pkts_last_epoch = q->pkts_current_epoch; q->pkts_current_epoch = 0; if (q->unplug_indefinite) q->throttled = true; q->unplug_indefinite = false; break; case TCQ_PLUG_RELEASE_ONE: /* Add packets from the last complete buffer to the * packets to be released set. */ q->pkts_to_release += q->pkts_last_epoch; q->pkts_last_epoch = 0; q->throttled = false; netif_schedule_queue(sch->dev_queue); break; case TCQ_PLUG_RELEASE_INDEFINITE: q->unplug_indefinite = true; q->pkts_to_release = 0; q->pkts_last_epoch = 0; q->pkts_current_epoch = 0; q->throttled = false; netif_schedule_queue(sch->dev_queue); break; case TCQ_PLUG_LIMIT: /* Limit is supplied in bytes */ q->limit = msg->limit; break; default: return -EINVAL; } return 0; } static struct Qdisc_ops plug_qdisc_ops __read_mostly = { .id = "plug", .priv_size = sizeof(struct plug_sched_data), .enqueue = plug_enqueue, .dequeue = plug_dequeue, .peek = qdisc_peek_dequeued, .init = plug_init, .change = plug_change, .reset = qdisc_reset_queue, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("plug"); static int __init plug_module_init(void) { return register_qdisc(&plug_qdisc_ops); } static void __exit plug_module_exit(void) { unregister_qdisc(&plug_qdisc_ops); } module_init(plug_module_init) module_exit(plug_module_exit) MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Qdisc to plug and unplug traffic via netlink control");
29 12 12 12 1 44 34 21 27 27 12 14 15 15 15 15 15 1 15 22 5 22 22 12 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * linux/include/linux/jbd2.h * * Written by Stephen C. Tweedie <sct@redhat.com> * * Copyright 1998-2000 Red Hat, Inc --- All Rights Reserved * * Definitions for transaction data structures for the buffer cache * filesystem journaling support. */ #ifndef _LINUX_JBD2_H #define _LINUX_JBD2_H /* Allow this file to be included directly into e2fsprogs */ #ifndef __KERNEL__ #include "jfs_compat.h" #define JBD2_DEBUG #else #include <linux/types.h> #include <linux/buffer_head.h> #include <linux/journal-head.h> #include <linux/stddef.h> #include <linux/mutex.h> #include <linux/timer.h> #include <linux/slab.h> #include <linux/bit_spinlock.h> #include <linux/blkdev.h> #include <linux/crc32c.h> #endif #define journal_oom_retry 1 /* * Define JBD2_PARANIOD_IOFAIL to cause a kernel BUG() if ext4 finds * certain classes of error which can occur due to failed IOs. Under * normal use we want ext4 to continue after such errors, because * hardware _can_ fail, but for debugging purposes when running tests on * known-good hardware we may want to trap these errors. */ #undef JBD2_PARANOID_IOFAIL /* * The default maximum commit age, in seconds. */ #define JBD2_DEFAULT_MAX_COMMIT_AGE 5 #ifdef CONFIG_JBD2_DEBUG /* * Define JBD2_EXPENSIVE_CHECKING to enable more expensive internal * consistency checks. By default we don't do this unless * CONFIG_JBD2_DEBUG is on. */ #define JBD2_EXPENSIVE_CHECKING void __jbd2_debug(int level, const char *file, const char *func, unsigned int line, const char *fmt, ...); #define jbd2_debug(n, fmt, a...) \ __jbd2_debug((n), __FILE__, __func__, __LINE__, (fmt), ##a) #else #define jbd2_debug(n, fmt, a...) no_printk(fmt, ##a) #endif extern void *jbd2_alloc(size_t size, gfp_t flags); extern void jbd2_free(void *ptr, size_t size); #define JBD2_MIN_JOURNAL_BLOCKS 1024 #define JBD2_DEFAULT_FAST_COMMIT_BLOCKS 256 #ifdef __KERNEL__ /** * typedef handle_t - The handle_t type represents a single atomic update being performed by some process. * * All filesystem modifications made by the process go * through this handle. Recursive operations (such as quota operations) * are gathered into a single update. * * The buffer credits field is used to account for journaled buffers * being modified by the running process. To ensure that there is * enough log space for all outstanding operations, we need to limit the * number of outstanding buffers possible at any time. When the * operation completes, any buffer credits not used are credited back to * the transaction, so that at all times we know how many buffers the * outstanding updates on a transaction might possibly touch. * * This is an opaque datatype. **/ typedef struct jbd2_journal_handle handle_t; /* Atomic operation type */ /** * typedef journal_t - The journal_t maintains all of the journaling state information for a single filesystem. * * journal_t is linked to from the fs superblock structure. * * We use the journal_t to keep track of all outstanding transaction * activity on the filesystem, and to manage the state of the log * writing process. * * This is an opaque datatype. **/ typedef struct journal_s journal_t; /* Journal control structure */ #endif /* * Internal structures used by the logging mechanism: */ #define JBD2_MAGIC_NUMBER 0xc03b3998U /* The first 4 bytes of /dev/random! */ /* * On-disk structures */ /* * Descriptor block types: */ #define JBD2_DESCRIPTOR_BLOCK 1 #define JBD2_COMMIT_BLOCK 2 #define JBD2_SUPERBLOCK_V1 3 #define JBD2_SUPERBLOCK_V2 4 #define JBD2_REVOKE_BLOCK 5 /* * Standard header for all descriptor blocks: */ typedef struct journal_header_s { __be32 h_magic; __be32 h_blocktype; __be32 h_sequence; } journal_header_t; /* * Checksum types. */ #define JBD2_CRC32_CHKSUM 1 #define JBD2_MD5_CHKSUM 2 #define JBD2_SHA1_CHKSUM 3 #define JBD2_CRC32C_CHKSUM 4 #define JBD2_CRC32_CHKSUM_SIZE 4 #define JBD2_CHECKSUM_BYTES (32 / sizeof(u32)) /* * Commit block header for storing transactional checksums: * * NOTE: If FEATURE_COMPAT_CHECKSUM (checksum v1) is set, the h_chksum* * fields are used to store a checksum of the descriptor and data blocks. * * If FEATURE_INCOMPAT_CSUM_V2 (checksum v2) is set, then the h_chksum * field is used to store crc32c(uuid+commit_block). Each journal metadata * block gets its own checksum, and data block checksums are stored in * journal_block_tag (in the descriptor). The other h_chksum* fields are * not used. * * If FEATURE_INCOMPAT_CSUM_V3 is set, the descriptor block uses * journal_block_tag3_t to store a full 32-bit checksum. Everything else * is the same as v2. * * Checksum v1, v2, and v3 are mutually exclusive features. */ struct commit_header { __be32 h_magic; __be32 h_blocktype; __be32 h_sequence; unsigned char h_chksum_type; unsigned char h_chksum_size; unsigned char h_padding[2]; __be32 h_chksum[JBD2_CHECKSUM_BYTES]; __be64 h_commit_sec; __be32 h_commit_nsec; }; /* * The block tag: used to describe a single buffer in the journal. * t_blocknr_high is only used if INCOMPAT_64BIT is set, so this * raw struct shouldn't be used for pointer math or sizeof() - use * journal_tag_bytes(journal) instead to compute this. */ typedef struct journal_block_tag3_s { __be32 t_blocknr; /* The on-disk block number */ __be32 t_flags; /* See below */ __be32 t_blocknr_high; /* most-significant high 32bits. */ __be32 t_checksum; /* crc32c(uuid+seq+block) */ } journal_block_tag3_t; typedef struct journal_block_tag_s { __be32 t_blocknr; /* The on-disk block number */ __be16 t_checksum; /* truncated crc32c(uuid+seq+block) */ __be16 t_flags; /* See below */ __be32 t_blocknr_high; /* most-significant high 32bits. */ } journal_block_tag_t; /* Tail of descriptor or revoke block, for checksumming */ struct jbd2_journal_block_tail { __be32 t_checksum; /* crc32c(uuid+descr_block) */ }; /* * The revoke descriptor: used on disk to describe a series of blocks to * be revoked from the log */ typedef struct jbd2_journal_revoke_header_s { journal_header_t r_header; __be32 r_count; /* Count of bytes used in the block */ } jbd2_journal_revoke_header_t; /* Definitions for the journal tag flags word: */ #define JBD2_FLAG_ESCAPE 1 /* on-disk block is escaped */ #define JBD2_FLAG_SAME_UUID 2 /* block has same uuid as previous */ #define JBD2_FLAG_DELETED 4 /* block deleted by this transaction */ #define JBD2_FLAG_LAST_TAG 8 /* last tag in this descriptor block */ /* * The journal superblock. All fields are in big-endian byte order. */ typedef struct journal_superblock_s { /* 0x0000 */ journal_header_t s_header; /* 0x000C */ /* Static information describing the journal */ __be32 s_blocksize; /* journal device blocksize */ __be32 s_maxlen; /* total blocks in journal file */ __be32 s_first; /* first block of log information */ /* 0x0018 */ /* Dynamic information describing the current state of the log */ __be32 s_sequence; /* first commit ID expected in log */ __be32 s_start; /* blocknr of start of log */ /* 0x0020 */ /* Error value, as set by jbd2_journal_abort(). */ __be32 s_errno; /* 0x0024 */ /* Remaining fields are only valid in a version-2 superblock */ __be32 s_feature_compat; /* compatible feature set */ __be32 s_feature_incompat; /* incompatible feature set */ __be32 s_feature_ro_compat; /* readonly-compatible feature set */ /* 0x0030 */ __u8 s_uuid[16]; /* 128-bit uuid for journal */ /* 0x0040 */ __be32 s_nr_users; /* Nr of filesystems sharing log */ __be32 s_dynsuper; /* Blocknr of dynamic superblock copy*/ /* 0x0048 */ __be32 s_max_transaction; /* Limit of journal blocks per trans.*/ __be32 s_max_trans_data; /* Limit of data blocks per trans. */ /* 0x0050 */ __u8 s_checksum_type; /* checksum type */ __u8 s_padding2[3]; /* 0x0054 */ __be32 s_num_fc_blks; /* Number of fast commit blocks */ __be32 s_head; /* blocknr of head of log, only uptodate * while the filesystem is clean */ /* 0x005C */ __u32 s_padding[40]; __be32 s_checksum; /* crc32c(superblock) */ /* 0x0100 */ __u8 s_users[16*48]; /* ids of all fs'es sharing the log */ /* 0x0400 */ } journal_superblock_t; #define JBD2_FEATURE_COMPAT_CHECKSUM 0x00000001 #define JBD2_FEATURE_INCOMPAT_REVOKE 0x00000001 #define JBD2_FEATURE_INCOMPAT_64BIT 0x00000002 #define JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT 0x00000004 #define JBD2_FEATURE_INCOMPAT_CSUM_V2 0x00000008 #define JBD2_FEATURE_INCOMPAT_CSUM_V3 0x00000010 #define JBD2_FEATURE_INCOMPAT_FAST_COMMIT 0x00000020 /* See "journal feature predicate functions" below */ /* Features known to this kernel version: */ #define JBD2_KNOWN_COMPAT_FEATURES JBD2_FEATURE_COMPAT_CHECKSUM #define JBD2_KNOWN_ROCOMPAT_FEATURES 0 #define JBD2_KNOWN_INCOMPAT_FEATURES (JBD2_FEATURE_INCOMPAT_REVOKE | \ JBD2_FEATURE_INCOMPAT_64BIT | \ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | \ JBD2_FEATURE_INCOMPAT_CSUM_V2 | \ JBD2_FEATURE_INCOMPAT_CSUM_V3 | \ JBD2_FEATURE_INCOMPAT_FAST_COMMIT) #ifdef __KERNEL__ #include <linux/fs.h> #include <linux/sched.h> enum jbd_state_bits { BH_JBD /* Has an attached ext3 journal_head */ = BH_PrivateStart, BH_JWrite, /* Being written to log (@@@ DEBUGGING) */ BH_Freed, /* Has been freed (truncated) */ BH_Revoked, /* Has been revoked from the log */ BH_RevokeValid, /* Revoked flag is valid */ BH_JBDDirty, /* Is dirty but journaled */ BH_JournalHead, /* Pins bh->b_private and jh->b_bh */ BH_Shadow, /* IO on shadow buffer is running */ BH_Verified, /* Metadata block has been verified ok */ BH_JBDPrivateStart, /* First bit available for private use by FS */ }; BUFFER_FNS(JBD, jbd) BUFFER_FNS(JWrite, jwrite) BUFFER_FNS(JBDDirty, jbddirty) TAS_BUFFER_FNS(JBDDirty, jbddirty) BUFFER_FNS(Revoked, revoked) TAS_BUFFER_FNS(Revoked, revoked) BUFFER_FNS(RevokeValid, revokevalid) TAS_BUFFER_FNS(RevokeValid, revokevalid) BUFFER_FNS(Freed, freed) BUFFER_FNS(Shadow, shadow) BUFFER_FNS(Verified, verified) static inline struct buffer_head *jh2bh(struct journal_head *jh) { return jh->b_bh; } static inline struct journal_head *bh2jh(struct buffer_head *bh) { return bh->b_private; } static inline void jbd_lock_bh_journal_head(struct buffer_head *bh) { bit_spin_lock(BH_JournalHead, &bh->b_state); } static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh) { bit_spin_unlock(BH_JournalHead, &bh->b_state); } #define J_ASSERT(assert) BUG_ON(!(assert)) #define J_ASSERT_BH(bh, expr) J_ASSERT(expr) #define J_ASSERT_JH(jh, expr) J_ASSERT(expr) #if defined(JBD2_PARANOID_IOFAIL) #define J_EXPECT(expr, why...) J_ASSERT(expr) #define J_EXPECT_BH(bh, expr, why...) J_ASSERT_BH(bh, expr) #define J_EXPECT_JH(jh, expr, why...) J_ASSERT_JH(jh, expr) #else #define __journal_expect(expr, why...) \ ({ \ int val = (expr); \ if (!val) { \ printk(KERN_ERR \ "JBD2 unexpected failure: %s: %s;\n", \ __func__, #expr); \ printk(KERN_ERR why "\n"); \ } \ val; \ }) #define J_EXPECT(expr, why...) __journal_expect(expr, ## why) #define J_EXPECT_BH(bh, expr, why...) __journal_expect(expr, ## why) #define J_EXPECT_JH(jh, expr, why...) __journal_expect(expr, ## why) #endif /* Flags in jbd_inode->i_flags */ #define __JI_COMMIT_RUNNING 0 #define __JI_WRITE_DATA 1 #define __JI_WAIT_DATA 2 /* * Commit of the inode data in progress. We use this flag to protect us from * concurrent deletion of inode. We cannot use reference to inode for this * since we cannot afford doing last iput() on behalf of kjournald */ #define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING) /* Write allocated dirty buffers in this inode before commit */ #define JI_WRITE_DATA (1 << __JI_WRITE_DATA) /* Wait for outstanding data writes for this inode before commit */ #define JI_WAIT_DATA (1 << __JI_WAIT_DATA) /** * struct jbd2_inode - The jbd_inode type is the structure linking inodes in * ordered mode present in a transaction so that we can sync them during commit. */ struct jbd2_inode { /** * @i_transaction: * * Which transaction does this inode belong to? Either the running * transaction or the committing one. [j_list_lock] */ transaction_t *i_transaction; /** * @i_next_transaction: * * Pointer to the running transaction modifying inode's data in case * there is already a committing transaction touching it. [j_list_lock] */ transaction_t *i_next_transaction; /** * @i_list: List of inodes in the i_transaction [j_list_lock] */ struct list_head i_list; /** * @i_vfs_inode: * * VFS inode this inode belongs to [constant for lifetime of structure] */ struct inode *i_vfs_inode; /** * @i_flags: Flags of inode [j_list_lock] */ unsigned long i_flags; /** * @i_dirty_start: * * Offset in bytes where the dirty range for this inode starts. * [j_list_lock] */ loff_t i_dirty_start; /** * @i_dirty_end: * * Inclusive offset in bytes where the dirty range for this inode * ends. [j_list_lock] */ loff_t i_dirty_end; }; struct jbd2_revoke_table_s; /** * struct jbd2_journal_handle - The jbd2_journal_handle type is the concrete * type associated with handle_t. * @h_transaction: Which compound transaction is this update a part of? * @h_journal: Which journal handle belongs to - used iff h_reserved set. * @h_rsv_handle: Handle reserved for finishing the logical operation. * @h_total_credits: Number of remaining buffers we are allowed to add to * journal. These are dirty buffers and revoke descriptor blocks. * @h_revoke_credits: Number of remaining revoke records available for handle * @h_ref: Reference count on this handle. * @h_err: Field for caller's use to track errors through large fs operations. * @h_sync: Flag for sync-on-close. * @h_reserved: Flag for handle for reserved credits. * @h_aborted: Flag indicating fatal error on handle. * @h_type: For handle statistics. * @h_line_no: For handle statistics. * @h_start_jiffies: Handle Start time. * @h_requested_credits: Holds @h_total_credits after handle is started. * @h_revoke_credits_requested: Holds @h_revoke_credits after handle is started. * @saved_alloc_context: Saved context while transaction is open. **/ /* Docbook can't yet cope with the bit fields, but will leave the documentation * in so it can be fixed later. */ struct jbd2_journal_handle { union { transaction_t *h_transaction; /* Which journal handle belongs to - used iff h_reserved set */ journal_t *h_journal; }; handle_t *h_rsv_handle; int h_total_credits; int h_revoke_credits; int h_revoke_credits_requested; int h_ref; int h_err; /* Flags [no locking] */ unsigned int h_sync: 1; unsigned int h_reserved: 1; unsigned int h_aborted: 1; unsigned int h_type: 8; unsigned int h_line_no: 16; unsigned long h_start_jiffies; unsigned int h_requested_credits; unsigned int saved_alloc_context; }; /* * Some stats for checkpoint phase */ struct transaction_chp_stats_s { unsigned long cs_chp_time; __u32 cs_forced_to_close; __u32 cs_written; __u32 cs_dropped; }; /* The transaction_t type is the guts of the journaling mechanism. It * tracks a compound transaction through its various states: * * RUNNING: accepting new updates * LOCKED: Updates still running but we don't accept new ones * RUNDOWN: Updates are tidying up but have finished requesting * new buffers to modify (state not used for now) * FLUSH: All updates complete, but we are still writing to disk * COMMIT: All data on disk, writing commit record * FINISHED: We still have to keep the transaction for checkpointing. * * The transaction keeps track of all of the buffers modified by a * running transaction, and all of the buffers committed but not yet * flushed to home for finished transactions. * (Locking Documentation improved by LockDoc) */ /* * Lock ranking: * * j_list_lock * ->jbd_lock_bh_journal_head() (This is "innermost") * * j_state_lock * ->b_state_lock * * b_state_lock * ->j_list_lock * * j_state_lock * ->j_list_lock (journal_unmap_buffer) * */ struct transaction_s { /* Pointer to the journal for this transaction. [no locking] */ journal_t *t_journal; /* Sequence number for this transaction [no locking] */ tid_t t_tid; /* * Transaction's current state * [no locking - only kjournald2 alters this] * [j_list_lock] guards transition of a transaction into T_FINISHED * state and subsequent call of __jbd2_journal_drop_transaction() * FIXME: needs barriers * KLUDGE: [use j_state_lock] */ enum { T_RUNNING, T_LOCKED, T_SWITCH, T_FLUSH, T_COMMIT, T_COMMIT_DFLUSH, T_COMMIT_JFLUSH, T_COMMIT_CALLBACK, T_FINISHED } t_state; /* * Where in the log does this transaction's commit start? [no locking] */ unsigned long t_log_start; /* * Number of buffers on the t_buffers list [j_list_lock, no locks * needed for jbd2 thread] */ int t_nr_buffers; /* * Doubly-linked circular list of all buffers reserved but not yet * modified by this transaction [j_list_lock, no locks needed fo * jbd2 thread] */ struct journal_head *t_reserved_list; /* * Doubly-linked circular list of all metadata buffers owned by this * transaction [j_list_lock, no locks needed for jbd2 thread] */ struct journal_head *t_buffers; /* * Doubly-linked circular list of all forget buffers (superseded * buffers which we can un-checkpoint once this transaction commits) * [j_list_lock] */ struct journal_head *t_forget; /* * Doubly-linked circular list of all buffers still to be flushed before * this transaction can be checkpointed. [j_list_lock] */ struct journal_head *t_checkpoint_list; /* * Doubly-linked circular list of metadata buffers being * shadowed by log IO. The IO buffers on the iobuf list and * the shadow buffers on this list match each other one for * one at all times. [j_list_lock, no locks needed for jbd2 * thread] */ struct journal_head *t_shadow_list; /* * List of inodes associated with the transaction; e.g., ext4 uses * this to track inodes in data=ordered and data=journal mode that * need special handling on transaction commit; also used by ocfs2. * [j_list_lock] */ struct list_head t_inode_list; /* * Longest time some handle had to wait for running transaction */ unsigned long t_max_wait; /* * When transaction started */ unsigned long t_start; /* * When commit was requested [j_state_lock] */ unsigned long t_requested; /* * Checkpointing stats [j_list_lock] */ struct transaction_chp_stats_s t_chp_stats; /* * Number of outstanding updates running on this transaction * [none] */ atomic_t t_updates; /* * Number of blocks reserved for this transaction in the journal. * This is including all credits reserved when starting transaction * handles as well as all journal descriptor blocks needed for this * transaction. [none] */ atomic_t t_outstanding_credits; /* * Number of revoke records for this transaction added by already * stopped handles. [none] */ atomic_t t_outstanding_revokes; /* * How many handles used this transaction? [none] */ atomic_t t_handle_count; /* * Forward and backward links for the circular list of all transactions * awaiting checkpoint. [j_list_lock] */ transaction_t *t_cpnext, *t_cpprev; /* * When will the transaction expire (become due for commit), in jiffies? * [no locking] */ unsigned long t_expires; /* * When this transaction started, in nanoseconds [no locking] */ ktime_t t_start_time; /* * This transaction is being forced and some process is * waiting for it to finish. */ unsigned int t_synchronous_commit:1; /* Disk flush needs to be sent to fs partition [no locking] */ int t_need_data_flush; }; struct transaction_run_stats_s { unsigned long rs_wait; unsigned long rs_request_delay; unsigned long rs_running; unsigned long rs_locked; unsigned long rs_flushing; unsigned long rs_logging; __u32 rs_handle_count; __u32 rs_blocks; __u32 rs_blocks_logged; }; struct transaction_stats_s { unsigned long ts_tid; unsigned long ts_requested; struct transaction_run_stats_s run; }; static inline unsigned long jbd2_time_diff(unsigned long start, unsigned long end) { if (end >= start) return end - start; return end + (MAX_JIFFY_OFFSET - start); } #define JBD2_NR_BATCH 64 enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY}; #define JBD2_FC_REPLAY_STOP 0 #define JBD2_FC_REPLAY_CONTINUE 1 /** * struct journal_s - The journal_s type is the concrete type associated with * journal_t. */ struct journal_s { /** * @j_flags: General journaling state flags [j_state_lock, * no lock for quick racy checks] */ unsigned long j_flags; /** * @j_errno: * * Is there an outstanding uncleared error on the journal (from a prior * abort)? [j_state_lock] */ int j_errno; /** * @j_abort_mutex: Lock the whole aborting procedure. */ struct mutex j_abort_mutex; /** * @j_sb_buffer: The first part of the superblock buffer. */ struct buffer_head *j_sb_buffer; /** * @j_superblock: The second part of the superblock buffer. */ journal_superblock_t *j_superblock; /** * @j_state_lock: Protect the various scalars in the journal. */ rwlock_t j_state_lock; /** * @j_barrier_count: * * Number of processes waiting to create a barrier lock [j_state_lock, * no lock for quick racy checks] */ int j_barrier_count; /** * @j_barrier: The barrier lock itself. */ struct mutex j_barrier; /** * @j_running_transaction: * * Transactions: The current running transaction... * [j_state_lock, no lock for quick racy checks] [caller holding * open handle] */ transaction_t *j_running_transaction; /** * @j_committing_transaction: * * the transaction we are pushing to disk * [j_state_lock] [caller holding open handle] */ transaction_t *j_committing_transaction; /** * @j_checkpoint_transactions: * * ... and a linked circular list of all transactions waiting for * checkpointing. [j_list_lock] */ transaction_t *j_checkpoint_transactions; /** * @j_wait_transaction_locked: * * Wait queue for waiting for a locked transaction to start committing, * or for a barrier lock to be released. */ wait_queue_head_t j_wait_transaction_locked; /** * @j_wait_done_commit: Wait queue for waiting for commit to complete. */ wait_queue_head_t j_wait_done_commit; /** * @j_wait_commit: Wait queue to trigger commit. */ wait_queue_head_t j_wait_commit; /** * @j_wait_updates: Wait queue to wait for updates to complete. */ wait_queue_head_t j_wait_updates; /** * @j_wait_reserved: * * Wait queue to wait for reserved buffer credits to drop. */ wait_queue_head_t j_wait_reserved; /** * @j_fc_wait: * * Wait queue to wait for completion of async fast commits. */ wait_queue_head_t j_fc_wait; /** * @j_checkpoint_mutex: * * Semaphore for locking against concurrent checkpoints. */ struct mutex j_checkpoint_mutex; /** * @j_chkpt_bhs: * * List of buffer heads used by the checkpoint routine. This * was moved from jbd2_log_do_checkpoint() to reduce stack * usage. Access to this array is controlled by the * @j_checkpoint_mutex. [j_checkpoint_mutex] */ struct buffer_head *j_chkpt_bhs[JBD2_NR_BATCH]; /** * @j_shrinker: * * Journal head shrinker, reclaim buffer's journal head which * has been written back. */ struct shrinker *j_shrinker; /** * @j_checkpoint_jh_count: * * Number of journal buffers on the checkpoint list. [j_list_lock] */ struct percpu_counter j_checkpoint_jh_count; /** * @j_shrink_transaction: * * Record next transaction will shrink on the checkpoint list. * [j_list_lock] */ transaction_t *j_shrink_transaction; /** * @j_head: * * Journal head: identifies the first unused block in the journal. * [j_state_lock] */ unsigned long j_head; /** * @j_tail: * * Journal tail: identifies the oldest still-used block in the journal. * [j_state_lock] */ unsigned long j_tail; /** * @j_free: * * Journal free: how many free blocks are there in the journal? * [j_state_lock] */ unsigned long j_free; /** * @j_first: * * The block number of the first usable block in the journal * [j_state_lock]. */ unsigned long j_first; /** * @j_last: * * The block number one beyond the last usable block in the journal * [j_state_lock]. */ unsigned long j_last; /** * @j_fc_first: * * The block number of the first fast commit block in the journal * [j_state_lock]. */ unsigned long j_fc_first; /** * @j_fc_off: * * Number of fast commit blocks currently allocated. Accessed only * during fast commit. Currently only process can do fast commit, so * this field is not protected by any lock. */ unsigned long j_fc_off; /** * @j_fc_last: * * The block number one beyond the last fast commit block in the journal * [j_state_lock]. */ unsigned long j_fc_last; /** * @j_dev: Device where we store the journal. */ struct block_device *j_dev; /** * @j_blocksize: Block size for the location where we store the journal. */ int j_blocksize; /** * @j_blk_offset: * * Starting block offset into the device where we store the journal. */ unsigned long long j_blk_offset; /** * @j_devname: Journal device name. */ char j_devname[BDEVNAME_SIZE+24]; /** * @j_fs_dev: * * Device which holds the client fs. For internal journal this will be * equal to j_dev. */ struct block_device *j_fs_dev; /** * @j_fs_dev_wb_err: * * Records the errseq of the client fs's backing block device. */ errseq_t j_fs_dev_wb_err; /** * @j_total_len: Total maximum capacity of the journal region on disk. */ unsigned int j_total_len; /** * @j_reserved_credits: * * Number of buffers reserved from the running transaction. */ atomic_t j_reserved_credits; /** * @j_list_lock: Protects the buffer lists and internal buffer state. */ spinlock_t j_list_lock; /** * @j_inode: * * Optional inode where we store the journal. If present, all * journal block numbers are mapped into this inode via bmap(). */ struct inode *j_inode; /** * @j_tail_sequence: * * Sequence number of the oldest transaction in the log [j_state_lock] */ tid_t j_tail_sequence; /** * @j_transaction_sequence: * * Sequence number of the next transaction to grant [j_state_lock] */ tid_t j_transaction_sequence; /** * @j_commit_sequence: * * Sequence number of the most recently committed transaction * [j_state_lock, no lock for quick racy checks] */ tid_t j_commit_sequence; /** * @j_commit_request: * * Sequence number of the most recent transaction wanting commit * [j_state_lock, no lock for quick racy checks] */ tid_t j_commit_request; /** * @j_uuid: * * Journal uuid: identifies the object (filesystem, LVM volume etc) * backed by this journal. This will eventually be replaced by an array * of uuids, allowing us to index multiple devices within a single * journal and to perform atomic updates across them. */ __u8 j_uuid[16]; /** * @j_task: Pointer to the current commit thread for this journal. */ struct task_struct *j_task; /** * @j_max_transaction_buffers: * * Maximum number of metadata buffers to allow in a single compound * commit transaction. */ int j_max_transaction_buffers; /** * @j_revoke_records_per_block: * * Number of revoke records that fit in one descriptor block. */ int j_revoke_records_per_block; /** * @j_transaction_overhead_buffers: * * Number of blocks each transaction needs for its own bookkeeping */ int j_transaction_overhead_buffers; /** * @j_commit_interval: * * What is the maximum transaction lifetime before we begin a commit? */ unsigned long j_commit_interval; /** * @j_commit_timer: The timer used to wakeup the commit thread. */ struct timer_list j_commit_timer; /** * @j_revoke_lock: Protect the revoke table. */ spinlock_t j_revoke_lock; /** * @j_revoke: * * The revoke table - maintains the list of revoked blocks in the * current transaction. */ struct jbd2_revoke_table_s *j_revoke; /** * @j_revoke_table: Alternate revoke tables for j_revoke. */ struct jbd2_revoke_table_s *j_revoke_table[2]; /** * @j_wbuf: Array of bhs for jbd2_journal_commit_transaction. */ struct buffer_head **j_wbuf; /** * @j_fc_wbuf: Array of fast commit bhs for fast commit. Accessed only * during a fast commit. Currently only process can do fast commit, so * this field is not protected by any lock. */ struct buffer_head **j_fc_wbuf; /** * @j_wbufsize: * * Size of @j_wbuf array. */ int j_wbufsize; /** * @j_fc_wbufsize: * * Size of @j_fc_wbuf array. */ int j_fc_wbufsize; /** * @j_last_sync_writer: * * The pid of the last person to run a synchronous operation * through the journal. */ pid_t j_last_sync_writer; /** * @j_average_commit_time: * * The average amount of time in nanoseconds it takes to commit a * transaction to disk. [j_state_lock] */ u64 j_average_commit_time; /** * @j_min_batch_time: * * Minimum time that we should wait for additional filesystem operations * to get batched into a synchronous handle in microseconds. */ u32 j_min_batch_time; /** * @j_max_batch_time: * * Maximum time that we should wait for additional filesystem operations * to get batched into a synchronous handle in microseconds. */ u32 j_max_batch_time; /** * @j_commit_callback: * * This function is called when a transaction is closed. */ void (*j_commit_callback)(journal_t *, transaction_t *); /** * @j_submit_inode_data_buffers: * * This function is called for all inodes associated with the * committing transaction marked with JI_WRITE_DATA flag * before we start to write out the transaction to the journal. */ int (*j_submit_inode_data_buffers) (struct jbd2_inode *); /** * @j_finish_inode_data_buffers: * * This function is called for all inodes associated with the * committing transaction marked with JI_WAIT_DATA flag * after we have written the transaction to the journal * but before we write out the commit block. */ int (*j_finish_inode_data_buffers) (struct jbd2_inode *); /* * Journal statistics */ /** * @j_history_lock: Protect the transactions statistics history. */ spinlock_t j_history_lock; /** * @j_proc_entry: procfs entry for the jbd statistics directory. */ struct proc_dir_entry *j_proc_entry; /** * @j_stats: Overall statistics. */ struct transaction_stats_s j_stats; /** * @j_failed_commit: Failed journal commit ID. */ unsigned int j_failed_commit; /** * @j_private: * * An opaque pointer to fs-private information. ext3 puts its * superblock pointer here. */ void *j_private; /** * @j_csum_seed: * * Precomputed journal UUID checksum for seeding other checksums. */ __u32 j_csum_seed; #ifdef CONFIG_DEBUG_LOCK_ALLOC /** * @j_trans_commit_map: * * Lockdep entity to track transaction commit dependencies. Handles * hold this "lock" for read, when we wait for commit, we acquire the * "lock" for writing. This matches the properties of jbd2 journalling * where the running transaction has to wait for all handles to be * dropped to commit that transaction and also acquiring a handle may * require transaction commit to finish. */ struct lockdep_map j_trans_commit_map; #endif /** * @jbd2_trans_commit_key: * * "struct lock_class_key" for @j_trans_commit_map */ struct lock_class_key jbd2_trans_commit_key; /** * @j_fc_cleanup_callback: * * Clean-up after fast commit or full commit. JBD2 calls this function * after every commit operation. */ void (*j_fc_cleanup_callback)(struct journal_s *journal, int full, tid_t tid); /** * @j_fc_replay_callback: * * File-system specific function that performs replay of a fast * commit. JBD2 calls this function for each fast commit block found in * the journal. This function should return JBD2_FC_REPLAY_CONTINUE * to indicate that the block was processed correctly and more fast * commit replay should continue. Return value of JBD2_FC_REPLAY_STOP * indicates the end of replay (no more blocks remaining). A negative * return value indicates error. */ int (*j_fc_replay_callback)(struct journal_s *journal, struct buffer_head *bh, enum passtype pass, int off, tid_t expected_commit_id); /** * @j_bmap: * * Bmap function that should be used instead of the generic * VFS bmap function. */ int (*j_bmap)(struct journal_s *journal, sector_t *block); }; #define jbd2_might_wait_for_commit(j) \ do { \ rwsem_acquire(&j->j_trans_commit_map, 0, 0, _THIS_IP_); \ rwsem_release(&j->j_trans_commit_map, _THIS_IP_); \ } while (0) /* * We can support any known requested features iff the * superblock is not in version 1. Otherwise we fail to support any * extended sb features. */ static inline bool jbd2_format_support_feature(journal_t *j) { return j->j_superblock->s_header.h_blocktype != cpu_to_be32(JBD2_SUPERBLOCK_V1); } /* journal feature predicate functions */ #define JBD2_FEATURE_COMPAT_FUNCS(name, flagname) \ static inline bool jbd2_has_feature_##name(journal_t *j) \ { \ return (jbd2_format_support_feature(j) && \ ((j)->j_superblock->s_feature_compat & \ cpu_to_be32(JBD2_FEATURE_COMPAT_##flagname)) != 0); \ } \ static inline void jbd2_set_feature_##name(journal_t *j) \ { \ (j)->j_superblock->s_feature_compat |= \ cpu_to_be32(JBD2_FEATURE_COMPAT_##flagname); \ } \ static inline void jbd2_clear_feature_##name(journal_t *j) \ { \ (j)->j_superblock->s_feature_compat &= \ ~cpu_to_be32(JBD2_FEATURE_COMPAT_##flagname); \ } #define JBD2_FEATURE_RO_COMPAT_FUNCS(name, flagname) \ static inline bool jbd2_has_feature_##name(journal_t *j) \ { \ return (jbd2_format_support_feature(j) && \ ((j)->j_superblock->s_feature_ro_compat & \ cpu_to_be32(JBD2_FEATURE_RO_COMPAT_##flagname)) != 0); \ } \ static inline void jbd2_set_feature_##name(journal_t *j) \ { \ (j)->j_superblock->s_feature_ro_compat |= \ cpu_to_be32(JBD2_FEATURE_RO_COMPAT_##flagname); \ } \ static inline void jbd2_clear_feature_##name(journal_t *j) \ { \ (j)->j_superblock->s_feature_ro_compat &= \ ~cpu_to_be32(JBD2_FEATURE_RO_COMPAT_##flagname); \ } #define JBD2_FEATURE_INCOMPAT_FUNCS(name, flagname) \ static inline bool jbd2_has_feature_##name(journal_t *j) \ { \ return (jbd2_format_support_feature(j) && \ ((j)->j_superblock->s_feature_incompat & \ cpu_to_be32(JBD2_FEATURE_INCOMPAT_##flagname)) != 0); \ } \ static inline void jbd2_set_feature_##name(journal_t *j) \ { \ (j)->j_superblock->s_feature_incompat |= \ cpu_to_be32(JBD2_FEATURE_INCOMPAT_##flagname); \ } \ static inline void jbd2_clear_feature_##name(journal_t *j) \ { \ (j)->j_superblock->s_feature_incompat &= \ ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_##flagname); \ } JBD2_FEATURE_COMPAT_FUNCS(checksum, CHECKSUM) JBD2_FEATURE_INCOMPAT_FUNCS(revoke, REVOKE) JBD2_FEATURE_INCOMPAT_FUNCS(64bit, 64BIT) JBD2_FEATURE_INCOMPAT_FUNCS(async_commit, ASYNC_COMMIT) JBD2_FEATURE_INCOMPAT_FUNCS(csum2, CSUM_V2) JBD2_FEATURE_INCOMPAT_FUNCS(csum3, CSUM_V3) JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit, FAST_COMMIT) /* Journal high priority write IO operation flags */ #define JBD2_JOURNAL_REQ_FLAGS (REQ_META | REQ_SYNC | REQ_IDLE) /* * Journal flag definitions */ #define JBD2_UNMOUNT 0x001 /* Journal thread is being destroyed */ #define JBD2_ABORT 0x002 /* Journaling has been aborted for errors. */ #define JBD2_ACK_ERR 0x004 /* The errno in the sb has been acked */ #define JBD2_FLUSHED 0x008 /* The journal superblock has been flushed */ #define JBD2_LOADED 0x010 /* The journal superblock has been loaded */ #define JBD2_BARRIER 0x020 /* Use IDE barriers */ #define JBD2_CYCLE_RECORD 0x080 /* Journal cycled record log on * clean and empty filesystem * logging area */ #define JBD2_FAST_COMMIT_ONGOING 0x100 /* Fast commit is ongoing */ #define JBD2_FULL_COMMIT_ONGOING 0x200 /* Full commit is ongoing */ #define JBD2_JOURNAL_FLUSH_DISCARD 0x0001 #define JBD2_JOURNAL_FLUSH_ZEROOUT 0x0002 #define JBD2_JOURNAL_FLUSH_VALID (JBD2_JOURNAL_FLUSH_DISCARD | \ JBD2_JOURNAL_FLUSH_ZEROOUT) /* * Function declarations for the journaling transaction and buffer * management */ /* Filing buffers */ extern bool __jbd2_journal_refile_buffer(struct journal_head *); extern void jbd2_journal_refile_buffer(journal_t *, struct journal_head *); extern void __jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int); extern void jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int); static inline void jbd2_file_log_bh(struct list_head *head, struct buffer_head *bh) { list_add_tail(&bh->b_assoc_buffers, head); } static inline void jbd2_unfile_log_bh(struct buffer_head *bh) { list_del_init(&bh->b_assoc_buffers); } /* Log buffer allocation */ struct buffer_head *jbd2_journal_get_descriptor_buffer(transaction_t *, int); void jbd2_descriptor_block_csum_set(journal_t *, struct buffer_head *); int jbd2_journal_next_log_block(journal_t *, unsigned long long *); int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, unsigned long *block); int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); /* Commit management */ extern void jbd2_journal_commit_transaction(journal_t *); /* Checkpoint list management */ enum jbd2_shrink_type {JBD2_SHRINK_DESTROY, JBD2_SHRINK_BUSY_STOP, JBD2_SHRINK_BUSY_SKIP}; void __jbd2_journal_clean_checkpoint_list(journal_t *journal, enum jbd2_shrink_type type); unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan); int __jbd2_journal_remove_checkpoint(struct journal_head *); int jbd2_journal_try_remove_checkpoint(struct journal_head *jh); void jbd2_journal_destroy_checkpoint(journal_t *journal); void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *); /* * Triggers */ struct jbd2_buffer_trigger_type { /* * Fired a the moment data to write to the journal are known to be * stable - so either at the moment b_frozen_data is created or just * before a buffer is written to the journal. mapped_data is a mapped * buffer that is the frozen data for commit. */ void (*t_frozen)(struct jbd2_buffer_trigger_type *type, struct buffer_head *bh, void *mapped_data, size_t size); /* * Fired during journal abort for dirty buffers that will not be * committed. */ void (*t_abort)(struct jbd2_buffer_trigger_type *type, struct buffer_head *bh); }; extern void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data, struct jbd2_buffer_trigger_type *triggers); extern void jbd2_buffer_abort_trigger(struct journal_head *jh, struct jbd2_buffer_trigger_type *triggers); /* Buffer IO */ extern int jbd2_journal_write_metadata_buffer(transaction_t *transaction, struct journal_head *jh_in, struct buffer_head **bh_out, sector_t blocknr); /* Transaction cache support */ extern void jbd2_journal_destroy_transaction_cache(void); extern int __init jbd2_journal_init_transaction_cache(void); extern void jbd2_journal_free_transaction(transaction_t *); /* * Journal locking. * * We need to lock the journal during transaction state changes so that nobody * ever tries to take a handle on the running transaction while we are in the * middle of moving it to the commit phase. j_state_lock does this. * * Note that the locking is completely interrupt unsafe. We never touch * journal structures from interrupts. */ static inline handle_t *journal_current_handle(void) { return current->journal_info; } /* The journaling code user interface: * * Create and destroy handles * Register buffer modifications against the current transaction. */ extern handle_t *jbd2_journal_start(journal_t *, int nblocks); extern handle_t *jbd2__journal_start(journal_t *, int blocks, int rsv_blocks, int revoke_records, gfp_t gfp_mask, unsigned int type, unsigned int line_no); extern int jbd2_journal_restart(handle_t *, int nblocks); extern int jbd2__journal_restart(handle_t *, int nblocks, int revoke_records, gfp_t gfp_mask); extern int jbd2_journal_start_reserved(handle_t *handle, unsigned int type, unsigned int line_no); extern void jbd2_journal_free_reserved(handle_t *handle); extern int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records); extern int jbd2_journal_get_write_access(handle_t *, struct buffer_head *); extern int jbd2_journal_get_create_access (handle_t *, struct buffer_head *); extern int jbd2_journal_get_undo_access(handle_t *, struct buffer_head *); void jbd2_journal_set_triggers(struct buffer_head *, struct jbd2_buffer_trigger_type *type); extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *); extern int jbd2_journal_forget (handle_t *, struct buffer_head *); int jbd2_journal_invalidate_folio(journal_t *, struct folio *, size_t offset, size_t length); bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio); extern int jbd2_journal_stop(handle_t *); extern int jbd2_journal_flush(journal_t *journal, unsigned int flags); extern void jbd2_journal_lock_updates (journal_t *); extern void jbd2_journal_unlock_updates (journal_t *); void jbd2_journal_wait_updates(journal_t *); extern journal_t * jbd2_journal_init_dev(struct block_device *bdev, struct block_device *fs_dev, unsigned long long start, int len, int bsize); extern journal_t * jbd2_journal_init_inode (struct inode *); extern int jbd2_journal_update_format (journal_t *); extern int jbd2_journal_check_used_features (journal_t *, unsigned long, unsigned long, unsigned long); extern int jbd2_journal_check_available_features (journal_t *, unsigned long, unsigned long, unsigned long); extern int jbd2_journal_set_features (journal_t *, unsigned long, unsigned long, unsigned long); extern void jbd2_journal_clear_features (journal_t *, unsigned long, unsigned long, unsigned long); extern int jbd2_journal_load (journal_t *journal); extern int jbd2_journal_destroy (journal_t *); extern int jbd2_journal_recover (journal_t *journal); extern int jbd2_journal_wipe (journal_t *, int); extern int jbd2_journal_skip_recovery (journal_t *); extern void jbd2_journal_update_sb_errno(journal_t *); extern int jbd2_journal_update_sb_log_tail (journal_t *, tid_t, unsigned long, blk_opf_t); extern void jbd2_journal_abort (journal_t *, int); extern int jbd2_journal_errno (journal_t *); extern void jbd2_journal_ack_err (journal_t *); extern int jbd2_journal_clear_err (journal_t *); extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); extern int jbd2_journal_force_commit(journal_t *); extern int jbd2_journal_force_commit_nested(journal_t *); extern int jbd2_journal_inode_ranged_write(handle_t *handle, struct jbd2_inode *inode, loff_t start_byte, loff_t length); extern int jbd2_journal_inode_ranged_wait(handle_t *handle, struct jbd2_inode *inode, loff_t start_byte, loff_t length); extern int jbd2_journal_finish_inode_data_buffers( struct jbd2_inode *jinode); extern int jbd2_journal_begin_ordered_truncate(journal_t *journal, struct jbd2_inode *inode, loff_t new_size); extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode); extern void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode); /* * journal_head management */ struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh); struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh); void jbd2_journal_put_journal_head(struct journal_head *jh); /* * handle management */ extern struct kmem_cache *jbd2_handle_cache; /* * This specialized allocator has to be a macro for its allocations to be * accounted separately (to have a separate alloc_tag). The typecast is * intentional to enforce typesafety. */ #define jbd2_alloc_handle(_gfp_flags) \ ((handle_t *)kmem_cache_zalloc(jbd2_handle_cache, _gfp_flags)) static inline void jbd2_free_handle(handle_t *handle) { kmem_cache_free(jbd2_handle_cache, handle); } /* * jbd2_inode management (optional, for those file systems that want to use * dynamically allocated jbd2_inode structures) */ extern struct kmem_cache *jbd2_inode_cache; /* * This specialized allocator has to be a macro for its allocations to be * accounted separately (to have a separate alloc_tag). The typecast is * intentional to enforce typesafety. */ #define jbd2_alloc_inode(_gfp_flags) \ ((struct jbd2_inode *)kmem_cache_alloc(jbd2_inode_cache, _gfp_flags)) static inline void jbd2_free_inode(struct jbd2_inode *jinode) { kmem_cache_free(jbd2_inode_cache, jinode); } /* Primary revoke support */ #define JOURNAL_REVOKE_DEFAULT_HASH 256 extern int jbd2_journal_init_revoke(journal_t *, int); extern void jbd2_journal_destroy_revoke_record_cache(void); extern void jbd2_journal_destroy_revoke_table_cache(void); extern int __init jbd2_journal_init_revoke_record_cache(void); extern int __init jbd2_journal_init_revoke_table_cache(void); struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size); void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table); extern void jbd2_journal_destroy_revoke(journal_t *); extern int jbd2_journal_revoke (handle_t *, unsigned long long, struct buffer_head *); extern void jbd2_journal_cancel_revoke(handle_t *, struct journal_head *); extern void jbd2_journal_write_revoke_records(transaction_t *transaction, struct list_head *log_bufs); /* Recovery revoke support */ extern int jbd2_journal_set_revoke(journal_t *, unsigned long long, tid_t); extern int jbd2_journal_test_revoke(journal_t *, unsigned long long, tid_t); extern void jbd2_journal_clear_revoke(journal_t *); extern void jbd2_journal_switch_revoke_table(journal_t *journal); extern void jbd2_clear_buffer_revoked_flags(journal_t *journal); /* * The log thread user interface: * * Request space in the current transaction, and force transaction commit * transitions on demand. */ int jbd2_log_start_commit(journal_t *journal, tid_t tid); int jbd2_journal_start_commit(journal_t *journal, tid_t *tid); int jbd2_log_wait_commit(journal_t *journal, tid_t tid); int jbd2_transaction_committed(journal_t *journal, tid_t tid); int jbd2_complete_transaction(journal_t *journal, tid_t tid); int jbd2_log_do_checkpoint(journal_t *journal); int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid); void __jbd2_log_wait_for_space(journal_t *journal); extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *); extern int jbd2_cleanup_journal_tail(journal_t *); /* Fast commit related APIs */ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid); int jbd2_fc_end_commit(journal_t *journal); int jbd2_fc_end_commit_fallback(journal_t *journal); int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out); int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode); int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode); int jbd2_fc_wait_bufs(journal_t *journal, int num_blks); void jbd2_fc_release_bufs(journal_t *journal); /* * is_journal_abort * * Simple test wrapper function to test the JBD2_ABORT state flag. This * bit, when set, indicates that we have had a fatal error somewhere, * either inside the journaling layer or indicated to us by the client * (eg. ext3), and that we and should not commit any further * transactions. */ static inline int is_journal_aborted(journal_t *journal) { return journal->j_flags & JBD2_ABORT; } static inline int is_handle_aborted(handle_t *handle) { if (handle->h_aborted || !handle->h_transaction) return 1; return is_journal_aborted(handle->h_transaction->t_journal); } static inline void jbd2_journal_abort_handle(handle_t *handle) { handle->h_aborted = 1; } static inline void jbd2_init_fs_dev_write_error(journal_t *journal) { struct address_space *mapping = journal->j_fs_dev->bd_mapping; /* * Save the original wb_err value of client fs's bdev mapping which * could be used to detect the client fs's metadata async write error. */ errseq_check_and_advance(&mapping->wb_err, &journal->j_fs_dev_wb_err); } static inline int jbd2_check_fs_dev_write_error(journal_t *journal) { struct address_space *mapping = journal->j_fs_dev->bd_mapping; return errseq_check(&mapping->wb_err, READ_ONCE(journal->j_fs_dev_wb_err)); } #endif /* __KERNEL__ */ /* Comparison functions for transaction IDs: perform comparisons using * modulo arithmetic so that they work over sequence number wraps. */ static inline int tid_gt(tid_t x, tid_t y) { int difference = (x - y); return (difference > 0); } static inline int tid_geq(tid_t x, tid_t y) { int difference = (x - y); return (difference >= 0); } extern int jbd2_journal_blocks_per_folio(struct inode *inode); extern size_t journal_tag_bytes(journal_t *journal); static inline int jbd2_journal_has_csum_v2or3(journal_t *journal) { return jbd2_has_feature_csum2(journal) || jbd2_has_feature_csum3(journal); } static inline int jbd2_journal_get_num_fc_blks(journal_superblock_t *jsb) { int num_fc_blocks = be32_to_cpu(jsb->s_num_fc_blks); return num_fc_blocks ? num_fc_blocks : JBD2_DEFAULT_FAST_COMMIT_BLOCKS; } /* * Return number of free blocks in the log. Must be called under j_state_lock. */ static inline unsigned long jbd2_log_space_left(journal_t *journal) { /* Allow for rounding errors */ long free = journal->j_free - 32; if (journal->j_committing_transaction) { free -= atomic_read(&journal-> j_committing_transaction->t_outstanding_credits); } return max_t(long, free, 0); } /* * Definitions which augment the buffer_head layer */ /* journaling buffer types */ #define BJ_None 0 /* Not journaled */ #define BJ_Metadata 1 /* Normal journaled metadata */ #define BJ_Forget 2 /* Buffer superseded by this transaction */ #define BJ_Shadow 3 /* Buffer contents being shadowed to the log */ #define BJ_Reserved 4 /* Buffer is reserved for access by journal */ #define BJ_Types 5 static inline u32 jbd2_chksum(u32 crc, const void *address, unsigned int length) { return crc32c(crc, address, length); } /* Return most recent uncommitted transaction */ static inline tid_t jbd2_get_latest_transaction(journal_t *journal) { tid_t tid; read_lock(&journal->j_state_lock); tid = journal->j_commit_request; if (journal->j_running_transaction) tid = journal->j_running_transaction->t_tid; read_unlock(&journal->j_state_lock); return tid; } static inline int jbd2_handle_buffer_credits(handle_t *handle) { journal_t *journal; if (!handle->h_reserved) journal = handle->h_transaction->t_journal; else journal = handle->h_journal; return handle->h_total_credits - DIV_ROUND_UP(handle->h_revoke_credits_requested, journal->j_revoke_records_per_block); } #ifdef __KERNEL__ #define buffer_trace_init(bh) do {} while (0) #define print_buffer_fields(bh) do {} while (0) #define print_buffer_trace(bh) do {} while (0) #define BUFFER_TRACE(bh, info) do {} while (0) #define BUFFER_TRACE2(bh, bh2, info) do {} while (0) #define JBUFFER_TRACE(jh, info) do {} while (0) #endif /* __KERNEL__ */ #endif /* _LINUX_JBD2_H */
14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 // SPDX-License-Identifier: GPL-2.0-only /* * * Authors: * Alexander Aring <aar@pengutronix.de> * * Based on: net/mac80211/util.c */ #include "ieee802154_i.h" #include "driver-ops.h" /* privid for wpan_phys to determine whether they belong to us or not */ const void *const mac802154_wpan_phy_privid = &mac802154_wpan_phy_privid; /** * ieee802154_wake_queue - wake ieee802154 queue * @hw: main hardware object * * Tranceivers usually have either one transmit framebuffer or one framebuffer * for both transmitting and receiving. Hence, the core currently only handles * one frame at a time for each phy, which means we had to stop the queue to * avoid new skb to come during the transmission. The queue then needs to be * woken up after the operation. */ static void ieee802154_wake_queue(struct ieee802154_hw *hw) { struct ieee802154_local *local = hw_to_local(hw); struct ieee802154_sub_if_data *sdata; rcu_read_lock(); clear_bit(WPAN_PHY_FLAG_STATE_QUEUE_STOPPED, &local->phy->flags); list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (!sdata->dev) continue; netif_wake_queue(sdata->dev); } rcu_read_unlock(); } /** * ieee802154_stop_queue - stop ieee802154 queue * @hw: main hardware object * * Tranceivers usually have either one transmit framebuffer or one framebuffer * for both transmitting and receiving. Hence, the core currently only handles * one frame at a time for each phy, which means we need to tell upper layers to * stop giving us new skbs while we are busy with the transmitted one. The queue * must then be stopped before transmitting. */ static void ieee802154_stop_queue(struct ieee802154_hw *hw) { struct ieee802154_local *local = hw_to_local(hw); struct ieee802154_sub_if_data *sdata; rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (!sdata->dev) continue; netif_stop_queue(sdata->dev); } rcu_read_unlock(); } void ieee802154_hold_queue(struct ieee802154_local *local) { unsigned long flags; spin_lock_irqsave(&local->phy->queue_lock, flags); if (!atomic_fetch_inc(&local->phy->hold_txs)) ieee802154_stop_queue(&local->hw); spin_unlock_irqrestore(&local->phy->queue_lock, flags); } void ieee802154_release_queue(struct ieee802154_local *local) { unsigned long flags; spin_lock_irqsave(&local->phy->queue_lock, flags); if (atomic_dec_and_test(&local->phy->hold_txs)) ieee802154_wake_queue(&local->hw); spin_unlock_irqrestore(&local->phy->queue_lock, flags); } void ieee802154_disable_queue(struct ieee802154_local *local) { struct ieee802154_sub_if_data *sdata; rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (!sdata->dev) continue; netif_tx_disable(sdata->dev); } rcu_read_unlock(); } enum hrtimer_restart ieee802154_xmit_ifs_timer(struct hrtimer *timer) { struct ieee802154_local *local = container_of(timer, struct ieee802154_local, ifs_timer); ieee802154_release_queue(local); return HRTIMER_NORESTART; } void ieee802154_xmit_complete(struct ieee802154_hw *hw, struct sk_buff *skb, bool ifs_handling) { struct ieee802154_local *local = hw_to_local(hw); local->tx_result = IEEE802154_SUCCESS; if (ifs_handling) { u8 max_sifs_size; /* If transceiver sets CRC on his own we need to use lifs * threshold len above 16 otherwise 18, because it's not * part of skb->len. */ if (hw->flags & IEEE802154_HW_TX_OMIT_CKSUM) max_sifs_size = IEEE802154_MAX_SIFS_FRAME_SIZE - IEEE802154_FCS_LEN; else max_sifs_size = IEEE802154_MAX_SIFS_FRAME_SIZE; if (skb->len > max_sifs_size) hrtimer_start(&local->ifs_timer, hw->phy->lifs_period * NSEC_PER_USEC, HRTIMER_MODE_REL); else hrtimer_start(&local->ifs_timer, hw->phy->sifs_period * NSEC_PER_USEC, HRTIMER_MODE_REL); } else { ieee802154_release_queue(local); } dev_consume_skb_any(skb); if (atomic_dec_and_test(&hw->phy->ongoing_txs)) wake_up(&hw->phy->sync_txq); } EXPORT_SYMBOL(ieee802154_xmit_complete); void ieee802154_xmit_error(struct ieee802154_hw *hw, struct sk_buff *skb, int reason) { struct ieee802154_local *local = hw_to_local(hw); local->tx_result = reason; ieee802154_release_queue(local); dev_kfree_skb_any(skb); if (atomic_dec_and_test(&hw->phy->ongoing_txs)) wake_up(&hw->phy->sync_txq); } EXPORT_SYMBOL(ieee802154_xmit_error); void ieee802154_xmit_hw_error(struct ieee802154_hw *hw, struct sk_buff *skb) { ieee802154_xmit_error(hw, skb, IEEE802154_SYSTEM_ERROR); } EXPORT_SYMBOL(ieee802154_xmit_hw_error); void ieee802154_stop_device(struct ieee802154_local *local) { flush_workqueue(local->workqueue); hrtimer_cancel(&local->ifs_timer); drv_stop(local); }
13 270 271 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright © 2006-2014 Intel Corporation. * * Authors: David Woodhouse <dwmw2@infradead.org>, * Ashok Raj <ashok.raj@intel.com>, * Shaohua Li <shaohua.li@intel.com>, * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>, * Fenghua Yu <fenghua.yu@intel.com> * Joerg Roedel <jroedel@suse.de> */ #define pr_fmt(fmt) "DMAR: " fmt #define dev_fmt(fmt) pr_fmt(fmt) #include <linux/crash_dump.h> #include <linux/dma-direct.h> #include <linux/dmi.h> #include <linux/memory.h> #include <linux/pci.h> #include <linux/pci-ats.h> #include <linux/spinlock.h> #include <linux/syscore_ops.h> #include <linux/tboot.h> #include <uapi/linux/iommufd.h> #include "iommu.h" #include "../dma-iommu.h" #include "../irq_remapping.h" #include "../iommu-pages.h" #include "pasid.h" #include "perfmon.h" #define ROOT_SIZE VTD_PAGE_SIZE #define CONTEXT_SIZE VTD_PAGE_SIZE #define IS_GFX_DEVICE(pdev) pci_is_display(pdev) #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB) #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e) #define IOAPIC_RANGE_START (0xfee00000) #define IOAPIC_RANGE_END (0xfeefffff) #define IOVA_START_ADDR (0x1000) #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57 static void __init check_tylersburg_isoch(void); static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain, bool enable); static int rwbf_quirk; #define rwbf_required(iommu) (rwbf_quirk || cap_rwbf((iommu)->cap)) /* * set to 1 to panic kernel if can't successfully enable VT-d * (used when kernel is launched w/ TXT) */ static int force_on = 0; static int intel_iommu_tboot_noforce; static int no_platform_optin; #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry)) /* * Take a root_entry and return the Lower Context Table Pointer (LCTP) * if marked present. */ static phys_addr_t root_entry_lctp(struct root_entry *re) { if (!(re->lo & 1)) return 0; return re->lo & VTD_PAGE_MASK; } /* * Take a root_entry and return the Upper Context Table Pointer (UCTP) * if marked present. */ static phys_addr_t root_entry_uctp(struct root_entry *re) { if (!(re->hi & 1)) return 0; return re->hi & VTD_PAGE_MASK; } static int device_rid_cmp_key(const void *key, const struct rb_node *node) { struct device_domain_info *info = rb_entry(node, struct device_domain_info, node); const u16 *rid_lhs = key; if (*rid_lhs < PCI_DEVID(info->bus, info->devfn)) return -1; if (*rid_lhs > PCI_DEVID(info->bus, info->devfn)) return 1; return 0; } static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs) { struct device_domain_info *info = rb_entry(lhs, struct device_domain_info, node); u16 key = PCI_DEVID(info->bus, info->devfn); return device_rid_cmp_key(&key, rhs); } /* * Looks up an IOMMU-probed device using its source ID. * * Returns the pointer to the device if there is a match. Otherwise, * returns NULL. * * Note that this helper doesn't guarantee that the device won't be * released by the iommu subsystem after being returned. The caller * should use its own synchronization mechanism to avoid the device * being released during its use if its possibly the case. */ struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid) { struct device_domain_info *info = NULL; struct rb_node *node; unsigned long flags; spin_lock_irqsave(&iommu->device_rbtree_lock, flags); node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key); if (node) info = rb_entry(node, struct device_domain_info, node); spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags); return info ? info->dev : NULL; } static int device_rbtree_insert(struct intel_iommu *iommu, struct device_domain_info *info) { struct rb_node *curr; unsigned long flags; spin_lock_irqsave(&iommu->device_rbtree_lock, flags); curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp); spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags); if (WARN_ON(curr)) return -EEXIST; return 0; } static void device_rbtree_remove(struct device_domain_info *info) { struct intel_iommu *iommu = info->iommu; unsigned long flags; spin_lock_irqsave(&iommu->device_rbtree_lock, flags); rb_erase(&info->node, &iommu->device_rbtree); spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags); } struct dmar_rmrr_unit { struct list_head list; /* list of rmrr units */ struct acpi_dmar_header *hdr; /* ACPI header */ u64 base_address; /* reserved base address*/ u64 end_address; /* reserved end address */ struct dmar_dev_scope *devices; /* target devices */ int devices_cnt; /* target device count */ }; struct dmar_atsr_unit { struct list_head list; /* list of ATSR units */ struct acpi_dmar_header *hdr; /* ACPI header */ struct dmar_dev_scope *devices; /* target devices */ int devices_cnt; /* target device count */ u8 include_all:1; /* include all ports */ }; struct dmar_satc_unit { struct list_head list; /* list of SATC units */ struct acpi_dmar_header *hdr; /* ACPI header */ struct dmar_dev_scope *devices; /* target devices */ struct intel_iommu *iommu; /* the corresponding iommu */ int devices_cnt; /* target device count */ u8 atc_required:1; /* ATS is required */ }; static LIST_HEAD(dmar_atsr_units); static LIST_HEAD(dmar_rmrr_units); static LIST_HEAD(dmar_satc_units); #define for_each_rmrr_units(rmrr) \ list_for_each_entry(rmrr, &dmar_rmrr_units, list) static void intel_iommu_domain_free(struct iommu_domain *domain); int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON); int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON); int intel_iommu_enabled = 0; EXPORT_SYMBOL_GPL(intel_iommu_enabled); static int intel_iommu_superpage = 1; static int iommu_identity_mapping; static int iommu_skip_te_disable; static int disable_igfx_iommu; #define IDENTMAP_AZALIA 4 const struct iommu_ops intel_iommu_ops; static bool translation_pre_enabled(struct intel_iommu *iommu) { return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED); } static void clear_translation_pre_enabled(struct intel_iommu *iommu) { iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED; } static void init_translation_status(struct intel_iommu *iommu) { u32 gsts; gsts = readl(iommu->reg + DMAR_GSTS_REG); if (gsts & DMA_GSTS_TES) iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED; } static int __init intel_iommu_setup(char *str) { if (!str) return -EINVAL; while (*str) { if (!strncmp(str, "on", 2)) { dmar_disabled = 0; pr_info("IOMMU enabled\n"); } else if (!strncmp(str, "off", 3)) { dmar_disabled = 1; no_platform_optin = 1; pr_info("IOMMU disabled\n"); } else if (!strncmp(str, "igfx_off", 8)) { disable_igfx_iommu = 1; pr_info("Disable GFX device mapping\n"); } else if (!strncmp(str, "forcedac", 8)) { pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n"); iommu_dma_forcedac = true; } else if (!strncmp(str, "strict", 6)) { pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n"); iommu_set_dma_strict(); } else if (!strncmp(str, "sp_off", 6)) { pr_info("Disable supported super page\n"); intel_iommu_superpage = 0; } else if (!strncmp(str, "sm_on", 5)) { pr_info("Enable scalable mode if hardware supports\n"); intel_iommu_sm = 1; } else if (!strncmp(str, "sm_off", 6)) { pr_info("Scalable mode is disallowed\n"); intel_iommu_sm = 0; } else if (!strncmp(str, "tboot_noforce", 13)) { pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); intel_iommu_tboot_noforce = 1; } else { pr_notice("Unknown option - '%s'\n", str); } str += strcspn(str, ","); while (*str == ',') str++; } return 1; } __setup("intel_iommu=", intel_iommu_setup); /* * Calculate the Supported Adjusted Guest Address Widths of an IOMMU. * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of * the returned SAGAW. */ static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu) { unsigned long fl_sagaw, sl_sagaw; fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0); sl_sagaw = cap_sagaw(iommu->cap); /* Second level only. */ if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) return sl_sagaw; /* First level only. */ if (!ecap_slts(iommu->ecap)) return fl_sagaw; return fl_sagaw & sl_sagaw; } static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw) { unsigned long sagaw; int agaw; sagaw = __iommu_calculate_sagaw(iommu); for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) { if (test_bit(agaw, &sagaw)) break; } return agaw; } /* * Calculate max SAGAW for each iommu. */ int iommu_calculate_max_sagaw(struct intel_iommu *iommu) { return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH); } /* * calculate agaw for each iommu. * "SAGAW" may be different across iommus, use a default agaw, and * get a supported less agaw for iommus that don't support the default agaw. */ int iommu_calculate_agaw(struct intel_iommu *iommu) { return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH); } static bool iommu_paging_structure_coherency(struct intel_iommu *iommu) { return sm_supported(iommu) ? ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); } struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, u8 devfn, int alloc) { struct root_entry *root = &iommu->root_entry[bus]; struct context_entry *context; u64 *entry; /* * Except that the caller requested to allocate a new entry, * returning a copied context entry makes no sense. */ if (!alloc && context_copied(iommu, bus, devfn)) return NULL; entry = &root->lo; if (sm_supported(iommu)) { if (devfn >= 0x80) { devfn -= 0x80; entry = &root->hi; } devfn *= 2; } if (*entry & 1) context = phys_to_virt(*entry & VTD_PAGE_MASK); else { unsigned long phy_addr; if (!alloc) return NULL; context = iommu_alloc_pages_node_sz(iommu->node, GFP_ATOMIC, SZ_4K); if (!context) return NULL; __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE); phy_addr = virt_to_phys((void *)context); *entry = phy_addr | 1; __iommu_flush_cache(iommu, entry, sizeof(*entry)); } return &context[devfn]; } /** * is_downstream_to_pci_bridge - test if a device belongs to the PCI * sub-hierarchy of a candidate PCI-PCI bridge * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy * @bridge: the candidate PCI-PCI bridge * * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false. */ static bool is_downstream_to_pci_bridge(struct device *dev, struct device *bridge) { struct pci_dev *pdev, *pbridge; if (!dev_is_pci(dev) || !dev_is_pci(bridge)) return false; pdev = to_pci_dev(dev); pbridge = to_pci_dev(bridge); if (pbridge->subordinate && pbridge->subordinate->number <= pdev->bus->number && pbridge->subordinate->busn_res.end >= pdev->bus->number) return true; return false; } static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev) { struct dmar_drhd_unit *drhd; u32 vtbar; int rc; /* We know that this device on this chipset has its own IOMMU. * If we find it under a different IOMMU, then the BIOS is lying * to us. Hope that the IOMMU for this device is actually * disabled, and it needs no translation... */ rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar); if (rc) { /* "can't" happen */ dev_info(&pdev->dev, "failed to run vt-d quirk\n"); return false; } vtbar &= 0xffff0000; /* we know that the this iommu should be at offset 0xa000 from vtbar */ drhd = dmar_find_matched_drhd_unit(pdev); if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) { pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"); add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); return true; } return false; } static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev) { if (!iommu || iommu->drhd->ignored) return true; if (dev_is_pci(dev)) { struct pci_dev *pdev = to_pci_dev(dev); if (pdev->vendor == PCI_VENDOR_ID_INTEL && pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB && quirk_ioat_snb_local_iommu(pdev)) return true; } return false; } static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn) { struct dmar_drhd_unit *drhd = NULL; struct pci_dev *pdev = NULL; struct intel_iommu *iommu; struct device *tmp; u16 segment = 0; int i; if (!dev) return NULL; if (dev_is_pci(dev)) { struct pci_dev *pf_pdev; pdev = pci_real_dma_dev(to_pci_dev(dev)); /* VFs aren't listed in scope tables; we need to look up * the PF instead to find the IOMMU. */ pf_pdev = pci_physfn(pdev); dev = &pf_pdev->dev; segment = pci_domain_nr(pdev->bus); } else if (has_acpi_companion(dev)) dev = &ACPI_COMPANION(dev)->dev; rcu_read_lock(); for_each_iommu(iommu, drhd) { if (pdev && segment != drhd->segment) continue; for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, tmp) { if (tmp == dev) { /* For a VF use its original BDF# not that of the PF * which we used for the IOMMU lookup. Strictly speaking * we could do this for all PCI devices; we only need to * get the BDF# from the scope table for ACPI matches. */ if (pdev && pdev->is_virtfn) goto got_pdev; if (bus && devfn) { *bus = drhd->devices[i].bus; *devfn = drhd->devices[i].devfn; } goto out; } if (is_downstream_to_pci_bridge(dev, tmp)) goto got_pdev; } if (pdev && drhd->include_all) { got_pdev: if (bus && devfn) { *bus = pdev->bus->number; *devfn = pdev->devfn; } goto out; } } iommu = NULL; out: if (iommu_is_dummy(iommu, dev)) iommu = NULL; rcu_read_unlock(); return iommu; } static void free_context_table(struct intel_iommu *iommu) { struct context_entry *context; int i; if (!iommu->root_entry) return; for (i = 0; i < ROOT_ENTRY_NR; i++) { context = iommu_context_addr(iommu, i, 0, 0); if (context) iommu_free_pages(context); if (!sm_supported(iommu)) continue; context = iommu_context_addr(iommu, i, 0x80, 0); if (context) iommu_free_pages(context); } iommu_free_pages(iommu->root_entry); iommu->root_entry = NULL; } #ifdef CONFIG_DMAR_DEBUG static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, u8 bus, u8 devfn, struct dma_pte *parent, int level) { struct dma_pte *pte; int offset; while (1) { offset = pfn_level_offset(pfn, level); pte = &parent[offset]; pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val); if (!dma_pte_present(pte)) { pr_info("page table not present at level %d\n", level - 1); break; } if (level == 1 || dma_pte_superpage(pte)) break; parent = phys_to_virt(dma_pte_addr(pte)); level--; } } void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, unsigned long long addr, u32 pasid) { struct pasid_dir_entry *dir, *pde; struct pasid_entry *entries, *pte; struct context_entry *ctx_entry; struct root_entry *rt_entry; int i, dir_index, index, level; u8 devfn = source_id & 0xff; u8 bus = source_id >> 8; struct dma_pte *pgtable; pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr); /* root entry dump */ if (!iommu->root_entry) { pr_info("root table is not present\n"); return; } rt_entry = &iommu->root_entry[bus]; if (sm_supported(iommu)) pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n", rt_entry->hi, rt_entry->lo); else pr_info("root entry: 0x%016llx", rt_entry->lo); /* context entry dump */ ctx_entry = iommu_context_addr(iommu, bus, devfn, 0); if (!ctx_entry) { pr_info("context table is not present\n"); return; } pr_info("context entry: hi 0x%016llx, low 0x%016llx\n", ctx_entry->hi, ctx_entry->lo); /* legacy mode does not require PASID entries */ if (!sm_supported(iommu)) { if (!context_present(ctx_entry)) { pr_info("legacy mode page table is not present\n"); return; } level = agaw_to_level(ctx_entry->hi & 7); pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); goto pgtable_walk; } if (!context_present(ctx_entry)) { pr_info("pasid directory table is not present\n"); return; } /* get the pointer to pasid directory entry */ dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); /* For request-without-pasid, get the pasid from context entry */ if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID) pasid = IOMMU_NO_PASID; dir_index = pasid >> PASID_PDE_SHIFT; pde = &dir[dir_index]; pr_info("pasid dir entry: 0x%016llx\n", pde->val); /* get the pointer to the pasid table entry */ entries = get_pasid_table_from_pde(pde); if (!entries) { pr_info("pasid table is not present\n"); return; } index = pasid & PASID_PTE_MASK; pte = &entries[index]; for (i = 0; i < ARRAY_SIZE(pte->val); i++) pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]); if (!pasid_pte_is_present(pte)) { pr_info("scalable mode page table is not present\n"); return; } if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) { level = pte->val[2] & BIT_ULL(2) ? 5 : 4; pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK); } else { level = agaw_to_level((pte->val[0] >> 2) & 0x7); pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK); } pgtable_walk: pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level); } #endif /* iommu handling */ static int iommu_alloc_root_entry(struct intel_iommu *iommu) { struct root_entry *root; root = iommu_alloc_pages_node_sz(iommu->node, GFP_ATOMIC, SZ_4K); if (!root) { pr_err("Allocating root entry for %s failed\n", iommu->name); return -ENOMEM; } __iommu_flush_cache(iommu, root, ROOT_SIZE); iommu->root_entry = root; return 0; } static void iommu_set_root_entry(struct intel_iommu *iommu) { u64 addr; u32 sts; unsigned long flag; addr = virt_to_phys(iommu->root_entry); if (sm_supported(iommu)) addr |= DMA_RTADDR_SMT; raw_spin_lock_irqsave(&iommu->register_lock, flag); writeq(addr, iommu->reg + DMAR_RTADDR_REG); writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG); /* Make sure hardware complete it */ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl, (sts & DMA_GSTS_RTPS), sts); raw_spin_unlock_irqrestore(&iommu->register_lock, flag); /* * Hardware invalidates all DMA remapping hardware translation * caches as part of SRTP flow. */ if (cap_esrtps(iommu->cap)) return; iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); if (sm_supported(iommu)) qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0); iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); } void iommu_flush_write_buffer(struct intel_iommu *iommu) { u32 val; unsigned long flag; if (!rwbf_quirk && !cap_rwbf(iommu->cap)) return; raw_spin_lock_irqsave(&iommu->register_lock, flag); writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG); /* Make sure hardware complete it */ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl, (!(val & DMA_GSTS_WBFS)), val); raw_spin_unlock_irqrestore(&iommu->register_lock, flag); } /* return value determine if we need a write buffer flush */ static void __iommu_flush_context(struct intel_iommu *iommu, u16 did, u16 source_id, u8 function_mask, u64 type) { u64 val = 0; unsigned long flag; switch (type) { case DMA_CCMD_GLOBAL_INVL: val = DMA_CCMD_GLOBAL_INVL; break; case DMA_CCMD_DOMAIN_INVL: val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did); break; case DMA_CCMD_DEVICE_INVL: val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did) | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask); break; default: pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n", iommu->name, type); return; } val |= DMA_CCMD_ICC; raw_spin_lock_irqsave(&iommu->register_lock, flag); writeq(val, iommu->reg + DMAR_CCMD_REG); /* Make sure hardware complete it */ IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, readq, (!(val & DMA_CCMD_ICC)), val); raw_spin_unlock_irqrestore(&iommu->register_lock, flag); } void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, unsigned int size_order, u64 type) { int tlb_offset = ecap_iotlb_offset(iommu->ecap); u64 val = 0, val_iva = 0; unsigned long flag; switch (type) { case DMA_TLB_GLOBAL_FLUSH: /* global flush doesn't need set IVA_REG */ val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT; break; case DMA_TLB_DSI_FLUSH: val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); break; case DMA_TLB_PSI_FLUSH: val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); /* IH bit is passed in as part of address */ val_iva = size_order | addr; break; default: pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n", iommu->name, type); return; } if (cap_write_drain(iommu->cap)) val |= DMA_TLB_WRITE_DRAIN; raw_spin_lock_irqsave(&iommu->register_lock, flag); /* Note: Only uses first TLB reg currently */ if (val_iva) writeq(val_iva, iommu->reg + tlb_offset); writeq(val, iommu->reg + tlb_offset + 8); /* Make sure hardware complete it */ IOMMU_WAIT_OP(iommu, tlb_offset + 8, readq, (!(val & DMA_TLB_IVT)), val); raw_spin_unlock_irqrestore(&iommu->register_lock, flag); /* check IOTLB invalidation granularity */ if (DMA_TLB_IAIG(val) == 0) pr_err("Flush IOTLB failed\n"); if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type)) pr_debug("TLB flush request %Lx, actual %Lx\n", (unsigned long long)DMA_TLB_IIRG(type), (unsigned long long)DMA_TLB_IAIG(val)); } static struct device_domain_info * domain_lookup_dev_info(struct dmar_domain *domain, struct intel_iommu *iommu, u8 bus, u8 devfn) { struct device_domain_info *info; unsigned long flags; spin_lock_irqsave(&domain->lock, flags); list_for_each_entry(info, &domain->devices, link) { if (info->iommu == iommu && info->bus == bus && info->devfn == devfn) { spin_unlock_irqrestore(&domain->lock, flags); return info; } } spin_unlock_irqrestore(&domain->lock, flags); return NULL; } /* * The extra devTLB flush quirk impacts those QAT devices with PCI device * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device() * check because it applies only to the built-in QAT devices and it doesn't * grant additional privileges. */ #define BUGGY_QAT_DEVID_MASK 0x4940 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev) { if (pdev->vendor != PCI_VENDOR_ID_INTEL) return false; if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK) return false; return true; } static void iommu_enable_pci_ats(struct device_domain_info *info) { struct pci_dev *pdev; if (!info->ats_supported) return; pdev = to_pci_dev(info->dev); if (!pci_ats_page_aligned(pdev)) return; if (!pci_enable_ats(pdev, VTD_PAGE_SHIFT)) info->ats_enabled = 1; } static void iommu_disable_pci_ats(struct device_domain_info *info) { if (!info->ats_enabled) return; pci_disable_ats(to_pci_dev(info->dev)); info->ats_enabled = 0; } static void iommu_enable_pci_pri(struct device_domain_info *info) { struct pci_dev *pdev; if (!info->ats_enabled || !info->pri_supported) return; pdev = to_pci_dev(info->dev); /* PASID is required in PRG Response Message. */ if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev)) return; if (pci_reset_pri(pdev)) return; if (!pci_enable_pri(pdev, PRQ_DEPTH)) info->pri_enabled = 1; } static void iommu_disable_pci_pri(struct device_domain_info *info) { if (!info->pri_enabled) return; if (WARN_ON(info->iopf_refcount)) iopf_queue_remove_device(info->iommu->iopf_queue, info->dev); pci_disable_pri(to_pci_dev(info->dev)); info->pri_enabled = 0; } static void intel_flush_iotlb_all(struct iommu_domain *domain) { cache_tag_flush_all(to_dmar_domain(domain)); } static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) { u32 pmen; unsigned long flags; if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap)) return; raw_spin_lock_irqsave(&iommu->register_lock, flags); pmen = readl(iommu->reg + DMAR_PMEN_REG); pmen &= ~DMA_PMEN_EPM; writel(pmen, iommu->reg + DMAR_PMEN_REG); /* wait for the protected region status bit to clear */ IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG, readl, !(pmen & DMA_PMEN_PRS), pmen); raw_spin_unlock_irqrestore(&iommu->register_lock, flags); } static void iommu_enable_translation(struct intel_iommu *iommu) { u32 sts; unsigned long flags; raw_spin_lock_irqsave(&iommu->register_lock, flags); iommu->gcmd |= DMA_GCMD_TE; writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); /* Make sure hardware complete it */ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl, (sts & DMA_GSTS_TES), sts); raw_spin_unlock_irqrestore(&iommu->register_lock, flags); } static void iommu_disable_translation(struct intel_iommu *iommu) { u32 sts; unsigned long flag; if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated && (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap))) return; raw_spin_lock_irqsave(&iommu->register_lock, flag); iommu->gcmd &= ~DMA_GCMD_TE; writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); /* Make sure hardware complete it */ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl, (!(sts & DMA_GSTS_TES)), sts); raw_spin_unlock_irqrestore(&iommu->register_lock, flag); } static void disable_dmar_iommu(struct intel_iommu *iommu) { /* * All iommu domains must have been detached from the devices, * hence there should be no domain IDs in use. */ if (WARN_ON(!ida_is_empty(&iommu->domain_ida))) return; if (iommu->gcmd & DMA_GCMD_TE) iommu_disable_translation(iommu); } static void free_dmar_iommu(struct intel_iommu *iommu) { if (iommu->copied_tables) { bitmap_free(iommu->copied_tables); iommu->copied_tables = NULL; } /* free context mapping */ free_context_table(iommu); if (ecap_prs(iommu->ecap)) intel_iommu_finish_prq(iommu); } /* * Check and return whether first level is used by default for * DMA translation. */ static bool first_level_by_default(struct intel_iommu *iommu) { /* Only SL is available in legacy mode */ if (!sm_supported(iommu)) return false; /* Only level (either FL or SL) is available, just use it */ if (ecap_flts(iommu->ecap) ^ ecap_slts(iommu->ecap)) return ecap_flts(iommu->ecap); return true; } int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) { struct iommu_domain_info *info, *curr; int num, ret = -ENOSPC; if (domain->domain.type == IOMMU_DOMAIN_SVA) return 0; info = kzalloc_obj(*info); if (!info) return -ENOMEM; guard(mutex)(&iommu->did_lock); curr = xa_load(&domain->iommu_array, iommu->seq_id); if (curr) { curr->refcnt++; kfree(info); return 0; } num = ida_alloc_range(&iommu->domain_ida, IDA_START_DID, cap_ndoms(iommu->cap) - 1, GFP_KERNEL); if (num < 0) { pr_err("%s: No free domain ids\n", iommu->name); goto err_unlock; } info->refcnt = 1; info->did = num; info->iommu = iommu; curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id, NULL, info, GFP_KERNEL); if (curr) { ret = xa_err(curr) ? : -EBUSY; goto err_clear; } return 0; err_clear: ida_free(&iommu->domain_ida, info->did); err_unlock: kfree(info); return ret; } void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) { struct iommu_domain_info *info; if (domain->domain.type == IOMMU_DOMAIN_SVA) return; guard(mutex)(&iommu->did_lock); info = xa_load(&domain->iommu_array, iommu->seq_id); if (--info->refcnt == 0) { ida_free(&iommu->domain_ida, info->did); xa_erase(&domain->iommu_array, iommu->seq_id); kfree(info); } } /* * For kdump cases, old valid entries may be cached due to the * in-flight DMA and copied pgtable, but there is no unmapping * behaviour for them, thus we need an explicit cache flush for * the newly-mapped device. For kdump, at this point, the device * is supposed to finish reset at its driver probe stage, so no * in-flight DMA will exist, and we don't need to worry anymore * hereafter. */ static void copied_context_tear_down(struct intel_iommu *iommu, struct context_entry *context, u8 bus, u8 devfn) { u16 did_old; if (!context_copied(iommu, bus, devfn)) return; assert_spin_locked(&iommu->lock); did_old = context_domain_id(context); context_clear_entry(context); if (did_old < cap_ndoms(iommu->cap)) { iommu->flush.flush_context(iommu, did_old, PCI_DEVID(bus, devfn), DMA_CCMD_MASK_NOBIT, DMA_CCMD_DEVICE_INVL); iommu->flush.flush_iotlb(iommu, did_old, 0, 0, DMA_TLB_DSI_FLUSH); } clear_context_copied(iommu, bus, devfn); } /* * It's a non-present to present mapping. If hardware doesn't cache * non-present entry we only need to flush the write-buffer. If the * _does_ cache non-present entries, then it does so in the special * domain #0, which we have to flush: */ static void context_present_cache_flush(struct intel_iommu *iommu, u16 did, u8 bus, u8 devfn) { if (cap_caching_mode(iommu->cap)) { iommu->flush.flush_context(iommu, 0, PCI_DEVID(bus, devfn), DMA_CCMD_MASK_NOBIT, DMA_CCMD_DEVICE_INVL); iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); } else { iommu_flush_write_buffer(iommu); } } static int domain_context_mapping_one(struct dmar_domain *domain, struct intel_iommu *iommu, u8 bus, u8 devfn) { struct device_domain_info *info = domain_lookup_dev_info(domain, iommu, bus, devfn); u16 did = domain_id_iommu(domain, iommu); int translation = CONTEXT_TT_MULTI_LEVEL; struct pt_iommu_vtdss_hw_info pt_info; struct context_entry *context; int ret; if (WARN_ON(!intel_domain_is_ss_paging(domain))) return -EINVAL; pt_iommu_vtdss_hw_info(&domain->sspt, &pt_info); pr_debug("Set context mapping for %02x:%02x.%d\n", bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); spin_lock(&iommu->lock); ret = -ENOMEM; context = iommu_context_addr(iommu, bus, devfn, 1); if (!context) goto out_unlock; ret = 0; if (context_present(context) && !context_copied(iommu, bus, devfn)) goto out_unlock; copied_context_tear_down(iommu, context, bus, devfn); context_clear_entry(context); context_set_domain_id(context, did); if (info && info->ats_supported) translation = CONTEXT_TT_DEV_IOTLB; else translation = CONTEXT_TT_MULTI_LEVEL; context_set_address_root(context, pt_info.ssptptr); context_set_address_width(context, pt_info.aw); context_set_translation_type(context, translation); context_set_fault_enable(context); context_set_present(context); if (!ecap_coherent(iommu->ecap)) clflush_cache_range(context, sizeof(*context)); context_present_cache_flush(iommu, did, bus, devfn); ret = 0; out_unlock: spin_unlock(&iommu->lock); return ret; } static int domain_context_mapping_cb(struct pci_dev *pdev, u16 alias, void *opaque) { struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev); struct intel_iommu *iommu = info->iommu; struct dmar_domain *domain = opaque; return domain_context_mapping_one(domain, iommu, PCI_BUS_NUM(alias), alias & 0xff); } static int domain_context_mapping(struct dmar_domain *domain, struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; u8 bus = info->bus, devfn = info->devfn; int ret; if (!dev_is_pci(dev)) return domain_context_mapping_one(domain, iommu, bus, devfn); ret = pci_for_each_dma_alias(to_pci_dev(dev), domain_context_mapping_cb, domain); if (ret) return ret; iommu_enable_pci_ats(info); return 0; } static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn) { struct intel_iommu *iommu = info->iommu; struct context_entry *context; u16 did; spin_lock(&iommu->lock); context = iommu_context_addr(iommu, bus, devfn, 0); if (!context) { spin_unlock(&iommu->lock); return; } did = context_domain_id(context); context_clear_present(context); __iommu_flush_cache(iommu, context, sizeof(*context)); spin_unlock(&iommu->lock); intel_context_flush_no_pasid(info, context, did); context_clear_entry(context); __iommu_flush_cache(iommu, context, sizeof(*context)); } int __domain_setup_first_level(struct intel_iommu *iommu, struct device *dev, ioasid_t pasid, u16 did, phys_addr_t fsptptr, int flags, struct iommu_domain *old) { if (old) intel_pasid_tear_down_entry(iommu, dev, pasid, false); return intel_pasid_setup_first_level(iommu, dev, fsptptr, pasid, did, flags); } static int domain_setup_second_level(struct intel_iommu *iommu, struct dmar_domain *domain, struct device *dev, ioasid_t pasid, struct iommu_domain *old) { if (old) intel_pasid_tear_down_entry(iommu, dev, pasid, false); return intel_pasid_setup_second_level(iommu, domain, dev, pasid); } static int domain_setup_passthrough(struct intel_iommu *iommu, struct device *dev, ioasid_t pasid, struct iommu_domain *old) { if (old) intel_pasid_tear_down_entry(iommu, dev, pasid, false); return intel_pasid_setup_pass_through(iommu, dev, pasid); } static int domain_setup_first_level(struct intel_iommu *iommu, struct dmar_domain *domain, struct device *dev, u32 pasid, struct iommu_domain *old) { struct pt_iommu_x86_64_hw_info pt_info; unsigned int flags = 0; pt_iommu_x86_64_hw_info(&domain->fspt, &pt_info); if (WARN_ON(pt_info.levels != 4 && pt_info.levels != 5)) return -EINVAL; if (pt_info.levels == 5) flags |= PASID_FLAG_FL5LP; if (domain->force_snooping) flags |= PASID_FLAG_PAGE_SNOOP; if (!(domain->fspt.x86_64_pt.common.features & BIT(PT_FEAT_DMA_INCOHERENT))) flags |= PASID_FLAG_PWSNP; return __domain_setup_first_level(iommu, dev, pasid, domain_id_iommu(domain, iommu), pt_info.gcr3_pt, flags, old); } static int dmar_domain_attach_device(struct dmar_domain *domain, struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; unsigned long flags; int ret; ret = domain_attach_iommu(domain, iommu); if (ret) return ret; info->domain = domain; info->domain_attached = true; spin_lock_irqsave(&domain->lock, flags); list_add(&info->link, &domain->devices); spin_unlock_irqrestore(&domain->lock, flags); if (dev_is_real_dma_subdevice(dev)) return 0; if (!sm_supported(iommu)) ret = domain_context_mapping(domain, dev); else if (intel_domain_is_fs_paging(domain)) ret = domain_setup_first_level(iommu, domain, dev, IOMMU_NO_PASID, NULL); else if (intel_domain_is_ss_paging(domain)) ret = domain_setup_second_level(iommu, domain, dev, IOMMU_NO_PASID, NULL); else if (WARN_ON(true)) ret = -EINVAL; if (ret) goto out_block_translation; ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID); if (ret) goto out_block_translation; return 0; out_block_translation: device_block_translation(dev); return ret; } /** * device_rmrr_is_relaxable - Test whether the RMRR of this device * is relaxable (ie. is allowed to be not enforced under some conditions) * @dev: device handle * * We assume that PCI USB devices with RMRRs have them largely * for historical reasons and that the RMRR space is not actively used post * boot. This exclusion may change if vendors begin to abuse it. * * The same exception is made for graphics devices, with the requirement that * any use of the RMRR regions will be torn down before assigning the device * to a guest. * * Return: true if the RMRR is relaxable, false otherwise */ static bool device_rmrr_is_relaxable(struct device *dev) { struct pci_dev *pdev; if (!dev_is_pci(dev)) return false; pdev = to_pci_dev(dev); if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) return true; else return false; } static int device_def_domain_type(struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; /* * Hardware does not support the passthrough translation mode. * Always use a dynamaic mapping domain. */ if (!ecap_pass_through(iommu->ecap)) return IOMMU_DOMAIN_DMA; if (dev_is_pci(dev)) { struct pci_dev *pdev = to_pci_dev(dev); if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) return IOMMU_DOMAIN_IDENTITY; } return 0; } static void intel_iommu_init_qi(struct intel_iommu *iommu) { /* * Start from the sane iommu hardware state. * If the queued invalidation is already initialized by us * (for example, while enabling interrupt-remapping) then * we got the things already rolling from a sane state. */ if (!iommu->qi) { /* * Clear any previous faults. */ dmar_fault(-1, iommu); /* * Disable queued invalidation if supported and already enabled * before OS handover. */ dmar_disable_qi(iommu); } if (dmar_enable_qi(iommu)) { /* * Queued Invalidate not enabled, use Register Based Invalidate */ iommu->flush.flush_context = __iommu_flush_context; iommu->flush.flush_iotlb = __iommu_flush_iotlb; pr_info("%s: Using Register based invalidation\n", iommu->name); } else { iommu->flush.flush_context = qi_flush_context; iommu->flush.flush_iotlb = qi_flush_iotlb; pr_info("%s: Using Queued invalidation\n", iommu->name); } } static int copy_context_table(struct intel_iommu *iommu, struct root_entry *old_re, struct context_entry **tbl, int bus, bool ext) { int tbl_idx, pos = 0, idx, devfn, ret = 0, did; struct context_entry *new_ce = NULL, ce; struct context_entry *old_ce = NULL; struct root_entry re; phys_addr_t old_ce_phys; tbl_idx = ext ? bus * 2 : bus; memcpy(&re, old_re, sizeof(re)); for (devfn = 0; devfn < 256; devfn++) { /* First calculate the correct index */ idx = (ext ? devfn * 2 : devfn) % 256; if (idx == 0) { /* First save what we may have and clean up */ if (new_ce) { tbl[tbl_idx] = new_ce; __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE); pos = 1; } if (old_ce) memunmap(old_ce); ret = 0; if (devfn < 0x80) old_ce_phys = root_entry_lctp(&re); else old_ce_phys = root_entry_uctp(&re); if (!old_ce_phys) { if (ext && devfn == 0) { /* No LCTP, try UCTP */ devfn = 0x7f; continue; } else { goto out; } } ret = -ENOMEM; old_ce = memremap(old_ce_phys, PAGE_SIZE, MEMREMAP_WB); if (!old_ce) goto out; new_ce = iommu_alloc_pages_node_sz(iommu->node, GFP_KERNEL, SZ_4K); if (!new_ce) goto out_unmap; ret = 0; } /* Now copy the context entry */ memcpy(&ce, old_ce + idx, sizeof(ce)); if (!context_present(&ce)) continue; did = context_domain_id(&ce); if (did >= 0 && did < cap_ndoms(iommu->cap)) ida_alloc_range(&iommu->domain_ida, did, did, GFP_KERNEL); set_context_copied(iommu, bus, devfn); new_ce[idx] = ce; } tbl[tbl_idx + pos] = new_ce; __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE); out_unmap: memunmap(old_ce); out: return ret; } static int copy_translation_tables(struct intel_iommu *iommu) { struct context_entry **ctxt_tbls; struct root_entry *old_rt; phys_addr_t old_rt_phys; int ctxt_table_entries; u64 rtaddr_reg; int bus, ret; bool new_ext, ext; rtaddr_reg = readq(iommu->reg + DMAR_RTADDR_REG); ext = !!(rtaddr_reg & DMA_RTADDR_SMT); new_ext = !!sm_supported(iommu); /* * The RTT bit can only be changed when translation is disabled, * but disabling translation means to open a window for data * corruption. So bail out and don't copy anything if we would * have to change the bit. */ if (new_ext != ext) return -EINVAL; iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL); if (!iommu->copied_tables) return -ENOMEM; old_rt_phys = rtaddr_reg & VTD_PAGE_MASK; if (!old_rt_phys) return -EINVAL; old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB); if (!old_rt) return -ENOMEM; /* This is too big for the stack - allocate it from slab */ ctxt_table_entries = ext ? 512 : 256; ret = -ENOMEM; ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL); if (!ctxt_tbls) goto out_unmap; for (bus = 0; bus < 256; bus++) { ret = copy_context_table(iommu, &old_rt[bus], ctxt_tbls, bus, ext); if (ret) { pr_err("%s: Failed to copy context table for bus %d\n", iommu->name, bus); continue; } } spin_lock(&iommu->lock); /* Context tables are copied, now write them to the root_entry table */ for (bus = 0; bus < 256; bus++) { int idx = ext ? bus * 2 : bus; u64 val; if (ctxt_tbls[idx]) { val = virt_to_phys(ctxt_tbls[idx]) | 1; iommu->root_entry[bus].lo = val; } if (!ext || !ctxt_tbls[idx + 1]) continue; val = virt_to_phys(ctxt_tbls[idx + 1]) | 1; iommu->root_entry[bus].hi = val; } spin_unlock(&iommu->lock); kfree(ctxt_tbls); __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE); ret = 0; out_unmap: memunmap(old_rt); return ret; } static int __init init_dmars(void) { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu; int ret; for_each_iommu(iommu, drhd) { if (drhd->ignored) { iommu_disable_translation(iommu); continue; } /* * Find the max pasid size of all IOMMU's in the system. * We need to ensure the system pasid table is no bigger * than the smallest supported. */ if (pasid_supported(iommu)) { u32 temp = 2 << ecap_pss(iommu->ecap); intel_pasid_max_id = min_t(u32, temp, intel_pasid_max_id); } intel_iommu_init_qi(iommu); init_translation_status(iommu); if (translation_pre_enabled(iommu) && !is_kdump_kernel()) { iommu_disable_translation(iommu); clear_translation_pre_enabled(iommu); pr_warn("Translation was enabled for %s but we are not in kdump mode\n", iommu->name); } /* * TBD: * we could share the same root & context tables * among all IOMMU's. Need to Split it later. */ ret = iommu_alloc_root_entry(iommu); if (ret) goto free_iommu; if (translation_pre_enabled(iommu)) { pr_info("Translation already enabled - trying to copy translation structures\n"); ret = copy_translation_tables(iommu); if (ret) { /* * We found the IOMMU with translation * enabled - but failed to copy over the * old root-entry table. Try to proceed * by disabling translation now and * allocating a clean root-entry table. * This might cause DMAR faults, but * probably the dump will still succeed. */ pr_err("Failed to copy translation tables from previous kernel for %s\n", iommu->name); iommu_disable_translation(iommu); clear_translation_pre_enabled(iommu); } else { pr_info("Copied translation tables from previous kernel for %s\n", iommu->name); } } intel_svm_check(iommu); } /* * Now that qi is enabled on all iommus, set the root entry and flush * caches. This is required on some Intel X58 chipsets, otherwise the * flush_context function will loop forever and the boot hangs. */ for_each_active_iommu(iommu, drhd) { iommu_flush_write_buffer(iommu); iommu_set_root_entry(iommu); } check_tylersburg_isoch(); /* * for each drhd * enable fault log * global invalidate context cache * global invalidate iotlb * enable translation */ for_each_iommu(iommu, drhd) { if (drhd->ignored) { /* * we always have to disable PMRs or DMA may fail on * this device */ if (force_on) iommu_disable_protect_mem_regions(iommu); continue; } iommu_flush_write_buffer(iommu); if (ecap_prs(iommu->ecap)) { /* * Call dmar_alloc_hwirq() with dmar_global_lock held, * could cause possible lock race condition. */ up_write(&dmar_global_lock); ret = intel_iommu_enable_prq(iommu); down_write(&dmar_global_lock); if (ret) goto free_iommu; } ret = dmar_set_interrupt(iommu); if (ret) goto free_iommu; } return 0; free_iommu: for_each_active_iommu(iommu, drhd) { disable_dmar_iommu(iommu); free_dmar_iommu(iommu); } return ret; } static void __init init_no_remapping_devices(void) { struct dmar_drhd_unit *drhd; struct device *dev; int i; for_each_drhd_unit(drhd) { if (!drhd->include_all) { for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) break; /* ignore DMAR unit if no devices exist */ if (i == drhd->devices_cnt) drhd->ignored = 1; } } for_each_active_drhd_unit(drhd) { if (drhd->include_all) continue; for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev))) break; if (i < drhd->devices_cnt) continue; /* This IOMMU has *only* gfx devices. Either bypass it or set the gfx_mapped flag, as appropriate */ drhd->gfx_dedicated = 1; if (disable_igfx_iommu) drhd->ignored = 1; } } #ifdef CONFIG_SUSPEND static int init_iommu_hw(void) { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu = NULL; int ret; for_each_active_iommu(iommu, drhd) { if (iommu->qi) { ret = dmar_reenable_qi(iommu); if (ret) return ret; } } for_each_iommu(iommu, drhd) { if (drhd->ignored) { /* * we always have to disable PMRs or DMA may fail on * this device */ if (force_on) iommu_disable_protect_mem_regions(iommu); continue; } iommu_flush_write_buffer(iommu); iommu_set_root_entry(iommu); iommu_enable_translation(iommu); iommu_disable_protect_mem_regions(iommu); } return 0; } static void iommu_flush_all(void) { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu; for_each_active_iommu(iommu, drhd) { iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); } } static int iommu_suspend(void *data) { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu = NULL; unsigned long flag; iommu_flush_all(); for_each_active_iommu(iommu, drhd) { iommu_disable_translation(iommu); raw_spin_lock_irqsave(&iommu->register_lock, flag); iommu->iommu_state[SR_DMAR_FECTL_REG] = readl(iommu->reg + DMAR_FECTL_REG); iommu->iommu_state[SR_DMAR_FEDATA_REG] = readl(iommu->reg + DMAR_FEDATA_REG); iommu->iommu_state[SR_DMAR_FEADDR_REG] = readl(iommu->reg + DMAR_FEADDR_REG); iommu->iommu_state[SR_DMAR_FEUADDR_REG] = readl(iommu->reg + DMAR_FEUADDR_REG); raw_spin_unlock_irqrestore(&iommu->register_lock, flag); } return 0; } static void iommu_resume(void *data) { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu = NULL; unsigned long flag; if (init_iommu_hw()) { if (force_on) panic("tboot: IOMMU setup failed, DMAR can not resume!\n"); else WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); return; } for_each_active_iommu(iommu, drhd) { raw_spin_lock_irqsave(&iommu->register_lock, flag); writel(iommu->iommu_state[SR_DMAR_FECTL_REG], iommu->reg + DMAR_FECTL_REG); writel(iommu->iommu_state[SR_DMAR_FEDATA_REG], iommu->reg + DMAR_FEDATA_REG); writel(iommu->iommu_state[SR_DMAR_FEADDR_REG], iommu->reg + DMAR_FEADDR_REG); writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG], iommu->reg + DMAR_FEUADDR_REG); raw_spin_unlock_irqrestore(&iommu->register_lock, flag); } } static const struct syscore_ops iommu_syscore_ops = { .resume = iommu_resume, .suspend = iommu_suspend, }; static struct syscore iommu_syscore = { .ops = &iommu_syscore_ops, }; static void __init init_iommu_pm_ops(void) { register_syscore(&iommu_syscore); } #else static inline void init_iommu_pm_ops(void) {} #endif /* CONFIG_PM */ static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) { if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) || !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) || rmrr->end_address <= rmrr->base_address || arch_rmrr_sanity_check(rmrr)) return -EINVAL; return 0; } int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg) { struct acpi_dmar_reserved_memory *rmrr; struct dmar_rmrr_unit *rmrru; rmrr = (struct acpi_dmar_reserved_memory *)header; if (rmrr_sanity_check(rmrr)) { pr_warn(FW_BUG "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n" "BIOS vendor: %s; Ver: %s; Product Version: %s\n", rmrr->base_address, rmrr->end_address, dmi_get_system_info(DMI_BIOS_VENDOR), dmi_get_system_info(DMI_BIOS_VERSION), dmi_get_system_info(DMI_PRODUCT_VERSION)); add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); } rmrru = kzalloc_obj(*rmrru); if (!rmrru) goto out; rmrru->hdr = header; rmrru->base_address = rmrr->base_address; rmrru->end_address = rmrr->end_address; rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1), ((void *)rmrr) + rmrr->header.length, &rmrru->devices_cnt); if (rmrru->devices_cnt && rmrru->devices == NULL) goto free_rmrru; list_add(&rmrru->list, &dmar_rmrr_units); return 0; free_rmrru: kfree(rmrru); out: return -ENOMEM; } static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr) { struct dmar_atsr_unit *atsru; struct acpi_dmar_atsr *tmp; list_for_each_entry_rcu(atsru, &dmar_atsr_units, list, dmar_rcu_check()) { tmp = (struct acpi_dmar_atsr *)atsru->hdr; if (atsr->segment != tmp->segment) continue; if (atsr->header.length != tmp->header.length) continue; if (memcmp(atsr, tmp, atsr->header.length) == 0) return atsru; } return NULL; } int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg) { struct acpi_dmar_atsr *atsr; struct dmar_atsr_unit *atsru; if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) return 0; atsr = container_of(hdr, struct acpi_dmar_atsr, header); atsru = dmar_find_atsr(atsr); if (atsru) return 0; atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL); if (!atsru) return -ENOMEM; /* * If memory is allocated from slab by ACPI _DSM method, we need to * copy the memory content because the memory buffer will be freed * on return. */ atsru->hdr = (void *)(atsru + 1); memcpy(atsru->hdr, hdr, hdr->length); atsru->include_all = atsr->flags & 0x1; if (!atsru->include_all) { atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1), (void *)atsr + atsr->header.length, &atsru->devices_cnt); if (atsru->devices_cnt && atsru->devices == NULL) { kfree(atsru); return -ENOMEM; } } list_add_rcu(&atsru->list, &dmar_atsr_units); return 0; } static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru) { dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt); kfree(atsru); } int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg) { struct acpi_dmar_atsr *atsr; struct dmar_atsr_unit *atsru; atsr = container_of(hdr, struct acpi_dmar_atsr, header); atsru = dmar_find_atsr(atsr); if (atsru) { list_del_rcu(&atsru->list); synchronize_rcu(); intel_iommu_free_atsr(atsru); } return 0; } int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg) { int i; struct device *dev; struct acpi_dmar_atsr *atsr; struct dmar_atsr_unit *atsru; atsr = container_of(hdr, struct acpi_dmar_atsr, header); atsru = dmar_find_atsr(atsr); if (!atsru) return 0; if (!atsru->include_all && atsru->devices && atsru->devices_cnt) { for_each_active_dev_scope(atsru->devices, atsru->devices_cnt, i, dev) return -EBUSY; } return 0; } static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc) { struct dmar_satc_unit *satcu; struct acpi_dmar_satc *tmp; list_for_each_entry_rcu(satcu, &dmar_satc_units, list, dmar_rcu_check()) { tmp = (struct acpi_dmar_satc *)satcu->hdr; if (satc->segment != tmp->segment) continue; if (satc->header.length != tmp->header.length) continue; if (memcmp(satc, tmp, satc->header.length) == 0) return satcu; } return NULL; } int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg) { struct acpi_dmar_satc *satc; struct dmar_satc_unit *satcu; if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) return 0; satc = container_of(hdr, struct acpi_dmar_satc, header); satcu = dmar_find_satc(satc); if (satcu) return 0; satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL); if (!satcu) return -ENOMEM; satcu->hdr = (void *)(satcu + 1); memcpy(satcu->hdr, hdr, hdr->length); satcu->atc_required = satc->flags & 0x1; satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1), (void *)satc + satc->header.length, &satcu->devices_cnt); if (satcu->devices_cnt && !satcu->devices) { kfree(satcu); return -ENOMEM; } list_add_rcu(&satcu->list, &dmar_satc_units); return 0; } static int intel_iommu_add(struct dmar_drhd_unit *dmaru) { struct intel_iommu *iommu = dmaru->iommu; int ret; /* * Disable translation if already enabled prior to OS handover. */ if (iommu->gcmd & DMA_GCMD_TE) iommu_disable_translation(iommu); ret = iommu_alloc_root_entry(iommu); if (ret) goto out; intel_svm_check(iommu); if (dmaru->ignored) { /* * we always have to disable PMRs or DMA may fail on this device */ if (force_on) iommu_disable_protect_mem_regions(iommu); return 0; } intel_iommu_init_qi(iommu); iommu_flush_write_buffer(iommu); if (ecap_prs(iommu->ecap)) { ret = intel_iommu_enable_prq(iommu); if (ret) goto disable_iommu; } ret = dmar_set_interrupt(iommu); if (ret) goto disable_iommu; iommu_set_root_entry(iommu); iommu_enable_translation(iommu); iommu_disable_protect_mem_regions(iommu); return 0; disable_iommu: disable_dmar_iommu(iommu); out: free_dmar_iommu(iommu); return ret; } int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert) { int ret = 0; struct intel_iommu *iommu = dmaru->iommu; if (!intel_iommu_enabled) return 0; if (iommu == NULL) return -EINVAL; if (insert) { ret = intel_iommu_add(dmaru); } else { disable_dmar_iommu(iommu); free_dmar_iommu(iommu); } return ret; } static void intel_iommu_free_dmars(void) { struct dmar_rmrr_unit *rmrru, *rmrr_n; struct dmar_atsr_unit *atsru, *atsr_n; struct dmar_satc_unit *satcu, *satc_n; list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) { list_del(&rmrru->list); dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt); kfree(rmrru); } list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) { list_del(&atsru->list); intel_iommu_free_atsr(atsru); } list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) { list_del(&satcu->list); dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt); kfree(satcu); } } static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev) { struct dmar_satc_unit *satcu; struct acpi_dmar_satc *satc; struct device *tmp; int i; rcu_read_lock(); list_for_each_entry_rcu(satcu, &dmar_satc_units, list) { satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); if (satc->segment != pci_domain_nr(dev->bus)) continue; for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp) if (to_pci_dev(tmp) == dev) goto out; } satcu = NULL; out: rcu_read_unlock(); return satcu; } static bool dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu) { struct pci_dev *bridge = NULL; struct dmar_atsr_unit *atsru; struct dmar_satc_unit *satcu; struct acpi_dmar_atsr *atsr; bool supported = true; struct pci_bus *bus; struct device *tmp; int i; dev = pci_physfn(dev); satcu = dmar_find_matched_satc_unit(dev); if (satcu) /* * This device supports ATS as it is in SATC table. * When IOMMU is in legacy mode, enabling ATS is done * automatically by HW for the device that requires * ATS, hence OS should not enable this device ATS * to avoid duplicated TLB invalidation. */ return !(satcu->atc_required && !sm_supported(iommu)); for (bus = dev->bus; bus; bus = bus->parent) { bridge = bus->self; /* If it's an integrated device, allow ATS */ if (!bridge) return true; /* Connected via non-PCIe: no ATS */ if (!pci_is_pcie(bridge) || pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) return false; /* If we found the root port, look it up in the ATSR */ if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) break; } rcu_read_lock(); list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) { atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); if (atsr->segment != pci_domain_nr(dev->bus)) continue; for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp) if (tmp == &bridge->dev) goto out; if (atsru->include_all) goto out; } supported = false; out: rcu_read_unlock(); return supported; } int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) { int ret; struct dmar_rmrr_unit *rmrru; struct dmar_atsr_unit *atsru; struct dmar_satc_unit *satcu; struct acpi_dmar_atsr *atsr; struct acpi_dmar_reserved_memory *rmrr; struct acpi_dmar_satc *satc; if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) return 0; list_for_each_entry(rmrru, &dmar_rmrr_units, list) { rmrr = container_of(rmrru->hdr, struct acpi_dmar_reserved_memory, header); if (info->event == BUS_NOTIFY_ADD_DEVICE) { ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1), ((void *)rmrr) + rmrr->header.length, rmrr->segment, rmrru->devices, rmrru->devices_cnt); if (ret < 0) return ret; } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { dmar_remove_dev_scope(info, rmrr->segment, rmrru->devices, rmrru->devices_cnt); } } list_for_each_entry(atsru, &dmar_atsr_units, list) { if (atsru->include_all) continue; atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header); if (info->event == BUS_NOTIFY_ADD_DEVICE) { ret = dmar_insert_dev_scope(info, (void *)(atsr + 1), (void *)atsr + atsr->header.length, atsr->segment, atsru->devices, atsru->devices_cnt); if (ret > 0) break; else if (ret < 0) return ret; } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { if (dmar_remove_dev_scope(info, atsr->segment, atsru->devices, atsru->devices_cnt)) break; } } list_for_each_entry(satcu, &dmar_satc_units, list) { satc = container_of(satcu->hdr, struct acpi_dmar_satc, header); if (info->event == BUS_NOTIFY_ADD_DEVICE) { ret = dmar_insert_dev_scope(info, (void *)(satc + 1), (void *)satc + satc->header.length, satc->segment, satcu->devices, satcu->devices_cnt); if (ret > 0) break; else if (ret < 0) return ret; } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { if (dmar_remove_dev_scope(info, satc->segment, satcu->devices, satcu->devices_cnt)) break; } } return 0; } static void intel_disable_iommus(void) { struct intel_iommu *iommu = NULL; struct dmar_drhd_unit *drhd; for_each_iommu(iommu, drhd) iommu_disable_translation(iommu); } void intel_iommu_shutdown(void) { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu = NULL; if (no_iommu || dmar_disabled) return; /* * All other CPUs were brought down, hotplug interrupts were disabled, * no lock and RCU checking needed anymore */ list_for_each_entry(drhd, &dmar_drhd_units, list) { iommu = drhd->iommu; /* Disable PMRs explicitly here. */ iommu_disable_protect_mem_regions(iommu); /* Make sure the IOMMUs are switched off */ iommu_disable_translation(iommu); } } static struct intel_iommu *dev_to_intel_iommu(struct device *dev) { struct iommu_device *iommu_dev = dev_to_iommu_device(dev); return container_of(iommu_dev, struct intel_iommu, iommu); } static ssize_t version_show(struct device *dev, struct device_attribute *attr, char *buf) { struct intel_iommu *iommu = dev_to_intel_iommu(dev); u32 ver = readl(iommu->reg + DMAR_VER_REG); return sysfs_emit(buf, "%d:%d\n", DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); } static DEVICE_ATTR_RO(version); static ssize_t address_show(struct device *dev, struct device_attribute *attr, char *buf) { struct intel_iommu *iommu = dev_to_intel_iommu(dev); return sysfs_emit(buf, "%llx\n", iommu->reg_phys); } static DEVICE_ATTR_RO(address); static ssize_t cap_show(struct device *dev, struct device_attribute *attr, char *buf) { struct intel_iommu *iommu = dev_to_intel_iommu(dev); return sysfs_emit(buf, "%llx\n", iommu->cap); } static DEVICE_ATTR_RO(cap); static ssize_t ecap_show(struct device *dev, struct device_attribute *attr, char *buf) { struct intel_iommu *iommu = dev_to_intel_iommu(dev); return sysfs_emit(buf, "%llx\n", iommu->ecap); } static DEVICE_ATTR_RO(ecap); static ssize_t domains_supported_show(struct device *dev, struct device_attribute *attr, char *buf) { struct intel_iommu *iommu = dev_to_intel_iommu(dev); return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap)); } static DEVICE_ATTR_RO(domains_supported); static ssize_t domains_used_show(struct device *dev, struct device_attribute *attr, char *buf) { struct intel_iommu *iommu = dev_to_intel_iommu(dev); unsigned int count = 0; int id; for (id = 0; id < cap_ndoms(iommu->cap); id++) if (ida_exists(&iommu->domain_ida, id)) count++; return sysfs_emit(buf, "%d\n", count); } static DEVICE_ATTR_RO(domains_used); static struct attribute *intel_iommu_attrs[] = { &dev_attr_version.attr, &dev_attr_address.attr, &dev_attr_cap.attr, &dev_attr_ecap.attr, &dev_attr_domains_supported.attr, &dev_attr_domains_used.attr, NULL, }; static struct attribute_group intel_iommu_group = { .name = "intel-iommu", .attrs = intel_iommu_attrs, }; const struct attribute_group *intel_iommu_groups[] = { &intel_iommu_group, NULL, }; static bool has_external_pci(void) { struct pci_dev *pdev = NULL; for_each_pci_dev(pdev) if (pdev->external_facing) { pci_dev_put(pdev); return true; } return false; } static int __init platform_optin_force_iommu(void) { if (!dmar_platform_optin() || no_platform_optin || !has_external_pci()) return 0; if (no_iommu || dmar_disabled) pr_info("Intel-IOMMU force enabled due to platform opt in\n"); /* * If Intel-IOMMU is disabled by default, we will apply identity * map for all devices except those marked as being untrusted. */ if (dmar_disabled) iommu_set_default_passthrough(false); dmar_disabled = 0; no_iommu = 0; return 1; } static int __init probe_acpi_namespace_devices(void) { struct dmar_drhd_unit *drhd; /* To avoid a -Wunused-but-set-variable warning. */ struct intel_iommu *iommu __maybe_unused; struct device *dev; int i, ret = 0; for_each_active_iommu(iommu, drhd) { for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) { struct acpi_device_physical_node *pn; struct acpi_device *adev; if (dev->bus != &acpi_bus_type) continue; up_read(&dmar_global_lock); adev = to_acpi_device(dev); mutex_lock(&adev->physical_node_lock); list_for_each_entry(pn, &adev->physical_node_list, node) { ret = iommu_probe_device(pn->dev); if (ret) break; } mutex_unlock(&adev->physical_node_lock); down_read(&dmar_global_lock); if (ret) return ret; } } return 0; } static __init int tboot_force_iommu(void) { if (!tboot_enabled()) return 0; if (no_iommu || dmar_disabled) pr_warn("Forcing Intel-IOMMU to enabled\n"); dmar_disabled = 0; no_iommu = 0; return 1; } int __init intel_iommu_init(void) { int ret = -ENODEV; struct dmar_drhd_unit *drhd; struct intel_iommu *iommu; /* * Intel IOMMU is required for a TXT/tboot launch or platform * opt in, so enforce that. */ force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) || platform_optin_force_iommu(); down_write(&dmar_global_lock); if (dmar_table_init()) { if (force_on) panic("tboot: Failed to initialize DMAR table\n"); goto out_free_dmar; } if (dmar_dev_scope_init() < 0) { if (force_on) panic("tboot: Failed to initialize DMAR device scope\n"); goto out_free_dmar; } up_write(&dmar_global_lock); /* * The bus notifier takes the dmar_global_lock, so lockdep will * complain later when we register it under the lock. */ dmar_register_bus_notifier(); down_write(&dmar_global_lock); if (!no_iommu) intel_iommu_debugfs_init(); if (no_iommu || dmar_disabled) { /* * We exit the function here to ensure IOMMU's remapping and * mempool aren't setup, which means that the IOMMU's PMRs * won't be disabled via the call to init_dmars(). So disable * it explicitly here. The PMRs were setup by tboot prior to * calling SENTER, but the kernel is expected to reset/tear * down the PMRs. */ if (intel_iommu_tboot_noforce) { for_each_iommu(iommu, drhd) iommu_disable_protect_mem_regions(iommu); } /* * Make sure the IOMMUs are switched off, even when we * boot into a kexec kernel and the previous kernel left * them enabled */ intel_disable_iommus(); goto out_free_dmar; } if (list_empty(&dmar_rmrr_units)) pr_info("No RMRR found\n"); if (list_empty(&dmar_atsr_units)) pr_info("No ATSR found\n"); if (list_empty(&dmar_satc_units)) pr_info("No SATC found\n"); init_no_remapping_devices(); ret = init_dmars(); if (ret) { if (force_on) panic("tboot: Failed to initialize DMARs\n"); pr_err("Initialization failed\n"); goto out_free_dmar; } up_write(&dmar_global_lock); init_iommu_pm_ops(); down_read(&dmar_global_lock); for_each_active_iommu(iommu, drhd) { /* * The flush queue implementation does not perform * page-selective invalidations that are required for efficient * TLB flushes in virtual environments. The benefit of batching * is likely to be much lower than the overhead of synchronizing * the virtual and physical IOMMU page-tables. */ if (cap_caching_mode(iommu->cap) && !first_level_by_default(iommu)) { pr_info_once("IOMMU batching disallowed due to virtualization\n"); iommu_set_dma_strict(); } iommu_device_sysfs_add(&iommu->iommu, NULL, intel_iommu_groups, "%s", iommu->name); /* * The iommu device probe is protected by the iommu_probe_device_lock. * Release the dmar_global_lock before entering the device probe path * to avoid unnecessary lock order splat. */ up_read(&dmar_global_lock); iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL); down_read(&dmar_global_lock); iommu_pmu_register(iommu); } if (probe_acpi_namespace_devices()) pr_warn("ACPI name space devices didn't probe correctly\n"); /* Finally, we enable the DMA remapping hardware. */ for_each_iommu(iommu, drhd) { if (!drhd->ignored && !translation_pre_enabled(iommu)) iommu_enable_translation(iommu); iommu_disable_protect_mem_regions(iommu); } up_read(&dmar_global_lock); pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); intel_iommu_enabled = 1; return 0; out_free_dmar: intel_iommu_free_dmars(); up_write(&dmar_global_lock); return ret; } static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) { struct device_domain_info *info = opaque; domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff); return 0; } /* * NB - intel-iommu lacks any sort of reference counting for the users of * dependent devices. If multiple endpoints have intersecting dependent * devices, unbinding the driver from any one of them will possibly leave * the others unable to operate. */ static void domain_context_clear(struct device_domain_info *info) { if (!dev_is_pci(info->dev)) { domain_context_clear_one(info, info->bus, info->devfn); return; } pci_for_each_dma_alias(to_pci_dev(info->dev), &domain_context_clear_one_cb, info); iommu_disable_pci_ats(info); } /* * Clear the page table pointer in context or pasid table entries so that * all DMA requests without PASID from the device are blocked. If the page * table has been set, clean up the data structures. */ void device_block_translation(struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; unsigned long flags; /* Device in DMA blocking state. Noting to do. */ if (!info->domain_attached) return; if (info->domain) cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID); if (!dev_is_real_dma_subdevice(dev)) { if (sm_supported(iommu)) intel_pasid_tear_down_entry(iommu, dev, IOMMU_NO_PASID, false); else domain_context_clear(info); } /* Device now in DMA blocking state. */ info->domain_attached = false; if (!info->domain) return; spin_lock_irqsave(&info->domain->lock, flags); list_del(&info->link); spin_unlock_irqrestore(&info->domain->lock, flags); domain_detach_iommu(info->domain, iommu); info->domain = NULL; } static int blocking_domain_attach_dev(struct iommu_domain *domain, struct device *dev, struct iommu_domain *old) { struct device_domain_info *info = dev_iommu_priv_get(dev); iopf_for_domain_remove(info->domain ? &info->domain->domain : NULL, dev); device_block_translation(dev); return 0; } static int blocking_domain_set_dev_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid, struct iommu_domain *old); static struct iommu_domain blocking_domain = { .type = IOMMU_DOMAIN_BLOCKED, .ops = &(const struct iommu_domain_ops) { .attach_dev = blocking_domain_attach_dev, .set_dev_pasid = blocking_domain_set_dev_pasid, } }; static struct dmar_domain *paging_domain_alloc(void) { struct dmar_domain *domain; domain = kzalloc_obj(*domain); if (!domain) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&domain->devices); INIT_LIST_HEAD(&domain->dev_pasids); INIT_LIST_HEAD(&domain->cache_tags); spin_lock_init(&domain->lock); spin_lock_init(&domain->cache_lock); xa_init(&domain->iommu_array); INIT_LIST_HEAD(&domain->s1_domains); spin_lock_init(&domain->s1_lock); return domain; } static unsigned int compute_vasz_lg2_fs(struct intel_iommu *iommu, unsigned int *top_level) { unsigned int mgaw = cap_mgaw(iommu->cap); /* * Spec 3.6 First-Stage Translation: * * Software must limit addresses to less than the minimum of MGAW * and the lower canonical address width implied by FSPM (i.e., * 47-bit when FSPM is 4-level and 56-bit when FSPM is 5-level). */ if (mgaw > 48 && cap_fl5lp_support(iommu->cap)) { *top_level = 4; return min(57, mgaw); } /* Four level is always supported */ *top_level = 3; return min(48, mgaw); } static struct iommu_domain * intel_iommu_domain_alloc_first_stage(struct device *dev, struct intel_iommu *iommu, u32 flags) { struct pt_iommu_x86_64_cfg cfg = {}; struct dmar_domain *dmar_domain; int ret; if (flags & ~IOMMU_HWPT_ALLOC_PASID) return ERR_PTR(-EOPNOTSUPP); /* Only SL is available in legacy mode */ if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) return ERR_PTR(-EOPNOTSUPP); dmar_domain = paging_domain_alloc(); if (IS_ERR(dmar_domain)) return ERR_CAST(dmar_domain); cfg.common.hw_max_vasz_lg2 = compute_vasz_lg2_fs(iommu, &cfg.top_level); cfg.common.hw_max_oasz_lg2 = 52; cfg.common.features = BIT(PT_FEAT_SIGN_EXTEND) | BIT(PT_FEAT_FLUSH_RANGE); /* First stage always uses scalable mode */ if (!ecap_smpwc(iommu->ecap)) cfg.common.features |= BIT(PT_FEAT_DMA_INCOHERENT); dmar_domain->iommu.iommu_device = dev; dmar_domain->iommu.nid = dev_to_node(dev); dmar_domain->domain.ops = &intel_fs_paging_domain_ops; /* * iotlb sync for map is only needed for legacy implementations that * explicitly require flushing internal write buffers to ensure memory * coherence. */ if (rwbf_required(iommu)) dmar_domain->iotlb_sync_map = true; ret = pt_iommu_x86_64_init(&dmar_domain->fspt, &cfg, GFP_KERNEL); if (ret) { kfree(dmar_domain); return ERR_PTR(ret); } if (!cap_fl1gp_support(iommu->cap)) dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_1G; if (!intel_iommu_superpage) dmar_domain->domain.pgsize_bitmap = SZ_4K; return &dmar_domain->domain; } static unsigned int compute_vasz_lg2_ss(struct intel_iommu *iommu, unsigned int *top_level) { unsigned int sagaw = cap_sagaw(iommu->cap); unsigned int mgaw = cap_mgaw(iommu->cap); /* * Find the largest table size that both the mgaw and sagaw support. * This sets the valid range of IOVA and the top starting level. * Some HW may only support a 4 or 5 level walk but must limit IOVA to * 3 levels. */ if (mgaw > 48 && sagaw >= BIT(3)) { *top_level = 4; return min(57, mgaw); } else if (mgaw > 39 && sagaw >= BIT(2)) { *top_level = 3 + ffs(sagaw >> 3); return min(48, mgaw); } else if (mgaw > 30 && sagaw >= BIT(1)) { *top_level = 2 + ffs(sagaw >> 2); return min(39, mgaw); } return 0; } static const struct iommu_dirty_ops intel_second_stage_dirty_ops = { IOMMU_PT_DIRTY_OPS(vtdss), .set_dirty_tracking = intel_iommu_set_dirty_tracking, }; static struct iommu_domain * intel_iommu_domain_alloc_second_stage(struct device *dev, struct intel_iommu *iommu, u32 flags) { struct pt_iommu_vtdss_cfg cfg = {}; struct dmar_domain *dmar_domain; unsigned int sslps; int ret; if (flags & (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING | IOMMU_HWPT_ALLOC_PASID))) return ERR_PTR(-EOPNOTSUPP); if (((flags & IOMMU_HWPT_ALLOC_NEST_PARENT) && !nested_supported(iommu)) || ((flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) && !ssads_supported(iommu))) return ERR_PTR(-EOPNOTSUPP); /* Legacy mode always supports second stage */ if (sm_supported(iommu) && !ecap_slts(iommu->ecap)) return ERR_PTR(-EOPNOTSUPP); dmar_domain = paging_domain_alloc(); if (IS_ERR(dmar_domain)) return ERR_CAST(dmar_domain); cfg.common.hw_max_vasz_lg2 = compute_vasz_lg2_ss(iommu, &cfg.top_level); cfg.common.hw_max_oasz_lg2 = 52; cfg.common.features = BIT(PT_FEAT_FLUSH_RANGE); /* * Read-only mapping is disallowed on the domain which serves as the * parent in a nested configuration, due to HW errata * (ERRATA_772415_SPR17) */ if (flags & IOMMU_HWPT_ALLOC_NEST_PARENT) cfg.common.features |= BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE); if (!iommu_paging_structure_coherency(iommu)) cfg.common.features |= BIT(PT_FEAT_DMA_INCOHERENT); dmar_domain->iommu.iommu_device = dev; dmar_domain->iommu.nid = dev_to_node(dev); dmar_domain->domain.ops = &intel_ss_paging_domain_ops; dmar_domain->nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) dmar_domain->domain.dirty_ops = &intel_second_stage_dirty_ops; ret = pt_iommu_vtdss_init(&dmar_domain->sspt, &cfg, GFP_KERNEL); if (ret) { kfree(dmar_domain); return ERR_PTR(ret); } /* Adjust the supported page sizes to HW capability */ sslps = cap_super_page_val(iommu->cap); if (!(sslps & BIT(0))) dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_2M; if (!(sslps & BIT(1))) dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_1G; if (!intel_iommu_superpage) dmar_domain->domain.pgsize_bitmap = SZ_4K; /* * Besides the internal write buffer flush, the caching mode used for * legacy nested translation (which utilizes shadowing page tables) * also requires iotlb sync on map. */ if (rwbf_required(iommu) || cap_caching_mode(iommu->cap)) dmar_domain->iotlb_sync_map = true; return &dmar_domain->domain; } static struct iommu_domain * intel_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, const struct iommu_user_data *user_data) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; struct iommu_domain *domain; if (user_data) return ERR_PTR(-EOPNOTSUPP); /* Prefer first stage if possible by default. */ domain = intel_iommu_domain_alloc_first_stage(dev, iommu, flags); if (domain != ERR_PTR(-EOPNOTSUPP)) return domain; return intel_iommu_domain_alloc_second_stage(dev, iommu, flags); } static void intel_iommu_domain_free(struct iommu_domain *domain) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); if (WARN_ON(dmar_domain->nested_parent && !list_empty(&dmar_domain->s1_domains))) return; if (WARN_ON(!list_empty(&dmar_domain->devices))) return; pt_iommu_deinit(&dmar_domain->iommu); kfree(dmar_domain->qi_batch); kfree(dmar_domain); } static int paging_domain_compatible_first_stage(struct dmar_domain *dmar_domain, struct intel_iommu *iommu) { if (WARN_ON(dmar_domain->domain.dirty_ops || dmar_domain->nested_parent)) return -EINVAL; /* Only SL is available in legacy mode */ if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) return -EINVAL; if (!ecap_smpwc(iommu->ecap) && !(dmar_domain->fspt.x86_64_pt.common.features & BIT(PT_FEAT_DMA_INCOHERENT))) return -EINVAL; /* Supports the number of table levels */ if (!cap_fl5lp_support(iommu->cap) && dmar_domain->fspt.x86_64_pt.common.max_vasz_lg2 > 48) return -EINVAL; /* Same page size support */ if (!cap_fl1gp_support(iommu->cap) && (dmar_domain->domain.pgsize_bitmap & SZ_1G)) return -EINVAL; /* iotlb sync on map requirement */ if ((rwbf_required(iommu)) && !dmar_domain->iotlb_sync_map) return -EINVAL; return 0; } static int paging_domain_compatible_second_stage(struct dmar_domain *dmar_domain, struct intel_iommu *iommu) { unsigned int vasz_lg2 = dmar_domain->sspt.vtdss_pt.common.max_vasz_lg2; unsigned int sslps = cap_super_page_val(iommu->cap); struct pt_iommu_vtdss_hw_info pt_info; pt_iommu_vtdss_hw_info(&dmar_domain->sspt, &pt_info); if (dmar_domain->domain.dirty_ops && !ssads_supported(iommu)) return -EINVAL; if (dmar_domain->nested_parent && !nested_supported(iommu)) return -EINVAL; /* Legacy mode always supports second stage */ if (sm_supported(iommu) && !ecap_slts(iommu->ecap)) return -EINVAL; if (!iommu_paging_structure_coherency(iommu) && !(dmar_domain->sspt.vtdss_pt.common.features & BIT(PT_FEAT_DMA_INCOHERENT))) return -EINVAL; /* Address width falls within the capability */ if (cap_mgaw(iommu->cap) < vasz_lg2) return -EINVAL; /* Page table level is supported. */ if (!(cap_sagaw(iommu->cap) & BIT(pt_info.aw))) return -EINVAL; /* Same page size support */ if (!(sslps & BIT(0)) && (dmar_domain->domain.pgsize_bitmap & SZ_2M)) return -EINVAL; if (!(sslps & BIT(1)) && (dmar_domain->domain.pgsize_bitmap & SZ_1G)) return -EINVAL; /* iotlb sync on map requirement */ if ((rwbf_required(iommu) || cap_caching_mode(iommu->cap)) && !dmar_domain->iotlb_sync_map) return -EINVAL; /* * FIXME this is locked wrong, it needs to be under the * dmar_domain->lock */ if ((dmar_domain->sspt.vtdss_pt.common.features & BIT(PT_FEAT_VTDSS_FORCE_COHERENCE)) && !ecap_sc_support(iommu->ecap)) return -EINVAL; return 0; } int paging_domain_compatible(struct iommu_domain *domain, struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct intel_iommu *iommu = info->iommu; int ret = -EINVAL; if (intel_domain_is_fs_paging(dmar_domain)) ret = paging_domain_compatible_first_stage(dmar_domain, iommu); else if (intel_domain_is_ss_paging(dmar_domain)) ret = paging_domain_compatible_second_stage(dmar_domain, iommu); else if (WARN_ON(true)) ret = -EINVAL; if (ret) return ret; if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) && context_copied(iommu, info->bus, info->devfn)) return intel_pasid_setup_sm_context(dev); return 0; } static int intel_iommu_attach_device(struct iommu_domain *domain, struct device *dev, struct iommu_domain *old) { int ret; device_block_translation(dev); ret = paging_domain_compatible(domain, dev); if (ret) return ret; ret = iopf_for_domain_set(domain, dev); if (ret) return ret; ret = dmar_domain_attach_device(to_dmar_domain(domain), dev); if (ret) iopf_for_domain_remove(domain, dev); return ret; } static void intel_iommu_tlb_sync(struct iommu_domain *domain, struct iommu_iotlb_gather *gather) { cache_tag_flush_range(to_dmar_domain(domain), gather->start, gather->end, iommu_pages_list_empty(&gather->freelist)); iommu_put_pages_list(&gather->freelist); } static bool domain_support_force_snooping(struct dmar_domain *domain) { struct device_domain_info *info; bool support = true; assert_spin_locked(&domain->lock); list_for_each_entry(info, &domain->devices, link) { if (!ecap_sc_support(info->iommu->ecap)) { support = false; break; } } return support; } static bool intel_iommu_enforce_cache_coherency_fs(struct iommu_domain *domain) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct device_domain_info *info; guard(spinlock_irqsave)(&dmar_domain->lock); if (dmar_domain->force_snooping) return true; if (!domain_support_force_snooping(dmar_domain)) return false; dmar_domain->force_snooping = true; list_for_each_entry(info, &dmar_domain->devices, link) intel_pasid_setup_page_snoop_control(info->iommu, info->dev, IOMMU_NO_PASID); return true; } static bool intel_iommu_enforce_cache_coherency_ss(struct iommu_domain *domain) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); guard(spinlock_irqsave)(&dmar_domain->lock); if (!domain_support_force_snooping(dmar_domain)) return false; /* * Second level page table supports per-PTE snoop control. The * iommu_map() interface will handle this by setting SNP bit. */ dmar_domain->sspt.vtdss_pt.common.features |= BIT(PT_FEAT_VTDSS_FORCE_COHERENCE); dmar_domain->force_snooping = true; return true; } static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap) { struct device_domain_info *info = dev_iommu_priv_get(dev); switch (cap) { case IOMMU_CAP_CACHE_COHERENCY: return true; case IOMMU_CAP_PRE_BOOT_PROTECTION: return dmar_platform_optin(); case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: return ecap_sc_support(info->iommu->ecap); case IOMMU_CAP_DIRTY_TRACKING: return ssads_supported(info->iommu); case IOMMU_CAP_PCI_ATS_SUPPORTED: return info->ats_supported; default: return false; } } static struct iommu_device *intel_iommu_probe_device(struct device *dev) { struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL; struct device_domain_info *info; struct intel_iommu *iommu; u8 bus, devfn; int ret; iommu = device_lookup_iommu(dev, &bus, &devfn); if (!iommu || !iommu->iommu.ops) return ERR_PTR(-ENODEV); info = kzalloc_obj(*info); if (!info) return ERR_PTR(-ENOMEM); if (dev_is_real_dma_subdevice(dev)) { info->bus = pdev->bus->number; info->devfn = pdev->devfn; info->segment = pci_domain_nr(pdev->bus); } else { info->bus = bus; info->devfn = devfn; info->segment = iommu->segment; } info->dev = dev; info->iommu = iommu; if (dev_is_pci(dev)) { if (ecap_dev_iotlb_support(iommu->ecap) && pci_ats_supported(pdev) && dmar_ats_supported(pdev, iommu)) { info->ats_supported = 1; info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev); /* * For IOMMU that supports device IOTLB throttling * (DIT), we assign PFSID to the invalidation desc * of a VF such that IOMMU HW can gauge queue depth * at PF level. If DIT is not set, PFSID will be * treated as reserved, which should be set to 0. */ if (ecap_dit(iommu->ecap)) info->pfsid = pci_dev_id(pci_physfn(pdev)); info->ats_qdep = pci_ats_queue_depth(pdev); } if (sm_supported(iommu)) { if (pasid_supported(iommu)) { int features = pci_pasid_features(pdev); if (features >= 0) info->pasid_supported = features | 1; } if (info->ats_supported && ecap_prs(iommu->ecap) && ecap_pds(iommu->ecap) && pci_pri_supported(pdev)) info->pri_supported = 1; } } dev_iommu_priv_set(dev, info); if (pdev && pci_ats_supported(pdev)) { pci_prepare_ats(pdev, VTD_PAGE_SHIFT); ret = device_rbtree_insert(iommu, info); if (ret) goto free; } if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { ret = intel_pasid_alloc_table(dev); if (ret) { dev_err(dev, "PASID table allocation failed\n"); goto clear_rbtree; } if (!context_copied(iommu, info->bus, info->devfn)) { ret = intel_pasid_setup_sm_context(dev); if (ret) goto free_table; } } intel_iommu_debugfs_create_dev(info); return &iommu->iommu; free_table: intel_pasid_free_table(dev); clear_rbtree: device_rbtree_remove(info); free: kfree(info); return ERR_PTR(ret); } static void intel_iommu_probe_finalize(struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; /* * The PCIe spec, in its wisdom, declares that the behaviour of the * device is undefined if you enable PASID support after ATS support. * So always enable PASID support on devices which have it, even if * we can't yet know if we're ever going to use it. */ if (info->pasid_supported && !pci_enable_pasid(to_pci_dev(dev), info->pasid_supported & ~1)) info->pasid_enabled = 1; if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { iommu_enable_pci_ats(info); /* Assign a DEVTLB cache tag to the default domain. */ if (info->ats_enabled && info->domain) { u16 did = domain_id_iommu(info->domain, iommu); if (cache_tag_assign(info->domain, did, dev, IOMMU_NO_PASID, CACHE_TAG_DEVTLB)) iommu_disable_pci_ats(info); } } iommu_enable_pci_pri(info); } static void intel_iommu_release_device(struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; iommu_disable_pci_pri(info); iommu_disable_pci_ats(info); if (info->pasid_enabled) { pci_disable_pasid(to_pci_dev(dev)); info->pasid_enabled = 0; } mutex_lock(&iommu->iopf_lock); if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev))) device_rbtree_remove(info); mutex_unlock(&iommu->iopf_lock); if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) && !context_copied(iommu, info->bus, info->devfn)) intel_pasid_teardown_sm_context(dev); intel_pasid_free_table(dev); intel_iommu_debugfs_remove_dev(info); kfree(info); } static void intel_iommu_get_resv_regions(struct device *device, struct list_head *head) { int prot = DMA_PTE_READ | DMA_PTE_WRITE; struct iommu_resv_region *reg; struct dmar_rmrr_unit *rmrr; struct device *i_dev; int i; rcu_read_lock(); for_each_rmrr_units(rmrr) { for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, i, i_dev) { struct iommu_resv_region *resv; enum iommu_resv_type type; size_t length; if (i_dev != device && !is_downstream_to_pci_bridge(device, i_dev)) continue; length = rmrr->end_address - rmrr->base_address + 1; type = device_rmrr_is_relaxable(device) ? IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT; resv = iommu_alloc_resv_region(rmrr->base_address, length, prot, type, GFP_ATOMIC); if (!resv) break; list_add_tail(&resv->list, head); } } rcu_read_unlock(); #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA if (dev_is_pci(device)) { struct pci_dev *pdev = to_pci_dev(device); if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) { reg = iommu_alloc_resv_region(0, 1UL << 24, prot, IOMMU_RESV_DIRECT_RELAXABLE, GFP_KERNEL); if (reg) list_add_tail(&reg->list, head); } } #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */ reg = iommu_alloc_resv_region(IOAPIC_RANGE_START, IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1, 0, IOMMU_RESV_MSI, GFP_KERNEL); if (!reg) return; list_add_tail(&reg->list, head); } static struct iommu_group *intel_iommu_device_group(struct device *dev) { if (dev_is_pci(dev)) return pci_device_group(dev); return generic_device_group(dev); } int intel_iommu_enable_iopf(struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; int ret; if (!info->pri_enabled) return -ENODEV; /* pri_enabled is protected by the group mutex. */ iommu_group_mutex_assert(dev); if (info->iopf_refcount) { info->iopf_refcount++; return 0; } ret = iopf_queue_add_device(iommu->iopf_queue, dev); if (ret) return ret; info->iopf_refcount = 1; return 0; } void intel_iommu_disable_iopf(struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; if (WARN_ON(!info->pri_enabled || !info->iopf_refcount)) return; iommu_group_mutex_assert(dev); if (--info->iopf_refcount) return; iopf_queue_remove_device(iommu->iopf_queue, dev); } static bool intel_iommu_is_attach_deferred(struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); return translation_pre_enabled(info->iommu) && !info->domain; } /* * Check that the device does not live on an external facing PCI port that is * marked as untrusted. Such devices should not be able to apply quirks and * thus not be able to bypass the IOMMU restrictions. */ static bool risky_device(struct pci_dev *pdev) { if (pdev->untrusted) { pci_info(pdev, "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n", pdev->vendor, pdev->device); pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n"); return true; } return false; } static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain, unsigned long iova, size_t size) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); if (dmar_domain->iotlb_sync_map) cache_tag_flush_range_np(dmar_domain, iova, iova + size - 1); return 0; } void domain_remove_dev_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct dev_pasid_info *curr, *dev_pasid = NULL; struct intel_iommu *iommu = info->iommu; struct dmar_domain *dmar_domain; unsigned long flags; if (!domain) return; /* Identity domain has no meta data for pasid. */ if (domain->type == IOMMU_DOMAIN_IDENTITY) return; dmar_domain = to_dmar_domain(domain); spin_lock_irqsave(&dmar_domain->lock, flags); list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) { if (curr->dev == dev && curr->pasid == pasid) { list_del(&curr->link_domain); dev_pasid = curr; break; } } spin_unlock_irqrestore(&dmar_domain->lock, flags); cache_tag_unassign_domain(dmar_domain, dev, pasid); domain_detach_iommu(dmar_domain, iommu); if (!WARN_ON_ONCE(!dev_pasid)) { intel_iommu_debugfs_remove_dev_pasid(dev_pasid); kfree(dev_pasid); } } static int blocking_domain_set_dev_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid, struct iommu_domain *old) { struct device_domain_info *info = dev_iommu_priv_get(dev); intel_pasid_tear_down_entry(info->iommu, dev, pasid, false); iopf_for_domain_remove(old, dev); domain_remove_dev_pasid(old, dev, pasid); return 0; } struct dev_pasid_info * domain_add_dev_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct intel_iommu *iommu = info->iommu; struct dev_pasid_info *dev_pasid; unsigned long flags; int ret; dev_pasid = kzalloc_obj(*dev_pasid); if (!dev_pasid) return ERR_PTR(-ENOMEM); ret = domain_attach_iommu(dmar_domain, iommu); if (ret) goto out_free; ret = cache_tag_assign_domain(dmar_domain, dev, pasid); if (ret) goto out_detach_iommu; dev_pasid->dev = dev; dev_pasid->pasid = pasid; spin_lock_irqsave(&dmar_domain->lock, flags); list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids); spin_unlock_irqrestore(&dmar_domain->lock, flags); return dev_pasid; out_detach_iommu: domain_detach_iommu(dmar_domain, iommu); out_free: kfree(dev_pasid); return ERR_PTR(ret); } static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid, struct iommu_domain *old) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct intel_iommu *iommu = info->iommu; struct dev_pasid_info *dev_pasid; int ret; if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING))) return -EINVAL; if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) return -EOPNOTSUPP; if (context_copied(iommu, info->bus, info->devfn)) return -EBUSY; ret = paging_domain_compatible(domain, dev); if (ret) return ret; dev_pasid = domain_add_dev_pasid(domain, dev, pasid); if (IS_ERR(dev_pasid)) return PTR_ERR(dev_pasid); ret = iopf_for_domain_replace(domain, old, dev); if (ret) goto out_remove_dev_pasid; if (intel_domain_is_fs_paging(dmar_domain)) ret = domain_setup_first_level(iommu, dmar_domain, dev, pasid, old); else if (intel_domain_is_ss_paging(dmar_domain)) ret = domain_setup_second_level(iommu, dmar_domain, dev, pasid, old); else if (WARN_ON(true)) ret = -EINVAL; if (ret) goto out_unwind_iopf; domain_remove_dev_pasid(old, dev, pasid); intel_iommu_debugfs_create_dev_pasid(dev_pasid); return 0; out_unwind_iopf: iopf_for_domain_replace(old, domain, dev); out_remove_dev_pasid: domain_remove_dev_pasid(domain, dev, pasid); return ret; } static void *intel_iommu_hw_info(struct device *dev, u32 *length, enum iommu_hw_info_type *type) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; struct iommu_hw_info_vtd *vtd; if (*type != IOMMU_HW_INFO_TYPE_DEFAULT && *type != IOMMU_HW_INFO_TYPE_INTEL_VTD) return ERR_PTR(-EOPNOTSUPP); vtd = kzalloc_obj(*vtd); if (!vtd) return ERR_PTR(-ENOMEM); vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17; vtd->cap_reg = iommu->cap; vtd->ecap_reg = iommu->ecap; *length = sizeof(*vtd); *type = IOMMU_HW_INFO_TYPE_INTEL_VTD; return vtd; } /* Set dirty tracking for the devices that the domain has been attached. */ static int domain_set_dirty_tracking(struct dmar_domain *domain, bool enable) { struct device_domain_info *info; struct dev_pasid_info *dev_pasid; int ret = 0; lockdep_assert_held(&domain->lock); list_for_each_entry(info, &domain->devices, link) { ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev, IOMMU_NO_PASID, enable); if (ret) return ret; } list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) { info = dev_iommu_priv_get(dev_pasid->dev); ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev, dev_pasid->pasid, enable); if (ret) break; } return ret; } static int parent_domain_set_dirty_tracking(struct dmar_domain *domain, bool enable) { struct dmar_domain *s1_domain; unsigned long flags; int ret; spin_lock(&domain->s1_lock); list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) { spin_lock_irqsave(&s1_domain->lock, flags); ret = domain_set_dirty_tracking(s1_domain, enable); spin_unlock_irqrestore(&s1_domain->lock, flags); if (ret) goto err_unwind; } spin_unlock(&domain->s1_lock); return 0; err_unwind: list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) { spin_lock_irqsave(&s1_domain->lock, flags); domain_set_dirty_tracking(s1_domain, domain->dirty_tracking); spin_unlock_irqrestore(&s1_domain->lock, flags); } spin_unlock(&domain->s1_lock); return ret; } static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain, bool enable) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); int ret; spin_lock(&dmar_domain->lock); if (dmar_domain->dirty_tracking == enable) goto out_unlock; ret = domain_set_dirty_tracking(dmar_domain, enable); if (ret) goto err_unwind; if (dmar_domain->nested_parent) { ret = parent_domain_set_dirty_tracking(dmar_domain, enable); if (ret) goto err_unwind; } dmar_domain->dirty_tracking = enable; out_unlock: spin_unlock(&dmar_domain->lock); return 0; err_unwind: domain_set_dirty_tracking(dmar_domain, dmar_domain->dirty_tracking); spin_unlock(&dmar_domain->lock); return ret; } static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; struct context_entry *context; spin_lock(&iommu->lock); context = iommu_context_addr(iommu, bus, devfn, 1); if (!context) { spin_unlock(&iommu->lock); return -ENOMEM; } if (context_present(context) && !context_copied(iommu, bus, devfn)) { spin_unlock(&iommu->lock); return 0; } copied_context_tear_down(iommu, context, bus, devfn); context_clear_entry(context); context_set_domain_id(context, FLPT_DEFAULT_DID); /* * In pass through mode, AW must be programmed to indicate the largest * AGAW value supported by hardware. And ASR is ignored by hardware. */ context_set_address_width(context, iommu->msagaw); context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH); context_set_fault_enable(context); context_set_present(context); if (!ecap_coherent(iommu->ecap)) clflush_cache_range(context, sizeof(*context)); context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn); spin_unlock(&iommu->lock); return 0; } static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void *data) { struct device *dev = data; return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff); } static int device_setup_pass_through(struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); if (!dev_is_pci(dev)) return context_setup_pass_through(dev, info->bus, info->devfn); return pci_for_each_dma_alias(to_pci_dev(dev), context_setup_pass_through_cb, dev); } static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev, struct iommu_domain *old) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; int ret; device_block_translation(dev); if (dev_is_real_dma_subdevice(dev)) return 0; /* * No PRI support with the global identity domain. No need to enable or * disable PRI in this path as the iommu has been put in the blocking * state. */ if (sm_supported(iommu)) ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID); else ret = device_setup_pass_through(dev); if (!ret) info->domain_attached = true; return ret; } static int identity_domain_set_dev_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid, struct iommu_domain *old) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; int ret; if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) return -EOPNOTSUPP; ret = iopf_for_domain_replace(domain, old, dev); if (ret) return ret; ret = domain_setup_passthrough(iommu, dev, pasid, old); if (ret) { iopf_for_domain_replace(old, domain, dev); return ret; } domain_remove_dev_pasid(old, dev, pasid); return 0; } static struct iommu_domain identity_domain = { .type = IOMMU_DOMAIN_IDENTITY, .ops = &(const struct iommu_domain_ops) { .attach_dev = identity_domain_attach_dev, .set_dev_pasid = identity_domain_set_dev_pasid, }, }; const struct iommu_domain_ops intel_fs_paging_domain_ops = { IOMMU_PT_DOMAIN_OPS(x86_64), .attach_dev = intel_iommu_attach_device, .set_dev_pasid = intel_iommu_set_dev_pasid, .iotlb_sync_map = intel_iommu_iotlb_sync_map, .flush_iotlb_all = intel_flush_iotlb_all, .iotlb_sync = intel_iommu_tlb_sync, .free = intel_iommu_domain_free, .enforce_cache_coherency = intel_iommu_enforce_cache_coherency_fs, }; const struct iommu_domain_ops intel_ss_paging_domain_ops = { IOMMU_PT_DOMAIN_OPS(vtdss), .attach_dev = intel_iommu_attach_device, .set_dev_pasid = intel_iommu_set_dev_pasid, .iotlb_sync_map = intel_iommu_iotlb_sync_map, .flush_iotlb_all = intel_flush_iotlb_all, .iotlb_sync = intel_iommu_tlb_sync, .free = intel_iommu_domain_free, .enforce_cache_coherency = intel_iommu_enforce_cache_coherency_ss, }; const struct iommu_ops intel_iommu_ops = { .blocked_domain = &blocking_domain, .release_domain = &blocking_domain, .identity_domain = &identity_domain, .capable = intel_iommu_capable, .hw_info = intel_iommu_hw_info, .domain_alloc_paging_flags = intel_iommu_domain_alloc_paging_flags, .domain_alloc_sva = intel_svm_domain_alloc, .domain_alloc_nested = intel_iommu_domain_alloc_nested, .probe_device = intel_iommu_probe_device, .probe_finalize = intel_iommu_probe_finalize, .release_device = intel_iommu_release_device, .get_resv_regions = intel_iommu_get_resv_regions, .device_group = intel_iommu_device_group, .is_attach_deferred = intel_iommu_is_attach_deferred, .def_domain_type = device_def_domain_type, .page_response = intel_iommu_page_response, }; static void quirk_iommu_igfx(struct pci_dev *dev) { if (risky_device(dev)) return; pci_info(dev, "Disabling IOMMU for graphics on this chipset\n"); disable_igfx_iommu = 1; } /* G4x/GM45 integrated gfx dmar support is totally busted. */ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx); /* QM57/QS57 integrated gfx malfunctions with dmar */ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_iommu_igfx); /* Broadwell igfx malfunctions with dmar */ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx); static void quirk_iommu_rwbf(struct pci_dev *dev) { if (risky_device(dev)) return; /* * Mobile 4 Series Chipset neglects to set RWBF capability, * but needs it. Same seems to hold for the desktop versions. */ pci_info(dev, "Forcing write-buffer flush capability\n"); rwbf_quirk = 1; } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf); #define GGC 0x52 #define GGC_MEMORY_SIZE_MASK (0xf << 8) #define GGC_MEMORY_SIZE_NONE (0x0 << 8) #define GGC_MEMORY_SIZE_1M (0x1 << 8) #define GGC_MEMORY_SIZE_2M (0x3 << 8) #define GGC_MEMORY_VT_ENABLED (0x8 << 8) #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8) #define GGC_MEMORY_SIZE_3M_VT (0xa << 8) #define GGC_MEMORY_SIZE_4M_VT (0xb << 8) static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev) { unsigned short ggc; if (risky_device(dev)) return; if (pci_read_config_word(dev, GGC, &ggc)) return; if (!(ggc & GGC_MEMORY_VT_ENABLED)) { pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n"); disable_igfx_iommu = 1; } else if (!disable_igfx_iommu) { /* we have to ensure the gfx device is idle before we flush */ pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n"); iommu_set_dma_strict(); } } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt); static void quirk_igfx_skip_te_disable(struct pci_dev *dev) { unsigned short ver; if (!IS_GFX_DEVICE(dev)) return; ver = (dev->device >> 8) & 0xff; if (ver != 0x45 && ver != 0x46 && ver != 0x4c && ver != 0x4e && ver != 0x8a && ver != 0x98 && ver != 0x9a && ver != 0xa7 && ver != 0x7d) return; if (risky_device(dev)) return; pci_info(dev, "Skip IOMMU disabling for graphics\n"); iommu_skip_te_disable = 1; } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable); /* On Tylersburg chipsets, some BIOSes have been known to enable the ISOCH DMAR unit for the Azalia sound device, but not give it any TLB entries, which causes it to deadlock. Check for that. We do this in a function called from init_dmars(), instead of in a PCI quirk, because we don't want to print the obnoxious "BIOS broken" message if VT-d is actually disabled. */ static void __init check_tylersburg_isoch(void) { struct pci_dev *pdev; uint32_t vtisochctrl; /* If there's no Azalia in the system anyway, forget it. */ pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL); if (!pdev) return; if (risky_device(pdev)) { pci_dev_put(pdev); return; } pci_dev_put(pdev); /* System Management Registers. Might be hidden, in which case we can't do the sanity check. But that's OK, because the known-broken BIOSes _don't_ actually hide it, so far. */ pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL); if (!pdev) return; if (risky_device(pdev)) { pci_dev_put(pdev); return; } if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) { pci_dev_put(pdev); return; } pci_dev_put(pdev); /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */ if (vtisochctrl & 1) return; /* Drop all bits other than the number of TLB entries */ vtisochctrl &= 0x1c; /* If we have the recommended number of TLB entries (16), fine. */ if (vtisochctrl == 0x10) return; /* Zero TLB entries? You get to ride the short bus to school. */ if (!vtisochctrl) { WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n" "BIOS vendor: %s; Ver: %s; Product Version: %s\n", dmi_get_system_info(DMI_BIOS_VENDOR), dmi_get_system_info(DMI_BIOS_VERSION), dmi_get_system_info(DMI_PRODUCT_VERSION)); iommu_identity_mapping |= IDENTMAP_AZALIA; return; } pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n", vtisochctrl); } /* * Here we deal with a device TLB defect where device may inadvertently issue ATS * invalidation completion before posted writes initiated with translated address * that utilized translations matching the invalidation address range, violating * the invalidation completion ordering. * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is * vulnerable to this defect. In other words, any dTLB invalidation initiated not * under the control of the trusted/privileged host device driver must use this * quirk. * Device TLBs are invalidated under the following six conditions: * 1. Device driver does DMA API unmap IOVA * 2. Device driver unbind a PASID from a process, sva_unbind_device() * 3. PASID is torn down, after PASID cache is flushed. e.g. process * exit_mmap() due to crash * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where * VM has to free pages that were unmapped * 5. Userspace driver unmaps a DMA buffer * 6. Cache invalidation in vSVA usage (upcoming) * * For #1 and #2, device drivers are responsible for stopping DMA traffic * before unmap/unbind. For #3, iommu driver gets mmu_notifier to * invalidate TLB the same way as normal user unmap which will use this quirk. * The dTLB invalidation after PASID cache flush does not need this quirk. * * As a reminder, #6 will *NEED* this quirk as we enable nested translation. */ void quirk_extra_dev_tlb_flush(struct device_domain_info *info, unsigned long address, unsigned long mask, u32 pasid, u16 qdep) { u16 sid; if (likely(!info->dtlb_extra_inval)) return; sid = PCI_DEVID(info->bus, info->devfn); if (pasid == IOMMU_NO_PASID) { qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, qdep, address, mask); } else { qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid, pasid, qdep, address, mask); } } #define ecmd_get_status_code(res) (((res) & 0xff) >> 1) /* * Function to submit a command to the enhanced command interface. The * valid enhanced command descriptions are defined in Table 47 of the * VT-d spec. The VT-d hardware implementation may support some but not * all commands, which can be determined by checking the Enhanced * Command Capability Register. * * Return values: * - 0: Command successful without any error; * - Negative: software error value; * - Nonzero positive: failure status code defined in Table 48. */ int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob) { unsigned long flags; u64 res; int ret; if (!cap_ecmds(iommu->cap)) return -ENODEV; raw_spin_lock_irqsave(&iommu->register_lock, flags); res = readq(iommu->reg + DMAR_ECRSP_REG); if (res & DMA_ECMD_ECRSP_IP) { ret = -EBUSY; goto err; } /* * Unconditionally write the operand B, because * - There is no side effect if an ecmd doesn't require an * operand B, but we set the register to some value. * - It's not invoked in any critical path. The extra MMIO * write doesn't bring any performance concerns. */ writeq(ob, iommu->reg + DMAR_ECEO_REG); writeq(ecmd | (oa << DMA_ECMD_OA_SHIFT), iommu->reg + DMAR_ECMD_REG); IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, readq, !(res & DMA_ECMD_ECRSP_IP), res); if (res & DMA_ECMD_ECRSP_IP) { ret = -ETIMEDOUT; goto err; } ret = ecmd_get_status_code(res); err: raw_spin_unlock_irqrestore(&iommu->register_lock, flags); return ret; } MODULE_IMPORT_NS("GENERIC_PT_IOMMU");
3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 // SPDX-License-Identifier: GPL-2.0-only /* * spectrum management * * Copyright 2003, Jouni Malinen <jkmaline@cc.hut.fi> * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2007-2008, Intel Corporation * Copyright 2008, Johannes Berg <johannes@sipsolutions.net> * Copyright (C) 2018, 2020, 2022-2024, 2026 Intel Corporation */ #include <linux/ieee80211.h> #include <net/cfg80211.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "sta_info.h" #include "wme.h" static bool wbcs_elem_to_chandef(const struct ieee80211_wide_bw_chansw_ie *wbcs_elem, struct cfg80211_chan_def *chandef) { u8 ccfs0 = wbcs_elem->new_center_freq_seg0; u8 ccfs1 = wbcs_elem->new_center_freq_seg1; u32 cf0 = ieee80211_channel_to_frequency(ccfs0, chandef->chan->band); u32 cf1 = ieee80211_channel_to_frequency(ccfs1, chandef->chan->band); switch (wbcs_elem->new_channel_width) { case IEEE80211_VHT_CHANWIDTH_160MHZ: /* deprecated encoding */ chandef->width = NL80211_CHAN_WIDTH_160; chandef->center_freq1 = cf0; break; case IEEE80211_VHT_CHANWIDTH_80P80MHZ: /* deprecated encoding */ chandef->width = NL80211_CHAN_WIDTH_80P80; chandef->center_freq1 = cf0; chandef->center_freq2 = cf1; break; case IEEE80211_VHT_CHANWIDTH_80MHZ: chandef->width = NL80211_CHAN_WIDTH_80; chandef->center_freq1 = cf0; if (ccfs1) { u8 diff = abs(ccfs0 - ccfs1); if (diff == 8) { chandef->width = NL80211_CHAN_WIDTH_160; chandef->center_freq1 = cf1; } else if (diff > 8) { chandef->width = NL80211_CHAN_WIDTH_80P80; chandef->center_freq2 = cf1; } } break; case IEEE80211_VHT_CHANWIDTH_USE_HT: default: /* If the WBCS Element is present, new channel bandwidth is * at least 40 MHz. */ chandef->width = NL80211_CHAN_WIDTH_40; chandef->center_freq1 = cf0; break; } return cfg80211_chandef_valid(chandef); } static void validate_chandef_by_ht_vht_oper(struct ieee80211_sub_if_data *sdata, struct ieee80211_conn_settings *conn, u32 vht_cap_info, struct cfg80211_chan_def *chandef) { u32 control_freq, center_freq1, center_freq2; enum nl80211_chan_width chan_width; struct ieee80211_ht_operation ht_oper; struct ieee80211_vht_operation vht_oper; if (conn->mode < IEEE80211_CONN_MODE_HT || conn->bw_limit < IEEE80211_CONN_BW_LIMIT_40) { chandef->chan = NULL; return; } control_freq = chandef->chan->center_freq; center_freq1 = chandef->center_freq1; center_freq2 = chandef->center_freq2; chan_width = chandef->width; ht_oper.primary_chan = ieee80211_frequency_to_channel(control_freq); if (control_freq != center_freq1) ht_oper.ht_param = control_freq > center_freq1 ? IEEE80211_HT_PARAM_CHA_SEC_BELOW : IEEE80211_HT_PARAM_CHA_SEC_ABOVE; else ht_oper.ht_param = IEEE80211_HT_PARAM_CHA_SEC_NONE; ieee80211_chandef_ht_oper(&ht_oper, chandef); if (conn->mode < IEEE80211_CONN_MODE_VHT) return; vht_oper.center_freq_seg0_idx = ieee80211_frequency_to_channel(center_freq1); vht_oper.center_freq_seg1_idx = center_freq2 ? ieee80211_frequency_to_channel(center_freq2) : 0; switch (chan_width) { case NL80211_CHAN_WIDTH_320: WARN_ON(1); break; case NL80211_CHAN_WIDTH_160: vht_oper.chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ; vht_oper.center_freq_seg1_idx = vht_oper.center_freq_seg0_idx; vht_oper.center_freq_seg0_idx += control_freq < center_freq1 ? -8 : 8; break; case NL80211_CHAN_WIDTH_80P80: vht_oper.chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ; break; case NL80211_CHAN_WIDTH_80: vht_oper.chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ; break; default: vht_oper.chan_width = IEEE80211_VHT_CHANWIDTH_USE_HT; break; } ht_oper.operation_mode = le16_encode_bits(vht_oper.center_freq_seg1_idx, IEEE80211_HT_OP_MODE_CCFS2_MASK); if (!ieee80211_chandef_vht_oper(&sdata->local->hw, vht_cap_info, &vht_oper, &ht_oper, chandef)) chandef->chan = NULL; } static void validate_chandef_by_6ghz_he_eht_oper(struct ieee80211_sub_if_data *sdata, struct ieee80211_conn_settings *conn, struct cfg80211_chan_def *chandef) { struct ieee80211_local *local = sdata->local; u32 control_freq, center_freq1, center_freq2; enum nl80211_chan_width chan_width; DEFINE_RAW_FLEX(struct ieee80211_he_operation, he, optional, sizeof(struct ieee80211_he_6ghz_oper)); struct ieee80211_he_6ghz_oper *_6ghz_oper = (struct ieee80211_he_6ghz_oper *)he->optional; DEFINE_RAW_FLEX(struct ieee80211_eht_operation, eht, optional, sizeof(struct ieee80211_eht_operation_info)); struct ieee80211_eht_operation_info *_oper_info = (struct ieee80211_eht_operation_info *)eht->optional; const struct ieee80211_eht_operation *eht_oper; if (conn->mode < IEEE80211_CONN_MODE_HE) { chandef->chan = NULL; return; } control_freq = chandef->chan->center_freq; center_freq1 = chandef->center_freq1; center_freq2 = chandef->center_freq2; chan_width = chandef->width; he->he_oper_params = le32_encode_bits(1, IEEE80211_HE_OPERATION_6GHZ_OP_INFO); _6ghz_oper->primary = ieee80211_frequency_to_channel(control_freq); _6ghz_oper->ccfs0 = ieee80211_frequency_to_channel(center_freq1); _6ghz_oper->ccfs1 = center_freq2 ? ieee80211_frequency_to_channel(center_freq2) : 0; switch (chan_width) { case NL80211_CHAN_WIDTH_320: _6ghz_oper->ccfs1 = _6ghz_oper->ccfs0; _6ghz_oper->ccfs0 += control_freq < center_freq1 ? -16 : 16; _6ghz_oper->control = IEEE80211_EHT_OPER_CHAN_WIDTH_320MHZ; break; case NL80211_CHAN_WIDTH_160: _6ghz_oper->ccfs1 = _6ghz_oper->ccfs0; _6ghz_oper->ccfs0 += control_freq < center_freq1 ? -8 : 8; fallthrough; case NL80211_CHAN_WIDTH_80P80: _6ghz_oper->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ; break; case NL80211_CHAN_WIDTH_80: _6ghz_oper->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ; break; case NL80211_CHAN_WIDTH_40: _6ghz_oper->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ; break; default: _6ghz_oper->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ; break; } if (conn->mode < IEEE80211_CONN_MODE_EHT) { eht_oper = NULL; } else { eht->params = IEEE80211_EHT_OPER_INFO_PRESENT; _oper_info->control = _6ghz_oper->control; _oper_info->ccfs0 = _6ghz_oper->ccfs0; _oper_info->ccfs1 = _6ghz_oper->ccfs1; eht_oper = eht; } if (!ieee80211_chandef_he_6ghz_oper(local, he, eht_oper, chandef)) chandef->chan = NULL; } int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, enum nl80211_band current_band, u32 vht_cap_info, struct ieee80211_conn_settings *conn, u8 *bssid, bool unprot_action, struct ieee80211_csa_ie *csa_ie) { enum nl80211_band new_band = current_band; int new_freq; u8 new_chan_no = 0, new_op_class = 0; struct ieee80211_channel *new_chan; struct cfg80211_chan_def new_chandef = {}; const struct ieee80211_sec_chan_offs_ie *sec_chan_offs; const struct ieee80211_wide_bw_chansw_ie *wide_bw_chansw_ie; const struct ieee80211_bandwidth_indication *bwi; const struct ieee80211_ext_chansw_ie *ext_chansw_elem; int secondary_channel_offset = -1; memset(csa_ie, 0, sizeof(*csa_ie)); sec_chan_offs = elems->sec_chan_offs; wide_bw_chansw_ie = elems->wide_bw_chansw_ie; bwi = elems->bandwidth_indication; ext_chansw_elem = elems->ext_chansw_ie; if (conn->mode < IEEE80211_CONN_MODE_HT || conn->bw_limit < IEEE80211_CONN_BW_LIMIT_40) { sec_chan_offs = NULL; wide_bw_chansw_ie = NULL; } if (conn->mode < IEEE80211_CONN_MODE_VHT) wide_bw_chansw_ie = NULL; if (ext_chansw_elem) { new_op_class = ext_chansw_elem->new_operating_class; if (!ieee80211_operating_class_to_band(new_op_class, &new_band)) { new_op_class = 0; if (!unprot_action) sdata_info(sdata, "cannot understand ECSA IE operating class, %d, ignoring\n", ext_chansw_elem->new_operating_class); } else { new_chan_no = ext_chansw_elem->new_ch_num; csa_ie->count = ext_chansw_elem->count; csa_ie->mode = ext_chansw_elem->mode; } } if (!new_op_class && elems->ch_switch_ie) { new_chan_no = elems->ch_switch_ie->new_ch_num; csa_ie->count = elems->ch_switch_ie->count; csa_ie->mode = elems->ch_switch_ie->mode; } /* nothing here we understand */ if (!new_chan_no) return 1; /* Mesh Channel Switch Parameters Element */ if (elems->mesh_chansw_params_ie) { csa_ie->ttl = elems->mesh_chansw_params_ie->mesh_ttl; csa_ie->mode = elems->mesh_chansw_params_ie->mesh_flags; csa_ie->pre_value = le16_to_cpu( elems->mesh_chansw_params_ie->mesh_pre_value); if (elems->mesh_chansw_params_ie->mesh_flags & WLAN_EID_CHAN_SWITCH_PARAM_REASON) csa_ie->reason_code = le16_to_cpu( elems->mesh_chansw_params_ie->mesh_reason); } new_freq = ieee80211_channel_to_frequency(new_chan_no, new_band); new_chan = ieee80211_get_channel(sdata->local->hw.wiphy, new_freq); if (!new_chan || new_chan->flags & IEEE80211_CHAN_DISABLED) { if (!unprot_action) sdata_info(sdata, "BSS %pM switches to unsupported channel (%d MHz), disconnecting\n", bssid, new_freq); return -EINVAL; } if (sec_chan_offs) { secondary_channel_offset = sec_chan_offs->sec_chan_offs; } else if (conn->mode >= IEEE80211_CONN_MODE_HT) { /* If the secondary channel offset IE is not present, * we can't know what's the post-CSA offset, so the * best we can do is use 20MHz. */ secondary_channel_offset = IEEE80211_HT_PARAM_CHA_SEC_NONE; } switch (secondary_channel_offset) { default: /* secondary_channel_offset was present but is invalid */ case IEEE80211_HT_PARAM_CHA_SEC_NONE: cfg80211_chandef_create(&csa_ie->chanreq.oper, new_chan, NL80211_CHAN_HT20); break; case IEEE80211_HT_PARAM_CHA_SEC_ABOVE: cfg80211_chandef_create(&csa_ie->chanreq.oper, new_chan, NL80211_CHAN_HT40PLUS); break; case IEEE80211_HT_PARAM_CHA_SEC_BELOW: cfg80211_chandef_create(&csa_ie->chanreq.oper, new_chan, NL80211_CHAN_HT40MINUS); break; case -1: cfg80211_chandef_create(&csa_ie->chanreq.oper, new_chan, NL80211_CHAN_NO_HT); /* keep width for 5/10 MHz channels */ switch (sdata->vif.bss_conf.chanreq.oper.width) { case NL80211_CHAN_WIDTH_5: case NL80211_CHAN_WIDTH_10: csa_ie->chanreq.oper.width = sdata->vif.bss_conf.chanreq.oper.width; break; default: break; } break; } /* capture the AP configuration */ csa_ie->chanreq.ap = csa_ie->chanreq.oper; /* parse one of the Elements to build a new chandef */ memset(&new_chandef, 0, sizeof(new_chandef)); new_chandef.chan = new_chan; if (bwi) { /* start with the CSA one */ new_chandef = csa_ie->chanreq.oper; /* and update the width accordingly */ ieee80211_chandef_eht_oper(&bwi->info, &new_chandef); if (bwi->params & IEEE80211_BW_IND_DIS_SUBCH_PRESENT) new_chandef.punctured = get_unaligned_le16(bwi->info.optional); } else if (!wide_bw_chansw_ie || !wbcs_elem_to_chandef(wide_bw_chansw_ie, &new_chandef)) { if (!ieee80211_operating_class_to_chandef(new_op_class, new_chan, &new_chandef)) new_chandef = csa_ie->chanreq.oper; } /* check if the new chandef fits the capabilities */ if (new_band == NL80211_BAND_6GHZ) validate_chandef_by_6ghz_he_eht_oper(sdata, conn, &new_chandef); else validate_chandef_by_ht_vht_oper(sdata, conn, vht_cap_info, &new_chandef); /* if data is there validate the bandwidth & use it */ if (new_chandef.chan) { /* capture the AP chandef before (potential) downgrading */ csa_ie->chanreq.ap = new_chandef; while (conn->bw_limit < ieee80211_min_bw_limit_from_chandef(&new_chandef)) ieee80211_chandef_downgrade(&new_chandef, NULL); if (!cfg80211_chandef_compatible(&new_chandef, &csa_ie->chanreq.oper)) { sdata_info(sdata, "BSS %pM: CSA has inconsistent channel data, disconnecting\n", bssid); return -EINVAL; } csa_ie->chanreq.oper = new_chandef; } if (elems->max_channel_switch_time) csa_ie->max_switch_time = (elems->max_channel_switch_time[0] << 0) | (elems->max_channel_switch_time[1] << 8) | (elems->max_channel_switch_time[2] << 16); return 0; } static void ieee80211_send_refuse_measurement_request(struct ieee80211_sub_if_data *sdata, struct ieee80211_msrment_ie *request_ie, const u8 *da, const u8 *bssid, u8 dialog_token) { struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ieee80211_mgmt *msr_report; skb = dev_alloc_skb(IEEE80211_MIN_ACTION_SIZE(measurement) + local->hw.extra_tx_headroom); if (!skb) return; skb_reserve(skb, local->hw.extra_tx_headroom); msr_report = skb_put_zero(skb, IEEE80211_MIN_ACTION_SIZE(measurement)); memcpy(msr_report->da, da, ETH_ALEN); memcpy(msr_report->sa, sdata->vif.addr, ETH_ALEN); memcpy(msr_report->bssid, bssid, ETH_ALEN); msr_report->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); msr_report->u.action.category = WLAN_CATEGORY_SPECTRUM_MGMT; msr_report->u.action.action_code = WLAN_ACTION_SPCT_MSR_RPRT; msr_report->u.action.measurement.dialog_token = dialog_token; msr_report->u.action.measurement.element_id = WLAN_EID_MEASURE_REPORT; msr_report->u.action.measurement.length = sizeof(struct ieee80211_msrment_ie); msr_report->u.action.measurement.msr_elem.token = request_ie->token; msr_report->u.action.measurement.msr_elem.mode |= IEEE80211_SPCT_MSR_RPRT_MODE_REFUSED; msr_report->u.action.measurement.msr_elem.type = request_ie->type; ieee80211_tx_skb(sdata, skb); } void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { /* * Ignoring measurement request is spec violation. * Mandatory measurements must be reported optional * measurements might be refused or reported incapable * For now just refuse * TODO: Answer basic measurement as unmeasured */ ieee80211_send_refuse_measurement_request(sdata, &mgmt->u.action.measurement.msr_elem, mgmt->sa, mgmt->bssid, mgmt->u.action.measurement.dialog_token); }
548 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM bpf_test_run #if !defined(_TRACE_BPF_TEST_RUN_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_BPF_TEST_RUN_H #include <linux/tracepoint.h> TRACE_EVENT(bpf_trigger_tp, TP_PROTO(int nonce), TP_ARGS(nonce), TP_STRUCT__entry( __field(int, nonce) ), TP_fast_assign( __entry->nonce = nonce; ), TP_printk("nonce %d", __entry->nonce) ); DECLARE_EVENT_CLASS(bpf_test_finish, TP_PROTO(int *err), TP_ARGS(err), TP_STRUCT__entry( __field(int, err) ), TP_fast_assign( __entry->err = *err; ), TP_printk("bpf_test_finish with err=%d", __entry->err) ); #ifdef DEFINE_EVENT_WRITABLE #undef BPF_TEST_RUN_DEFINE_EVENT #define BPF_TEST_RUN_DEFINE_EVENT(template, call, proto, args, size) \ DEFINE_EVENT_WRITABLE(template, call, PARAMS(proto), \ PARAMS(args), size) #else #undef BPF_TEST_RUN_DEFINE_EVENT #define BPF_TEST_RUN_DEFINE_EVENT(template, call, proto, args, size) \ DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args)) #endif BPF_TEST_RUN_DEFINE_EVENT(bpf_test_finish, bpf_test_finish, TP_PROTO(int *err), TP_ARGS(err), sizeof(int) ); #endif /* This part must be outside protection */ #include <trace/define_trace.h>
42 42 41 40 38 38 38 38 38 38 38 1 36 37 37 37 35 35 3 3 40 42 34 34 33 34 32 34 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 // SPDX-License-Identifier: GPL-2.0-only #include <linux/fs.h> #include <linux/fs_struct.h> #include <linux/kernel_read_file.h> #include <linux/security.h> #include <linux/vmalloc.h> /** * kernel_read_file() - read file contents into a kernel buffer * * @file: file to read from * @offset: where to start reading from (see below). * @buf: pointer to a "void *" buffer for reading into (if * *@buf is NULL, a buffer will be allocated, and * @buf_size will be ignored) * @buf_size: size of buf, if already allocated. If @buf not * allocated, this is the largest size to allocate. * @file_size: if non-NULL, the full size of @file will be * written here. * @id: the kernel_read_file_id identifying the type of * file contents being read (for LSMs to examine) * * @offset must be 0 unless both @buf and @file_size are non-NULL * (i.e. the caller must be expecting to read partial file contents * via an already-allocated @buf, in at most @buf_size chunks, and * will be able to determine when the entire file was read by * checking @file_size). This isn't a recommended way to read a * file, though, since it is possible that the contents might * change between calls to kernel_read_file(). * * Returns number of bytes read (no single read will be bigger * than SSIZE_MAX), or negative on error. * */ ssize_t kernel_read_file(struct file *file, loff_t offset, void **buf, size_t buf_size, size_t *file_size, enum kernel_read_file_id id) { loff_t i_size, pos; ssize_t copied; void *allocated = NULL; bool whole_file; int ret; if (offset != 0 && (!*buf || !file_size)) return -EINVAL; if (!S_ISREG(file_inode(file)->i_mode)) return -EINVAL; ret = deny_write_access(file); if (ret) return ret; i_size = i_size_read(file_inode(file)); if (i_size <= 0) { ret = -EINVAL; goto out; } /* The file is too big for sane activities. */ if (i_size > SSIZE_MAX) { ret = -EFBIG; goto out; } /* The entire file cannot be read in one buffer. */ if (!file_size && offset == 0 && i_size > buf_size) { ret = -EFBIG; goto out; } whole_file = (offset == 0 && i_size <= buf_size); ret = security_kernel_read_file(file, id, whole_file); if (ret) goto out; if (file_size) *file_size = i_size; if (!*buf) *buf = allocated = vmalloc(i_size); if (!*buf) { ret = -ENOMEM; goto out; } pos = offset; copied = 0; while (copied < buf_size) { ssize_t bytes; size_t wanted = min_t(size_t, buf_size - copied, i_size - pos); bytes = kernel_read(file, *buf + copied, wanted, &pos); if (bytes < 0) { ret = bytes; goto out_free; } if (bytes == 0) break; copied += bytes; } if (whole_file) { if (pos != i_size) { ret = -EIO; goto out_free; } ret = security_kernel_post_read_file(file, *buf, i_size, id); } out_free: if (ret < 0) { if (allocated) { vfree(*buf); *buf = NULL; } } out: allow_write_access(file); return ret == 0 ? copied : ret; } EXPORT_SYMBOL_GPL(kernel_read_file); ssize_t kernel_read_file_from_path(const char *path, loff_t offset, void **buf, size_t buf_size, size_t *file_size, enum kernel_read_file_id id) { struct file *file; ssize_t ret; if (!path || !*path) return -EINVAL; file = filp_open(path, O_RDONLY, 0); if (IS_ERR(file)) return PTR_ERR(file); ret = kernel_read_file(file, offset, buf, buf_size, file_size, id); fput(file); return ret; } EXPORT_SYMBOL_GPL(kernel_read_file_from_path); ssize_t kernel_read_file_from_path_initns(const char *path, loff_t offset, void **buf, size_t buf_size, size_t *file_size, enum kernel_read_file_id id) { struct file *file; struct path root; ssize_t ret; if (!path || !*path) return -EINVAL; task_lock(&init_task); get_fs_root(init_task.fs, &root); task_unlock(&init_task); file = file_open_root(&root, path, O_RDONLY, 0); path_put(&root); if (IS_ERR(file)) return PTR_ERR(file); ret = kernel_read_file(file, offset, buf, buf_size, file_size, id); fput(file); return ret; } EXPORT_SYMBOL_GPL(kernel_read_file_from_path_initns); ssize_t kernel_read_file_from_fd(int fd, loff_t offset, void **buf, size_t buf_size, size_t *file_size, enum kernel_read_file_id id) { CLASS(fd, f)(fd); if (fd_empty(f) || !(fd_file(f)->f_mode & FMODE_READ)) return -EBADF; return kernel_read_file(fd_file(f), offset, buf, buf_size, file_size, id); } EXPORT_SYMBOL_GPL(kernel_read_file_from_fd);
2 17 3 6 4 4 2 2 4 4 7 1 7 7 1 1 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_SCHED_RED_H #define __NET_SCHED_RED_H #include <linux/types.h> #include <linux/bug.h> #include <net/pkt_sched.h> #include <net/inet_ecn.h> #include <net/dsfield.h> #include <linux/reciprocal_div.h> /* Random Early Detection (RED) algorithm. ======================================= Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking. This file codes a "divisionless" version of RED algorithm as written down in Fig.17 of the paper. Short description. ------------------ When a new packet arrives we calculate the average queue length: avg = (1-W)*avg + W*current_queue_len, W is the filter time constant (chosen as 2^(-Wlog)), it controls the inertia of the algorithm. To allow larger bursts, W should be decreased. if (avg > th_max) -> packet marked (dropped). if (avg < th_min) -> packet passes. if (th_min < avg < th_max) we calculate probability: Pb = max_P * (avg - th_min)/(th_max-th_min) and mark (drop) packet with this probability. Pb changes from 0 (at avg==th_min) to max_P (avg==th_max). max_P should be small (not 1), usually 0.01..0.02 is good value. max_P is chosen as a number, so that max_P/(th_max-th_min) is a negative power of two in order arithmetic to contain only shifts. Parameters, settable by user: ----------------------------- qth_min - bytes (should be < qth_max/2) qth_max - bytes (should be at least 2*qth_min and less limit) Wlog - bits (<32) log(1/W). Plog - bits (<32) Plog is related to max_P by formula: max_P = (qth_max-qth_min)/2^Plog; F.e. if qth_max=128K and qth_min=32K, then Plog=22 corresponds to max_P=0.02 Scell_log Stab Lookup table for log((1-W)^(t/t_ave). NOTES: Upper bound on W. ----------------- If you want to allow bursts of L packets of size S, you should choose W: L + 1 - th_min/S < (1-(1-W)^L)/W th_min/S = 32 th_min/S = 4 log(W) L -1 33 -2 35 -3 39 -4 46 -5 57 -6 75 -7 101 -8 135 -9 190 etc. */ /* * Adaptative RED : An Algorithm for Increasing the Robustness of RED's AQM * (Sally FLoyd, Ramakrishna Gummadi, and Scott Shenker) August 2001 * * Every 500 ms: * if (avg > target and max_p <= 0.5) * increase max_p : max_p += alpha; * else if (avg < target and max_p >= 0.01) * decrease max_p : max_p *= beta; * * target :[qth_min + 0.4*(qth_min - qth_max), * qth_min + 0.6*(qth_min - qth_max)]. * alpha : min(0.01, max_p / 4) * beta : 0.9 * max_P is a Q0.32 fixed point number (with 32 bits mantissa) * max_P between 0.01 and 0.5 (1% - 50%) [ Its no longer a negative power of two ] */ #define RED_ONE_PERCENT ((u32)DIV_ROUND_CLOSEST(1ULL<<32, 100)) #define MAX_P_MIN (1 * RED_ONE_PERCENT) #define MAX_P_MAX (50 * RED_ONE_PERCENT) #define MAX_P_ALPHA(val) min(MAX_P_MIN, val / 4) #define RED_STAB_SIZE 256 #define RED_STAB_MASK (RED_STAB_SIZE - 1) struct red_stats { u32 prob_drop; /* Early probability drops */ u32 prob_mark; /* Early probability marks */ u32 forced_drop; /* Forced drops, qavg > max_thresh */ u32 forced_mark; /* Forced marks, qavg > max_thresh */ u32 pdrop; /* Drops due to queue limits */ }; struct red_parms { /* Parameters */ u32 qth_min; /* Min avg length threshold: Wlog scaled */ u32 qth_max; /* Max avg length threshold: Wlog scaled */ u32 Scell_max; u32 max_P; /* probability, [0 .. 1.0] 32 scaled */ /* reciprocal_value(max_P / qth_delta) */ struct reciprocal_value max_P_reciprocal; u32 qth_delta; /* max_th - min_th */ u32 target_min; /* min_th + 0.4*(max_th - min_th) */ u32 target_max; /* min_th + 0.6*(max_th - min_th) */ u8 Scell_log; u8 Wlog; /* log(W) */ u8 Plog; /* random number bits */ u8 Stab[RED_STAB_SIZE]; }; struct red_vars { /* Variables */ int qcount; /* Number of packets since last random number generation */ u32 qR; /* Cached random number */ unsigned long qavg; /* Average queue length: Wlog scaled */ ktime_t qidlestart; /* Start of current idle period */ }; static inline u32 red_maxp(u8 Plog) { return Plog < 32 ? (~0U >> Plog) : ~0U; } static inline void red_set_vars(struct red_vars *v) { /* Reset average queue length, the value is strictly bound * to the parameters below, resetting hurts a bit but leaving * it might result in an unreasonable qavg for a while. --TGR */ v->qavg = 0; v->qcount = -1; } static inline bool red_check_params(u32 qth_min, u32 qth_max, u8 Wlog, u8 Scell_log, u8 *stab) { if (fls(qth_min) + Wlog >= 32) return false; if (fls(qth_max) + Wlog >= 32) return false; if (Scell_log >= 32) return false; if (qth_max < qth_min) return false; if (stab) { int i; for (i = 0; i < RED_STAB_SIZE; i++) if (stab[i] >= 32) return false; } return true; } static inline int red_get_flags(unsigned char qopt_flags, unsigned char historic_mask, struct nlattr *flags_attr, unsigned char supported_mask, struct nla_bitfield32 *p_flags, unsigned char *p_userbits, struct netlink_ext_ack *extack) { struct nla_bitfield32 flags; if (qopt_flags && flags_attr) { NL_SET_ERR_MSG_MOD(extack, "flags should be passed either through qopt, or through a dedicated attribute"); return -EINVAL; } if (flags_attr) { flags = nla_get_bitfield32(flags_attr); } else { flags.selector = historic_mask; flags.value = qopt_flags & historic_mask; } *p_flags = flags; *p_userbits = qopt_flags & ~historic_mask; return 0; } static inline int red_validate_flags(unsigned char flags, struct netlink_ext_ack *extack) { if ((flags & TC_RED_NODROP) && !(flags & TC_RED_ECN)) { NL_SET_ERR_MSG_MOD(extack, "nodrop mode is only meaningful with ECN"); return -EINVAL; } return 0; } static inline void red_set_parms(struct red_parms *p, u32 qth_min, u32 qth_max, u8 Wlog, u8 Plog, u8 Scell_log, u8 *stab, u32 max_P) { int delta = qth_max - qth_min; u32 max_p_delta; WRITE_ONCE(p->qth_min, qth_min << Wlog); WRITE_ONCE(p->qth_max, qth_max << Wlog); WRITE_ONCE(p->Wlog, Wlog); WRITE_ONCE(p->Plog, Plog); if (delta <= 0) delta = 1; p->qth_delta = delta; if (!max_P) { max_P = red_maxp(Plog); max_P *= delta; /* max_P = (qth_max - qth_min)/2^Plog */ } WRITE_ONCE(p->max_P, max_P); max_p_delta = max_P / delta; max_p_delta = max(max_p_delta, 1U); p->max_P_reciprocal = reciprocal_value(max_p_delta); /* RED Adaptative target : * [min_th + 0.4*(min_th - max_th), * min_th + 0.6*(min_th - max_th)]. */ delta /= 5; p->target_min = qth_min + 2*delta; p->target_max = qth_min + 3*delta; WRITE_ONCE(p->Scell_log, Scell_log); p->Scell_max = (255 << Scell_log); if (stab) memcpy(p->Stab, stab, sizeof(p->Stab)); } static inline int red_is_idling(const struct red_vars *v) { return v->qidlestart != 0; } static inline void red_start_of_idle_period(struct red_vars *v) { v->qidlestart = ktime_get(); } static inline void red_end_of_idle_period(struct red_vars *v) { v->qidlestart = 0; } static inline void red_restart(struct red_vars *v) { red_end_of_idle_period(v); v->qavg = 0; v->qcount = -1; } static inline unsigned long red_calc_qavg_from_idle_time(const struct red_parms *p, const struct red_vars *v) { s64 delta = ktime_us_delta(ktime_get(), v->qidlestart); long us_idle = min_t(s64, delta, p->Scell_max); int shift; /* * The problem: ideally, average length queue recalculation should * be done over constant clock intervals. This is too expensive, so * that the calculation is driven by outgoing packets. * When the queue is idle we have to model this clock by hand. * * SF+VJ proposed to "generate": * * m = idletime / (average_pkt_size / bandwidth) * * dummy packets as a burst after idle time, i.e. * * v->qavg *= (1-W)^m * * This is an apparently overcomplicated solution (f.e. we have to * precompute a table to make this calculation in reasonable time) * I believe that a simpler model may be used here, * but it is field for experiments. */ shift = p->Stab[(us_idle >> p->Scell_log) & RED_STAB_MASK]; if (shift) return v->qavg >> shift; else { /* Approximate initial part of exponent with linear function: * * (1-W)^m ~= 1-mW + ... * * Seems, it is the best solution to * problem of too coarse exponent tabulation. */ us_idle = (v->qavg * (u64)us_idle) >> p->Scell_log; if (us_idle < (v->qavg >> 1)) return v->qavg - us_idle; else return v->qavg >> 1; } } static inline unsigned long red_calc_qavg_no_idle_time(const struct red_parms *p, const struct red_vars *v, unsigned int backlog) { /* * NOTE: v->qavg is fixed point number with point at Wlog. * The formula below is equivalent to floating point * version: * * qavg = qavg*(1-W) + backlog*W; * * --ANK (980924) */ return v->qavg + (backlog - (v->qavg >> p->Wlog)); } static inline unsigned long red_calc_qavg(const struct red_parms *p, const struct red_vars *v, unsigned int backlog) { if (!red_is_idling(v)) return red_calc_qavg_no_idle_time(p, v, backlog); else return red_calc_qavg_from_idle_time(p, v); } static inline u32 red_random(const struct red_parms *p) { return reciprocal_divide(get_random_u32(), p->max_P_reciprocal); } static inline int red_mark_probability(const struct red_parms *p, const struct red_vars *v, unsigned long qavg) { /* The formula used below causes questions. OK. qR is random number in the interval (0..1/max_P)*(qth_max-qth_min) i.e. 0..(2^Plog). If we used floating point arithmetic, it would be: (2^Plog)*rnd_num, where rnd_num is less 1. Taking into account, that qavg have fixed point at Wlog, two lines below have the following floating point equivalent: max_P*(qavg - qth_min)/(qth_max-qth_min) < rnd/qcount Any questions? --ANK (980924) */ return !(((qavg - p->qth_min) >> p->Wlog) * v->qcount < v->qR); } enum { RED_BELOW_MIN_THRESH, RED_BETWEEN_TRESH, RED_ABOVE_MAX_TRESH, }; static inline int red_cmp_thresh(const struct red_parms *p, unsigned long qavg) { if (qavg < p->qth_min) return RED_BELOW_MIN_THRESH; else if (qavg >= p->qth_max) return RED_ABOVE_MAX_TRESH; else return RED_BETWEEN_TRESH; } enum { RED_DONT_MARK, RED_PROB_MARK, RED_HARD_MARK, }; static inline int red_action(const struct red_parms *p, struct red_vars *v, unsigned long qavg) { switch (red_cmp_thresh(p, qavg)) { case RED_BELOW_MIN_THRESH: v->qcount = -1; return RED_DONT_MARK; case RED_BETWEEN_TRESH: if (++v->qcount) { if (red_mark_probability(p, v, qavg)) { v->qcount = 0; v->qR = red_random(p); return RED_PROB_MARK; } } else v->qR = red_random(p); return RED_DONT_MARK; case RED_ABOVE_MAX_TRESH: v->qcount = -1; return RED_HARD_MARK; } BUG(); return RED_DONT_MARK; } static inline void red_adaptative_algo(struct red_parms *p, struct red_vars *v) { unsigned long qavg; u32 max_p_delta; qavg = v->qavg; if (red_is_idling(v)) qavg = red_calc_qavg_from_idle_time(p, v); /* v->qavg is fixed point number with point at Wlog */ qavg >>= p->Wlog; if (qavg > p->target_max && p->max_P <= MAX_P_MAX) p->max_P += MAX_P_ALPHA(p->max_P); /* maxp = maxp + alpha */ else if (qavg < p->target_min && p->max_P >= MAX_P_MIN) p->max_P = (p->max_P/10)*9; /* maxp = maxp * Beta */ max_p_delta = DIV_ROUND_CLOSEST(p->max_P, p->qth_delta); max_p_delta = max(max_p_delta, 1U); p->max_P_reciprocal = reciprocal_value(max_p_delta); } #endif
201 50 50 50 201 201 201 200 201 200 201 45 45 44 45 45 45 45 198 198 198 198 197 198 198 197 45 45 22 21 22 25 25 25 25 25 25 25 25 45 45 12 12 12 12 7 12 41 41 41 41 4 45 45 45 45 12 45 45 45 41 45 45 45 45 45 45 22 16 22 12 22 45 44 134 129 17 129 127 127 17 134 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 // SPDX-License-Identifier: GPL-2.0-only /* * The Virtio 9p transport driver * * This is a block based transport driver based on the lguest block driver * code. * * Copyright (C) 2007, 2008 Eric Van Hensbergen, IBM Corporation * * Based on virtio console driver * Copyright (C) 2006, 2007 Rusty Russell, IBM Corporation */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/in.h> #include <linux/module.h> #include <linux/net.h> #include <linux/ipv6.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/un.h> #include <linux/uaccess.h> #include <linux/inet.h> #include <linux/file.h> #include <linux/highmem.h> #include <linux/slab.h> #include <net/9p/9p.h> #include <linux/fs_context.h> #include <net/9p/client.h> #include <net/9p/transport.h> #include <linux/scatterlist.h> #include <linux/swap.h> #include <linux/virtio.h> #include <linux/virtio_9p.h> #include "trans_common.h" #define VIRTQUEUE_NUM 128 /* a single mutex to manage channel initialization and attachment */ static DEFINE_MUTEX(virtio_9p_lock); static DECLARE_WAIT_QUEUE_HEAD(vp_wq); static atomic_t vp_pinned = ATOMIC_INIT(0); /** * struct virtio_chan - per-instance transport information * @inuse: whether the channel is in use * @lock: protects multiple elements within this structure * @client: client instance * @vdev: virtio dev associated with this channel * @vq: virtio queue associated with this channel * @ring_bufs_avail: flag to indicate there is some available in the ring buf * @vc_wq: wait queue for waiting for thing to be added to ring buf * @p9_max_pages: maximum number of pinned pages * @sg: scatter gather list which is used to pack a request (protected?) * @chan_list: linked list of channels * * We keep all per-channel information in a structure. * This structure is allocated within the devices dev->mem space. * A pointer to the structure will get put in the transport private. * */ struct virtio_chan { bool inuse; spinlock_t lock; struct p9_client *client; struct virtio_device *vdev; struct virtqueue *vq; int ring_bufs_avail; wait_queue_head_t *vc_wq; /* This is global limit. Since we don't have a global structure, * will be placing it in each channel. */ unsigned long p9_max_pages; /* Scatterlist: can be too big for stack. */ struct scatterlist sg[VIRTQUEUE_NUM]; /** * @tag: name to identify a mount null terminated */ char *tag; struct list_head chan_list; }; static struct list_head virtio_chan_list; /* How many bytes left in this page. */ static unsigned int rest_of_page(void *data) { return PAGE_SIZE - offset_in_page(data); } /** * p9_virtio_close - reclaim resources of a channel * @client: client instance * * This reclaims a channel by freeing its resources and * resetting its inuse flag. * */ static void p9_virtio_close(struct p9_client *client) { struct virtio_chan *chan = client->trans; mutex_lock(&virtio_9p_lock); if (chan) chan->inuse = false; mutex_unlock(&virtio_9p_lock); } /** * req_done - callback which signals activity from the server * @vq: virtio queue activity was received on * * This notifies us that the server has triggered some activity * on the virtio channel - most likely a response to request we * sent. Figure out which requests now have responses and wake up * those threads. * * Bugs: could do with some additional sanity checking, but appears to work. * */ static void req_done(struct virtqueue *vq) { struct virtio_chan *chan = vq->vdev->priv; unsigned int len; struct p9_req_t *req; bool need_wakeup = false; unsigned long flags; p9_debug(P9_DEBUG_TRANS, ": request done\n"); spin_lock_irqsave(&chan->lock, flags); while ((req = virtqueue_get_buf(chan->vq, &len)) != NULL) { if (!chan->ring_bufs_avail) { chan->ring_bufs_avail = 1; need_wakeup = true; } if (len) { req->rc.size = len; p9_client_cb(chan->client, req, REQ_STATUS_RCVD); } } spin_unlock_irqrestore(&chan->lock, flags); /* Wakeup if anyone waiting for VirtIO ring space. */ if (need_wakeup) wake_up(chan->vc_wq); } /** * pack_sg_list - pack a scatter gather list from a linear buffer * @sg: scatter/gather list to pack into * @start: which segment of the sg_list to start at * @limit: maximum segment to pack data to * @data: data to pack into scatter/gather list * @count: amount of data to pack into the scatter/gather list * * sg_lists have multiple segments of various sizes. This will pack * arbitrary data into an existing scatter gather list, segmenting the * data as necessary within constraints. * */ static int pack_sg_list(struct scatterlist *sg, int start, int limit, char *data, int count) { int s; int index = start; while (count) { s = rest_of_page(data); if (s > count) s = count; BUG_ON(index >= limit); /* Make sure we don't terminate early. */ sg_unmark_end(&sg[index]); sg_set_buf(&sg[index++], data, s); count -= s; data += s; } if (index-start) sg_mark_end(&sg[index - 1]); return index-start; } /* We don't currently allow canceling of virtio requests */ static int p9_virtio_cancel(struct p9_client *client, struct p9_req_t *req) { return 1; } /* Reply won't come, so drop req ref */ static int p9_virtio_cancelled(struct p9_client *client, struct p9_req_t *req) { p9_req_put(client, req); return 0; } /** * pack_sg_list_p - Just like pack_sg_list. Instead of taking a buffer, * this takes a list of pages. * @sg: scatter/gather list to pack into * @start: which segment of the sg_list to start at * @limit: maximum number of pages in sg list. * @pdata: a list of pages to add into sg. * @nr_pages: number of pages to pack into the scatter/gather list * @offs: amount of data in the beginning of first page _not_ to pack * @count: amount of data to pack into the scatter/gather list */ static int pack_sg_list_p(struct scatterlist *sg, int start, int limit, struct page **pdata, int nr_pages, size_t offs, int count) { int i = 0, s; int data_off = offs; int index = start; BUG_ON(nr_pages > (limit - start)); /* * if the first page doesn't start at * page boundary find the offset */ while (nr_pages) { s = PAGE_SIZE - data_off; if (s > count) s = count; BUG_ON(index >= limit); /* Make sure we don't terminate early. */ sg_unmark_end(&sg[index]); sg_set_page(&sg[index++], pdata[i++], s, data_off); data_off = 0; count -= s; nr_pages--; } if (index-start) sg_mark_end(&sg[index - 1]); return index - start; } /** * p9_virtio_request - issue a request * @client: client instance issuing the request * @req: request to be issued * */ static int p9_virtio_request(struct p9_client *client, struct p9_req_t *req) { int err; int in, out, out_sgs, in_sgs; unsigned long flags; struct virtio_chan *chan = client->trans; struct scatterlist *sgs[2]; p9_debug(P9_DEBUG_TRANS, "9p debug: virtio request\n"); WRITE_ONCE(req->status, REQ_STATUS_SENT); req_retry: spin_lock_irqsave(&chan->lock, flags); out_sgs = in_sgs = 0; /* Handle out VirtIO ring buffers */ out = pack_sg_list(chan->sg, 0, VIRTQUEUE_NUM, req->tc.sdata, req->tc.size); if (out) sgs[out_sgs++] = chan->sg; in = pack_sg_list(chan->sg, out, VIRTQUEUE_NUM, req->rc.sdata, req->rc.capacity); if (in) sgs[out_sgs + in_sgs++] = chan->sg + out; err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req, GFP_ATOMIC); if (err < 0) { if (err == -ENOSPC) { chan->ring_bufs_avail = 0; spin_unlock_irqrestore(&chan->lock, flags); err = io_wait_event_killable(*chan->vc_wq, chan->ring_bufs_avail); if (err == -ERESTARTSYS) return err; p9_debug(P9_DEBUG_TRANS, "Retry virtio request\n"); goto req_retry; } else { spin_unlock_irqrestore(&chan->lock, flags); p9_debug(P9_DEBUG_TRANS, "virtio rpc add_sgs returned failure\n"); return -EIO; } } virtqueue_kick(chan->vq); spin_unlock_irqrestore(&chan->lock, flags); p9_debug(P9_DEBUG_TRANS, "virtio request kicked\n"); return 0; } static int p9_get_mapped_pages(struct virtio_chan *chan, struct page ***pages, struct iov_iter *data, int count, size_t *offs, int *need_drop) { int nr_pages; int err; if (!iov_iter_count(data)) return 0; if (!iov_iter_is_kvec(data)) { int n; /* * We allow only p9_max_pages pinned. We wait for the * Other zc request to finish here */ if (atomic_read(&vp_pinned) >= chan->p9_max_pages) { err = io_wait_event_killable(vp_wq, (atomic_read(&vp_pinned) < chan->p9_max_pages)); if (err == -ERESTARTSYS) return err; } n = iov_iter_get_pages_alloc2(data, pages, count, offs); if (n < 0) return n; *need_drop = 1; nr_pages = DIV_ROUND_UP(n + *offs, PAGE_SIZE); atomic_add(nr_pages, &vp_pinned); return n; } else { /* kernel buffer, no need to pin pages */ int index; size_t len; void *p; /* we'd already checked that it's non-empty */ while (1) { len = iov_iter_single_seg_count(data); if (likely(len)) { p = data->kvec->iov_base + data->iov_offset; break; } iov_iter_advance(data, 0); } if (len > count) len = count; nr_pages = DIV_ROUND_UP((unsigned long)p + len, PAGE_SIZE) - (unsigned long)p / PAGE_SIZE; *pages = kmalloc_objs(struct page *, nr_pages, GFP_NOFS); if (!*pages) return -ENOMEM; *need_drop = 0; p -= (*offs = offset_in_page(p)); for (index = 0; index < nr_pages; index++) { if (is_vmalloc_addr(p)) (*pages)[index] = vmalloc_to_page(p); else (*pages)[index] = kmap_to_page(p); p += PAGE_SIZE; } iov_iter_advance(data, len); return len; } } static void handle_rerror(struct p9_req_t *req, int in_hdr_len, size_t offs, struct page **pages) { unsigned size, n; void *to = req->rc.sdata + in_hdr_len; // Fits entirely into the static data? Nothing to do. if (req->rc.size < in_hdr_len || !pages) return; // Really long error message? Tough, truncate the reply. Might get // rejected (we can't be arsed to adjust the size encoded in header, // or string size for that matter), but it wouldn't be anything valid // anyway. if (unlikely(req->rc.size > P9_ZC_HDR_SZ)) req->rc.size = P9_ZC_HDR_SZ; // data won't span more than two pages size = req->rc.size - in_hdr_len; n = PAGE_SIZE - offs; if (size > n) { memcpy_from_page(to, *pages++, offs, n); offs = 0; to += n; size -= n; } memcpy_from_page(to, *pages, offs, size); } /** * p9_virtio_zc_request - issue a zero copy request * @client: client instance issuing the request * @req: request to be issued * @uidata: user buffer that should be used for zero copy read * @uodata: user buffer that should be used for zero copy write * @inlen: read buffer size * @outlen: write buffer size * @in_hdr_len: reader header size, This is the size of response protocol data * */ static int p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, struct iov_iter *uidata, struct iov_iter *uodata, int inlen, int outlen, int in_hdr_len) { int in, out, err, out_sgs, in_sgs; unsigned long flags; int in_nr_pages = 0, out_nr_pages = 0; struct page **in_pages = NULL, **out_pages = NULL; struct virtio_chan *chan = client->trans; struct scatterlist *sgs[4]; size_t offs = 0; int need_drop = 0; int kicked = 0; p9_debug(P9_DEBUG_TRANS, "virtio request\n"); if (uodata) { __le32 sz; int n = p9_get_mapped_pages(chan, &out_pages, uodata, outlen, &offs, &need_drop); if (n < 0) { err = n; goto err_out; } out_nr_pages = DIV_ROUND_UP(n + offs, PAGE_SIZE); if (n != outlen) { __le32 v = cpu_to_le32(n); memcpy(&req->tc.sdata[req->tc.size - 4], &v, 4); outlen = n; } /* The size field of the message must include the length of the * header and the length of the data. We didn't actually know * the length of the data until this point so add it in now. */ sz = cpu_to_le32(req->tc.size + outlen); memcpy(&req->tc.sdata[0], &sz, sizeof(sz)); } else if (uidata) { int n = p9_get_mapped_pages(chan, &in_pages, uidata, inlen, &offs, &need_drop); if (n < 0) { err = n; goto err_out; } in_nr_pages = DIV_ROUND_UP(n + offs, PAGE_SIZE); if (n != inlen) { __le32 v = cpu_to_le32(n); memcpy(&req->tc.sdata[req->tc.size - 4], &v, 4); inlen = n; } } WRITE_ONCE(req->status, REQ_STATUS_SENT); req_retry_pinned: spin_lock_irqsave(&chan->lock, flags); out_sgs = in_sgs = 0; /* out data */ out = pack_sg_list(chan->sg, 0, VIRTQUEUE_NUM, req->tc.sdata, req->tc.size); if (out) sgs[out_sgs++] = chan->sg; if (out_pages) { sgs[out_sgs++] = chan->sg + out; out += pack_sg_list_p(chan->sg, out, VIRTQUEUE_NUM, out_pages, out_nr_pages, offs, outlen); } /* * Take care of in data * For example TREAD have 11. * 11 is the read/write header = PDU Header(7) + IO Size (4). * Arrange in such a way that server places header in the * allocated memory and payload onto the user buffer. */ in = pack_sg_list(chan->sg, out, VIRTQUEUE_NUM, req->rc.sdata, in_hdr_len); if (in) sgs[out_sgs + in_sgs++] = chan->sg + out; if (in_pages) { sgs[out_sgs + in_sgs++] = chan->sg + out + in; pack_sg_list_p(chan->sg, out + in, VIRTQUEUE_NUM, in_pages, in_nr_pages, offs, inlen); } BUG_ON(out_sgs + in_sgs > ARRAY_SIZE(sgs)); err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req, GFP_ATOMIC); if (err < 0) { if (err == -ENOSPC) { chan->ring_bufs_avail = 0; spin_unlock_irqrestore(&chan->lock, flags); err = io_wait_event_killable(*chan->vc_wq, chan->ring_bufs_avail); if (err == -ERESTARTSYS) goto err_out; p9_debug(P9_DEBUG_TRANS, "Retry virtio request\n"); goto req_retry_pinned; } else { spin_unlock_irqrestore(&chan->lock, flags); p9_debug(P9_DEBUG_TRANS, "virtio rpc add_sgs returned failure\n"); err = -EIO; goto err_out; } } virtqueue_kick(chan->vq); spin_unlock_irqrestore(&chan->lock, flags); kicked = 1; p9_debug(P9_DEBUG_TRANS, "virtio request kicked\n"); err = io_wait_event_killable(req->wq, READ_ONCE(req->status) >= REQ_STATUS_RCVD); // RERROR needs reply (== error string) in static data if (READ_ONCE(req->status) == REQ_STATUS_RCVD && unlikely(req->rc.sdata[4] == P9_RERROR)) handle_rerror(req, in_hdr_len, offs, in_pages); /* * Non kernel buffers are pinned, unpin them */ err_out: if (need_drop) { if (in_pages) { p9_release_pages(in_pages, in_nr_pages); atomic_sub(in_nr_pages, &vp_pinned); } if (out_pages) { p9_release_pages(out_pages, out_nr_pages); atomic_sub(out_nr_pages, &vp_pinned); } /* wakeup anybody waiting for slots to pin pages */ wake_up(&vp_wq); } kvfree(in_pages); kvfree(out_pages); if (!kicked) { /* reply won't come */ p9_req_put(client, req); } return err; } static ssize_t p9_mount_tag_show(struct device *dev, struct device_attribute *attr, char *buf) { struct virtio_chan *chan; struct virtio_device *vdev; int tag_len; vdev = dev_to_virtio(dev); chan = vdev->priv; tag_len = strlen(chan->tag); memcpy(buf, chan->tag, tag_len + 1); return tag_len + 1; } static DEVICE_ATTR(mount_tag, 0444, p9_mount_tag_show, NULL); /** * p9_virtio_probe - probe for existence of 9P virtio channels * @vdev: virtio device to probe * * This probes for existing virtio channels. * */ static int p9_virtio_probe(struct virtio_device *vdev) { __u16 tag_len; char *tag; int err; struct virtio_chan *chan; if (!vdev->config->get) { dev_err(&vdev->dev, "%s failure: config access disabled\n", __func__); return -EINVAL; } chan = kmalloc_obj(struct virtio_chan); if (!chan) { pr_err("Failed to allocate virtio 9P channel\n"); err = -ENOMEM; goto fail; } chan->vdev = vdev; /* We expect one virtqueue, for requests. */ chan->vq = virtio_find_single_vq(vdev, req_done, "requests"); if (IS_ERR(chan->vq)) { err = PTR_ERR(chan->vq); goto out_free_chan; } chan->vq->vdev->priv = chan; spin_lock_init(&chan->lock); sg_init_table(chan->sg, VIRTQUEUE_NUM); chan->inuse = false; if (virtio_has_feature(vdev, VIRTIO_9P_MOUNT_TAG)) { virtio_cread(vdev, struct virtio_9p_config, tag_len, &tag_len); } else { err = -EINVAL; goto out_free_vq; } tag = kzalloc(tag_len + 1, GFP_KERNEL); if (!tag) { err = -ENOMEM; goto out_free_vq; } virtio_cread_bytes(vdev, offsetof(struct virtio_9p_config, tag), tag, tag_len); chan->tag = tag; err = sysfs_create_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr); if (err) { goto out_free_tag; } chan->vc_wq = kmalloc_obj(wait_queue_head_t); if (!chan->vc_wq) { err = -ENOMEM; goto out_remove_file; } init_waitqueue_head(chan->vc_wq); chan->ring_bufs_avail = 1; /* Ceiling limit to avoid denial of service attacks */ chan->p9_max_pages = nr_free_buffer_pages()/4; virtio_device_ready(vdev); mutex_lock(&virtio_9p_lock); list_add_tail(&chan->chan_list, &virtio_chan_list); mutex_unlock(&virtio_9p_lock); /* Let udev rules use the new mount_tag attribute. */ kobject_uevent(&(vdev->dev.kobj), KOBJ_CHANGE); return 0; out_remove_file: sysfs_remove_file(&vdev->dev.kobj, &dev_attr_mount_tag.attr); out_free_tag: kfree(tag); out_free_vq: vdev->config->del_vqs(vdev); out_free_chan: kfree(chan); fail: return err; } /** * p9_virtio_create - allocate a new virtio channel * @client: client instance invoking this transport * @fc: the filesystem context * * This sets up a transport channel for 9p communication. Right now * we only match the first available channel, but eventually we could look up * alternate channels by matching devname versus a virtio_config entry. * We use a simple reference count mechanism to ensure that only a single * mount has a channel open at a time. * */ static int p9_virtio_create(struct p9_client *client, struct fs_context *fc) { const char *devname = fc->source; struct virtio_chan *chan; int ret = -ENOENT; int found = 0; if (devname == NULL) return -EINVAL; mutex_lock(&virtio_9p_lock); list_for_each_entry(chan, &virtio_chan_list, chan_list) { if (!strcmp(devname, chan->tag)) { if (!chan->inuse) { chan->inuse = true; found = 1; break; } ret = -EBUSY; } } mutex_unlock(&virtio_9p_lock); if (!found) { pr_err("no channels available for device %s\n", devname); return ret; } client->trans = (void *)chan; client->status = Connected; chan->client = client; return 0; } /** * p9_virtio_remove - clean up resources associated with a virtio device * @vdev: virtio device to remove * */ static void p9_virtio_remove(struct virtio_device *vdev) { struct virtio_chan *chan = vdev->priv; unsigned long warning_time; mutex_lock(&virtio_9p_lock); /* Remove self from list so we don't get new users. */ list_del(&chan->chan_list); warning_time = jiffies; /* Wait for existing users to close. */ while (chan->inuse) { mutex_unlock(&virtio_9p_lock); msleep(250); if (time_after(jiffies, warning_time + 10 * HZ)) { dev_emerg(&vdev->dev, "p9_virtio_remove: waiting for device in use.\n"); warning_time = jiffies; } mutex_lock(&virtio_9p_lock); } mutex_unlock(&virtio_9p_lock); virtio_reset_device(vdev); vdev->config->del_vqs(vdev); sysfs_remove_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr); kobject_uevent(&(vdev->dev.kobj), KOBJ_CHANGE); kfree(chan->tag); kfree(chan->vc_wq); kfree(chan); } static struct virtio_device_id id_table[] = { { VIRTIO_ID_9P, VIRTIO_DEV_ANY_ID }, { 0 }, }; static unsigned int features[] = { VIRTIO_9P_MOUNT_TAG, }; /* The standard "struct lguest_driver": */ static struct virtio_driver p9_virtio_drv = { .feature_table = features, .feature_table_size = ARRAY_SIZE(features), .driver.name = KBUILD_MODNAME, .id_table = id_table, .probe = p9_virtio_probe, .remove = p9_virtio_remove, }; static struct p9_trans_module p9_virtio_trans = { .name = "virtio", .create = p9_virtio_create, .close = p9_virtio_close, .request = p9_virtio_request, .zc_request = p9_virtio_zc_request, .cancel = p9_virtio_cancel, .cancelled = p9_virtio_cancelled, /* * We leave one entry for input and one entry for response * headers. We also skip one more entry to accommodate, address * that are not at page boundary, that can result in an extra * page in zero copy. */ .maxsize = PAGE_SIZE * (VIRTQUEUE_NUM - 3), .pooled_rbuffers = false, .def = true, .supports_vmalloc = false, .owner = THIS_MODULE, }; /* The standard init function */ static int __init p9_virtio_init(void) { int rc; INIT_LIST_HEAD(&virtio_chan_list); v9fs_register_trans(&p9_virtio_trans); rc = register_virtio_driver(&p9_virtio_drv); if (rc) v9fs_unregister_trans(&p9_virtio_trans); return rc; } static void __exit p9_virtio_cleanup(void) { unregister_virtio_driver(&p9_virtio_drv); v9fs_unregister_trans(&p9_virtio_trans); } module_init(p9_virtio_init); module_exit(p9_virtio_cleanup); MODULE_ALIAS_9P("virtio"); MODULE_DEVICE_TABLE(virtio, id_table); MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>"); MODULE_DESCRIPTION("Virtio 9p Transport"); MODULE_LICENSE("GPL");
26 1 1 1 13 13 13 3 4 5 1 1 1 1 1 13 13 13 13 13 13 13 1727 5 1729 1728 16 13 13 13 13 1 1 1 1 1 1 1 3 1 1 2 1 1 1 1 1 97 97 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 // SPDX-License-Identifier: GPL-2.0-only /* * CAIF Interface registration. * Copyright (C) ST-Ericsson AB 2010 * Author: Sjur Brendeland * * Borrowed heavily from file: pn_dev.c. Thanks to Remi Denis-Courmont * and Sakari Ailus <sakari.ailus@nokia.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/kernel.h> #include <linux/if_arp.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/mutex.h> #include <linux/module.h> #include <linux/spinlock.h> #include <net/netns/generic.h> #include <net/net_namespace.h> #include <net/pkt_sched.h> #include <net/caif/caif_device.h> #include <net/caif/caif_layer.h> #include <net/caif/caif_dev.h> #include <net/caif/cfpkt.h> #include <net/caif/cfcnfg.h> #include <net/caif/cfserl.h> MODULE_DESCRIPTION("ST-Ericsson CAIF modem protocol support"); MODULE_LICENSE("GPL"); /* Used for local tracking of the CAIF net devices */ struct caif_device_entry { struct cflayer layer; struct list_head list; struct net_device *netdev; int __percpu *pcpu_refcnt; spinlock_t flow_lock; struct sk_buff *xoff_skb; void (*xoff_skb_dtor)(struct sk_buff *skb); bool xoff; }; struct caif_device_entry_list { struct list_head list; /* Protects simulanous deletes in list */ struct mutex lock; }; struct caif_net { struct cfcnfg *cfg; struct caif_device_entry_list caifdevs; }; static unsigned int caif_net_id; static int q_high = 50; /* Percent */ struct cfcnfg *get_cfcnfg(struct net *net) { struct caif_net *caifn; caifn = net_generic(net, caif_net_id); return caifn->cfg; } EXPORT_SYMBOL(get_cfcnfg); static struct caif_device_entry_list *caif_device_list(struct net *net) { struct caif_net *caifn; caifn = net_generic(net, caif_net_id); return &caifn->caifdevs; } static void caifd_put(struct caif_device_entry *e) { this_cpu_dec(*e->pcpu_refcnt); } static void caifd_hold(struct caif_device_entry *e) { this_cpu_inc(*e->pcpu_refcnt); } static int caifd_refcnt_read(struct caif_device_entry *e) { int i, refcnt = 0; for_each_possible_cpu(i) refcnt += *per_cpu_ptr(e->pcpu_refcnt, i); return refcnt; } /* Allocate new CAIF device. */ static struct caif_device_entry *caif_device_alloc(struct net_device *dev) { struct caif_device_entry *caifd; caifd = kzalloc_obj(*caifd); if (!caifd) return NULL; caifd->pcpu_refcnt = alloc_percpu(int); if (!caifd->pcpu_refcnt) { kfree(caifd); return NULL; } caifd->netdev = dev; dev_hold(dev); return caifd; } static struct caif_device_entry *caif_get(struct net_device *dev) { struct caif_device_entry_list *caifdevs = caif_device_list(dev_net(dev)); struct caif_device_entry *caifd; list_for_each_entry_rcu(caifd, &caifdevs->list, list, lockdep_rtnl_is_held()) { if (caifd->netdev == dev) return caifd; } return NULL; } static void caif_flow_cb(struct sk_buff *skb) { struct caif_device_entry *caifd; void (*dtor)(struct sk_buff *skb) = NULL; bool send_xoff; WARN_ON(skb->dev == NULL); rcu_read_lock(); caifd = caif_get(skb->dev); WARN_ON(caifd == NULL); if (!caifd) { rcu_read_unlock(); return; } caifd_hold(caifd); rcu_read_unlock(); spin_lock_bh(&caifd->flow_lock); send_xoff = caifd->xoff; caifd->xoff = false; dtor = caifd->xoff_skb_dtor; if (WARN_ON(caifd->xoff_skb != skb)) skb = NULL; caifd->xoff_skb = NULL; caifd->xoff_skb_dtor = NULL; spin_unlock_bh(&caifd->flow_lock); if (dtor && skb) dtor(skb); if (send_xoff) caifd->layer.up-> ctrlcmd(caifd->layer.up, _CAIF_CTRLCMD_PHYIF_FLOW_ON_IND, caifd->layer.id); caifd_put(caifd); } static int transmit(struct cflayer *layer, struct cfpkt *pkt) { int err, high = 0, qlen = 0; struct caif_device_entry *caifd = container_of(layer, struct caif_device_entry, layer); struct sk_buff *skb; struct netdev_queue *txq; rcu_read_lock_bh(); skb = cfpkt_tonative(pkt); skb->dev = caifd->netdev; skb_reset_network_header(skb); skb->protocol = htons(ETH_P_CAIF); /* Check if we need to handle xoff */ if (likely(caifd->netdev->priv_flags & IFF_NO_QUEUE)) goto noxoff; if (unlikely(caifd->xoff)) goto noxoff; if (likely(!netif_queue_stopped(caifd->netdev))) { struct Qdisc *sch; /* If we run with a TX queue, check if the queue is too long*/ txq = netdev_get_tx_queue(skb->dev, 0); sch = rcu_dereference_bh(txq->qdisc); if (likely(qdisc_is_empty(sch))) goto noxoff; /* can check for explicit qdisc len value only !NOLOCK, * always set flow off otherwise */ high = (caifd->netdev->tx_queue_len * q_high) / 100; if (!(sch->flags & TCQ_F_NOLOCK) && likely(sch->q.qlen < high)) goto noxoff; } /* Hold lock while accessing xoff */ spin_lock_bh(&caifd->flow_lock); if (caifd->xoff) { spin_unlock_bh(&caifd->flow_lock); goto noxoff; } /* * Handle flow off, we do this by temporary hi-jacking this * skb's destructor function, and replace it with our own * flow-on callback. The callback will set flow-on and call * the original destructor. */ pr_debug("queue has stopped(%d) or is full (%d > %d)\n", netif_queue_stopped(caifd->netdev), qlen, high); caifd->xoff = true; caifd->xoff_skb = skb; caifd->xoff_skb_dtor = skb->destructor; skb->destructor = caif_flow_cb; spin_unlock_bh(&caifd->flow_lock); caifd->layer.up->ctrlcmd(caifd->layer.up, _CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND, caifd->layer.id); noxoff: rcu_read_unlock_bh(); err = dev_queue_xmit(skb); if (err > 0) err = -EIO; return err; } /* * Stuff received packets into the CAIF stack. * On error, returns non-zero and releases the skb. */ static int receive(struct sk_buff *skb, struct net_device *dev, struct packet_type *pkttype, struct net_device *orig_dev) { struct cfpkt *pkt; struct caif_device_entry *caifd; int err; pkt = cfpkt_fromnative(CAIF_DIR_IN, skb); rcu_read_lock(); caifd = caif_get(dev); if (!caifd || !caifd->layer.up || !caifd->layer.up->receive || !netif_oper_up(caifd->netdev)) { rcu_read_unlock(); kfree_skb(skb); return NET_RX_DROP; } /* Hold reference to netdevice while using CAIF stack */ caifd_hold(caifd); rcu_read_unlock(); err = caifd->layer.up->receive(caifd->layer.up, pkt); /* For -EILSEQ the packet is not freed so free it now */ if (err == -EILSEQ) cfpkt_destroy(pkt); /* Release reference to stack upwards */ caifd_put(caifd); if (err != 0) err = NET_RX_DROP; return err; } static struct packet_type caif_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_CAIF), .func = receive, }; static void dev_flowctrl(struct net_device *dev, int on) { struct caif_device_entry *caifd; rcu_read_lock(); caifd = caif_get(dev); if (!caifd || !caifd->layer.up || !caifd->layer.up->ctrlcmd) { rcu_read_unlock(); return; } caifd_hold(caifd); rcu_read_unlock(); caifd->layer.up->ctrlcmd(caifd->layer.up, on ? _CAIF_CTRLCMD_PHYIF_FLOW_ON_IND : _CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND, caifd->layer.id); caifd_put(caifd); } int caif_enroll_dev(struct net_device *dev, struct caif_dev_common *caifdev, struct cflayer *link_support, int head_room, struct cflayer **layer, int (**rcv_func)(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *)) { struct caif_device_entry *caifd; enum cfcnfg_phy_preference pref; struct cfcnfg *cfg = get_cfcnfg(dev_net(dev)); struct caif_device_entry_list *caifdevs; int res; caifdevs = caif_device_list(dev_net(dev)); caifd = caif_device_alloc(dev); if (!caifd) return -ENOMEM; *layer = &caifd->layer; spin_lock_init(&caifd->flow_lock); switch (caifdev->link_select) { case CAIF_LINK_HIGH_BANDW: pref = CFPHYPREF_HIGH_BW; break; case CAIF_LINK_LOW_LATENCY: pref = CFPHYPREF_LOW_LAT; break; default: pref = CFPHYPREF_HIGH_BW; break; } mutex_lock(&caifdevs->lock); list_add_rcu(&caifd->list, &caifdevs->list); strscpy(caifd->layer.name, dev->name, sizeof(caifd->layer.name)); caifd->layer.transmit = transmit; res = cfcnfg_add_phy_layer(cfg, dev, &caifd->layer, pref, link_support, caifdev->use_fcs, head_room); mutex_unlock(&caifdevs->lock); if (rcv_func) *rcv_func = receive; return res; } EXPORT_SYMBOL(caif_enroll_dev); /* notify Caif of device events */ static int caif_device_notify(struct notifier_block *me, unsigned long what, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct caif_device_entry *caifd = NULL; struct caif_dev_common *caifdev; struct cfcnfg *cfg; struct cflayer *layer, *link_support; int head_room = 0; struct caif_device_entry_list *caifdevs; int res; cfg = get_cfcnfg(dev_net(dev)); caifdevs = caif_device_list(dev_net(dev)); caifd = caif_get(dev); if (caifd == NULL && dev->type != ARPHRD_CAIF) return 0; switch (what) { case NETDEV_REGISTER: if (caifd != NULL) break; caifdev = netdev_priv(dev); link_support = NULL; if (caifdev->use_frag) { head_room = 1; link_support = cfserl_create(dev->ifindex, caifdev->use_stx); if (!link_support) { pr_warn("Out of memory\n"); break; } } res = caif_enroll_dev(dev, caifdev, link_support, head_room, &layer, NULL); if (res) cfserl_release(link_support); caifdev->flowctrl = dev_flowctrl; break; case NETDEV_UP: rcu_read_lock(); caifd = caif_get(dev); if (caifd == NULL) { rcu_read_unlock(); break; } caifd->xoff = false; cfcnfg_set_phy_state(cfg, &caifd->layer, true); rcu_read_unlock(); break; case NETDEV_DOWN: rcu_read_lock(); caifd = caif_get(dev); if (!caifd || !caifd->layer.up || !caifd->layer.up->ctrlcmd) { rcu_read_unlock(); return -EINVAL; } cfcnfg_set_phy_state(cfg, &caifd->layer, false); caifd_hold(caifd); rcu_read_unlock(); caifd->layer.up->ctrlcmd(caifd->layer.up, _CAIF_CTRLCMD_PHYIF_DOWN_IND, caifd->layer.id); spin_lock_bh(&caifd->flow_lock); /* * Replace our xoff-destructor with original destructor. * We trust that skb->destructor *always* is called before * the skb reference is invalid. The hijacked SKB destructor * takes the flow_lock so manipulating the skb->destructor here * should be safe. */ if (caifd->xoff_skb_dtor != NULL && caifd->xoff_skb != NULL) caifd->xoff_skb->destructor = caifd->xoff_skb_dtor; caifd->xoff = false; caifd->xoff_skb_dtor = NULL; caifd->xoff_skb = NULL; spin_unlock_bh(&caifd->flow_lock); caifd_put(caifd); break; case NETDEV_UNREGISTER: mutex_lock(&caifdevs->lock); caifd = caif_get(dev); if (caifd == NULL) { mutex_unlock(&caifdevs->lock); break; } list_del_rcu(&caifd->list); /* * NETDEV_UNREGISTER is called repeatedly until all reference * counts for the net-device are released. If references to * caifd is taken, simply ignore NETDEV_UNREGISTER and wait for * the next call to NETDEV_UNREGISTER. * * If any packets are in flight down the CAIF Stack, * cfcnfg_del_phy_layer will return nonzero. * If no packets are in flight, the CAIF Stack associated * with the net-device un-registering is freed. */ if (caifd_refcnt_read(caifd) != 0 || cfcnfg_del_phy_layer(cfg, &caifd->layer) != 0) { pr_info("Wait for device inuse\n"); /* Enrole device if CAIF Stack is still in use */ list_add_rcu(&caifd->list, &caifdevs->list); mutex_unlock(&caifdevs->lock); break; } synchronize_rcu(); dev_put(caifd->netdev); free_percpu(caifd->pcpu_refcnt); kfree(caifd); mutex_unlock(&caifdevs->lock); break; } return 0; } static struct notifier_block caif_device_notifier = { .notifier_call = caif_device_notify, .priority = 0, }; /* Per-namespace Caif devices handling */ static int caif_init_net(struct net *net) { struct caif_net *caifn = net_generic(net, caif_net_id); INIT_LIST_HEAD(&caifn->caifdevs.list); mutex_init(&caifn->caifdevs.lock); caifn->cfg = cfcnfg_create(); if (!caifn->cfg) return -ENOMEM; return 0; } static void caif_exit_net(struct net *net) { struct caif_device_entry *caifd, *tmp; struct caif_device_entry_list *caifdevs = caif_device_list(net); struct cfcnfg *cfg = get_cfcnfg(net); rtnl_lock(); mutex_lock(&caifdevs->lock); list_for_each_entry_safe(caifd, tmp, &caifdevs->list, list) { int i = 0; list_del_rcu(&caifd->list); cfcnfg_set_phy_state(cfg, &caifd->layer, false); while (i < 10 && (caifd_refcnt_read(caifd) != 0 || cfcnfg_del_phy_layer(cfg, &caifd->layer) != 0)) { pr_info("Wait for device inuse\n"); msleep(250); i++; } synchronize_rcu(); dev_put(caifd->netdev); free_percpu(caifd->pcpu_refcnt); kfree(caifd); } cfcnfg_remove(cfg); mutex_unlock(&caifdevs->lock); rtnl_unlock(); } static struct pernet_operations caif_net_ops = { .init = caif_init_net, .exit = caif_exit_net, .id = &caif_net_id, .size = sizeof(struct caif_net), }; /* Initialize Caif devices list */ static int __init caif_device_init(void) { int result; result = register_pernet_subsys(&caif_net_ops); if (result) return result; register_netdevice_notifier(&caif_device_notifier); dev_add_pack(&caif_packet_type); return result; } static void __exit caif_device_exit(void) { unregister_netdevice_notifier(&caif_device_notifier); dev_remove_pack(&caif_packet_type); unregister_pernet_subsys(&caif_net_ops); } module_init(caif_device_init); module_exit(caif_device_exit);
36 36 36 5 29 29 29 7 7 7 7 7 7 4 4 1 4 1 4 4 4 4 4 4 4 9 16 6 10 16 16 1 15 15 20 20 18 16 1 1 19 14 15 15 14 14 14 14 14 14 14 14 14 14 15 15 1 15 21 6 2 6 21 19 19 19 19 18 18 5 18 16 3 3 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved * Copyright 2005-2006 Ian Kent <raven@themaw.net> */ #include <linux/seq_file.h> #include <linux/pagemap.h> #include "autofs_i.h" struct autofs_info *autofs_new_ino(struct autofs_sb_info *sbi) { struct autofs_info *ino; ino = kzalloc_obj(*ino); if (ino) { INIT_LIST_HEAD(&ino->active); INIT_LIST_HEAD(&ino->expiring); ino->last_used = jiffies; ino->sbi = sbi; ino->exp_timeout = -1; ino->count = 1; } return ino; } void autofs_clean_ino(struct autofs_info *ino) { ino->uid = GLOBAL_ROOT_UID; ino->gid = GLOBAL_ROOT_GID; ino->exp_timeout = -1; ino->last_used = jiffies; } void autofs_free_ino(struct autofs_info *ino) { kfree_rcu(ino, rcu); } void autofs_kill_sb(struct super_block *sb) { struct autofs_sb_info *sbi = autofs_sbi(sb); /* * In the event of a failure in get_sb_nodev the superblock * info is not present so nothing else has been setup, so * just call kill_anon_super when we are called from * deactivate_super. */ if (sbi) { /* Free wait queues, close pipe */ autofs_catatonic_mode(sbi); put_pid(sbi->oz_pgrp); } pr_debug("shutting down\n"); kill_anon_super(sb); if (sbi) kfree_rcu(sbi, rcu); } static int autofs_show_options(struct seq_file *m, struct dentry *root) { struct autofs_sb_info *sbi = autofs_sbi(root->d_sb); struct inode *root_inode = d_inode(root->d_sb->s_root); if (!sbi) return 0; seq_printf(m, ",fd=%d", sbi->pipefd); if (!uid_eq(root_inode->i_uid, GLOBAL_ROOT_UID)) seq_printf(m, ",uid=%u", from_kuid_munged(&init_user_ns, root_inode->i_uid)); if (!gid_eq(root_inode->i_gid, GLOBAL_ROOT_GID)) seq_printf(m, ",gid=%u", from_kgid_munged(&init_user_ns, root_inode->i_gid)); seq_printf(m, ",pgrp=%d", pid_vnr(sbi->oz_pgrp)); seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ); seq_printf(m, ",minproto=%d", sbi->min_proto); seq_printf(m, ",maxproto=%d", sbi->max_proto); if (autofs_type_offset(sbi->type)) seq_puts(m, ",offset"); else if (autofs_type_direct(sbi->type)) seq_puts(m, ",direct"); else seq_puts(m, ",indirect"); if (sbi->flags & AUTOFS_SBI_STRICTEXPIRE) seq_puts(m, ",strictexpire"); if (sbi->flags & AUTOFS_SBI_IGNORE) seq_puts(m, ",ignore"); #ifdef CONFIG_CHECKPOINT_RESTORE if (sbi->pipe) seq_printf(m, ",pipe_ino=%llu", file_inode(sbi->pipe)->i_ino); else seq_puts(m, ",pipe_ino=-1"); #endif return 0; } static void autofs_evict_inode(struct inode *inode) { clear_inode(inode); kfree(inode->i_private); } static const struct super_operations autofs_sops = { .statfs = simple_statfs, .show_options = autofs_show_options, .evict_inode = autofs_evict_inode, }; enum { Opt_direct, Opt_fd, Opt_gid, Opt_ignore, Opt_indirect, Opt_maxproto, Opt_minproto, Opt_offset, Opt_pgrp, Opt_strictexpire, Opt_uid, }; const struct fs_parameter_spec autofs_param_specs[] = { fsparam_flag ("direct", Opt_direct), fsparam_fd ("fd", Opt_fd), fsparam_gid ("gid", Opt_gid), fsparam_flag ("ignore", Opt_ignore), fsparam_flag ("indirect", Opt_indirect), fsparam_u32 ("maxproto", Opt_maxproto), fsparam_u32 ("minproto", Opt_minproto), fsparam_flag ("offset", Opt_offset), fsparam_u32 ("pgrp", Opt_pgrp), fsparam_flag ("strictexpire", Opt_strictexpire), fsparam_uid ("uid", Opt_uid), {} }; struct autofs_fs_context { kuid_t uid; kgid_t gid; int pgrp; bool pgrp_set; }; /* * Open the fd. We do it here rather than in get_tree so that it's done in the * context of the system call that passed the data and not the one that * triggered the superblock creation, lest the fd gets reassigned. */ static int autofs_parse_fd(struct fs_context *fc, struct autofs_sb_info *sbi, struct fs_parameter *param, struct fs_parse_result *result) { struct file *pipe; int ret; if (param->type == fs_value_is_file) { /* came through the new api */ pipe = param->file; param->file = NULL; } else { pipe = fget(result->uint_32); } if (!pipe) { errorf(fc, "could not open pipe file descriptor"); return -EBADF; } ret = autofs_check_pipe(pipe); if (ret < 0) { errorf(fc, "Invalid/unusable pipe"); fput(pipe); return -EBADF; } autofs_set_packet_pipe_flags(pipe); if (sbi->pipe) fput(sbi->pipe); sbi->pipefd = result->uint_32; sbi->pipe = pipe; return 0; } static int autofs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct autofs_fs_context *ctx = fc->fs_private; struct autofs_sb_info *sbi = fc->s_fs_info; struct fs_parse_result result; int opt; opt = fs_parse(fc, autofs_param_specs, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_fd: return autofs_parse_fd(fc, sbi, param, &result); case Opt_uid: ctx->uid = result.uid; break; case Opt_gid: ctx->gid = result.gid; break; case Opt_pgrp: ctx->pgrp = result.uint_32; ctx->pgrp_set = true; break; case Opt_minproto: sbi->min_proto = result.uint_32; break; case Opt_maxproto: sbi->max_proto = result.uint_32; break; case Opt_indirect: set_autofs_type_indirect(&sbi->type); break; case Opt_direct: set_autofs_type_direct(&sbi->type); break; case Opt_offset: set_autofs_type_offset(&sbi->type); break; case Opt_strictexpire: sbi->flags |= AUTOFS_SBI_STRICTEXPIRE; break; case Opt_ignore: sbi->flags |= AUTOFS_SBI_IGNORE; } return 0; } static struct autofs_sb_info *autofs_alloc_sbi(void) { struct autofs_sb_info *sbi; sbi = kzalloc_obj(*sbi); if (!sbi) return NULL; sbi->magic = AUTOFS_SBI_MAGIC; sbi->flags = AUTOFS_SBI_CATATONIC; sbi->min_proto = AUTOFS_MIN_PROTO_VERSION; sbi->max_proto = AUTOFS_MAX_PROTO_VERSION; sbi->pipefd = -1; sbi->mnt_ns_id = to_ns_common(current->nsproxy->mnt_ns)->ns_id; set_autofs_type_indirect(&sbi->type); mutex_init(&sbi->wq_mutex); mutex_init(&sbi->pipe_mutex); spin_lock_init(&sbi->fs_lock); spin_lock_init(&sbi->lookup_lock); INIT_LIST_HEAD(&sbi->active_list); INIT_LIST_HEAD(&sbi->expiring_list); return sbi; } static int autofs_validate_protocol(struct fs_context *fc) { struct autofs_sb_info *sbi = fc->s_fs_info; /* Test versions first */ if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION || sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) { errorf(fc, "kernel does not match daemon version " "daemon (%d, %d) kernel (%d, %d)\n", sbi->min_proto, sbi->max_proto, AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION); return -EINVAL; } /* Establish highest kernel protocol version */ if (sbi->max_proto > AUTOFS_MAX_PROTO_VERSION) sbi->version = AUTOFS_MAX_PROTO_VERSION; else sbi->version = sbi->max_proto; switch (sbi->version) { case 4: sbi->sub_version = 7; break; case 5: sbi->sub_version = AUTOFS_PROTO_SUBVERSION; break; default: sbi->sub_version = 0; } return 0; } static int autofs_fill_super(struct super_block *s, struct fs_context *fc) { struct autofs_fs_context *ctx = fc->fs_private; struct autofs_sb_info *sbi = s->s_fs_info; struct inode *root_inode; struct autofs_info *ino; pr_debug("starting up, sbi = %p\n", sbi); sbi->sb = s; s->s_blocksize = 1024; s->s_blocksize_bits = 10; s->s_magic = AUTOFS_SUPER_MAGIC; s->s_op = &autofs_sops; set_default_d_op(s, &autofs_dentry_operations); s->s_time_gran = 1; /* * Get the root inode and dentry, but defer checking for errors. */ ino = autofs_new_ino(sbi); if (!ino) return -ENOMEM; root_inode = autofs_get_inode(s, S_IFDIR | 0755); if (!root_inode) return -ENOMEM; root_inode->i_uid = ctx->uid; root_inode->i_gid = ctx->gid; root_inode->i_fop = &autofs_root_operations; root_inode->i_op = &autofs_dir_inode_operations; s->s_root = d_make_root(root_inode); if (unlikely(!s->s_root)) { autofs_free_ino(ino); return -ENOMEM; } s->s_root->d_fsdata = ino; if (ctx->pgrp_set) { sbi->oz_pgrp = find_get_pid(ctx->pgrp); if (!sbi->oz_pgrp) return invalf(fc, "Could not find process group %d", ctx->pgrp); } else sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID); if (autofs_type_trigger(sbi->type)) /* s->s_root won't be contended so there's little to * be gained by not taking the d_lock when setting * d_flags, even when a lot mounts are being done. */ managed_dentry_set_managed(s->s_root); pr_debug("pipe fd = %d, pgrp = %u\n", sbi->pipefd, pid_nr(sbi->oz_pgrp)); sbi->flags &= ~AUTOFS_SBI_CATATONIC; return 0; } /* * Validate the parameters and then request a superblock. */ static int autofs_get_tree(struct fs_context *fc) { struct autofs_sb_info *sbi = fc->s_fs_info; int ret; ret = autofs_validate_protocol(fc); if (ret) return ret; if (sbi->pipefd < 0) return invalf(fc, "No control pipe specified"); return get_tree_nodev(fc, autofs_fill_super); } static void autofs_free_fc(struct fs_context *fc) { struct autofs_fs_context *ctx = fc->fs_private; struct autofs_sb_info *sbi = fc->s_fs_info; if (sbi) { if (sbi->pipe) fput(sbi->pipe); kfree(sbi); } kfree(ctx); } static const struct fs_context_operations autofs_context_ops = { .free = autofs_free_fc, .parse_param = autofs_parse_param, .get_tree = autofs_get_tree, }; /* * Set up the filesystem mount context. */ int autofs_init_fs_context(struct fs_context *fc) { struct autofs_fs_context *ctx; struct autofs_sb_info *sbi; ctx = kzalloc_obj(struct autofs_fs_context); if (!ctx) goto nomem; ctx->uid = current_uid(); ctx->gid = current_gid(); sbi = autofs_alloc_sbi(); if (!sbi) goto nomem_ctx; fc->fs_private = ctx; fc->s_fs_info = sbi; fc->ops = &autofs_context_ops; return 0; nomem_ctx: kfree(ctx); nomem: return -ENOMEM; } struct inode *autofs_get_inode(struct super_block *sb, umode_t mode) { struct inode *inode = new_inode(sb); if (inode == NULL) return NULL; inode->i_mode = mode; if (sb->s_root) { inode->i_uid = d_inode(sb->s_root)->i_uid; inode->i_gid = d_inode(sb->s_root)->i_gid; } simple_inode_init_ts(inode); inode->i_ino = get_next_ino(); if (S_ISDIR(mode)) { set_nlink(inode, 2); inode->i_op = &autofs_dir_inode_operations; inode->i_fop = &autofs_dir_operations; } else if (S_ISLNK(mode)) { inode->i_op = &autofs_symlink_inode_operations; } else WARN_ON(1); return inode; }
45 38 14 10 41 41 41 41 44 38 35 35 38 48 70 74 70 53 54 53 53 5 3 3 9 6 6 1 3 3 4 2 2 2 2 2 82 82 78 4 107 107 106 105 105 107 4 24 8 8 8 2 7 6 6 3 7 24 2 2 1 1 1 5 9 5 5 6 10 6 6 101 99 99 99 99 99 17 82 99 99 99 99 102 20 2 2 2 21 3 4 3 2 1 11 13 4 3 6 5 172 4 5 3 2 8 1 156 113 10 82 29 26 5 28 29 29 1 83 83 83 83 2 82 82 82 82 82 82 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds * * Added support for a Unix98-style ptmx device. * -- C. Scott Ananian <cananian@alumni.princeton.edu>, 14-Jan-1998 * */ #include <linux/module.h> #include <linux/errno.h> #include <linux/interrupt.h> #include <linux/tty.h> #include <linux/tty_flip.h> #include <linux/fcntl.h> #include <linux/sched/signal.h> #include <linux/string.h> #include <linux/major.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/device.h> #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/devpts_fs.h> #include <linux/slab.h> #include <linux/mutex.h> #include <linux/poll.h> #include <linux/mount.h> #include <linux/file.h> #include <linux/ioctl.h> #include <linux/compat.h> #include "tty.h" #undef TTY_DEBUG_HANGUP #ifdef TTY_DEBUG_HANGUP # define tty_debug_hangup(tty, f, args...) tty_debug(tty, f, ##args) #else # define tty_debug_hangup(tty, f, args...) do {} while (0) #endif #ifdef CONFIG_UNIX98_PTYS static struct tty_driver *ptm_driver; static struct tty_driver *pts_driver; static DEFINE_MUTEX(devpts_mutex); #endif static void pty_close(struct tty_struct *tty, struct file *filp) { if (tty->driver->subtype == PTY_TYPE_MASTER) WARN_ON(tty->count > 1); else { if (tty_io_error(tty)) return; if (tty->count > 2) return; } set_bit(TTY_IO_ERROR, &tty->flags); wake_up_interruptible(&tty->read_wait); wake_up_interruptible(&tty->write_wait); scoped_guard(spinlock_irq, &tty->ctrl.lock) tty->ctrl.packet = false; /* Review - krefs on tty_link ?? */ if (!tty->link) return; set_bit(TTY_OTHER_CLOSED, &tty->link->flags); wake_up_interruptible(&tty->link->read_wait); wake_up_interruptible(&tty->link->write_wait); if (tty->driver->subtype == PTY_TYPE_MASTER) { set_bit(TTY_OTHER_CLOSED, &tty->flags); #ifdef CONFIG_UNIX98_PTYS if (tty->driver == ptm_driver) { guard(mutex)(&devpts_mutex); if (tty->link->driver_data) devpts_pty_kill(tty->link->driver_data); } #endif tty_vhangup(tty->link); } } /* * The unthrottle routine is called by the line discipline to signal * that it can receive more characters. For PTY's, the TTY_THROTTLED * flag is always set, to force the line discipline to always call the * unthrottle routine when there are fewer than TTY_THRESHOLD_UNTHROTTLE * characters in the queue. This is necessary since each time this * happens, we need to wake up any sleeping processes that could be * (1) trying to send data to the pty, or (2) waiting in wait_until_sent() * for the pty buffer to be drained. */ static void pty_unthrottle(struct tty_struct *tty) { tty_wakeup(tty->link); set_bit(TTY_THROTTLED, &tty->flags); } /** * pty_write - write to a pty * @tty: the tty we write from * @buf: kernel buffer of data * @c: bytes to write * * Our "hardware" write method. Data is coming from the ldisc which * may be in a non sleeping state. We simply throw this at the other * end of the link as if we were an IRQ handler receiving stuff for * the other side of the pty/tty pair. */ static ssize_t pty_write(struct tty_struct *tty, const u8 *buf, size_t c) { struct tty_struct *to = tty->link; if (tty->flow.stopped || !c) return 0; return tty_insert_flip_string_and_push_buffer(to->port, buf, c); } /** * pty_write_room - write space * @tty: tty we are writing from * * Report how many bytes the ldisc can send into the queue for * the other device. */ static unsigned int pty_write_room(struct tty_struct *tty) { if (tty->flow.stopped) return 0; return tty_buffer_space_avail(tty->link->port); } /* Set the lock flag on a pty */ static int pty_set_lock(struct tty_struct *tty, int __user *arg) { int val; if (get_user(val, arg)) return -EFAULT; if (val) set_bit(TTY_PTY_LOCK, &tty->flags); else clear_bit(TTY_PTY_LOCK, &tty->flags); return 0; } static int pty_get_lock(struct tty_struct *tty, int __user *arg) { int locked = test_bit(TTY_PTY_LOCK, &tty->flags); return put_user(locked, arg); } /* Set the packet mode on a pty */ static int pty_set_pktmode(struct tty_struct *tty, int __user *arg) { int want_pktmode; if (get_user(want_pktmode, arg)) return -EFAULT; guard(spinlock_irq)(&tty->ctrl.lock); if (!want_pktmode) { tty->ctrl.packet = false; return 0; } if (tty->ctrl.packet) return 0; tty->link->ctrl.pktstatus = 0; smp_mb(); tty->ctrl.packet = true; return 0; } /* Get the packet mode of a pty */ static int pty_get_pktmode(struct tty_struct *tty, int __user *arg) { int pktmode = tty->ctrl.packet; return put_user(pktmode, arg); } /* Send a signal to the slave */ static int pty_signal(struct tty_struct *tty, int sig) { struct pid *pgrp; if (sig != SIGINT && sig != SIGQUIT && sig != SIGTSTP) return -EINVAL; if (tty->link) { pgrp = tty_get_pgrp(tty->link); if (pgrp) kill_pgrp(pgrp, sig, 1); put_pid(pgrp); } return 0; } static void pty_flush_buffer(struct tty_struct *tty) { struct tty_struct *to = tty->link; if (!to) return; tty_buffer_flush(to, NULL); if (to->ctrl.packet) { guard(spinlock_irq)(&tty->ctrl.lock); tty->ctrl.pktstatus |= TIOCPKT_FLUSHWRITE; wake_up_interruptible(&to->read_wait); } } static int pty_open(struct tty_struct *tty, struct file *filp) { if (!tty || !tty->link) return -ENODEV; if (test_bit(TTY_OTHER_CLOSED, &tty->flags)) goto out; if (test_bit(TTY_PTY_LOCK, &tty->link->flags)) goto out; if (tty->driver->subtype == PTY_TYPE_SLAVE && tty->link->count != 1) goto out; clear_bit(TTY_IO_ERROR, &tty->flags); clear_bit(TTY_OTHER_CLOSED, &tty->link->flags); set_bit(TTY_THROTTLED, &tty->flags); return 0; out: set_bit(TTY_IO_ERROR, &tty->flags); return -EIO; } static void pty_set_termios(struct tty_struct *tty, const struct ktermios *old_termios) { /* See if packet mode change of state. */ if (tty->link && tty->link->ctrl.packet) { int extproc = (old_termios->c_lflag & EXTPROC) | L_EXTPROC(tty); int old_flow = ((old_termios->c_iflag & IXON) && (old_termios->c_cc[VSTOP] == '\023') && (old_termios->c_cc[VSTART] == '\021')); int new_flow = (I_IXON(tty) && STOP_CHAR(tty) == '\023' && START_CHAR(tty) == '\021'); if ((old_flow != new_flow) || extproc) { scoped_guard(spinlock_irq, &tty->ctrl.lock) { if (old_flow != new_flow) { tty->ctrl.pktstatus &= ~(TIOCPKT_DOSTOP | TIOCPKT_NOSTOP); if (new_flow) tty->ctrl.pktstatus |= TIOCPKT_DOSTOP; else tty->ctrl.pktstatus |= TIOCPKT_NOSTOP; } if (extproc) tty->ctrl.pktstatus |= TIOCPKT_IOCTL; } wake_up_interruptible(&tty->link->read_wait); } } tty->termios.c_cflag &= ~(CSIZE | PARENB); tty->termios.c_cflag |= (CS8 | CREAD); } /** * pty_resize - resize event * @tty: tty being resized * @ws: window size being set. * * Update the termios variables and send the necessary signals to * peform a terminal resize correctly */ static int pty_resize(struct tty_struct *tty, struct winsize *ws) { struct pid *pgrp, *rpgrp; struct tty_struct *pty = tty->link; /* For a PTY we need to lock the tty side */ guard(mutex)(&tty->winsize_mutex); if (!memcmp(ws, &tty->winsize, sizeof(*ws))) return 0; /* Signal the foreground process group of both ptys */ pgrp = tty_get_pgrp(tty); rpgrp = tty_get_pgrp(pty); if (pgrp) kill_pgrp(pgrp, SIGWINCH, 1); if (rpgrp != pgrp && rpgrp) kill_pgrp(rpgrp, SIGWINCH, 1); put_pid(pgrp); put_pid(rpgrp); tty->winsize = *ws; pty->winsize = *ws; /* Never used so will go away soon */ return 0; } /** * pty_start - start() handler * pty_stop - stop() handler * @tty: tty being flow-controlled * * Propagates the TIOCPKT status to the master pty. * * NB: only the master pty can be in packet mode so only the slave * needs start()/stop() handlers */ static void pty_start(struct tty_struct *tty) { if (!tty->link || !tty->link->ctrl.packet) return; scoped_guard(spinlock_irqsave, &tty->ctrl.lock) { tty->ctrl.pktstatus &= ~TIOCPKT_STOP; tty->ctrl.pktstatus |= TIOCPKT_START; } wake_up_interruptible_poll(&tty->link->read_wait, EPOLLIN); } static void pty_stop(struct tty_struct *tty) { if (!tty->link || !tty->link->ctrl.packet) return; scoped_guard(spinlock_irqsave, &tty->ctrl.lock) { tty->ctrl.pktstatus &= ~TIOCPKT_START; tty->ctrl.pktstatus |= TIOCPKT_STOP; } wake_up_interruptible_poll(&tty->link->read_wait, EPOLLIN); } /** * pty_common_install - set up the pty pair * @driver: the pty driver * @tty: the tty being instantiated * @legacy: true if this is BSD style * * Perform the initial set up for the tty/pty pair. Called from the * tty layer when the port is first opened. * * Locking: the caller must hold the tty_mutex */ static int pty_common_install(struct tty_driver *driver, struct tty_struct *tty, bool legacy) { struct tty_struct *o_tty; struct tty_port *ports[2]; int idx = tty->index; int retval = -ENOMEM; /* Opening the slave first has always returned -EIO */ if (driver->subtype != PTY_TYPE_MASTER) return -EIO; ports[0] = kmalloc_obj(**ports); ports[1] = kmalloc_obj(**ports); if (!ports[0] || !ports[1]) goto err; if (!try_module_get(driver->other->owner)) { /* This cannot in fact currently happen */ goto err; } o_tty = alloc_tty_struct(driver->other, idx); if (!o_tty) goto err_put_module; tty_set_lock_subclass(o_tty); lockdep_set_subclass(&o_tty->termios_rwsem, TTY_LOCK_SLAVE); if (legacy) { /* We always use new tty termios data so we can do this the easy way .. */ tty_init_termios(tty); tty_init_termios(o_tty); driver->other->ttys[idx] = o_tty; driver->ttys[idx] = tty; } else { memset(&tty->termios_locked, 0, sizeof(tty->termios_locked)); tty->termios = driver->init_termios; memset(&o_tty->termios_locked, 0, sizeof(tty->termios_locked)); o_tty->termios = driver->other->init_termios; } /* * Everything allocated ... set up the o_tty structure. */ tty_driver_kref_get(driver->other); /* Establish the links in both directions */ tty->link = o_tty; o_tty->link = tty; tty_port_init(ports[0]); tty_port_init(ports[1]); tty_buffer_set_limit(ports[0], 8192); tty_buffer_set_limit(ports[1], 8192); o_tty->port = ports[0]; tty->port = ports[1]; o_tty->port->itty = o_tty; tty_buffer_set_lock_subclass(o_tty->port); tty_driver_kref_get(driver); tty->count++; o_tty->count++; return 0; err_put_module: module_put(driver->other->owner); err: kfree(ports[0]); kfree(ports[1]); return retval; } static void pty_cleanup(struct tty_struct *tty) { tty_port_put(tty->port); } /* Traditional BSD devices */ #ifdef CONFIG_LEGACY_PTYS static int pty_install(struct tty_driver *driver, struct tty_struct *tty) { return pty_common_install(driver, tty, true); } static void pty_remove(struct tty_driver *driver, struct tty_struct *tty) { struct tty_struct *pair = tty->link; driver->ttys[tty->index] = NULL; if (pair) pair->driver->ttys[pair->index] = NULL; } static int pty_bsd_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { switch (cmd) { case TIOCSPTLCK: /* Set PT Lock (disallow slave open) */ return pty_set_lock(tty, (int __user *) arg); case TIOCGPTLCK: /* Get PT Lock status */ return pty_get_lock(tty, (int __user *)arg); case TIOCPKT: /* Set PT packet mode */ return pty_set_pktmode(tty, (int __user *)arg); case TIOCGPKT: /* Get PT packet mode */ return pty_get_pktmode(tty, (int __user *)arg); case TIOCSIG: /* Send signal to other side of pty */ return pty_signal(tty, (int) arg); case TIOCGPTN: /* TTY returns ENOTTY, but glibc expects EINVAL here */ return -EINVAL; } return -ENOIOCTLCMD; } #ifdef CONFIG_COMPAT static long pty_bsd_compat_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { /* * PTY ioctls don't require any special translation between 32-bit and * 64-bit userspace, they are already compatible. */ return pty_bsd_ioctl(tty, cmd, (unsigned long)compat_ptr(arg)); } #else #define pty_bsd_compat_ioctl NULL #endif static int legacy_count = CONFIG_LEGACY_PTY_COUNT; /* * not really modular, but the easiest way to keep compat with existing * bootargs behaviour is to continue using module_param here. */ module_param(legacy_count, int, 0); /* * The master side of a pty can do TIOCSPTLCK and thus * has pty_bsd_ioctl. */ static const struct tty_operations master_pty_ops_bsd = { .install = pty_install, .open = pty_open, .close = pty_close, .write = pty_write, .write_room = pty_write_room, .flush_buffer = pty_flush_buffer, .unthrottle = pty_unthrottle, .ioctl = pty_bsd_ioctl, .compat_ioctl = pty_bsd_compat_ioctl, .cleanup = pty_cleanup, .resize = pty_resize, .remove = pty_remove }; static const struct tty_operations slave_pty_ops_bsd = { .install = pty_install, .open = pty_open, .close = pty_close, .write = pty_write, .write_room = pty_write_room, .flush_buffer = pty_flush_buffer, .unthrottle = pty_unthrottle, .set_termios = pty_set_termios, .cleanup = pty_cleanup, .resize = pty_resize, .start = pty_start, .stop = pty_stop, .remove = pty_remove }; static void __init legacy_pty_init(void) { struct tty_driver *pty_driver, *pty_slave_driver; if (legacy_count <= 0) return; pty_driver = tty_alloc_driver(legacy_count, TTY_DRIVER_RESET_TERMIOS | TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_ALLOC); if (IS_ERR(pty_driver)) panic("Couldn't allocate pty driver"); pty_slave_driver = tty_alloc_driver(legacy_count, TTY_DRIVER_RESET_TERMIOS | TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_ALLOC); if (IS_ERR(pty_slave_driver)) panic("Couldn't allocate pty slave driver"); pty_driver->driver_name = "pty_master"; pty_driver->name = "pty"; pty_driver->major = PTY_MASTER_MAJOR; pty_driver->minor_start = 0; pty_driver->type = TTY_DRIVER_TYPE_PTY; pty_driver->subtype = PTY_TYPE_MASTER; pty_driver->init_termios = tty_std_termios; pty_driver->init_termios.c_iflag = 0; pty_driver->init_termios.c_oflag = 0; pty_driver->init_termios.c_cflag = B38400 | CS8 | CREAD; pty_driver->init_termios.c_lflag = 0; pty_driver->init_termios.c_ispeed = 38400; pty_driver->init_termios.c_ospeed = 38400; pty_driver->other = pty_slave_driver; tty_set_operations(pty_driver, &master_pty_ops_bsd); pty_slave_driver->driver_name = "pty_slave"; pty_slave_driver->name = "ttyp"; pty_slave_driver->major = PTY_SLAVE_MAJOR; pty_slave_driver->minor_start = 0; pty_slave_driver->type = TTY_DRIVER_TYPE_PTY; pty_slave_driver->subtype = PTY_TYPE_SLAVE; pty_slave_driver->init_termios = tty_std_termios; pty_slave_driver->init_termios.c_cflag = B38400 | CS8 | CREAD; pty_slave_driver->init_termios.c_ispeed = 38400; pty_slave_driver->init_termios.c_ospeed = 38400; pty_slave_driver->other = pty_driver; tty_set_operations(pty_slave_driver, &slave_pty_ops_bsd); if (tty_register_driver(pty_driver)) panic("Couldn't register pty driver"); if (tty_register_driver(pty_slave_driver)) panic("Couldn't register pty slave driver"); } #else static inline void legacy_pty_init(void) { } #endif /* Unix98 devices */ #ifdef CONFIG_UNIX98_PTYS static struct cdev ptmx_cdev; static struct file *ptm_open_peer_file(struct file *master, struct tty_struct *tty, int flags) { struct path path; struct file *file; /* Compute the slave's path */ path.mnt = devpts_mntget(master, tty->driver_data); if (IS_ERR(path.mnt)) return ERR_CAST(path.mnt); path.dentry = tty->link->driver_data; file = dentry_open(&path, flags, current_cred()); mntput(path.mnt); return file; } /** * ptm_open_peer - open the peer of a pty * @master: the open struct file of the ptmx device node * @tty: the master of the pty being opened * @flags: the flags for open * * Provide a race free way for userspace to open the slave end of a pty * (where they have the master fd and cannot access or trust the mount * namespace /dev/pts was mounted inside). */ int ptm_open_peer(struct file *master, struct tty_struct *tty, int flags) { if (tty->driver != ptm_driver) return -EIO; return FD_ADD(flags, ptm_open_peer_file(master, tty, flags)); } static int pty_unix98_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { switch (cmd) { case TIOCSPTLCK: /* Set PT Lock (disallow slave open) */ return pty_set_lock(tty, (int __user *)arg); case TIOCGPTLCK: /* Get PT Lock status */ return pty_get_lock(tty, (int __user *)arg); case TIOCPKT: /* Set PT packet mode */ return pty_set_pktmode(tty, (int __user *)arg); case TIOCGPKT: /* Get PT packet mode */ return pty_get_pktmode(tty, (int __user *)arg); case TIOCGPTN: /* Get PT Number */ return put_user(tty->index, (unsigned int __user *)arg); case TIOCSIG: /* Send signal to other side of pty */ return pty_signal(tty, (int) arg); } return -ENOIOCTLCMD; } #ifdef CONFIG_COMPAT static long pty_unix98_compat_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { /* * PTY ioctls don't require any special translation between 32-bit and * 64-bit userspace, they are already compatible. */ return pty_unix98_ioctl(tty, cmd, cmd == TIOCSIG ? arg : (unsigned long)compat_ptr(arg)); } #else #define pty_unix98_compat_ioctl NULL #endif /** * ptm_unix98_lookup - find a pty master * @driver: ptm driver * @file: unused * @idx: tty index * * Look up a pty master device. Called under the tty_mutex for now. * This provides our locking. */ static struct tty_struct *ptm_unix98_lookup(struct tty_driver *driver, struct file *file, int idx) { /* Master must be open via /dev/ptmx */ return ERR_PTR(-EIO); } /** * pts_unix98_lookup - find a pty slave * @driver: pts driver * @file: file pointer to tty * @idx: tty index * * Look up a pty master device. Called under the tty_mutex for now. * This provides our locking for the tty pointer. */ static struct tty_struct *pts_unix98_lookup(struct tty_driver *driver, struct file *file, int idx) { guard(mutex)(&devpts_mutex); /* Master must be open before slave */ return devpts_get_priv(file->f_path.dentry) ? : ERR_PTR(-EIO); } static int pty_unix98_install(struct tty_driver *driver, struct tty_struct *tty) { return pty_common_install(driver, tty, false); } /* this is called once with whichever end is closed last */ static void pty_unix98_remove(struct tty_driver *driver, struct tty_struct *tty) { struct pts_fs_info *fsi; if (tty->driver->subtype == PTY_TYPE_MASTER) fsi = tty->driver_data; else fsi = tty->link->driver_data; if (fsi) { devpts_kill_index(fsi, tty->index); devpts_release(fsi); } } static void pty_show_fdinfo(struct tty_struct *tty, struct seq_file *m) { seq_printf(m, "tty-index:\t%d\n", tty->index); } static const struct tty_operations ptm_unix98_ops = { .lookup = ptm_unix98_lookup, .install = pty_unix98_install, .remove = pty_unix98_remove, .open = pty_open, .close = pty_close, .write = pty_write, .write_room = pty_write_room, .flush_buffer = pty_flush_buffer, .unthrottle = pty_unthrottle, .ioctl = pty_unix98_ioctl, .compat_ioctl = pty_unix98_compat_ioctl, .resize = pty_resize, .cleanup = pty_cleanup, .show_fdinfo = pty_show_fdinfo, }; static const struct tty_operations pty_unix98_ops = { .lookup = pts_unix98_lookup, .install = pty_unix98_install, .remove = pty_unix98_remove, .open = pty_open, .close = pty_close, .write = pty_write, .write_room = pty_write_room, .flush_buffer = pty_flush_buffer, .unthrottle = pty_unthrottle, .set_termios = pty_set_termios, .start = pty_start, .stop = pty_stop, .cleanup = pty_cleanup, }; /** * ptmx_open - open a unix 98 pty master * @inode: inode of device file * @filp: file pointer to tty * * Allocate a unix98 pty master device from the ptmx driver. * * Locking: tty_mutex protects the init_dev work. tty->count should * protect the rest. * allocated_ptys_lock handles the list of free pty numbers */ static int ptmx_open(struct inode *inode, struct file *filp) { struct pts_fs_info *fsi; struct tty_struct *tty; struct dentry *dentry; int retval; int index; nonseekable_open(inode, filp); /* We refuse fsnotify events on ptmx, since it's a shared resource */ file_set_fsnotify_mode(filp, FMODE_NONOTIFY); retval = tty_alloc_file(filp); if (retval) return retval; fsi = devpts_acquire(filp); if (IS_ERR(fsi)) { retval = PTR_ERR(fsi); goto out_free_file; } /* find a device that is not in use. */ scoped_guard(mutex, &devpts_mutex) index = devpts_new_index(fsi); retval = index; if (index < 0) goto out_put_fsi; /* The tty returned here is locked so we can safely drop the mutex */ scoped_guard(mutex, &tty_mutex) tty = tty_init_dev(ptm_driver, index); retval = PTR_ERR(tty); if (IS_ERR(tty)) goto out; /* * From here on out, the tty is "live", and the index and * fsi will be killed/put by the tty_release() */ set_bit(TTY_PTY_LOCK, &tty->flags); /* LOCK THE SLAVE */ tty->driver_data = fsi; tty_add_file(tty, filp); dentry = devpts_pty_new(fsi, index, tty->link); if (IS_ERR(dentry)) { retval = PTR_ERR(dentry); goto err_release; } tty->link->driver_data = dentry; retval = ptm_driver->ops->open(tty, filp); if (retval) goto err_release; tty_debug_hangup(tty, "opening (count=%d)\n", tty->count); tty_unlock(tty); return 0; err_release: tty_unlock(tty); // This will also put-ref the fsi tty_release(inode, filp); return retval; out: devpts_kill_index(fsi, index); out_put_fsi: devpts_release(fsi); out_free_file: tty_free_file(filp); return retval; } static struct file_operations ptmx_fops __ro_after_init; static void __init unix98_pty_init(void) { ptm_driver = tty_alloc_driver(NR_UNIX98_PTY_MAX, TTY_DRIVER_RESET_TERMIOS | TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV | TTY_DRIVER_DEVPTS_MEM | TTY_DRIVER_DYNAMIC_ALLOC); if (IS_ERR(ptm_driver)) panic("Couldn't allocate Unix98 ptm driver"); pts_driver = tty_alloc_driver(NR_UNIX98_PTY_MAX, TTY_DRIVER_RESET_TERMIOS | TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV | TTY_DRIVER_DEVPTS_MEM | TTY_DRIVER_DYNAMIC_ALLOC); if (IS_ERR(pts_driver)) panic("Couldn't allocate Unix98 pts driver"); ptm_driver->driver_name = "pty_master"; ptm_driver->name = "ptm"; ptm_driver->major = UNIX98_PTY_MASTER_MAJOR; ptm_driver->minor_start = 0; ptm_driver->type = TTY_DRIVER_TYPE_PTY; ptm_driver->subtype = PTY_TYPE_MASTER; ptm_driver->init_termios = tty_std_termios; ptm_driver->init_termios.c_iflag = 0; ptm_driver->init_termios.c_oflag = 0; ptm_driver->init_termios.c_cflag = B38400 | CS8 | CREAD; ptm_driver->init_termios.c_lflag = 0; ptm_driver->init_termios.c_ispeed = 38400; ptm_driver->init_termios.c_ospeed = 38400; ptm_driver->other = pts_driver; tty_set_operations(ptm_driver, &ptm_unix98_ops); pts_driver->driver_name = "pty_slave"; pts_driver->name = "pts"; pts_driver->major = UNIX98_PTY_SLAVE_MAJOR; pts_driver->minor_start = 0; pts_driver->type = TTY_DRIVER_TYPE_PTY; pts_driver->subtype = PTY_TYPE_SLAVE; pts_driver->init_termios = tty_std_termios; pts_driver->init_termios.c_cflag = B38400 | CS8 | CREAD; pts_driver->init_termios.c_ispeed = 38400; pts_driver->init_termios.c_ospeed = 38400; pts_driver->other = ptm_driver; tty_set_operations(pts_driver, &pty_unix98_ops); if (tty_register_driver(ptm_driver)) panic("Couldn't register Unix98 ptm driver"); if (tty_register_driver(pts_driver)) panic("Couldn't register Unix98 pts driver"); /* Now create the /dev/ptmx special device */ tty_default_fops(&ptmx_fops); ptmx_fops.open = ptmx_open; cdev_init(&ptmx_cdev, &ptmx_fops); if (cdev_add(&ptmx_cdev, MKDEV(TTYAUX_MAJOR, 2), 1) || register_chrdev_region(MKDEV(TTYAUX_MAJOR, 2), 1, "/dev/ptmx") < 0) panic("Couldn't register /dev/ptmx driver"); device_create(&tty_class, NULL, MKDEV(TTYAUX_MAJOR, 2), NULL, "ptmx"); } #else static inline void unix98_pty_init(void) { } #endif static int __init pty_init(void) { legacy_pty_init(); unix98_pty_init(); return 0; } device_initcall(pty_init);
4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 /* SPDX-License-Identifier: GPL-2.0-only */ /* * VMware VMCI Driver * * Copyright (C) 2012 VMware, Inc. All rights reserved. */ #ifndef _VMCI_QUEUE_PAIR_H_ #define _VMCI_QUEUE_PAIR_H_ #include <linux/vmw_vmci_defs.h> #include <linux/types.h> #include "vmci_context.h" /* Callback needed for correctly waiting on events. */ typedef int (*vmci_event_release_cb) (void *client_data); /* Guest device port I/O. */ struct ppn_set { u64 num_produce_pages; u64 num_consume_pages; u64 *produce_ppns; u64 *consume_ppns; bool initialized; }; /* VMCIqueue_pairAllocInfo */ struct vmci_qp_alloc_info { struct vmci_handle handle; u32 peer; u32 flags; u64 produce_size; u64 consume_size; u64 ppn_va; /* Start VA of queue pair PPNs. */ u64 num_ppns; s32 result; u32 version; }; /* VMCIqueue_pairSetVAInfo */ struct vmci_qp_set_va_info { struct vmci_handle handle; u64 va; /* Start VA of queue pair PPNs. */ u64 num_ppns; u32 version; s32 result; }; /* * For backwards compatibility, here is a version of the * VMCIqueue_pairPageFileInfo before host support end-points was added. * Note that the current version of that structure requires VMX to * pass down the VA of the mapped file. Before host support was added * there was nothing of the sort. So, when the driver sees the ioctl * with a parameter that is the sizeof * VMCIqueue_pairPageFileInfo_NoHostQP then it can infer that the version * of VMX running can't attach to host end points because it doesn't * provide the VA of the mapped files. * * The Linux driver doesn't get an indication of the size of the * structure passed down from user space. So, to fix a long standing * but unfiled bug, the _pad field has been renamed to version. * Existing versions of VMX always initialize the PageFileInfo * structure so that _pad, er, version is set to 0. * * A version value of 1 indicates that the size of the structure has * been increased to include two UVA's: produce_uva and consume_uva. * These UVA's are of the mmap()'d queue contents backing files. * * In addition, if when VMX is sending down the * VMCIqueue_pairPageFileInfo structure it gets an error then it will * try again with the _NoHostQP version of the file to see if an older * VMCI kernel module is running. */ /* VMCIqueue_pairPageFileInfo */ struct vmci_qp_page_file_info { struct vmci_handle handle; u64 produce_page_file; /* User VA. */ u64 consume_page_file; /* User VA. */ u64 produce_page_file_size; /* Size of the file name array. */ u64 consume_page_file_size; /* Size of the file name array. */ s32 result; u32 version; /* Was _pad. */ u64 produce_va; /* User VA of the mapped file. */ u64 consume_va; /* User VA of the mapped file. */ }; /* vmci queuepair detach info */ struct vmci_qp_dtch_info { struct vmci_handle handle; s32 result; u32 _pad; }; /* * struct vmci_qp_page_store describes how the memory of a given queue pair * is backed. When the queue pair is between the host and a guest, the * page store consists of references to the guest pages. On vmkernel, * this is a list of PPNs, and on hosted, it is a user VA where the * queue pair is mapped into the VMX address space. */ struct vmci_qp_page_store { /* Reference to pages backing the queue pair. */ u64 pages; /* Length of pageList/virtual address range (in pages). */ u32 len; }; /* * This data type contains the information about a queue. * There are two queues (hence, queue pairs) per transaction model between a * pair of end points, A & B. One queue is used by end point A to transmit * commands and responses to B. The other queue is used by B to transmit * commands and responses. * * struct vmci_queue_kern_if is a per-OS defined Queue structure. It contains * either a direct pointer to the linear address of the buffer contents or a * pointer to structures which help the OS locate those data pages. See * vmciKernelIf.c for each platform for its definition. */ struct vmci_queue { struct vmci_queue_header *q_header; struct vmci_queue_header *saved_header; struct vmci_queue_kern_if *kernel_if; }; /* * Utility function that checks whether the fields of the page * store contain valid values. * Result: * true if the page store is wellformed. false otherwise. */ static inline bool VMCI_QP_PAGESTORE_IS_WELLFORMED(struct vmci_qp_page_store *page_store) { return page_store->len >= 2; } void vmci_qp_broker_exit(void); int vmci_qp_broker_alloc(struct vmci_handle handle, u32 peer, u32 flags, u32 priv_flags, u64 produce_size, u64 consume_size, struct vmci_qp_page_store *page_store, struct vmci_ctx *context); int vmci_qp_broker_set_page_store(struct vmci_handle handle, u64 produce_uva, u64 consume_uva, struct vmci_ctx *context); int vmci_qp_broker_detach(struct vmci_handle handle, struct vmci_ctx *context); void vmci_qp_guest_endpoints_exit(void); int vmci_qp_alloc(struct vmci_handle *handle, struct vmci_queue **produce_q, u64 produce_size, struct vmci_queue **consume_q, u64 consume_size, u32 peer, u32 flags, u32 priv_flags, bool guest_endpoint, vmci_event_release_cb wakeup_cb, void *client_data); int vmci_qp_broker_map(struct vmci_handle handle, struct vmci_ctx *context, u64 guest_mem); int vmci_qp_broker_unmap(struct vmci_handle handle, struct vmci_ctx *context, u32 gid); #endif /* _VMCI_QUEUE_PAIR_H_ */
6 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PAGE_FRAG_CACHE_H #define _LINUX_PAGE_FRAG_CACHE_H #include <linux/bits.h> #include <linux/log2.h> #include <linux/mm_types_task.h> #include <linux/types.h> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) /* Use a full byte here to enable assembler optimization as the shift * operation is usually expecting a byte. */ #define PAGE_FRAG_CACHE_ORDER_MASK GENMASK(7, 0) #else /* Compiler should be able to figure out we don't read things as any value * ANDed with 0 is 0. */ #define PAGE_FRAG_CACHE_ORDER_MASK 0 #endif #define PAGE_FRAG_CACHE_PFMEMALLOC_BIT (PAGE_FRAG_CACHE_ORDER_MASK + 1) static inline bool encoded_page_decode_pfmemalloc(unsigned long encoded_page) { return !!(encoded_page & PAGE_FRAG_CACHE_PFMEMALLOC_BIT); } static inline void page_frag_cache_init(struct page_frag_cache *nc) { nc->encoded_page = 0; } static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc) { return encoded_page_decode_pfmemalloc(nc->encoded_page); } void page_frag_cache_drain(struct page_frag_cache *nc); void __page_frag_cache_drain(struct page *page, unsigned int count); void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask, unsigned int align_mask); static inline void *page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask, unsigned int align) { WARN_ON_ONCE(!is_power_of_2(align)); return __page_frag_alloc_align(nc, fragsz, gfp_mask, -align); } static inline void *page_frag_alloc(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask) { return __page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u); } void page_frag_free(void *addr); #endif
1296 1296 1298 1298 1094 1131 1245 1089 306 37 6 6 4 3 1249 1251 1248 1087 1 1091 201 1 200 972 1252 144 1 1 1 1 1 396 1 397 398 397 16 5 1 4 5 4 1 3 1 2 3 2 1 139 2 138 139 138 138 137 734 743 136 737 144 136 135 132 7 144 134 134 1 136 1 136 1 135 137 135 1 134 1 133 135 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 // SPDX-License-Identifier: GPL-2.0-or-later /* Filesystem parameter parser. * * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/export.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/slab.h> #include <linux/security.h> #include <linux/namei.h> #include "internal.h" static const struct constant_table bool_names[] = { { "0", false }, { "1", true }, { "false", false }, { "no", false }, { "true", true }, { "yes", true }, { }, }; static const struct constant_table * __lookup_constant(const struct constant_table *tbl, const char *name) { for ( ; tbl->name; tbl++) if (strcmp(name, tbl->name) == 0) return tbl; return NULL; } /** * lookup_constant - Look up a constant by name in an ordered table * @tbl: The table of constants to search. * @name: The name to look up. * @not_found: The value to return if the name is not found. */ int lookup_constant(const struct constant_table *tbl, const char *name, int not_found) { const struct constant_table *p = __lookup_constant(tbl, name); return p ? p->value : not_found; } EXPORT_SYMBOL(lookup_constant); static inline bool is_flag(const struct fs_parameter_spec *p) { return p->type == NULL; } static const struct fs_parameter_spec *fs_lookup_key( const struct fs_parameter_spec *desc, struct fs_parameter *param, bool *negated) { const struct fs_parameter_spec *p, *other = NULL; const char *name = param->key; bool want_flag = param->type == fs_value_is_flag; *negated = false; for (p = desc; p->name; p++) { if (strcmp(p->name, name) != 0) continue; if (likely(is_flag(p) == want_flag)) return p; other = p; } if (want_flag) { if (name[0] == 'n' && name[1] == 'o' && name[2]) { for (p = desc; p->name; p++) { if (strcmp(p->name, name + 2) != 0) continue; if (!(p->flags & fs_param_neg_with_no)) continue; *negated = true; return p; } } } return other; } /* * __fs_parse - Parse a filesystem configuration parameter * @log: The filesystem context to log errors through. * @desc: The parameter description to use. * @param: The parameter. * @result: Where to place the result of the parse * * Parse a filesystem configuration parameter and attempt a conversion for a * simple parameter for which this is requested. If successful, the determined * parameter ID is placed into @result->key, the desired type is indicated in * @result->t and any converted value is placed into an appropriate member of * the union in @result. * * The function returns the parameter number if the parameter was matched, * -ENOPARAM if it wasn't matched and @desc->ignore_unknown indicated that * unknown parameters are okay and -EINVAL if there was a conversion issue or * the parameter wasn't recognised and unknowns aren't okay. */ int __fs_parse(struct p_log *log, const struct fs_parameter_spec *desc, struct fs_parameter *param, struct fs_parse_result *result) { const struct fs_parameter_spec *p; result->uint_64 = 0; p = fs_lookup_key(desc, param, &result->negated); if (!p) return -ENOPARAM; if (p->flags & fs_param_deprecated) warn_plog(log, "Deprecated parameter '%s'", param->key); /* Try to turn the type we were given into the type desired by the * parameter and give an error if we can't. */ if (is_flag(p)) { if (param->type != fs_value_is_flag) return inval_plog(log, "Unexpected value for '%s'", param->key); result->boolean = !result->negated; } else { int ret = p->type(log, p, param, result); if (ret) return ret; } return p->opt; } EXPORT_SYMBOL(__fs_parse); /** * fs_lookup_param - Look up a path referred to by a parameter * @fc: The filesystem context to log errors through. * @param: The parameter. * @want_bdev: T if want a blockdev * @flags: Pathwalk flags passed to filename_lookup() * @_path: The result of the lookup */ int fs_lookup_param(struct fs_context *fc, struct fs_parameter *param, bool want_bdev, unsigned int flags, struct path *_path) { struct filename *f; bool put_f; int ret; switch (param->type) { case fs_value_is_string: f = getname_kernel(param->string); if (IS_ERR(f)) return PTR_ERR(f); param->dirfd = AT_FDCWD; put_f = true; break; case fs_value_is_filename: f = param->name; put_f = false; break; default: return invalf(fc, "%s: not usable as path", param->key); } ret = filename_lookup(param->dirfd, f, flags, _path, NULL); if (ret < 0) { errorf(fc, "%s: Lookup failure for '%s'", param->key, f->name); goto out; } if (want_bdev && !S_ISBLK(d_backing_inode(_path->dentry)->i_mode)) { path_put(_path); _path->dentry = NULL; _path->mnt = NULL; errorf(fc, "%s: Non-blockdev passed as '%s'", param->key, f->name); ret = -ENOTBLK; } out: if (put_f) putname(f); return ret; } EXPORT_SYMBOL(fs_lookup_param); static int fs_param_bad_value(struct p_log *log, struct fs_parameter *param) { return inval_plog(log, "Bad value for '%s'", param->key); } int fs_param_is_bool(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { int b; if (param->type != fs_value_is_string) return fs_param_bad_value(log, param); if (!*param->string && (p->flags & fs_param_can_be_empty)) return 0; b = lookup_constant(bool_names, param->string, -1); if (b == -1) return fs_param_bad_value(log, param); result->boolean = b; return 0; } EXPORT_SYMBOL(fs_param_is_bool); int fs_param_is_u32(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { int base = (unsigned long)p->data; if (param->type != fs_value_is_string) return fs_param_bad_value(log, param); if (!*param->string && (p->flags & fs_param_can_be_empty)) return 0; if (kstrtouint(param->string, base, &result->uint_32) < 0) return fs_param_bad_value(log, param); return 0; } EXPORT_SYMBOL(fs_param_is_u32); int fs_param_is_s32(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { if (param->type != fs_value_is_string) return fs_param_bad_value(log, param); if (!*param->string && (p->flags & fs_param_can_be_empty)) return 0; if (kstrtoint(param->string, 0, &result->int_32) < 0) return fs_param_bad_value(log, param); return 0; } EXPORT_SYMBOL(fs_param_is_s32); int fs_param_is_u64(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { if (param->type != fs_value_is_string) return fs_param_bad_value(log, param); if (!*param->string && (p->flags & fs_param_can_be_empty)) return 0; if (kstrtoull(param->string, 0, &result->uint_64) < 0) return fs_param_bad_value(log, param); return 0; } EXPORT_SYMBOL(fs_param_is_u64); int fs_param_is_enum(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { const struct constant_table *c; if (param->type != fs_value_is_string) return fs_param_bad_value(log, param); if (!*param->string && (p->flags & fs_param_can_be_empty)) return 0; c = __lookup_constant(p->data, param->string); if (!c) return fs_param_bad_value(log, param); result->uint_32 = c->value; return 0; } EXPORT_SYMBOL(fs_param_is_enum); int fs_param_is_string(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { if (param->type != fs_value_is_string || (!*param->string && !(p->flags & fs_param_can_be_empty))) return fs_param_bad_value(log, param); return 0; } EXPORT_SYMBOL(fs_param_is_string); int fs_param_is_fd(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { switch (param->type) { case fs_value_is_string: if ((!*param->string && !(p->flags & fs_param_can_be_empty)) || kstrtouint(param->string, 0, &result->uint_32) < 0) break; if (result->uint_32 <= INT_MAX) return 0; break; case fs_value_is_file: result->uint_32 = param->dirfd; if (result->uint_32 <= INT_MAX) return 0; break; default: break; } return fs_param_bad_value(log, param); } EXPORT_SYMBOL(fs_param_is_fd); int fs_param_is_file_or_string(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { switch (param->type) { case fs_value_is_string: return fs_param_is_string(log, p, param, result); case fs_value_is_file: result->uint_32 = param->dirfd; if (result->uint_32 <= INT_MAX) return 0; break; default: break; } return fs_param_bad_value(log, param); } EXPORT_SYMBOL(fs_param_is_file_or_string); int fs_param_is_uid(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { kuid_t uid; if (fs_param_is_u32(log, p, param, result) != 0) return fs_param_bad_value(log, param); uid = make_kuid(current_user_ns(), result->uint_32); if (!uid_valid(uid)) return inval_plog(log, "Invalid uid '%s'", param->string); result->uid = uid; return 0; } EXPORT_SYMBOL(fs_param_is_uid); int fs_param_is_gid(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { kgid_t gid; if (fs_param_is_u32(log, p, param, result) != 0) return fs_param_bad_value(log, param); gid = make_kgid(current_user_ns(), result->uint_32); if (!gid_valid(gid)) return inval_plog(log, "Invalid gid '%s'", param->string); result->gid = gid; return 0; } EXPORT_SYMBOL(fs_param_is_gid); int fs_param_is_blockdev(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { return 0; } EXPORT_SYMBOL(fs_param_is_blockdev); #ifdef CONFIG_VALIDATE_FS_PARSER /** * fs_validate_description - Validate a parameter specification array * @name: Owner name of the parameter specification array * @desc: The parameter specification array to validate. */ bool fs_validate_description(const char *name, const struct fs_parameter_spec *desc) { const struct fs_parameter_spec *param, *p2; bool good = true; for (param = desc; param->name; param++) { /* Check for duplicate parameter names */ for (p2 = desc; p2 < param; p2++) { if (strcmp(param->name, p2->name) == 0) { if (is_flag(param) != is_flag(p2)) continue; pr_err("VALIDATE %s: PARAM[%s]: Duplicate\n", name, param->name); good = false; } } } return good; } #endif /* CONFIG_VALIDATE_FS_PARSER */
1001 8 11 21 1 177 76 186 30 356 177 2479 17 57 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _IPV6_H #define _IPV6_H #include <uapi/linux/ipv6.h> #include <linux/cache.h> #define ipv6_optlen(p) (((p)->hdrlen+1) << 3) #define ipv6_authlen(p) (((p)->hdrlen+2) << 2) /* * This structure contains configuration options per IPv6 link. */ struct ipv6_devconf { /* RX & TX fastpath fields. */ __cacheline_group_begin(ipv6_devconf_read_txrx); __s32 disable_ipv6; __s32 hop_limit; __s32 mtu6; __s32 forwarding; __s32 force_forwarding; __s32 disable_policy; __s32 proxy_ndp; __cacheline_group_end(ipv6_devconf_read_txrx); __s32 accept_ra; __s32 accept_redirects; __s32 autoconf; __s32 dad_transmits; __s32 rtr_solicits; __s32 rtr_solicit_interval; __s32 rtr_solicit_max_interval; __s32 rtr_solicit_delay; __s32 force_mld_version; __s32 mldv1_unsolicited_report_interval; __s32 mldv2_unsolicited_report_interval; __s32 use_tempaddr; __s32 temp_valid_lft; __s32 temp_prefered_lft; __s32 regen_min_advance; __s32 regen_max_retry; __s32 max_desync_factor; __s32 max_addresses; __s32 accept_ra_defrtr; __u32 ra_defrtr_metric; __s32 accept_ra_min_hop_limit; __s32 accept_ra_min_lft; __s32 accept_ra_pinfo; __s32 ignore_routes_with_linkdown; #ifdef CONFIG_IPV6_ROUTER_PREF __s32 accept_ra_rtr_pref; __s32 rtr_probe_interval; #ifdef CONFIG_IPV6_ROUTE_INFO __s32 accept_ra_rt_info_min_plen; __s32 accept_ra_rt_info_max_plen; #endif #endif __s32 accept_source_route; __s32 accept_ra_from_local; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD __s32 optimistic_dad; __s32 use_optimistic; #endif #ifdef CONFIG_IPV6_MROUTE atomic_t mc_forwarding; #endif __s32 drop_unicast_in_l2_multicast; __s32 accept_dad; __s32 force_tllao; __s32 ndisc_notify; __s32 suppress_frag_ndisc; __s32 accept_ra_mtu; __s32 drop_unsolicited_na; __s32 accept_untracked_na; struct ipv6_stable_secret { bool initialized; struct in6_addr secret; } stable_secret; __s32 use_oif_addrs_only; __s32 keep_addr_on_down; __s32 seg6_enabled; #ifdef CONFIG_IPV6_SEG6_HMAC __s32 seg6_require_hmac; #endif __u32 enhanced_dad; __u32 addr_gen_mode; __s32 ndisc_tclass; __s32 rpl_seg_enabled; __u32 ioam6_id; __u32 ioam6_id_wide; __u8 ioam6_enabled; __u8 ndisc_evict_nocarrier; __u8 ra_honor_pio_life; __u8 ra_honor_pio_pflag; struct ctl_table_header *sysctl_header; }; struct ipv6_params { __s32 disable_ipv6; __s32 autoconf; }; extern struct ipv6_params ipv6_defaults; #include <linux/tcp.h> #include <linux/udp.h> #include <net/inet_sock.h> static inline struct ipv6hdr *ipv6_hdr(const struct sk_buff *skb) { return (struct ipv6hdr *)skb_network_header(skb); } static inline struct ipv6hdr *inner_ipv6_hdr(const struct sk_buff *skb) { return (struct ipv6hdr *)skb_inner_network_header(skb); } static inline struct ipv6hdr *ipipv6_hdr(const struct sk_buff *skb) { return (struct ipv6hdr *)skb_transport_header(skb); } static inline unsigned int ipv6_transport_len(const struct sk_buff *skb) { return ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr) - skb_network_header_len(skb); } static inline unsigned int ipv6_payload_len(const struct sk_buff *skb, const struct ipv6hdr *ip6) { u32 len = ntohs(ip6->payload_len); return (len || !skb_is_gso(skb) || !skb_is_gso_tcp(skb)) ? len : skb->len - skb_network_offset(skb) - sizeof(struct ipv6hdr); } static inline unsigned int skb_ipv6_payload_len(const struct sk_buff *skb) { return ipv6_payload_len(skb, ipv6_hdr(skb)); } #define IPV6_MAXPLEN 65535 static inline void ipv6_set_payload_len(struct ipv6hdr *ip6, unsigned int len) { ip6->payload_len = len <= IPV6_MAXPLEN ? htons(len) : 0; } /* This structure contains results of exthdrs parsing as offsets from skb->nh. */ struct inet6_skb_parm { int iif; __be16 ra; __u16 dst0; __u16 srcrt; __u16 dst1; __u16 lastopt; __u16 nhoff; __u16 flags; #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) __u16 dsthao; #endif __u16 frag_max_size; __u16 srhoff; #define IP6SKB_XFRM_TRANSFORMED 1 #define IP6SKB_FORWARDED 2 #define IP6SKB_REROUTED 4 #define IP6SKB_ROUTERALERT 8 #define IP6SKB_FRAGMENTED 16 #define IP6SKB_HOPBYHOP 32 #define IP6SKB_L3SLAVE 64 #define IP6SKB_JUMBOGRAM 128 #define IP6SKB_SEG6 256 #define IP6SKB_MULTIPATH 1024 #define IP6SKB_MCROUTE 2048 }; #if defined(CONFIG_NET_L3_MASTER_DEV) static inline bool ipv6_l3mdev_skb(__u16 flags) { return flags & IP6SKB_L3SLAVE; } #else static inline bool ipv6_l3mdev_skb(__u16 flags) { return false; } #endif #define IP6CB(skb) ((struct inet6_skb_parm*)((skb)->cb)) #define IP6CBMTU(skb) ((struct ip6_mtuinfo *)((skb)->cb)) static inline int inet6_iif(const struct sk_buff *skb) { bool l3_slave = ipv6_l3mdev_skb(IP6CB(skb)->flags); return l3_slave ? skb->skb_iif : IP6CB(skb)->iif; } static inline bool inet6_is_jumbogram(const struct sk_buff *skb) { return !!(IP6CB(skb)->flags & IP6SKB_JUMBOGRAM); } /* can not be used in TCP layer after tcp_v6_fill_cb */ static inline int inet6_sdif(const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (skb && ipv6_l3mdev_skb(IP6CB(skb)->flags)) return IP6CB(skb)->iif; #endif return 0; } struct tcp6_request_sock { struct tcp_request_sock tcp6rsk_tcp; }; struct ipv6_mc_socklist; struct ipv6_ac_socklist; struct ipv6_fl_socklist; /* struct ipv6_pinfo - ipv6 private area */ struct ipv6_pinfo { /* Used in tx path (inet6_csk_route_socket(), ip6_xmit()) */ struct in6_addr saddr; union { struct in6_addr daddr; struct in6_addr final; }; __be32 flow_label; u32 dst_cookie; struct ipv6_txoptions __rcu *opt; s16 hop_limit; u8 pmtudisc; u8 tclass; #ifdef CONFIG_IPV6_SUBTREES bool saddr_cache; #endif bool daddr_cache; u8 mcast_hops; u32 frag_size; int ucast_oif; int mcast_oif; /* pktoption flags */ union { struct { u16 srcrt:1, osrcrt:1, rxinfo:1, rxoinfo:1, rxhlim:1, rxohlim:1, hopopts:1, ohopopts:1, dstopts:1, odstopts:1, rxflow:1, rxtclass:1, rxpmtu:1, rxorigdstaddr:1, recvfragsize:1; /* 1 bits hole */ } bits; u16 all; } rxopt; /* sockopt flags */ u8 srcprefs; /* 001: prefer temporary address * 010: prefer public address * 100: prefer care-of address */ u8 min_hopcount; __be32 rcv_flowinfo; struct in6_pktinfo sticky_pktinfo; struct sk_buff *pktoptions; struct sk_buff *rxpmtu; struct ipv6_mc_socklist __rcu *ipv6_mc_list; struct ipv6_ac_socklist *ipv6_ac_list; }; /* We currently use available bits from inet_sk(sk)->inet_flags, * this could change in the future. */ #define inet6_test_bit(nr, sk) \ test_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags) #define inet6_set_bit(nr, sk) \ set_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags) #define inet6_clear_bit(nr, sk) \ clear_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags) #define inet6_assign_bit(nr, sk, val) \ assign_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags, val) /* WARNING: don't change the layout of the members in {raw,udp,tcp}6_sock! */ struct raw6_sock { /* inet_sock has to be the first member of raw6_sock */ struct inet_sock inet; __u32 checksum; /* perform checksum */ __u32 offset; /* checksum offset */ struct icmp6_filter filter; __u32 ip6mr_table; struct numa_drop_counters drop_counters; struct ipv6_pinfo inet6; }; struct udp6_sock { struct udp_sock udp; struct ipv6_pinfo inet6; }; struct tcp6_sock { struct tcp_sock tcp; struct ipv6_pinfo inet6; }; extern int inet6_sk_rebuild_header(struct sock *sk); struct tcp6_timewait_sock { struct tcp_timewait_sock tcp6tw_tcp; }; #if IS_ENABLED(CONFIG_IPV6) extern int disable_ipv6_mod; static inline bool ipv6_mod_enabled(void) { return disable_ipv6_mod == 0; } static inline struct ipv6_pinfo *inet6_sk(const struct sock *__sk) { return sk_fullsock(__sk) ? inet_sk(__sk)->pinet6 : NULL; } #define raw6_sk(ptr) container_of_const(ptr, struct raw6_sock, inet.sk) #define ipv6_only_sock(sk) (sk->sk_ipv6only) #define ipv6_sk_rxinfo(sk) ((sk)->sk_family == PF_INET6 && \ inet6_sk(sk)->rxopt.bits.rxinfo) static inline const struct in6_addr *inet6_rcv_saddr(const struct sock *sk) { if (sk->sk_family == AF_INET6) return &sk->sk_v6_rcv_saddr; return NULL; } static inline int inet_v6_ipv6only(const struct sock *sk) { /* ipv6only field is at same position for timewait and other sockets */ return ipv6_only_sock(sk); } #else #define ipv6_only_sock(sk) 0 #define ipv6_sk_rxinfo(sk) 0 static inline bool ipv6_mod_enabled(void) { return false; } static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk) { return NULL; } static inline struct raw6_sock *raw6_sk(const struct sock *sk) { return NULL; } #define inet6_rcv_saddr(__sk) NULL #define inet_v6_ipv6only(__sk) 0 #endif /* IS_ENABLED(CONFIG_IPV6) */ #endif /* _IPV6_H */
92 68 68 91 3 3 4 34 5 35 13 20 16 2 35 35 35 10 22 22 16 250 4 250 19 7 4 4 3 1 116 250 123 124 250 246 250 4 249 3 247 123 11 4 124 121 9 9 9 9 3 1 3 1 3 1 1 225 226 225 225 139 11 4 24 4 36 6 4 1 2 2 2 1 2 1 1 1 1 2 1 1 2 1 1 1 1 2 1 1 9 76 27 56 2 76 1 1 1 1 1 1 1 1 42 42 42 3 42 40 40 40 40 38 2 40 59 57 2 57 57 57 57 57 16 2 8 36 57 14 3 7 8 6 5 14 12 5 26 57 57 57 3 3 100 100 100 84 84 100 100 55 55 7 2 2 21 6 21 21 14 14 2 1 13 2 2 2 2 2 2 2 2 2 8 5 1 3 3 1 1 2 6 3 8 4 1 4 21 95 15 53 7 7 45 1 6 96 9 9 3 2 12 12 8 12 1 1 1 1 1 1 3 3 2 2 2 2 2 1 1 1 1 3 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 1 1 2 2 2 2 1 1 2 2 1 1 1 1 1 1 1 2 1 2 1 1 1 1 1 1 33 1 34 33 33 1 22 22 22 1 1 1 1 1 1 1 1 1 1 2 2 1 1 6 6 5 1 1 1 1 9 1 1 1 1 1 1 1 1 1 1 1 1 21 21 21 16 1 22 22 17 17 136 139 139 11 6 6 6 5 1 4 13 7 8 1 1 12 8 1 7 13 106 40 1 50 60 60 5 5 5 4 4 4 4 2 2 2 21 21 21 21 33 33 1 1 4 2 4 1 4 7 19 19 19 19 1 1 5 4 1 1 99 106 5 5 4 1 5 1 4 3 4 4 4 3 1 1 226 226 139 136 226 139 9 6 6 1 2 1 9 4 1 20 11 38 139 138 8 8 3 5 136 42 42 39 5 5 128 136 113 139 80 84 72 5 3 3 2 6 4 2 4 2 4 1 3 2 1 1 9 9 84 138 139 3 1 1 139 136 11 139 88 106 56 54 1 54 54 54 54 3 106 59 59 58 3 58 53 106 105 105 204 105 6 1 1 8 8 1 7 1 226 134 1 1 134 133 59 59 58 59 59 59 59 8 7 7 7 58 58 57 56 34 57 57 7 1 130 47 42 127 6 127 116 51 29 1 104 115 116 6 116 110 10 3 3 99 4 4 98 96 90 86 17 90 11 90 6 4 2 6 6 6 2 5 85 85 61 21 22 72 88 134 10 5 2 3 4 2 2 1 1 1 3 1 1 89 89 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 // SPDX-License-Identifier: GPL-2.0-only /****************************************************************************** * emulate.c * * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. * * Copyright (c) 2005 Keir Fraser * * Linux coding style, mod r/m decoder, segment base fixes, real-mode * privileged instructions: * * Copyright (C) 2006 Qumranet * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Avi Kivity <avi@qumranet.com> * Yaniv Kamay <yaniv@qumranet.com> * * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include "kvm_cache_regs.h" #include "kvm_emulate.h" #include <linux/stringify.h> #include <asm/debugreg.h> #include <asm/nospec-branch.h> #include <asm/ibt.h> #include <asm/text-patching.h> #include "x86.h" #include "tss.h" #include "mmu.h" #include "pmu.h" /* * Operand types */ #define OpNone 0ull #define OpImplicit 1ull /* No generic decode */ #define OpReg 2ull /* Register */ #define OpMem 3ull /* Memory */ #define OpAcc 4ull /* Accumulator: AL/AX/EAX/RAX */ #define OpDI 5ull /* ES:DI/EDI/RDI */ #define OpMem64 6ull /* Memory, 64-bit */ #define OpImmUByte 7ull /* Zero-extended 8-bit immediate */ #define OpDX 8ull /* DX register */ #define OpCL 9ull /* CL register (for shifts) */ #define OpImmByte 10ull /* 8-bit sign extended immediate */ #define OpOne 11ull /* Implied 1 */ #define OpImm 12ull /* Sign extended up to 32-bit immediate */ #define OpMem16 13ull /* Memory operand (16-bit). */ #define OpMem32 14ull /* Memory operand (32-bit). */ #define OpImmU 15ull /* Immediate operand, zero extended */ #define OpSI 16ull /* SI/ESI/RSI */ #define OpImmFAddr 17ull /* Immediate far address */ #define OpMemFAddr 18ull /* Far address in memory */ #define OpImmU16 19ull /* Immediate operand, 16 bits, zero extended */ #define OpES 20ull /* ES */ #define OpCS 21ull /* CS */ #define OpSS 22ull /* SS */ #define OpDS 23ull /* DS */ #define OpFS 24ull /* FS */ #define OpGS 25ull /* GS */ #define OpMem8 26ull /* 8-bit zero extended memory operand */ #define OpImm64 27ull /* Sign extended 16/32/64-bit immediate */ #define OpXLat 28ull /* memory at BX/EBX/RBX + zero-extended AL */ #define OpAccLo 29ull /* Low part of extended acc (AX/AX/EAX/RAX) */ #define OpAccHi 30ull /* High part of extended acc (-/DX/EDX/RDX) */ #define OpBits 5 /* Width of operand field */ #define OpMask ((1ull << OpBits) - 1) /* * Opcode effective-address decode tables. * Note that we only emulate instructions that have at least one memory * operand (excluding implicit stack references). We assume that stack * references and instruction fetches will never occur in special memory * areas that require emulation. So, for example, 'mov <imm>,<reg>' need * not be handled. */ /* Operand sizes: 8-bit operands or specified/overridden size. */ #define ByteOp (1<<0) /* 8-bit operands. */ #define DstShift 1 /* Destination operand type at bits 1-5 */ #define ImplicitOps (OpImplicit << DstShift) #define DstReg (OpReg << DstShift) #define DstMem (OpMem << DstShift) #define DstAcc (OpAcc << DstShift) #define DstDI (OpDI << DstShift) #define DstMem64 (OpMem64 << DstShift) #define DstMem16 (OpMem16 << DstShift) #define DstImmUByte (OpImmUByte << DstShift) #define DstDX (OpDX << DstShift) #define DstAccLo (OpAccLo << DstShift) #define DstMask (OpMask << DstShift) #define SrcShift 6 /* Source operand type at bits 6-10 */ #define SrcNone (OpNone << SrcShift) #define SrcReg (OpReg << SrcShift) #define SrcMem (OpMem << SrcShift) #define SrcMem16 (OpMem16 << SrcShift) #define SrcMem32 (OpMem32 << SrcShift) #define SrcImm (OpImm << SrcShift) #define SrcImmByte (OpImmByte << SrcShift) #define SrcOne (OpOne << SrcShift) #define SrcImmUByte (OpImmUByte << SrcShift) #define SrcImmU (OpImmU << SrcShift) #define SrcSI (OpSI << SrcShift) #define SrcXLat (OpXLat << SrcShift) #define SrcImmFAddr (OpImmFAddr << SrcShift) #define SrcMemFAddr (OpMemFAddr << SrcShift) #define SrcAcc (OpAcc << SrcShift) #define SrcImmU16 (OpImmU16 << SrcShift) #define SrcImm64 (OpImm64 << SrcShift) #define SrcDX (OpDX << SrcShift) #define SrcMem8 (OpMem8 << SrcShift) #define SrcAccHi (OpAccHi << SrcShift) #define SrcMask (OpMask << SrcShift) #define BitOp (1<<11) #define MemAbs (1<<12) /* Memory operand is absolute displacement */ #define String (1<<13) /* String instruction (rep capable) */ #define Stack (1<<14) /* Stack instruction (push/pop) */ #define GroupMask (7<<15) /* Group mechanisms, at bits 15-17 */ #define Group (1<<15) /* Bits 3:5 of modrm byte extend opcode */ #define GroupDual (2<<15) /* Alternate decoding of mod == 3 */ #define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */ #define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */ #define Escape (5<<15) /* Escape to coprocessor instruction */ #define InstrDual (6<<15) /* Alternate instruction decoding of mod == 3 */ #define ModeDual (7<<15) /* Different instruction for 32/64 bit */ #define Sse (1<<18) /* SSE Vector instruction */ #define ModRM (1<<19) /* Generic ModRM decode. */ #define Mov (1<<20) /* Destination is only written; never read. */ #define Prot (1<<21) /* instruction generates #UD if not in prot-mode */ #define EmulateOnUD (1<<22) /* Emulate if unsupported by the host */ #define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */ #define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */ #define Undefined (1<<25) /* No Such Instruction */ #define Lock (1<<26) /* lock prefix is allowed for the instruction */ #define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ #define No64 (1<<28) /* Instruction generates #UD in 64-bit mode */ #define PageTable (1 << 29) /* instruction used to write page table */ #define NotImpl (1 << 30) /* instruction is not implemented */ #define Avx ((u64)1 << 31) /* Instruction uses VEX prefix */ #define Src2Shift (32) /* Source 2 operand type at bits 32-36 */ #define Src2None (OpNone << Src2Shift) #define Src2Mem (OpMem << Src2Shift) #define Src2CL (OpCL << Src2Shift) #define Src2ImmByte (OpImmByte << Src2Shift) #define Src2One (OpOne << Src2Shift) #define Src2Imm (OpImm << Src2Shift) #define Src2ES (OpES << Src2Shift) #define Src2CS (OpCS << Src2Shift) #define Src2SS (OpSS << Src2Shift) #define Src2DS (OpDS << Src2Shift) #define Src2FS (OpFS << Src2Shift) #define Src2GS (OpGS << Src2Shift) #define Src2Mask (OpMask << Src2Shift) /* free: 37-39 */ #define Mmx ((u64)1 << 40) /* MMX Vector instruction */ #define AlignMask ((u64)3 << 41) /* Memory alignment requirement at bits 41-42 */ #define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */ #define Unaligned ((u64)2 << 41) /* Explicitly unaligned (e.g. MOVDQU) */ #define Aligned16 ((u64)3 << 41) /* Aligned to 16 byte boundary (e.g. FXSAVE) */ /* free: 43-44 */ #define NoWrite ((u64)1 << 45) /* No writeback */ #define SrcWrite ((u64)1 << 46) /* Write back src operand */ #define NoMod ((u64)1 << 47) /* Mod field is ignored */ #define Intercept ((u64)1 << 48) /* Has valid intercept field */ #define CheckPerm ((u64)1 << 49) /* Has valid check_perm field */ #define PrivUD ((u64)1 << 51) /* #UD instead of #GP on CPL > 0 */ #define NearBranch ((u64)1 << 52) /* Near branches */ #define No16 ((u64)1 << 53) /* No 16 bit operand */ #define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */ #define TwoMemOp ((u64)1 << 55) /* Instruction has two memory operand */ #define IsBranch ((u64)1 << 56) /* Instruction is considered a branch. */ #define ShadowStack ((u64)1 << 57) /* Instruction affects Shadow Stacks. */ #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) #define X2(x...) x, x #define X3(x...) X2(x), x #define X4(x...) X2(x), X2(x) #define X5(x...) X4(x), x #define X6(x...) X4(x), X2(x) #define X7(x...) X4(x), X3(x) #define X8(x...) X4(x), X4(x) #define X16(x...) X8(x), X8(x) struct opcode { u64 flags; u8 intercept; u8 pad[7]; union { int (*execute)(struct x86_emulate_ctxt *ctxt); const struct opcode *group; const struct group_dual *gdual; const struct gprefix *gprefix; const struct escape *esc; const struct instr_dual *idual; const struct mode_dual *mdual; } u; int (*check_perm)(struct x86_emulate_ctxt *ctxt); }; struct group_dual { struct opcode mod012[8]; struct opcode mod3[8]; }; struct gprefix { struct opcode pfx_no; struct opcode pfx_66; struct opcode pfx_f2; struct opcode pfx_f3; }; struct escape { struct opcode op[8]; struct opcode high[64]; }; struct instr_dual { struct opcode mod012; struct opcode mod3; }; struct mode_dual { struct opcode mode32; struct opcode mode64; }; #define EFLG_RESERVED_ZEROS_MASK 0xffc0802a enum x86_transfer_type { X86_TRANSFER_NONE, X86_TRANSFER_CALL_JMP, X86_TRANSFER_RET, X86_TRANSFER_TASK_SWITCH, }; enum rex_bits { REX_B = 1, REX_X = 2, REX_R = 4, REX_W = 8, }; static void writeback_registers(struct x86_emulate_ctxt *ctxt) { unsigned long dirty = ctxt->regs_dirty; unsigned reg; for_each_set_bit(reg, &dirty, NR_EMULATOR_GPRS) ctxt->ops->write_gpr(ctxt, reg, ctxt->_regs[reg]); } static void invalidate_registers(struct x86_emulate_ctxt *ctxt) { ctxt->regs_dirty = 0; ctxt->regs_valid = 0; } /* * These EFLAGS bits are restored from saved value during emulation, and * any changes are written back to the saved value after emulation. */ #define EFLAGS_MASK (X86_EFLAGS_OF|X86_EFLAGS_SF|X86_EFLAGS_ZF|X86_EFLAGS_AF|\ X86_EFLAGS_PF|X86_EFLAGS_CF) #ifdef CONFIG_X86_64 #define ON64(x...) x #else #define ON64(x...) #endif #define EM_ASM_START(op) \ static int em_##op(struct x86_emulate_ctxt *ctxt) \ { \ unsigned long flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; \ int bytes = 1, ok = 1; \ if (!(ctxt->d & ByteOp)) \ bytes = ctxt->dst.bytes; \ switch (bytes) { #define __EM_ASM(str) \ asm("push %[flags]; popf \n\t" \ "10: " str \ "pushf; pop %[flags] \n\t" \ "11: \n\t" \ : "+a" (ctxt->dst.val), \ "+d" (ctxt->src.val), \ [flags] "+D" (flags), \ "+S" (ok) \ : "c" (ctxt->src2.val)) #define __EM_ASM_1(op, dst) \ __EM_ASM(#op " %%" #dst " \n\t") #define __EM_ASM_1_EX(op, dst) \ __EM_ASM(#op " %%" #dst " \n\t" \ _ASM_EXTABLE_TYPE_REG(10b, 11f, EX_TYPE_ZERO_REG, %%esi)) #define __EM_ASM_2(op, dst, src) \ __EM_ASM(#op " %%" #src ", %%" #dst " \n\t") #define __EM_ASM_3(op, dst, src, src2) \ __EM_ASM(#op " %%" #src2 ", %%" #src ", %%" #dst " \n\t") #define EM_ASM_END \ } \ ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK); \ return !ok ? emulate_de(ctxt) : X86EMUL_CONTINUE; \ } /* 1-operand, using "a" (dst) */ #define EM_ASM_1(op) \ EM_ASM_START(op) \ case 1: __EM_ASM_1(op##b, al); break; \ case 2: __EM_ASM_1(op##w, ax); break; \ case 4: __EM_ASM_1(op##l, eax); break; \ ON64(case 8: __EM_ASM_1(op##q, rax); break;) \ EM_ASM_END /* 1-operand, using "c" (src2) */ #define EM_ASM_1SRC2(op, name) \ EM_ASM_START(name) \ case 1: __EM_ASM_1(op##b, cl); break; \ case 2: __EM_ASM_1(op##w, cx); break; \ case 4: __EM_ASM_1(op##l, ecx); break; \ ON64(case 8: __EM_ASM_1(op##q, rcx); break;) \ EM_ASM_END /* 1-operand, using "c" (src2) with exception */ #define EM_ASM_1SRC2EX(op, name) \ EM_ASM_START(name) \ case 1: __EM_ASM_1_EX(op##b, cl); break; \ case 2: __EM_ASM_1_EX(op##w, cx); break; \ case 4: __EM_ASM_1_EX(op##l, ecx); break; \ ON64(case 8: __EM_ASM_1_EX(op##q, rcx); break;) \ EM_ASM_END /* 2-operand, using "a" (dst), "d" (src) */ #define EM_ASM_2(op) \ EM_ASM_START(op) \ case 1: __EM_ASM_2(op##b, al, dl); break; \ case 2: __EM_ASM_2(op##w, ax, dx); break; \ case 4: __EM_ASM_2(op##l, eax, edx); break; \ ON64(case 8: __EM_ASM_2(op##q, rax, rdx); break;) \ EM_ASM_END /* 2-operand, reversed */ #define EM_ASM_2R(op, name) \ EM_ASM_START(name) \ case 1: __EM_ASM_2(op##b, dl, al); break; \ case 2: __EM_ASM_2(op##w, dx, ax); break; \ case 4: __EM_ASM_2(op##l, edx, eax); break; \ ON64(case 8: __EM_ASM_2(op##q, rdx, rax); break;) \ EM_ASM_END /* 2-operand, word only (no byte op) */ #define EM_ASM_2W(op) \ EM_ASM_START(op) \ case 1: break; \ case 2: __EM_ASM_2(op##w, ax, dx); break; \ case 4: __EM_ASM_2(op##l, eax, edx); break; \ ON64(case 8: __EM_ASM_2(op##q, rax, rdx); break;) \ EM_ASM_END /* 2-operand, using "a" (dst) and CL (src2) */ #define EM_ASM_2CL(op) \ EM_ASM_START(op) \ case 1: __EM_ASM_2(op##b, al, cl); break; \ case 2: __EM_ASM_2(op##w, ax, cl); break; \ case 4: __EM_ASM_2(op##l, eax, cl); break; \ ON64(case 8: __EM_ASM_2(op##q, rax, cl); break;) \ EM_ASM_END /* 3-operand, using "a" (dst), "d" (src) and CL (src2) */ #define EM_ASM_3WCL(op) \ EM_ASM_START(op) \ case 1: break; \ case 2: __EM_ASM_3(op##w, ax, dx, cl); break; \ case 4: __EM_ASM_3(op##l, eax, edx, cl); break; \ ON64(case 8: __EM_ASM_3(op##q, rax, rdx, cl); break;) \ EM_ASM_END static int em_salc(struct x86_emulate_ctxt *ctxt) { /* * Set AL 0xFF if CF is set, or 0x00 when clear. */ ctxt->dst.val = 0xFF * !!(ctxt->eflags & X86_EFLAGS_CF); return X86EMUL_CONTINUE; } /* * XXX: inoutclob user must know where the argument is being expanded. * Using asm goto would allow us to remove _fault. */ #define asm_safe(insn, inoutclob...) \ ({ \ int _fault = 0; \ \ asm volatile("1:" insn "\n" \ "2:\n" \ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_ONE_REG, %[_fault]) \ : [_fault] "+r"(_fault) inoutclob ); \ \ _fault ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; \ }) static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, enum x86_intercept intercept, enum x86_intercept_stage stage) { struct x86_instruction_info info = { .intercept = intercept, .rep_prefix = ctxt->rep_prefix, .modrm_mod = ctxt->modrm_mod, .modrm_reg = ctxt->modrm_reg, .modrm_rm = ctxt->modrm_rm, .src_val = ctxt->src.val64, .dst_val = ctxt->dst.val64, .src_bytes = ctxt->src.bytes, .dst_bytes = ctxt->dst.bytes, .src_type = ctxt->src.type, .dst_type = ctxt->dst.type, .ad_bytes = ctxt->ad_bytes, .rip = ctxt->eip, .next_rip = ctxt->_eip, }; return ctxt->ops->intercept(ctxt, &info, stage); } static void assign_masked(ulong *dest, ulong src, ulong mask) { *dest = (*dest & ~mask) | (src & mask); } static void assign_register(unsigned long *reg, u64 val, int bytes) { /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ switch (bytes) { case 1: *(u8 *)reg = (u8)val; break; case 2: *(u16 *)reg = (u16)val; break; case 4: *reg = (u32)val; break; /* 64b: zero-extend */ case 8: *reg = val; break; } } static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt) { return (1UL << (ctxt->ad_bytes << 3)) - 1; } static ulong stack_mask(struct x86_emulate_ctxt *ctxt) { u16 sel; struct desc_struct ss; if (ctxt->mode == X86EMUL_MODE_PROT64) return ~0UL; ctxt->ops->get_segment(ctxt, &sel, &ss, NULL, VCPU_SREG_SS); return ~0U >> ((ss.d ^ 1) * 16); /* d=0: 0xffff; d=1: 0xffffffff */ } static int stack_size(struct x86_emulate_ctxt *ctxt) { return (__fls(stack_mask(ctxt)) + 1) >> 3; } /* Access/update address held in a register, based on addressing mode. */ static inline unsigned long address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg) { if (ctxt->ad_bytes == sizeof(unsigned long)) return reg; else return reg & ad_mask(ctxt); } static inline unsigned long register_address(struct x86_emulate_ctxt *ctxt, int reg) { return address_mask(ctxt, reg_read(ctxt, reg)); } static void masked_increment(ulong *reg, ulong mask, int inc) { assign_masked(reg, *reg + inc, mask); } static inline void register_address_increment(struct x86_emulate_ctxt *ctxt, int reg, int inc) { ulong *preg = reg_rmw(ctxt, reg); assign_register(preg, *preg + inc, ctxt->ad_bytes); } static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc) { masked_increment(reg_rmw(ctxt, VCPU_REGS_RSP), stack_mask(ctxt), inc); } static u32 desc_limit_scaled(struct desc_struct *desc) { u32 limit = get_desc_limit(desc); return desc->g ? (limit << 12) | 0xfff : limit; } static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) { if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) return 0; return ctxt->ops->get_cached_segment_base(ctxt, seg); } static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, u32 error, bool valid) { if (KVM_EMULATOR_BUG_ON(vec > 0x1f, ctxt)) return X86EMUL_UNHANDLEABLE; ctxt->exception.vector = vec; ctxt->exception.error_code = error; ctxt->exception.error_code_valid = valid; return X86EMUL_PROPAGATE_FAULT; } static int emulate_db(struct x86_emulate_ctxt *ctxt) { return emulate_exception(ctxt, DB_VECTOR, 0, false); } static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err) { return emulate_exception(ctxt, GP_VECTOR, err, true); } static int emulate_ss(struct x86_emulate_ctxt *ctxt, int err) { return emulate_exception(ctxt, SS_VECTOR, err, true); } static int emulate_ud(struct x86_emulate_ctxt *ctxt) { return emulate_exception(ctxt, UD_VECTOR, 0, false); } static int emulate_ts(struct x86_emulate_ctxt *ctxt, int err) { return emulate_exception(ctxt, TS_VECTOR, err, true); } static int emulate_de(struct x86_emulate_ctxt *ctxt) { return emulate_exception(ctxt, DE_VECTOR, 0, false); } static int emulate_nm(struct x86_emulate_ctxt *ctxt) { return emulate_exception(ctxt, NM_VECTOR, 0, false); } static u16 get_segment_selector(struct x86_emulate_ctxt *ctxt, unsigned seg) { u16 selector; struct desc_struct desc; ctxt->ops->get_segment(ctxt, &selector, &desc, NULL, seg); return selector; } static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector, unsigned seg) { u16 dummy; u32 base3; struct desc_struct desc; ctxt->ops->get_segment(ctxt, &dummy, &desc, &base3, seg); ctxt->ops->set_segment(ctxt, selector, &desc, base3, seg); } static inline u8 ctxt_virt_addr_bits(struct x86_emulate_ctxt *ctxt) { return (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_LA57) ? 57 : 48; } static inline bool emul_is_noncanonical_address(u64 la, struct x86_emulate_ctxt *ctxt, unsigned int flags) { return !ctxt->ops->is_canonical_addr(ctxt, la, flags); } /* * x86 defines three classes of vector instructions: explicitly * aligned, explicitly unaligned, and the rest, which change behaviour * depending on whether they're AVX encoded or not. * * Also included is CMPXCHG16B which is not a vector instruction, yet it is * subject to the same check. FXSAVE and FXRSTOR are checked here too as their * 512 bytes of data must be aligned to a 16 byte boundary. */ static unsigned insn_alignment(struct x86_emulate_ctxt *ctxt, unsigned size) { u64 alignment = ctxt->d & AlignMask; if (likely(size < 16)) return 1; switch (alignment) { case Unaligned: return 1; case Aligned16: return 16; case Aligned: default: return size; } } static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, struct segmented_address addr, unsigned *max_size, unsigned size, enum x86emul_mode mode, ulong *linear, unsigned int flags) { struct desc_struct desc; bool usable; ulong la; u32 lim; u16 sel; u8 va_bits; la = seg_base(ctxt, addr.seg) + addr.ea; *max_size = 0; switch (mode) { case X86EMUL_MODE_PROT64: *linear = la = ctxt->ops->get_untagged_addr(ctxt, la, flags); va_bits = ctxt_virt_addr_bits(ctxt); if (!__is_canonical_address(la, va_bits)) goto bad; *max_size = min_t(u64, ~0u, (1ull << va_bits) - la); if (size > *max_size) goto bad; break; default: *linear = la = (u32)la; usable = ctxt->ops->get_segment(ctxt, &sel, &desc, NULL, addr.seg); if (!usable) goto bad; /* code segment in protected mode or read-only data segment */ if ((((ctxt->mode != X86EMUL_MODE_REAL) && (desc.type & 8)) || !(desc.type & 2)) && (flags & X86EMUL_F_WRITE)) goto bad; /* unreadable code segment */ if (!(flags & X86EMUL_F_FETCH) && (desc.type & 8) && !(desc.type & 2)) goto bad; lim = desc_limit_scaled(&desc); if (!(desc.type & 8) && (desc.type & 4)) { /* expand-down segment */ if (addr.ea <= lim) goto bad; lim = desc.d ? 0xffffffff : 0xffff; } if (addr.ea > lim) goto bad; if (lim == 0xffffffff) *max_size = ~0u; else { *max_size = (u64)lim + 1 - addr.ea; if (size > *max_size) goto bad; } break; } if (la & (insn_alignment(ctxt, size) - 1)) return emulate_gp(ctxt, 0); return X86EMUL_CONTINUE; bad: if (addr.seg == VCPU_SREG_SS) return emulate_ss(ctxt, 0); else return emulate_gp(ctxt, 0); } static int linearize(struct x86_emulate_ctxt *ctxt, struct segmented_address addr, unsigned size, bool write, ulong *linear) { unsigned max_size; return __linearize(ctxt, addr, &max_size, size, ctxt->mode, linear, write ? X86EMUL_F_WRITE : 0); } static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst) { ulong linear; int rc; unsigned max_size; struct segmented_address addr = { .seg = VCPU_SREG_CS, .ea = dst }; if (ctxt->op_bytes != sizeof(unsigned long)) addr.ea = dst & ((1UL << (ctxt->op_bytes << 3)) - 1); rc = __linearize(ctxt, addr, &max_size, 1, ctxt->mode, &linear, X86EMUL_F_FETCH); if (rc == X86EMUL_CONTINUE) ctxt->_eip = addr.ea; return rc; } static inline int emulator_recalc_and_set_mode(struct x86_emulate_ctxt *ctxt) { u64 efer; struct desc_struct cs; u16 selector; u32 base3; ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); if (!(ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PE)) { /* Real mode. cpu must not have long mode active */ if (efer & EFER_LMA) return X86EMUL_UNHANDLEABLE; ctxt->mode = X86EMUL_MODE_REAL; return X86EMUL_CONTINUE; } if (ctxt->eflags & X86_EFLAGS_VM) { /* Protected/VM86 mode. cpu must not have long mode active */ if (efer & EFER_LMA) return X86EMUL_UNHANDLEABLE; ctxt->mode = X86EMUL_MODE_VM86; return X86EMUL_CONTINUE; } if (!ctxt->ops->get_segment(ctxt, &selector, &cs, &base3, VCPU_SREG_CS)) return X86EMUL_UNHANDLEABLE; if (efer & EFER_LMA) { if (cs.l) { /* Proper long mode */ ctxt->mode = X86EMUL_MODE_PROT64; } else if (cs.d) { /* 32 bit compatibility mode*/ ctxt->mode = X86EMUL_MODE_PROT32; } else { ctxt->mode = X86EMUL_MODE_PROT16; } } else { /* Legacy 32 bit / 16 bit mode */ ctxt->mode = cs.d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; } return X86EMUL_CONTINUE; } static inline int assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst) { return assign_eip(ctxt, dst); } static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst) { int rc = emulator_recalc_and_set_mode(ctxt); if (rc != X86EMUL_CONTINUE) return rc; return assign_eip(ctxt, dst); } static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) { return assign_eip_near(ctxt, ctxt->_eip + rel); } static int linear_read_system(struct x86_emulate_ctxt *ctxt, ulong linear, void *data, unsigned size) { return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception, true); } static int linear_write_system(struct x86_emulate_ctxt *ctxt, ulong linear, void *data, unsigned int size) { return ctxt->ops->write_std(ctxt, linear, data, size, &ctxt->exception, true); } static int segmented_read_std(struct x86_emulate_ctxt *ctxt, struct segmented_address addr, void *data, unsigned size) { int rc; ulong linear; rc = linearize(ctxt, addr, size, false, &linear); if (rc != X86EMUL_CONTINUE) return rc; return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception, false); } static int segmented_write_std(struct x86_emulate_ctxt *ctxt, struct segmented_address addr, void *data, unsigned int size) { int rc; ulong linear; rc = linearize(ctxt, addr, size, true, &linear); if (rc != X86EMUL_CONTINUE) return rc; return ctxt->ops->write_std(ctxt, linear, data, size, &ctxt->exception, false); } /* * Prefetch the remaining bytes of the instruction without crossing page * boundary if they are not in fetch_cache yet. */ static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size) { int rc; unsigned size, max_size; unsigned long linear; int cur_size = ctxt->fetch.end - ctxt->fetch.data; struct segmented_address addr = { .seg = VCPU_SREG_CS, .ea = ctxt->eip + cur_size }; /* * We do not know exactly how many bytes will be needed, and * __linearize is expensive, so fetch as much as possible. We * just have to avoid going beyond the 15 byte limit, the end * of the segment, or the end of the page. * * __linearize is called with size 0 so that it does not do any * boundary check itself. Instead, we use max_size to check * against op_size. */ rc = __linearize(ctxt, addr, &max_size, 0, ctxt->mode, &linear, X86EMUL_F_FETCH); if (unlikely(rc != X86EMUL_CONTINUE)) return rc; size = min_t(unsigned, 15UL ^ cur_size, max_size); size = min_t(unsigned, size, PAGE_SIZE - offset_in_page(linear)); /* * One instruction can only straddle two pages, * and one has been loaded at the beginning of * x86_decode_insn. So, if not enough bytes * still, we must have hit the 15-byte boundary. */ if (unlikely(size < op_size)) return emulate_gp(ctxt, 0); rc = ctxt->ops->fetch(ctxt, linear, ctxt->fetch.end, size, &ctxt->exception); if (unlikely(rc != X86EMUL_CONTINUE)) return rc; ctxt->fetch.end += size; return X86EMUL_CONTINUE; } static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, unsigned size) { unsigned done_size = ctxt->fetch.end - ctxt->fetch.ptr; if (unlikely(done_size < size)) return __do_insn_fetch_bytes(ctxt, size - done_size); else return X86EMUL_CONTINUE; } /* Fetch next part of the instruction being emulated. */ #define insn_fetch(_type, _ctxt) \ ({ _type _x; \ \ rc = do_insn_fetch_bytes(_ctxt, sizeof(_type)); \ if (rc != X86EMUL_CONTINUE) \ goto done; \ ctxt->_eip += sizeof(_type); \ memcpy(&_x, ctxt->fetch.ptr, sizeof(_type)); \ ctxt->fetch.ptr += sizeof(_type); \ _x; \ }) #define insn_fetch_arr(_arr, _size, _ctxt) \ ({ \ rc = do_insn_fetch_bytes(_ctxt, _size); \ if (rc != X86EMUL_CONTINUE) \ goto done; \ ctxt->_eip += (_size); \ memcpy(_arr, ctxt->fetch.ptr, _size); \ ctxt->fetch.ptr += (_size); \ }) /* * Given the 'reg' portion of a ModRM byte, and a register block, return a * pointer into the block that addresses the relevant register. * @highbyte_regs specifies whether to decode AH,CH,DH,BH. */ static void *decode_register(struct x86_emulate_ctxt *ctxt, u8 modrm_reg, int byteop) { void *p; int highbyte_regs = (ctxt->rex_prefix == REX_NONE) && byteop; if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) p = (unsigned char *)reg_rmw(ctxt, modrm_reg & 3) + 1; else p = reg_rmw(ctxt, modrm_reg); return p; } static int read_descriptor(struct x86_emulate_ctxt *ctxt, struct segmented_address addr, u16 *size, unsigned long *address, int op_bytes) { int rc; if (op_bytes == 2) op_bytes = 3; *address = 0; rc = segmented_read_std(ctxt, addr, size, 2); if (rc != X86EMUL_CONTINUE) return rc; addr.ea += 2; rc = segmented_read_std(ctxt, addr, address, op_bytes); return rc; } EM_ASM_2(add); EM_ASM_2(or); EM_ASM_2(adc); EM_ASM_2(sbb); EM_ASM_2(and); EM_ASM_2(sub); EM_ASM_2(xor); EM_ASM_2(cmp); EM_ASM_2(test); EM_ASM_2(xadd); EM_ASM_1SRC2(mul, mul_ex); EM_ASM_1SRC2(imul, imul_ex); EM_ASM_1SRC2EX(div, div_ex); EM_ASM_1SRC2EX(idiv, idiv_ex); EM_ASM_3WCL(shld); EM_ASM_3WCL(shrd); EM_ASM_2W(imul); EM_ASM_1(not); EM_ASM_1(neg); EM_ASM_1(inc); EM_ASM_1(dec); EM_ASM_2CL(rol); EM_ASM_2CL(ror); EM_ASM_2CL(rcl); EM_ASM_2CL(rcr); EM_ASM_2CL(shl); EM_ASM_2CL(shr); EM_ASM_2CL(sar); EM_ASM_2W(bsf); EM_ASM_2W(bsr); EM_ASM_2W(bt); EM_ASM_2W(bts); EM_ASM_2W(btr); EM_ASM_2W(btc); EM_ASM_2R(cmp, cmp_r); static int em_bsf_c(struct x86_emulate_ctxt *ctxt) { /* If src is zero, do not writeback, but update flags */ if (ctxt->src.val == 0) ctxt->dst.type = OP_NONE; return em_bsf(ctxt); } static int em_bsr_c(struct x86_emulate_ctxt *ctxt) { /* If src is zero, do not writeback, but update flags */ if (ctxt->src.val == 0) ctxt->dst.type = OP_NONE; return em_bsr(ctxt); } static __always_inline u8 test_cc(unsigned int condition, unsigned long flags) { return __emulate_cc(flags, condition & 0xf); } static void fetch_register_operand(struct operand *op) { switch (op->bytes) { case 1: op->val = *(u8 *)op->addr.reg; break; case 2: op->val = *(u16 *)op->addr.reg; break; case 4: op->val = *(u32 *)op->addr.reg; break; case 8: op->val = *(u64 *)op->addr.reg; break; } op->orig_val = op->val; } static int em_fninit(struct x86_emulate_ctxt *ctxt) { if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) return emulate_nm(ctxt); kvm_fpu_get(); asm volatile("fninit"); kvm_fpu_put(); return X86EMUL_CONTINUE; } static int em_fnstcw(struct x86_emulate_ctxt *ctxt) { u16 fcw; if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) return emulate_nm(ctxt); kvm_fpu_get(); asm volatile("fnstcw %0": "+m"(fcw)); kvm_fpu_put(); ctxt->dst.val = fcw; return X86EMUL_CONTINUE; } static int em_fnstsw(struct x86_emulate_ctxt *ctxt) { u16 fsw; if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) return emulate_nm(ctxt); kvm_fpu_get(); asm volatile("fnstsw %0": "+m"(fsw)); kvm_fpu_put(); ctxt->dst.val = fsw; return X86EMUL_CONTINUE; } static void __decode_register_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, int reg) { if ((ctxt->d & Avx) && ctxt->op_bytes == 32) { op->type = OP_YMM; op->bytes = 32; op->addr.xmm = reg; kvm_read_avx_reg(reg, &op->vec_val2); return; } if (ctxt->d & (Avx|Sse)) { op->type = OP_XMM; op->bytes = 16; op->addr.xmm = reg; kvm_read_sse_reg(reg, &op->vec_val); return; } if (ctxt->d & Mmx) { reg &= 7; op->type = OP_MM; op->bytes = 8; op->addr.mm = reg; return; } op->type = OP_REG; op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; op->addr.reg = decode_register(ctxt, reg, ctxt->d & ByteOp); fetch_register_operand(op); } static void decode_register_operand(struct x86_emulate_ctxt *ctxt, struct operand *op) { unsigned int reg; if (ctxt->d & ModRM) reg = ctxt->modrm_reg; else reg = (ctxt->b & 7) | (ctxt->rex_bits & REX_B ? 8 : 0); __decode_register_operand(ctxt, op, reg); } static void adjust_modrm_seg(struct x86_emulate_ctxt *ctxt, int base_reg) { if (base_reg == VCPU_REGS_RSP || base_reg == VCPU_REGS_RBP) ctxt->modrm_seg = VCPU_SREG_SS; } static int decode_modrm(struct x86_emulate_ctxt *ctxt, struct operand *op) { u8 sib; int index_reg, base_reg, scale; int rc = X86EMUL_CONTINUE; ulong modrm_ea = 0; ctxt->modrm_reg = (ctxt->rex_bits & REX_R ? 8 : 0); index_reg = (ctxt->rex_bits & REX_X ? 8 : 0); base_reg = (ctxt->rex_bits & REX_B ? 8 : 0); ctxt->modrm_mod = (ctxt->modrm & 0xc0) >> 6; ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3; ctxt->modrm_rm = base_reg | (ctxt->modrm & 0x07); ctxt->modrm_seg = VCPU_SREG_DS; if (ctxt->modrm_mod == 3 || (ctxt->d & NoMod)) { __decode_register_operand(ctxt, op, ctxt->modrm_rm); return rc; } op->type = OP_MEM; if (ctxt->ad_bytes == 2) { unsigned bx = reg_read(ctxt, VCPU_REGS_RBX); unsigned bp = reg_read(ctxt, VCPU_REGS_RBP); unsigned si = reg_read(ctxt, VCPU_REGS_RSI); unsigned di = reg_read(ctxt, VCPU_REGS_RDI); /* 16-bit ModR/M decode. */ switch (ctxt->modrm_mod) { case 0: if (ctxt->modrm_rm == 6) modrm_ea += insn_fetch(u16, ctxt); break; case 1: modrm_ea += insn_fetch(s8, ctxt); break; case 2: modrm_ea += insn_fetch(u16, ctxt); break; } switch (ctxt->modrm_rm) { case 0: modrm_ea += bx + si; break; case 1: modrm_ea += bx + di; break; case 2: modrm_ea += bp + si; break; case 3: modrm_ea += bp + di; break; case 4: modrm_ea += si; break; case 5: modrm_ea += di; break; case 6: if (ctxt->modrm_mod != 0) modrm_ea += bp; break; case 7: modrm_ea += bx; break; } if (ctxt->modrm_rm == 2 || ctxt->modrm_rm == 3 || (ctxt->modrm_rm == 6 && ctxt->modrm_mod != 0)) ctxt->modrm_seg = VCPU_SREG_SS; modrm_ea = (u16)modrm_ea; } else { /* 32/64-bit ModR/M decode. */ if ((ctxt->modrm_rm & 7) == 4) { sib = insn_fetch(u8, ctxt); index_reg |= (sib >> 3) & 7; base_reg |= sib & 7; scale = sib >> 6; if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0) modrm_ea += insn_fetch(s32, ctxt); else { modrm_ea += reg_read(ctxt, base_reg); adjust_modrm_seg(ctxt, base_reg); /* Increment ESP on POP [ESP] */ if ((ctxt->d & IncSP) && base_reg == VCPU_REGS_RSP) modrm_ea += ctxt->op_bytes; } if (index_reg != 4) modrm_ea += reg_read(ctxt, index_reg) << scale; } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) { modrm_ea += insn_fetch(s32, ctxt); if (ctxt->mode == X86EMUL_MODE_PROT64) ctxt->rip_relative = 1; } else { base_reg = ctxt->modrm_rm; modrm_ea += reg_read(ctxt, base_reg); adjust_modrm_seg(ctxt, base_reg); } switch (ctxt->modrm_mod) { case 1: modrm_ea += insn_fetch(s8, ctxt); break; case 2: modrm_ea += insn_fetch(s32, ctxt); break; } } op->addr.mem.ea = modrm_ea; if (ctxt->ad_bytes != 8) ctxt->memop.addr.mem.ea = (u32)ctxt->memop.addr.mem.ea; done: return rc; } static int decode_abs(struct x86_emulate_ctxt *ctxt, struct operand *op) { int rc = X86EMUL_CONTINUE; op->type = OP_MEM; switch (ctxt->ad_bytes) { case 2: op->addr.mem.ea = insn_fetch(u16, ctxt); break; case 4: op->addr.mem.ea = insn_fetch(u32, ctxt); break; case 8: op->addr.mem.ea = insn_fetch(u64, ctxt); break; } done: return rc; } static void fetch_bit_operand(struct x86_emulate_ctxt *ctxt) { long sv = 0, mask; if (ctxt->dst.type == OP_MEM && ctxt->src.type == OP_REG) { mask = ~((long)ctxt->dst.bytes * 8 - 1); if (ctxt->src.bytes == 2) sv = (s16)ctxt->src.val & (s16)mask; else if (ctxt->src.bytes == 4) sv = (s32)ctxt->src.val & (s32)mask; else sv = (s64)ctxt->src.val & (s64)mask; ctxt->dst.addr.mem.ea = address_mask(ctxt, ctxt->dst.addr.mem.ea + (sv >> 3)); } /* only subword offset */ ctxt->src.val &= (ctxt->dst.bytes << 3) - 1; } static int read_emulated(struct x86_emulate_ctxt *ctxt, unsigned long addr, void *dest, unsigned size) { int rc; struct read_cache *mc = &ctxt->mem_read; if (mc->pos < mc->end) goto read_cached; if (KVM_EMULATOR_BUG_ON((mc->end + size) >= sizeof(mc->data), ctxt)) return X86EMUL_UNHANDLEABLE; rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, size, &ctxt->exception); if (rc != X86EMUL_CONTINUE) return rc; mc->end += size; read_cached: memcpy(dest, mc->data + mc->pos, size); mc->pos += size; return X86EMUL_CONTINUE; } static int segmented_read(struct x86_emulate_ctxt *ctxt, struct segmented_address addr, void *data, unsigned size) { int rc; ulong linear; rc = linearize(ctxt, addr, size, false, &linear); if (rc != X86EMUL_CONTINUE) return rc; return read_emulated(ctxt, linear, data, size); } static int segmented_write(struct x86_emulate_ctxt *ctxt, struct segmented_address addr, const void *data, unsigned size) { int rc; ulong linear; rc = linearize(ctxt, addr, size, true, &linear); if (rc != X86EMUL_CONTINUE) return rc; return ctxt->ops->write_emulated(ctxt, linear, data, size, &ctxt->exception); } static int segmented_cmpxchg(struct x86_emulate_ctxt *ctxt, struct segmented_address addr, const void *orig_data, const void *data, unsigned size) { int rc; ulong linear; rc = linearize(ctxt, addr, size, true, &linear); if (rc != X86EMUL_CONTINUE) return rc; return ctxt->ops->cmpxchg_emulated(ctxt, linear, orig_data, data, size, &ctxt->exception); } static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, unsigned int size, unsigned short port, void *dest) { struct read_cache *rc = &ctxt->io_read; if (rc->pos == rc->end) { /* refill pio read ahead */ unsigned int in_page, n; unsigned int count = ctxt->rep_prefix ? address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) : 1; in_page = (ctxt->eflags & X86_EFLAGS_DF) ? offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) : PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)); n = min3(in_page, (unsigned int)sizeof(rc->data) / size, count); if (n == 0) n = 1; rc->pos = rc->end = 0; if (!ctxt->ops->pio_in_emulated(ctxt, size, port, rc->data, n)) return 0; rc->end = n * size; } if (ctxt->rep_prefix && (ctxt->d & String) && !(ctxt->eflags & X86_EFLAGS_DF)) { ctxt->dst.data = rc->data + rc->pos; ctxt->dst.type = OP_MEM_STR; ctxt->dst.count = (rc->end - rc->pos) / size; rc->pos = rc->end; } else { memcpy(dest, rc->data + rc->pos, size); rc->pos += size; } return 1; } static int read_interrupt_descriptor(struct x86_emulate_ctxt *ctxt, u16 index, struct desc_struct *desc) { struct desc_ptr dt; ulong addr; ctxt->ops->get_idt(ctxt, &dt); if (dt.size < index * 8 + 7) return emulate_gp(ctxt, index << 3 | 0x2); addr = dt.address + index * 8; return linear_read_system(ctxt, addr, desc, sizeof(*desc)); } static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, u16 selector, struct desc_ptr *dt) { const struct x86_emulate_ops *ops = ctxt->ops; u32 base3 = 0; if (selector & 1 << 2) { struct desc_struct desc; u16 sel; memset(dt, 0, sizeof(*dt)); if (!ops->get_segment(ctxt, &sel, &desc, &base3, VCPU_SREG_LDTR)) return; dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */ dt->address = get_desc_base(&desc) | ((u64)base3 << 32); } else ops->get_gdt(ctxt, dt); } static int get_descriptor_ptr(struct x86_emulate_ctxt *ctxt, u16 selector, ulong *desc_addr_p) { struct desc_ptr dt; u16 index = selector >> 3; ulong addr; get_descriptor_table_ptr(ctxt, selector, &dt); if (dt.size < index * 8 + 7) return emulate_gp(ctxt, selector & 0xfffc); addr = dt.address + index * 8; #ifdef CONFIG_X86_64 if (addr >> 32 != 0) { u64 efer = 0; ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); if (!(efer & EFER_LMA)) addr &= (u32)-1; } #endif *desc_addr_p = addr; return X86EMUL_CONTINUE; } /* allowed just for 8 bytes segments */ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, struct desc_struct *desc, ulong *desc_addr_p) { int rc; rc = get_descriptor_ptr(ctxt, selector, desc_addr_p); if (rc != X86EMUL_CONTINUE) return rc; return linear_read_system(ctxt, *desc_addr_p, desc, sizeof(*desc)); } /* allowed just for 8 bytes segments */ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, struct desc_struct *desc) { int rc; ulong addr; rc = get_descriptor_ptr(ctxt, selector, &addr); if (rc != X86EMUL_CONTINUE) return rc; return linear_write_system(ctxt, addr, desc, sizeof(*desc)); } static bool emulator_is_ssp_invalid(struct x86_emulate_ctxt *ctxt, u8 cpl) { const u32 MSR_IA32_X_CET = cpl == 3 ? MSR_IA32_U_CET : MSR_IA32_S_CET; u64 efer = 0, cet = 0, ssp = 0; if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_CET)) return false; if (ctxt->ops->get_msr(ctxt, MSR_EFER, &efer)) return true; /* SSP is guaranteed to be valid if the vCPU was already in 32-bit mode. */ if (!(efer & EFER_LMA)) return false; if (ctxt->ops->get_msr(ctxt, MSR_IA32_X_CET, &cet)) return true; if (!(cet & CET_SHSTK_EN)) return false; if (ctxt->ops->get_msr(ctxt, MSR_KVM_INTERNAL_GUEST_SSP, &ssp)) return true; /* * On transfer from 64-bit mode to compatibility mode, SSP[63:32] must * be 0, i.e. SSP must be a 32-bit value outside of 64-bit mode. */ return ssp >> 32; } static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, int seg, u8 cpl, enum x86_transfer_type transfer, struct desc_struct *desc) { struct desc_struct seg_desc, old_desc; u8 dpl, rpl; unsigned err_vec = GP_VECTOR; u32 err_code = 0; bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ ulong desc_addr; int ret; u16 dummy; u32 base3 = 0; memset(&seg_desc, 0, sizeof(seg_desc)); if (ctxt->mode == X86EMUL_MODE_REAL) { /* set real mode segment descriptor (keep limit etc. for * unreal mode) */ ctxt->ops->get_segment(ctxt, &dummy, &seg_desc, NULL, seg); set_desc_base(&seg_desc, selector << 4); goto load; } else if (seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86) { /* VM86 needs a clean new segment descriptor */ set_desc_base(&seg_desc, selector << 4); set_desc_limit(&seg_desc, 0xffff); seg_desc.type = 3; seg_desc.p = 1; seg_desc.s = 1; seg_desc.dpl = 3; goto load; } rpl = selector & 3; /* TR should be in GDT only */ if (seg == VCPU_SREG_TR && (selector & (1 << 2))) goto exception; /* NULL selector is not valid for TR, CS and (except for long mode) SS */ if (null_selector) { if (seg == VCPU_SREG_CS || seg == VCPU_SREG_TR) goto exception; if (seg == VCPU_SREG_SS) { if (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl) goto exception; /* * ctxt->ops->set_segment expects the CPL to be in * SS.DPL, so fake an expand-up 32-bit data segment. */ seg_desc.type = 3; seg_desc.p = 1; seg_desc.s = 1; seg_desc.dpl = cpl; seg_desc.d = 1; seg_desc.g = 1; } /* Skip all following checks */ goto load; } ret = read_segment_descriptor(ctxt, selector, &seg_desc, &desc_addr); if (ret != X86EMUL_CONTINUE) return ret; err_code = selector & 0xfffc; err_vec = (transfer == X86_TRANSFER_TASK_SWITCH) ? TS_VECTOR : GP_VECTOR; /* can't load system descriptor into segment selector */ if (seg <= VCPU_SREG_GS && !seg_desc.s) { if (transfer == X86_TRANSFER_CALL_JMP) return X86EMUL_UNHANDLEABLE; goto exception; } dpl = seg_desc.dpl; switch (seg) { case VCPU_SREG_SS: /* * segment is not a writable data segment or segment * selector's RPL != CPL or DPL != CPL */ if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl) goto exception; break; case VCPU_SREG_CS: /* * KVM uses "none" when loading CS as part of emulating Real * Mode exceptions and IRET (handled above). In all other * cases, loading CS without a control transfer is a KVM bug. */ if (WARN_ON_ONCE(transfer == X86_TRANSFER_NONE)) goto exception; if (!(seg_desc.type & 8)) goto exception; if (transfer == X86_TRANSFER_RET) { /* RET can never return to an inner privilege level. */ if (rpl < cpl) goto exception; /* Outer-privilege level return is not implemented */ if (rpl > cpl) return X86EMUL_UNHANDLEABLE; } if (transfer == X86_TRANSFER_RET || transfer == X86_TRANSFER_TASK_SWITCH) { if (seg_desc.type & 4) { /* conforming */ if (dpl > rpl) goto exception; } else { /* nonconforming */ if (dpl != rpl) goto exception; } } else { /* X86_TRANSFER_CALL_JMP */ if (seg_desc.type & 4) { /* conforming */ if (dpl > cpl) goto exception; } else { /* nonconforming */ if (rpl > cpl || dpl != cpl) goto exception; } } /* in long-mode d/b must be clear if l is set */ if (seg_desc.d && seg_desc.l) { u64 efer = 0; ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); if (efer & EFER_LMA) goto exception; } if (!seg_desc.l && emulator_is_ssp_invalid(ctxt, cpl)) { err_code = 0; goto exception; } /* CS(RPL) <- CPL */ selector = (selector & 0xfffc) | cpl; break; case VCPU_SREG_TR: if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9)) goto exception; break; case VCPU_SREG_LDTR: if (seg_desc.s || seg_desc.type != 2) goto exception; break; default: /* DS, ES, FS, or GS */ /* * segment is not a data or readable code segment or * ((segment is a data or nonconforming code segment) * and ((RPL > DPL) or (CPL > DPL))) */ if ((seg_desc.type & 0xa) == 0x8 || (((seg_desc.type & 0xc) != 0xc) && (rpl > dpl || cpl > dpl))) goto exception; break; } if (!seg_desc.p) { err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR; goto exception; } if (seg_desc.s) { /* mark segment as accessed */ if (!(seg_desc.type & 1)) { seg_desc.type |= 1; ret = write_segment_descriptor(ctxt, selector, &seg_desc); if (ret != X86EMUL_CONTINUE) return ret; } } else if (ctxt->mode == X86EMUL_MODE_PROT64) { ret = linear_read_system(ctxt, desc_addr+8, &base3, sizeof(base3)); if (ret != X86EMUL_CONTINUE) return ret; if (emul_is_noncanonical_address(get_desc_base(&seg_desc) | ((u64)base3 << 32), ctxt, X86EMUL_F_DT_LOAD)) return emulate_gp(ctxt, err_code); } if (seg == VCPU_SREG_TR) { old_desc = seg_desc; seg_desc.type |= 2; /* busy */ ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc, sizeof(seg_desc), &ctxt->exception); if (ret != X86EMUL_CONTINUE) return ret; } load: ctxt->ops->set_segment(ctxt, selector, &seg_desc, base3, seg); if (desc) *desc = seg_desc; return X86EMUL_CONTINUE; exception: return emulate_exception(ctxt, err_vec, err_code, true); } static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, int seg) { u8 cpl = ctxt->ops->cpl(ctxt); /* * None of MOV, POP and LSS can load a NULL selector in CPL=3, but * they can load it at CPL<3 (Intel's manual says only LSS can, * but it's wrong). * * However, the Intel manual says that putting IST=1/DPL=3 in * an interrupt gate will result in SS=3 (the AMD manual instead * says it doesn't), so allow SS=3 in __load_segment_descriptor * and only forbid it here. */ if (seg == VCPU_SREG_SS && selector == 3 && ctxt->mode == X86EMUL_MODE_PROT64) return emulate_exception(ctxt, GP_VECTOR, 0, true); return __load_segment_descriptor(ctxt, selector, seg, cpl, X86_TRANSFER_NONE, NULL); } static void write_register_operand(struct operand *op) { return assign_register(op->addr.reg, op->val, op->bytes); } static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op) { switch (op->type) { case OP_REG: write_register_operand(op); break; case OP_MEM: if (ctxt->lock_prefix) return segmented_cmpxchg(ctxt, op->addr.mem, &op->orig_val, &op->val, op->bytes); else return segmented_write(ctxt, op->addr.mem, &op->val, op->bytes); case OP_MEM_STR: return segmented_write(ctxt, op->addr.mem, op->data, op->bytes * op->count); case OP_XMM: if (!(ctxt->d & Avx)) { kvm_write_sse_reg(op->addr.xmm, &op->vec_val); break; } /* full YMM write but with high bytes cleared */ memset(op->valptr + 16, 0, 16); fallthrough; case OP_YMM: kvm_write_avx_reg(op->addr.xmm, &op->vec_val2); break; case OP_MM: kvm_write_mmx_reg(op->addr.mm, &op->mm_val); break; case OP_NONE: /* no writeback */ break; default: break; } return X86EMUL_CONTINUE; } static int emulate_push(struct x86_emulate_ctxt *ctxt, const void *data, int len) { struct segmented_address addr; rsp_increment(ctxt, -len); addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt); addr.seg = VCPU_SREG_SS; return segmented_write(ctxt, addr, data, len); } static int em_push(struct x86_emulate_ctxt *ctxt) { /* Disable writeback. */ ctxt->dst.type = OP_NONE; return emulate_push(ctxt, &ctxt->src.val, ctxt->op_bytes); } static int emulate_pop(struct x86_emulate_ctxt *ctxt, void *dest, int len) { int rc; struct segmented_address addr; addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt); addr.seg = VCPU_SREG_SS; rc = segmented_read(ctxt, addr, dest, len); if (rc != X86EMUL_CONTINUE) return rc; rsp_increment(ctxt, len); return rc; } static int em_pop(struct x86_emulate_ctxt *ctxt) { return emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes); } static int emulate_popf(struct x86_emulate_ctxt *ctxt, void *dest, int len) { int rc; unsigned long val = 0; unsigned long change_mask; int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT; int cpl = ctxt->ops->cpl(ctxt); rc = emulate_pop(ctxt, &val, len); if (rc != X86EMUL_CONTINUE) return rc; change_mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF | X86_EFLAGS_TF | X86_EFLAGS_DF | X86_EFLAGS_NT | X86_EFLAGS_AC | X86_EFLAGS_ID; switch(ctxt->mode) { case X86EMUL_MODE_PROT64: case X86EMUL_MODE_PROT32: case X86EMUL_MODE_PROT16: if (cpl == 0) change_mask |= X86_EFLAGS_IOPL; if (cpl <= iopl) change_mask |= X86_EFLAGS_IF; break; case X86EMUL_MODE_VM86: if (iopl < 3) return emulate_gp(ctxt, 0); change_mask |= X86_EFLAGS_IF; break; default: /* real mode */ change_mask |= (X86_EFLAGS_IOPL | X86_EFLAGS_IF); break; } *(unsigned long *)dest = (ctxt->eflags & ~change_mask) | (val & change_mask); return rc; } static int em_popf(struct x86_emulate_ctxt *ctxt) { ctxt->dst.type = OP_REG; ctxt->dst.addr.reg = &ctxt->eflags; ctxt->dst.bytes = ctxt->op_bytes; return emulate_popf(ctxt, &ctxt->dst.