| 50 517 900 30 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* fs/ internal definitions * * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ struct super_block; struct file_system_type; struct iomap; struct iomap_ops; struct linux_binprm; struct path; struct mount; struct shrink_control; struct fs_context; struct pipe_inode_info; struct iov_iter; struct mnt_idmap; /* * block/bdev.c */ #ifdef CONFIG_BLOCK extern void __init bdev_cache_init(void); #else static inline void bdev_cache_init(void) { } #endif /* CONFIG_BLOCK */ /* * buffer.c */ int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block, const struct iomap *iomap); /* * char_dev.c */ extern void __init chrdev_init(void); /* * fs_context.c */ extern const struct fs_context_operations legacy_fs_context_ops; extern int parse_monolithic_mount_data(struct fs_context *, void *); extern void vfs_clean_context(struct fs_context *fc); extern int finish_clean_context(struct fs_context *fc); /* * namei.c */ extern int filename_lookup(int dfd, struct filename *name, unsigned flags, struct path *path, struct path *root); int do_rmdir(int dfd, struct filename *name); int do_unlinkat(int dfd, struct filename *name); int may_linkat(struct mnt_idmap *idmap, const struct path *link); int do_renameat2(int olddfd, struct filename *oldname, int newdfd, struct filename *newname, unsigned int flags); int do_mkdirat(int dfd, struct filename *name, umode_t mode); int do_symlinkat(struct filename *from, int newdfd, struct filename *to); int do_linkat(int olddfd, struct filename *old, int newdfd, struct filename *new, int flags); /* * namespace.c */ extern struct vfsmount *lookup_mnt(const struct path *); extern int finish_automount(struct vfsmount *, const struct path *); extern int sb_prepare_remount_readonly(struct super_block *); extern void __init mnt_init(void); extern int __mnt_want_write_file(struct file *); extern void __mnt_drop_write_file(struct file *); extern void dissolve_on_fput(struct vfsmount *); extern bool may_mount(void); int path_mount(const char *dev_name, struct path *path, const char *type_page, unsigned long flags, void *data_page); int path_umount(struct path *path, int flags); /* * fs_struct.c */ extern void chroot_fs_refs(const struct path *, const struct path *); /* * file_table.c */ struct file *alloc_empty_file(int flags, const struct cred *cred); struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred); struct file *alloc_empty_backing_file(int flags, const struct cred *cred); static inline void put_file_access(struct file *file) { if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) { i_readcount_dec(file->f_inode); } else if (file->f_mode & FMODE_WRITER) { put_write_access(file->f_inode); __mnt_drop_write(file->f_path.mnt); } } /* * super.c */ extern int reconfigure_super(struct fs_context *); extern bool super_trylock_shared(struct super_block *sb); struct super_block *user_get_super(dev_t, bool excl); void put_super(struct super_block *sb); extern bool mount_capable(struct fs_context *); int sb_init_dio_done_wq(struct super_block *sb); /* * Prepare superblock for changing its read-only state (i.e., either remount * read-write superblock read-only or vice versa). After this function returns * mnt_is_readonly() will return true for any mount of the superblock if its * caller is able to observe any changes done by the remount. This holds until * sb_end_ro_state_change() is called. */ static inline void sb_start_ro_state_change(struct super_block *sb) { WRITE_ONCE(sb->s_readonly_remount, 1); /* * For RO->RW transition, the barrier pairs with the barrier in * mnt_is_readonly() making sure if mnt_is_readonly() sees SB_RDONLY * cleared, it will see s_readonly_remount set. * For RW->RO transition, the barrier pairs with the barrier in * __mnt_want_write() before the mnt_is_readonly() check. The barrier * makes sure if __mnt_want_write() sees MNT_WRITE_HOLD already * cleared, it will see s_readonly_remount set. */ smp_wmb(); } /* * Ends section changing read-only state of the superblock. After this function * returns if mnt_is_readonly() returns false, the caller will be able to * observe all the changes remount did to the superblock. */ static inline void sb_end_ro_state_change(struct super_block *sb) { /* * This barrier provides release semantics that pairs with * the smp_rmb() acquire semantics in mnt_is_readonly(). * This barrier pair ensure that when mnt_is_readonly() sees * 0 for sb->s_readonly_remount, it will also see all the * preceding flag changes that were made during the RO state * change. */ smp_wmb(); WRITE_ONCE(sb->s_readonly_remount, 0); } /* * open.c */ struct open_flags { int open_flag; umode_t mode; int acc_mode; int intent; int lookup_flags; }; extern struct file *do_filp_open(int dfd, struct filename *pathname, const struct open_flags *op); extern struct file *do_file_open_root(const struct path *, const char *, const struct open_flags *); extern struct open_how build_open_how(int flags, umode_t mode); extern int build_open_flags(const struct open_how *how, struct open_flags *op); extern struct file *__close_fd_get_file(unsigned int fd); long do_sys_ftruncate(unsigned int fd, loff_t length, int small); int chmod_common(const struct path *path, umode_t mode); int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, int flag); int chown_common(const struct path *path, uid_t user, gid_t group); extern int vfs_open(const struct path *, struct file *); /* * inode.c */ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); int dentry_needs_remove_privs(struct mnt_idmap *, struct dentry *dentry); bool in_group_or_capable(struct mnt_idmap *idmap, const struct inode *inode, vfsgid_t vfsgid); void lock_two_inodes(struct inode *inode1, struct inode *inode2, unsigned subclass1, unsigned subclass2); /* * fs-writeback.c */ extern long get_nr_dirty_inodes(void); void invalidate_inodes(struct super_block *sb); /* * dcache.c */ extern int d_set_mounted(struct dentry *dentry); extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc); extern struct dentry *d_alloc_cursor(struct dentry *); extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *); extern char *simple_dname(struct dentry *, char *, int); extern void dput_to_list(struct dentry *, struct list_head *); extern void shrink_dentry_list(struct list_head *); /* * pipe.c */ extern const struct file_operations pipefifo_fops; /* * fs_pin.c */ extern void group_pin_kill(struct hlist_head *p); extern void mnt_pin_kill(struct mount *m); /* * fs/nsfs.c */ extern const struct dentry_operations ns_dentry_operations; /* * fs/stat.c: */ int getname_statx_lookup_flags(int flags); int do_statx(int dfd, struct filename *filename, unsigned int flags, unsigned int mask, struct statx __user *buffer); /* * fs/splice.c: */ long splice_file_to_pipe(struct file *in, struct pipe_inode_info *opipe, loff_t *offset, size_t len, unsigned int flags); /* * fs/xattr.c: */ struct xattr_name { char name[XATTR_NAME_MAX + 1]; }; struct xattr_ctx { /* Value of attribute */ union { const void __user *cvalue; void __user *value; }; void *kvalue; size_t size; /* Attribute name */ struct xattr_name *kname; unsigned int flags; }; ssize_t do_getxattr(struct mnt_idmap *idmap, struct dentry *d, struct xattr_ctx *ctx); int setxattr_copy(const char __user *name, struct xattr_ctx *ctx); int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, struct xattr_ctx *ctx); int may_write_xattr(struct mnt_idmap *idmap, struct inode *inode); #ifdef CONFIG_FS_POSIX_ACL int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, const void *kvalue, size_t size); ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, void *kvalue, size_t size); #else static inline int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, const void *kvalue, size_t size) { return -EOPNOTSUPP; } static inline ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, void *kvalue, size_t size) { return -EOPNOTSUPP; } #endif ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos); /* * fs/attr.c */ struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns); struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap); void mnt_idmap_put(struct mnt_idmap *idmap); |
| 1415 1415 1415 1415 1415 296 296 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520 9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540 9541 9542 9543 9544 9545 9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558 9559 9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591 9592 9593 9594 9595 9596 9597 9598 9599 9600 9601 9602 9603 9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620 9621 9622 9623 9624 9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648 9649 9650 9651 9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9758 9759 9760 9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772 9773 9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784 9785 9786 9787 9788 9789 9790 9791 9792 9793 9794 9795 9796 9797 9798 9799 9800 9801 9802 9803 9804 9805 9806 9807 9808 9809 9810 9811 9812 9813 9814 9815 9816 9817 9818 9819 9820 9821 9822 9823 9824 9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837 9838 9839 9840 9841 9842 9843 9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860 9861 9862 9863 9864 9865 9866 9867 9868 9869 9870 9871 9872 9873 9874 9875 9876 9877 9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895 9896 9897 9898 9899 9900 9901 9902 9903 9904 9905 9906 9907 9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922 9923 9924 9925 9926 9927 9928 9929 9930 9931 9932 9933 9934 9935 9936 9937 9938 9939 9940 9941 9942 9943 9944 9945 9946 9947 9948 9949 9950 9951 9952 9953 9954 9955 9956 9957 9958 9959 9960 9961 9962 9963 9964 9965 9966 9967 9968 9969 9970 9971 9972 9973 9974 9975 9976 9977 9978 9979 9980 9981 9982 9983 9984 9985 9986 9987 9988 9989 9990 9991 9992 9993 9994 9995 9996 9997 9998 9999 10000 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 10011 10012 10013 10014 10015 10016 10017 10018 10019 10020 10021 10022 10023 10024 10025 10026 10027 10028 10029 10030 10031 10032 10033 10034 10035 10036 10037 10038 10039 10040 10041 10042 10043 10044 10045 10046 10047 10048 10049 10050 10051 10052 10053 10054 10055 10056 10057 10058 10059 10060 10061 10062 10063 10064 10065 10066 10067 10068 10069 10070 10071 10072 10073 10074 10075 10076 10077 10078 10079 10080 10081 10082 10083 10084 10085 10086 10087 10088 10089 10090 10091 10092 10093 10094 10095 10096 10097 10098 10099 10100 10101 10102 10103 10104 10105 10106 10107 10108 10109 10110 10111 10112 10113 10114 10115 10116 10117 10118 10119 10120 10121 10122 10123 10124 10125 10126 10127 10128 10129 10130 10131 10132 10133 10134 10135 10136 10137 10138 10139 10140 10141 10142 10143 10144 10145 10146 10147 10148 10149 10150 10151 10152 10153 10154 10155 10156 10157 10158 10159 10160 10161 10162 10163 10164 10165 10166 10167 10168 10169 10170 10171 10172 10173 10174 10175 10176 10177 10178 10179 10180 10181 10182 10183 10184 10185 10186 10187 10188 10189 10190 10191 10192 10193 10194 10195 10196 10197 10198 10199 10200 10201 10202 10203 10204 10205 10206 10207 10208 10209 10210 10211 10212 10213 10214 10215 10216 10217 10218 10219 10220 10221 10222 10223 10224 10225 10226 10227 10228 10229 10230 10231 10232 10233 10234 10235 10236 10237 10238 10239 10240 10241 10242 10243 10244 10245 10246 10247 10248 10249 10250 10251 10252 10253 10254 10255 10256 10257 10258 10259 10260 10261 10262 10263 10264 10265 10266 10267 10268 10269 10270 10271 10272 10273 10274 10275 10276 10277 10278 10279 10280 10281 10282 10283 10284 10285 10286 10287 10288 10289 10290 10291 10292 10293 10294 10295 10296 10297 10298 10299 10300 10301 10302 10303 10304 10305 10306 10307 10308 10309 10310 10311 10312 10313 10314 10315 10316 10317 10318 10319 10320 10321 10322 10323 10324 10325 10326 10327 10328 10329 10330 10331 10332 10333 10334 10335 10336 10337 10338 10339 10340 10341 10342 10343 10344 10345 10346 10347 10348 10349 10350 10351 10352 10353 10354 10355 10356 10357 10358 10359 10360 10361 10362 10363 10364 10365 10366 10367 10368 10369 10370 10371 10372 10373 10374 10375 10376 10377 10378 10379 10380 10381 10382 10383 10384 10385 10386 10387 10388 10389 10390 10391 10392 10393 10394 10395 10396 10397 10398 10399 10400 10401 10402 10403 10404 10405 10406 10407 10408 10409 10410 10411 10412 10413 10414 10415 10416 10417 10418 10419 10420 10421 10422 10423 10424 10425 10426 10427 10428 10429 10430 10431 10432 10433 10434 10435 10436 10437 10438 10439 10440 10441 10442 10443 10444 10445 10446 10447 10448 10449 10450 10451 10452 10453 10454 10455 10456 10457 10458 10459 10460 10461 10462 10463 10464 10465 10466 10467 10468 10469 10470 10471 10472 10473 10474 10475 10476 10477 10478 10479 10480 10481 10482 10483 10484 10485 10486 10487 10488 10489 10490 10491 10492 10493 10494 10495 10496 10497 10498 10499 10500 10501 10502 10503 10504 10505 10506 10507 10508 10509 10510 10511 10512 10513 10514 10515 10516 10517 10518 10519 10520 10521 10522 10523 10524 10525 10526 10527 10528 10529 10530 10531 10532 10533 10534 10535 10536 10537 10538 10539 10540 10541 10542 10543 10544 10545 10546 10547 10548 10549 10550 10551 10552 10553 10554 10555 10556 10557 10558 10559 10560 10561 10562 10563 10564 10565 10566 10567 10568 10569 10570 10571 10572 10573 10574 10575 10576 10577 10578 10579 10580 10581 10582 10583 10584 10585 10586 10587 10588 10589 10590 10591 10592 10593 10594 10595 10596 10597 10598 10599 10600 10601 10602 10603 10604 10605 10606 10607 10608 10609 10610 10611 10612 10613 10614 10615 10616 10617 10618 10619 10620 10621 10622 10623 10624 10625 10626 10627 10628 10629 10630 10631 10632 10633 10634 10635 10636 10637 10638 10639 10640 10641 10642 10643 10644 10645 10646 10647 10648 10649 10650 10651 10652 10653 10654 10655 10656 10657 10658 10659 10660 10661 10662 10663 10664 10665 10666 10667 10668 10669 10670 10671 10672 10673 10674 10675 10676 10677 10678 10679 10680 10681 10682 10683 10684 10685 10686 10687 10688 10689 10690 10691 10692 10693 10694 10695 10696 10697 10698 10699 10700 10701 10702 10703 10704 10705 10706 10707 10708 10709 10710 10711 10712 10713 10714 10715 10716 10717 10718 10719 10720 10721 10722 10723 10724 10725 10726 10727 10728 10729 10730 10731 10732 10733 10734 10735 10736 10737 10738 10739 10740 10741 10742 10743 10744 10745 10746 10747 10748 10749 10750 10751 10752 10753 10754 10755 10756 10757 10758 10759 10760 10761 10762 10763 10764 10765 10766 10767 10768 10769 10770 10771 10772 10773 10774 10775 10776 10777 10778 10779 10780 10781 10782 10783 10784 10785 10786 10787 10788 10789 10790 10791 10792 10793 10794 10795 10796 10797 10798 10799 10800 10801 10802 10803 10804 10805 10806 10807 10808 10809 10810 10811 10812 10813 10814 10815 10816 10817 10818 10819 10820 10821 10822 10823 10824 10825 10826 10827 10828 10829 10830 10831 10832 10833 10834 10835 10836 10837 10838 10839 10840 10841 10842 10843 10844 10845 10846 10847 10848 10849 10850 10851 10852 10853 10854 10855 10856 10857 10858 10859 10860 10861 10862 10863 10864 10865 10866 10867 10868 10869 10870 10871 10872 10873 10874 10875 10876 10877 10878 10879 10880 10881 10882 10883 10884 10885 10886 10887 10888 10889 10890 10891 10892 10893 10894 10895 10896 10897 10898 10899 10900 10901 10902 10903 10904 10905 10906 10907 10908 10909 10910 10911 10912 10913 10914 10915 10916 10917 10918 10919 10920 10921 10922 10923 10924 10925 10926 10927 10928 10929 10930 10931 10932 10933 10934 10935 10936 10937 10938 10939 10940 10941 10942 10943 10944 10945 10946 10947 10948 10949 10950 10951 10952 10953 10954 10955 10956 10957 10958 10959 10960 10961 10962 10963 10964 10965 10966 10967 10968 10969 10970 10971 10972 10973 10974 10975 10976 10977 10978 10979 10980 10981 10982 10983 10984 10985 10986 10987 10988 10989 10990 10991 10992 10993 10994 10995 10996 10997 10998 10999 11000 11001 11002 11003 11004 11005 11006 11007 11008 11009 11010 11011 11012 11013 11014 11015 11016 11017 11018 11019 11020 11021 11022 11023 11024 11025 11026 11027 11028 11029 11030 11031 11032 11033 11034 11035 11036 11037 11038 11039 11040 11041 11042 11043 11044 11045 11046 11047 11048 11049 11050 11051 11052 11053 11054 11055 11056 11057 11058 11059 11060 11061 11062 11063 11064 11065 11066 11067 11068 11069 11070 11071 11072 11073 11074 11075 11076 11077 11078 11079 11080 11081 11082 11083 11084 11085 11086 11087 11088 11089 11090 11091 11092 11093 11094 11095 11096 11097 11098 11099 11100 11101 11102 11103 11104 11105 11106 11107 11108 11109 11110 11111 11112 11113 11114 11115 11116 11117 11118 11119 11120 11121 11122 11123 11124 11125 11126 11127 11128 11129 11130 11131 11132 11133 11134 11135 11136 11137 11138 11139 11140 11141 11142 11143 11144 11145 11146 11147 11148 11149 11150 11151 11152 11153 11154 11155 11156 11157 11158 11159 11160 11161 11162 11163 11164 11165 11166 11167 11168 11169 11170 11171 11172 11173 11174 11175 11176 11177 11178 11179 11180 11181 11182 11183 11184 11185 11186 11187 11188 11189 11190 11191 11192 11193 11194 11195 11196 11197 11198 11199 11200 11201 11202 11203 11204 11205 11206 11207 11208 11209 11210 11211 11212 11213 11214 11215 11216 11217 11218 11219 11220 11221 11222 11223 11224 11225 11226 11227 11228 11229 11230 11231 11232 11233 11234 11235 11236 11237 11238 11239 11240 11241 11242 11243 11244 11245 11246 11247 11248 11249 11250 11251 11252 11253 11254 11255 11256 11257 11258 11259 11260 11261 11262 11263 11264 11265 11266 11267 11268 11269 11270 11271 11272 11273 11274 11275 11276 11277 11278 11279 11280 11281 11282 11283 11284 11285 11286 11287 11288 11289 11290 11291 11292 11293 11294 11295 11296 11297 11298 11299 11300 11301 11302 11303 11304 11305 11306 11307 11308 11309 11310 11311 11312 11313 11314 11315 11316 11317 11318 11319 11320 11321 11322 11323 11324 11325 11326 11327 11328 11329 11330 11331 11332 11333 11334 11335 11336 11337 11338 11339 11340 11341 11342 11343 11344 11345 11346 11347 11348 11349 11350 11351 11352 11353 11354 11355 11356 11357 11358 11359 11360 11361 11362 11363 11364 11365 11366 11367 11368 11369 11370 11371 11372 11373 11374 11375 11376 11377 11378 11379 11380 11381 11382 11383 11384 11385 11386 11387 11388 11389 11390 11391 11392 11393 11394 11395 11396 11397 11398 11399 11400 11401 11402 11403 11404 11405 11406 11407 11408 11409 11410 11411 11412 11413 11414 11415 11416 11417 11418 11419 11420 11421 11422 11423 11424 11425 11426 11427 11428 11429 11430 11431 11432 11433 11434 11435 11436 11437 11438 11439 11440 11441 11442 11443 11444 11445 11446 11447 11448 11449 11450 11451 11452 11453 11454 11455 11456 11457 11458 11459 11460 11461 11462 11463 11464 11465 11466 11467 11468 11469 11470 11471 11472 11473 11474 11475 11476 11477 11478 11479 11480 11481 11482 11483 11484 11485 11486 11487 11488 11489 11490 11491 11492 11493 11494 11495 11496 11497 11498 11499 11500 11501 11502 11503 11504 11505 11506 11507 11508 11509 11510 11511 11512 11513 11514 11515 11516 11517 11518 11519 11520 11521 11522 11523 11524 11525 11526 11527 11528 11529 11530 11531 11532 11533 11534 11535 11536 11537 11538 11539 11540 11541 11542 11543 11544 11545 11546 11547 11548 11549 11550 11551 11552 11553 11554 11555 11556 11557 11558 11559 11560 11561 11562 11563 11564 11565 11566 11567 11568 11569 11570 11571 11572 11573 11574 11575 11576 11577 11578 11579 11580 11581 11582 11583 11584 11585 11586 11587 11588 11589 11590 11591 11592 11593 11594 11595 11596 11597 11598 11599 11600 11601 11602 11603 11604 11605 11606 11607 11608 11609 11610 11611 11612 11613 11614 11615 11616 11617 11618 11619 11620 11621 11622 11623 11624 11625 11626 11627 11628 11629 11630 11631 11632 11633 11634 11635 11636 11637 11638 11639 11640 11641 11642 11643 11644 11645 11646 11647 11648 11649 11650 11651 11652 11653 11654 11655 11656 11657 11658 11659 11660 11661 11662 11663 11664 11665 11666 11667 11668 11669 11670 11671 11672 11673 11674 11675 11676 11677 11678 11679 11680 11681 11682 11683 11684 11685 11686 11687 11688 11689 11690 11691 11692 11693 11694 11695 11696 11697 11698 11699 11700 11701 11702 11703 11704 11705 11706 11707 11708 11709 11710 11711 11712 11713 11714 11715 11716 11717 11718 11719 11720 11721 11722 11723 11724 11725 11726 11727 11728 11729 11730 11731 11732 11733 11734 11735 11736 11737 11738 11739 11740 11741 11742 11743 11744 11745 11746 11747 11748 11749 11750 11751 11752 11753 11754 11755 11756 11757 11758 11759 11760 11761 11762 11763 11764 11765 11766 11767 11768 11769 11770 11771 11772 11773 11774 11775 11776 11777 11778 11779 11780 11781 11782 11783 11784 11785 11786 11787 11788 11789 11790 11791 11792 11793 11794 11795 11796 11797 11798 11799 11800 11801 11802 11803 11804 11805 11806 11807 11808 11809 11810 11811 11812 11813 11814 11815 11816 11817 11818 11819 11820 11821 11822 11823 11824 11825 11826 11827 11828 11829 11830 11831 11832 11833 11834 11835 11836 11837 11838 11839 11840 11841 11842 11843 11844 11845 11846 11847 11848 11849 11850 11851 11852 11853 11854 11855 11856 11857 11858 11859 11860 11861 11862 11863 11864 11865 11866 11867 11868 11869 11870 11871 11872 11873 11874 11875 11876 11877 11878 11879 11880 11881 11882 11883 11884 11885 11886 11887 11888 11889 11890 11891 11892 11893 11894 11895 11896 11897 11898 11899 11900 11901 11902 11903 11904 11905 11906 11907 11908 11909 11910 11911 11912 11913 11914 11915 11916 11917 11918 11919 11920 11921 11922 11923 11924 11925 11926 11927 11928 11929 11930 11931 11932 11933 11934 11935 11936 11937 11938 11939 11940 11941 11942 11943 11944 11945 11946 11947 11948 11949 11950 11951 11952 11953 11954 11955 11956 11957 11958 11959 11960 11961 11962 11963 11964 11965 11966 11967 11968 11969 11970 11971 11972 11973 11974 11975 11976 11977 11978 11979 11980 11981 11982 11983 11984 11985 11986 11987 11988 11989 11990 11991 11992 11993 11994 11995 11996 11997 11998 11999 12000 12001 12002 12003 12004 12005 12006 12007 12008 12009 12010 12011 12012 12013 12014 12015 12016 12017 12018 12019 12020 12021 12022 12023 12024 12025 12026 12027 12028 12029 12030 12031 12032 12033 12034 12035 12036 12037 12038 12039 12040 12041 12042 12043 12044 12045 12046 12047 12048 12049 12050 12051 12052 12053 12054 12055 12056 12057 12058 12059 12060 12061 12062 12063 12064 12065 12066 12067 12068 12069 12070 12071 12072 12073 12074 12075 12076 12077 12078 12079 12080 12081 12082 12083 12084 12085 12086 12087 12088 12089 12090 12091 12092 12093 12094 12095 12096 12097 12098 12099 12100 12101 12102 12103 12104 12105 12106 12107 12108 12109 12110 12111 12112 12113 12114 12115 12116 12117 12118 12119 12120 12121 12122 12123 12124 12125 12126 12127 12128 12129 12130 12131 12132 12133 12134 12135 12136 12137 12138 12139 12140 12141 12142 12143 12144 12145 12146 12147 12148 12149 12150 12151 12152 12153 12154 12155 12156 12157 12158 12159 12160 12161 12162 12163 12164 12165 12166 12167 12168 12169 12170 12171 12172 12173 12174 12175 12176 12177 12178 12179 12180 12181 12182 12183 12184 12185 12186 12187 12188 12189 12190 12191 12192 12193 12194 12195 12196 12197 12198 12199 12200 12201 12202 12203 12204 12205 12206 12207 12208 12209 12210 12211 12212 12213 12214 12215 12216 12217 12218 12219 12220 12221 12222 12223 12224 12225 12226 12227 12228 12229 12230 12231 12232 12233 12234 12235 12236 12237 12238 12239 12240 12241 12242 12243 12244 12245 12246 12247 12248 12249 12250 12251 12252 12253 12254 12255 12256 12257 12258 12259 12260 12261 12262 12263 12264 12265 12266 12267 12268 12269 12270 12271 12272 12273 12274 12275 12276 12277 12278 12279 12280 12281 12282 12283 12284 12285 12286 12287 12288 12289 12290 12291 12292 12293 12294 12295 12296 12297 12298 12299 12300 12301 12302 12303 12304 12305 12306 12307 12308 12309 12310 12311 12312 12313 12314 12315 12316 12317 12318 12319 12320 12321 12322 12323 12324 12325 12326 12327 12328 12329 12330 12331 12332 12333 12334 12335 12336 12337 12338 12339 12340 12341 12342 12343 12344 12345 12346 12347 12348 12349 12350 12351 12352 12353 12354 12355 12356 12357 12358 12359 12360 12361 12362 12363 12364 12365 12366 12367 12368 12369 12370 12371 12372 12373 12374 12375 12376 12377 12378 12379 12380 12381 12382 12383 12384 12385 12386 12387 12388 12389 12390 12391 12392 12393 12394 12395 12396 12397 12398 12399 12400 12401 12402 12403 12404 12405 12406 12407 12408 12409 12410 12411 12412 12413 12414 12415 12416 12417 12418 12419 12420 12421 12422 12423 12424 12425 12426 12427 12428 12429 12430 12431 12432 12433 12434 12435 12436 12437 12438 12439 12440 12441 12442 12443 12444 12445 12446 12447 12448 12449 12450 12451 12452 12453 12454 12455 12456 12457 12458 12459 12460 12461 12462 12463 12464 12465 12466 12467 12468 12469 12470 12471 12472 12473 12474 12475 12476 12477 12478 12479 12480 12481 12482 12483 12484 12485 12486 12487 12488 12489 12490 12491 12492 12493 12494 12495 12496 12497 12498 12499 12500 12501 12502 12503 12504 12505 12506 12507 12508 12509 12510 12511 12512 12513 12514 12515 12516 12517 12518 12519 12520 12521 12522 12523 12524 12525 12526 12527 12528 12529 12530 12531 12532 12533 12534 12535 12536 12537 12538 12539 12540 12541 12542 12543 12544 12545 12546 12547 12548 12549 12550 12551 12552 12553 12554 12555 12556 12557 12558 12559 12560 12561 12562 12563 12564 12565 12566 12567 12568 12569 12570 12571 12572 12573 12574 12575 12576 12577 12578 12579 12580 12581 12582 12583 12584 12585 12586 12587 12588 12589 12590 12591 12592 12593 12594 12595 12596 12597 12598 12599 12600 12601 12602 12603 12604 12605 12606 12607 12608 12609 12610 12611 12612 12613 12614 12615 12616 12617 12618 12619 12620 12621 12622 12623 12624 12625 12626 12627 12628 12629 12630 12631 12632 12633 12634 12635 12636 12637 12638 12639 12640 12641 12642 12643 12644 12645 12646 12647 12648 12649 12650 12651 12652 12653 12654 12655 12656 12657 12658 12659 12660 12661 12662 12663 12664 12665 12666 12667 12668 12669 12670 12671 12672 12673 12674 12675 12676 12677 12678 12679 12680 12681 12682 12683 12684 12685 12686 12687 12688 12689 12690 12691 12692 12693 12694 12695 12696 12697 12698 12699 12700 12701 12702 12703 12704 12705 12706 12707 12708 12709 12710 12711 12712 12713 12714 12715 12716 12717 12718 12719 12720 12721 12722 12723 12724 12725 12726 12727 12728 12729 12730 12731 12732 12733 12734 12735 12736 12737 12738 12739 12740 12741 12742 12743 12744 12745 12746 12747 12748 12749 12750 12751 12752 12753 12754 12755 12756 12757 12758 12759 12760 12761 12762 12763 12764 12765 12766 12767 12768 12769 12770 12771 12772 12773 12774 12775 12776 12777 12778 12779 12780 12781 12782 12783 12784 12785 12786 12787 12788 12789 12790 12791 12792 12793 12794 12795 12796 12797 12798 12799 12800 12801 12802 12803 12804 12805 12806 12807 12808 12809 12810 12811 12812 12813 12814 12815 12816 12817 12818 12819 12820 12821 12822 12823 12824 12825 12826 12827 12828 12829 12830 12831 12832 12833 12834 12835 12836 12837 12838 12839 12840 12841 12842 12843 12844 12845 12846 12847 12848 12849 12850 12851 12852 12853 12854 12855 12856 12857 12858 12859 12860 12861 12862 12863 12864 12865 12866 12867 12868 12869 12870 12871 12872 12873 12874 12875 12876 12877 12878 12879 12880 12881 12882 12883 12884 12885 12886 12887 12888 12889 12890 12891 12892 12893 12894 12895 12896 12897 12898 12899 12900 12901 12902 12903 12904 12905 12906 12907 12908 12909 12910 12911 12912 12913 12914 12915 12916 12917 12918 12919 12920 12921 12922 12923 12924 12925 12926 12927 12928 12929 12930 12931 12932 12933 12934 12935 12936 12937 12938 12939 12940 12941 12942 12943 12944 12945 12946 12947 12948 12949 12950 12951 12952 12953 12954 12955 12956 12957 12958 12959 12960 12961 12962 12963 12964 12965 12966 12967 12968 12969 12970 12971 12972 12973 12974 12975 12976 12977 12978 12979 12980 12981 12982 12983 12984 12985 12986 12987 12988 12989 12990 12991 12992 12993 12994 12995 12996 12997 12998 12999 13000 13001 13002 13003 13004 13005 13006 13007 13008 13009 13010 13011 13012 13013 13014 13015 13016 13017 13018 13019 13020 13021 13022 13023 13024 13025 13026 13027 13028 13029 13030 13031 13032 13033 13034 13035 13036 13037 13038 13039 13040 13041 13042 13043 13044 13045 13046 13047 13048 13049 13050 13051 13052 13053 13054 13055 13056 13057 13058 13059 13060 13061 13062 13063 13064 13065 13066 13067 13068 13069 13070 13071 13072 13073 13074 13075 13076 13077 13078 13079 13080 13081 13082 13083 13084 13085 13086 13087 13088 13089 13090 13091 13092 13093 13094 13095 13096 13097 13098 13099 13100 13101 13102 13103 13104 13105 13106 13107 13108 13109 13110 13111 13112 13113 13114 13115 13116 13117 13118 13119 13120 13121 13122 13123 13124 13125 13126 13127 13128 13129 13130 13131 13132 13133 13134 13135 13136 13137 13138 13139 13140 13141 13142 13143 13144 13145 13146 13147 13148 13149 13150 13151 13152 13153 13154 13155 13156 13157 13158 13159 13160 13161 13162 13163 13164 13165 13166 13167 13168 13169 13170 13171 13172 13173 13174 13175 13176 13177 13178 13179 13180 13181 13182 13183 13184 13185 13186 13187 13188 13189 13190 13191 13192 13193 13194 13195 13196 13197 13198 13199 13200 13201 13202 13203 13204 13205 13206 13207 13208 13209 13210 13211 13212 13213 13214 13215 13216 13217 13218 13219 13220 13221 13222 13223 13224 13225 13226 13227 13228 13229 13230 13231 13232 13233 13234 13235 13236 13237 13238 13239 13240 13241 13242 13243 13244 13245 13246 13247 13248 13249 13250 13251 13252 13253 13254 13255 13256 13257 13258 13259 13260 13261 13262 13263 13264 13265 13266 13267 13268 13269 13270 13271 13272 13273 13274 13275 13276 13277 13278 13279 13280 13281 13282 13283 13284 13285 13286 13287 13288 13289 13290 13291 13292 13293 13294 13295 13296 13297 13298 13299 13300 13301 13302 13303 13304 13305 13306 13307 13308 13309 13310 13311 13312 13313 13314 13315 13316 13317 13318 13319 13320 13321 13322 13323 13324 13325 13326 13327 13328 13329 13330 13331 13332 13333 13334 13335 13336 13337 13338 13339 13340 13341 13342 13343 13344 13345 13346 13347 13348 13349 13350 13351 13352 13353 13354 13355 13356 13357 13358 13359 13360 13361 13362 13363 13364 13365 13366 13367 13368 13369 13370 13371 13372 13373 13374 13375 13376 13377 13378 13379 13380 13381 13382 13383 13384 13385 13386 13387 13388 13389 13390 13391 13392 13393 13394 13395 13396 13397 13398 13399 13400 13401 13402 13403 13404 13405 13406 13407 13408 13409 13410 13411 13412 13413 13414 13415 13416 13417 13418 13419 13420 13421 13422 13423 13424 13425 13426 13427 13428 13429 13430 13431 13432 13433 13434 13435 13436 13437 13438 13439 13440 13441 13442 13443 13444 13445 13446 13447 13448 13449 13450 13451 13452 13453 13454 13455 13456 13457 13458 13459 13460 13461 13462 13463 13464 13465 13466 13467 13468 13469 13470 13471 13472 13473 13474 13475 13476 13477 13478 13479 13480 13481 13482 13483 13484 13485 13486 13487 13488 13489 13490 13491 13492 13493 13494 13495 13496 13497 13498 13499 13500 13501 13502 13503 13504 13505 13506 13507 13508 13509 13510 13511 13512 13513 13514 13515 13516 13517 13518 13519 13520 13521 13522 13523 13524 13525 13526 13527 13528 13529 13530 13531 13532 13533 13534 13535 13536 13537 13538 13539 13540 13541 13542 13543 13544 13545 13546 13547 13548 13549 13550 13551 13552 13553 13554 13555 13556 13557 13558 13559 13560 13561 13562 13563 13564 13565 13566 13567 13568 13569 13570 13571 13572 13573 13574 13575 13576 13577 13578 13579 13580 13581 13582 13583 13584 13585 13586 13587 13588 13589 13590 13591 13592 13593 13594 13595 13596 13597 13598 13599 13600 13601 13602 13603 13604 13605 13606 13607 13608 13609 13610 13611 13612 13613 13614 13615 13616 13617 13618 13619 13620 13621 13622 13623 13624 13625 13626 13627 13628 13629 13630 13631 13632 13633 13634 13635 13636 13637 13638 13639 13640 13641 13642 13643 13644 13645 13646 13647 13648 13649 13650 13651 13652 13653 13654 13655 13656 13657 13658 13659 13660 13661 13662 13663 13664 13665 13666 13667 13668 13669 13670 13671 13672 13673 13674 13675 13676 13677 13678 13679 13680 13681 13682 13683 13684 13685 13686 13687 13688 13689 13690 13691 13692 13693 13694 13695 13696 13697 13698 13699 13700 13701 13702 13703 13704 13705 13706 13707 13708 13709 13710 13711 13712 13713 13714 13715 13716 13717 13718 13719 13720 13721 13722 13723 13724 13725 13726 13727 13728 13729 13730 13731 13732 13733 13734 13735 13736 13737 13738 13739 13740 13741 13742 13743 13744 13745 13746 13747 13748 13749 13750 13751 13752 13753 13754 13755 13756 13757 13758 13759 13760 13761 13762 13763 13764 13765 13766 13767 13768 13769 13770 13771 13772 13773 13774 13775 13776 13777 13778 13779 13780 13781 13782 | // SPDX-License-Identifier: GPL-2.0 /* * Performance events core code: * * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> */ #include <linux/fs.h> #include <linux/mm.h> #include <linux/cpu.h> #include <linux/smp.h> #include <linux/idr.h> #include <linux/file.h> #include <linux/poll.h> #include <linux/slab.h> #include <linux/hash.h> #include <linux/tick.h> #include <linux/sysfs.h> #include <linux/dcache.h> #include <linux/percpu.h> #include <linux/ptrace.h> #include <linux/reboot.h> #include <linux/vmstat.h> #include <linux/device.h> #include <linux/export.h> #include <linux/vmalloc.h> #include <linux/hardirq.h> #include <linux/hugetlb.h> #include <linux/rculist.h> #include <linux/uaccess.h> #include <linux/syscalls.h> #include <linux/anon_inodes.h> #include <linux/kernel_stat.h> #include <linux/cgroup.h> #include <linux/perf_event.h> #include <linux/trace_events.h> #include <linux/hw_breakpoint.h> #include <linux/mm_types.h> #include <linux/module.h> #include <linux/mman.h> #include <linux/compat.h> #include <linux/bpf.h> #include <linux/filter.h> #include <linux/namei.h> #include <linux/parser.h> #include <linux/sched/clock.h> #include <linux/sched/mm.h> #include <linux/proc_ns.h> #include <linux/mount.h> #include <linux/min_heap.h> #include <linux/highmem.h> #include <linux/pgtable.h> #include <linux/buildid.h> #include <linux/task_work.h> #include "internal.h" #include <asm/irq_regs.h> typedef int (*remote_function_f)(void *); struct remote_function_call { struct task_struct *p; remote_function_f func; void *info; int ret; }; static void remote_function(void *data) { struct remote_function_call *tfc = data; struct task_struct *p = tfc->p; if (p) { /* -EAGAIN */ if (task_cpu(p) != smp_processor_id()) return; /* * Now that we're on right CPU with IRQs disabled, we can test * if we hit the right task without races. */ tfc->ret = -ESRCH; /* No such (running) process */ if (p != current) return; } tfc->ret = tfc->func(tfc->info); } /** * task_function_call - call a function on the cpu on which a task runs * @p: the task to evaluate * @func: the function to be called * @info: the function call argument * * Calls the function @func when the task is currently running. This might * be on the current CPU, which just calls the function directly. This will * retry due to any failures in smp_call_function_single(), such as if the * task_cpu() goes offline concurrently. * * returns @func return value or -ESRCH or -ENXIO when the process isn't running */ static int task_function_call(struct task_struct *p, remote_function_f func, void *info) { struct remote_function_call data = { .p = p, .func = func, .info = info, .ret = -EAGAIN, }; int ret; for (;;) { ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1); if (!ret) ret = data.ret; if (ret != -EAGAIN) break; cond_resched(); } return ret; } /** * cpu_function_call - call a function on the cpu * @cpu: target cpu to queue this function * @func: the function to be called * @info: the function call argument * * Calls the function @func on the remote cpu. * * returns: @func return value or -ENXIO when the cpu is offline */ static int cpu_function_call(int cpu, remote_function_f func, void *info) { struct remote_function_call data = { .p = NULL, .func = func, .info = info, .ret = -ENXIO, /* No such CPU */ }; smp_call_function_single(cpu, remote_function, &data, 1); return data.ret; } static void perf_ctx_lock(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { raw_spin_lock(&cpuctx->ctx.lock); if (ctx) raw_spin_lock(&ctx->lock); } static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { if (ctx) raw_spin_unlock(&ctx->lock); raw_spin_unlock(&cpuctx->ctx.lock); } #define TASK_TOMBSTONE ((void *)-1L) static bool is_kernel_event(struct perf_event *event) { return READ_ONCE(event->owner) == TASK_TOMBSTONE; } static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); struct perf_event_context *perf_cpu_task_ctx(void) { lockdep_assert_irqs_disabled(); return this_cpu_ptr(&perf_cpu_context)->task_ctx; } /* * On task ctx scheduling... * * When !ctx->nr_events a task context will not be scheduled. This means * we can disable the scheduler hooks (for performance) without leaving * pending task ctx state. * * This however results in two special cases: * * - removing the last event from a task ctx; this is relatively straight * forward and is done in __perf_remove_from_context. * * - adding the first event to a task ctx; this is tricky because we cannot * rely on ctx->is_active and therefore cannot use event_function_call(). * See perf_install_in_context(). * * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set. */ typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *, struct perf_event_context *, void *); struct event_function_struct { struct perf_event *event; event_f func; void *data; }; static int event_function(void *info) { struct event_function_struct *efs = info; struct perf_event *event = efs->event; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *task_ctx = cpuctx->task_ctx; int ret = 0; lockdep_assert_irqs_disabled(); perf_ctx_lock(cpuctx, task_ctx); /* * Since we do the IPI call without holding ctx->lock things can have * changed, double check we hit the task we set out to hit. */ if (ctx->task) { if (ctx->task != current) { ret = -ESRCH; goto unlock; } /* * We only use event_function_call() on established contexts, * and event_function() is only ever called when active (or * rather, we'll have bailed in task_function_call() or the * above ctx->task != current test), therefore we must have * ctx->is_active here. */ WARN_ON_ONCE(!ctx->is_active); /* * And since we have ctx->is_active, cpuctx->task_ctx must * match. */ WARN_ON_ONCE(task_ctx != ctx); } else { WARN_ON_ONCE(&cpuctx->ctx != ctx); } efs->func(event, cpuctx, ctx, efs->data); unlock: perf_ctx_unlock(cpuctx, task_ctx); return ret; } static void event_function_call(struct perf_event *event, event_f func, void *data) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */ struct event_function_struct efs = { .event = event, .func = func, .data = data, }; if (!event->parent) { /* * If this is a !child event, we must hold ctx::mutex to * stabilize the event->ctx relation. See * perf_event_ctx_lock(). */ lockdep_assert_held(&ctx->mutex); } if (!task) { cpu_function_call(event->cpu, event_function, &efs); return; } if (task == TASK_TOMBSTONE) return; again: if (!task_function_call(task, event_function, &efs)) return; raw_spin_lock_irq(&ctx->lock); /* * Reload the task pointer, it might have been changed by * a concurrent perf_event_context_sched_out(). */ task = ctx->task; if (task == TASK_TOMBSTONE) { raw_spin_unlock_irq(&ctx->lock); return; } if (ctx->is_active) { raw_spin_unlock_irq(&ctx->lock); goto again; } func(event, NULL, ctx, data); raw_spin_unlock_irq(&ctx->lock); } /* * Similar to event_function_call() + event_function(), but hard assumes IRQs * are already disabled and we're on the right CPU. */ static void event_function_local(struct perf_event *event, event_f func, void *data) { struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct task_struct *task = READ_ONCE(ctx->task); struct perf_event_context *task_ctx = NULL; lockdep_assert_irqs_disabled(); if (task) { if (task == TASK_TOMBSTONE) return; task_ctx = ctx; } perf_ctx_lock(cpuctx, task_ctx); task = ctx->task; if (task == TASK_TOMBSTONE) goto unlock; if (task) { /* * We must be either inactive or active and the right task, * otherwise we're screwed, since we cannot IPI to somewhere * else. */ if (ctx->is_active) { if (WARN_ON_ONCE(task != current)) goto unlock; if (WARN_ON_ONCE(cpuctx->task_ctx != ctx)) goto unlock; } } else { WARN_ON_ONCE(&cpuctx->ctx != ctx); } func(event, cpuctx, ctx, data); unlock: perf_ctx_unlock(cpuctx, task_ctx); } #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ PERF_FLAG_FD_OUTPUT |\ PERF_FLAG_PID_CGROUP |\ PERF_FLAG_FD_CLOEXEC) /* * branch priv levels that need permission checks */ #define PERF_SAMPLE_BRANCH_PERM_PLM \ (PERF_SAMPLE_BRANCH_KERNEL |\ PERF_SAMPLE_BRANCH_HV) enum event_type_t { EVENT_FLEXIBLE = 0x1, EVENT_PINNED = 0x2, EVENT_TIME = 0x4, /* see ctx_resched() for details */ EVENT_CPU = 0x8, EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, }; /* * perf_sched_events : >0 events exist */ static void perf_sched_delayed(struct work_struct *work); DEFINE_STATIC_KEY_FALSE(perf_sched_events); static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed); static DEFINE_MUTEX(perf_sched_mutex); static atomic_t perf_sched_count; static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; static atomic_t nr_namespaces_events __read_mostly; static atomic_t nr_task_events __read_mostly; static atomic_t nr_freq_events __read_mostly; static atomic_t nr_switch_events __read_mostly; static atomic_t nr_ksymbol_events __read_mostly; static atomic_t nr_bpf_events __read_mostly; static atomic_t nr_cgroup_events __read_mostly; static atomic_t nr_text_poke_events __read_mostly; static atomic_t nr_build_id_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); static struct srcu_struct pmus_srcu; static cpumask_var_t perf_online_mask; static struct kmem_cache *perf_event_cache; /* * perf event paranoia level: * -1 - not paranoid at all * 0 - disallow raw tracepoint access for unpriv * 1 - disallow cpu events for unpriv * 2 - disallow kernel profiling for unpriv */ int sysctl_perf_event_paranoid __read_mostly = 2; /* Minimum for 512 kiB + 1 user control page */ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ /* * max perf event sample rate */ #define DEFAULT_MAX_SAMPLE_RATE 100000 #define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE) #define DEFAULT_CPU_TIME_MAX_PERCENT 25 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; static int perf_sample_allowed_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100; static void update_perf_cpu_limits(void) { u64 tmp = perf_sample_period_ns; tmp *= sysctl_perf_cpu_time_max_percent; tmp = div_u64(tmp, 100); if (!tmp) tmp = 1; WRITE_ONCE(perf_sample_allowed_ns, tmp); } static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc); int perf_proc_update_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; int perf_cpu = sysctl_perf_cpu_time_max_percent; /* * If throttling is disabled don't allow the write: */ if (write && (perf_cpu == 100 || perf_cpu == 0)) return -EINVAL; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) return ret; max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; update_perf_cpu_limits(); return 0; } int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) return ret; if (sysctl_perf_cpu_time_max_percent == 100 || sysctl_perf_cpu_time_max_percent == 0) { printk(KERN_WARNING "perf: Dynamic interrupt throttling disabled, can hang your system!\n"); WRITE_ONCE(perf_sample_allowed_ns, 0); } else { update_perf_cpu_limits(); } return 0; } /* * perf samples are done in some very critical code paths (NMIs). * If they take too much CPU time, the system can lock up and not * get any real work done. This will drop the sample rate when * we detect that events are taking too long. */ #define NR_ACCUMULATED_SAMPLES 128 static DEFINE_PER_CPU(u64, running_sample_length); static u64 __report_avg; static u64 __report_allowed; static void perf_duration_warn(struct irq_work *w) { printk_ratelimited(KERN_INFO "perf: interrupt took too long (%lld > %lld), lowering " "kernel.perf_event_max_sample_rate to %d\n", __report_avg, __report_allowed, sysctl_perf_event_sample_rate); } static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn); void perf_sample_event_took(u64 sample_len_ns) { u64 max_len = READ_ONCE(perf_sample_allowed_ns); u64 running_len; u64 avg_len; u32 max; if (max_len == 0) return; /* Decay the counter by 1 average sample. */ running_len = __this_cpu_read(running_sample_length); running_len -= running_len/NR_ACCUMULATED_SAMPLES; running_len += sample_len_ns; __this_cpu_write(running_sample_length, running_len); /* * Note: this will be biased artifically low until we have * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us * from having to maintain a count. */ avg_len = running_len/NR_ACCUMULATED_SAMPLES; if (avg_len <= max_len) return; __report_avg = avg_len; __report_allowed = max_len; /* * Compute a throttle threshold 25% below the current duration. */ avg_len += avg_len / 4; max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent; if (avg_len < max) max /= (u32)avg_len; else max = 1; WRITE_ONCE(perf_sample_allowed_ns, avg_len); WRITE_ONCE(max_samples_per_tick, max); sysctl_perf_event_sample_rate = max * HZ; perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; if (!irq_work_queue(&perf_duration_work)) { early_printk("perf: interrupt took too long (%lld > %lld), lowering " "kernel.perf_event_max_sample_rate to %d\n", __report_avg, __report_allowed, sysctl_perf_event_sample_rate); } } static atomic64_t perf_event_id; static void update_context_time(struct perf_event_context *ctx); static u64 perf_event_time(struct perf_event *event); void __weak perf_event_print_debug(void) { } static inline u64 perf_clock(void) { return local_clock(); } static inline u64 perf_event_clock(struct perf_event *event) { return event->clock(); } /* * State based event timekeeping... * * The basic idea is to use event->state to determine which (if any) time * fields to increment with the current delta. This means we only need to * update timestamps when we change state or when they are explicitly requested * (read). * * Event groups make things a little more complicated, but not terribly so. The * rules for a group are that if the group leader is OFF the entire group is * OFF, irrespecive of what the group member states are. This results in * __perf_effective_state(). * * A futher ramification is that when a group leader flips between OFF and * !OFF, we need to update all group member times. * * * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we * need to make sure the relevant context time is updated before we try and * update our timestamps. */ static __always_inline enum perf_event_state __perf_effective_state(struct perf_event *event) { struct perf_event *leader = event->group_leader; if (leader->state <= PERF_EVENT_STATE_OFF) return leader->state; return event->state; } static __always_inline void __perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running) { enum perf_event_state state = __perf_effective_state(event); u64 delta = now - event->tstamp; *enabled = event->total_time_enabled; if (state >= PERF_EVENT_STATE_INACTIVE) *enabled += delta; *running = event->total_time_running; if (state >= PERF_EVENT_STATE_ACTIVE) *running += delta; } static void perf_event_update_time(struct perf_event *event) { u64 now = perf_event_time(event); __perf_update_times(event, now, &event->total_time_enabled, &event->total_time_running); event->tstamp = now; } static void perf_event_update_sibling_time(struct perf_event *leader) { struct perf_event *sibling; for_each_sibling_event(sibling, leader) perf_event_update_time(sibling); } static void perf_event_set_state(struct perf_event *event, enum perf_event_state state) { if (event->state == state) return; perf_event_update_time(event); /* * If a group leader gets enabled/disabled all its siblings * are affected too. */ if ((event->state < 0) ^ (state < 0)) perf_event_update_sibling_time(event); WRITE_ONCE(event->state, state); } /* * UP store-release, load-acquire */ #define __store_release(ptr, val) \ do { \ barrier(); \ WRITE_ONCE(*(ptr), (val)); \ } while (0) #define __load_acquire(ptr) \ ({ \ __unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr)); \ barrier(); \ ___p; \ }) static void perf_ctx_disable(struct perf_event_context *ctx) { struct perf_event_pmu_context *pmu_ctx; list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) perf_pmu_disable(pmu_ctx->pmu); } static void perf_ctx_enable(struct perf_event_context *ctx) { struct perf_event_pmu_context *pmu_ctx; list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) perf_pmu_enable(pmu_ctx->pmu); } static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type); static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type); #ifdef CONFIG_CGROUP_PERF static inline bool perf_cgroup_match(struct perf_event *event) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); /* @event doesn't care about cgroup */ if (!event->cgrp) return true; /* wants specific cgroup scope but @cpuctx isn't associated with any */ if (!cpuctx->cgrp) return false; /* * Cgroup scoping is recursive. An event enabled for a cgroup is * also enabled for all its descendant cgroups. If @cpuctx's * cgroup is a descendant of @event's (the test covers identity * case), it's a match. */ return cgroup_is_descendant(cpuctx->cgrp->css.cgroup, event->cgrp->css.cgroup); } static inline void perf_detach_cgroup(struct perf_event *event) { css_put(&event->cgrp->css); event->cgrp = NULL; } static inline int is_cgroup_event(struct perf_event *event) { return event->cgrp != NULL; } static inline u64 perf_cgroup_event_time(struct perf_event *event) { struct perf_cgroup_info *t; t = per_cpu_ptr(event->cgrp->info, event->cpu); return t->time; } static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) { struct perf_cgroup_info *t; t = per_cpu_ptr(event->cgrp->info, event->cpu); if (!__load_acquire(&t->active)) return t->time; now += READ_ONCE(t->timeoffset); return now; } static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv) { if (adv) info->time += now - info->timestamp; info->timestamp = now; /* * see update_context_time() */ WRITE_ONCE(info->timeoffset, info->time - info->timestamp); } static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) { struct perf_cgroup *cgrp = cpuctx->cgrp; struct cgroup_subsys_state *css; struct perf_cgroup_info *info; if (cgrp) { u64 now = perf_clock(); for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); __update_cgrp_time(info, now, true); if (final) __store_release(&info->active, 0); } } } static inline void update_cgrp_time_from_event(struct perf_event *event) { struct perf_cgroup_info *info; /* * ensure we access cgroup data only when needed and * when we know the cgroup is pinned (css_get) */ if (!is_cgroup_event(event)) return; info = this_cpu_ptr(event->cgrp->info); /* * Do not update time when cgroup is not active */ if (info->active) __update_cgrp_time(info, perf_clock(), true); } static inline void perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) { struct perf_event_context *ctx = &cpuctx->ctx; struct perf_cgroup *cgrp = cpuctx->cgrp; struct perf_cgroup_info *info; struct cgroup_subsys_state *css; /* * ctx->lock held by caller * ensure we do not access cgroup data * unless we have the cgroup pinned (css_get) */ if (!cgrp) return; WARN_ON_ONCE(!ctx->nr_cgroups); for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); __update_cgrp_time(info, ctx->timestamp, false); __store_release(&info->active, 1); } } /* * reschedule events based on the cgroup constraint of task. */ static void perf_cgroup_switch(struct task_struct *task) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_cgroup *cgrp; /* * cpuctx->cgrp is set when the first cgroup event enabled, * and is cleared when the last cgroup event disabled. */ if (READ_ONCE(cpuctx->cgrp) == NULL) return; WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); cgrp = perf_cgroup_from_task(task, NULL); if (READ_ONCE(cpuctx->cgrp) == cgrp) return; perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_ctx_disable(&cpuctx->ctx); ctx_sched_out(&cpuctx->ctx, EVENT_ALL); /* * must not be done before ctxswout due * to update_cgrp_time_from_cpuctx() in * ctx_sched_out() */ cpuctx->cgrp = cgrp; /* * set cgrp before ctxsw in to allow * perf_cgroup_set_timestamp() in ctx_sched_in() * to not have to pass task around */ ctx_sched_in(&cpuctx->ctx, EVENT_ALL); perf_ctx_enable(&cpuctx->ctx); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } static int perf_cgroup_ensure_storage(struct perf_event *event, struct cgroup_subsys_state *css) { struct perf_cpu_context *cpuctx; struct perf_event **storage; int cpu, heap_size, ret = 0; /* * Allow storage to have sufficent space for an iterator for each * possibly nested cgroup plus an iterator for events with no cgroup. */ for (heap_size = 1; css; css = css->parent) heap_size++; for_each_possible_cpu(cpu) { cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); if (heap_size <= cpuctx->heap_size) continue; storage = kmalloc_node(heap_size * sizeof(struct perf_event *), GFP_KERNEL, cpu_to_node(cpu)); if (!storage) { ret = -ENOMEM; break; } raw_spin_lock_irq(&cpuctx->ctx.lock); if (cpuctx->heap_size < heap_size) { swap(cpuctx->heap, storage); if (storage == cpuctx->heap_default) storage = NULL; cpuctx->heap_size = heap_size; } raw_spin_unlock_irq(&cpuctx->ctx.lock); kfree(storage); } return ret; } static inline int perf_cgroup_connect(int fd, struct perf_event *event, struct perf_event_attr *attr, struct perf_event *group_leader) { struct perf_cgroup *cgrp; struct cgroup_subsys_state *css; struct fd f = fdget(fd); int ret = 0; if (!f.file) return -EBADF; css = css_tryget_online_from_dir(f.file->f_path.dentry, &perf_event_cgrp_subsys); if (IS_ERR(css)) { ret = PTR_ERR(css); goto out; } ret = perf_cgroup_ensure_storage(event, css); if (ret) goto out; cgrp = container_of(css, struct perf_cgroup, css); event->cgrp = cgrp; /* * all events in a group must monitor * the same cgroup because a task belongs * to only one perf cgroup at a time */ if (group_leader && group_leader->cgrp != cgrp) { perf_detach_cgroup(event); ret = -EINVAL; } out: fdput(f); return ret; } static inline void perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx; if (!is_cgroup_event(event)) return; /* * Because cgroup events are always per-cpu events, * @ctx == &cpuctx->ctx. */ cpuctx = container_of(ctx, struct perf_cpu_context, ctx); if (ctx->nr_cgroups++) return; cpuctx->cgrp = perf_cgroup_from_task(current, ctx); } static inline void perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx; if (!is_cgroup_event(event)) return; /* * Because cgroup events are always per-cpu events, * @ctx == &cpuctx->ctx. */ cpuctx = container_of(ctx, struct perf_cpu_context, ctx); if (--ctx->nr_cgroups) return; cpuctx->cgrp = NULL; } #else /* !CONFIG_CGROUP_PERF */ static inline bool perf_cgroup_match(struct perf_event *event) { return true; } static inline void perf_detach_cgroup(struct perf_event *event) {} static inline int is_cgroup_event(struct perf_event *event) { return 0; } static inline void update_cgrp_time_from_event(struct perf_event *event) { } static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) { } static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, struct perf_event_attr *attr, struct perf_event *group_leader) { return -EINVAL; } static inline void perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) { } static inline u64 perf_cgroup_event_time(struct perf_event *event) { return 0; } static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) { return 0; } static inline void perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx) { } static inline void perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx) { } static void perf_cgroup_switch(struct task_struct *task) { } #endif /* * set default to be dependent on timer tick just * like original code */ #define PERF_CPU_HRTIMER (1000 / HZ) /* * function must be called with interrupts disabled */ static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) { struct perf_cpu_pmu_context *cpc; bool rotations; lockdep_assert_irqs_disabled(); cpc = container_of(hr, struct perf_cpu_pmu_context, hrtimer); rotations = perf_rotate_context(cpc); raw_spin_lock(&cpc->hrtimer_lock); if (rotations) hrtimer_forward_now(hr, cpc->hrtimer_interval); else cpc->hrtimer_active = 0; raw_spin_unlock(&cpc->hrtimer_lock); return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART; } static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu) { struct hrtimer *timer = &cpc->hrtimer; struct pmu *pmu = cpc->epc.pmu; u64 interval; /* * check default is sane, if not set then force to * default interval (1/tick) */ interval = pmu->hrtimer_interval_ms; if (interval < 1) interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER; cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); raw_spin_lock_init(&cpc->hrtimer_lock); hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); timer->function = perf_mux_hrtimer_handler; } static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc) { struct hrtimer *timer = &cpc->hrtimer; unsigned long flags; raw_spin_lock_irqsave(&cpc->hrtimer_lock, flags); if (!cpc->hrtimer_active) { cpc->hrtimer_active = 1; hrtimer_forward_now(timer, cpc->hrtimer_interval); hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); } raw_spin_unlock_irqrestore(&cpc->hrtimer_lock, flags); return 0; } static int perf_mux_hrtimer_restart_ipi(void *arg) { return perf_mux_hrtimer_restart(arg); } void perf_pmu_disable(struct pmu *pmu) { int *count = this_cpu_ptr(pmu->pmu_disable_count); if (!(*count)++) pmu->pmu_disable(pmu); } void perf_pmu_enable(struct pmu *pmu) { int *count = this_cpu_ptr(pmu->pmu_disable_count); if (!--(*count)) pmu->pmu_enable(pmu); } static void perf_assert_pmu_disabled(struct pmu *pmu) { WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0); } static void get_ctx(struct perf_event_context *ctx) { refcount_inc(&ctx->refcount); } static void *alloc_task_ctx_data(struct pmu *pmu) { if (pmu->task_ctx_cache) return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL); return NULL; } static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data) { if (pmu->task_ctx_cache && task_ctx_data) kmem_cache_free(pmu->task_ctx_cache, task_ctx_data); } static void free_ctx(struct rcu_head *head) { struct perf_event_context *ctx; ctx = container_of(head, struct perf_event_context, rcu_head); kfree(ctx); } static void put_ctx(struct perf_event_context *ctx) { if (refcount_dec_and_test(&ctx->refcount)) { if (ctx->parent_ctx) put_ctx(ctx->parent_ctx); if (ctx->task && ctx->task != TASK_TOMBSTONE) put_task_struct(ctx->task); call_rcu(&ctx->rcu_head, free_ctx); } } /* * Because of perf_event::ctx migration in sys_perf_event_open::move_group and * perf_pmu_migrate_context() we need some magic. * * Those places that change perf_event::ctx will hold both * perf_event_ctx::mutex of the 'old' and 'new' ctx value. * * Lock ordering is by mutex address. There are two other sites where * perf_event_context::mutex nests and those are: * * - perf_event_exit_task_context() [ child , 0 ] * perf_event_exit_event() * put_event() [ parent, 1 ] * * - perf_event_init_context() [ parent, 0 ] * inherit_task_group() * inherit_group() * inherit_event() * perf_event_alloc() * perf_init_event() * perf_try_init_event() [ child , 1 ] * * While it appears there is an obvious deadlock here -- the parent and child * nesting levels are inverted between the two. This is in fact safe because * life-time rules separate them. That is an exiting task cannot fork, and a * spawning task cannot (yet) exit. * * But remember that these are parent<->child context relations, and * migration does not affect children, therefore these two orderings should not * interact. * * The change in perf_event::ctx does not affect children (as claimed above) * because the sys_perf_event_open() case will install a new event and break * the ctx parent<->child relation, and perf_pmu_migrate_context() is only * concerned with cpuctx and that doesn't have children. * * The places that change perf_event::ctx will issue: * * perf_remove_from_context(); * synchronize_rcu(); * perf_install_in_context(); * * to affect the change. The remove_from_context() + synchronize_rcu() should * quiesce the event, after which we can install it in the new location. This * means that only external vectors (perf_fops, prctl) can perturb the event * while in transit. Therefore all such accessors should also acquire * perf_event_context::mutex to serialize against this. * * However; because event->ctx can change while we're waiting to acquire * ctx->mutex we must be careful and use the below perf_event_ctx_lock() * function. * * Lock order: * exec_update_lock * task_struct::perf_event_mutex * perf_event_context::mutex * perf_event::child_mutex; * perf_event_context::lock * perf_event::mmap_mutex * mmap_lock * perf_addr_filters_head::lock * * cpu_hotplug_lock * pmus_lock * cpuctx->mutex / perf_event_context::mutex */ static struct perf_event_context * perf_event_ctx_lock_nested(struct perf_event *event, int nesting) { struct perf_event_context *ctx; again: rcu_read_lock(); ctx = READ_ONCE(event->ctx); if (!refcount_inc_not_zero(&ctx->refcount)) { rcu_read_unlock(); goto again; } rcu_read_unlock(); mutex_lock_nested(&ctx->mutex, nesting); if (event->ctx != ctx) { mutex_unlock(&ctx->mutex); put_ctx(ctx); goto again; } return ctx; } static inline struct perf_event_context * perf_event_ctx_lock(struct perf_event *event) { return perf_event_ctx_lock_nested(event, 0); } static void perf_event_ctx_unlock(struct perf_event *event, struct perf_event_context *ctx) { mutex_unlock(&ctx->mutex); put_ctx(ctx); } /* * This must be done under the ctx->lock, such as to serialize against * context_equiv(), therefore we cannot call put_ctx() since that might end up * calling scheduler related locks and ctx->lock nests inside those. */ static __must_check struct perf_event_context * unclone_ctx(struct perf_event_context *ctx) { struct perf_event_context *parent_ctx = ctx->parent_ctx; lockdep_assert_held(&ctx->lock); if (parent_ctx) ctx->parent_ctx = NULL; ctx->generation++; return parent_ctx; } static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p, enum pid_type type) { u32 nr; /* * only top level events have the pid namespace they were created in */ if (event->parent) event = event->parent; nr = __task_pid_nr_ns(p, type, event->ns); /* avoid -1 if it is idle thread or runs in another ns */ if (!nr && !pid_alive(p)) nr = -1; return nr; } static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) { return perf_event_pid_type(event, p, PIDTYPE_TGID); } static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) { return perf_event_pid_type(event, p, PIDTYPE_PID); } /* * If we inherit events we want to return the parent event id * to userspace. */ static u64 primary_event_id(struct perf_event *event) { u64 id = event->id; if (event->parent) id = event->parent->id; return id; } /* * Get the perf_event_context for a task and lock it. * * This has to cope with the fact that until it is locked, * the context could get moved to another task. */ static struct perf_event_context * perf_lock_task_context(struct task_struct *task, unsigned long *flags) { struct perf_event_context *ctx; retry: /* * One of the few rules of preemptible RCU is that one cannot do * rcu_read_unlock() while holding a scheduler (or nested) lock when * part of the read side critical section was irqs-enabled -- see * rcu_read_unlock_special(). * * Since ctx->lock nests under rq->lock we must ensure the entire read * side critical section has interrupts disabled. */ local_irq_save(*flags); rcu_read_lock(); ctx = rcu_dereference(task->perf_event_ctxp); if (ctx) { /* * If this context is a clone of another, it might * get swapped for another underneath us by * perf_event_task_sched_out, though the * rcu_read_lock() protects us from any context * getting freed. Lock the context and check if it * got swapped before we could get the lock, and retry * if so. If we locked the right context, then it * can't get swapped on us any more. */ raw_spin_lock(&ctx->lock); if (ctx != rcu_dereference(task->perf_event_ctxp)) { raw_spin_unlock(&ctx->lock); rcu_read_unlock(); local_irq_restore(*flags); goto retry; } if (ctx->task == TASK_TOMBSTONE || !refcount_inc_not_zero(&ctx->refcount)) { raw_spin_unlock(&ctx->lock); ctx = NULL; } else { WARN_ON_ONCE(ctx->task != task); } } rcu_read_unlock(); if (!ctx) local_irq_restore(*flags); return ctx; } /* * Get the context for a task and increment its pin_count so it * can't get swapped to another task. This also increments its * reference count so that the context can't get freed. */ static struct perf_event_context * perf_pin_task_context(struct task_struct *task) { struct perf_event_context *ctx; unsigned long flags; ctx = perf_lock_task_context(task, &flags); if (ctx) { ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); } return ctx; } static void perf_unpin_context(struct perf_event_context *ctx) { unsigned long flags; raw_spin_lock_irqsave(&ctx->lock, flags); --ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); } /* * Update the record of the current time in a context. */ static void __update_context_time(struct perf_event_context *ctx, bool adv) { u64 now = perf_clock(); lockdep_assert_held(&ctx->lock); if (adv) ctx->time += now - ctx->timestamp; ctx->timestamp = now; /* * The above: time' = time + (now - timestamp), can be re-arranged * into: time` = now + (time - timestamp), which gives a single value * offset to compute future time without locks on. * * See perf_event_time_now(), which can be used from NMI context where * it's (obviously) not possible to acquire ctx->lock in order to read * both the above values in a consistent manner. */ WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp); } static void update_context_time(struct perf_event_context *ctx) { __update_context_time(ctx, true); } static u64 perf_event_time(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; if (unlikely(!ctx)) return 0; if (is_cgroup_event(event)) return perf_cgroup_event_time(event); return ctx->time; } static u64 perf_event_time_now(struct perf_event *event, u64 now) { struct perf_event_context *ctx = event->ctx; if (unlikely(!ctx)) return 0; if (is_cgroup_event(event)) return perf_cgroup_event_time_now(event, now); if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) return ctx->time; now += READ_ONCE(ctx->timeoffset); return now; } static enum event_type_t get_event_type(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; enum event_type_t event_type; lockdep_assert_held(&ctx->lock); /* * It's 'group type', really, because if our group leader is * pinned, so are we. */ if (event->group_leader != event) event = event->group_leader; event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE; if (!ctx->task) event_type |= EVENT_CPU; return event_type; } /* * Helper function to initialize event group nodes. */ static void init_event_group(struct perf_event *event) { RB_CLEAR_NODE(&event->group_node); event->group_index = 0; } /* * Extract pinned or flexible groups from the context * based on event attrs bits. */ static struct perf_event_groups * get_event_groups(struct perf_event *event, struct perf_event_context *ctx) { if (event->attr.pinned) return &ctx->pinned_groups; else return &ctx->flexible_groups; } /* * Helper function to initializes perf_event_group trees. */ static void perf_event_groups_init(struct perf_event_groups *groups) { groups->tree = RB_ROOT; groups->index = 0; } static inline struct cgroup *event_cgroup(const struct perf_event *event) { struct cgroup *cgroup = NULL; #ifdef CONFIG_CGROUP_PERF if (event->cgrp) cgroup = event->cgrp->css.cgroup; #endif return cgroup; } /* * Compare function for event groups; * * Implements complex key that first sorts by CPU and then by virtual index * which provides ordering when rotating groups for the same CPU. */ static __always_inline int perf_event_groups_cmp(const int left_cpu, const struct pmu *left_pmu, const struct cgroup *left_cgroup, const u64 left_group_index, const struct perf_event *right) { if (left_cpu < right->cpu) return -1; if (left_cpu > right->cpu) return 1; if (left_pmu) { if (left_pmu < right->pmu_ctx->pmu) return -1; if (left_pmu > right->pmu_ctx->pmu) return 1; } #ifdef CONFIG_CGROUP_PERF { const struct cgroup *right_cgroup = event_cgroup(right); if (left_cgroup != right_cgroup) { if (!left_cgroup) { /* * Left has no cgroup but right does, no * cgroups come first. */ return -1; } if (!right_cgroup) { /* * Right has no cgroup but left does, no * cgroups come first. */ return 1; } /* Two dissimilar cgroups, order by id. */ if (cgroup_id(left_cgroup) < cgroup_id(right_cgroup)) return -1; return 1; } } #endif if (left_group_index < right->group_index) return -1; if (left_group_index > right->group_index) return 1; return 0; } #define __node_2_pe(node) \ rb_entry((node), struct perf_event, group_node) static inline bool __group_less(struct rb_node *a, const struct rb_node *b) { struct perf_event *e = __node_2_pe(a); return perf_event_groups_cmp(e->cpu, e->pmu_ctx->pmu, event_cgroup(e), e->group_index, __node_2_pe(b)) < 0; } struct __group_key { int cpu; struct pmu *pmu; struct cgroup *cgroup; }; static inline int __group_cmp(const void *key, const struct rb_node *node) { const struct __group_key *a = key; const struct perf_event *b = __node_2_pe(node); /* partial/subtree match: @cpu, @pmu, @cgroup; ignore: @group_index */ return perf_event_groups_cmp(a->cpu, a->pmu, a->cgroup, b->group_index, b); } static inline int __group_cmp_ignore_cgroup(const void *key, const struct rb_node *node) { const struct __group_key *a = key; const struct perf_event *b = __node_2_pe(node); /* partial/subtree match: @cpu, @pmu, ignore: @cgroup, @group_index */ return perf_event_groups_cmp(a->cpu, a->pmu, event_cgroup(b), b->group_index, b); } /* * Insert @event into @groups' tree; using * {@event->cpu, @event->pmu_ctx->pmu, event_cgroup(@event), ++@groups->index} * as key. This places it last inside the {cpu,pmu,cgroup} subtree. */ static void perf_event_groups_insert(struct perf_event_groups *groups, struct perf_event *event) { event->group_index = ++groups->index; rb_add(&event->group_node, &groups->tree, __group_less); } /* * Helper function to insert event into the pinned or flexible groups. */ static void add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_groups *groups; groups = get_event_groups(event, ctx); perf_event_groups_insert(groups, event); } /* * Delete a group from a tree. */ static void perf_event_groups_delete(struct perf_event_groups *groups, struct perf_event *event) { WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) || RB_EMPTY_ROOT(&groups->tree)); rb_erase(&event->group_node, &groups->tree); init_event_group(event); } /* * Helper function to delete event from its groups. */ static void del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_groups *groups; groups = get_event_groups(event, ctx); perf_event_groups_delete(groups, event); } /* * Get the leftmost event in the {cpu,pmu,cgroup} subtree. */ static struct perf_event * perf_event_groups_first(struct perf_event_groups *groups, int cpu, struct pmu *pmu, struct cgroup *cgrp) { struct __group_key key = { .cpu = cpu, .pmu = pmu, .cgroup = cgrp, }; struct rb_node *node; node = rb_find_first(&key, &groups->tree, __group_cmp); if (node) return __node_2_pe(node); return NULL; } static struct perf_event * perf_event_groups_next(struct perf_event *event, struct pmu *pmu) { struct __group_key key = { .cpu = event->cpu, .pmu = pmu, .cgroup = event_cgroup(event), }; struct rb_node *next; next = rb_next_match(&key, &event->group_node, __group_cmp); if (next) return __node_2_pe(next); return NULL; } #define perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) \ for (event = perf_event_groups_first(groups, cpu, pmu, NULL); \ event; event = perf_event_groups_next(event, pmu)) /* * Iterate through the whole groups tree. */ #define perf_event_groups_for_each(event, groups) \ for (event = rb_entry_safe(rb_first(&((groups)->tree)), \ typeof(*event), group_node); event; \ event = rb_entry_safe(rb_next(&event->group_node), \ typeof(*event), group_node)) /* * Add an event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. */ static void list_add_event(struct perf_event *event, struct perf_event_context *ctx) { lockdep_assert_held(&ctx->lock); WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); event->attach_state |= PERF_ATTACH_CONTEXT; event->tstamp = perf_event_time(event); /* * If we're a stand alone event or group leader, we go to the context * list, group events are kept attached to the group so that * perf_group_detach can, at all times, locate all siblings. */ if (event->group_leader == event) { event->group_caps = event->event_caps; add_event_to_groups(event, ctx); } list_add_rcu(&event->event_entry, &ctx->event_list); ctx->nr_events++; if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT) ctx->nr_user++; if (event->attr.inherit_stat) ctx->nr_stat++; if (event->state > PERF_EVENT_STATE_OFF) perf_cgroup_event_enable(event, ctx); ctx->generation++; event->pmu_ctx->nr_events++; } /* * Initialize event state based on the perf_event_attr::disabled. */ static inline void perf_event__state_init(struct perf_event *event) { event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF : PERF_EVENT_STATE_INACTIVE; } static void __perf_event_read_size(struct perf_event *event, int nr_siblings) { int entry = sizeof(u64); /* value */ int size = 0; int nr = 1; if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) size += sizeof(u64); if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) size += sizeof(u64); if (event->attr.read_format & PERF_FORMAT_ID) entry += sizeof(u64); if (event->attr.read_format & PERF_FORMAT_LOST) entry += sizeof(u64); if (event->attr.read_format & PERF_FORMAT_GROUP) { nr += nr_siblings; size += sizeof(u64); } size += entry * nr; event->read_size = size; } static void __perf_event_header_size(struct perf_event *event, u64 sample_type) { struct perf_sample_data *data; u16 size = 0; if (sample_type & PERF_SAMPLE_IP) size += sizeof(data->ip); if (sample_type & PERF_SAMPLE_ADDR) size += sizeof(data->addr); if (sample_type & PERF_SAMPLE_PERIOD) size += sizeof(data->period); if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) size += sizeof(data->weight.full); if (sample_type & PERF_SAMPLE_READ) size += event->read_size; if (sample_type & PERF_SAMPLE_DATA_SRC) size += sizeof(data->data_src.val); if (sample_type & PERF_SAMPLE_TRANSACTION) size += sizeof(data->txn); if (sample_type & PERF_SAMPLE_PHYS_ADDR) size += sizeof(data->phys_addr); if (sample_type & PERF_SAMPLE_CGROUP) size += sizeof(data->cgroup); if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) size += sizeof(data->data_page_size); if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) size += sizeof(data->code_page_size); event->header_size = size; } /* * Called at perf_event creation and when events are attached/detached from a * group. */ static void perf_event__header_size(struct perf_event *event) { __perf_event_read_size(event, event->group_leader->nr_siblings); __perf_event_header_size(event, event->attr.sample_type); } static void perf_event__id_header_size(struct perf_event *event) { struct perf_sample_data *data; u64 sample_type = event->attr.sample_type; u16 size = 0; if (sample_type & PERF_SAMPLE_TID) size += sizeof(data->tid_entry); if (sample_type & PERF_SAMPLE_TIME) size += sizeof(data->time); if (sample_type & PERF_SAMPLE_IDENTIFIER) size += sizeof(data->id); if (sample_type & PERF_SAMPLE_ID) size += sizeof(data->id); if (sample_type & PERF_SAMPLE_STREAM_ID) size += sizeof(data->stream_id); if (sample_type & PERF_SAMPLE_CPU) size += sizeof(data->cpu_entry); event->id_header_size = size; } static bool perf_event_validate_size(struct perf_event *event) { /* * The values computed here will be over-written when we actually * attach the event. */ __perf_event_read_size(event, event->group_leader->nr_siblings + 1); __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ); perf_event__id_header_size(event); /* * Sum the lot; should not exceed the 64k limit we have on records. * Conservative limit to allow for callchains and other variable fields. */ if (event->read_size + event->header_size + event->id_header_size + sizeof(struct perf_event_header) >= 16*1024) return false; return true; } static void perf_group_attach(struct perf_event *event) { struct perf_event *group_leader = event->group_leader, *pos; lockdep_assert_held(&event->ctx->lock); /* * We can have double attach due to group movement (move_group) in * perf_event_open(). */ if (event->attach_state & PERF_ATTACH_GROUP) return; event->attach_state |= PERF_ATTACH_GROUP; if (group_leader == event) return; WARN_ON_ONCE(group_leader->ctx != event->ctx); group_leader->group_caps &= event->event_caps; list_add_tail(&event->sibling_list, &group_leader->sibling_list); group_leader->nr_siblings++; perf_event__header_size(group_leader); for_each_sibling_event(pos, group_leader) perf_event__header_size(pos); } /* * Remove an event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. */ static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); /* * We can have double detach due to exit/hot-unplug + close. */ if (!(event->attach_state & PERF_ATTACH_CONTEXT)) return; event->attach_state &= ~PERF_ATTACH_CONTEXT; ctx->nr_events--; if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT) ctx->nr_user--; if (event->attr.inherit_stat) ctx->nr_stat--; list_del_rcu(&event->event_entry); if (event->group_leader == event) del_event_from_groups(event, ctx); /* * If event was in error state, then keep it * that way, otherwise bogus counts will be * returned on read(). The only way to get out * of error state is by explicit re-enabling * of the event */ if (event->state > PERF_EVENT_STATE_OFF) { perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_OFF); } ctx->generation++; event->pmu_ctx->nr_events--; } static int perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event) { if (!has_aux(aux_event)) return 0; if (!event->pmu->aux_output_match) return 0; return event->pmu->aux_output_match(aux_event); } static void put_event(struct perf_event *event); static void event_sched_out(struct perf_event *event, struct perf_event_context *ctx); static void perf_put_aux_event(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct perf_event *iter; /* * If event uses aux_event tear down the link */ if (event->aux_event) { iter = event->aux_event; event->aux_event = NULL; put_event(iter); return; } /* * If the event is an aux_event, tear down all links to * it from other events. */ for_each_sibling_event(iter, event->group_leader) { if (iter->aux_event != event) continue; iter->aux_event = NULL; put_event(event); /* * If it's ACTIVE, schedule it out and put it into ERROR * state so that we don't try to schedule it again. Note * that perf_event_enable() will clear the ERROR status. */ event_sched_out(iter, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); } } static bool perf_need_aux_event(struct perf_event *event) { return !!event->attr.aux_output || !!event->attr.aux_sample_size; } static int perf_get_aux_event(struct perf_event *event, struct perf_event *group_leader) { /* * Our group leader must be an aux event if we want to be * an aux_output. This way, the aux event will precede its * aux_output events in the group, and therefore will always * schedule first. */ if (!group_leader) return 0; /* * aux_output and aux_sample_size are mutually exclusive. */ if (event->attr.aux_output && event->attr.aux_sample_size) return 0; if (event->attr.aux_output && !perf_aux_output_match(event, group_leader)) return 0; if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux) return 0; if (!atomic_long_inc_not_zero(&group_leader->refcount)) return 0; /* * Link aux_outputs to their aux event; this is undone in * perf_group_detach() by perf_put_aux_event(). When the * group in torn down, the aux_output events loose their * link to the aux_event and can't schedule any more. */ event->aux_event = group_leader; return 1; } static inline struct list_head *get_event_list(struct perf_event *event) { return event->attr.pinned ? &event->pmu_ctx->pinned_active : &event->pmu_ctx->flexible_active; } /* * Events that have PERF_EV_CAP_SIBLING require being part of a group and * cannot exist on their own, schedule them out and move them into the ERROR * state. Also see _perf_event_enable(), it will not be able to recover * this ERROR state. */ static inline void perf_remove_sibling_event(struct perf_event *event) { event_sched_out(event, event->ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); } static void perf_group_detach(struct perf_event *event) { struct perf_event *leader = event->group_leader; struct perf_event *sibling, *tmp; struct perf_event_context *ctx = event->ctx; lockdep_assert_held(&ctx->lock); /* * We can have double detach due to exit/hot-unplug + close. */ if (!(event->attach_state & PERF_ATTACH_GROUP)) return; event->attach_state &= ~PERF_ATTACH_GROUP; perf_put_aux_event(event); /* * If this is a sibling, remove it from its group. */ if (leader != event) { list_del_init(&event->sibling_list); event->group_leader->nr_siblings--; goto out; } /* * If this was a group event with sibling events then * upgrade the siblings to singleton events by adding them * to whatever list we are on. */ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) { if (sibling->event_caps & PERF_EV_CAP_SIBLING) perf_remove_sibling_event(sibling); sibling->group_leader = sibling; list_del_init(&sibling->sibling_list); /* Inherit group flags from the previous leader */ sibling->group_caps = event->group_caps; if (sibling->attach_state & PERF_ATTACH_CONTEXT) { add_event_to_groups(sibling, event->ctx); if (sibling->state == PERF_EVENT_STATE_ACTIVE) list_add_tail(&sibling->active_list, get_event_list(sibling)); } WARN_ON_ONCE(sibling->ctx != event->ctx); } out: for_each_sibling_event(tmp, leader) perf_event__header_size(tmp); perf_event__header_size(leader); } static void sync_child_event(struct perf_event *child_event); static void perf_child_detach(struct perf_event *event) { struct perf_event *parent_event = event->parent; if (!(event->attach_state & PERF_ATTACH_CHILD)) return; event->attach_state &= ~PERF_ATTACH_CHILD; if (WARN_ON_ONCE(!parent_event)) return; lockdep_assert_held(&parent_event->child_mutex); sync_child_event(event); list_del_init(&event->child_list); } static bool is_orphaned_event(struct perf_event *event) { return event->state == PERF_EVENT_STATE_DEAD; } static inline int event_filter_match(struct perf_event *event) { return (event->cpu == -1 || event->cpu == smp_processor_id()) && perf_cgroup_match(event); } static void event_sched_out(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_pmu_context *epc = event->pmu_ctx; struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); enum perf_event_state state = PERF_EVENT_STATE_INACTIVE; // XXX cpc serialization, probably per-cpu IRQ disabled WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); if (event->state != PERF_EVENT_STATE_ACTIVE) return; /* * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but * we can schedule events _OUT_ individually through things like * __perf_remove_from_context(). */ list_del_init(&event->active_list); perf_pmu_disable(event->pmu); event->pmu->del(event, 0); event->oncpu = -1; if (event->pending_disable) { event->pending_disable = 0; perf_cgroup_event_disable(event, ctx); state = PERF_EVENT_STATE_OFF; } if (event->pending_sigtrap) { bool dec = true; event->pending_sigtrap = 0; if (state != PERF_EVENT_STATE_OFF && !event->pending_work) { event->pending_work = 1; dec = false; WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount)); task_work_add(current, &event->pending_task, TWA_RESUME); } if (dec) local_dec(&event->ctx->nr_pending); } perf_event_set_state(event, state); if (!is_software_event(event)) cpc->active_oncpu--; if (event->attr.freq && event->attr.sample_freq) ctx->nr_freq--; if (event->attr.exclusive || !cpc->active_oncpu) cpc->exclusive = 0; perf_pmu_enable(event->pmu); } static void group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event; if (group_event->state != PERF_EVENT_STATE_ACTIVE) return; perf_assert_pmu_disabled(group_event->pmu_ctx->pmu); event_sched_out(group_event, ctx); /* * Schedule out siblings (if any): */ for_each_sibling_event(event, group_event) event_sched_out(event, ctx); } #define DETACH_GROUP 0x01UL #define DETACH_CHILD 0x02UL #define DETACH_DEAD 0x04UL /* * Cross CPU call to remove a performance event * * We disable the event on the hardware level first. After that we * remove it from the context list. */ static void __perf_remove_from_context(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, void *info) { struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx; unsigned long flags = (unsigned long)info; if (ctx->is_active & EVENT_TIME) { update_context_time(ctx); update_cgrp_time_from_cpuctx(cpuctx, false); } /* * Ensure event_sched_out() switches to OFF, at the very least * this avoids raising perf_pending_task() at this time. */ if (flags & DETACH_DEAD) event->pending_disable = 1; event_sched_out(event, ctx); if (flags & DETACH_GROUP) perf_group_detach(event); if (flags & DETACH_CHILD) perf_child_detach(event); list_del_event(event, ctx); if (flags & DETACH_DEAD) event->state = PERF_EVENT_STATE_DEAD; if (!pmu_ctx->nr_events) { pmu_ctx->rotate_necessary = 0; if (ctx->task && ctx->is_active) { struct perf_cpu_pmu_context *cpc; cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); cpc->task_epc = NULL; } } if (!ctx->nr_events && ctx->is_active) { if (ctx == &cpuctx->ctx) update_cgrp_time_from_cpuctx(cpuctx, true); ctx->is_active = 0; if (ctx->task) { WARN_ON_ONCE(cpuctx->task_ctx != ctx); cpuctx->task_ctx = NULL; } } } /* * Remove the event from a task's (or a CPU's) list of events. * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to * remains valid. This is OK when called from perf_release since * that only calls us on the top-level context, which can't be a clone. * When called from perf_event_exit_task, it's OK because the * context has been detached from its task. */ static void perf_remove_from_context(struct perf_event *event, unsigned long flags) { struct perf_event_context *ctx = event->ctx; lockdep_assert_held(&ctx->mutex); /* * Because of perf_event_exit_task(), perf_remove_from_context() ought * to work in the face of TASK_TOMBSTONE, unlike every other * event_function_call() user. */ raw_spin_lock_irq(&ctx->lock); if (!ctx->is_active) { __perf_remove_from_context(event, this_cpu_ptr(&perf_cpu_context), ctx, (void *)flags); raw_spin_unlock_irq(&ctx->lock); return; } raw_spin_unlock_irq(&ctx->lock); event_function_call(event, __perf_remove_from_context, (void *)flags); } /* * Cross CPU call to disable a performance event */ static void __perf_event_disable(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, void *info) { if (event->state < PERF_EVENT_STATE_INACTIVE) return; if (ctx->is_active & EVENT_TIME) { update_context_time(ctx); update_cgrp_time_from_event(event); } perf_pmu_disable(event->pmu_ctx->pmu); if (event == event->group_leader) group_sched_out(event, ctx); else event_sched_out(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_OFF); perf_cgroup_event_disable(event, ctx); perf_pmu_enable(event->pmu_ctx->pmu); } /* * Disable an event. * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to * remains valid. This condition is satisfied when called through * perf_event_for_each_child or perf_event_for_each because they * hold the top-level event's child_mutex, so any descendant that * goes to exit will block in perf_event_exit_event(). * * When called from perf_pending_irq it's OK because event->ctx * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. */ static void _perf_event_disable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; raw_spin_lock_irq(&ctx->lock); if (event->state <= PERF_EVENT_STATE_OFF) { raw_spin_unlock_irq(&ctx->lock); return; } raw_spin_unlock_irq(&ctx->lock); event_function_call(event, __perf_event_disable, NULL); } void perf_event_disable_local(struct perf_event *event) { event_function_local(event, __perf_event_disable, NULL); } /* * Strictly speaking kernel users cannot create groups and therefore this * interface does not need the perf_event_ctx_lock() magic. */ void perf_event_disable(struct perf_event *event) { struct perf_event_context *ctx; ctx = perf_event_ctx_lock(event); _perf_event_disable(event); perf_event_ctx_unlock(event, ctx); } EXPORT_SYMBOL_GPL(perf_event_disable); void perf_event_disable_inatomic(struct perf_event *event) { event->pending_disable = 1; irq_work_queue(&event->pending_irq); } #define MAX_INTERRUPTS (~0ULL) static void perf_log_throttle(struct perf_event *event, int enable); static void perf_log_itrace_start(struct perf_event *event); static int event_sched_in(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_pmu_context *epc = event->pmu_ctx; struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); int ret = 0; WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); if (event->state <= PERF_EVENT_STATE_OFF) return 0; WRITE_ONCE(event->oncpu, smp_processor_id()); /* * Order event::oncpu write to happen before the ACTIVE state is * visible. This allows perf_event_{stop,read}() to observe the correct * ->oncpu if it sees ACTIVE. */ smp_wmb(); perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE); /* * Unthrottle events, since we scheduled we might have missed several * ticks already, also for a heavily scheduling task there is little * guarantee it'll get a tick in a timely manner. */ if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) { perf_log_throttle(event, 1); event->hw.interrupts = 0; } perf_pmu_disable(event->pmu); perf_log_itrace_start(event); if (event->pmu->add(event, PERF_EF_START)) { perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); event->oncpu = -1; ret = -EAGAIN; goto out; } if (!is_software_event(event)) cpc->active_oncpu++; if (event->attr.freq && event->attr.sample_freq) ctx->nr_freq++; if (event->attr.exclusive) cpc->exclusive = 1; out: perf_pmu_enable(event->pmu); return ret; } static int group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event, *partial_group = NULL; struct pmu *pmu = group_event->pmu_ctx->pmu; if (group_event->state == PERF_EVENT_STATE_OFF) return 0; pmu->start_txn(pmu, PERF_PMU_TXN_ADD); if (event_sched_in(group_event, ctx)) goto error; /* * Schedule in siblings as one group (if any): */ for_each_sibling_event(event, group_event) { if (event_sched_in(event, ctx)) { partial_group = event; goto group_error; } } if (!pmu->commit_txn(pmu)) return 0; group_error: /* * Groups can be scheduled in as one unit only, so undo any * partial group before returning: * The events up to the failed event are scheduled out normally. */ for_each_sibling_event(event, group_event) { if (event == partial_group) break; event_sched_out(event, ctx); } event_sched_out(group_event, ctx); error: pmu->cancel_txn(pmu); return -EAGAIN; } /* * Work out whether we can put this event group on the CPU now. */ static int group_can_go_on(struct perf_event *event, int can_add_hw) { struct perf_event_pmu_context *epc = event->pmu_ctx; struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); /* * Groups consisting entirely of software events can always go on. */ if (event->group_caps & PERF_EV_CAP_SOFTWARE) return 1; /* * If an exclusive group is already on, no other hardware * events can go on. */ if (cpc->exclusive) return 0; /* * If this group is exclusive and there are already * events on the CPU, it can't go on. */ if (event->attr.exclusive && !list_empty(get_event_list(event))) return 0; /* * Otherwise, try to add it if all previous groups were able * to go on. */ return can_add_hw; } static void add_event_to_ctx(struct perf_event *event, struct perf_event_context *ctx) { list_add_event(event, ctx); perf_group_attach(event); } static void task_ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); if (!cpuctx->task_ctx) return; if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) return; ctx_sched_out(ctx, event_type); } static void perf_event_sched_in(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { ctx_sched_in(&cpuctx->ctx, EVENT_PINNED); if (ctx) ctx_sched_in(ctx, EVENT_PINNED); ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE); if (ctx) ctx_sched_in(ctx, EVENT_FLEXIBLE); } /* * We want to maintain the following priority of scheduling: * - CPU pinned (EVENT_CPU | EVENT_PINNED) * - task pinned (EVENT_PINNED) * - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE) * - task flexible (EVENT_FLEXIBLE). * * In order to avoid unscheduling and scheduling back in everything every * time an event is added, only do it for the groups of equal priority and * below. * * This can be called after a batch operation on task events, in which case * event_type is a bit mask of the types of events involved. For CPU events, * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE. */ /* * XXX: ctx_resched() reschedule entire perf_event_context while adding new * event to the context or enabling existing event in the context. We can * probably optimize it by rescheduling only affected pmu_ctx. */ static void ctx_resched(struct perf_cpu_context *cpuctx, struct perf_event_context *task_ctx, enum event_type_t event_type) { bool cpu_event = !!(event_type & EVENT_CPU); /* * If pinned groups are involved, flexible groups also need to be * scheduled out. */ if (event_type & EVENT_PINNED) event_type |= EVENT_FLEXIBLE; event_type &= EVENT_ALL; perf_ctx_disable(&cpuctx->ctx); if (task_ctx) { perf_ctx_disable(task_ctx); task_ctx_sched_out(task_ctx, event_type); } /* * Decide which cpu ctx groups to schedule out based on the types * of events that caused rescheduling: * - EVENT_CPU: schedule out corresponding groups; * - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups; * - otherwise, do nothing more. */ if (cpu_event) ctx_sched_out(&cpuctx->ctx, event_type); else if (event_type & EVENT_PINNED) ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE); perf_event_sched_in(cpuctx, task_ctx); perf_ctx_enable(&cpuctx->ctx); if (task_ctx) perf_ctx_enable(task_ctx); } void perf_pmu_resched(struct pmu *pmu) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *task_ctx = cpuctx->task_ctx; perf_ctx_lock(cpuctx, task_ctx); ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU); perf_ctx_unlock(cpuctx, task_ctx); } /* * Cross CPU call to install and enable a performance event * * Very similar to remote_function() + event_function() but cannot assume that * things like ctx->is_active and cpuctx->task_ctx are set. */ static int __perf_install_in_context(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *task_ctx = cpuctx->task_ctx; bool reprogram = true; int ret = 0; raw_spin_lock(&cpuctx->ctx.lock); if (ctx->task) { raw_spin_lock(&ctx->lock); task_ctx = ctx; reprogram = (ctx->task == current); /* * If the task is running, it must be running on this CPU, * otherwise we cannot reprogram things. * * If its not running, we don't care, ctx->lock will * serialize against it becoming runnable. */ if (task_curr(ctx->task) && !reprogram) { ret = -ESRCH; goto unlock; } WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx); } else if (task_ctx) { raw_spin_lock(&task_ctx->lock); } #ifdef CONFIG_CGROUP_PERF if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) { /* * If the current cgroup doesn't match the event's * cgroup, we should not try to schedule it. */ struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); reprogram = cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup); } #endif if (reprogram) { ctx_sched_out(ctx, EVENT_TIME); add_event_to_ctx(event, ctx); ctx_resched(cpuctx, task_ctx, get_event_type(event)); } else { add_event_to_ctx(event, ctx); } unlock: perf_ctx_unlock(cpuctx, task_ctx); return ret; } static bool exclusive_event_installable(struct perf_event *event, struct perf_event_context *ctx); /* * Attach a performance event to a context. * * Very similar to event_function_call, see comment there. */ static void perf_install_in_context(struct perf_event_context *ctx, struct perf_event *event, int cpu) { struct task_struct *task = READ_ONCE(ctx->task); lockdep_assert_held(&ctx->mutex); WARN_ON_ONCE(!exclusive_event_installable(event, ctx)); if (event->cpu != -1) WARN_ON_ONCE(event->cpu != cpu); /* * Ensures that if we can observe event->ctx, both the event and ctx * will be 'complete'. See perf_iterate_sb_cpu(). */ smp_store_release(&event->ctx, ctx); /* * perf_event_attr::disabled events will not run and can be initialized * without IPI. Except when this is the first event for the context, in * that case we need the magic of the IPI to set ctx->is_active. * * The IOC_ENABLE that is sure to follow the creation of a disabled * event will issue the IPI and reprogram the hardware. */ if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events && !is_cgroup_event(event)) { raw_spin_lock_irq(&ctx->lock); if (ctx->task == TASK_TOMBSTONE) { raw_spin_unlock_irq(&ctx->lock); return; } add_event_to_ctx(event, ctx); raw_spin_unlock_irq(&ctx->lock); return; } if (!task) { cpu_function_call(cpu, __perf_install_in_context, event); return; } /* * Should not happen, we validate the ctx is still alive before calling. */ if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) return; /* * Installing events is tricky because we cannot rely on ctx->is_active * to be set in case this is the nr_events 0 -> 1 transition. * * Instead we use task_curr(), which tells us if the task is running. * However, since we use task_curr() outside of rq::lock, we can race * against the actual state. This means the result can be wrong. * * If we get a false positive, we retry, this is harmless. * * If we get a false negative, things are complicated. If we are after * perf_event_context_sched_in() ctx::lock will serialize us, and the * value must be correct. If we're before, it doesn't matter since * perf_event_context_sched_in() will program the counter. * * However, this hinges on the remote context switch having observed * our task->perf_event_ctxp[] store, such that it will in fact take * ctx::lock in perf_event_context_sched_in(). * * We do this by task_function_call(), if the IPI fails to hit the task * we know any future context switch of task must see the * perf_event_ctpx[] store. */ /* * This smp_mb() orders the task->perf_event_ctxp[] store with the * task_cpu() load, such that if the IPI then does not find the task * running, a future context switch of that task must observe the * store. */ smp_mb(); again: if (!task_function_call(task, __perf_install_in_context, event)) return; raw_spin_lock_irq(&ctx->lock); task = ctx->task; if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) { /* * Cannot happen because we already checked above (which also * cannot happen), and we hold ctx->mutex, which serializes us * against perf_event_exit_task_context(). */ raw_spin_unlock_irq(&ctx->lock); return; } /* * If the task is not running, ctx->lock will avoid it becoming so, * thus we can safely install the event. */ if (task_curr(task)) { raw_spin_unlock_irq(&ctx->lock); goto again; } add_event_to_ctx(event, ctx); raw_spin_unlock_irq(&ctx->lock); } /* * Cross CPU call to enable a performance event */ static void __perf_event_enable(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, void *info) { struct perf_event *leader = event->group_leader; struct perf_event_context *task_ctx; if (event->state >= PERF_EVENT_STATE_INACTIVE || event->state <= PERF_EVENT_STATE_ERROR) return; if (ctx->is_active) ctx_sched_out(ctx, EVENT_TIME); perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); perf_cgroup_event_enable(event, ctx); if (!ctx->is_active) return; if (!event_filter_match(event)) { ctx_sched_in(ctx, EVENT_TIME); return; } /* * If the event is in a group and isn't the group leader, * then don't put it on unless the group is on. */ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) { ctx_sched_in(ctx, EVENT_TIME); return; } task_ctx = cpuctx->task_ctx; if (ctx->task) WARN_ON_ONCE(task_ctx != ctx); ctx_resched(cpuctx, task_ctx, get_event_type(event)); } /* * Enable an event. * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to * remains valid. This condition is satisfied when called through * perf_event_for_each_child or perf_event_for_each as described * for perf_event_disable. */ static void _perf_event_enable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; raw_spin_lock_irq(&ctx->lock); if (event->state >= PERF_EVENT_STATE_INACTIVE || event->state < PERF_EVENT_STATE_ERROR) { out: raw_spin_unlock_irq(&ctx->lock); return; } /* * If the event is in error state, clear that first. * * That way, if we see the event in error state below, we know that it * has gone back into error state, as distinct from the task having * been scheduled away before the cross-call arrived. */ if (event->state == PERF_EVENT_STATE_ERROR) { /* * Detached SIBLING events cannot leave ERROR state. */ if (event->event_caps & PERF_EV_CAP_SIBLING && event->group_leader == event) goto out; event->state = PERF_EVENT_STATE_OFF; } raw_spin_unlock_irq(&ctx->lock); event_function_call(event, __perf_event_enable, NULL); } /* * See perf_event_disable(); */ void perf_event_enable(struct perf_event *event) { struct perf_event_context *ctx; ctx = perf_event_ctx_lock(event); _perf_event_enable(event); perf_event_ctx_unlock(event, ctx); } EXPORT_SYMBOL_GPL(perf_event_enable); struct stop_event_data { struct perf_event *event; unsigned int restart; }; static int __perf_event_stop(void *info) { struct stop_event_data *sd = info; struct perf_event *event = sd->event; /* if it's already INACTIVE, do nothing */ if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) return 0; /* matches smp_wmb() in event_sched_in() */ smp_rmb(); /* * There is a window with interrupts enabled before we get here, * so we need to check again lest we try to stop another CPU's event. */ if (READ_ONCE(event->oncpu) != smp_processor_id()) return -EAGAIN; event->pmu->stop(event, PERF_EF_UPDATE); /* * May race with the actual stop (through perf_pmu_output_stop()), * but it is only used for events with AUX ring buffer, and such * events will refuse to restart because of rb::aux_mmap_count==0, * see comments in perf_aux_output_begin(). * * Since this is happening on an event-local CPU, no trace is lost * while restarting. */ if (sd->restart) event->pmu->start(event, 0); return 0; } static int perf_event_stop(struct perf_event *event, int restart) { struct stop_event_data sd = { .event = event, .restart = restart, }; int ret = 0; do { if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) return 0; /* matches smp_wmb() in event_sched_in() */ smp_rmb(); /* * We only want to restart ACTIVE events, so if the event goes * inactive here (event->oncpu==-1), there's nothing more to do; * fall through with ret==-ENXIO. */ ret = cpu_function_call(READ_ONCE(event->oncpu), __perf_event_stop, &sd); } while (ret == -EAGAIN); return ret; } /* * In order to contain the amount of racy and tricky in the address filter * configuration management, it is a two part process: * * (p1) when userspace mappings change as a result of (1) or (2) or (3) below, * we update the addresses of corresponding vmas in * event::addr_filter_ranges array and bump the event::addr_filters_gen; * (p2) when an event is scheduled in (pmu::add), it calls * perf_event_addr_filters_sync() which calls pmu::addr_filters_sync() * if the generation has changed since the previous call. * * If (p1) happens while the event is active, we restart it to force (p2). * * (1) perf_addr_filters_apply(): adjusting filters' offsets based on * pre-existing mappings, called once when new filters arrive via SET_FILTER * ioctl; * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly * registered mapping, called for every new mmap(), with mm::mmap_lock down * for reading; * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process * of exec. */ void perf_event_addr_filters_sync(struct perf_event *event) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); if (!has_addr_filter(event)) return; raw_spin_lock(&ifh->lock); if (event->addr_filters_gen != event->hw.addr_filters_gen) { event->pmu->addr_filters_sync(event); event->hw.addr_filters_gen = event->addr_filters_gen; } raw_spin_unlock(&ifh->lock); } EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync); static int _perf_event_refresh(struct perf_event *event, int refresh) { /* * not supported on inherited events */ if (event->attr.inherit || !is_sampling_event(event)) return -EINVAL; atomic_add(refresh, &event->event_limit); _perf_event_enable(event); return 0; } /* * See perf_event_disable() */ int perf_event_refresh(struct perf_event *event, int refresh) { struct perf_event_context *ctx; int ret; ctx = perf_event_ctx_lock(event); ret = _perf_event_refresh(event, refresh); perf_event_ctx_unlock(event, ctx); return ret; } EXPORT_SYMBOL_GPL(perf_event_refresh); static int perf_event_modify_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { int err; _perf_event_disable(bp); err = modify_user_hw_breakpoint_check(bp, attr, true); if (!bp->attr.disabled) _perf_event_enable(bp); return err; } /* * Copy event-type-independent attributes that may be modified. */ static void perf_event_modify_copy_attr(struct perf_event_attr *to, const struct perf_event_attr *from) { to->sig_data = from->sig_data; } static int perf_event_modify_attr(struct perf_event *event, struct perf_event_attr *attr) { int (*func)(struct perf_event *, struct perf_event_attr *); struct perf_event *child; int err; if (event->attr.type != attr->type) return -EINVAL; switch (event->attr.type) { case PERF_TYPE_BREAKPOINT: func = perf_event_modify_breakpoint; break; default: /* Place holder for future additions. */ return -EOPNOTSUPP; } WARN_ON_ONCE(event->ctx->parent_ctx); mutex_lock(&event->child_mutex); /* * Event-type-independent attributes must be copied before event-type * modification, which will validate that final attributes match the * source attributes after all relevant attributes have been copied. */ perf_event_modify_copy_attr(&event->attr, attr); err = func(event, attr); if (err) goto out; list_for_each_entry(child, &event->child_list, child_list) { perf_event_modify_copy_attr(&child->attr, attr); err = func(child, attr); if (err) goto out; } out: mutex_unlock(&event->child_mutex); return err; } static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx, enum event_type_t event_type) { struct perf_event_context *ctx = pmu_ctx->ctx; struct perf_event *event, *tmp; struct pmu *pmu = pmu_ctx->pmu; if (ctx->task && !ctx->is_active) { struct perf_cpu_pmu_context *cpc; cpc = this_cpu_ptr(pmu->cpu_pmu_context); WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); cpc->task_epc = NULL; } if (!event_type) return; perf_pmu_disable(pmu); if (event_type & EVENT_PINNED) { list_for_each_entry_safe(event, tmp, &pmu_ctx->pinned_active, active_list) group_sched_out(event, ctx); } if (event_type & EVENT_FLEXIBLE) { list_for_each_entry_safe(event, tmp, &pmu_ctx->flexible_active, active_list) group_sched_out(event, ctx); /* * Since we cleared EVENT_FLEXIBLE, also clear * rotate_necessary, is will be reset by * ctx_flexible_sched_in() when needed. */ pmu_ctx->rotate_necessary = 0; } perf_pmu_enable(pmu); } static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_pmu_context *pmu_ctx; int is_active = ctx->is_active; lockdep_assert_held(&ctx->lock); if (likely(!ctx->nr_events)) { /* * See __perf_remove_from_context(). */ WARN_ON_ONCE(ctx->is_active); if (ctx->task) WARN_ON_ONCE(cpuctx->task_ctx); return; } /* * Always update time if it was set; not only when it changes. * Otherwise we can 'forget' to update time for any but the last * context we sched out. For example: * * ctx_sched_out(.event_type = EVENT_FLEXIBLE) * ctx_sched_out(.event_type = EVENT_PINNED) * * would only update time for the pinned events. */ if (is_active & EVENT_TIME) { /* update (and stop) ctx time */ update_context_time(ctx); update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx); /* * CPU-release for the below ->is_active store, * see __load_acquire() in perf_event_time_now() */ barrier(); } ctx->is_active &= ~event_type; if (!(ctx->is_active & EVENT_ALL)) ctx->is_active = 0; if (ctx->task) { WARN_ON_ONCE(cpuctx->task_ctx != ctx); if (!ctx->is_active) cpuctx->task_ctx = NULL; } is_active ^= ctx->is_active; /* changed bits */ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) __pmu_ctx_sched_out(pmu_ctx, is_active); } /* * Test whether two contexts are equivalent, i.e. whether they have both been * cloned from the same version of the same context. * * Equivalence is measured using a generation number in the context that is * incremented on each modification to it; see unclone_ctx(), list_add_event() * and list_del_event(). */ static int context_equiv(struct perf_event_context *ctx1, struct perf_event_context *ctx2) { lockdep_assert_held(&ctx1->lock); lockdep_assert_held(&ctx2->lock); /* Pinning disables the swap optimization */ if (ctx1->pin_count || ctx2->pin_count) return 0; /* If ctx1 is the parent of ctx2 */ if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen) return 1; /* If ctx2 is the parent of ctx1 */ if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation) return 1; /* * If ctx1 and ctx2 have the same parent; we flatten the parent * hierarchy, see perf_event_init_context(). */ if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx && ctx1->parent_gen == ctx2->parent_gen) return 1; /* Unmatched */ return 0; } static void __perf_event_sync_stat(struct perf_event *event, struct perf_event *next_event) { u64 value; if (!event->attr.inherit_stat) return; /* * Update the event value, we cannot use perf_event_read() * because we're in the middle of a context switch and have IRQs * disabled, which upsets smp_call_function_single(), however * we know the event must be on the current CPU, therefore we * don't need to use it. */ if (event->state == PERF_EVENT_STATE_ACTIVE) event->pmu->read(event); perf_event_update_time(event); /* * In order to keep per-task stats reliable we need to flip the event * values when we flip the contexts. */ value = local64_read(&next_event->count); value = local64_xchg(&event->count, value); local64_set(&next_event->count, value); swap(event->total_time_enabled, next_event->total_time_enabled); swap(event->total_time_running, next_event->total_time_running); /* * Since we swizzled the values, update the user visible data too. */ perf_event_update_userpage(event); perf_event_update_userpage(next_event); } static void perf_event_sync_stat(struct perf_event_context *ctx, struct perf_event_context *next_ctx) { struct perf_event *event, *next_event; if (!ctx->nr_stat) return; update_context_time(ctx); event = list_first_entry(&ctx->event_list, struct perf_event, event_entry); next_event = list_first_entry(&next_ctx->event_list, struct perf_event, event_entry); while (&event->event_entry != &ctx->event_list && &next_event->event_entry != &next_ctx->event_list) { __perf_event_sync_stat(event, next_event); event = list_next_entry(event, event_entry); next_event = list_next_entry(next_event, event_entry); } } #define double_list_for_each_entry(pos1, pos2, head1, head2, member) \ for (pos1 = list_first_entry(head1, typeof(*pos1), member), \ pos2 = list_first_entry(head2, typeof(*pos2), member); \ !list_entry_is_head(pos1, head1, member) && \ !list_entry_is_head(pos2, head2, member); \ pos1 = list_next_entry(pos1, member), \ pos2 = list_next_entry(pos2, member)) static void perf_event_swap_task_ctx_data(struct perf_event_context *prev_ctx, struct perf_event_context *next_ctx) { struct perf_event_pmu_context *prev_epc, *next_epc; if (!prev_ctx->nr_task_data) return; double_list_for_each_entry(prev_epc, next_epc, &prev_ctx->pmu_ctx_list, &next_ctx->pmu_ctx_list, pmu_ctx_entry) { if (WARN_ON_ONCE(prev_epc->pmu != next_epc->pmu)) continue; /* * PMU specific parts of task perf context can require * additional synchronization. As an example of such * synchronization see implementation details of Intel * LBR call stack data profiling; */ if (prev_epc->pmu->swap_task_ctx) prev_epc->pmu->swap_task_ctx(prev_epc, next_epc); else swap(prev_epc->task_ctx_data, next_epc->task_ctx_data); } } static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, bool sched_in) { struct perf_event_pmu_context *pmu_ctx; struct perf_cpu_pmu_context *cpc; list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); if (cpc->sched_cb_usage && pmu_ctx->pmu->sched_task) pmu_ctx->pmu->sched_task(pmu_ctx, sched_in); } } static void perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) { struct perf_event_context *ctx = task->perf_event_ctxp; struct perf_event_context *next_ctx; struct perf_event_context *parent, *next_parent; int do_switch = 1; if (likely(!ctx)) return; rcu_read_lock(); next_ctx = rcu_dereference(next->perf_event_ctxp); if (!next_ctx) goto unlock; parent = rcu_dereference(ctx->parent_ctx); next_parent = rcu_dereference(next_ctx->parent_ctx); /* If neither context have a parent context; they cannot be clones. */ if (!parent && !next_parent) goto unlock; if (next_parent == ctx || next_ctx == parent || next_parent == parent) { /* * Looks like the two contexts are clones, so we might be * able to optimize the context switch. We lock both * contexts and check that they are clones under the * lock (including re-checking that neither has been * uncloned in the meantime). It doesn't matter which * order we take the locks because no other cpu could * be trying to lock both of these tasks. */ raw_spin_lock(&ctx->lock); raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { perf_ctx_disable(ctx); /* PMIs are disabled; ctx->nr_pending is stable. */ if (local_read(&ctx->nr_pending) || local_read(&next_ctx->nr_pending)) { /* * Must not swap out ctx when there's pending * events that rely on the ctx->task relation. */ raw_spin_unlock(&next_ctx->lock); rcu_read_unlock(); goto inside_switch; } WRITE_ONCE(ctx->task, next); WRITE_ONCE(next_ctx->task, task); perf_ctx_sched_task_cb(ctx, false); perf_event_swap_task_ctx_data(ctx, next_ctx); perf_ctx_enable(ctx); /* * RCU_INIT_POINTER here is safe because we've not * modified the ctx and the above modification of * ctx->task and ctx->task_ctx_data are immaterial * since those values are always verified under * ctx->lock which we're now holding. */ RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx); RCU_INIT_POINTER(next->perf_event_ctxp, ctx); do_switch = 0; perf_event_sync_stat(ctx, next_ctx); } raw_spin_unlock(&next_ctx->lock); raw_spin_unlock(&ctx->lock); } unlock: rcu_read_unlock(); if (do_switch) { raw_spin_lock(&ctx->lock); perf_ctx_disable(ctx); inside_switch: perf_ctx_sched_task_cb(ctx, false); task_ctx_sched_out(ctx, EVENT_ALL); perf_ctx_enable(ctx); raw_spin_unlock(&ctx->lock); } } static DEFINE_PER_CPU(struct list_head, sched_cb_list); static DEFINE_PER_CPU(int, perf_sched_cb_usages); void perf_sched_cb_dec(struct pmu *pmu) { struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context); this_cpu_dec(perf_sched_cb_usages); barrier(); if (!--cpc->sched_cb_usage) list_del(&cpc->sched_cb_entry); } void perf_sched_cb_inc(struct pmu *pmu) { struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context); if (!cpc->sched_cb_usage++) list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); barrier(); this_cpu_inc(perf_sched_cb_usages); } /* * This function provides the context switch callback to the lower code * layer. It is invoked ONLY when the context switch callback is enabled. * * This callback is relevant even to per-cpu events; for example multi event * PEBS requires this to provide PID/TID information. This requires we flush * all queued PEBS records before we context switch to a new task. */ static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, bool sched_in) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct pmu *pmu; pmu = cpc->epc.pmu; /* software PMUs will not have sched_task */ if (WARN_ON_ONCE(!pmu->sched_task)) return; perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(pmu); pmu->sched_task(cpc->task_epc, sched_in); perf_pmu_enable(pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } static void perf_pmu_sched_task(struct task_struct *prev, struct task_struct *next, bool sched_in) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_cpu_pmu_context *cpc; /* cpuctx->task_ctx will be handled in perf_event_context_sched_in/out */ if (prev == next || cpuctx->task_ctx) return; list_for_each_entry(cpc, this_cpu_ptr(&sched_cb_list), sched_cb_entry) __perf_pmu_sched_task(cpc, sched_in); } static void perf_event_switch(struct task_struct *task, struct task_struct *next_prev, bool sched_in); /* * Called from scheduler to remove the events of the current task, * with interrupts disabled. * * We stop each event and update the event value in event->count. * * This does not protect us against NMI, but disable() * sets the disabled bit in the control field of event _before_ * accessing the event control register. If a NMI hits, then it will * not restart the event. */ void __perf_event_task_sched_out(struct task_struct *task, struct task_struct *next) { if (__this_cpu_read(perf_sched_cb_usages)) perf_pmu_sched_task(task, next, false); if (atomic_read(&nr_switch_events)) perf_event_switch(task, next, false); perf_event_context_sched_out(task, next); /* * if cgroup events exist on this CPU, then we need * to check if we have to switch out PMU state. * cgroup event are system-wide mode only */ perf_cgroup_switch(next); } static bool perf_less_group_idx(const void *l, const void *r) { const struct perf_event *le = *(const struct perf_event **)l; const struct perf_event *re = *(const struct perf_event **)r; return le->group_index < re->group_index; } static void swap_ptr(void *l, void *r) { void **lp = l, **rp = r; swap(*lp, *rp); } static const struct min_heap_callbacks perf_min_heap = { .elem_size = sizeof(struct perf_event *), .less = perf_less_group_idx, .swp = swap_ptr, }; static void __heap_add(struct min_heap *heap, struct perf_event *event) { struct perf_event **itrs = heap->data; if (event) { itrs[heap->nr] = event; heap->nr++; } } static void __link_epc(struct perf_event_pmu_context *pmu_ctx) { struct perf_cpu_pmu_context *cpc; if (!pmu_ctx->ctx->task) return; cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); cpc->task_epc = pmu_ctx; } static noinline int visit_groups_merge(struct perf_event_context *ctx, struct perf_event_groups *groups, int cpu, struct pmu *pmu, int (*func)(struct perf_event *, void *), void *data) { #ifdef CONFIG_CGROUP_PERF struct cgroup_subsys_state *css = NULL; #endif struct perf_cpu_context *cpuctx = NULL; /* Space for per CPU and/or any CPU event iterators. */ struct perf_event *itrs[2]; struct min_heap event_heap; struct perf_event **evt; int ret; if (pmu->filter && pmu->filter(pmu, cpu)) return 0; if (!ctx->task) { cpuctx = this_cpu_ptr(&perf_cpu_context); event_heap = (struct min_heap){ .data = cpuctx->heap, .nr = 0, .size = cpuctx->heap_size, }; lockdep_assert_held(&cpuctx->ctx.lock); #ifdef CONFIG_CGROUP_PERF if (cpuctx->cgrp) css = &cpuctx->cgrp->css; #endif } else { event_heap = (struct min_heap){ .data = itrs, .nr = 0, .size = ARRAY_SIZE(itrs), }; /* Events not within a CPU context may be on any CPU. */ __heap_add(&event_heap, perf_event_groups_first(groups, -1, pmu, NULL)); } evt = event_heap.data; __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, NULL)); #ifdef CONFIG_CGROUP_PERF for (; css; css = css->parent) __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, css->cgroup)); #endif if (event_heap.nr) { __link_epc((*evt)->pmu_ctx); perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu); } min_heapify_all(&event_heap, &perf_min_heap); while (event_heap.nr) { ret = func(*evt, data); if (ret) return ret; *evt = perf_event_groups_next(*evt, pmu); if (*evt) min_heapify(&event_heap, 0, &perf_min_heap); else min_heap_pop(&event_heap, &perf_min_heap); } return 0; } /* * Because the userpage is strictly per-event (there is no concept of context, * so there cannot be a context indirection), every userpage must be updated * when context time starts :-( * * IOW, we must not miss EVENT_TIME edges. */ static inline bool event_update_userpage(struct perf_event *event) { if (likely(!atomic_read(&event->mmap_count))) return false; perf_event_update_time(event); perf_event_update_userpage(event); return true; } static inline void group_update_userpage(struct perf_event *group_event) { struct perf_event *event; if (!event_update_userpage(group_event)) return; for_each_sibling_event(event, group_event) event_update_userpage(event); } static int merge_sched_in(struct perf_event *event, void *data) { struct perf_event_context *ctx = event->ctx; int *can_add_hw = data; if (event->state <= PERF_EVENT_STATE_OFF) return 0; if (!event_filter_match(event)) return 0; if (group_can_go_on(event, *can_add_hw)) { if (!group_sched_in(event, ctx)) list_add_tail(&event->active_list, get_event_list(event)); } if (event->state == PERF_EVENT_STATE_INACTIVE) { *can_add_hw = 0; if (event->attr.pinned) { perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); } else { struct perf_cpu_pmu_context *cpc; event->pmu_ctx->rotate_necessary = 1; cpc = this_cpu_ptr(event->pmu_ctx->pmu->cpu_pmu_context); perf_mux_hrtimer_restart(cpc); group_update_userpage(event); } } return 0; } static void ctx_pinned_sched_in(struct perf_event_context *ctx, struct pmu *pmu) { struct perf_event_pmu_context *pmu_ctx; int can_add_hw = 1; if (pmu) { visit_groups_merge(ctx, &ctx->pinned_groups, smp_processor_id(), pmu, merge_sched_in, &can_add_hw); } else { list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { can_add_hw = 1; visit_groups_merge(ctx, &ctx->pinned_groups, smp_processor_id(), pmu_ctx->pmu, merge_sched_in, &can_add_hw); } } } static void ctx_flexible_sched_in(struct perf_event_context *ctx, struct pmu *pmu) { struct perf_event_pmu_context *pmu_ctx; int can_add_hw = 1; if (pmu) { visit_groups_merge(ctx, &ctx->flexible_groups, smp_processor_id(), pmu, merge_sched_in, &can_add_hw); } else { list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { can_add_hw = 1; visit_groups_merge(ctx, &ctx->flexible_groups, smp_processor_id(), pmu_ctx->pmu, merge_sched_in, &can_add_hw); } } } static void __pmu_ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu) { ctx_flexible_sched_in(ctx, pmu); } static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); int is_active = ctx->is_active; lockdep_assert_held(&ctx->lock); if (likely(!ctx->nr_events)) return; if (!(is_active & EVENT_TIME)) { /* start ctx time */ __update_context_time(ctx, false); perf_cgroup_set_timestamp(cpuctx); /* * CPU-release for the below ->is_active store, * see __load_acquire() in perf_event_time_now() */ barrier(); } ctx->is_active |= (event_type | EVENT_TIME); if (ctx->task) { if (!is_active) cpuctx->task_ctx = ctx; else WARN_ON_ONCE(cpuctx->task_ctx != ctx); } is_active ^= ctx->is_active; /* changed bits */ /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. */ if (is_active & EVENT_PINNED) ctx_pinned_sched_in(ctx, NULL); /* Then walk through the lower prio flexible groups */ if (is_active & EVENT_FLEXIBLE) ctx_flexible_sched_in(ctx, NULL); } static void perf_event_context_sched_in(struct task_struct *task) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *ctx; rcu_read_lock(); ctx = rcu_dereference(task->perf_event_ctxp); if (!ctx) goto rcu_unlock; if (cpuctx->task_ctx == ctx) { perf_ctx_lock(cpuctx, ctx); perf_ctx_disable(ctx); perf_ctx_sched_task_cb(ctx, true); perf_ctx_enable(ctx); perf_ctx_unlock(cpuctx, ctx); goto rcu_unlock; } perf_ctx_lock(cpuctx, ctx); /* * We must check ctx->nr_events while holding ctx->lock, such * that we serialize against perf_install_in_context(). */ if (!ctx->nr_events) goto unlock; perf_ctx_disable(ctx); /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, * cpu flexible, task flexible. * * However, if task's ctx is not carrying any pinned * events, no need to flip the cpuctx's events around. */ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) { perf_ctx_disable(&cpuctx->ctx); ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE); } perf_event_sched_in(cpuctx, ctx); perf_ctx_sched_task_cb(cpuctx->task_ctx, true); if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) perf_ctx_enable(&cpuctx->ctx); perf_ctx_enable(ctx); unlock: perf_ctx_unlock(cpuctx, ctx); rcu_unlock: rcu_read_unlock(); } /* * Called from scheduler to add the events of the current task * with interrupts disabled. * * We restore the event value and then enable it. * * This does not protect us against NMI, but enable() * sets the enabled bit in the control field of event _before_ * accessing the event control register. If a NMI hits, then it will * keep the event running. */ void __perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { perf_event_context_sched_in(task); if (atomic_read(&nr_switch_events)) perf_event_switch(task, prev, true); if (__this_cpu_read(perf_sched_cb_usages)) perf_pmu_sched_task(prev, task, true); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) { u64 frequency = event->attr.sample_freq; u64 sec = NSEC_PER_SEC; u64 divisor, dividend; int count_fls, nsec_fls, frequency_fls, sec_fls; count_fls = fls64(count); nsec_fls = fls64(nsec); frequency_fls = fls64(frequency); sec_fls = 30; /* * We got @count in @nsec, with a target of sample_freq HZ * the target period becomes: * * @count * 10^9 * period = ------------------- * @nsec * sample_freq * */ /* * Reduce accuracy by one bit such that @a and @b converge * to a similar magnitude. */ #define REDUCE_FLS(a, b) \ do { \ if (a##_fls > b##_fls) { \ a >>= 1; \ a##_fls--; \ } else { \ b >>= 1; \ b##_fls--; \ } \ } while (0) /* * Reduce accuracy until either term fits in a u64, then proceed with * the other, so that finally we can do a u64/u64 division. */ while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) { REDUCE_FLS(nsec, frequency); REDUCE_FLS(sec, count); } if (count_fls + sec_fls > 64) { divisor = nsec * frequency; while (count_fls + sec_fls > 64) { REDUCE_FLS(count, sec); divisor >>= 1; } dividend = count * sec; } else { dividend = count * sec; while (nsec_fls + frequency_fls > 64) { REDUCE_FLS(nsec, frequency); dividend >>= 1; } divisor = nsec * frequency; } if (!divisor) return dividend; return div64_u64(dividend, divisor); } static DEFINE_PER_CPU(int, perf_throttled_count); static DEFINE_PER_CPU(u64, perf_throttled_seq); static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable) { struct hw_perf_event *hwc = &event->hw; s64 period, sample_period; s64 delta; period = perf_calculate_period(event, nsec, count); delta = (s64)(period - hwc->sample_period); delta = (delta + 7) / 8; /* low pass filter */ sample_period = hwc->sample_period + delta; if (!sample_period) sample_period = 1; hwc->sample_period = sample_period; if (local64_read(&hwc->period_left) > 8*sample_period) { if (disable) event->pmu->stop(event, PERF_EF_UPDATE); local64_set(&hwc->period_left, 0); if (disable) event->pmu->start(event, PERF_EF_RELOAD); } } /* * combine freq adjustment with unthrottling to avoid two passes over the * events. At the same time, make sure, having freq events does not change * the rate of unthrottling as that would introduce bias. */ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle) { struct perf_event *event; struct hw_perf_event *hwc; u64 now, period = TICK_NSEC; s64 delta; /* * only need to iterate over all events iff: * - context have events in frequency mode (needs freq adjust) * - there are events to unthrottle on this cpu */ if (!(ctx->nr_freq || unthrottle)) return; raw_spin_lock(&ctx->lock); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (event->state != PERF_EVENT_STATE_ACTIVE) continue; // XXX use visit thingy to avoid the -1,cpu match if (!event_filter_match(event)) continue; perf_pmu_disable(event->pmu); hwc = &event->hw; if (hwc->interrupts == MAX_INTERRUPTS) { hwc->interrupts = 0; perf_log_throttle(event, 1); event->pmu->start(event, 0); } if (!event->attr.freq || !event->attr.sample_freq) goto next; /* * stop the event and update event->count */ event->pmu->stop(event, PERF_EF_UPDATE); now = local64_read(&event->count); delta = now - hwc->freq_count_stamp; hwc->freq_count_stamp = now; /* * restart the event * reload only if value has changed * we have stopped the event so tell that * to perf_adjust_period() to avoid stopping it * twice. */ if (delta > 0) perf_adjust_period(event, period, delta, false); event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0); next: perf_pmu_enable(event->pmu); } raw_spin_unlock(&ctx->lock); } /* * Move @event to the tail of the @ctx's elegible events. */ static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event) { /* * Rotate the first entry last of non-pinned groups. Rotation might be * disabled by the inheritance code. */ if (ctx->rotate_disable) return; perf_event_groups_delete(&ctx->flexible_groups, event); perf_event_groups_insert(&ctx->flexible_groups, event); } /* pick an event from the flexible_groups to rotate */ static inline struct perf_event * ctx_event_to_rotate(struct perf_event_pmu_context *pmu_ctx) { struct perf_event *event; struct rb_node *node; struct rb_root *tree; struct __group_key key = { .pmu = pmu_ctx->pmu, }; /* pick the first active flexible event */ event = list_first_entry_or_null(&pmu_ctx->flexible_active, struct perf_event, active_list); if (event) goto out; /* if no active flexible event, pick the first event */ tree = &pmu_ctx->ctx->flexible_groups.tree; if (!pmu_ctx->ctx->task) { key.cpu = smp_processor_id(); node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup); if (node) event = __node_2_pe(node); goto out; } key.cpu = -1; node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup); if (node) { event = __node_2_pe(node); goto out; } key.cpu = smp_processor_id(); node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup); if (node) event = __node_2_pe(node); out: /* * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in() * finds there are unschedulable events, it will set it again. */ pmu_ctx->rotate_necessary = 0; return event; } static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_pmu_context *cpu_epc, *task_epc = NULL; struct perf_event *cpu_event = NULL, *task_event = NULL; int cpu_rotate, task_rotate; struct pmu *pmu; /* * Since we run this from IRQ context, nobody can install new * events, thus the event count values are stable. */ cpu_epc = &cpc->epc; pmu = cpu_epc->pmu; task_epc = cpc->task_epc; cpu_rotate = cpu_epc->rotate_necessary; task_rotate = task_epc ? task_epc->rotate_necessary : 0; if (!(cpu_rotate || task_rotate)) return false; perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(pmu); if (task_rotate) task_event = ctx_event_to_rotate(task_epc); if (cpu_rotate) cpu_event = ctx_event_to_rotate(cpu_epc); /* * As per the order given at ctx_resched() first 'pop' task flexible * and then, if needed CPU flexible. */ if (task_event || (task_epc && cpu_event)) { update_context_time(task_epc->ctx); __pmu_ctx_sched_out(task_epc, EVENT_FLEXIBLE); } if (cpu_event) { update_context_time(&cpuctx->ctx); __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE); rotate_ctx(&cpuctx->ctx, cpu_event); __pmu_ctx_sched_in(&cpuctx->ctx, pmu); } if (task_event) rotate_ctx(task_epc->ctx, task_event); if (task_event || (task_epc && cpu_event)) __pmu_ctx_sched_in(task_epc->ctx, pmu); perf_pmu_enable(pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); return true; } void perf_event_task_tick(void) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *ctx; int throttled; lockdep_assert_irqs_disabled(); __this_cpu_inc(perf_throttled_seq); throttled = __this_cpu_xchg(perf_throttled_count, 0); tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); perf_adjust_freq_unthr_context(&cpuctx->ctx, !!throttled); rcu_read_lock(); ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) perf_adjust_freq_unthr_context(ctx, !!throttled); rcu_read_unlock(); } static int event_enable_on_exec(struct perf_event *event, struct perf_event_context *ctx) { if (!event->attr.enable_on_exec) return 0; event->attr.enable_on_exec = 0; if (event->state >= PERF_EVENT_STATE_INACTIVE) return 0; perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); return 1; } /* * Enable all of a task's events that have been marked enable-on-exec. * This expects task == current. */ static void perf_event_enable_on_exec(struct perf_event_context *ctx) { struct perf_event_context *clone_ctx = NULL; enum event_type_t event_type = 0; struct perf_cpu_context *cpuctx; struct perf_event *event; unsigned long flags; int enabled = 0; local_irq_save(flags); if (WARN_ON_ONCE(current->perf_event_ctxp != ctx)) goto out; if (!ctx->nr_events) goto out; cpuctx = this_cpu_ptr(&perf_cpu_context); perf_ctx_lock(cpuctx, ctx); ctx_sched_out(ctx, EVENT_TIME); list_for_each_entry(event, &ctx->event_list, event_entry) { enabled |= event_enable_on_exec(event, ctx); event_type |= get_event_type(event); } /* * Unclone and reschedule this context if we enabled any event. */ if (enabled) { clone_ctx = unclone_ctx(ctx); ctx_resched(cpuctx, ctx, event_type); } else { ctx_sched_in(ctx, EVENT_TIME); } perf_ctx_unlock(cpuctx, ctx); out: local_irq_restore(flags); if (clone_ctx) put_ctx(clone_ctx); } static void perf_remove_from_owner(struct perf_event *event); static void perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx); /* * Removes all events from the current task that have been marked * remove-on-exec, and feeds their values back to parent events. */ static void perf_event_remove_on_exec(struct perf_event_context *ctx) { struct perf_event_context *clone_ctx = NULL; struct perf_event *event, *next; unsigned long flags; bool modified = false; mutex_lock(&ctx->mutex); if (WARN_ON_ONCE(ctx->task != current)) goto unlock; list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) { if (!event->attr.remove_on_exec) continue; if (!is_kernel_event(event)) perf_remove_from_owner(event); modified = true; perf_event_exit_event(event, ctx); } raw_spin_lock_irqsave(&ctx->lock, flags); if (modified) clone_ctx = unclone_ctx(ctx); raw_spin_unlock_irqrestore(&ctx->lock, flags); unlock: mutex_unlock(&ctx->mutex); if (clone_ctx) put_ctx(clone_ctx); } struct perf_read_data { struct perf_event *event; bool group; int ret; }; static int __perf_event_read_cpu(struct perf_event *event, int event_cpu) { u16 local_pkg, event_pkg; if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { int local_cpu = smp_processor_id(); event_pkg = topology_physical_package_id(event_cpu); local_pkg = topology_physical_package_id(local_cpu); if (event_pkg == local_pkg) return local_cpu; } return event_cpu; } /* * Cross CPU call to read the hardware event */ static void __perf_event_read(void *info) { struct perf_read_data *data = info; struct perf_event *sub, *event = data->event; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct pmu *pmu = event->pmu; /* * If this is a task context, we need to check whether it is * the current task context of this cpu. If not it has been * scheduled out before the smp call arrived. In that case * event->count would have been updated to a recent sample * when the event was scheduled out. */ if (ctx->task && cpuctx->task_ctx != ctx) return; raw_spin_lock(&ctx->lock); if (ctx->is_active & EVENT_TIME) { update_context_time(ctx); update_cgrp_time_from_event(event); } perf_event_update_time(event); if (data->group) perf_event_update_sibling_time(event); if (event->state != PERF_EVENT_STATE_ACTIVE) goto unlock; if (!data->group) { pmu->read(event); data->ret = 0; goto unlock; } pmu->start_txn(pmu, PERF_PMU_TXN_READ); pmu->read(event); for_each_sibling_event(sub, event) { if (sub->state == PERF_EVENT_STATE_ACTIVE) { /* * Use sibling's PMU rather than @event's since * sibling could be on different (eg: software) PMU. */ sub->pmu->read(sub); } } data->ret = pmu->commit_txn(pmu); unlock: raw_spin_unlock(&ctx->lock); } static inline u64 perf_event_count(struct perf_event *event) { return local64_read(&event->count) + atomic64_read(&event->child_count); } static void calc_timer_values(struct perf_event *event, u64 *now, u64 *enabled, u64 *running) { u64 ctx_time; *now = perf_clock(); ctx_time = perf_event_time_now(event, *now); __perf_update_times(event, ctx_time, enabled, running); } /* * NMI-safe method to read a local event, that is an event that * is: * - either for the current task, or for this CPU * - does not have inherit set, for inherited task events * will not be local and we cannot read them atomically * - must not have a pmu::count method */ int perf_event_read_local(struct perf_event *event, u64 *value, u64 *enabled, u64 *running) { unsigned long flags; int ret = 0; /* * Disabling interrupts avoids all counter scheduling (context * switches, timer based rotation and IPIs). */ local_irq_save(flags); /* * It must not be an event with inherit set, we cannot read * all child counters from atomic context. */ if (event->attr.inherit) { ret = -EOPNOTSUPP; goto out; } /* If this is a per-task event, it must be for current */ if ((event->attach_state & PERF_ATTACH_TASK) && event->hw.target != current) { ret = -EINVAL; goto out; } /* If this is a per-CPU event, it must be for this CPU */ if (!(event->attach_state & PERF_ATTACH_TASK) && event->cpu != smp_processor_id()) { ret = -EINVAL; goto out; } /* If this is a pinned event it must be running on this CPU */ if (event->attr.pinned && event->oncpu != smp_processor_id()) { ret = -EBUSY; goto out; } /* * If the event is currently on this CPU, its either a per-task event, * or local to this CPU. Furthermore it means its ACTIVE (otherwise * oncpu == -1). */ if (event->oncpu == smp_processor_id()) event->pmu->read(event); *value = local64_read(&event->count); if (enabled || running) { u64 __enabled, __running, __now; calc_timer_values(event, &__now, &__enabled, &__running); if (enabled) *enabled = __enabled; if (running) *running = __running; } out: local_irq_restore(flags); return ret; } static int perf_event_read(struct perf_event *event, bool group) { enum perf_event_state state = READ_ONCE(event->state); int event_cpu, ret = 0; /* * If event is enabled and currently active on a CPU, update the * value in the event structure: */ again: if (state == PERF_EVENT_STATE_ACTIVE) { struct perf_read_data data; /* * Orders the ->state and ->oncpu loads such that if we see * ACTIVE we must also see the right ->oncpu. * * Matches the smp_wmb() from event_sched_in(). */ smp_rmb(); event_cpu = READ_ONCE(event->oncpu); if ((unsigned)event_cpu >= nr_cpu_ids) return 0; data = (struct perf_read_data){ .event = event, .group = group, .ret = 0, }; preempt_disable(); event_cpu = __perf_event_read_cpu(event, event_cpu); /* * Purposely ignore the smp_call_function_single() return * value. * * If event_cpu isn't a valid CPU it means the event got * scheduled out and that will have updated the event count. * * Therefore, either way, we'll have an up-to-date event count * after this. */ (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1); preempt_enable(); ret = data.ret; } else if (state == PERF_EVENT_STATE_INACTIVE) { struct perf_event_context *ctx = event->ctx; unsigned long flags; raw_spin_lock_irqsave(&ctx->lock, flags); state = event->state; if (state != PERF_EVENT_STATE_INACTIVE) { raw_spin_unlock_irqrestore(&ctx->lock, flags); goto again; } /* * May read while context is not active (e.g., thread is * blocked), in that case we cannot update context time */ if (ctx->is_active & EVENT_TIME) { update_context_time(ctx); update_cgrp_time_from_event(event); } perf_event_update_time(event); if (group) perf_event_update_sibling_time(event); raw_spin_unlock_irqrestore(&ctx->lock, flags); } return ret; } /* * Initialize the perf_event context in a task_struct: */ static void __perf_event_init_context(struct perf_event_context *ctx) { raw_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->pmu_ctx_list); perf_event_groups_init(&ctx->pinned_groups); perf_event_groups_init(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); refcount_set(&ctx->refcount, 1); } static void __perf_init_event_pmu_context(struct perf_event_pmu_context *epc, struct pmu *pmu) { epc->pmu = pmu; INIT_LIST_HEAD(&epc->pmu_ctx_entry); INIT_LIST_HEAD(&epc->pinned_active); INIT_LIST_HEAD(&epc->flexible_active); atomic_set(&epc->refcount, 1); } static struct perf_event_context * alloc_perf_context(struct task_struct *task) { struct perf_event_context *ctx; ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); if (!ctx) return NULL; __perf_event_init_context(ctx); if (task) ctx->task = get_task_struct(task); return ctx; } static struct task_struct * find_lively_task_by_vpid(pid_t vpid) { struct task_struct *task; rcu_read_lock(); if (!vpid) task = current; else task = find_task_by_vpid(vpid); if (task) get_task_struct(task); rcu_read_unlock(); if (!task) return ERR_PTR(-ESRCH); return task; } /* * Returns a matching context with refcount and pincount. */ static struct perf_event_context * find_get_context(struct task_struct *task, struct perf_event *event) { struct perf_event_context *ctx, *clone_ctx = NULL; struct perf_cpu_context *cpuctx; unsigned long flags; int err; if (!task) { /* Must be root to operate on a CPU event: */ err = perf_allow_cpu(&event->attr); if (err) return ERR_PTR(err); cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu); ctx = &cpuctx->ctx; get_ctx(ctx); raw_spin_lock_irqsave(&ctx->lock, flags); ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); return ctx; } err = -EINVAL; retry: ctx = perf_lock_task_context(task, &flags); if (ctx) { clone_ctx = unclone_ctx(ctx); ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); if (clone_ctx) put_ctx(clone_ctx); } else { ctx = alloc_perf_context(task); err = -ENOMEM; if (!ctx) goto errout; err = 0; mutex_lock(&task->perf_event_mutex); /* * If it has already passed perf_event_exit_task(). * we must see PF_EXITING, it takes this mutex too. */ if (task->flags & PF_EXITING) err = -ESRCH; else if (task->perf_event_ctxp) err = -EAGAIN; else { get_ctx(ctx); ++ctx->pin_count; rcu_assign_pointer(task->perf_event_ctxp, ctx); } mutex_unlock(&task->perf_event_mutex); if (unlikely(err)) { put_ctx(ctx); if (err == -EAGAIN) goto retry; goto errout; } } return ctx; errout: return ERR_PTR(err); } static struct perf_event_pmu_context * find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, struct perf_event *event) { struct perf_event_pmu_context *new = NULL, *epc; void *task_ctx_data = NULL; if (!ctx->task) { struct perf_cpu_pmu_context *cpc; cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu); epc = &cpc->epc; raw_spin_lock_irq(&ctx->lock); if (!epc->ctx) { atomic_set(&epc->refcount, 1); epc->embedded = 1; list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); epc->ctx = ctx; } else { WARN_ON_ONCE(epc->ctx != ctx); atomic_inc(&epc->refcount); } raw_spin_unlock_irq(&ctx->lock); return epc; } new = kzalloc(sizeof(*epc), GFP_KERNEL); if (!new) return ERR_PTR(-ENOMEM); if (event->attach_state & PERF_ATTACH_TASK_DATA) { task_ctx_data = alloc_task_ctx_data(pmu); if (!task_ctx_data) { kfree(new); return ERR_PTR(-ENOMEM); } } __perf_init_event_pmu_context(new, pmu); /* * XXX * * lockdep_assert_held(&ctx->mutex); * * can't because perf_event_init_task() doesn't actually hold the * child_ctx->mutex. */ raw_spin_lock_irq(&ctx->lock); list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) { if (epc->pmu == pmu) { WARN_ON_ONCE(epc->ctx != ctx); atomic_inc(&epc->refcount); goto found_epc; } } epc = new; new = NULL; list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); epc->ctx = ctx; found_epc: if (task_ctx_data && !epc->task_ctx_data) { epc->task_ctx_data = task_ctx_data; task_ctx_data = NULL; ctx->nr_task_data++; } raw_spin_unlock_irq(&ctx->lock); free_task_ctx_data(pmu, task_ctx_data); kfree(new); return epc; } static void get_pmu_ctx(struct perf_event_pmu_context *epc) { WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount)); } static void free_epc_rcu(struct rcu_head *head) { struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head); kfree(epc->task_ctx_data); kfree(epc); } static void put_pmu_ctx(struct perf_event_pmu_context *epc) { struct perf_event_context *ctx = epc->ctx; unsigned long flags; /* * XXX * * lockdep_assert_held(&ctx->mutex); * * can't because of the call-site in _free_event()/put_event() * which isn't always called under ctx->mutex. */ if (!atomic_dec_and_raw_lock_irqsave(&epc->refcount, &ctx->lock, flags)) return; WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry)); list_del_init(&epc->pmu_ctx_entry); epc->ctx = NULL; WARN_ON_ONCE(!list_empty(&epc->pinned_active)); WARN_ON_ONCE(!list_empty(&epc->flexible_active)); raw_spin_unlock_irqrestore(&ctx->lock, flags); if (epc->embedded) return; call_rcu(&epc->rcu_head, free_epc_rcu); } static void perf_event_free_filter(struct perf_event *event); static void free_event_rcu(struct rcu_head *head) { struct perf_event *event = container_of(head, typeof(*event), rcu_head); if (event->ns) put_pid_ns(event->ns); perf_event_free_filter(event); kmem_cache_free(perf_event_cache, event); } static void ring_buffer_attach(struct perf_event *event, struct perf_buffer *rb); static void detach_sb_event(struct perf_event *event) { struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); raw_spin_lock(&pel->lock); list_del_rcu(&event->sb_list); raw_spin_unlock(&pel->lock); } static bool is_sb_event(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; if (event->parent) return false; if (event->attach_state & PERF_ATTACH_TASK) return false; if (attr->mmap || attr->mmap_data || attr->mmap2 || attr->comm || attr->comm_exec || attr->task || attr->ksymbol || attr->context_switch || attr->text_poke || attr->bpf_event) return true; return false; } static void unaccount_pmu_sb_event(struct perf_event *event) { if (is_sb_event(event)) detach_sb_event(event); } #ifdef CONFIG_NO_HZ_FULL static DEFINE_SPINLOCK(nr_freq_lock); #endif static void unaccount_freq_event_nohz(void) { #ifdef CONFIG_NO_HZ_FULL spin_lock(&nr_freq_lock); if (atomic_dec_and_test(&nr_freq_events)) tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS); spin_unlock(&nr_freq_lock); #endif } static void unaccount_freq_event(void) { if (tick_nohz_full_enabled()) unaccount_freq_event_nohz(); else atomic_dec(&nr_freq_events); } static void unaccount_event(struct perf_event *event) { bool dec = false; if (event->parent) return; if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB)) dec = true; if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.build_id) atomic_dec(&nr_build_id_events); if (event->attr.comm) atomic_dec(&nr_comm_events); if (event->attr.namespaces) atomic_dec(&nr_namespaces_events); if (event->attr.cgroup) atomic_dec(&nr_cgroup_events); if (event->attr.task) atomic_dec(&nr_task_events); if (event->attr.freq) unaccount_freq_event(); if (event->attr.context_switch) { dec = true; atomic_dec(&nr_switch_events); } if (is_cgroup_event(event)) dec = true; if (has_branch_stack(event)) dec = true; if (event->attr.ksymbol) atomic_dec(&nr_ksymbol_events); if (event->attr.bpf_event) atomic_dec(&nr_bpf_events); if (event->attr.text_poke) atomic_dec(&nr_text_poke_events); if (dec) { if (!atomic_add_unless(&perf_sched_count, -1, 1)) schedule_delayed_work(&perf_sched_work, HZ); } unaccount_pmu_sb_event(event); } static void perf_sched_delayed(struct work_struct *work) { mutex_lock(&perf_sched_mutex); if (atomic_dec_and_test(&perf_sched_count)) static_branch_disable(&perf_sched_events); mutex_unlock(&perf_sched_mutex); } /* * The following implement mutual exclusion of events on "exclusive" pmus * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled * at a time, so we disallow creating events that might conflict, namely: * * 1) cpu-wide events in the presence of per-task events, * 2) per-task events in the presence of cpu-wide events, * 3) two matching events on the same perf_event_context. * * The former two cases are handled in the allocation path (perf_event_alloc(), * _free_event()), the latter -- before the first perf_install_in_context(). */ static int exclusive_event_init(struct perf_event *event) { struct pmu *pmu = event->pmu; if (!is_exclusive_pmu(pmu)) return 0; /* * Prevent co-existence of per-task and cpu-wide events on the * same exclusive pmu. * * Negative pmu::exclusive_cnt means there are cpu-wide * events on this "exclusive" pmu, positive means there are * per-task events. * * Since this is called in perf_event_alloc() path, event::ctx * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK * to mean "per-task event", because unlike other attach states it * never gets cleared. */ if (event->attach_state & PERF_ATTACH_TASK) { if (!atomic_inc_unless_negative(&pmu->exclusive_cnt)) return -EBUSY; } else { if (!atomic_dec_unless_positive(&pmu->exclusive_cnt)) return -EBUSY; } return 0; } static void exclusive_event_destroy(struct perf_event *event) { struct pmu *pmu = event->pmu; if (!is_exclusive_pmu(pmu)) return; /* see comment in exclusive_event_init() */ if (event->attach_state & PERF_ATTACH_TASK) atomic_dec(&pmu->exclusive_cnt); else atomic_inc(&pmu->exclusive_cnt); } static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) { if ((e1->pmu == e2->pmu) && (e1->cpu == e2->cpu || e1->cpu == -1 || e2->cpu == -1)) return true; return false; } static bool exclusive_event_installable(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event *iter_event; struct pmu *pmu = event->pmu; lockdep_assert_held(&ctx->mutex); if (!is_exclusive_pmu(pmu)) return true; list_for_each_entry(iter_event, &ctx->event_list, event_entry) { if (exclusive_event_match(iter_event, event)) return false; } return true; } static void perf_addr_filters_splice(struct perf_event *event, struct list_head *head); static void _free_event(struct perf_event *event) { irq_work_sync(&event->pending_irq); unaccount_event(event); security_perf_event_free(event); if (event->rb) { /* * Can happen when we close an event with re-directed output. * * Since we have a 0 refcount, perf_mmap_close() will skip * over us; possibly making our ring_buffer_put() the last. */ mutex_lock(&event->mmap_mutex); ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); } if (is_cgroup_event(event)) perf_detach_cgroup(event); if (!event->parent) { if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) put_callchain_buffers(); } perf_event_free_bpf_prog(event); perf_addr_filters_splice(event, NULL); kfree(event->addr_filter_ranges); if (event->destroy) event->destroy(event); /* * Must be after ->destroy(), due to uprobe_perf_close() using * hw.target. */ if (event->hw.target) put_task_struct(event->hw.target); if (event->pmu_ctx) put_pmu_ctx(event->pmu_ctx); /* * perf_event_free_task() relies on put_ctx() being 'last', in particular * all task references must be cleaned up. */ if (event->ctx) put_ctx(event->ctx); exclusive_event_destroy(event); module_put(event->pmu->module); call_rcu(&event->rcu_head, free_event_rcu); } /* * Used to free events which have a known refcount of 1, such as in error paths * where the event isn't exposed yet and inherited events. */ static void free_event(struct perf_event *event) { if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1, "unexpected event refcount: %ld; ptr=%p\n", atomic_long_read(&event->refcount), event)) { /* leak to avoid use-after-free */ return; } _free_event(event); } /* * Remove user event from the owner task. */ static void perf_remove_from_owner(struct perf_event *event) { struct task_struct *owner; rcu_read_lock(); /* * Matches the smp_store_release() in perf_event_exit_task(). If we * observe !owner it means the list deletion is complete and we can * indeed free this event, otherwise we need to serialize on * owner->perf_event_mutex. */ owner = READ_ONCE(event->owner); if (owner) { /* * Since delayed_put_task_struct() also drops the last * task reference we can safely take a new reference * while holding the rcu_read_lock(). */ get_task_struct(owner); } rcu_read_unlock(); if (owner) { /* * If we're here through perf_event_exit_task() we're already * holding ctx->mutex which would be an inversion wrt. the * normal lock order. * * However we can safely take this lock because its the child * ctx->mutex. */ mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING); /* * We have to re-check the event->owner field, if it is cleared * we raced with perf_event_exit_task(), acquiring the mutex * ensured they're done, and we can proceed with freeing the * event. */ if (event->owner) { list_del_init(&event->owner_entry); smp_store_release(&event->owner, NULL); } mutex_unlock(&owner->perf_event_mutex); put_task_struct(owner); } } static void put_event(struct perf_event *event) { if (!atomic_long_dec_and_test(&event->refcount)) return; _free_event(event); } /* * Kill an event dead; while event:refcount will preserve the event * object, it will not preserve its functionality. Once the last 'user' * gives up the object, we'll destroy the thing. */ int perf_event_release_kernel(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct perf_event *child, *tmp; LIST_HEAD(free_list); /* * If we got here through err_alloc: free_event(event); we will not * have attached to a context yet. */ if (!ctx) { WARN_ON_ONCE(event->attach_state & (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP)); goto no_ctx; } if (!is_kernel_event(event)) perf_remove_from_owner(event); ctx = perf_event_ctx_lock(event); WARN_ON_ONCE(ctx->parent_ctx); /* * Mark this event as STATE_DEAD, there is no external reference to it * anymore. * * Anybody acquiring event->child_mutex after the below loop _must_ * also see this, most importantly inherit_event() which will avoid * placing more children on the list. * * Thus this guarantees that we will in fact observe and kill _ALL_ * child events. */ perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD); perf_event_ctx_unlock(event, ctx); again: mutex_lock(&event->child_mutex); list_for_each_entry(child, &event->child_list, child_list) { /* * Cannot change, child events are not migrated, see the * comment with perf_event_ctx_lock_nested(). */ ctx = READ_ONCE(child->ctx); /* * Since child_mutex nests inside ctx::mutex, we must jump * through hoops. We start by grabbing a reference on the ctx. * * Since the event cannot get freed while we hold the * child_mutex, the context must also exist and have a !0 * reference count. */ get_ctx(ctx); /* * Now that we have a ctx ref, we can drop child_mutex, and * acquire ctx::mutex without fear of it going away. Then we * can re-acquire child_mutex. */ mutex_unlock(&event->child_mutex); mutex_lock(&ctx->mutex); mutex_lock(&event->child_mutex); /* * Now that we hold ctx::mutex and child_mutex, revalidate our * state, if child is still the first entry, it didn't get freed * and we can continue doing so. */ tmp = list_first_entry_or_null(&event->child_list, struct perf_event, child_list); if (tmp == child) { perf_remove_from_context(child, DETACH_GROUP); list_move(&child->child_list, &free_list); /* * This matches the refcount bump in inherit_event(); * this can't be the last reference. */ put_event(event); } mutex_unlock(&event->child_mutex); mutex_unlock(&ctx->mutex); put_ctx(ctx); goto again; } mutex_unlock(&event->child_mutex); list_for_each_entry_safe(child, tmp, &free_list, child_list) { void *var = &child->ctx->refcount; list_del(&child->child_list); free_event(child); /* * Wake any perf_event_free_task() waiting for this event to be * freed. */ smp_mb(); /* pairs with wait_var_event() */ wake_up_var(var); } no_ctx: put_event(event); /* Must be the 'last' reference */ return 0; } EXPORT_SYMBOL_GPL(perf_event_release_kernel); /* * Called when the last reference to the file is gone. */ static int perf_release(struct inode *inode, struct file *file) { perf_event_release_kernel(file->private_data); return 0; } static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) { struct perf_event *child; u64 total = 0; *enabled = 0; *running = 0; mutex_lock(&event->child_mutex); (void)perf_event_read(event, false); total += perf_event_count(event); *enabled += event->total_time_enabled + atomic64_read(&event->child_total_time_enabled); *running += event->total_time_running + atomic64_read(&event->child_total_time_running); list_for_each_entry(child, &event->child_list, child_list) { (void)perf_event_read(child, false); total += perf_event_count(child); *enabled += child->total_time_enabled; *running += child->total_time_running; } mutex_unlock(&event->child_mutex); return total; } u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) { struct perf_event_context *ctx; u64 count; ctx = perf_event_ctx_lock(event); count = __perf_event_read_value(event, enabled, running); perf_event_ctx_unlock(event, ctx); return count; } EXPORT_SYMBOL_GPL(perf_event_read_value); static int __perf_read_group_add(struct perf_event *leader, u64 read_format, u64 *values) { struct perf_event_context *ctx = leader->ctx; struct perf_event *sub; unsigned long flags; int n = 1; /* skip @nr */ int ret; ret = perf_event_read(leader, true); if (ret) return ret; raw_spin_lock_irqsave(&ctx->lock, flags); /* * Since we co-schedule groups, {enabled,running} times of siblings * will be identical to those of the leader, so we only publish one * set. */ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { values[n++] += leader->total_time_enabled + atomic64_read(&leader->child_total_time_enabled); } if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { values[n++] += leader->total_time_running + atomic64_read(&leader->child_total_time_running); } /* * Write {count,id} tuples for every sibling. */ values[n++] += perf_event_count(leader); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&leader->lost_samples); for_each_sibling_event(sub, leader) { values[n++] += perf_event_count(sub); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&sub->lost_samples); } raw_spin_unlock_irqrestore(&ctx->lock, flags); return 0; } static int perf_read_group(struct perf_event *event, u64 read_format, char __user *buf) { struct perf_event *leader = event->group_leader, *child; struct perf_event_context *ctx = leader->ctx; int ret; u64 *values; lockdep_assert_held(&ctx->mutex); values = kzalloc(event->read_size, GFP_KERNEL); if (!values) return -ENOMEM; values[0] = 1 + leader->nr_siblings; /* * By locking the child_mutex of the leader we effectively * lock the child list of all siblings.. XXX explain how. */ mutex_lock(&leader->child_mutex); ret = __perf_read_group_add(leader, read_format, values); if (ret) goto unlock; list_for_each_entry(child, &leader->child_list, child_list) { ret = __perf_read_group_add(child, read_format, values); if (ret) goto unlock; } mutex_unlock(&leader->child_mutex); ret = event->read_size; if (copy_to_user(buf, values, event->read_size)) ret = -EFAULT; goto out; unlock: mutex_unlock(&leader->child_mutex); out: kfree(values); return ret; } static int perf_read_one(struct perf_event *event, u64 read_format, char __user *buf) { u64 enabled, running; u64 values[5]; int n = 0; values[n++] = __perf_event_read_value(event, &enabled, &running); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) values[n++] = enabled; if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = running; if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(event); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&event->lost_samples); if (copy_to_user(buf, values, n * sizeof(u64))) return -EFAULT; return n * sizeof(u64); } static bool is_event_hup(struct perf_event *event) { bool no_children; if (event->state > PERF_EVENT_STATE_EXIT) return false; mutex_lock(&event->child_mutex); no_children = list_empty(&event->child_list); mutex_unlock(&event->child_mutex); return no_children; } /* * Read the performance event - simple non blocking version for now */ static ssize_t __perf_read(struct perf_event *event, char __user *buf, size_t count) { u64 read_format = event->attr.read_format; int ret; /* * Return end-of-file for a read on an event that is in * error state (i.e. because it was pinned but it couldn't be * scheduled on to the CPU at some point). */ if (event->state == PERF_EVENT_STATE_ERROR) return 0; if (count < event->read_size) return -ENOSPC; WARN_ON_ONCE(event->ctx->parent_ctx); if (read_format & PERF_FORMAT_GROUP) ret = perf_read_group(event, read_format, buf); else ret = perf_read_one(event, read_format, buf); return ret; } static ssize_t perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct perf_event *event = file->private_data; struct perf_event_context *ctx; int ret; ret = security_perf_event_read(event); if (ret) return ret; ctx = perf_event_ctx_lock(event); ret = __perf_read(event, buf, count); perf_event_ctx_unlock(event, ctx); return ret; } static __poll_t perf_poll(struct file *file, poll_table *wait) { struct perf_event *event = file->private_data; struct perf_buffer *rb; __poll_t events = EPOLLHUP; poll_wait(file, &event->waitq, wait); if (is_event_hup(event)) return events; /* * Pin the event->rb by taking event->mmap_mutex; otherwise * perf_event_set_output() can swizzle our rb and make us miss wakeups. */ mutex_lock(&event->mmap_mutex); rb = event->rb; if (rb) events = atomic_xchg(&rb->poll, 0); mutex_unlock(&event->mmap_mutex); return events; } static void _perf_event_reset(struct perf_event *event) { (void)perf_event_read(event, false); local64_set(&event->count, 0); perf_event_update_userpage(event); } /* Assume it's not an event with inherit set. */ u64 perf_event_pause(struct perf_event *event, bool reset) { struct perf_event_context *ctx; u64 count; ctx = perf_event_ctx_lock(event); WARN_ON_ONCE(event->attr.inherit); _perf_event_disable(event); count = local64_read(&event->count); if (reset) local64_set(&event->count, 0); perf_event_ctx_unlock(event, ctx); return count; } EXPORT_SYMBOL_GPL(perf_event_pause); /* * Holding the top-level event's child_mutex means that any * descendant process that has inherited this event will block * in perf_event_exit_event() if it goes to exit, thus satisfying the * task existence requirements of perf_event_enable/disable. */ static void perf_event_for_each_child(struct perf_event *event, void (*func)(struct perf_event *)) { struct perf_event *child; WARN_ON_ONCE(event->ctx->parent_ctx); mutex_lock(&event->child_mutex); func(event); list_for_each_entry(child, &event->child_list, child_list) func(child); mutex_unlock(&event->child_mutex); } static void perf_event_for_each(struct perf_event *event, void (*func)(struct perf_event *)) { struct perf_event_context *ctx = event->ctx; struct perf_event *sibling; lockdep_assert_held(&ctx->mutex); event = event->group_leader; perf_event_for_each_child(event, func); for_each_sibling_event(sibling, event) perf_event_for_each_child(sibling, func); } static void __perf_event_period(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, void *info) { u64 value = *((u64 *)info); bool active; if (event->attr.freq) { event->attr.sample_freq = value; } else { event->attr.sample_period = value; event->hw.sample_period = value; } active = (event->state == PERF_EVENT_STATE_ACTIVE); if (active) { perf_pmu_disable(event->pmu); /* * We could be throttled; unthrottle now to avoid the tick * trying to unthrottle while we already re-started the event. */ if (event->hw.interrupts == MAX_INTERRUPTS) { event->hw.interrupts = 0; perf_log_throttle(event, 1); } event->pmu->stop(event, PERF_EF_UPDATE); } local64_set(&event->hw.period_left, 0); if (active) { event->pmu->start(event, PERF_EF_RELOAD); perf_pmu_enable(event->pmu); } } static int perf_event_check_period(struct perf_event *event, u64 value) { return event->pmu->check_period(event, value); } static int _perf_event_period(struct perf_event *event, u64 value) { if (!is_sampling_event(event)) return -EINVAL; if (!value) return -EINVAL; if (event->attr.freq && value > sysctl_perf_event_sample_rate) return -EINVAL; if (perf_event_check_period(event, value)) return -EINVAL; if (!event->attr.freq && (value & (1ULL << 63))) return -EINVAL; event_function_call(event, __perf_event_period, &value); return 0; } int perf_event_period(struct perf_event *event, u64 value) { struct perf_event_context *ctx; int ret; ctx = perf_event_ctx_lock(event); ret = _perf_event_period(event, value); perf_event_ctx_unlock(event, ctx); return ret; } EXPORT_SYMBOL_GPL(perf_event_period); static const struct file_operations perf_fops; static inline int perf_fget_light(int fd, struct fd *p) { struct fd f = fdget(fd); if (!f.file) return -EBADF; if (f.file->f_op != &perf_fops) { fdput(f); return -EBADF; } *p = f; return 0; } static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); static int perf_copy_attr(struct perf_event_attr __user *uattr, struct perf_event_attr *attr); static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) { void (*func)(struct perf_event *); u32 flags = arg; switch (cmd) { case PERF_EVENT_IOC_ENABLE: func = _perf_event_enable; break; case PERF_EVENT_IOC_DISABLE: func = _perf_event_disable; break; case PERF_EVENT_IOC_RESET: func = _perf_event_reset; break; case PERF_EVENT_IOC_REFRESH: return _perf_event_refresh(event, arg); case PERF_EVENT_IOC_PERIOD: { u64 value; if (copy_from_user(&value, (u64 __user *)arg, sizeof(value))) return -EFAULT; return _perf_event_period(event, value); } case PERF_EVENT_IOC_ID: { u64 id = primary_event_id(event); if (copy_to_user((void __user *)arg, &id, sizeof(id))) return -EFAULT; return 0; } case PERF_EVENT_IOC_SET_OUTPUT: { int ret; if (arg != -1) { struct perf_event *output_event; struct fd output; ret = perf_fget_light(arg, &output); if (ret) return ret; output_event = output.file->private_data; ret = perf_event_set_output(event, output_event); fdput(output); } else { ret = perf_event_set_output(event, NULL); } return ret; } case PERF_EVENT_IOC_SET_FILTER: return perf_event_set_filter(event, (void __user *)arg); case PERF_EVENT_IOC_SET_BPF: { struct bpf_prog *prog; int err; prog = bpf_prog_get(arg); if (IS_ERR(prog)) return PTR_ERR(prog); err = perf_event_set_bpf_prog(event, prog, 0); if (err) { bpf_prog_put(prog); return err; } return 0; } case PERF_EVENT_IOC_PAUSE_OUTPUT: { struct perf_buffer *rb; rcu_read_lock(); rb = rcu_dereference(event->rb); if (!rb || !rb->nr_pages) { rcu_read_unlock(); return -EINVAL; } rb_toggle_paused(rb, !!arg); rcu_read_unlock(); return 0; } case PERF_EVENT_IOC_QUERY_BPF: return perf_event_query_prog_array(event, (void __user *)arg); case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: { struct perf_event_attr new_attr; int err = perf_copy_attr((struct perf_event_attr __user *)arg, &new_attr); if (err) return err; return perf_event_modify_attr(event, &new_attr); } default: return -ENOTTY; } if (flags & PERF_IOC_FLAG_GROUP) perf_event_for_each(event, func); else perf_event_for_each_child(event, func); return 0; } static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct perf_event *event = file->private_data; struct perf_event_context *ctx; long ret; /* Treat ioctl like writes as it is likely a mutating operation. */ ret = security_perf_event_write(event); if (ret) return ret; ctx = perf_event_ctx_lock(event); ret = _perf_ioctl(event, cmd, arg); perf_event_ctx_unlock(event, ctx); return ret; } #ifdef CONFIG_COMPAT static long perf_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { switch (_IOC_NR(cmd)) { case _IOC_NR(PERF_EVENT_IOC_SET_FILTER): case _IOC_NR(PERF_EVENT_IOC_ID): case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF): case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES): /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */ if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) { cmd &= ~IOCSIZE_MASK; cmd |= sizeof(void *) << IOCSIZE_SHIFT; } break; } return perf_ioctl(file, cmd, arg); } #else # define perf_compat_ioctl NULL #endif int perf_event_task_enable(void) { struct perf_event_context *ctx; struct perf_event *event; mutex_lock(¤t->perf_event_mutex); list_for_each_entry(event, ¤t->perf_event_list, owner_entry) { ctx = perf_event_ctx_lock(event); perf_event_for_each_child(event, _perf_event_enable); perf_event_ctx_unlock(event, ctx); } mutex_unlock(¤t->perf_event_mutex); return 0; } int perf_event_task_disable(void) { struct perf_event_context *ctx; struct perf_event *event; mutex_lock(¤t->perf_event_mutex); list_for_each_entry(event, ¤t->perf_event_list, owner_entry) { ctx = perf_event_ctx_lock(event); perf_event_for_each_child(event, _perf_event_disable); perf_event_ctx_unlock(event, ctx); } mutex_unlock(¤t->perf_event_mutex); return 0; } static int perf_event_index(struct perf_event *event) { if (event->hw.state & PERF_HES_STOPPED) return 0; if (event->state != PERF_EVENT_STATE_ACTIVE) return 0; return event->pmu->event_idx(event); } static void perf_event_init_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; struct perf_buffer *rb; rcu_read_lock(); rb = rcu_dereference(event->rb); if (!rb) goto unlock; userpg = rb->user_page; /* Allow new userspace to detect that bit 0 is deprecated */ userpg->cap_bit0_is_deprecated = 1; userpg->size = offsetof(struct perf_event_mmap_page, __reserved); userpg->data_offset = PAGE_SIZE; userpg->data_size = perf_data_size(rb); unlock: rcu_read_unlock(); } void __weak arch_perf_update_userpage( struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now) { } /* * Callers need to ensure there can be no nesting of this function, otherwise * the seqlock logic goes bad. We can not serialize this because the arch * code calls this from NMI context. */ void perf_event_update_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; struct perf_buffer *rb; u64 enabled, running, now; rcu_read_lock(); rb = rcu_dereference(event->rb); if (!rb) goto unlock; /* * compute total_time_enabled, total_time_running * based on snapshot values taken when the event * was last scheduled in. * * we cannot simply called update_context_time() * because of locking issue as we can be called in * NMI context */ calc_timer_values(event, &now, &enabled, &running); userpg = rb->user_page; /* * Disable preemption to guarantee consistent time stamps are stored to * the user page. */ preempt_disable(); ++userpg->lock; barrier(); userpg->index = perf_event_index(event); userpg->offset = perf_event_count(event); if (userpg->index) userpg->offset -= local64_read(&event->hw.prev_count); userpg->time_enabled = enabled + atomic64_read(&event->child_total_time_enabled); userpg->time_running = running + atomic64_read(&event->child_total_time_running); arch_perf_update_userpage(event, userpg, now); barrier(); ++userpg->lock; preempt_enable(); unlock: rcu_read_unlock(); } EXPORT_SYMBOL_GPL(perf_event_update_userpage); static vm_fault_t perf_mmap_fault(struct vm_fault *vmf) { struct perf_event *event = vmf->vma->vm_file->private_data; struct perf_buffer *rb; vm_fault_t ret = VM_FAULT_SIGBUS; if (vmf->flags & FAULT_FLAG_MKWRITE) { if (vmf->pgoff == 0) ret = 0; return ret; } rcu_read_lock(); rb = rcu_dereference(event->rb); if (!rb) goto unlock; if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) goto unlock; vmf->page = perf_mmap_to_page(rb, vmf->pgoff); if (!vmf->page) goto unlock; get_page(vmf->page); vmf->page->mapping = vmf->vma->vm_file->f_mapping; vmf->page->index = vmf->pgoff; ret = 0; unlock: rcu_read_unlock(); return ret; } static void ring_buffer_attach(struct perf_event *event, struct perf_buffer *rb) { struct perf_buffer *old_rb = NULL; unsigned long flags; WARN_ON_ONCE(event->parent); if (event->rb) { /* * Should be impossible, we set this when removing * event->rb_entry and wait/clear when adding event->rb_entry. */ WARN_ON_ONCE(event->rcu_pending); old_rb = event->rb; spin_lock_irqsave(&old_rb->event_lock, flags); list_del_rcu(&event->rb_entry); spin_unlock_irqrestore(&old_rb->event_lock, flags); event->rcu_batches = get_state_synchronize_rcu(); event->rcu_pending = 1; } if (rb) { if (event->rcu_pending) { cond_synchronize_rcu(event->rcu_batches); event->rcu_pending = 0; } spin_lock_irqsave(&rb->event_lock, flags); list_add_rcu(&event->rb_entry, &rb->event_list); spin_unlock_irqrestore(&rb->event_lock, flags); } /* * Avoid racing with perf_mmap_close(AUX): stop the event * before swizzling the event::rb pointer; if it's getting * unmapped, its aux_mmap_count will be 0 and it won't * restart. See the comment in __perf_pmu_output_stop(). * * Data will inevitably be lost when set_output is done in * mid-air, but then again, whoever does it like this is * not in for the data anyway. */ if (has_aux(event)) perf_event_stop(event, 0); rcu_assign_pointer(event->rb, rb); if (old_rb) { ring_buffer_put(old_rb); /* * Since we detached before setting the new rb, so that we * could attach the new rb, we could have missed a wakeup. * Provide it now. */ wake_up_all(&event->waitq); } } static void ring_buffer_wakeup(struct perf_event *event) { struct perf_buffer *rb; if (event->parent) event = event->parent; rcu_read_lock(); rb = rcu_dereference(event->rb); if (rb) { list_for_each_entry_rcu(event, &rb->event_list, rb_entry) wake_up_all(&event->waitq); } rcu_read_unlock(); } struct perf_buffer *ring_buffer_get(struct perf_event *event) { struct perf_buffer *rb; if (event->parent) event = event->parent; rcu_read_lock(); rb = rcu_dereference(event->rb); if (rb) { if (!refcount_inc_not_zero(&rb->refcount)) rb = NULL; } rcu_read_unlock(); return rb; } void ring_buffer_put(struct perf_buffer *rb) { if (!refcount_dec_and_test(&rb->refcount)) return; WARN_ON_ONCE(!list_empty(&rb->event_list)); call_rcu(&rb->rcu_head, rb_free_rcu); } static void perf_mmap_open(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; atomic_inc(&event->mmap_count); atomic_inc(&event->rb->mmap_count); if (vma->vm_pgoff) atomic_inc(&event->rb->aux_mmap_count); if (event->pmu->event_mapped) event->pmu->event_mapped(event, vma->vm_mm); } static void perf_pmu_output_stop(struct perf_event *event); /* * A buffer can be mmap()ed multiple times; either directly through the same * event, or through other events by use of perf_event_set_output(). * * In order to undo the VM accounting done by perf_mmap() we need to destroy * the buffer here, where we still have a VM context. This means we need * to detach all events redirecting to us. */ static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; struct perf_buffer *rb = ring_buffer_get(event); struct user_struct *mmap_user = rb->mmap_user; int mmap_locked = rb->mmap_locked; unsigned long size = perf_data_size(rb); bool detach_rest = false; if (event->pmu->event_unmapped) event->pmu->event_unmapped(event, vma->vm_mm); /* * rb->aux_mmap_count will always drop before rb->mmap_count and * event->mmap_count, so it is ok to use event->mmap_mutex to * serialize with perf_mmap here. */ if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) { /* * Stop all AUX events that are writing to this buffer, * so that we can free its AUX pages and corresponding PMU * data. Note that after rb::aux_mmap_count dropped to zero, * they won't start any more (see perf_aux_output_begin()). */ perf_pmu_output_stop(event); /* now it's safe to free the pages */ atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm); atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm); /* this has to be the last one */ rb_free_aux(rb); WARN_ON_ONCE(refcount_read(&rb->aux_refcount)); mutex_unlock(&event->mmap_mutex); } if (atomic_dec_and_test(&rb->mmap_count)) detach_rest = true; if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) goto out_put; ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); /* If there's still other mmap()s of this buffer, we're done. */ if (!detach_rest) goto out_put; /* * No other mmap()s, detach from all other events that might redirect * into the now unreachable buffer. Somewhat complicated by the * fact that rb::event_lock otherwise nests inside mmap_mutex. */ again: rcu_read_lock(); list_for_each_entry_rcu(event, &rb->event_list, rb_entry) { if (!atomic_long_inc_not_zero(&event->refcount)) { /* * This event is en-route to free_event() which will * detach it and remove it from the list. */ continue; } rcu_read_unlock(); mutex_lock(&event->mmap_mutex); /* * Check we didn't race with perf_event_set_output() which can * swizzle the rb from under us while we were waiting to * acquire mmap_mutex. * * If we find a different rb; ignore this event, a next * iteration will no longer find it on the list. We have to * still restart the iteration to make sure we're not now * iterating the wrong list. */ if (event->rb == rb) ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); put_event(event); /* * Restart the iteration; either we're on the wrong list or * destroyed its integrity by doing a deletion. */ goto again; } rcu_read_unlock(); /* * It could be there's still a few 0-ref events on the list; they'll * get cleaned up by free_event() -- they'll also still have their * ref on the rb and will free it whenever they are done with it. * * Aside from that, this buffer is 'fully' detached and unmapped, * undo the VM accounting. */ atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked, &mmap_user->locked_vm); atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm); free_uid(mmap_user); out_put: ring_buffer_put(rb); /* could be last */ } static const struct vm_operations_struct perf_mmap_vmops = { .open = perf_mmap_open, .close = perf_mmap_close, /* non mergeable */ .fault = perf_mmap_fault, .page_mkwrite = perf_mmap_fault, }; static int perf_mmap(struct file *file, struct vm_area_struct *vma) { struct perf_event *event = file->private_data; unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); struct perf_buffer *rb = NULL; unsigned long locked, lock_limit; unsigned long vma_size; unsigned long nr_pages; long user_extra = 0, extra = 0; int ret = 0, flags = 0; /* * Don't allow mmap() of inherited per-task counters. This would * create a performance issue due to all children writing to the * same rb. */ if (event->cpu == -1 && event->attr.inherit) return -EINVAL; if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; ret = security_perf_event_read(event); if (ret) return ret; vma_size = vma->vm_end - vma->vm_start; if (vma->vm_pgoff == 0) { nr_pages = (vma_size / PAGE_SIZE) - 1; } else { /* * AUX area mapping: if rb->aux_nr_pages != 0, it's already * mapped, all subsequent mappings should have the same size * and offset. Must be above the normal perf buffer. */ u64 aux_offset, aux_size; if (!event->rb) return -EINVAL; nr_pages = vma_size / PAGE_SIZE; mutex_lock(&event->mmap_mutex); ret = -EINVAL; rb = event->rb; if (!rb) goto aux_unlock; aux_offset = READ_ONCE(rb->user_page->aux_offset); aux_size = READ_ONCE(rb->user_page->aux_size); if (aux_offset < perf_data_size(rb) + PAGE_SIZE) goto aux_unlock; if (aux_offset != vma->vm_pgoff << PAGE_SHIFT) goto aux_unlock; /* already mapped with a different offset */ if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) goto aux_unlock; if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE) goto aux_unlock; /* already mapped with a different size */ if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages) goto aux_unlock; if (!is_power_of_2(nr_pages)) goto aux_unlock; if (!atomic_inc_not_zero(&rb->mmap_count)) goto aux_unlock; if (rb_has_aux(rb)) { atomic_inc(&rb->aux_mmap_count); ret = 0; goto unlock; } atomic_set(&rb->aux_mmap_count, 1); user_extra = nr_pages; goto accounting; } /* * If we have rb pages ensure they're a power-of-two number, so we * can do bitmasks instead of modulo. */ if (nr_pages != 0 && !is_power_of_2(nr_pages)) return -EINVAL; if (vma_size != PAGE_SIZE * (1 + nr_pages)) return -EINVAL; WARN_ON_ONCE(event->ctx->parent_ctx); again: mutex_lock(&event->mmap_mutex); if (event->rb) { if (data_page_nr(event->rb) != nr_pages) { ret = -EINVAL; goto unlock; } if (!atomic_inc_not_zero(&event->rb->mmap_count)) { /* * Raced against perf_mmap_close(); remove the * event and try again. */ ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); goto again; } goto unlock; } user_extra = nr_pages + 1; accounting: user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); /* * Increase the limit linearly with more CPUs: */ user_lock_limit *= num_online_cpus(); user_locked = atomic_long_read(&user->locked_vm); /* * sysctl_perf_event_mlock may have changed, so that * user->locked_vm > user_lock_limit */ if (user_locked > user_lock_limit) user_locked = user_lock_limit; user_locked += user_extra; if (user_locked > user_lock_limit) { /* * charge locked_vm until it hits user_lock_limit; * charge the rest from pinned_vm */ extra = user_locked - user_lock_limit; user_extra -= extra; } lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra; if ((locked > lock_limit) && perf_is_paranoid() && !capable(CAP_IPC_LOCK)) { ret = -EPERM; goto unlock; } WARN_ON(!rb && event->rb); if (vma->vm_flags & VM_WRITE) flags |= RING_BUFFER_WRITABLE; if (!rb) { rb = rb_alloc(nr_pages, event->attr.watermark ? event->attr.wakeup_watermark : 0, event->cpu, flags); if (!rb) { ret = -ENOMEM; goto unlock; } atomic_set(&rb->mmap_count, 1); rb->mmap_user = get_current_user(); rb->mmap_locked = extra; ring_buffer_attach(event, rb); perf_event_update_time(event); perf_event_init_userpage(event); perf_event_update_userpage(event); } else { ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, event->attr.aux_watermark, flags); if (!ret) rb->aux_mmap_locked = extra; } unlock: if (!ret) { atomic_long_add(user_extra, &user->locked_vm); atomic64_add(extra, &vma->vm_mm->pinned_vm); atomic_inc(&event->mmap_count); } else if (rb) { atomic_dec(&rb->mmap_count); } aux_unlock: mutex_unlock(&event->mmap_mutex); /* * Since pinned accounting is per vm we cannot allow fork() to copy our * vma. */ vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &perf_mmap_vmops; if (event->pmu->event_mapped) event->pmu->event_mapped(event, vma->vm_mm); return ret; } static int perf_fasync(int fd, struct file *filp, int on) { struct inode *inode = file_inode(filp); struct perf_event *event = filp->private_data; int retval; inode_lock(inode); retval = fasync_helper(fd, filp, on, &event->fasync); inode_unlock(inode); if (retval < 0) return retval; return 0; } static const struct file_operations perf_fops = { .llseek = no_llseek, .release = perf_release, .read = perf_read, .poll = perf_poll, .unlocked_ioctl = perf_ioctl, .compat_ioctl = perf_compat_ioctl, .mmap = perf_mmap, .fasync = perf_fasync, }; /* * Perf event wakeup * * If there's data, ensure we set the poll() state and publish everything * to user-space before waking everybody up. */ static inline struct fasync_struct **perf_event_fasync(struct perf_event *event) { /* only the parent has fasync state */ if (event->parent) event = event->parent; return &event->fasync; } void perf_event_wakeup(struct perf_event *event) { ring_buffer_wakeup(event); if (event->pending_kill) { kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill); event->pending_kill = 0; } } static void perf_sigtrap(struct perf_event *event) { /* * We'd expect this to only occur if the irq_work is delayed and either * ctx->task or current has changed in the meantime. This can be the * case on architectures that do not implement arch_irq_work_raise(). */ if (WARN_ON_ONCE(event->ctx->task != current)) return; /* * Both perf_pending_task() and perf_pending_irq() can race with the * task exiting. */ if (current->flags & PF_EXITING) return; send_sig_perf((void __user *)event->pending_addr, event->orig_type, event->attr.sig_data); } /* * Deliver the pending work in-event-context or follow the context. */ static void __perf_pending_irq(struct perf_event *event) { int cpu = READ_ONCE(event->oncpu); /* * If the event isn't running; we done. event_sched_out() will have * taken care of things. */ if (cpu < 0) return; /* * Yay, we hit home and are in the context of the event. */ if (cpu == smp_processor_id()) { if (event->pending_sigtrap) { event->pending_sigtrap = 0; perf_sigtrap(event); local_dec(&event->ctx->nr_pending); } if (event->pending_disable) { event->pending_disable = 0; perf_event_disable_local(event); } return; } /* * CPU-A CPU-B * * perf_event_disable_inatomic() * @pending_disable = CPU-A; * irq_work_queue(); * * sched-out * @pending_disable = -1; * * sched-in * perf_event_disable_inatomic() * @pending_disable = CPU-B; * irq_work_queue(); // FAILS * * irq_work_run() * perf_pending_irq() * * But the event runs on CPU-B and wants disabling there. */ irq_work_queue_on(&event->pending_irq, cpu); } static void perf_pending_irq(struct irq_work *entry) { struct perf_event *event = container_of(entry, struct perf_event, pending_irq); int rctx; /* * If we 'fail' here, that's OK, it means recursion is already disabled * and we won't recurse 'further'. */ rctx = perf_swevent_get_recursion_context(); /* * The wakeup isn't bound to the context of the event -- it can happen * irrespective of where the event is. */ if (event->pending_wakeup) { event->pending_wakeup = 0; perf_event_wakeup(event); } __perf_pending_irq(event); if (rctx >= 0) perf_swevent_put_recursion_context(rctx); } static void perf_pending_task(struct callback_head *head) { struct perf_event *event = container_of(head, struct perf_event, pending_task); int rctx; /* * If we 'fail' here, that's OK, it means recursion is already disabled * and we won't recurse 'further'. */ preempt_disable_notrace(); rctx = perf_swevent_get_recursion_context(); if (event->pending_work) { event->pending_work = 0; perf_sigtrap(event); local_dec(&event->ctx->nr_pending); } if (rctx >= 0) perf_swevent_put_recursion_context(rctx); preempt_enable_notrace(); put_event(event); } #ifdef CONFIG_GUEST_PERF_EVENTS struct perf_guest_info_callbacks __rcu *perf_guest_cbs; DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state); DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip); DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs))) return; rcu_assign_pointer(perf_guest_cbs, cbs); static_call_update(__perf_guest_state, cbs->state); static_call_update(__perf_guest_get_ip, cbs->get_ip); /* Implementing ->handle_intel_pt_intr is optional. */ if (cbs->handle_intel_pt_intr) static_call_update(__perf_guest_handle_intel_pt_intr, cbs->handle_intel_pt_intr); } EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs)) return; rcu_assign_pointer(perf_guest_cbs, NULL); static_call_update(__perf_guest_state, (void *)&__static_call_return0); static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0); static_call_update(__perf_guest_handle_intel_pt_intr, (void *)&__static_call_return0); synchronize_rcu(); } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); #endif static void perf_output_sample_regs(struct perf_output_handle *handle, struct pt_regs *regs, u64 mask) { int bit; DECLARE_BITMAP(_mask, 64); bitmap_from_u64(_mask, mask); for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) { u64 val; val = perf_reg_value(regs, bit); perf_output_put(handle, val); } } static void perf_sample_regs_user(struct perf_regs *regs_user, struct pt_regs *regs) { if (user_mode(regs)) { regs_user->abi = perf_reg_abi(current); regs_user->regs = regs; } else if (!(current->flags & PF_KTHREAD)) { perf_get_regs_user(regs_user, regs); } else { regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; regs_user->regs = NULL; } } static void perf_sample_regs_intr(struct perf_regs *regs_intr, struct pt_regs *regs) { regs_intr->regs = regs; regs_intr->abi = perf_reg_abi(current); } /* * Get remaining task size from user stack pointer. * * It'd be better to take stack vma map and limit this more * precisely, but there's no way to get it safely under interrupt, * so using TASK_SIZE as limit. */ static u64 perf_ustack_task_size(struct pt_regs *regs) { unsigned long addr = perf_user_stack_pointer(regs); if (!addr || addr >= TASK_SIZE) return 0; return TASK_SIZE - addr; } static u16 perf_sample_ustack_size(u16 stack_size, u16 header_size, struct pt_regs *regs) { u64 task_size; /* No regs, no stack pointer, no dump. */ if (!regs) return 0; /* * Check if we fit in with the requested stack size into the: * - TASK_SIZE * If we don't, we limit the size to the TASK_SIZE. * * - remaining sample size * If we don't, we customize the stack size to * fit in to the remaining sample size. */ task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs)); stack_size = min(stack_size, (u16) task_size); /* Current header size plus static size and dynamic size. */ header_size += 2 * sizeof(u64); /* Do we fit in with the current stack dump size? */ if ((u16) (header_size + stack_size) < header_size) { /* * If we overflow the maximum size for the sample, * we customize the stack dump size to fit in. */ stack_size = USHRT_MAX - header_size - sizeof(u64); stack_size = round_up(stack_size, sizeof(u64)); } return stack_size; } static void perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, struct pt_regs *regs) { /* Case of a kernel thread, nothing to dump */ if (!regs) { u64 size = 0; perf_output_put(handle, size); } else { unsigned long sp; unsigned int rem; u64 dyn_size; /* * We dump: * static size * - the size requested by user or the best one we can fit * in to the sample max size * data * - user stack dump data * dynamic size * - the actual dumped size */ /* Static size. */ perf_output_put(handle, dump_size); /* Data. */ sp = perf_user_stack_pointer(regs); rem = __output_copy_user(handle, (void *) sp, dump_size); dyn_size = dump_size - rem; perf_output_skip(handle, rem); /* Dynamic size. */ perf_output_put(handle, dyn_size); } } static unsigned long perf_prepare_sample_aux(struct perf_event *event, struct perf_sample_data *data, size_t size) { struct perf_event *sampler = event->aux_event; struct perf_buffer *rb; data->aux_size = 0; if (!sampler) goto out; if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE)) goto out; if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id())) goto out; rb = ring_buffer_get(sampler); if (!rb) goto out; /* * If this is an NMI hit inside sampling code, don't take * the sample. See also perf_aux_sample_output(). */ if (READ_ONCE(rb->aux_in_sampling)) { data->aux_size = 0; } else { size = min_t(size_t, size, perf_aux_size(rb)); data->aux_size = ALIGN(size, sizeof(u64)); } ring_buffer_put(rb); out: return data->aux_size; } static long perf_pmu_snapshot_aux(struct perf_buffer *rb, struct perf_event *event, struct perf_output_handle *handle, unsigned long size) { unsigned long flags; long ret; /* * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler * paths. If we start calling them in NMI context, they may race with * the IRQ ones, that is, for example, re-starting an event that's just * been stopped, which is why we're using a separate callback that * doesn't change the event state. * * IRQs need to be disabled to prevent IPIs from racing with us. */ local_irq_save(flags); /* * Guard against NMI hits inside the critical section; * see also perf_prepare_sample_aux(). */ WRITE_ONCE(rb->aux_in_sampling, 1); barrier(); ret = event->pmu->snapshot_aux(event, handle, size); barrier(); WRITE_ONCE(rb->aux_in_sampling, 0); local_irq_restore(flags); return ret; } static void perf_aux_sample_output(struct perf_event *event, struct perf_output_handle *handle, struct perf_sample_data *data) { struct perf_event *sampler = event->aux_event; struct perf_buffer *rb; unsigned long pad; long size; if (WARN_ON_ONCE(!sampler || !data->aux_size)) return; rb = ring_buffer_get(sampler); if (!rb) return; size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size); /* * An error here means that perf_output_copy() failed (returned a * non-zero surplus that it didn't copy), which in its current * enlightened implementation is not possible. If that changes, we'd * like to know. */ if (WARN_ON_ONCE(size < 0)) goto out_put; /* * The pad comes from ALIGN()ing data->aux_size up to u64 in * perf_prepare_sample_aux(), so should not be more than that. */ pad = data->aux_size - size; if (WARN_ON_ONCE(pad >= sizeof(u64))) pad = 8; if (pad) { u64 zero = 0; perf_output_copy(handle, &zero, pad); } out_put: ring_buffer_put(rb); } /* * A set of common sample data types saved even for non-sample records * when event->attr.sample_id_all is set. */ #define PERF_SAMPLE_ID_ALL (PERF_SAMPLE_TID | PERF_SAMPLE_TIME | \ PERF_SAMPLE_ID | PERF_SAMPLE_STREAM_ID | \ PERF_SAMPLE_CPU | PERF_SAMPLE_IDENTIFIER) static void __perf_event_header__init_id(struct perf_sample_data *data, struct perf_event *event, u64 sample_type) { data->type = event->attr.sample_type; data->sample_flags |= data->type & PERF_SAMPLE_ID_ALL; if (sample_type & PERF_SAMPLE_TID) { /* namespace issues */ data->tid_entry.pid = perf_event_pid(event, current); data->tid_entry.tid = perf_event_tid(event, current); } if (sample_type & PERF_SAMPLE_TIME) data->time = perf_event_clock(event); if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) data->id = primary_event_id(event); if (sample_type & PERF_SAMPLE_STREAM_ID) data->stream_id = event->id; if (sample_type & PERF_SAMPLE_CPU) { data->cpu_entry.cpu = raw_smp_processor_id(); data->cpu_entry.reserved = 0; } } void perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event) { if (event->attr.sample_id_all) { header->size += event->id_header_size; __perf_event_header__init_id(data, event, event->attr.sample_type); } } static void __perf_event__output_id_sample(struct perf_output_handle *handle, struct perf_sample_data *data) { u64 sample_type = data->type; if (sample_type & PERF_SAMPLE_TID) perf_output_put(handle, data->tid_entry); if (sample_type & PERF_SAMPLE_TIME) perf_output_put(handle, data->time); if (sample_type & PERF_SAMPLE_ID) perf_output_put(handle, data->id); if (sample_type & PERF_SAMPLE_STREAM_ID) perf_output_put(handle, data->stream_id); if (sample_type & PERF_SAMPLE_CPU) perf_output_put(handle, data->cpu_entry); if (sample_type & PERF_SAMPLE_IDENTIFIER) perf_output_put(handle, data->id); } void perf_event__output_id_sample(struct perf_event *event, struct perf_output_handle *handle, struct perf_sample_data *sample) { if (event->attr.sample_id_all) __perf_event__output_id_sample(handle, sample); } static void perf_output_read_one(struct perf_output_handle *handle, struct perf_event *event, u64 enabled, u64 running) { u64 read_format = event->attr.read_format; u64 values[5]; int n = 0; values[n++] = perf_event_count(event); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { values[n++] = enabled + atomic64_read(&event->child_total_time_enabled); } if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { values[n++] = running + atomic64_read(&event->child_total_time_running); } if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(event); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&event->lost_samples); __output_copy(handle, values, n * sizeof(u64)); } static void perf_output_read_group(struct perf_output_handle *handle, struct perf_event *event, u64 enabled, u64 running) { struct perf_event *leader = event->group_leader, *sub; u64 read_format = event->attr.read_format; unsigned long flags; u64 values[6]; int n = 0; /* * Disabling interrupts avoids all counter scheduling * (context switches, timer based rotation and IPIs). */ local_irq_save(flags); values[n++] = 1 + leader->nr_siblings; if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) values[n++] = enabled; if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = running; if ((leader != event) && (leader->state == PERF_EVENT_STATE_ACTIVE)) leader->pmu->read(leader); values[n++] = perf_event_count(leader); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&leader->lost_samples); __output_copy(handle, values, n * sizeof(u64)); for_each_sibling_event(sub, leader) { n = 0; if ((sub != event) && (sub->state == PERF_EVENT_STATE_ACTIVE)) sub->pmu->read(sub); values[n++] = perf_event_count(sub); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&sub->lost_samples); __output_copy(handle, values, n * sizeof(u64)); } local_irq_restore(flags); } #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ PERF_FORMAT_TOTAL_TIME_RUNNING) /* * XXX PERF_SAMPLE_READ vs inherited events seems difficult. * * The problem is that its both hard and excessively expensive to iterate the * child list, not to mention that its impossible to IPI the children running * on another CPU, from interrupt/NMI context. */ static void perf_output_read(struct perf_output_handle *handle, struct perf_event *event) { u64 enabled = 0, running = 0, now; u64 read_format = event->attr.read_format; /* * compute total_time_enabled, total_time_running * based on snapshot values taken when the event * was last scheduled in. * * we cannot simply called update_context_time() * because of locking issue as we are called in * NMI context */ if (read_format & PERF_FORMAT_TOTAL_TIMES) calc_timer_values(event, &now, &enabled, &running); if (event->attr.read_format & PERF_FORMAT_GROUP) perf_output_read_group(handle, event, enabled, running); else perf_output_read_one(handle, event, enabled, running); } void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event) { u64 sample_type = data->type; perf_output_put(handle, *header); if (sample_type & PERF_SAMPLE_IDENTIFIER) perf_output_put(handle, data->id); if (sample_type & PERF_SAMPLE_IP) perf_output_put(handle, data->ip); if (sample_type & PERF_SAMPLE_TID) perf_output_put(handle, data->tid_entry); if (sample_type & PERF_SAMPLE_TIME) perf_output_put(handle, data->time); if (sample_type & PERF_SAMPLE_ADDR) perf_output_put(handle, data->addr); if (sample_type & PERF_SAMPLE_ID) perf_output_put(handle, data->id); if (sample_type & PERF_SAMPLE_STREAM_ID) perf_output_put(handle, data->stream_id); if (sample_type & PERF_SAMPLE_CPU) perf_output_put(handle, data->cpu_entry); if (sample_type & PERF_SAMPLE_PERIOD) perf_output_put(handle, data->period); if (sample_type & PERF_SAMPLE_READ) perf_output_read(handle, event); if (sample_type & PERF_SAMPLE_CALLCHAIN) { int size = 1; size += data->callchain->nr; size *= sizeof(u64); __output_copy(handle, data->callchain, size); } if (sample_type & PERF_SAMPLE_RAW) { struct perf_raw_record *raw = data->raw; if (raw) { struct perf_raw_frag *frag = &raw->frag; perf_output_put(handle, raw->size); do { if (frag->copy) { __output_custom(handle, frag->copy, frag->data, frag->size); } else { __output_copy(handle, frag->data, frag->size); } if (perf_raw_frag_last(frag)) break; frag = frag->next; } while (1); if (frag->pad) __output_skip(handle, NULL, frag->pad); } else { struct { u32 size; u32 data; } raw = { .size = sizeof(u32), .data = 0, }; perf_output_put(handle, raw); } } if (sample_type & PERF_SAMPLE_BRANCH_STACK) { if (data->br_stack) { size_t size; size = data->br_stack->nr * sizeof(struct perf_branch_entry); perf_output_put(handle, data->br_stack->nr); if (branch_sample_hw_index(event)) perf_output_put(handle, data->br_stack->hw_idx); perf_output_copy(handle, data->br_stack->entries, size); } else { /* * we always store at least the value of nr */ u64 nr = 0; perf_output_put(handle, nr); } } if (sample_type & PERF_SAMPLE_REGS_USER) { u64 abi = data->regs_user.abi; /* * If there are no regs to dump, notice it through * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE). */ perf_output_put(handle, abi); if (abi) { u64 mask = event->attr.sample_regs_user; perf_output_sample_regs(handle, data->regs_user.regs, mask); } } if (sample_type & PERF_SAMPLE_STACK_USER) { perf_output_sample_ustack(handle, data->stack_user_size, data->regs_user.regs); } if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) perf_output_put(handle, data->weight.full); if (sample_type & PERF_SAMPLE_DATA_SRC) perf_output_put(handle, data->data_src.val); if (sample_type & PERF_SAMPLE_TRANSACTION) perf_output_put(handle, data->txn); if (sample_type & PERF_SAMPLE_REGS_INTR) { u64 abi = data->regs_intr.abi; /* * If there are no regs to dump, notice it through * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE). */ perf_output_put(handle, abi); if (abi) { u64 mask = event->attr.sample_regs_intr; perf_output_sample_regs(handle, data->regs_intr.regs, mask); } } if (sample_type & PERF_SAMPLE_PHYS_ADDR) perf_output_put(handle, data->phys_addr); if (sample_type & PERF_SAMPLE_CGROUP) perf_output_put(handle, data->cgroup); if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) perf_output_put(handle, data->data_page_size); if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) perf_output_put(handle, data->code_page_size); if (sample_type & PERF_SAMPLE_AUX) { perf_output_put(handle, data->aux_size); if (data->aux_size) perf_aux_sample_output(event, handle, data); } if (!event->attr.watermark) { int wakeup_events = event->attr.wakeup_events; if (wakeup_events) { struct perf_buffer *rb = handle->rb; int events = local_inc_return(&rb->events); if (events >= wakeup_events) { local_sub(wakeup_events, &rb->events); local_inc(&rb->wakeup); } } } } static u64 perf_virt_to_phys(u64 virt) { u64 phys_addr = 0; if (!virt) return 0; if (virt >= TASK_SIZE) { /* If it's vmalloc()d memory, leave phys_addr as 0 */ if (virt_addr_valid((void *)(uintptr_t)virt) && !(virt >= VMALLOC_START && virt < VMALLOC_END)) phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt); } else { /* * Walking the pages tables for user address. * Interrupts are disabled, so it prevents any tear down * of the page tables. * Try IRQ-safe get_user_page_fast_only first. * If failed, leave phys_addr as 0. */ if (current->mm != NULL) { struct page *p; pagefault_disable(); if (get_user_page_fast_only(virt, 0, &p)) { phys_addr = page_to_phys(p) + virt % PAGE_SIZE; put_page(p); } pagefault_enable(); } } return phys_addr; } /* * Return the pagetable size of a given virtual address. */ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr) { u64 size = 0; #ifdef CONFIG_HAVE_FAST_GUP pgd_t *pgdp, pgd; p4d_t *p4dp, p4d; pud_t *pudp, pud; pmd_t *pmdp, pmd; pte_t *ptep, pte; pgdp = pgd_offset(mm, addr); pgd = READ_ONCE(*pgdp); if (pgd_none(pgd)) return 0; if (pgd_leaf(pgd)) return pgd_leaf_size(pgd); p4dp = p4d_offset_lockless(pgdp, pgd, addr); p4d = READ_ONCE(*p4dp); if (!p4d_present(p4d)) return 0; if (p4d_leaf(p4d)) return p4d_leaf_size(p4d); pudp = pud_offset_lockless(p4dp, p4d, addr); pud = READ_ONCE(*pudp); if (!pud_present(pud)) return 0; if (pud_leaf(pud)) return pud_leaf_size(pud); pmdp = pmd_offset_lockless(pudp, pud, addr); again: pmd = pmdp_get_lockless(pmdp); if (!pmd_present(pmd)) return 0; if (pmd_leaf(pmd)) return pmd_leaf_size(pmd); ptep = pte_offset_map(&pmd, addr); if (!ptep) goto again; pte = ptep_get_lockless(ptep); if (pte_present(pte)) size = pte_leaf_size(pte); pte_unmap(ptep); #endif /* CONFIG_HAVE_FAST_GUP */ return size; } static u64 perf_get_page_size(unsigned long addr) { struct mm_struct *mm; unsigned long flags; u64 size; if (!addr) return 0; /* * Software page-table walkers must disable IRQs, * which prevents any tear down of the page tables. */ local_irq_save(flags); mm = current->mm; if (!mm) { /* * For kernel threads and the like, use init_mm so that * we can find kernel memory. */ mm = &init_mm; } size = perf_get_pgtable_size(mm, addr); local_irq_restore(flags); return size; } static struct perf_callchain_entry __empty_callchain = { .nr = 0, }; struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs) { bool kernel = !event->attr.exclude_callchain_kernel; bool user = !event->attr.exclude_callchain_user; /* Disallow cross-task user callchains. */ bool crosstask = event->ctx->task && event->ctx->task != current; const u32 max_stack = event->attr.sample_max_stack; struct perf_callchain_entry *callchain; if (!kernel && !user) return &__empty_callchain; callchain = get_perf_callchain(regs, 0, kernel, user, max_stack, crosstask, true); return callchain ?: &__empty_callchain; } static __always_inline u64 __cond_set(u64 flags, u64 s, u64 d) { return d * !!(flags & s); } void perf_prepare_sample(struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs) { u64 sample_type = event->attr.sample_type; u64 filtered_sample_type; /* * Add the sample flags that are dependent to others. And clear the * sample flags that have already been done by the PMU driver. */ filtered_sample_type = sample_type; filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_CODE_PAGE_SIZE, PERF_SAMPLE_IP); filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_DATA_PAGE_SIZE | PERF_SAMPLE_PHYS_ADDR, PERF_SAMPLE_ADDR); filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_STACK_USER, PERF_SAMPLE_REGS_USER); filtered_sample_type &= ~data->sample_flags; if (filtered_sample_type == 0) { /* Make sure it has the correct data->type for output */ data->type = event->attr.sample_type; return; } __perf_event_header__init_id(data, event, filtered_sample_type); if (filtered_sample_type & PERF_SAMPLE_IP) { data->ip = perf_instruction_pointer(regs); data->sample_flags |= PERF_SAMPLE_IP; } if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN) perf_sample_save_callchain(data, event, regs); if (filtered_sample_type & PERF_SAMPLE_RAW) { data->raw = NULL; data->dyn_size += sizeof(u64); data->sample_flags |= PERF_SAMPLE_RAW; } if (filtered_sample_type & PERF_SAMPLE_BRANCH_STACK) { data->br_stack = NULL; data->dyn_size += sizeof(u64); data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; } if (filtered_sample_type & PERF_SAMPLE_REGS_USER) perf_sample_regs_user(&data->regs_user, regs); /* * It cannot use the filtered_sample_type here as REGS_USER can be set * by STACK_USER (using __cond_set() above) and we don't want to update * the dyn_size if it's not requested by users. */ if ((sample_type & ~data->sample_flags) & PERF_SAMPLE_REGS_USER) { /* regs dump ABI info */ int size = sizeof(u64); if (data->regs_user.regs) { u64 mask = event->attr.sample_regs_user; size += hweight64(mask) * sizeof(u64); } data->dyn_size += size; data->sample_flags |= PERF_SAMPLE_REGS_USER; } if (filtered_sample_type & PERF_SAMPLE_STACK_USER) { /* * Either we need PERF_SAMPLE_STACK_USER bit to be always * processed as the last one or have additional check added * in case new sample type is added, because we could eat * up the rest of the sample size. */ u16 stack_size = event->attr.sample_stack_user; u16 header_size = perf_sample_data_size(data, event); u16 size = sizeof(u64); stack_size = perf_sample_ustack_size(stack_size, header_size, data->regs_user.regs); /* * If there is something to dump, add space for the dump * itself and for the field that tells the dynamic size, * which is how many have been actually dumped. */ if (stack_size) size += sizeof(u64) + stack_size; data->stack_user_size = stack_size; data->dyn_size += size; data->sample_flags |= PERF_SAMPLE_STACK_USER; } if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) { data->weight.full = 0; data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; } if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) { data->data_src.val = PERF_MEM_NA; data->sample_flags |= PERF_SAMPLE_DATA_SRC; } if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) { data->txn = 0; data->sample_flags |= PERF_SAMPLE_TRANSACTION; } if (filtered_sample_type & PERF_SAMPLE_ADDR) { data->addr = 0; data->sample_flags |= PERF_SAMPLE_ADDR; } if (filtered_sample_type & PERF_SAMPLE_REGS_INTR) { /* regs dump ABI info */ int size = sizeof(u64); perf_sample_regs_intr(&data->regs_intr, regs); if (data->regs_intr.regs) { u64 mask = event->attr.sample_regs_intr; size += hweight64(mask) * sizeof(u64); } data->dyn_size += size; data->sample_flags |= PERF_SAMPLE_REGS_INTR; } if (filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) { data->phys_addr = perf_virt_to_phys(data->addr); data->sample_flags |= PERF_SAMPLE_PHYS_ADDR; } #ifdef CONFIG_CGROUP_PERF if (filtered_sample_type & PERF_SAMPLE_CGROUP) { struct cgroup *cgrp; /* protected by RCU */ cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup; data->cgroup = cgroup_id(cgrp); data->sample_flags |= PERF_SAMPLE_CGROUP; } #endif /* * PERF_DATA_PAGE_SIZE requires PERF_SAMPLE_ADDR. If the user doesn't * require PERF_SAMPLE_ADDR, kernel implicitly retrieve the data->addr, * but the value will not dump to the userspace. */ if (filtered_sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) { data->data_page_size = perf_get_page_size(data->addr); data->sample_flags |= PERF_SAMPLE_DATA_PAGE_SIZE; } if (filtered_sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) { data->code_page_size = perf_get_page_size(data->ip); data->sample_flags |= PERF_SAMPLE_CODE_PAGE_SIZE; } if (filtered_sample_type & PERF_SAMPLE_AUX) { u64 size; u16 header_size = perf_sample_data_size(data, event); header_size += sizeof(u64); /* size */ /* * Given the 16bit nature of header::size, an AUX sample can * easily overflow it, what with all the preceding sample bits. * Make sure this doesn't happen by using up to U16_MAX bytes * per sample in total (rounded down to 8 byte boundary). */ size = min_t(size_t, U16_MAX - header_size, event->attr.aux_sample_size); size = rounddown(size, 8); size = perf_prepare_sample_aux(event, data, size); WARN_ON_ONCE(size + header_size > U16_MAX); data->dyn_size += size + sizeof(u64); /* size above */ data->sample_flags |= PERF_SAMPLE_AUX; } } void perf_prepare_header(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs) { header->type = PERF_RECORD_SAMPLE; header->size = perf_sample_data_size(data, event); header->misc = perf_misc_flags(regs); /* * If you're adding more sample types here, you likely need to do * something about the overflowing header::size, like repurpose the * lowest 3 bits of size, which should be always zero at the moment. * This raises a more important question, do we really need 512k sized * samples and why, so good argumentation is in order for whatever you * do here next. */ WARN_ON_ONCE(header->size & 7); } static __always_inline int __perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs, int (*output_begin)(struct perf_output_handle *, struct perf_sample_data *, struct perf_event *, unsigned int)) { struct perf_output_handle handle; struct perf_event_header header; int err; /* protect the callchain buffers */ rcu_read_lock(); perf_prepare_sample(data, event, regs); perf_prepare_header(&header, data, event, regs); err = output_begin(&handle, data, event, header.size); if (err) goto exit; perf_output_sample(&handle, &header, data, event); perf_output_end(&handle); exit: rcu_read_unlock(); return err; } void perf_event_output_forward(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { __perf_event_output(event, data, regs, perf_output_begin_forward); } void perf_event_output_backward(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { __perf_event_output(event, data, regs, perf_output_begin_backward); } int perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { return __perf_event_output(event, data, regs, perf_output_begin); } /* * read event_id */ struct perf_read_event { struct perf_event_header header; u32 pid; u32 tid; }; static void perf_event_read_event(struct perf_event *event, struct task_struct *task) { struct perf_output_handle handle; struct perf_sample_data sample; struct perf_read_event read_event = { .header = { .type = PERF_RECORD_READ, .misc = 0, .size = sizeof(read_event) + event->read_size, }, .pid = perf_event_pid(event, task), .tid = perf_event_tid(event, task), }; int ret; perf_event_header__init_id(&read_event.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, read_event.header.size); if (ret) return; perf_output_put(&handle, read_event); perf_output_read(&handle, event); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } typedef void (perf_iterate_f)(struct perf_event *event, void *data); static void perf_iterate_ctx(struct perf_event_context *ctx, perf_iterate_f output, void *data, bool all) { struct perf_event *event; list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (!all) { if (event->state < PERF_EVENT_STATE_INACTIVE) continue; if (!event_filter_match(event)) continue; } output(event, data); } } static void perf_iterate_sb_cpu(perf_iterate_f output, void *data) { struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events); struct perf_event *event; list_for_each_entry_rcu(event, &pel->list, sb_list) { /* * Skip events that are not fully formed yet; ensure that * if we observe event->ctx, both event and ctx will be * complete enough. See perf_install_in_context(). */ if (!smp_load_acquire(&event->ctx)) continue; if (event->state < PERF_EVENT_STATE_INACTIVE) continue; if (!event_filter_match(event)) continue; output(event, data); } } /* * Iterate all events that need to receive side-band events. * * For new callers; ensure that account_pmu_sb_event() includes * your event, otherwise it might not get delivered. */ static void perf_iterate_sb(perf_iterate_f output, void *data, struct perf_event_context *task_ctx) { struct perf_event_context *ctx; rcu_read_lock(); preempt_disable(); /* * If we have task_ctx != NULL we only notify the task context itself. * The task_ctx is set only for EXIT events before releasing task * context. */ if (task_ctx) { perf_iterate_ctx(task_ctx, output, data, false); goto done; } perf_iterate_sb_cpu(output, data); ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) perf_iterate_ctx(ctx, output, data, false); done: preempt_enable(); rcu_read_unlock(); } /* * Clear all file-based filters at exec, they'll have to be * re-instated when/if these objects are mmapped again. */ static void perf_event_addr_filters_exec(struct perf_event *event, void *data) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); struct perf_addr_filter *filter; unsigned int restart = 0, count = 0; unsigned long flags; if (!has_addr_filter(event)) return; raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { if (filter->path.dentry) { event->addr_filter_ranges[count].start = 0; event->addr_filter_ranges[count].size = 0; restart++; } count++; } if (restart) event->addr_filters_gen++; raw_spin_unlock_irqrestore(&ifh->lock, flags); if (restart) perf_event_stop(event, 1); } void perf_event_exec(void) { struct perf_event_context *ctx; ctx = perf_pin_task_context(current); if (!ctx) return; perf_event_enable_on_exec(ctx); perf_event_remove_on_exec(ctx); perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true); perf_unpin_context(ctx); put_ctx(ctx); } struct remote_output { struct perf_buffer *rb; int err; }; static void __perf_event_output_stop(struct perf_event *event, void *data) { struct perf_event *parent = event->parent; struct remote_output *ro = data; struct perf_buffer *rb = ro->rb; struct stop_event_data sd = { .event = event, }; if (!has_aux(event)) return; if (!parent) parent = event; /* * In case of inheritance, it will be the parent that links to the * ring-buffer, but it will be the child that's actually using it. * * We are using event::rb to determine if the event should be stopped, * however this may race with ring_buffer_attach() (through set_output), * which will make us skip the event that actually needs to be stopped. * So ring_buffer_attach() has to stop an aux event before re-assigning * its rb pointer. */ if (rcu_dereference(parent->rb) == rb) ro->err = __perf_event_stop(&sd); } static int __perf_pmu_output_stop(void *info) { struct perf_event *event = info; struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct remote_output ro = { .rb = event->rb, }; rcu_read_lock(); perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false); if (cpuctx->task_ctx) perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop, &ro, false); rcu_read_unlock(); return ro.err; } static void perf_pmu_output_stop(struct perf_event *event) { struct perf_event *iter; int err, cpu; restart: rcu_read_lock(); list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) { /* * For per-CPU events, we need to make sure that neither they * nor their children are running; for cpu==-1 events it's * sufficient to stop the event itself if it's active, since * it can't have children. */ cpu = iter->cpu; if (cpu == -1) cpu = READ_ONCE(iter->oncpu); if (cpu == -1) continue; err = cpu_function_call(cpu, __perf_pmu_output_stop, event); if (err == -EAGAIN) { rcu_read_unlock(); goto restart; } } rcu_read_unlock(); } /* * task tracking -- fork/exit * * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task */ struct perf_task_event { struct task_struct *task; struct perf_event_context *task_ctx; struct { struct perf_event_header header; u32 pid; u32 ppid; u32 tid; u32 ptid; u64 time; } event_id; }; static int perf_event_task_match(struct perf_event *event) { return event->attr.comm || event->attr.mmap || event->attr.mmap2 || event->attr.mmap_data || event->attr.task; } static void perf_event_task_output(struct perf_event *event, void *data) { struct perf_task_event *task_event = data; struct perf_output_handle handle; struct perf_sample_data sample; struct task_struct *task = task_event->task; int ret, size = task_event->event_id.header.size; if (!perf_event_task_match(event)) return; perf_event_header__init_id(&task_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, task_event->event_id.header.size); if (ret) goto out; task_event->event_id.pid = perf_event_pid(event, task); task_event->event_id.tid = perf_event_tid(event, task); if (task_event->event_id.header.type == PERF_RECORD_EXIT) { task_event->event_id.ppid = perf_event_pid(event, task->real_parent); task_event->event_id.ptid = perf_event_pid(event, task->real_parent); } else { /* PERF_RECORD_FORK */ task_event->event_id.ppid = perf_event_pid(event, current); task_event->event_id.ptid = perf_event_tid(event, current); } task_event->event_id.time = perf_event_clock(event); perf_output_put(&handle, task_event->event_id); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); out: task_event->event_id.header.size = size; } static void perf_event_task(struct task_struct *task, struct perf_event_context *task_ctx, int new) { struct perf_task_event task_event; if (!atomic_read(&nr_comm_events) && !atomic_read(&nr_mmap_events) && !atomic_read(&nr_task_events)) return; task_event = (struct perf_task_event){ .task = task, .task_ctx = task_ctx, .event_id = { .header = { .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT, .misc = 0, .size = sizeof(task_event.event_id), }, /* .pid */ /* .ppid */ /* .tid */ /* .ptid */ /* .time */ }, }; perf_iterate_sb(perf_event_task_output, &task_event, task_ctx); } void perf_event_fork(struct task_struct *task) { perf_event_task(task, NULL, 1); perf_event_namespaces(task); } /* * comm tracking */ struct perf_comm_event { struct task_struct *task; char *comm; int comm_size; struct { struct perf_event_header header; u32 pid; u32 tid; } event_id; }; static int perf_event_comm_match(struct perf_event *event) { return event->attr.comm; } static void perf_event_comm_output(struct perf_event *event, void *data) { struct perf_comm_event *comm_event = data; struct perf_output_handle handle; struct perf_sample_data sample; int size = comm_event->event_id.header.size; int ret; if (!perf_event_comm_match(event)) return; perf_event_header__init_id(&comm_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, comm_event->event_id.header.size); if (ret) goto out; comm_event->event_id.pid = perf_event_pid(event, comm_event->task); comm_event->event_id.tid = perf_event_tid(event, comm_event->task); perf_output_put(&handle, comm_event->event_id); __output_copy(&handle, comm_event->comm, comm_event->comm_size); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); out: comm_event->event_id.header.size = size; } static void perf_event_comm_event(struct perf_comm_event *comm_event) { char comm[TASK_COMM_LEN]; unsigned int size; memset(comm, 0, sizeof(comm)); strscpy(comm, comm_event->task->comm, sizeof(comm)); size = ALIGN(strlen(comm)+1, sizeof(u64)); comm_event->comm = comm; comm_event->comm_size = size; comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; perf_iterate_sb(perf_event_comm_output, comm_event, NULL); } void perf_event_comm(struct task_struct *task, bool exec) { struct perf_comm_event comm_event; if (!atomic_read(&nr_comm_events)) return; comm_event = (struct perf_comm_event){ .task = task, /* .comm */ /* .comm_size */ .event_id = { .header = { .type = PERF_RECORD_COMM, .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0, /* .size */ }, /* .pid */ /* .tid */ }, }; perf_event_comm_event(&comm_event); } /* * namespaces tracking */ struct perf_namespaces_event { struct task_struct *task; struct { struct perf_event_header header; u32 pid; u32 tid; u64 nr_namespaces; struct perf_ns_link_info link_info[NR_NAMESPACES]; } event_id; }; static int perf_event_namespaces_match(struct perf_event *event) { return event->attr.namespaces; } static void perf_event_namespaces_output(struct perf_event *event, void *data) { struct perf_namespaces_event *namespaces_event = data; struct perf_output_handle handle; struct perf_sample_data sample; u16 header_size = namespaces_event->event_id.header.size; int ret; if (!perf_event_namespaces_match(event)) return; perf_event_header__init_id(&namespaces_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, namespaces_event->event_id.header.size); if (ret) goto out; namespaces_event->event_id.pid = perf_event_pid(event, namespaces_event->task); namespaces_event->event_id.tid = perf_event_tid(event, namespaces_event->task); perf_output_put(&handle, namespaces_event->event_id); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); out: namespaces_event->event_id.header.size = header_size; } static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info, struct task_struct *task, const struct proc_ns_operations *ns_ops) { struct path ns_path; struct inode *ns_inode; int error; error = ns_get_path(&ns_path, task, ns_ops); if (!error) { ns_inode = ns_path.dentry->d_inode; ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev); ns_link_info->ino = ns_inode->i_ino; path_put(&ns_path); } } void perf_event_namespaces(struct task_struct *task) { struct perf_namespaces_event namespaces_event; struct perf_ns_link_info *ns_link_info; if (!atomic_read(&nr_namespaces_events)) return; namespaces_event = (struct perf_namespaces_event){ .task = task, .event_id = { .header = { .type = PERF_RECORD_NAMESPACES, .misc = 0, .size = sizeof(namespaces_event.event_id), }, /* .pid */ /* .tid */ .nr_namespaces = NR_NAMESPACES, /* .link_info[NR_NAMESPACES] */ }, }; ns_link_info = namespaces_event.event_id.link_info; perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX], task, &mntns_operations); #ifdef CONFIG_USER_NS perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX], task, &userns_operations); #endif #ifdef CONFIG_NET_NS perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX], task, &netns_operations); #endif #ifdef CONFIG_UTS_NS perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX], task, &utsns_operations); #endif #ifdef CONFIG_IPC_NS perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX], task, &ipcns_operations); #endif #ifdef CONFIG_PID_NS perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX], task, &pidns_operations); #endif #ifdef CONFIG_CGROUPS perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX], task, &cgroupns_operations); #endif perf_iterate_sb(perf_event_namespaces_output, &namespaces_event, NULL); } /* * cgroup tracking */ #ifdef CONFIG_CGROUP_PERF struct perf_cgroup_event { char *path; int path_size; struct { struct perf_event_header header; u64 id; char path[]; } event_id; }; static int perf_event_cgroup_match(struct perf_event *event) { return event->attr.cgroup; } static void perf_event_cgroup_output(struct perf_event *event, void *data) { struct perf_cgroup_event *cgroup_event = data; struct perf_output_handle handle; struct perf_sample_data sample; u16 header_size = cgroup_event->event_id.header.size; int ret; if (!perf_event_cgroup_match(event)) return; perf_event_header__init_id(&cgroup_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, cgroup_event->event_id.header.size); if (ret) goto out; perf_output_put(&handle, cgroup_event->event_id); __output_copy(&handle, cgroup_event->path, cgroup_event->path_size); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); out: cgroup_event->event_id.header.size = header_size; } static void perf_event_cgroup(struct cgroup *cgrp) { struct perf_cgroup_event cgroup_event; char path_enomem[16] = "//enomem"; char *pathname; size_t size; if (!atomic_read(&nr_cgroup_events)) return; cgroup_event = (struct perf_cgroup_event){ .event_id = { .header = { .type = PERF_RECORD_CGROUP, .misc = 0, .size = sizeof(cgroup_event.event_id), }, .id = cgroup_id(cgrp), }, }; pathname = kmalloc(PATH_MAX, GFP_KERNEL); if (pathname == NULL) { cgroup_event.path = path_enomem; } else { /* just to be sure to have enough space for alignment */ cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64)); cgroup_event.path = pathname; } /* * Since our buffer works in 8 byte units we need to align our string * size to a multiple of 8. However, we must guarantee the tail end is * zero'd out to avoid leaking random bits to userspace. */ size = strlen(cgroup_event.path) + 1; while (!IS_ALIGNED(size, sizeof(u64))) cgroup_event.path[size++] = '\0'; cgroup_event.event_id.header.size += size; cgroup_event.path_size = size; perf_iterate_sb(perf_event_cgroup_output, &cgroup_event, NULL); kfree(pathname); } #endif /* * mmap tracking */ struct perf_mmap_event { struct vm_area_struct *vma; const char *file_name; int file_size; int maj, min; u64 ino; u64 ino_generation; u32 prot, flags; u8 build_id[BUILD_ID_SIZE_MAX]; u32 build_id_size; struct { struct perf_event_header header; u32 pid; u32 tid; u64 start; u64 len; u64 pgoff; } event_id; }; static int perf_event_mmap_match(struct perf_event *event, void *data) { struct perf_mmap_event *mmap_event = data; struct vm_area_struct *vma = mmap_event->vma; int executable = vma->vm_flags & VM_EXEC; return (!executable && event->attr.mmap_data) || (executable && (event->attr.mmap || event->attr.mmap2)); } static void perf_event_mmap_output(struct perf_event *event, void *data) { struct perf_mmap_event *mmap_event = data; struct perf_output_handle handle; struct perf_sample_data sample; int size = mmap_event->event_id.header.size; u32 type = mmap_event->event_id.header.type; bool use_build_id; int ret; if (!perf_event_mmap_match(event, data)) return; if (event->attr.mmap2) { mmap_event->event_id.header.type = PERF_RECORD_MMAP2; mmap_event->event_id.header.size += sizeof(mmap_event->maj); mmap_event->event_id.header.size += sizeof(mmap_event->min); mmap_event->event_id.header.size += sizeof(mmap_event->ino); mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation); mmap_event->event_id.header.size += sizeof(mmap_event->prot); mmap_event->event_id.header.size += sizeof(mmap_event->flags); } perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, mmap_event->event_id.header.size); if (ret) goto out; mmap_event->event_id.pid = perf_event_pid(event, current); mmap_event->event_id.tid = perf_event_tid(event, current); use_build_id = event->attr.build_id && mmap_event->build_id_size; if (event->attr.mmap2 && use_build_id) mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_BUILD_ID; perf_output_put(&handle, mmap_event->event_id); if (event->attr.mmap2) { if (use_build_id) { u8 size[4] = { (u8) mmap_event->build_id_size, 0, 0, 0 }; __output_copy(&handle, size, 4); __output_copy(&handle, mmap_event->build_id, BUILD_ID_SIZE_MAX); } else { perf_output_put(&handle, mmap_event->maj); perf_output_put(&handle, mmap_event->min); perf_output_put(&handle, mmap_event->ino); perf_output_put(&handle, mmap_event->ino_generation); } perf_output_put(&handle, mmap_event->prot); perf_output_put(&handle, mmap_event->flags); } __output_copy(&handle, mmap_event->file_name, mmap_event->file_size); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); out: mmap_event->event_id.header.size = size; mmap_event->event_id.header.type = type; } static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) { struct vm_area_struct *vma = mmap_event->vma; struct file *file = vma->vm_file; int maj = 0, min = 0; u64 ino = 0, gen = 0; u32 prot = 0, flags = 0; unsigned int size; char tmp[16]; char *buf = NULL; char *name = NULL; if (vma->vm_flags & VM_READ) prot |= PROT_READ; if (vma->vm_flags & VM_WRITE) prot |= PROT_WRITE; if (vma->vm_flags & VM_EXEC) prot |= PROT_EXEC; if (vma->vm_flags & VM_MAYSHARE) flags = MAP_SHARED; else flags = MAP_PRIVATE; if (vma->vm_flags & VM_LOCKED) flags |= MAP_LOCKED; if (is_vm_hugetlb_page(vma)) flags |= MAP_HUGETLB; if (file) { struct inode *inode; dev_t dev; buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) { name = "//enomem"; goto cpy_name; } /* * d_path() works from the end of the rb backwards, so we * need to add enough zero bytes after the string to handle * the 64bit alignment we do later. */ name = file_path(file, buf, PATH_MAX - sizeof(u64)); if (IS_ERR(name)) { name = "//toolong"; goto cpy_name; } inode = file_inode(vma->vm_file); dev = inode->i_sb->s_dev; ino = inode->i_ino; gen = inode->i_generation; maj = MAJOR(dev); min = MINOR(dev); goto got_name; } else { if (vma->vm_ops && vma->vm_ops->name) name = (char *) vma->vm_ops->name(vma); if (!name) name = (char *)arch_vma_name(vma); if (!name) { if (vma_is_initial_heap(vma)) name = "[heap]"; else if (vma_is_initial_stack(vma)) name = "[stack]"; else name = "//anon"; } } cpy_name: strscpy(tmp, name, sizeof(tmp)); name = tmp; got_name: /* * Since our buffer works in 8 byte units we need to align our string * size to a multiple of 8. However, we must guarantee the tail end is * zero'd out to avoid leaking random bits to userspace. */ size = strlen(name)+1; while (!IS_ALIGNED(size, sizeof(u64))) name[size++] = '\0'; mmap_event->file_name = name; mmap_event->file_size = size; mmap_event->maj = maj; mmap_event->min = min; mmap_event->ino = ino; mmap_event->ino_generation = gen; mmap_event->prot = prot; mmap_event->flags = flags; if (!(vma->vm_flags & VM_EXEC)) mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; if (atomic_read(&nr_build_id_events)) build_id_parse(vma, mmap_event->build_id, &mmap_event->build_id_size); perf_iterate_sb(perf_event_mmap_output, mmap_event, NULL); kfree(buf); } /* * Check whether inode and address range match filter criteria. */ static bool perf_addr_filter_match(struct perf_addr_filter *filter, struct file *file, unsigned long offset, unsigned long size) { /* d_inode(NULL) won't be equal to any mapped user-space file */ if (!filter->path.dentry) return false; if (d_inode(filter->path.dentry) != file_inode(file)) return false; if (filter->offset > offset + size) return false; if (filter->offset + filter->size < offset) return false; return true; } static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter, struct vm_area_struct *vma, struct perf_addr_filter_range *fr) { unsigned long vma_size = vma->vm_end - vma->vm_start; unsigned long off = vma->vm_pgoff << PAGE_SHIFT; struct file *file = vma->vm_file; if (!perf_addr_filter_match(filter, file, off, vma_size)) return false; if (filter->offset < off) { fr->start = vma->vm_start; fr->size = min(vma_size, filter->size - (off - filter->offset)); } else { fr->start = vma->vm_start + filter->offset - off; fr->size = min(vma->vm_end - fr->start, filter->size); } return true; } static void __perf_addr_filters_adjust(struct perf_event *event, void *data) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); struct vm_area_struct *vma = data; struct perf_addr_filter *filter; unsigned int restart = 0, count = 0; unsigned long flags; if (!has_addr_filter(event)) return; if (!vma->vm_file) return; raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { if (perf_addr_filter_vma_adjust(filter, vma, &event->addr_filter_ranges[count])) restart++; count++; } if (restart) event->addr_filters_gen++; raw_spin_unlock_irqrestore(&ifh->lock, flags); if (restart) perf_event_stop(event, 1); } /* * Adjust all task's events' filters to the new vma */ static void perf_addr_filters_adjust(struct vm_area_struct *vma) { struct perf_event_context *ctx; /* * Data tracing isn't supported yet and as such there is no need * to keep track of anything that isn't related to executable code: */ if (!(vma->vm_flags & VM_EXEC)) return; rcu_read_lock(); ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true); rcu_read_unlock(); } void perf_event_mmap(struct vm_area_struct *vma) { struct perf_mmap_event mmap_event; if (!atomic_read(&nr_mmap_events)) return; mmap_event = (struct perf_mmap_event){ .vma = vma, /* .file_name */ /* .file_size */ .event_id = { .header = { .type = PERF_RECORD_MMAP, .misc = PERF_RECORD_MISC_USER, /* .size */ }, /* .pid */ /* .tid */ .start = vma->vm_start, .len = vma->vm_end - vma->vm_start, .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, }, /* .maj (attr_mmap2 only) */ /* .min (attr_mmap2 only) */ /* .ino (attr_mmap2 only) */ /* .ino_generation (attr_mmap2 only) */ /* .prot (attr_mmap2 only) */ /* .flags (attr_mmap2 only) */ }; perf_addr_filters_adjust(vma); perf_event_mmap_event(&mmap_event); } void perf_event_aux_event(struct perf_event *event, unsigned long head, unsigned long size, u64 flags) { struct perf_output_handle handle; struct perf_sample_data sample; struct perf_aux_event { struct perf_event_header header; u64 offset; u64 size; u64 flags; } rec = { .header = { .type = PERF_RECORD_AUX, .misc = 0, .size = sizeof(rec), }, .offset = head, .size = size, .flags = flags, }; int ret; perf_event_header__init_id(&rec.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, rec.header.size); if (ret) return; perf_output_put(&handle, rec); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } /* * Lost/dropped samples logging */ void perf_log_lost_samples(struct perf_event *event, u64 lost) { struct perf_output_handle handle; struct perf_sample_data sample; int ret; struct { struct perf_event_header header; u64 lost; } lost_samples_event = { .header = { .type = PERF_RECORD_LOST_SAMPLES, .misc = 0, .size = sizeof(lost_samples_event), }, .lost = lost, }; perf_event_header__init_id(&lost_samples_event.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, lost_samples_event.header.size); if (ret) return; perf_output_put(&handle, lost_samples_event); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } /* * context_switch tracking */ struct perf_switch_event { struct task_struct *task; struct task_struct *next_prev; struct { struct perf_event_header header; u32 next_prev_pid; u32 next_prev_tid; } event_id; }; static int perf_event_switch_match(struct perf_event *event) { return event->attr.context_switch; } static void perf_event_switch_output(struct perf_event *event, void *data) { struct perf_switch_event *se = data; struct perf_output_handle handle; struct perf_sample_data sample; int ret; if (!perf_event_switch_match(event)) return; /* Only CPU-wide events are allowed to see next/prev pid/tid */ if (event->ctx->task) { se->event_id.header.type = PERF_RECORD_SWITCH; se->event_id.header.size = sizeof(se->event_id.header); } else { se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE; se->event_id.header.size = sizeof(se->event_id); se->event_id.next_prev_pid = perf_event_pid(event, se->next_prev); se->event_id.next_prev_tid = perf_event_tid(event, se->next_prev); } perf_event_header__init_id(&se->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size); if (ret) return; if (event->ctx->task) perf_output_put(&handle, se->event_id.header); else perf_output_put(&handle, se->event_id); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } static void perf_event_switch(struct task_struct *task, struct task_struct *next_prev, bool sched_in) { struct perf_switch_event switch_event; /* N.B. caller checks nr_switch_events != 0 */ switch_event = (struct perf_switch_event){ .task = task, .next_prev = next_prev, .event_id = { .header = { /* .type */ .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT, /* .size */ }, /* .next_prev_pid */ /* .next_prev_tid */ }, }; if (!sched_in && task->on_rq) { switch_event.event_id.header.misc |= PERF_RECORD_MISC_SWITCH_OUT_PREEMPT; } perf_iterate_sb(perf_event_switch_output, &switch_event, NULL); } /* * IRQ throttle logging */ static void perf_log_throttle(struct perf_event *event, int enable) { struct perf_output_handle handle; struct perf_sample_data sample; int ret; struct { struct perf_event_header header; u64 time; u64 id; u64 stream_id; } throttle_event = { .header = { .type = PERF_RECORD_THROTTLE, .misc = 0, .size = sizeof(throttle_event), }, .time = perf_event_clock(event), .id = primary_event_id(event), .stream_id = event->id, }; if (enable) throttle_event.header.type = PERF_RECORD_UNTHROTTLE; perf_event_header__init_id(&throttle_event.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, throttle_event.header.size); if (ret) return; perf_output_put(&handle, throttle_event); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } /* * ksymbol register/unregister tracking */ struct perf_ksymbol_event { const char *name; int name_len; struct { struct perf_event_header header; u64 addr; u32 len; u16 ksym_type; u16 flags; } event_id; }; static int perf_event_ksymbol_match(struct perf_event *event) { return event->attr.ksymbol; } static void perf_event_ksymbol_output(struct perf_event *event, void *data) { struct perf_ksymbol_event *ksymbol_event = data; struct perf_output_handle handle; struct perf_sample_data sample; int ret; if (!perf_event_ksymbol_match(event)) return; perf_event_header__init_id(&ksymbol_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, ksymbol_event->event_id.header.size); if (ret) return; perf_output_put(&handle, ksymbol_event->event_id); __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, const char *sym) { struct perf_ksymbol_event ksymbol_event; char name[KSYM_NAME_LEN]; u16 flags = 0; int name_len; if (!atomic_read(&nr_ksymbol_events)) return; if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX || ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN) goto err; strscpy(name, sym, KSYM_NAME_LEN); name_len = strlen(name) + 1; while (!IS_ALIGNED(name_len, sizeof(u64))) name[name_len++] = '\0'; BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64)); if (unregister) flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER; ksymbol_event = (struct perf_ksymbol_event){ .name = name, .name_len = name_len, .event_id = { .header = { .type = PERF_RECORD_KSYMBOL, .size = sizeof(ksymbol_event.event_id) + name_len, }, .addr = addr, .len = len, .ksym_type = ksym_type, .flags = flags, }, }; perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL); return; err: WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type); } /* * bpf program load/unload tracking */ struct perf_bpf_event { struct bpf_prog *prog; struct { struct perf_event_header header; u16 type; u16 flags; u32 id; u8 tag[BPF_TAG_SIZE]; } event_id; }; static int perf_event_bpf_match(struct perf_event *event) { return event->attr.bpf_event; } static void perf_event_bpf_output(struct perf_event *event, void *data) { struct perf_bpf_event *bpf_event = data; struct perf_output_handle handle; struct perf_sample_data sample; int ret; if (!perf_event_bpf_match(event)) return; perf_event_header__init_id(&bpf_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, bpf_event->event_id.header.size); if (ret) return; perf_output_put(&handle, bpf_event->event_id); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog, enum perf_bpf_event_type type) { bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD; int i; if (prog->aux->func_cnt == 0) { perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, (u64)(unsigned long)prog->bpf_func, prog->jited_len, unregister, prog->aux->ksym.name); } else { for (i = 0; i < prog->aux->func_cnt; i++) { struct bpf_prog *subprog = prog->aux->func[i]; perf_event_ksymbol( PERF_RECORD_KSYMBOL_TYPE_BPF, (u64)(unsigned long)subprog->bpf_func, subprog->jited_len, unregister, subprog->aux->ksym.name); } } } void perf_event_bpf_event(struct bpf_prog *prog, enum perf_bpf_event_type type, u16 flags) { struct perf_bpf_event bpf_event; if (type <= PERF_BPF_EVENT_UNKNOWN || type >= PERF_BPF_EVENT_MAX) return; switch (type) { case PERF_BPF_EVENT_PROG_LOAD: case PERF_BPF_EVENT_PROG_UNLOAD: if (atomic_read(&nr_ksymbol_events)) perf_event_bpf_emit_ksymbols(prog, type); break; default: break; } if (!atomic_read(&nr_bpf_events)) return; bpf_event = (struct perf_bpf_event){ .prog = prog, .event_id = { .header = { .type = PERF_RECORD_BPF_EVENT, .size = sizeof(bpf_event.event_id), }, .type = type, .flags = flags, .id = prog->aux->id, }, }; BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64)); memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE); perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL); } struct perf_text_poke_event { const void *old_bytes; const void *new_bytes; size_t pad; u16 old_len; u16 new_len; struct { struct perf_event_header header; u64 addr; } event_id; }; static int perf_event_text_poke_match(struct perf_event *event) { return event->attr.text_poke; } static void perf_event_text_poke_output(struct perf_event *event, void *data) { struct perf_text_poke_event *text_poke_event = data; struct perf_output_handle handle; struct perf_sample_data sample; u64 padding = 0; int ret; if (!perf_event_text_poke_match(event)) return; perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, text_poke_event->event_id.header.size); if (ret) return; perf_output_put(&handle, text_poke_event->event_id); perf_output_put(&handle, text_poke_event->old_len); perf_output_put(&handle, text_poke_event->new_len); __output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len); __output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len); if (text_poke_event->pad) __output_copy(&handle, &padding, text_poke_event->pad); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } void perf_event_text_poke(const void *addr, const void *old_bytes, size_t old_len, const void *new_bytes, size_t new_len) { struct perf_text_poke_event text_poke_event; size_t tot, pad; if (!atomic_read(&nr_text_poke_events)) return; tot = sizeof(text_poke_event.old_len) + old_len; tot += sizeof(text_poke_event.new_len) + new_len; pad = ALIGN(tot, sizeof(u64)) - tot; text_poke_event = (struct perf_text_poke_event){ .old_bytes = old_bytes, .new_bytes = new_bytes, .pad = pad, .old_len = old_len, .new_len = new_len, .event_id = { .header = { .type = PERF_RECORD_TEXT_POKE, .misc = PERF_RECORD_MISC_KERNEL, .size = sizeof(text_poke_event.event_id) + tot + pad, }, .addr = (unsigned long)addr, }, }; perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL); } void perf_event_itrace_started(struct perf_event *event) { event->attach_state |= PERF_ATTACH_ITRACE; } static void perf_log_itrace_start(struct perf_event *event) { struct perf_output_handle handle; struct perf_sample_data sample; struct perf_aux_event { struct perf_event_header header; u32 pid; u32 tid; } rec; int ret; if (event->parent) event = event->parent; if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) || event->attach_state & PERF_ATTACH_ITRACE) return; rec.header.type = PERF_RECORD_ITRACE_START; rec.header.misc = 0; rec.header.size = sizeof(rec); rec.pid = perf_event_pid(event, current); rec.tid = perf_event_tid(event, current); perf_event_header__init_id(&rec.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, rec.header.size); if (ret) return; perf_output_put(&handle, rec); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } void perf_report_aux_output_id(struct perf_event *event, u64 hw_id) { struct perf_output_handle handle; struct perf_sample_data sample; struct perf_aux_event { struct perf_event_header header; u64 hw_id; } rec; int ret; if (event->parent) event = event->parent; rec.header.type = PERF_RECORD_AUX_OUTPUT_HW_ID; rec.header.misc = 0; rec.header.size = sizeof(rec); rec.hw_id = hw_id; perf_event_header__init_id(&rec.header, &sample, event); ret = perf_output_begin(&handle, &sample, event, rec.header.size); if (ret) return; perf_output_put(&handle, rec); perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } EXPORT_SYMBOL_GPL(perf_report_aux_output_id); static int __perf_event_account_interrupt(struct perf_event *event, int throttle) { struct hw_perf_event *hwc = &event->hw; int ret = 0; u64 seq; seq = __this_cpu_read(perf_throttled_seq); if (seq != hwc->interrupts_seq) { hwc->interrupts_seq = seq; hwc->interrupts = 1; } else { hwc->interrupts++; if (unlikely(throttle && hwc->interrupts > max_samples_per_tick)) { __this_cpu_inc(perf_throttled_count); tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); hwc->interrupts = MAX_INTERRUPTS; perf_log_throttle(event, 0); ret = 1; } } if (event->attr.freq) { u64 now = perf_clock(); s64 delta = now - hwc->freq_time_stamp; hwc->freq_time_stamp = now; if (delta > 0 && delta < 2*TICK_NSEC) perf_adjust_period(event, delta, hwc->last_period, true); } return ret; } int perf_event_account_interrupt(struct perf_event *event) { return __perf_event_account_interrupt(event, 1); } static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs) { /* * Due to interrupt latency (AKA "skid"), we may enter the * kernel before taking an overflow, even if the PMU is only * counting user events. */ if (event->attr.exclude_kernel && !user_mode(regs)) return false; return true; } /* * Generic event overflow handling, sampling. */ static int __perf_event_overflow(struct perf_event *event, int throttle, struct perf_sample_data *data, struct pt_regs *regs) { int events = atomic_read(&event->event_limit); int ret = 0; /* * Non-sampling counters might still use the PMI to fold short * hardware counters, ignore those. */ if (unlikely(!is_sampling_event(event))) return 0; ret = __perf_event_account_interrupt(event, throttle); /* * XXX event_limit might not quite work as expected on inherited * events */ event->pending_kill = POLL_IN; if (events && atomic_dec_and_test(&event->event_limit)) { ret = 1; event->pending_kill = POLL_HUP; perf_event_disable_inatomic(event); } if (event->attr.sigtrap) { /* * The desired behaviour of sigtrap vs invalid samples is a bit * tricky; on the one hand, one should not loose the SIGTRAP if * it is the first event, on the other hand, we should also not * trigger the WARN or override the data address. */ bool valid_sample = sample_is_allowed(event, regs); unsigned int pending_id = 1; if (regs) pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1; if (!event->pending_sigtrap) { event->pending_sigtrap = pending_id; local_inc(&event->ctx->nr_pending); } else if (event->attr.exclude_kernel && valid_sample) { /* * Should not be able to return to user space without * consuming pending_sigtrap; with exceptions: * * 1. Where !exclude_kernel, events can overflow again * in the kernel without returning to user space. * * 2. Events that can overflow again before the IRQ- * work without user space progress (e.g. hrtimer). * To approximate progress (with false negatives), * check 32-bit hash of the current IP. */ WARN_ON_ONCE(event->pending_sigtrap != pending_id); } event->pending_addr = 0; if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR)) event->pending_addr = data->addr; irq_work_queue(&event->pending_irq); } READ_ONCE(event->overflow_handler)(event, data, regs); if (*perf_event_fasync(event) && event->pending_kill) { event->pending_wakeup = 1; irq_work_queue(&event->pending_irq); } return ret; } int perf_event_overflow(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { return __perf_event_overflow(event, 1, data, regs); } /* * Generic software event infrastructure */ struct swevent_htable { struct swevent_hlist *swevent_hlist; struct mutex hlist_mutex; int hlist_refcount; /* Recursion avoidance in each contexts */ int recursion[PERF_NR_CONTEXTS]; }; static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); /* * We directly increment event->count and keep a second value in * event->hw.period_left to count intervals. This period event * is kept in the range [-sample_period, 0] so that we can use the * sign as trigger. */ u64 perf_swevent_set_period(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; u64 period = hwc->last_period; u64 nr, offset; s64 old, val; hwc->last_period = hwc->sample_period; old = local64_read(&hwc->period_left); do { val = old; if (val < 0) return 0; nr = div64_u64(period + val, period); offset = nr * period; val -= offset; } while (!local64_try_cmpxchg(&hwc->period_left, &old, val)); return nr; } static void perf_swevent_overflow(struct perf_event *event, u64 overflow, struct perf_sample_data *data, struct pt_regs *regs) { struct hw_perf_event *hwc = &event->hw; int throttle = 0; if (!overflow) overflow = perf_swevent_set_period(event); if (hwc->interrupts == MAX_INTERRUPTS) return; for (; overflow; overflow--) { if (__perf_event_overflow(event, throttle, data, regs)) { /* * We inhibit the overflow from happening when * hwc->interrupts == MAX_INTERRUPTS. */ break; } throttle = 1; } } static void perf_swevent_event(struct perf_event *event, u64 nr, struct perf_sample_data *data, struct pt_regs *regs) { struct hw_perf_event *hwc = &event->hw; local64_add(nr, &event->count); if (!regs) return; if (!is_sampling_event(event)) return; if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) { data->period = nr; return perf_swevent_overflow(event, 1, data, regs); } else data->period = event->hw.last_period; if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) return perf_swevent_overflow(event, 1, data, regs); if (local64_add_negative(nr, &hwc->period_left)) return; perf_swevent_overflow(event, 0, data, regs); } static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) { if (event->hw.state & PERF_HES_STOPPED) return 1; if (regs) { if (event->attr.exclude_user && user_mode(regs)) return 1; if (event->attr.exclude_kernel && !user_mode(regs)) return 1; } return 0; } static int perf_swevent_match(struct perf_event *event, enum perf_type_id type, u32 event_id, struct perf_sample_data *data, struct pt_regs *regs) { if (event->attr.type != type) return 0; if (event->attr.config != event_id) return 0; if (perf_exclude_event(event, regs)) return 0; return 1; } static inline u64 swevent_hash(u64 type, u32 event_id) { u64 val = event_id | (type << 32); return hash_64(val, SWEVENT_HLIST_BITS); } static inline struct hlist_head * __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id) { u64 hash = swevent_hash(type, event_id); return &hlist->heads[hash]; } /* For the read side: events when they trigger */ static inline struct hlist_head * find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id) { struct swevent_hlist *hlist; hlist = rcu_dereference(swhash->swevent_hlist); if (!hlist) return NULL; return __find_swevent_head(hlist, type, event_id); } /* For the event head insertion and removal in the hlist */ static inline struct hlist_head * find_swevent_head(struct swevent_htable *swhash, struct perf_event *event) { struct swevent_hlist *hlist; u32 event_id = event->attr.config; u64 type = event->attr.type; /* * Event scheduling is always serialized against hlist allocation * and release. Which makes the protected version suitable here. * The context lock guarantees that. */ hlist = rcu_dereference_protected(swhash->swevent_hlist, lockdep_is_held(&event->ctx->lock)); if (!hlist) return NULL; return __find_swevent_head(hlist, type, event_id); } static void do_perf_sw_event(enum perf_type_id type, u32 event_id, u64 nr, struct perf_sample_data *data, struct pt_regs *regs) { struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); struct perf_event *event; struct hlist_head *head; rcu_read_lock(); head = find_swevent_head_rcu(swhash, type, event_id); if (!head) goto end; hlist_for_each_entry_rcu(event, head, hlist_entry) { if (perf_swevent_match(event, type, event_id, data, regs)) perf_swevent_event(event, nr, data, regs); } end: rcu_read_unlock(); } DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]); int perf_swevent_get_recursion_context(void) { struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); return get_recursion_context(swhash->recursion); } EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); void perf_swevent_put_recursion_context(int rctx) { struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); put_recursion_context(swhash->recursion, rctx); } void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { struct perf_sample_data data; if (WARN_ON_ONCE(!regs)) return; perf_sample_data_init(&data, addr, 0); do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); } void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { int rctx; preempt_disable_notrace(); rctx = perf_swevent_get_recursion_context(); if (unlikely(rctx < 0)) goto fail; ___perf_sw_event(event_id, nr, regs, addr); perf_swevent_put_recursion_context(rctx); fail: preempt_enable_notrace(); } static void perf_swevent_read(struct perf_event *event) { } static int perf_swevent_add(struct perf_event *event, int flags) { struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); struct hw_perf_event *hwc = &event->hw; struct hlist_head *head; if (is_sampling_event(event)) { hwc->last_period = hwc->sample_period; perf_swevent_set_period(event); } hwc->state = !(flags & PERF_EF_START); head = find_swevent_head(swhash, event); if (WARN_ON_ONCE(!head)) return -EINVAL; hlist_add_head_rcu(&event->hlist_entry, head); perf_event_update_userpage(event); return 0; } static void perf_swevent_del(struct perf_event *event, int flags) { hlist_del_rcu(&event->hlist_entry); } static void perf_swevent_start(struct perf_event *event, int flags) { event->hw.state = 0; } static void perf_swevent_stop(struct perf_event *event, int flags) { event->hw.state = PERF_HES_STOPPED; } /* Deref the hlist from the update side */ static inline struct swevent_hlist * swevent_hlist_deref(struct swevent_htable *swhash) { return rcu_dereference_protected(swhash->swevent_hlist, lockdep_is_held(&swhash->hlist_mutex)); } static void swevent_hlist_release(struct swevent_htable *swhash) { struct swevent_hlist *hlist = swevent_hlist_deref(swhash); if (!hlist) return; RCU_INIT_POINTER(swhash->swevent_hlist, NULL); kfree_rcu(hlist, rcu_head); } static void swevent_hlist_put_cpu(int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); mutex_lock(&swhash->hlist_mutex); if (!--swhash->hlist_refcount) swevent_hlist_release(swhash); mutex_unlock(&swhash->hlist_mutex); } static void swevent_hlist_put(void) { int cpu; for_each_possible_cpu(cpu) swevent_hlist_put_cpu(cpu); } static int swevent_hlist_get_cpu(int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); int err = 0; mutex_lock(&swhash->hlist_mutex); if (!swevent_hlist_deref(swhash) && cpumask_test_cpu(cpu, perf_online_mask)) { struct swevent_hlist *hlist; hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); if (!hlist) { err = -ENOMEM; goto exit; } rcu_assign_pointer(swhash->swevent_hlist, hlist); } swhash->hlist_refcount++; exit: mutex_unlock(&swhash->hlist_mutex); return err; } static int swevent_hlist_get(void) { int err, cpu, failed_cpu; mutex_lock(&pmus_lock); for_each_possible_cpu(cpu) { err = swevent_hlist_get_cpu(cpu); if (err) { failed_cpu = cpu; goto fail; } } mutex_unlock(&pmus_lock); return 0; fail: for_each_possible_cpu(cpu) { if (cpu == failed_cpu) break; swevent_hlist_put_cpu(cpu); } mutex_unlock(&pmus_lock); return err; } struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; static void sw_perf_event_destroy(struct perf_event *event) { u64 event_id = event->attr.config; WARN_ON(event->parent); static_key_slow_dec(&perf_swevent_enabled[event_id]); swevent_hlist_put(); } static struct pmu perf_cpu_clock; /* fwd declaration */ static struct pmu perf_task_clock; static int perf_swevent_init(struct perf_event *event) { u64 event_id = event->attr.config; if (event->attr.type != PERF_TYPE_SOFTWARE) return -ENOENT; /* * no branch sampling for software events */ if (has_branch_stack(event)) return -EOPNOTSUPP; switch (event_id) { case PERF_COUNT_SW_CPU_CLOCK: event->attr.type = perf_cpu_clock.type; return -ENOENT; case PERF_COUNT_SW_TASK_CLOCK: event->attr.type = perf_task_clock.type; return -ENOENT; default: break; } if (event_id >= PERF_COUNT_SW_MAX) return -ENOENT; if (!event->parent) { int err; err = swevent_hlist_get(); if (err) return err; static_key_slow_inc(&perf_swevent_enabled[event_id]); event->destroy = sw_perf_event_destroy; } return 0; } static struct pmu perf_swevent = { .task_ctx_nr = perf_sw_context, .capabilities = PERF_PMU_CAP_NO_NMI, .event_init = perf_swevent_init, .add = perf_swevent_add, .del = perf_swevent_del, .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, }; #ifdef CONFIG_EVENT_TRACING static void tp_perf_event_destroy(struct perf_event *event) { perf_trace_destroy(event); } static int perf_tp_event_init(struct perf_event *event) { int err; if (event->attr.type != PERF_TYPE_TRACEPOINT) return -ENOENT; /* * no branch sampling for tracepoint events */ if (has_branch_stack(event)) return -EOPNOTSUPP; err = perf_trace_init(event); if (err) return err; event->destroy = tp_perf_event_destroy; return 0; } static struct pmu perf_tracepoint = { .task_ctx_nr = perf_sw_context, .event_init = perf_tp_event_init, .add = perf_trace_add, .del = perf_trace_del, .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, }; static int perf_tp_filter_match(struct perf_event *event, struct perf_sample_data *data) { void *record = data->raw->frag.data; /* only top level events have filters set */ if (event->parent) event = event->parent; if (likely(!event->filter) || filter_match_preds(event->filter, record)) return 1; return 0; } static int perf_tp_event_match(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { if (event->hw.state & PERF_HES_STOPPED) return 0; /* * If exclude_kernel, only trace user-space tracepoints (uprobes) */ if (event->attr.exclude_kernel && !user_mode(regs)) return 0; if (!perf_tp_filter_match(event, data)) return 0; return 1; } void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, struct trace_event_call *call, u64 count, struct pt_regs *regs, struct hlist_head *head, struct task_struct *task) { if (bpf_prog_array_valid(call)) { *(struct pt_regs **)raw_data = regs; if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) { perf_swevent_put_recursion_context(rctx); return; } } perf_tp_event(call->event.type, count, raw_data, size, regs, head, rctx, task); } EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit); static void __perf_tp_event_target_task(u64 count, void *record, struct pt_regs *regs, struct perf_sample_data *data, struct perf_event *event) { struct trace_entry *entry = record; if (event->attr.config != entry->type) return; /* Cannot deliver synchronous signal to other task. */ if (event->attr.sigtrap) return; if (perf_tp_event_match(event, data, regs)) perf_swevent_event(event, count, data, regs); } static void perf_tp_event_target_task(u64 count, void *record, struct pt_regs *regs, struct perf_sample_data *data, struct perf_event_context *ctx) { unsigned int cpu = smp_processor_id(); struct pmu *pmu = &perf_tracepoint; struct perf_event *event, *sibling; perf_event_groups_for_cpu_pmu(event, &ctx->pinned_groups, cpu, pmu) { __perf_tp_event_target_task(count, record, regs, data, event); for_each_sibling_event(sibling, event) __perf_tp_event_target_task(count, record, regs, data, sibling); } perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, cpu, pmu) { __perf_tp_event_target_task(count, record, regs, data, event); for_each_sibling_event(sibling, event) __perf_tp_event_target_task(count, record, regs, data, sibling); } } void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, struct task_struct *task) { struct perf_sample_data data; struct perf_event *event; struct perf_raw_record raw = { .frag = { .size = entry_size, .data = record, }, }; perf_sample_data_init(&data, 0, 0); perf_sample_save_raw_data(&data, &raw); perf_trace_buf_update(record, event_type); hlist_for_each_entry_rcu(event, head, hlist_entry) { if (perf_tp_event_match(event, &data, regs)) { perf_swevent_event(event, count, &data, regs); /* * Here use the same on-stack perf_sample_data, * some members in data are event-specific and * need to be re-computed for different sweveents. * Re-initialize data->sample_flags safely to avoid * the problem that next event skips preparing data * because data->sample_flags is set. */ perf_sample_data_init(&data, 0, 0); perf_sample_save_raw_data(&data, &raw); } } /* * If we got specified a target task, also iterate its context and * deliver this event there too. */ if (task && task != current) { struct perf_event_context *ctx; rcu_read_lock(); ctx = rcu_dereference(task->perf_event_ctxp); if (!ctx) goto unlock; raw_spin_lock(&ctx->lock); perf_tp_event_target_task(count, record, regs, &data, ctx); raw_spin_unlock(&ctx->lock); unlock: rcu_read_unlock(); } perf_swevent_put_recursion_context(rctx); } EXPORT_SYMBOL_GPL(perf_tp_event); #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) /* * Flags in config, used by dynamic PMU kprobe and uprobe * The flags should match following PMU_FORMAT_ATTR(). * * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe * if not set, create kprobe/uprobe * * The following values specify a reference counter (or semaphore in the * terminology of tools like dtrace, systemtap, etc.) Userspace Statically * Defined Tracepoints (USDT). Currently, we use 40 bit for the offset. * * PERF_UPROBE_REF_CTR_OFFSET_BITS # of bits in config as th offset * PERF_UPROBE_REF_CTR_OFFSET_SHIFT # of bits to shift left */ enum perf_probe_config { PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0, /* [k,u]retprobe */ PERF_UPROBE_REF_CTR_OFFSET_BITS = 32, PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS, }; PMU_FORMAT_ATTR(retprobe, "config:0"); #endif #ifdef CONFIG_KPROBE_EVENTS static struct attribute *kprobe_attrs[] = { &format_attr_retprobe.attr, NULL, }; static struct attribute_group kprobe_format_group = { .name = "format", .attrs = kprobe_attrs, }; static const struct attribute_group *kprobe_attr_groups[] = { &kprobe_format_group, NULL, }; static int perf_kprobe_event_init(struct perf_event *event); static struct pmu perf_kprobe = { .task_ctx_nr = perf_sw_context, .event_init = perf_kprobe_event_init, .add = perf_trace_add, .del = perf_trace_del, .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, .attr_groups = kprobe_attr_groups, }; static int perf_kprobe_event_init(struct perf_event *event) { int err; bool is_retprobe; if (event->attr.type != perf_kprobe.type) return -ENOENT; if (!perfmon_capable()) return -EACCES; /* * no branch sampling for probe events */ if (has_branch_stack(event)) return -EOPNOTSUPP; is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; err = perf_kprobe_init(event, is_retprobe); if (err) return err; event->destroy = perf_kprobe_destroy; return 0; } #endif /* CONFIG_KPROBE_EVENTS */ #ifdef CONFIG_UPROBE_EVENTS PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63"); static struct attribute *uprobe_attrs[] = { &format_attr_retprobe.attr, &format_attr_ref_ctr_offset.attr, NULL, }; static struct attribute_group uprobe_format_group = { .name = "format", .attrs = uprobe_attrs, }; static const struct attribute_group *uprobe_attr_groups[] = { &uprobe_format_group, NULL, }; static int perf_uprobe_event_init(struct perf_event *event); static struct pmu perf_uprobe = { .task_ctx_nr = perf_sw_context, .event_init = perf_uprobe_event_init, .add = perf_trace_add, .del = perf_trace_del, .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, .attr_groups = uprobe_attr_groups, }; static int perf_uprobe_event_init(struct perf_event *event) { int err; unsigned long ref_ctr_offset; bool is_retprobe; if (event->attr.type != perf_uprobe.type) return -ENOENT; if (!perfmon_capable()) return -EACCES; /* * no branch sampling for probe events */ if (has_branch_stack(event)) return -EOPNOTSUPP; is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT; err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe); if (err) return err; event->destroy = perf_uprobe_destroy; return 0; } #endif /* CONFIG_UPROBE_EVENTS */ static inline void perf_tp_register(void) { perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); #ifdef CONFIG_KPROBE_EVENTS perf_pmu_register(&perf_kprobe, "kprobe", -1); #endif #ifdef CONFIG_UPROBE_EVENTS perf_pmu_register(&perf_uprobe, "uprobe", -1); #endif } static void perf_event_free_filter(struct perf_event *event) { ftrace_profile_free_filter(event); } #ifdef CONFIG_BPF_SYSCALL static void bpf_overflow_handler(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { struct bpf_perf_event_data_kern ctx = { .data = data, .event = event, }; struct bpf_prog *prog; int ret = 0; ctx.regs = perf_arch_bpf_user_pt_regs(regs); if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) goto out; rcu_read_lock(); prog = READ_ONCE(event->prog); if (prog) { perf_prepare_sample(data, event, regs); ret = bpf_prog_run(prog, &ctx); } rcu_read_unlock(); out: __this_cpu_dec(bpf_prog_active); if (!ret) return; event->orig_overflow_handler(event, data, regs); } static int perf_event_set_bpf_handler(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { if (event->overflow_handler_context) /* hw breakpoint or kernel counter */ return -EINVAL; if (event->prog) return -EEXIST; if (prog->type != BPF_PROG_TYPE_PERF_EVENT) return -EINVAL; if (event->attr.precise_ip && prog->call_get_stack && (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) || event->attr.exclude_callchain_kernel || event->attr.exclude_callchain_user)) { /* * On perf_event with precise_ip, calling bpf_get_stack() * may trigger unwinder warnings and occasional crashes. * bpf_get_[stack|stackid] works around this issue by using * callchain attached to perf_sample_data. If the * perf_event does not full (kernel and user) callchain * attached to perf_sample_data, do not allow attaching BPF * program that calls bpf_get_[stack|stackid]. */ return -EPROTO; } event->prog = prog; event->bpf_cookie = bpf_cookie; event->orig_overflow_handler = READ_ONCE(event->overflow_handler); WRITE_ONCE(event->overflow_handler, bpf_overflow_handler); return 0; } static void perf_event_free_bpf_handler(struct perf_event *event) { struct bpf_prog *prog = event->prog; if (!prog) return; WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler); event->prog = NULL; bpf_prog_put(prog); } #else static int perf_event_set_bpf_handler(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { return -EOPNOTSUPP; } static void perf_event_free_bpf_handler(struct perf_event *event) { } #endif /* * returns true if the event is a tracepoint, or a kprobe/upprobe created * with perf_event_open() */ static inline bool perf_event_is_tracing(struct perf_event *event) { if (event->pmu == &perf_tracepoint) return true; #ifdef CONFIG_KPROBE_EVENTS if (event->pmu == &perf_kprobe) return true; #endif #ifdef CONFIG_UPROBE_EVENTS if (event->pmu == &perf_uprobe) return true; #endif return false; } int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp; if (!perf_event_is_tracing(event)) return perf_event_set_bpf_handler(event, prog, bpf_cookie); is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_KPROBE; is_uprobe = event->tp_event->flags & TRACE_EVENT_FL_UPROBE; is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; is_syscall_tp = is_syscall_trace_event(event->tp_event); if (!is_kprobe && !is_uprobe && !is_tracepoint && !is_syscall_tp) /* bpf programs can only be attached to u/kprobe or tracepoint */ return -EINVAL; if (((is_kprobe || is_uprobe) && prog->type != BPF_PROG_TYPE_KPROBE) || (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) || (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) return -EINVAL; if (prog->type == BPF_PROG_TYPE_KPROBE && prog->aux->sleepable && !is_uprobe) /* only uprobe programs are allowed to be sleepable */ return -EINVAL; /* Kprobe override only works for kprobes, not uprobes. */ if (prog->kprobe_override && !is_kprobe) return -EINVAL; if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); if (prog->aux->max_ctx_offset > off) return -EACCES; } return perf_event_attach_bpf_prog(event, prog, bpf_cookie); } void perf_event_free_bpf_prog(struct perf_event *event) { if (!perf_event_is_tracing(event)) { perf_event_free_bpf_handler(event); return; } perf_event_detach_bpf_prog(event); } #else static inline void perf_tp_register(void) { } static void perf_event_free_filter(struct perf_event *event) { } int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { return -ENOENT; } void perf_event_free_bpf_prog(struct perf_event *event) { } #endif /* CONFIG_EVENT_TRACING */ #ifdef CONFIG_HAVE_HW_BREAKPOINT void perf_bp_event(struct perf_event *bp, void *data) { struct perf_sample_data sample; struct pt_regs *regs = data; perf_sample_data_init(&sample, bp->attr.bp_addr, 0); if (!bp->hw.state && !perf_exclude_event(bp, regs)) perf_swevent_event(bp, 1, &sample, regs); } #endif /* * Allocate a new address filter */ static struct perf_addr_filter * perf_addr_filter_new(struct perf_event *event, struct list_head *filters) { int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu); struct perf_addr_filter *filter; filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node); if (!filter) return NULL; INIT_LIST_HEAD(&filter->entry); list_add_tail(&filter->entry, filters); return filter; } static void free_filters_list(struct list_head *filters) { struct perf_addr_filter *filter, *iter; list_for_each_entry_safe(filter, iter, filters, entry) { path_put(&filter->path); list_del(&filter->entry); kfree(filter); } } /* * Free existing address filters and optionally install new ones */ static void perf_addr_filters_splice(struct perf_event *event, struct list_head *head) { unsigned long flags; LIST_HEAD(list); if (!has_addr_filter(event)) return; /* don't bother with children, they don't have their own filters */ if (event->parent) return; raw_spin_lock_irqsave(&event->addr_filters.lock, flags); list_splice_init(&event->addr_filters.list, &list); if (head) list_splice(head, &event->addr_filters.list); raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags); free_filters_list(&list); } /* * Scan through mm's vmas and see if one of them matches the * @filter; if so, adjust filter's address range. * Called with mm::mmap_lock down for reading. */ static void perf_addr_filter_apply(struct perf_addr_filter *filter, struct mm_struct *mm, struct perf_addr_filter_range *fr) { struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) { if (!vma->vm_file) continue; if (perf_addr_filter_vma_adjust(filter, vma, fr)) return; } } /* * Update event's address range filters based on the * task's existing mappings, if any. */ static void perf_event_addr_filters_apply(struct perf_event *event) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); struct task_struct *task = READ_ONCE(event->ctx->task); struct perf_addr_filter *filter; struct mm_struct *mm = NULL; unsigned int count = 0; unsigned long flags; /* * We may observe TASK_TOMBSTONE, which means that the event tear-down * will stop on the parent's child_mutex that our caller is also holding */ if (task == TASK_TOMBSTONE) return; if (ifh->nr_file_filters) { mm = get_task_mm(task); if (!mm) goto restart; mmap_read_lock(mm); } raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { if (filter->path.dentry) { /* * Adjust base offset if the filter is associated to a * binary that needs to be mapped: */ event->addr_filter_ranges[count].start = 0; event->addr_filter_ranges[count].size = 0; perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]); } else { event->addr_filter_ranges[count].start = filter->offset; event->addr_filter_ranges[count].size = filter->size; } count++; } event->addr_filters_gen++; raw_spin_unlock_irqrestore(&ifh->lock, flags); if (ifh->nr_file_filters) { mmap_read_unlock(mm); mmput(mm); } restart: perf_event_stop(event, 1); } /* * Address range filtering: limiting the data to certain * instruction address ranges. Filters are ioctl()ed to us from * userspace as ascii strings. * * Filter string format: * * ACTION RANGE_SPEC * where ACTION is one of the * * "filter": limit the trace to this region * * "start": start tracing from this address * * "stop": stop tracing at this address/region; * RANGE_SPEC is * * for kernel addresses: <start address>[/<size>] * * for object files: <start address>[/<size>]@</path/to/object/file> * * if <size> is not specified or is zero, the range is treated as a single * address; not valid for ACTION=="filter". */ enum { IF_ACT_NONE = -1, IF_ACT_FILTER, IF_ACT_START, IF_ACT_STOP, IF_SRC_FILE, IF_SRC_KERNEL, IF_SRC_FILEADDR, IF_SRC_KERNELADDR, }; enum { IF_STATE_ACTION = 0, IF_STATE_SOURCE, IF_STATE_END, }; static const match_table_t if_tokens = { { IF_ACT_FILTER, "filter" }, { IF_ACT_START, "start" }, { IF_ACT_STOP, "stop" }, { IF_SRC_FILE, "%u/%u@%s" }, { IF_SRC_KERNEL, "%u/%u" }, { IF_SRC_FILEADDR, "%u@%s" }, { IF_SRC_KERNELADDR, "%u" }, { IF_ACT_NONE, NULL }, }; /* * Address filter string parser */ static int perf_event_parse_addr_filter(struct perf_event *event, char *fstr, struct list_head *filters) { struct perf_addr_filter *filter = NULL; char *start, *orig, *filename = NULL; substring_t args[MAX_OPT_ARGS]; int state = IF_STATE_ACTION, token; unsigned int kernel = 0; int ret = -EINVAL; orig = fstr = kstrdup(fstr, GFP_KERNEL); if (!fstr) return -ENOMEM; while ((start = strsep(&fstr, " ,\n")) != NULL) { static const enum perf_addr_filter_action_t actions[] = { [IF_ACT_FILTER] = PERF_ADDR_FILTER_ACTION_FILTER, [IF_ACT_START] = PERF_ADDR_FILTER_ACTION_START, [IF_ACT_STOP] = PERF_ADDR_FILTER_ACTION_STOP, }; ret = -EINVAL; if (!*start) continue; /* filter definition begins */ if (state == IF_STATE_ACTION) { filter = perf_addr_filter_new(event, filters); if (!filter) goto fail; } token = match_token(start, if_tokens, args); switch (token) { case IF_ACT_FILTER: case IF_ACT_START: case IF_ACT_STOP: if (state != IF_STATE_ACTION) goto fail; filter->action = actions[token]; state = IF_STATE_SOURCE; break; case IF_SRC_KERNELADDR: case IF_SRC_KERNEL: kernel = 1; fallthrough; case IF_SRC_FILEADDR: case IF_SRC_FILE: if (state != IF_STATE_SOURCE) goto fail; *args[0].to = 0; ret = kstrtoul(args[0].from, 0, &filter->offset); if (ret) goto fail; if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) { *args[1].to = 0; ret = kstrtoul(args[1].from, 0, &filter->size); if (ret) goto fail; } if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) { int fpos = token == IF_SRC_FILE ? 2 : 1; kfree(filename); filename = match_strdup(&args[fpos]); if (!filename) { ret = -ENOMEM; goto fail; } } state = IF_STATE_END; break; default: goto fail; } /* * Filter definition is fully parsed, validate and install it. * Make sure that it doesn't contradict itself or the event's * attribute. */ if (state == IF_STATE_END) { ret = -EINVAL; /* * ACTION "filter" must have a non-zero length region * specified. */ if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER && !filter->size) goto fail; if (!kernel) { if (!filename) goto fail; /* * For now, we only support file-based filters * in per-task events; doing so for CPU-wide * events requires additional context switching * trickery, since same object code will be * mapped at different virtual addresses in * different processes. */ ret = -EOPNOTSUPP; if (!event->ctx->task) goto fail; /* look up the path and grab its inode */ ret = kern_path(filename, LOOKUP_FOLLOW, &filter->path); if (ret) goto fail; ret = -EINVAL; if (!filter->path.dentry || !S_ISREG(d_inode(filter->path.dentry) ->i_mode)) goto fail; event->addr_filters.nr_file_filters++; } /* ready to consume more filters */ kfree(filename); filename = NULL; state = IF_STATE_ACTION; filter = NULL; kernel = 0; } } if (state != IF_STATE_ACTION) goto fail; kfree(filename); kfree(orig); return 0; fail: kfree(filename); free_filters_list(filters); kfree(orig); return ret; } static int perf_event_set_addr_filter(struct perf_event *event, char *filter_str) { LIST_HEAD(filters); int ret; /* * Since this is called in perf_ioctl() path, we're already holding * ctx::mutex. */ lockdep_assert_held(&event->ctx->mutex); if (WARN_ON_ONCE(event->parent)) return -EINVAL; ret = perf_event_parse_addr_filter(event, filter_str, &filters); if (ret) goto fail_clear_files; ret = event->pmu->addr_filters_validate(&filters); if (ret) goto fail_free_filters; /* remove existing filters, if any */ perf_addr_filters_splice(event, &filters); /* install new filters */ perf_event_for_each_child(event, perf_event_addr_filters_apply); return ret; fail_free_filters: free_filters_list(&filters); fail_clear_files: event->addr_filters.nr_file_filters = 0; return ret; } static int perf_event_set_filter(struct perf_event *event, void __user *arg) { int ret = -EINVAL; char *filter_str; filter_str = strndup_user(arg, PAGE_SIZE); if (IS_ERR(filter_str)) return PTR_ERR(filter_str); #ifdef CONFIG_EVENT_TRACING if (perf_event_is_tracing(event)) { struct perf_event_context *ctx = event->ctx; /* * Beware, here be dragons!! * * the tracepoint muck will deadlock against ctx->mutex, but * the tracepoint stuff does not actually need it. So * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we * already have a reference on ctx. * * This can result in event getting moved to a different ctx, * but that does not affect the tracepoint state. */ mutex_unlock(&ctx->mutex); ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); mutex_lock(&ctx->mutex); } else #endif if (has_addr_filter(event)) ret = perf_event_set_addr_filter(event, filter_str); kfree(filter_str); return ret; } /* * hrtimer based swevent callback */ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) { enum hrtimer_restart ret = HRTIMER_RESTART; struct perf_sample_data data; struct pt_regs *regs; struct perf_event *event; u64 period; event = container_of(hrtimer, struct perf_event, hw.hrtimer); if (event->state != PERF_EVENT_STATE_ACTIVE) return HRTIMER_NORESTART; event->pmu->read(event); perf_sample_data_init(&data, 0, event->hw.last_period); regs = get_irq_regs(); if (regs && !perf_exclude_event(event, regs)) { if (!(event->attr.exclude_idle && is_idle_task(current))) if (__perf_event_overflow(event, 1, &data, regs)) ret = HRTIMER_NORESTART; } period = max_t(u64, 10000, event->hw.sample_period); hrtimer_forward_now(hrtimer, ns_to_ktime(period)); return ret; } static void perf_swevent_start_hrtimer(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; s64 period; if (!is_sampling_event(event)) return; period = local64_read(&hwc->period_left); if (period) { if (period < 0) period = 10000; local64_set(&hwc->period_left, 0); } else { period = max_t(u64, 10000, hwc->sample_period); } hrtimer_start(&hwc->hrtimer, ns_to_ktime(period), HRTIMER_MODE_REL_PINNED_HARD); } static void perf_swevent_cancel_hrtimer(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; if (is_sampling_event(event)) { ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); local64_set(&hwc->period_left, ktime_to_ns(remaining)); hrtimer_cancel(&hwc->hrtimer); } } static void perf_swevent_init_hrtimer(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; if (!is_sampling_event(event)) return; hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); hwc->hrtimer.function = perf_swevent_hrtimer; /* * Since hrtimers have a fixed rate, we can do a static freq->period * mapping and avoid the whole period adjust feedback stuff. */ if (event->attr.freq) { long freq = event->attr.sample_freq; event->attr.sample_period = NSEC_PER_SEC / freq; hwc->sample_period = event->attr.sample_period; local64_set(&hwc->period_left, hwc->sample_period); hwc->last_period = hwc->sample_period; event->attr.freq = 0; } } /* * Software event: cpu wall time clock */ static void cpu_clock_event_update(struct perf_event *event) { s64 prev; u64 now; now = local_clock(); prev = local64_xchg(&event->hw.prev_count, now); local64_add(now - prev, &event->count); } static void cpu_clock_event_start(struct perf_event *event, int flags) { local64_set(&event->hw.prev_count, local_clock()); perf_swevent_start_hrtimer(event); } static void cpu_clock_event_stop(struct perf_event *event, int flags) { perf_swevent_cancel_hrtimer(event); cpu_clock_event_update(event); } static int cpu_clock_event_add(struct perf_event *event, int flags) { if (flags & PERF_EF_START) cpu_clock_event_start(event, flags); perf_event_update_userpage(event); return 0; } static void cpu_clock_event_del(struct perf_event *event, int flags) { cpu_clock_event_stop(event, flags); } static void cpu_clock_event_read(struct perf_event *event) { cpu_clock_event_update(event); } static int cpu_clock_event_init(struct perf_event *event) { if (event->attr.type != perf_cpu_clock.type) return -ENOENT; if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) return -ENOENT; /* * no branch sampling for software events */ if (has_branch_stack(event)) return -EOPNOTSUPP; perf_swevent_init_hrtimer(event); return 0; } static struct pmu perf_cpu_clock = { .task_ctx_nr = perf_sw_context, .capabilities = PERF_PMU_CAP_NO_NMI, .dev = PMU_NULL_DEV, .event_init = cpu_clock_event_init, .add = cpu_clock_event_add, .del = cpu_clock_event_del, .start = cpu_clock_event_start, .stop = cpu_clock_event_stop, .read = cpu_clock_event_read, }; /* * Software event: task time clock */ static void task_clock_event_update(struct perf_event *event, u64 now) { u64 prev; s64 delta; prev = local64_xchg(&event->hw.prev_count, now); delta = now - prev; local64_add(delta, &event->count); } static void task_clock_event_start(struct perf_event *event, int flags) { local64_set(&event->hw.prev_count, event->ctx->time); perf_swevent_start_hrtimer(event); } static void task_clock_event_stop(struct perf_event *event, int flags) { perf_swevent_cancel_hrtimer(event); task_clock_event_update(event, event->ctx->time); } static int task_clock_event_add(struct perf_event *event, int flags) { if (flags & PERF_EF_START) task_clock_event_start(event, flags); perf_event_update_userpage(event); return 0; } static void task_clock_event_del(struct perf_event *event, int flags) { task_clock_event_stop(event, PERF_EF_UPDATE); } static void task_clock_event_read(struct perf_event *event) { u64 now = perf_clock(); u64 delta = now - event->ctx->timestamp; u64 time = event->ctx->time + delta; task_clock_event_update(event, time); } static int task_clock_event_init(struct perf_event *event) { if (event->attr.type != perf_task_clock.type) return -ENOENT; if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) return -ENOENT; /* * no branch sampling for software events */ if (has_branch_stack(event)) return -EOPNOTSUPP; perf_swevent_init_hrtimer(event); return 0; } static struct pmu perf_task_clock = { .task_ctx_nr = perf_sw_context, .capabilities = PERF_PMU_CAP_NO_NMI, .dev = PMU_NULL_DEV, .event_init = task_clock_event_init, .add = task_clock_event_add, .del = task_clock_event_del, .start = task_clock_event_start, .stop = task_clock_event_stop, .read = task_clock_event_read, }; static void perf_pmu_nop_void(struct pmu *pmu) { } static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags) { } static int perf_pmu_nop_int(struct pmu *pmu) { return 0; } static int perf_event_nop_int(struct perf_event *event, u64 value) { return 0; } static DEFINE_PER_CPU(unsigned int, nop_txn_flags); static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags) { __this_cpu_write(nop_txn_flags, flags); if (flags & ~PERF_PMU_TXN_ADD) return; perf_pmu_disable(pmu); } static int perf_pmu_commit_txn(struct pmu *pmu) { unsigned int flags = __this_cpu_read(nop_txn_flags); __this_cpu_write(nop_txn_flags, 0); if (flags & ~PERF_PMU_TXN_ADD) return 0; perf_pmu_enable(pmu); return 0; } static void perf_pmu_cancel_txn(struct pmu *pmu) { unsigned int flags = __this_cpu_read(nop_txn_flags); __this_cpu_write(nop_txn_flags, 0); if (flags & ~PERF_PMU_TXN_ADD) return; perf_pmu_enable(pmu); } static int perf_event_idx_default(struct perf_event *event) { return 0; } static void free_pmu_context(struct pmu *pmu) { free_percpu(pmu->cpu_pmu_context); } /* * Let userspace know that this PMU supports address range filtering: */ static ssize_t nr_addr_filters_show(struct device *dev, struct device_attribute *attr, char *page) { struct pmu *pmu = dev_get_drvdata(dev); return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); } DEVICE_ATTR_RO(nr_addr_filters); static struct idr pmu_idr; static ssize_t type_show(struct device *dev, struct device_attribute *attr, char *page) { struct pmu *pmu = dev_get_drvdata(dev); return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->type); } static DEVICE_ATTR_RO(type); static ssize_t perf_event_mux_interval_ms_show(struct device *dev, struct device_attribute *attr, char *page) { struct pmu *pmu = dev_get_drvdata(dev); return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->hrtimer_interval_ms); } static DEFINE_MUTEX(mux_interval_mutex); static ssize_t perf_event_mux_interval_ms_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct pmu *pmu = dev_get_drvdata(dev); int timer, cpu, ret; ret = kstrtoint(buf, 0, &timer); if (ret) return ret; if (timer < 1) return -EINVAL; /* same value, noting to do */ if (timer == pmu->hrtimer_interval_ms) return count; mutex_lock(&mux_interval_mutex); pmu->hrtimer_interval_ms = timer; /* update all cpuctx for this PMU */ cpus_read_lock(); for_each_online_cpu(cpu) { struct perf_cpu_pmu_context *cpc; cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu); cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpc); } cpus_read_unlock(); mutex_unlock(&mux_interval_mutex); return count; } static DEVICE_ATTR_RW(perf_event_mux_interval_ms); static struct attribute *pmu_dev_attrs[] = { &dev_attr_type.attr, &dev_attr_perf_event_mux_interval_ms.attr, NULL, }; ATTRIBUTE_GROUPS(pmu_dev); static int pmu_bus_running; static struct bus_type pmu_bus = { .name = "event_source", .dev_groups = pmu_dev_groups, }; static void pmu_dev_release(struct device *dev) { kfree(dev); } static int pmu_dev_alloc(struct pmu *pmu) { int ret = -ENOMEM; pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL); if (!pmu->dev) goto out; pmu->dev->groups = pmu->attr_groups; device_initialize(pmu->dev); dev_set_drvdata(pmu->dev, pmu); pmu->dev->bus = &pmu_bus; pmu->dev->parent = pmu->parent; pmu->dev->release = pmu_dev_release; ret = dev_set_name(pmu->dev, "%s", pmu->name); if (ret) goto free_dev; ret = device_add(pmu->dev); if (ret) goto free_dev; /* For PMUs with address filters, throw in an extra attribute: */ if (pmu->nr_addr_filters) ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters); if (ret) goto del_dev; if (pmu->attr_update) ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update); if (ret) goto del_dev; out: return ret; del_dev: device_del(pmu->dev); free_dev: put_device(pmu->dev); goto out; } static struct lock_class_key cpuctx_mutex; static struct lock_class_key cpuctx_lock; int perf_pmu_register(struct pmu *pmu, const char *name, int type) { int cpu, ret, max = PERF_TYPE_MAX; mutex_lock(&pmus_lock); ret = -ENOMEM; pmu->pmu_disable_count = alloc_percpu(int); if (!pmu->pmu_disable_count) goto unlock; pmu->type = -1; if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) { ret = -EINVAL; goto free_pdc; } pmu->name = name; if (type >= 0) max = type; ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL); if (ret < 0) goto free_pdc; WARN_ON(type >= 0 && ret != type); type = ret; pmu->type = type; if (pmu_bus_running && !pmu->dev) { ret = pmu_dev_alloc(pmu); if (ret) goto free_idr; } ret = -ENOMEM; pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context); if (!pmu->cpu_pmu_context) goto free_dev; for_each_possible_cpu(cpu) { struct perf_cpu_pmu_context *cpc; cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu); __perf_init_event_pmu_context(&cpc->epc, pmu); __perf_mux_hrtimer_init(cpc, cpu); } if (!pmu->start_txn) { if (pmu->pmu_enable) { /* * If we have pmu_enable/pmu_disable calls, install * transaction stubs that use that to try and batch * hardware accesses. */ pmu->start_txn = perf_pmu_start_txn; pmu->commit_txn = perf_pmu_commit_txn; pmu->cancel_txn = perf_pmu_cancel_txn; } else { pmu->start_txn = perf_pmu_nop_txn; pmu->commit_txn = perf_pmu_nop_int; pmu->cancel_txn = perf_pmu_nop_void; } } if (!pmu->pmu_enable) { pmu->pmu_enable = perf_pmu_nop_void; pmu->pmu_disable = perf_pmu_nop_void; } if (!pmu->check_period) pmu->check_period = perf_event_nop_int; if (!pmu->event_idx) pmu->event_idx = perf_event_idx_default; list_add_rcu(&pmu->entry, &pmus); atomic_set(&pmu->exclusive_cnt, 0); ret = 0; unlock: mutex_unlock(&pmus_lock); return ret; free_dev: if (pmu->dev && pmu->dev != PMU_NULL_DEV) { device_del(pmu->dev); put_device(pmu->dev); } free_idr: idr_remove(&pmu_idr, pmu->type); free_pdc: free_percpu(pmu->pmu_disable_count); goto unlock; } EXPORT_SYMBOL_GPL(perf_pmu_register); void perf_pmu_unregister(struct pmu *pmu) { mutex_lock(&pmus_lock); list_del_rcu(&pmu->entry); /* * We dereference the pmu list under both SRCU and regular RCU, so * synchronize against both of those. */ synchronize_srcu(&pmus_srcu); synchronize_rcu(); free_percpu(pmu->pmu_disable_count); idr_remove(&pmu_idr, pmu->type); if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) { if (pmu->nr_addr_filters) device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); device_del(pmu->dev); put_device(pmu->dev); } free_pmu_context(pmu); mutex_unlock(&pmus_lock); } EXPORT_SYMBOL_GPL(perf_pmu_unregister); static inline bool has_extended_regs(struct perf_event *event) { return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) || (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK); } static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) { struct perf_event_context *ctx = NULL; int ret; if (!try_module_get(pmu->module)) return -ENODEV; /* * A number of pmu->event_init() methods iterate the sibling_list to, * for example, validate if the group fits on the PMU. Therefore, * if this is a sibling event, acquire the ctx->mutex to protect * the sibling_list. */ if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) { /* * This ctx->mutex can nest when we're called through * inheritance. See the perf_event_ctx_lock_nested() comment. */ ctx = perf_event_ctx_lock_nested(event->group_leader, SINGLE_DEPTH_NESTING); BUG_ON(!ctx); } event->pmu = pmu; ret = pmu->event_init(event); if (ctx) perf_event_ctx_unlock(event->group_leader, ctx); if (!ret) { if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) && has_extended_regs(event)) ret = -EOPNOTSUPP; if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE && event_has_any_exclude_flag(event)) ret = -EINVAL; if (ret && event->destroy) event->destroy(event); } if (ret) module_put(pmu->module); return ret; } static struct pmu *perf_init_event(struct perf_event *event) { bool extended_type = false; int idx, type, ret; struct pmu *pmu; idx = srcu_read_lock(&pmus_srcu); /* * Save original type before calling pmu->event_init() since certain * pmus overwrites event->attr.type to forward event to another pmu. */ event->orig_type = event->attr.type; /* Try parent's PMU first: */ if (event->parent && event->parent->pmu) { pmu = event->parent->pmu; ret = perf_try_init_event(pmu, event); if (!ret) goto unlock; } /* * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE * are often aliases for PERF_TYPE_RAW. */ type = event->attr.type; if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) { type = event->attr.config >> PERF_PMU_TYPE_SHIFT; if (!type) { type = PERF_TYPE_RAW; } else { extended_type = true; event->attr.config &= PERF_HW_EVENT_MASK; } } again: rcu_read_lock(); pmu = idr_find(&pmu_idr, type); rcu_read_unlock(); if (pmu) { if (event->attr.type != type && type != PERF_TYPE_RAW && !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE)) goto fail; ret = perf_try_init_event(pmu, event); if (ret == -ENOENT && event->attr.type != type && !extended_type) { type = event->attr.type; goto again; } if (ret) pmu = ERR_PTR(ret); goto unlock; } list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) { ret = perf_try_init_event(pmu, event); if (!ret) goto unlock; if (ret != -ENOENT) { pmu = ERR_PTR(ret); goto unlock; } } fail: pmu = ERR_PTR(-ENOENT); unlock: srcu_read_unlock(&pmus_srcu, idx); return pmu; } static void attach_sb_event(struct perf_event *event) { struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); raw_spin_lock(&pel->lock); list_add_rcu(&event->sb_list, &pel->list); raw_spin_unlock(&pel->lock); } /* * We keep a list of all !task (and therefore per-cpu) events * that need to receive side-band records. * * This avoids having to scan all the various PMU per-cpu contexts * looking for them. */ static void account_pmu_sb_event(struct perf_event *event) { if (is_sb_event(event)) attach_sb_event(event); } /* Freq events need the tick to stay alive (see perf_event_task_tick). */ static void account_freq_event_nohz(void) { #ifdef CONFIG_NO_HZ_FULL /* Lock so we don't race with concurrent unaccount */ spin_lock(&nr_freq_lock); if (atomic_inc_return(&nr_freq_events) == 1) tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS); spin_unlock(&nr_freq_lock); #endif } static void account_freq_event(void) { if (tick_nohz_full_enabled()) account_freq_event_nohz(); else atomic_inc(&nr_freq_events); } static void account_event(struct perf_event *event) { bool inc = false; if (event->parent) return; if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB)) inc = true; if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.build_id) atomic_inc(&nr_build_id_events); if (event->attr.comm) atomic_inc(&nr_comm_events); if (event->attr.namespaces) atomic_inc(&nr_namespaces_events); if (event->attr.cgroup) atomic_inc(&nr_cgroup_events); if (event->attr.task) atomic_inc(&nr_task_events); if (event->attr.freq) account_freq_event(); if (event->attr.context_switch) { atomic_inc(&nr_switch_events); inc = true; } if (has_branch_stack(event)) inc = true; if (is_cgroup_event(event)) inc = true; if (event->attr.ksymbol) atomic_inc(&nr_ksymbol_events); if (event->attr.bpf_event) atomic_inc(&nr_bpf_events); if (event->attr.text_poke) atomic_inc(&nr_text_poke_events); if (inc) { /* * We need the mutex here because static_branch_enable() * must complete *before* the perf_sched_count increment * becomes visible. */ if (atomic_inc_not_zero(&perf_sched_count)) goto enabled; mutex_lock(&perf_sched_mutex); if (!atomic_read(&perf_sched_count)) { static_branch_enable(&perf_sched_events); /* * Guarantee that all CPUs observe they key change and * call the perf scheduling hooks before proceeding to * install events that need them. */ synchronize_rcu(); } /* * Now that we have waited for the sync_sched(), allow further * increments to by-pass the mutex. */ atomic_inc(&perf_sched_count); mutex_unlock(&perf_sched_mutex); } enabled: account_pmu_sb_event(event); } /* * Allocate and initialize an event structure */ static struct perf_event * perf_event_alloc(struct perf_event_attr *attr, int cpu, struct task_struct *task, struct perf_event *group_leader, struct perf_event *parent_event, perf_overflow_handler_t overflow_handler, void *context, int cgroup_fd) { struct pmu *pmu; struct perf_event *event; struct hw_perf_event *hwc; long err = -EINVAL; int node; if ((unsigned)cpu >= nr_cpu_ids) { if (!task || cpu != -1) return ERR_PTR(-EINVAL); } if (attr->sigtrap && !task) { /* Requires a task: avoid signalling random tasks. */ return ERR_PTR(-EINVAL); } node = (cpu >= 0) ? cpu_to_node(cpu) : -1; event = kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO, node); if (!event) return ERR_PTR(-ENOMEM); /* * Single events are their own group leaders, with an * empty sibling list: */ if (!group_leader) group_leader = event; mutex_init(&event->child_mutex); INIT_LIST_HEAD(&event->child_list); INIT_LIST_HEAD(&event->event_entry); INIT_LIST_HEAD(&event->sibling_list); INIT_LIST_HEAD(&event->active_list); init_event_group(event); INIT_LIST_HEAD(&event->rb_entry); INIT_LIST_HEAD(&event->active_entry); INIT_LIST_HEAD(&event->addr_filters.list); INIT_HLIST_NODE(&event->hlist_entry); init_waitqueue_head(&event->waitq); init_irq_work(&event->pending_irq, perf_pending_irq); init_task_work(&event->pending_task, perf_pending_task); mutex_init(&event->mmap_mutex); raw_spin_lock_init(&event->addr_filters.lock); atomic_long_set(&event->refcount, 1); event->cpu = cpu; event->attr = *attr; event->group_leader = group_leader; event->pmu = NULL; event->oncpu = -1; event->parent = parent_event; event->ns = get_pid_ns(task_active_pid_ns(current)); event->id = atomic64_inc_return(&perf_event_id); event->state = PERF_EVENT_STATE_INACTIVE; if (parent_event) event->event_caps = parent_event->event_caps; if (task) { event->attach_state = PERF_ATTACH_TASK; /* * XXX pmu::event_init needs to know what task to account to * and we cannot use the ctx information because we need the * pmu before we get a ctx. */ event->hw.target = get_task_struct(task); } event->clock = &local_clock; if (parent_event) event->clock = parent_event->clock; if (!overflow_handler && parent_event) { overflow_handler = parent_event->overflow_handler; context = parent_event->overflow_handler_context; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING) if (overflow_handler == bpf_overflow_handler) { struct bpf_prog *prog = parent_event->prog; bpf_prog_inc(prog); event->prog = prog; event->orig_overflow_handler = parent_event->orig_overflow_handler; } #endif } if (overflow_handler) { event->overflow_handler = overflow_handler; event->overflow_handler_context = context; } else if (is_write_backward(event)){ event->overflow_handler = perf_event_output_backward; event->overflow_handler_context = NULL; } else { event->overflow_handler = perf_event_output_forward; event->overflow_handler_context = NULL; } perf_event__state_init(event); pmu = NULL; hwc = &event->hw; hwc->sample_period = attr->sample_period; if (attr->freq && attr->sample_freq) hwc->sample_period = 1; hwc->last_period = hwc->sample_period; local64_set(&hwc->period_left, hwc->sample_period); /* * We currently do not support PERF_SAMPLE_READ on inherited events. * See perf_output_read(). */ if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ)) goto err_ns; if (!has_branch_stack(event)) event->attr.branch_sample_type = 0; pmu = perf_init_event(event); if (IS_ERR(pmu)) { err = PTR_ERR(pmu); goto err_ns; } /* * Disallow uncore-task events. Similarly, disallow uncore-cgroup * events (they don't make sense as the cgroup will be different * on other CPUs in the uncore mask). */ if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) { err = -EINVAL; goto err_pmu; } if (event->attr.aux_output && !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) { err = -EOPNOTSUPP; goto err_pmu; } if (cgroup_fd != -1) { err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); if (err) goto err_pmu; } err = exclusive_event_init(event); if (err) goto err_pmu; if (has_addr_filter(event)) { event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters, sizeof(struct perf_addr_filter_range), GFP_KERNEL); if (!event->addr_filter_ranges) { err = -ENOMEM; goto err_per_task; } /* * Clone the parent's vma offsets: they are valid until exec() * even if the mm is not shared with the parent. */ if (event->parent) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); raw_spin_lock_irq(&ifh->lock); memcpy(event->addr_filter_ranges, event->parent->addr_filter_ranges, pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range)); raw_spin_unlock_irq(&ifh->lock); } /* force hw sync on the address filters */ event->addr_filters_gen = 1; } if (!event->parent) { if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { err = get_callchain_buffers(attr->sample_max_stack); if (err) goto err_addr_filters; } } err = security_perf_event_alloc(event); if (err) goto err_callchain_buffer; /* symmetric to unaccount_event() in _free_event() */ account_event(event); return event; err_callchain_buffer: if (!event->parent) { if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) put_callchain_buffers(); } err_addr_filters: kfree(event->addr_filter_ranges); err_per_task: exclusive_event_destroy(event); err_pmu: if (is_cgroup_event(event)) perf_detach_cgroup(event); if (event->destroy) event->destroy(event); module_put(pmu->module); err_ns: if (event->hw.target) put_task_struct(event->hw.target); call_rcu(&event->rcu_head, free_event_rcu); return ERR_PTR(err); } static int perf_copy_attr(struct perf_event_attr __user *uattr, struct perf_event_attr *attr) { u32 size; int ret; /* Zero the full structure, so that a short copy will be nice. */ memset(attr, 0, sizeof(*attr)); ret = get_user(size, &uattr->size); if (ret) return ret; /* ABI compatibility quirk: */ if (!size) size = PERF_ATTR_SIZE_VER0; if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE) goto err_size; ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); if (ret) { if (ret == -E2BIG) goto err_size; return ret; } attr->size = size; if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) return -EINVAL; if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) return -EINVAL; if (attr->read_format & ~(PERF_FORMAT_MAX-1)) return -EINVAL; if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) { u64 mask = attr->branch_sample_type; /* only using defined bits */ if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1)) return -EINVAL; /* at least one branch bit must be set */ if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL)) return -EINVAL; /* propagate priv level, when not set for branch */ if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) { /* exclude_kernel checked on syscall entry */ if (!attr->exclude_kernel) mask |= PERF_SAMPLE_BRANCH_KERNEL; if (!attr->exclude_user) mask |= PERF_SAMPLE_BRANCH_USER; if (!attr->exclude_hv) mask |= PERF_SAMPLE_BRANCH_HV; /* * adjust user setting (for HW filter setup) */ attr->branch_sample_type = mask; } /* privileged levels capture (kernel, hv): check permissions */ if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) { ret = perf_allow_kernel(attr); if (ret) return ret; } } if (attr->sample_type & PERF_SAMPLE_REGS_USER) { ret = perf_reg_validate(attr->sample_regs_user); if (ret) return ret; } if (attr->sample_type & PERF_SAMPLE_STACK_USER) { if (!arch_perf_have_user_stack_dump()) return -ENOSYS; /* * We have __u32 type for the size, but so far * we can only use __u16 as maximum due to the * __u16 sample size limit. */ if (attr->sample_stack_user >= USHRT_MAX) return -EINVAL; else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64))) return -EINVAL; } if (!attr->sample_max_stack) attr->sample_max_stack = sysctl_perf_event_max_stack; if (attr->sample_type & PERF_SAMPLE_REGS_INTR) ret = perf_reg_validate(attr->sample_regs_intr); #ifndef CONFIG_CGROUP_PERF if (attr->sample_type & PERF_SAMPLE_CGROUP) return -EINVAL; #endif if ((attr->sample_type & PERF_SAMPLE_WEIGHT) && (attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT)) return -EINVAL; if (!attr->inherit && attr->inherit_thread) return -EINVAL; if (attr->remove_on_exec && attr->enable_on_exec) return -EINVAL; if (attr->sigtrap && !attr->remove_on_exec) return -EINVAL; out: return ret; err_size: put_user(sizeof(*attr), &uattr->size); ret = -E2BIG; goto out; } static void mutex_lock_double(struct mutex *a, struct mutex *b) { if (b < a) swap(a, b); mutex_lock(a); mutex_lock_nested(b, SINGLE_DEPTH_NESTING); } static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event) { struct perf_buffer *rb = NULL; int ret = -EINVAL; if (!output_event) { mutex_lock(&event->mmap_mutex); goto set; } /* don't allow circular references */ if (event == output_event) goto out; /* * Don't allow cross-cpu buffers */ if (output_event->cpu != event->cpu) goto out; /* * If its not a per-cpu rb, it must be the same task. */ if (output_event->cpu == -1 && output_event->hw.target != event->hw.target) goto out; /* * Mixing clocks in the same buffer is trouble you don't need. */ if (output_event->clock != event->clock) goto out; /* * Either writing ring buffer from beginning or from end. * Mixing is not allowed. */ if (is_write_backward(output_event) != is_write_backward(event)) goto out; /* * If both events generate aux data, they must be on the same PMU */ if (has_aux(event) && has_aux(output_event) && event->pmu != output_event->pmu) goto out; /* * Hold both mmap_mutex to serialize against perf_mmap_close(). Since * output_event is already on rb->event_list, and the list iteration * restarts after every removal, it is guaranteed this new event is * observed *OR* if output_event is already removed, it's guaranteed we * observe !rb->mmap_count. */ mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex); set: /* Can't redirect output if we've got an active mmap() */ if (atomic_read(&event->mmap_count)) goto unlock; if (output_event) { /* get the rb we want to redirect to */ rb = ring_buffer_get(output_event); if (!rb) goto unlock; /* did we race against perf_mmap_close() */ if (!atomic_read(&rb->mmap_count)) { ring_buffer_put(rb); goto unlock; } } ring_buffer_attach(event, rb); ret = 0; unlock: mutex_unlock(&event->mmap_mutex); if (output_event) mutex_unlock(&output_event->mmap_mutex); out: return ret; } static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) { bool nmi_safe = false; switch (clk_id) { case CLOCK_MONOTONIC: event->clock = &ktime_get_mono_fast_ns; nmi_safe = true; break; case CLOCK_MONOTONIC_RAW: event->clock = &ktime_get_raw_fast_ns; nmi_safe = true; break; case CLOCK_REALTIME: event->clock = &ktime_get_real_ns; break; case CLOCK_BOOTTIME: event->clock = &ktime_get_boottime_ns; break; case CLOCK_TAI: event->clock = &ktime_get_clocktai_ns; break; default: return -EINVAL; } if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI)) return -EINVAL; return 0; } static bool perf_check_permission(struct perf_event_attr *attr, struct task_struct *task) { unsigned int ptrace_mode = PTRACE_MODE_READ_REALCREDS; bool is_capable = perfmon_capable(); if (attr->sigtrap) { /* * perf_event_attr::sigtrap sends signals to the other task. * Require the current task to also have CAP_KILL. */ rcu_read_lock(); is_capable &= ns_capable(__task_cred(task)->user_ns, CAP_KILL); rcu_read_unlock(); /* * If the required capabilities aren't available, checks for * ptrace permissions: upgrade to ATTACH, since sending signals * can effectively change the target task. */ ptrace_mode = PTRACE_MODE_ATTACH_REALCREDS; } /* * Preserve ptrace permission check for backwards compatibility. The * ptrace check also includes checks that the current task and other * task have matching uids, and is therefore not done here explicitly. */ return is_capable || ptrace_may_access(task, ptrace_mode); } /** * sys_perf_event_open - open a performance event, associate it to a task/cpu * * @attr_uptr: event_id type attributes for monitoring/sampling * @pid: target pid * @cpu: target cpu * @group_fd: group leader event fd * @flags: perf event open flags */ SYSCALL_DEFINE5(perf_event_open, struct perf_event_attr __user *, attr_uptr, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { struct perf_event *group_leader = NULL, *output_event = NULL; struct perf_event_pmu_context *pmu_ctx; struct perf_event *event, *sibling; struct perf_event_attr attr; struct perf_event_context *ctx; struct file *event_file = NULL; struct fd group = {NULL, 0}; struct task_struct *task = NULL; struct pmu *pmu; int event_fd; int move_group = 0; int err; int f_flags = O_RDWR; int cgroup_fd = -1; /* for future expandability... */ if (flags & ~PERF_FLAG_ALL) return -EINVAL; err = perf_copy_attr(attr_uptr, &attr); if (err) return err; /* Do we allow access to perf_event_open(2) ? */ err = security_perf_event_open(&attr, PERF_SECURITY_OPEN); if (err) return err; if (!attr.exclude_kernel) { err = perf_allow_kernel(&attr); if (err) return err; } if (attr.namespaces) { if (!perfmon_capable()) return -EACCES; } if (attr.freq) { if (attr.sample_freq > sysctl_perf_event_sample_rate) return -EINVAL; } else { if (attr.sample_period & (1ULL << 63)) return -EINVAL; } /* Only privileged users can get physical addresses */ if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) { err = perf_allow_kernel(&attr); if (err) return err; } /* REGS_INTR can leak data, lockdown must prevent this */ if (attr.sample_type & PERF_SAMPLE_REGS_INTR) { err = security_locked_down(LOCKDOWN_PERF); if (err) return err; } /* * In cgroup mode, the pid argument is used to pass the fd * opened to the cgroup directory in cgroupfs. The cpu argument * designates the cpu on which to monitor threads from that * cgroup. */ if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) return -EINVAL; if (flags & PERF_FLAG_FD_CLOEXEC) f_flags |= O_CLOEXEC; event_fd = get_unused_fd_flags(f_flags); if (event_fd < 0) return event_fd; if (group_fd != -1) { err = perf_fget_light(group_fd, &group); if (err) goto err_fd; group_leader = group.file->private_data; if (flags & PERF_FLAG_FD_OUTPUT) output_event = group_leader; if (flags & PERF_FLAG_FD_NO_GROUP) group_leader = NULL; } if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) { task = find_lively_task_by_vpid(pid); if (IS_ERR(task)) { err = PTR_ERR(task); goto err_group_fd; } } if (task && group_leader && group_leader->attr.inherit != attr.inherit) { err = -EINVAL; goto err_task; } if (flags & PERF_FLAG_PID_CGROUP) cgroup_fd = pid; event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL, NULL, cgroup_fd); if (IS_ERR(event)) { err = PTR_ERR(event); goto err_task; } if (is_sampling_event(event)) { if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { err = -EOPNOTSUPP; goto err_alloc; } } /* * Special case software events and allow them to be part of * any hardware group. */ pmu = event->pmu; if (attr.use_clockid) { err = perf_event_set_clock(event, attr.clockid); if (err) goto err_alloc; } if (pmu->task_ctx_nr == perf_sw_context) event->event_caps |= PERF_EV_CAP_SOFTWARE; if (task) { err = down_read_interruptible(&task->signal->exec_update_lock); if (err) goto err_alloc; /* * We must hold exec_update_lock across this and any potential * perf_install_in_context() call for this new event to * serialize against exec() altering our credentials (and the * perf_event_exit_task() that could imply). */ err = -EACCES; if (!perf_check_permission(&attr, task)) goto err_cred; } /* * Get the target context (task or percpu): */ ctx = find_get_context(task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto err_cred; } mutex_lock(&ctx->mutex); if (ctx->task == TASK_TOMBSTONE) { err = -ESRCH; goto err_locked; } if (!task) { /* * Check if the @cpu we're creating an event for is online. * * We use the perf_cpu_context::ctx::mutex to serialize against * the hotplug notifiers. See perf_event_{init,exit}_cpu(). */ struct perf_cpu_context *cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu); if (!cpuctx->online) { err = -ENODEV; goto err_locked; } } if (group_leader) { err = -EINVAL; /* * Do not allow a recursive hierarchy (this new sibling * becoming part of another group-sibling): */ if (group_leader->group_leader != group_leader) goto err_locked; /* All events in a group should have the same clock */ if (group_leader->clock != event->clock) goto err_locked; /* * Make sure we're both events for the same CPU; * grouping events for different CPUs is broken; since * you can never concurrently schedule them anyhow. */ if (group_leader->cpu != event->cpu) goto err_locked; /* * Make sure we're both on the same context; either task or cpu. */ if (group_leader->ctx != ctx) goto err_locked; /* * Only a group leader can be exclusive or pinned */ if (attr.exclusive || attr.pinned) goto err_locked; if (is_software_event(event) && !in_software_context(group_leader)) { /* * If the event is a sw event, but the group_leader * is on hw context. * * Allow the addition of software events to hw * groups, this is safe because software events * never fail to schedule. * * Note the comment that goes with struct * perf_event_pmu_context. */ pmu = group_leader->pmu_ctx->pmu; } else if (!is_software_event(event)) { if (is_software_event(group_leader) && (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { /* * In case the group is a pure software group, and we * try to add a hardware event, move the whole group to * the hardware context. */ move_group = 1; } /* Don't allow group of multiple hw events from different pmus */ if (!in_software_context(group_leader) && group_leader->pmu_ctx->pmu != pmu) goto err_locked; } } /* * Now that we're certain of the pmu; find the pmu_ctx. */ pmu_ctx = find_get_pmu_context(pmu, ctx, event); if (IS_ERR(pmu_ctx)) { err = PTR_ERR(pmu_ctx); goto err_locked; } event->pmu_ctx = pmu_ctx; if (output_event) { err = perf_event_set_output(event, output_event); if (err) goto err_context; } if (!perf_event_validate_size(event)) { err = -E2BIG; goto err_context; } if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) { err = -EINVAL; goto err_context; } /* * Must be under the same ctx::mutex as perf_install_in_context(), * because we need to serialize with concurrent event creation. */ if (!exclusive_event_installable(event, ctx)) { err = -EBUSY; goto err_context; } WARN_ON_ONCE(ctx->parent_ctx); event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, f_flags); if (IS_ERR(event_file)) { err = PTR_ERR(event_file); event_file = NULL; goto err_context; } /* * This is the point on no return; we cannot fail hereafter. This is * where we start modifying current state. */ if (move_group) { perf_remove_from_context(group_leader, 0); put_pmu_ctx(group_leader->pmu_ctx); for_each_sibling_event(sibling, group_leader) { perf_remove_from_context(sibling, 0); put_pmu_ctx(sibling->pmu_ctx); } /* * Install the group siblings before the group leader. * * Because a group leader will try and install the entire group * (through the sibling list, which is still in-tact), we can * end up with siblings installed in the wrong context. * * By installing siblings first we NO-OP because they're not * reachable through the group lists. */ for_each_sibling_event(sibling, group_leader) { sibling->pmu_ctx = pmu_ctx; get_pmu_ctx(pmu_ctx); perf_event__state_init(sibling); perf_install_in_context(ctx, sibling, sibling->cpu); } /* * Removing from the context ends up with disabled * event. What we want here is event in the initial * startup state, ready to be add into new context. */ group_leader->pmu_ctx = pmu_ctx; get_pmu_ctx(pmu_ctx); perf_event__state_init(group_leader); perf_install_in_context(ctx, group_leader, group_leader->cpu); } /* * Precalculate sample_data sizes; do while holding ctx::mutex such * that we're serialized against further additions and before * perf_install_in_context() which is the point the event is active and * can use these values. */ perf_event__header_size(event); perf_event__id_header_size(event); event->owner = current; perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); if (task) { up_read(&task->signal->exec_update_lock); put_task_struct(task); } mutex_lock(¤t->perf_event_mutex); list_add_tail(&event->owner_entry, ¤t->perf_event_list); mutex_unlock(¤t->perf_event_mutex); /* * Drop the reference on the group_event after placing the * new event on the sibling_list. This ensures destruction * of the group leader will find the pointer to itself in * perf_group_detach(). */ fdput(group); fd_install(event_fd, event_file); return event_fd; err_context: put_pmu_ctx(event->pmu_ctx); event->pmu_ctx = NULL; /* _free_event() */ err_locked: mutex_unlock(&ctx->mutex); perf_unpin_context(ctx); put_ctx(ctx); err_cred: if (task) up_read(&task->signal->exec_update_lock); err_alloc: free_event(event); err_task: if (task) put_task_struct(task); err_group_fd: fdput(group); err_fd: put_unused_fd(event_fd); return err; } /** * perf_event_create_kernel_counter * * @attr: attributes of the counter to create * @cpu: cpu in which the counter is bound * @task: task to profile (NULL for percpu) * @overflow_handler: callback to trigger when we hit the event * @context: context data could be used in overflow_handler callback */ struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, struct task_struct *task, perf_overflow_handler_t overflow_handler, void *context) { struct perf_event_pmu_context *pmu_ctx; struct perf_event_context *ctx; struct perf_event *event; struct pmu *pmu; int err; /* * Grouping is not supported for kernel events, neither is 'AUX', * make sure the caller's intentions are adjusted. */ if (attr->aux_output) return ERR_PTR(-EINVAL); event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler, context, -1); if (IS_ERR(event)) { err = PTR_ERR(event); goto err; } /* Mark owner so we could distinguish it from user events. */ event->owner = TASK_TOMBSTONE; pmu = event->pmu; if (pmu->task_ctx_nr == perf_sw_context) event->event_caps |= PERF_EV_CAP_SOFTWARE; /* * Get the target context (task or percpu): */ ctx = find_get_context(task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto err_alloc; } WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); if (ctx->task == TASK_TOMBSTONE) { err = -ESRCH; goto err_unlock; } pmu_ctx = find_get_pmu_context(pmu, ctx, event); if (IS_ERR(pmu_ctx)) { err = PTR_ERR(pmu_ctx); goto err_unlock; } event->pmu_ctx = pmu_ctx; if (!task) { /* * Check if the @cpu we're creating an event for is online. * * We use the perf_cpu_context::ctx::mutex to serialize against * the hotplug notifiers. See perf_event_{init,exit}_cpu(). */ struct perf_cpu_context *cpuctx = container_of(ctx, struct perf_cpu_context, ctx); if (!cpuctx->online) { err = -ENODEV; goto err_pmu_ctx; } } if (!exclusive_event_installable(event, ctx)) { err = -EBUSY; goto err_pmu_ctx; } perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); return event; err_pmu_ctx: put_pmu_ctx(pmu_ctx); event->pmu_ctx = NULL; /* _free_event() */ err_unlock: mutex_unlock(&ctx->mutex); perf_unpin_context(ctx); put_ctx(ctx); err_alloc: free_event(event); err: return ERR_PTR(err); } EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); static void __perf_pmu_remove(struct perf_event_context *ctx, int cpu, struct pmu *pmu, struct perf_event_groups *groups, struct list_head *events) { struct perf_event *event, *sibling; perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) { perf_remove_from_context(event, 0); put_pmu_ctx(event->pmu_ctx); list_add(&event->migrate_entry, events); for_each_sibling_event(sibling, event) { perf_remove_from_context(sibling, 0); put_pmu_ctx(sibling->pmu_ctx); list_add(&sibling->migrate_entry, events); } } } static void __perf_pmu_install_event(struct pmu *pmu, struct perf_event_context *ctx, int cpu, struct perf_event *event) { struct perf_event_pmu_context *epc; event->cpu = cpu; epc = find_get_pmu_context(pmu, ctx, event); event->pmu_ctx = epc; if (event->state >= PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_INACTIVE; perf_install_in_context(ctx, event, cpu); } static void __perf_pmu_install(struct perf_event_context *ctx, int cpu, struct pmu *pmu, struct list_head *events) { struct perf_event *event, *tmp; /* * Re-instate events in 2 passes. * * Skip over group leaders and only install siblings on this first * pass, siblings will not get enabled without a leader, however a * leader will enable its siblings, even if those are still on the old * context. */ list_for_each_entry_safe(event, tmp, events, migrate_entry) { if (event->group_leader == event) continue; list_del(&event->migrate_entry); __perf_pmu_install_event(pmu, ctx, cpu, event); } /* * Once all the siblings are setup properly, install the group leaders * to make it go. */ list_for_each_entry_safe(event, tmp, events, migrate_entry) { list_del(&event->migrate_entry); __perf_pmu_install_event(pmu, ctx, cpu, event); } } void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) { struct perf_event_context *src_ctx, *dst_ctx; LIST_HEAD(events); src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx; dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx; /* * See perf_event_ctx_lock() for comments on the details * of swizzling perf_event::ctx. */ mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->pinned_groups, &events); __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->flexible_groups, &events); if (!list_empty(&events)) { /* * Wait for the events to quiesce before re-instating them. */ synchronize_rcu(); __perf_pmu_install(dst_ctx, dst_cpu, pmu, &events); } mutex_unlock(&dst_ctx->mutex); mutex_unlock(&src_ctx->mutex); } EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); static void sync_child_event(struct perf_event *child_event) { struct perf_event *parent_event = child_event->parent; u64 child_val; if (child_event->attr.inherit_stat) { struct task_struct *task = child_event->ctx->task; if (task && task != TASK_TOMBSTONE) perf_event_read_event(child_event, task); } child_val = perf_event_count(child_event); /* * Add back the child's count to the parent's count: */ atomic64_add(child_val, &parent_event->child_count); atomic64_add(child_event->total_time_enabled, &parent_event->child_total_time_enabled); atomic64_add(child_event->total_time_running, &parent_event->child_total_time_running); } static void perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event *parent_event = event->parent; unsigned long detach_flags = 0; if (parent_event) { /* * Do not destroy the 'original' grouping; because of the * context switch optimization the original events could've * ended up in a random child task. * * If we were to destroy the original group, all group related * operations would cease to function properly after this * random child dies. * * Do destroy all inherited groups, we don't care about those * and being thorough is better. */ detach_flags = DETACH_GROUP | DETACH_CHILD; mutex_lock(&parent_event->child_mutex); } perf_remove_from_context(event, detach_flags); raw_spin_lock_irq(&ctx->lock); if (event->state > PERF_EVENT_STATE_EXIT) perf_event_set_state(event, PERF_EVENT_STATE_EXIT); raw_spin_unlock_irq(&ctx->lock); /* * Child events can be freed. */ if (parent_event) { mutex_unlock(&parent_event->child_mutex); /* * Kick perf_poll() for is_event_hup(); */ perf_event_wakeup(parent_event); free_event(event); put_event(parent_event); return; } /* * Parent events are governed by their filedesc, retain them. */ perf_event_wakeup(event); } static void perf_event_exit_task_context(struct task_struct *child) { struct perf_event_context *child_ctx, *clone_ctx = NULL; struct perf_event *child_event, *next; WARN_ON_ONCE(child != current); child_ctx = perf_pin_task_context(child); if (!child_ctx) return; /* * In order to reduce the amount of tricky in ctx tear-down, we hold * ctx::mutex over the entire thing. This serializes against almost * everything that wants to access the ctx. * * The exception is sys_perf_event_open() / * perf_event_create_kernel_count() which does find_get_context() * without ctx::mutex (it cannot because of the move_group double mutex * lock thing). See the comments in perf_install_in_context(). */ mutex_lock(&child_ctx->mutex); /* * In a single ctx::lock section, de-schedule the events and detach the * context from the task such that we cannot ever get it scheduled back * in. */ raw_spin_lock_irq(&child_ctx->lock); task_ctx_sched_out(child_ctx, EVENT_ALL); /* * Now that the context is inactive, destroy the task <-> ctx relation * and mark the context dead. */ RCU_INIT_POINTER(child->perf_event_ctxp, NULL); put_ctx(child_ctx); /* cannot be last */ WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE); put_task_struct(current); /* cannot be last */ clone_ctx = unclone_ctx(child_ctx); raw_spin_unlock_irq(&child_ctx->lock); if (clone_ctx) put_ctx(clone_ctx); /* * Report the task dead after unscheduling the events so that we * won't get any samples after PERF_RECORD_EXIT. We can however still * get a few PERF_RECORD_READ events. */ perf_event_task(child, child_ctx, 0); list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry) perf_event_exit_event(child_event, child_ctx); mutex_unlock(&child_ctx->mutex); put_ctx(child_ctx); } /* * When a child task exits, feed back event values to parent events. * * Can be called with exec_update_lock held when called from * setup_new_exec(). */ void perf_event_exit_task(struct task_struct *child) { struct perf_event *event, *tmp; mutex_lock(&child->perf_event_mutex); list_for_each_entry_safe(event, tmp, &child->perf_event_list, owner_entry) { list_del_init(&event->owner_entry); /* * Ensure the list deletion is visible before we clear * the owner, closes a race against perf_release() where * we need to serialize on the owner->perf_event_mutex. */ smp_store_release(&event->owner, NULL); } mutex_unlock(&child->perf_event_mutex); perf_event_exit_task_context(child); /* * The perf_event_exit_task_context calls perf_event_task * with child's task_ctx, which generates EXIT events for * child contexts and sets child->perf_event_ctxp[] to NULL. * At this point we need to send EXIT events to cpu contexts. */ perf_event_task(child, NULL, 0); } static void perf_free_event(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event *parent = event->parent; if (WARN_ON_ONCE(!parent)) return; mutex_lock(&parent->child_mutex); list_del_init(&event->child_list); mutex_unlock(&parent->child_mutex); put_event(parent); raw_spin_lock_irq(&ctx->lock); perf_group_detach(event); list_del_event(event, ctx); raw_spin_unlock_irq(&ctx->lock); free_event(event); } /* * Free a context as created by inheritance by perf_event_init_task() below, * used by fork() in case of fail. * * Even though the task has never lived, the context and events have been * exposed through the child_list, so we must take care tearing it all down. */ void perf_event_free_task(struct task_struct *task) { struct perf_event_context *ctx; struct perf_event *event, *tmp; ctx = rcu_access_pointer(task->perf_event_ctxp); if (!ctx) return; mutex_lock(&ctx->mutex); raw_spin_lock_irq(&ctx->lock); /* * Destroy the task <-> ctx relation and mark the context dead. * * This is important because even though the task hasn't been * exposed yet the context has been (through child_list). */ RCU_INIT_POINTER(task->perf_event_ctxp, NULL); WRITE_ONCE(ctx->task, TASK_TOMBSTONE); put_task_struct(task); /* cannot be last */ raw_spin_unlock_irq(&ctx->lock); list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) perf_free_event(event, ctx); mutex_unlock(&ctx->mutex); /* * perf_event_release_kernel() could've stolen some of our * child events and still have them on its free_list. In that * case we must wait for these events to have been freed (in * particular all their references to this task must've been * dropped). * * Without this copy_process() will unconditionally free this * task (irrespective of its reference count) and * _free_event()'s put_task_struct(event->hw.target) will be a * use-after-free. * * Wait for all events to drop their context reference. */ wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1); put_ctx(ctx); /* must be last */ } void perf_event_delayed_put(struct task_struct *task) { WARN_ON_ONCE(task->perf_event_ctxp); } struct file *perf_event_get(unsigned int fd) { struct file *file = fget(fd); if (!file) return ERR_PTR(-EBADF); if (file->f_op != &perf_fops) { fput(file); return ERR_PTR(-EBADF); } return file; } const struct perf_event *perf_get_event(struct file *file) { if (file->f_op != &perf_fops) return ERR_PTR(-EINVAL); return file->private_data; } const struct perf_event_attr *perf_event_attrs(struct perf_event *event) { if (!event) return ERR_PTR(-EINVAL); return &event->attr; } /* * Inherit an event from parent task to child task. * * Returns: * - valid pointer on success * - NULL for orphaned events * - IS_ERR() on error */ static struct perf_event * inherit_event(struct perf_event *parent_event, struct task_struct *parent, struct perf_event_context *parent_ctx, struct task_struct *child, struct perf_event *group_leader, struct perf_event_context *child_ctx) { enum perf_event_state parent_state = parent_event->state; struct perf_event_pmu_context *pmu_ctx; struct perf_event *child_event; unsigned long flags; /* * Instead of creating recursive hierarchies of events, * we link inherited events back to the original parent, * which has a filp for sure, which we use as the reference * count: */ if (parent_event->parent) parent_event = parent_event->parent; child_event = perf_event_alloc(&parent_event->attr, parent_event->cpu, child, group_leader, parent_event, NULL, NULL, -1); if (IS_ERR(child_event)) return child_event; pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event); if (IS_ERR(pmu_ctx)) { free_event(child_event); return ERR_CAST(pmu_ctx); } child_event->pmu_ctx = pmu_ctx; /* * is_orphaned_event() and list_add_tail(&parent_event->child_list) * must be under the same lock in order to serialize against * perf_event_release_kernel(), such that either we must observe * is_orphaned_event() or they will observe us on the child_list. */ mutex_lock(&parent_event->child_mutex); if (is_orphaned_event(parent_event) || !atomic_long_inc_not_zero(&parent_event->refcount)) { mutex_unlock(&parent_event->child_mutex); /* task_ctx_data is freed with child_ctx */ free_event(child_event); return NULL; } get_ctx(child_ctx); /* * Make the child state follow the state of the parent event, * not its attr.disabled bit. We hold the parent's mutex, * so we won't race with perf_event_{en, dis}able_family. */ if (parent_state >= PERF_EVENT_STATE_INACTIVE) child_event->state = PERF_EVENT_STATE_INACTIVE; else child_event->state = PERF_EVENT_STATE_OFF; if (parent_event->attr.freq) { u64 sample_period = parent_event->hw.sample_period; struct hw_perf_event *hwc = &child_event->hw; hwc->sample_period = sample_period; hwc->last_period = sample_period; local64_set(&hwc->period_left, sample_period); } child_event->ctx = child_ctx; child_event->overflow_handler = parent_event->overflow_handler; child_event->overflow_handler_context = parent_event->overflow_handler_context; /* * Precalculate sample_data sizes */ perf_event__header_size(child_event); perf_event__id_header_size(child_event); /* * Link it up in the child's context: */ raw_spin_lock_irqsave(&child_ctx->lock, flags); add_event_to_ctx(child_event, child_ctx); child_event->attach_state |= PERF_ATTACH_CHILD; raw_spin_unlock_irqrestore(&child_ctx->lock, flags); /* * Link this into the parent event's child list */ list_add_tail(&child_event->child_list, &parent_event->child_list); mutex_unlock(&parent_event->child_mutex); return child_event; } /* * Inherits an event group. * * This will quietly suppress orphaned events; !inherit_event() is not an error. * This matches with perf_event_release_kernel() removing all child events. * * Returns: * - 0 on success * - <0 on error */ static int inherit_group(struct perf_event *parent_event, struct task_struct *parent, struct perf_event_context *parent_ctx, struct task_struct *child, struct perf_event_context *child_ctx) { struct perf_event *leader; struct perf_event *sub; struct perf_event *child_ctr; leader = inherit_event(parent_event, parent, parent_ctx, child, NULL, child_ctx); if (IS_ERR(leader)) return PTR_ERR(leader); /* * @leader can be NULL here because of is_orphaned_event(). In this * case inherit_event() will create individual events, similar to what * perf_group_detach() would do anyway. */ for_each_sibling_event(sub, parent_event) { child_ctr = inherit_event(sub, parent, parent_ctx, child, leader, child_ctx); if (IS_ERR(child_ctr)) return PTR_ERR(child_ctr); if (sub->aux_event == parent_event && child_ctr && !perf_get_aux_event(child_ctr, leader)) return -EINVAL; } return 0; } /* * Creates the child task context and tries to inherit the event-group. * * Clears @inherited_all on !attr.inherited or error. Note that we'll leave * inherited_all set when we 'fail' to inherit an orphaned event; this is * consistent with perf_event_release_kernel() removing all child events. * * Returns: * - 0 on success * - <0 on error */ static int inherit_task_group(struct perf_event *event, struct task_struct *parent, struct perf_event_context *parent_ctx, struct task_struct *child, u64 clone_flags, int *inherited_all) { struct perf_event_context *child_ctx; int ret; if (!event->attr.inherit || (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) || /* Do not inherit if sigtrap and signal handlers were cleared. */ (event->attr.sigtrap && (clone_flags & CLONE_CLEAR_SIGHAND))) { *inherited_all = 0; return 0; } child_ctx = child->perf_event_ctxp; if (!child_ctx) { /* * This is executed from the parent task context, so * inherit events that have been marked for cloning. * First allocate and initialize a context for the * child. */ child_ctx = alloc_perf_context(child); if (!child_ctx) return -ENOMEM; child->perf_event_ctxp = child_ctx; } ret = inherit_group(event, parent, parent_ctx, child, child_ctx); if (ret) *inherited_all = 0; return ret; } /* * Initialize the perf_event context in task_struct */ static int perf_event_init_context(struct task_struct *child, u64 clone_flags) { struct perf_event_context *child_ctx, *parent_ctx; struct perf_event_context *cloned_ctx; struct perf_event *event; struct task_struct *parent = current; int inherited_all = 1; unsigned long flags; int ret = 0; if (likely(!parent->perf_event_ctxp)) return 0; /* * If the parent's context is a clone, pin it so it won't get * swapped under us. */ parent_ctx = perf_pin_task_context(parent); if (!parent_ctx) return 0; /* * No need to check if parent_ctx != NULL here; since we saw * it non-NULL earlier, the only reason for it to become NULL * is if we exit, and since we're currently in the middle of * a fork we can't be exiting at the same time. */ /* * Lock the parent list. No need to lock the child - not PID * hashed yet and not running, so nobody can access it. */ mutex_lock(&parent_ctx->mutex); /* * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: */ perf_event_groups_for_each(event, &parent_ctx->pinned_groups) { ret = inherit_task_group(event, parent, parent_ctx, child, clone_flags, &inherited_all); if (ret) goto out_unlock; } /* * We can't hold ctx->lock when iterating the ->flexible_group list due * to allocations, but we need to prevent rotation because * rotate_ctx() will change the list from interrupt context. */ raw_spin_lock_irqsave(&parent_ctx->lock, flags); parent_ctx->rotate_disable = 1; raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); perf_event_groups_for_each(event, &parent_ctx->flexible_groups) { ret = inherit_task_group(event, parent, parent_ctx, child, clone_flags, &inherited_all); if (ret) goto out_unlock; } raw_spin_lock_irqsave(&parent_ctx->lock, flags); parent_ctx->rotate_disable = 0; child_ctx = child->perf_event_ctxp; if (child_ctx && inherited_all) { /* * Mark the child context as a clone of the parent * context, or of whatever the parent is a clone of. * * Note that if the parent is a clone, the holding of * parent_ctx->lock avoids it from being uncloned. */ cloned_ctx = parent_ctx->parent_ctx; if (cloned_ctx) { child_ctx->parent_ctx = cloned_ctx; child_ctx->parent_gen = parent_ctx->parent_gen; } else { child_ctx->parent_ctx = parent_ctx; child_ctx->parent_gen = parent_ctx->generation; } get_ctx(child_ctx->parent_ctx); } raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); out_unlock: mutex_unlock(&parent_ctx->mutex); perf_unpin_context(parent_ctx); put_ctx(parent_ctx); return ret; } /* * Initialize the perf_event context in task_struct */ int perf_event_init_task(struct task_struct *child, u64 clone_flags) { int ret; child->perf_event_ctxp = NULL; mutex_init(&child->perf_event_mutex); INIT_LIST_HEAD(&child->perf_event_list); ret = perf_event_init_context(child, clone_flags); if (ret) { perf_event_free_task(child); return ret; } return 0; } static void __init perf_event_init_all_cpus(void) { struct swevent_htable *swhash; struct perf_cpu_context *cpuctx; int cpu; zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL); for_each_possible_cpu(cpu) { swhash = &per_cpu(swevent_htable, cpu); mutex_init(&swhash->hlist_mutex); INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu)); cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); __perf_event_init_context(&cpuctx->ctx); lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask); cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default); cpuctx->heap = cpuctx->heap_default; } } static void perf_swevent_init_cpu(unsigned int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); mutex_lock(&swhash->hlist_mutex); if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) { struct swevent_hlist *hlist; hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); WARN_ON(!hlist); rcu_assign_pointer(swhash->swevent_hlist, hlist); } mutex_unlock(&swhash->hlist_mutex); } #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE static void __perf_event_exit_context(void *__info) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *ctx = __info; struct perf_event *event; raw_spin_lock(&ctx->lock); ctx_sched_out(ctx, EVENT_TIME); list_for_each_entry(event, &ctx->event_list, event_entry) __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP); raw_spin_unlock(&ctx->lock); } static void perf_event_exit_cpu_context(int cpu) { struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; // XXX simplify cpuctx->online mutex_lock(&pmus_lock); cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); ctx = &cpuctx->ctx; mutex_lock(&ctx->mutex); smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); cpuctx->online = 0; mutex_unlock(&ctx->mutex); cpumask_clear_cpu(cpu, perf_online_mask); mutex_unlock(&pmus_lock); } #else static void perf_event_exit_cpu_context(int cpu) { } #endif int perf_event_init_cpu(unsigned int cpu) { struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; perf_swevent_init_cpu(cpu); mutex_lock(&pmus_lock); cpumask_set_cpu(cpu, perf_online_mask); cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); ctx = &cpuctx->ctx; mutex_lock(&ctx->mutex); cpuctx->online = 1; mutex_unlock(&ctx->mutex); mutex_unlock(&pmus_lock); return 0; } int perf_event_exit_cpu(unsigned int cpu) { perf_event_exit_cpu_context(cpu); return 0; } static int perf_reboot(struct notifier_block *notifier, unsigned long val, void *v) { int cpu; for_each_online_cpu(cpu) perf_event_exit_cpu(cpu); return NOTIFY_OK; } /* * Run the perf reboot notifier at the very last possible moment so that * the generic watchdog code runs as long as possible. */ static struct notifier_block perf_reboot_notifier = { .notifier_call = perf_reboot, .priority = INT_MIN, }; void __init perf_event_init(void) { int ret; idr_init(&pmu_idr); perf_event_init_all_cpus(); init_srcu_struct(&pmus_srcu); perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE); perf_pmu_register(&perf_cpu_clock, "cpu_clock", -1); perf_pmu_register(&perf_task_clock, "task_clock", -1); perf_tp_register(); perf_event_init_cpu(smp_processor_id()); register_reboot_notifier(&perf_reboot_notifier); ret = init_hw_breakpoint(); WARN(ret, "hw_breakpoint initialization failed with: %d", ret); perf_event_cache = KMEM_CACHE(perf_event, SLAB_PANIC); /* * Build time assertion that we keep the data_head at the intended * location. IOW, validation we got the __reserved[] size right. */ BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head)) != 1024); } ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) { struct perf_pmu_events_attr *pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr); if (pmu_attr->event_str) return sprintf(page, "%s\n", pmu_attr->event_str); return 0; } EXPORT_SYMBOL_GPL(perf_event_sysfs_show); static int __init perf_event_sysfs_init(void) { struct pmu *pmu; int ret; mutex_lock(&pmus_lock); ret = bus_register(&pmu_bus); if (ret) goto unlock; list_for_each_entry(pmu, &pmus, entry) { if (pmu->dev) continue; ret = pmu_dev_alloc(pmu); WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret); } pmu_bus_running = 1; ret = 0; unlock: mutex_unlock(&pmus_lock); return ret; } device_initcall(perf_event_sysfs_init); #ifdef CONFIG_CGROUP_PERF static struct cgroup_subsys_state * perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct perf_cgroup *jc; jc = kzalloc(sizeof(*jc), GFP_KERNEL); if (!jc) return ERR_PTR(-ENOMEM); jc->info = alloc_percpu(struct perf_cgroup_info); if (!jc->info) { kfree(jc); return ERR_PTR(-ENOMEM); } return &jc->css; } static void perf_cgroup_css_free(struct cgroup_subsys_state *css) { struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css); free_percpu(jc->info); kfree(jc); } static int perf_cgroup_css_online(struct cgroup_subsys_state *css) { perf_event_cgroup(css->cgroup); return 0; } static int __perf_cgroup_move(void *info) { struct task_struct *task = info; preempt_disable(); perf_cgroup_switch(task); preempt_enable(); return 0; } static void perf_cgroup_attach(struct cgroup_taskset *tset) { struct task_struct *task; struct cgroup_subsys_state *css; cgroup_taskset_for_each(task, css, tset) task_function_call(task, __perf_cgroup_move, task); } struct cgroup_subsys perf_event_cgrp_subsys = { .css_alloc = perf_cgroup_css_alloc, .css_free = perf_cgroup_css_free, .css_online = perf_cgroup_css_online, .attach = perf_cgroup_attach, /* * Implicitly enable on dfl hierarchy so that perf events can * always be filtered by cgroup2 path as long as perf_event * controller is not mounted on a legacy hierarchy. */ .implicit_on_dfl = true, .threaded = true, }; #endif /* CONFIG_CGROUP_PERF */ DEFINE_STATIC_CALL_RET0(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t); |
| 26 107 313 278 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _IPV6_H #define _IPV6_H #include <uapi/linux/ipv6.h> #define ipv6_optlen(p) (((p)->hdrlen+1) << 3) #define ipv6_authlen(p) (((p)->hdrlen+2) << 2) /* * This structure contains configuration options per IPv6 link. */ struct ipv6_devconf { __s32 forwarding; __s32 hop_limit; __s32 mtu6; __s32 accept_ra; __s32 accept_redirects; __s32 autoconf; __s32 dad_transmits; __s32 rtr_solicits; __s32 rtr_solicit_interval; __s32 rtr_solicit_max_interval; __s32 rtr_solicit_delay; __s32 force_mld_version; __s32 mldv1_unsolicited_report_interval; __s32 mldv2_unsolicited_report_interval; __s32 use_tempaddr; __s32 temp_valid_lft; __s32 temp_prefered_lft; __s32 regen_max_retry; __s32 max_desync_factor; __s32 max_addresses; __s32 accept_ra_defrtr; __u32 ra_defrtr_metric; __s32 accept_ra_min_hop_limit; __s32 accept_ra_min_lft; __s32 accept_ra_pinfo; __s32 ignore_routes_with_linkdown; #ifdef CONFIG_IPV6_ROUTER_PREF __s32 accept_ra_rtr_pref; __s32 rtr_probe_interval; #ifdef CONFIG_IPV6_ROUTE_INFO __s32 accept_ra_rt_info_min_plen; __s32 accept_ra_rt_info_max_plen; #endif #endif __s32 proxy_ndp; __s32 accept_source_route; __s32 accept_ra_from_local; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD __s32 optimistic_dad; __s32 use_optimistic; #endif #ifdef CONFIG_IPV6_MROUTE atomic_t mc_forwarding; #endif __s32 disable_ipv6; __s32 drop_unicast_in_l2_multicast; __s32 accept_dad; __s32 force_tllao; __s32 ndisc_notify; __s32 suppress_frag_ndisc; __s32 accept_ra_mtu; __s32 drop_unsolicited_na; __s32 accept_untracked_na; struct ipv6_stable_secret { bool initialized; struct in6_addr secret; } stable_secret; __s32 use_oif_addrs_only; __s32 keep_addr_on_down; __s32 seg6_enabled; #ifdef CONFIG_IPV6_SEG6_HMAC __s32 seg6_require_hmac; #endif __u32 enhanced_dad; __u32 addr_gen_mode; __s32 disable_policy; __s32 ndisc_tclass; __s32 rpl_seg_enabled; __u32 ioam6_id; __u32 ioam6_id_wide; __u8 ioam6_enabled; __u8 ndisc_evict_nocarrier; struct ctl_table_header *sysctl_header; }; struct ipv6_params { __s32 disable_ipv6; __s32 autoconf; }; extern struct ipv6_params ipv6_defaults; #include <linux/tcp.h> #include <linux/udp.h> #include <net/inet_sock.h> static inline struct ipv6hdr *ipv6_hdr(const struct sk_buff *skb) { return (struct ipv6hdr *)skb_network_header(skb); } static inline struct ipv6hdr *inner_ipv6_hdr(const struct sk_buff *skb) { return (struct ipv6hdr *)skb_inner_network_header(skb); } static inline struct ipv6hdr *ipipv6_hdr(const struct sk_buff *skb) { return (struct ipv6hdr *)skb_transport_header(skb); } static inline unsigned int ipv6_transport_len(const struct sk_buff *skb) { return ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr) - skb_network_header_len(skb); } /* This structure contains results of exthdrs parsing as offsets from skb->nh. */ struct inet6_skb_parm { int iif; __be16 ra; __u16 dst0; __u16 srcrt; __u16 dst1; __u16 lastopt; __u16 nhoff; __u16 flags; #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) __u16 dsthao; #endif __u16 frag_max_size; __u16 srhoff; #define IP6SKB_XFRM_TRANSFORMED 1 #define IP6SKB_FORWARDED 2 #define IP6SKB_REROUTED 4 #define IP6SKB_ROUTERALERT 8 #define IP6SKB_FRAGMENTED 16 #define IP6SKB_HOPBYHOP 32 #define IP6SKB_L3SLAVE 64 #define IP6SKB_JUMBOGRAM 128 #define IP6SKB_SEG6 256 #define IP6SKB_FAKEJUMBO 512 #define IP6SKB_MULTIPATH 1024 }; #if defined(CONFIG_NET_L3_MASTER_DEV) static inline bool ipv6_l3mdev_skb(__u16 flags) { return flags & IP6SKB_L3SLAVE; } #else static inline bool ipv6_l3mdev_skb(__u16 flags) { return false; } #endif #define IP6CB(skb) ((struct inet6_skb_parm*)((skb)->cb)) #define IP6CBMTU(skb) ((struct ip6_mtuinfo *)((skb)->cb)) static inline int inet6_iif(const struct sk_buff *skb) { bool l3_slave = ipv6_l3mdev_skb(IP6CB(skb)->flags); return l3_slave ? skb->skb_iif : IP6CB(skb)->iif; } static inline bool inet6_is_jumbogram(const struct sk_buff *skb) { return !!(IP6CB(skb)->flags & IP6SKB_JUMBOGRAM); } /* can not be used in TCP layer after tcp_v6_fill_cb */ static inline int inet6_sdif(const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (skb && ipv6_l3mdev_skb(IP6CB(skb)->flags)) return IP6CB(skb)->iif; #endif return 0; } struct tcp6_request_sock { struct tcp_request_sock tcp6rsk_tcp; }; struct ipv6_mc_socklist; struct ipv6_ac_socklist; struct ipv6_fl_socklist; struct inet6_cork { struct ipv6_txoptions *opt; u8 hop_limit; u8 tclass; }; /* struct ipv6_pinfo - ipv6 private area */ struct ipv6_pinfo { struct in6_addr saddr; struct in6_pktinfo sticky_pktinfo; const struct in6_addr *daddr_cache; #ifdef CONFIG_IPV6_SUBTREES const struct in6_addr *saddr_cache; #endif __be32 flow_label; __u32 frag_size; /* * Packed in 16bits. * Omit one shift by putting the signed field at MSB. */ #if defined(__BIG_ENDIAN_BITFIELD) __s16 hop_limit:9; __u16 __unused_1:7; #else __u16 __unused_1:7; __s16 hop_limit:9; #endif #if defined(__BIG_ENDIAN_BITFIELD) /* Packed in 16bits. */ __s16 mcast_hops:9; __u16 __unused_2:6, mc_loop:1; #else __u16 mc_loop:1, __unused_2:6; __s16 mcast_hops:9; #endif int ucast_oif; int mcast_oif; /* pktoption flags */ union { struct { __u16 srcrt:1, osrcrt:1, rxinfo:1, rxoinfo:1, rxhlim:1, rxohlim:1, hopopts:1, ohopopts:1, dstopts:1, odstopts:1, rxflow:1, rxtclass:1, rxpmtu:1, rxorigdstaddr:1, recvfragsize:1; /* 1 bits hole */ } bits; __u16 all; } rxopt; /* sockopt flags */ __u16 recverr:1, sndflow:1, repflow:1, pmtudisc:3, padding:1, /* 1 bit hole */ srcprefs:3, /* 001: prefer temporary address * 010: prefer public address * 100: prefer care-of address */ dontfrag:1, autoflowlabel:1, autoflowlabel_set:1, mc_all:1, recverr_rfc4884:1, rtalert_isolate:1; __u8 min_hopcount; __u8 tclass; __be32 rcv_flowinfo; __u32 dst_cookie; struct ipv6_mc_socklist __rcu *ipv6_mc_list; struct ipv6_ac_socklist *ipv6_ac_list; struct ipv6_fl_socklist __rcu *ipv6_fl_list; struct ipv6_txoptions __rcu *opt; struct sk_buff *pktoptions; struct sk_buff *rxpmtu; struct inet6_cork cork; }; /* WARNING: don't change the layout of the members in {raw,udp,tcp}6_sock! */ struct raw6_sock { /* inet_sock has to be the first member of raw6_sock */ struct inet_sock inet; __u32 checksum; /* perform checksum */ __u32 offset; /* checksum offset */ struct icmp6_filter filter; __u32 ip6mr_table; struct ipv6_pinfo inet6; }; struct udp6_sock { struct udp_sock udp; struct ipv6_pinfo inet6; }; struct tcp6_sock { struct tcp_sock tcp; struct ipv6_pinfo inet6; }; extern int inet6_sk_rebuild_header(struct sock *sk); struct tcp6_timewait_sock { struct tcp_timewait_sock tcp6tw_tcp; }; #if IS_ENABLED(CONFIG_IPV6) bool ipv6_mod_enabled(void); static inline struct ipv6_pinfo *inet6_sk(const struct sock *__sk) { return sk_fullsock(__sk) ? inet_sk(__sk)->pinet6 : NULL; } #define raw6_sk(ptr) container_of_const(ptr, struct raw6_sock, inet.sk) #define ipv6_only_sock(sk) (sk->sk_ipv6only) #define ipv6_sk_rxinfo(sk) ((sk)->sk_family == PF_INET6 && \ inet6_sk(sk)->rxopt.bits.rxinfo) static inline const struct in6_addr *inet6_rcv_saddr(const struct sock *sk) { if (sk->sk_family == AF_INET6) return &sk->sk_v6_rcv_saddr; return NULL; } static inline int inet_v6_ipv6only(const struct sock *sk) { /* ipv6only field is at same position for timewait and other sockets */ return ipv6_only_sock(sk); } #else #define ipv6_only_sock(sk) 0 #define ipv6_sk_rxinfo(sk) 0 static inline bool ipv6_mod_enabled(void) { return false; } static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk) { return NULL; } static inline struct raw6_sock *raw6_sk(const struct sock *sk) { return NULL; } #define inet6_rcv_saddr(__sk) NULL #define inet_v6_ipv6only(__sk) 0 #endif /* IS_ENABLED(CONFIG_IPV6) */ #endif /* _IPV6_H */ |
| 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 | // SPDX-License-Identifier: GPL-2.0+ /* * Driver for Alauda-based card readers * * Current development and maintenance by: * (c) 2005 Daniel Drake <dsd@gentoo.org> * * The 'Alauda' is a chip manufacturered by RATOC for OEM use. * * Alauda implements a vendor-specific command set to access two media reader * ports (XD, SmartMedia). This driver converts SCSI commands to the commands * which are accepted by these devices. * * The driver was developed through reverse-engineering, with the help of the * sddr09 driver which has many similarities, and with some help from the * (very old) vendor-supplied GPL sma03 driver. * * For protocol info, see http://alauda.sourceforge.net */ #include <linux/module.h> #include <linux/slab.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_device.h> #include "usb.h" #include "transport.h" #include "protocol.h" #include "debug.h" #include "scsiglue.h" #define DRV_NAME "ums-alauda" MODULE_DESCRIPTION("Driver for Alauda-based card readers"); MODULE_AUTHOR("Daniel Drake <dsd@gentoo.org>"); MODULE_LICENSE("GPL"); MODULE_IMPORT_NS(USB_STORAGE); /* * Status bytes */ #define ALAUDA_STATUS_ERROR 0x01 #define ALAUDA_STATUS_READY 0x40 /* * Control opcodes (for request field) */ #define ALAUDA_GET_XD_MEDIA_STATUS 0x08 #define ALAUDA_GET_SM_MEDIA_STATUS 0x98 #define ALAUDA_ACK_XD_MEDIA_CHANGE 0x0a #define ALAUDA_ACK_SM_MEDIA_CHANGE 0x9a #define ALAUDA_GET_XD_MEDIA_SIG 0x86 #define ALAUDA_GET_SM_MEDIA_SIG 0x96 /* * Bulk command identity (byte 0) */ #define ALAUDA_BULK_CMD 0x40 /* * Bulk opcodes (byte 1) */ #define ALAUDA_BULK_GET_REDU_DATA 0x85 #define ALAUDA_BULK_READ_BLOCK 0x94 #define ALAUDA_BULK_ERASE_BLOCK 0xa3 #define ALAUDA_BULK_WRITE_BLOCK 0xb4 #define ALAUDA_BULK_GET_STATUS2 0xb7 #define ALAUDA_BULK_RESET_MEDIA 0xe0 /* * Port to operate on (byte 8) */ #define ALAUDA_PORT_XD 0x00 #define ALAUDA_PORT_SM 0x01 /* * LBA and PBA are unsigned ints. Special values. */ #define UNDEF 0xffff #define SPARE 0xfffe #define UNUSABLE 0xfffd struct alauda_media_info { unsigned long capacity; /* total media size in bytes */ unsigned int pagesize; /* page size in bytes */ unsigned int blocksize; /* number of pages per block */ unsigned int uzonesize; /* number of usable blocks per zone */ unsigned int zonesize; /* number of blocks per zone */ unsigned int blockmask; /* mask to get page from address */ unsigned char pageshift; unsigned char blockshift; unsigned char zoneshift; u16 **lba_to_pba; /* logical to physical block map */ u16 **pba_to_lba; /* physical to logical block map */ }; struct alauda_info { struct alauda_media_info port[2]; int wr_ep; /* endpoint to write data out of */ unsigned char sense_key; unsigned long sense_asc; /* additional sense code */ unsigned long sense_ascq; /* additional sense code qualifier */ }; #define short_pack(lsb,msb) ( ((u16)(lsb)) | ( ((u16)(msb))<<8 ) ) #define LSB_of(s) ((s)&0xFF) #define MSB_of(s) ((s)>>8) #define MEDIA_PORT(us) us->srb->device->lun #define MEDIA_INFO(us) ((struct alauda_info *)us->extra)->port[MEDIA_PORT(us)] #define PBA_LO(pba) ((pba & 0xF) << 5) #define PBA_HI(pba) (pba >> 3) #define PBA_ZONE(pba) (pba >> 11) static int init_alauda(struct us_data *us); /* * The table of devices */ #define UNUSUAL_DEV(id_vendor, id_product, bcdDeviceMin, bcdDeviceMax, \ vendorName, productName, useProtocol, useTransport, \ initFunction, flags) \ { USB_DEVICE_VER(id_vendor, id_product, bcdDeviceMin, bcdDeviceMax), \ .driver_info = (flags) } static struct usb_device_id alauda_usb_ids[] = { # include "unusual_alauda.h" { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, alauda_usb_ids); #undef UNUSUAL_DEV /* * The flags table */ #define UNUSUAL_DEV(idVendor, idProduct, bcdDeviceMin, bcdDeviceMax, \ vendor_name, product_name, use_protocol, use_transport, \ init_function, Flags) \ { \ .vendorName = vendor_name, \ .productName = product_name, \ .useProtocol = use_protocol, \ .useTransport = use_transport, \ .initFunction = init_function, \ } static struct us_unusual_dev alauda_unusual_dev_list[] = { # include "unusual_alauda.h" { } /* Terminating entry */ }; #undef UNUSUAL_DEV /* * Media handling */ struct alauda_card_info { unsigned char id; /* id byte */ unsigned char chipshift; /* 1<<cs bytes total capacity */ unsigned char pageshift; /* 1<<ps bytes in a page */ unsigned char blockshift; /* 1<<bs pages per block */ unsigned char zoneshift; /* 1<<zs blocks per zone */ }; static struct alauda_card_info alauda_card_ids[] = { /* NAND flash */ { 0x6e, 20, 8, 4, 8}, /* 1 MB */ { 0xe8, 20, 8, 4, 8}, /* 1 MB */ { 0xec, 20, 8, 4, 8}, /* 1 MB */ { 0x64, 21, 8, 4, 9}, /* 2 MB */ { 0xea, 21, 8, 4, 9}, /* 2 MB */ { 0x6b, 22, 9, 4, 9}, /* 4 MB */ { 0xe3, 22, 9, 4, 9}, /* 4 MB */ { 0xe5, 22, 9, 4, 9}, /* 4 MB */ { 0xe6, 23, 9, 4, 10}, /* 8 MB */ { 0x73, 24, 9, 5, 10}, /* 16 MB */ { 0x75, 25, 9, 5, 10}, /* 32 MB */ { 0x76, 26, 9, 5, 10}, /* 64 MB */ { 0x79, 27, 9, 5, 10}, /* 128 MB */ { 0x71, 28, 9, 5, 10}, /* 256 MB */ /* MASK ROM */ { 0x5d, 21, 9, 4, 8}, /* 2 MB */ { 0xd5, 22, 9, 4, 9}, /* 4 MB */ { 0xd6, 23, 9, 4, 10}, /* 8 MB */ { 0x57, 24, 9, 4, 11}, /* 16 MB */ { 0x58, 25, 9, 4, 12}, /* 32 MB */ { 0,} }; static struct alauda_card_info *alauda_card_find_id(unsigned char id) { int i; for (i = 0; alauda_card_ids[i].id != 0; i++) if (alauda_card_ids[i].id == id) return &(alauda_card_ids[i]); return NULL; } /* * ECC computation. */ static unsigned char parity[256]; static unsigned char ecc2[256]; static void nand_init_ecc(void) { int i, j, a; parity[0] = 0; for (i = 1; i < 256; i++) parity[i] = (parity[i&(i-1)] ^ 1); for (i = 0; i < 256; i++) { a = 0; for (j = 0; j < 8; j++) { if (i & (1<<j)) { if ((j & 1) == 0) a ^= 0x04; if ((j & 2) == 0) a ^= 0x10; if ((j & 4) == 0) a ^= 0x40; } } ecc2[i] = ~(a ^ (a<<1) ^ (parity[i] ? 0xa8 : 0)); } } /* compute 3-byte ecc on 256 bytes */ static void nand_compute_ecc(unsigned char *data, unsigned char *ecc) { int i, j, a; unsigned char par = 0, bit, bits[8] = {0}; /* collect 16 checksum bits */ for (i = 0; i < 256; i++) { par ^= data[i]; bit = parity[data[i]]; for (j = 0; j < 8; j++) if ((i & (1<<j)) == 0) bits[j] ^= bit; } /* put 4+4+4 = 12 bits in the ecc */ a = (bits[3] << 6) + (bits[2] << 4) + (bits[1] << 2) + bits[0]; ecc[0] = ~(a ^ (a<<1) ^ (parity[par] ? 0xaa : 0)); a = (bits[7] << 6) + (bits[6] << 4) + (bits[5] << 2) + bits[4]; ecc[1] = ~(a ^ (a<<1) ^ (parity[par] ? 0xaa : 0)); ecc[2] = ecc2[par]; } static int nand_compare_ecc(unsigned char *data, unsigned char *ecc) { return (data[0] == ecc[0] && data[1] == ecc[1] && data[2] == ecc[2]); } static void nand_store_ecc(unsigned char *data, unsigned char *ecc) { memcpy(data, ecc, 3); } /* * Alauda driver */ /* * Forget our PBA <---> LBA mappings for a particular port */ static void alauda_free_maps (struct alauda_media_info *media_info) { unsigned int shift = media_info->zoneshift + media_info->blockshift + media_info->pageshift; unsigned int num_zones = media_info->capacity >> shift; unsigned int i; if (media_info->lba_to_pba != NULL) for (i = 0; i < num_zones; i++) { kfree(media_info->lba_to_pba[i]); media_info->lba_to_pba[i] = NULL; } if (media_info->pba_to_lba != NULL) for (i = 0; i < num_zones; i++) { kfree(media_info->pba_to_lba[i]); media_info->pba_to_lba[i] = NULL; } } /* * Returns 2 bytes of status data * The first byte describes media status, and second byte describes door status */ static int alauda_get_media_status(struct us_data *us, unsigned char *data) { int rc; unsigned char command; if (MEDIA_PORT(us) == ALAUDA_PORT_XD) command = ALAUDA_GET_XD_MEDIA_STATUS; else command = ALAUDA_GET_SM_MEDIA_STATUS; rc = usb_stor_ctrl_transfer(us, us->recv_ctrl_pipe, command, 0xc0, 0, 1, data, 2); if (rc == USB_STOR_XFER_GOOD) usb_stor_dbg(us, "Media status %02X %02X\n", data[0], data[1]); return rc; } /* * Clears the "media was changed" bit so that we know when it changes again * in the future. */ static int alauda_ack_media(struct us_data *us) { unsigned char command; if (MEDIA_PORT(us) == ALAUDA_PORT_XD) command = ALAUDA_ACK_XD_MEDIA_CHANGE; else command = ALAUDA_ACK_SM_MEDIA_CHANGE; return usb_stor_ctrl_transfer(us, us->send_ctrl_pipe, command, 0x40, 0, 1, NULL, 0); } /* * Retrieves a 4-byte media signature, which indicates manufacturer, capacity, * and some other details. */ static int alauda_get_media_signature(struct us_data *us, unsigned char *data) { unsigned char command; if (MEDIA_PORT(us) == ALAUDA_PORT_XD) command = ALAUDA_GET_XD_MEDIA_SIG; else command = ALAUDA_GET_SM_MEDIA_SIG; return usb_stor_ctrl_transfer(us, us->recv_ctrl_pipe, command, 0xc0, 0, 0, data, 4); } /* * Resets the media status (but not the whole device?) */ static int alauda_reset_media(struct us_data *us) { unsigned char *command = us->iobuf; memset(command, 0, 9); command[0] = ALAUDA_BULK_CMD; command[1] = ALAUDA_BULK_RESET_MEDIA; command[8] = MEDIA_PORT(us); return usb_stor_bulk_transfer_buf(us, us->send_bulk_pipe, command, 9, NULL); } /* * Examines the media and deduces capacity, etc. */ static int alauda_init_media(struct us_data *us) { unsigned char *data = us->iobuf; int ready = 0; struct alauda_card_info *media_info; unsigned int num_zones; while (ready == 0) { msleep(20); if (alauda_get_media_status(us, data) != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; if (data[0] & 0x10) ready = 1; } usb_stor_dbg(us, "We are ready for action!\n"); if (alauda_ack_media(us) != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; msleep(10); if (alauda_get_media_status(us, data) != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; if (data[0] != 0x14) { usb_stor_dbg(us, "Media not ready after ack\n"); return USB_STOR_TRANSPORT_ERROR; } if (alauda_get_media_signature(us, data) != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; usb_stor_dbg(us, "Media signature: %4ph\n", data); media_info = alauda_card_find_id(data[1]); if (media_info == NULL) { pr_warn("alauda_init_media: Unrecognised media signature: %4ph\n", data); return USB_STOR_TRANSPORT_ERROR; } MEDIA_INFO(us).capacity = 1 << media_info->chipshift; usb_stor_dbg(us, "Found media with capacity: %ldMB\n", MEDIA_INFO(us).capacity >> 20); MEDIA_INFO(us).pageshift = media_info->pageshift; MEDIA_INFO(us).blockshift = media_info->blockshift; MEDIA_INFO(us).zoneshift = media_info->zoneshift; MEDIA_INFO(us).pagesize = 1 << media_info->pageshift; MEDIA_INFO(us).blocksize = 1 << media_info->blockshift; MEDIA_INFO(us).zonesize = 1 << media_info->zoneshift; MEDIA_INFO(us).uzonesize = ((1 << media_info->zoneshift) / 128) * 125; MEDIA_INFO(us).blockmask = MEDIA_INFO(us).blocksize - 1; num_zones = MEDIA_INFO(us).capacity >> (MEDIA_INFO(us).zoneshift + MEDIA_INFO(us).blockshift + MEDIA_INFO(us).pageshift); MEDIA_INFO(us).pba_to_lba = kcalloc(num_zones, sizeof(u16*), GFP_NOIO); MEDIA_INFO(us).lba_to_pba = kcalloc(num_zones, sizeof(u16*), GFP_NOIO); if (MEDIA_INFO(us).pba_to_lba == NULL || MEDIA_INFO(us).lba_to_pba == NULL) return USB_STOR_TRANSPORT_ERROR; if (alauda_reset_media(us) != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; return USB_STOR_TRANSPORT_GOOD; } /* * Examines the media status and does the right thing when the media has gone, * appeared, or changed. */ static int alauda_check_media(struct us_data *us) { struct alauda_info *info = (struct alauda_info *) us->extra; unsigned char *status = us->iobuf; int rc; rc = alauda_get_media_status(us, status); if (rc != USB_STOR_XFER_GOOD) { status[0] = 0xF0; /* Pretend there's no media */ status[1] = 0; } /* Check for no media or door open */ if ((status[0] & 0x80) || ((status[0] & 0x1F) == 0x10) || ((status[1] & 0x01) == 0)) { usb_stor_dbg(us, "No media, or door open\n"); alauda_free_maps(&MEDIA_INFO(us)); info->sense_key = 0x02; info->sense_asc = 0x3A; info->sense_ascq = 0x00; return USB_STOR_TRANSPORT_FAILED; } /* Check for media change */ if (status[0] & 0x08) { usb_stor_dbg(us, "Media change detected\n"); alauda_free_maps(&MEDIA_INFO(us)); alauda_init_media(us); info->sense_key = UNIT_ATTENTION; info->sense_asc = 0x28; info->sense_ascq = 0x00; return USB_STOR_TRANSPORT_FAILED; } return USB_STOR_TRANSPORT_GOOD; } /* * Checks the status from the 2nd status register * Returns 3 bytes of status data, only the first is known */ static int alauda_check_status2(struct us_data *us) { int rc; unsigned char command[] = { ALAUDA_BULK_CMD, ALAUDA_BULK_GET_STATUS2, 0, 0, 0, 0, 3, 0, MEDIA_PORT(us) }; unsigned char data[3]; rc = usb_stor_bulk_transfer_buf(us, us->send_bulk_pipe, command, 9, NULL); if (rc != USB_STOR_XFER_GOOD) return rc; rc = usb_stor_bulk_transfer_buf(us, us->recv_bulk_pipe, data, 3, NULL); if (rc != USB_STOR_XFER_GOOD) return rc; usb_stor_dbg(us, "%3ph\n", data); if (data[0] & ALAUDA_STATUS_ERROR) return USB_STOR_XFER_ERROR; return USB_STOR_XFER_GOOD; } /* * Gets the redundancy data for the first page of a PBA * Returns 16 bytes. */ static int alauda_get_redu_data(struct us_data *us, u16 pba, unsigned char *data) { int rc; unsigned char command[] = { ALAUDA_BULK_CMD, ALAUDA_BULK_GET_REDU_DATA, PBA_HI(pba), PBA_ZONE(pba), 0, PBA_LO(pba), 0, 0, MEDIA_PORT(us) }; rc = usb_stor_bulk_transfer_buf(us, us->send_bulk_pipe, command, 9, NULL); if (rc != USB_STOR_XFER_GOOD) return rc; return usb_stor_bulk_transfer_buf(us, us->recv_bulk_pipe, data, 16, NULL); } /* * Finds the first unused PBA in a zone * Returns the absolute PBA of an unused PBA, or 0 if none found. */ static u16 alauda_find_unused_pba(struct alauda_media_info *info, unsigned int zone) { u16 *pba_to_lba = info->pba_to_lba[zone]; unsigned int i; for (i = 0; i < info->zonesize; i++) if (pba_to_lba[i] == UNDEF) return (zone << info->zoneshift) + i; return 0; } /* * Reads the redundancy data for all PBA's in a zone * Produces lba <--> pba mappings */ static int alauda_read_map(struct us_data *us, unsigned int zone) { unsigned char *data = us->iobuf; int result; int i, j; unsigned int zonesize = MEDIA_INFO(us).zonesize; unsigned int uzonesize = MEDIA_INFO(us).uzonesize; unsigned int lba_offset, lba_real, blocknum; unsigned int zone_base_lba = zone * uzonesize; unsigned int zone_base_pba = zone * zonesize; u16 *lba_to_pba = kcalloc(zonesize, sizeof(u16), GFP_NOIO); u16 *pba_to_lba = kcalloc(zonesize, sizeof(u16), GFP_NOIO); if (lba_to_pba == NULL || pba_to_lba == NULL) { result = USB_STOR_TRANSPORT_ERROR; goto error; } usb_stor_dbg(us, "Mapping blocks for zone %d\n", zone); /* 1024 PBA's per zone */ for (i = 0; i < zonesize; i++) lba_to_pba[i] = pba_to_lba[i] = UNDEF; for (i = 0; i < zonesize; i++) { blocknum = zone_base_pba + i; result = alauda_get_redu_data(us, blocknum, data); if (result != USB_STOR_XFER_GOOD) { result = USB_STOR_TRANSPORT_ERROR; goto error; } /* special PBAs have control field 0^16 */ for (j = 0; j < 16; j++) if (data[j] != 0) goto nonz; pba_to_lba[i] = UNUSABLE; usb_stor_dbg(us, "PBA %d has no logical mapping\n", blocknum); continue; nonz: /* unwritten PBAs have control field FF^16 */ for (j = 0; j < 16; j++) if (data[j] != 0xff) goto nonff; continue; nonff: /* normal PBAs start with six FFs */ if (j < 6) { usb_stor_dbg(us, "PBA %d has no logical mapping: reserved area = %02X%02X%02X%02X data status %02X block status %02X\n", blocknum, data[0], data[1], data[2], data[3], data[4], data[5]); pba_to_lba[i] = UNUSABLE; continue; } if ((data[6] >> 4) != 0x01) { usb_stor_dbg(us, "PBA %d has invalid address field %02X%02X/%02X%02X\n", blocknum, data[6], data[7], data[11], data[12]); pba_to_lba[i] = UNUSABLE; continue; } /* check even parity */ if (parity[data[6] ^ data[7]]) { printk(KERN_WARNING "alauda_read_map: Bad parity in LBA for block %d" " (%02X %02X)\n", i, data[6], data[7]); pba_to_lba[i] = UNUSABLE; continue; } lba_offset = short_pack(data[7], data[6]); lba_offset = (lba_offset & 0x07FF) >> 1; lba_real = lba_offset + zone_base_lba; /* * Every 1024 physical blocks ("zone"), the LBA numbers * go back to zero, but are within a higher block of LBA's. * Also, there is a maximum of 1000 LBA's per zone. * In other words, in PBA 1024-2047 you will find LBA 0-999 * which are really LBA 1000-1999. This allows for 24 bad * or special physical blocks per zone. */ if (lba_offset >= uzonesize) { printk(KERN_WARNING "alauda_read_map: Bad low LBA %d for block %d\n", lba_real, blocknum); continue; } if (lba_to_pba[lba_offset] != UNDEF) { printk(KERN_WARNING "alauda_read_map: " "LBA %d seen for PBA %d and %d\n", lba_real, lba_to_pba[lba_offset], blocknum); continue; } pba_to_lba[i] = lba_real; lba_to_pba[lba_offset] = blocknum; continue; } MEDIA_INFO(us).lba_to_pba[zone] = lba_to_pba; MEDIA_INFO(us).pba_to_lba[zone] = pba_to_lba; result = 0; goto out; error: kfree(lba_to_pba); kfree(pba_to_lba); out: return result; } /* * Checks to see whether we have already mapped a certain zone * If we haven't, the map is generated */ static void alauda_ensure_map_for_zone(struct us_data *us, unsigned int zone) { if (MEDIA_INFO(us).lba_to_pba[zone] == NULL || MEDIA_INFO(us).pba_to_lba[zone] == NULL) alauda_read_map(us, zone); } /* * Erases an entire block */ static int alauda_erase_block(struct us_data *us, u16 pba) { int rc; unsigned char command[] = { ALAUDA_BULK_CMD, ALAUDA_BULK_ERASE_BLOCK, PBA_HI(pba), PBA_ZONE(pba), 0, PBA_LO(pba), 0x02, 0, MEDIA_PORT(us) }; unsigned char buf[2]; usb_stor_dbg(us, "Erasing PBA %d\n", pba); rc = usb_stor_bulk_transfer_buf(us, us->send_bulk_pipe, command, 9, NULL); if (rc != USB_STOR_XFER_GOOD) return rc; rc = usb_stor_bulk_transfer_buf(us, us->recv_bulk_pipe, buf, 2, NULL); if (rc != USB_STOR_XFER_GOOD) return rc; usb_stor_dbg(us, "Erase result: %02X %02X\n", buf[0], buf[1]); return rc; } /* * Reads data from a certain offset page inside a PBA, including interleaved * redundancy data. Returns (pagesize+64)*pages bytes in data. */ static int alauda_read_block_raw(struct us_data *us, u16 pba, unsigned int page, unsigned int pages, unsigned char *data) { int rc; unsigned char command[] = { ALAUDA_BULK_CMD, ALAUDA_BULK_READ_BLOCK, PBA_HI(pba), PBA_ZONE(pba), 0, PBA_LO(pba) + page, pages, 0, MEDIA_PORT(us) }; usb_stor_dbg(us, "pba %d page %d count %d\n", pba, page, pages); rc = usb_stor_bulk_transfer_buf(us, us->send_bulk_pipe, command, 9, NULL); if (rc != USB_STOR_XFER_GOOD) return rc; return usb_stor_bulk_transfer_buf(us, us->recv_bulk_pipe, data, (MEDIA_INFO(us).pagesize + 64) * pages, NULL); } /* * Reads data from a certain offset page inside a PBA, excluding redundancy * data. Returns pagesize*pages bytes in data. Note that data must be big enough * to hold (pagesize+64)*pages bytes of data, but you can ignore those 'extra' * trailing bytes outside this function. */ static int alauda_read_block(struct us_data *us, u16 pba, unsigned int page, unsigned int pages, unsigned char *data) { int i, rc; unsigned int pagesize = MEDIA_INFO(us).pagesize; rc = alauda_read_block_raw(us, pba, page, pages, data); if (rc != USB_STOR_XFER_GOOD) return rc; /* Cut out the redundancy data */ for (i = 0; i < pages; i++) { int dest_offset = i * pagesize; int src_offset = i * (pagesize + 64); memmove(data + dest_offset, data + src_offset, pagesize); } return rc; } /* * Writes an entire block of data and checks status after write. * Redundancy data must be already included in data. Data should be * (pagesize+64)*blocksize bytes in length. */ static int alauda_write_block(struct us_data *us, u16 pba, unsigned char *data) { int rc; struct alauda_info *info = (struct alauda_info *) us->extra; unsigned char command[] = { ALAUDA_BULK_CMD, ALAUDA_BULK_WRITE_BLOCK, PBA_HI(pba), PBA_ZONE(pba), 0, PBA_LO(pba), 32, 0, MEDIA_PORT(us) }; usb_stor_dbg(us, "pba %d\n", pba); rc = usb_stor_bulk_transfer_buf(us, us->send_bulk_pipe, command, 9, NULL); if (rc != USB_STOR_XFER_GOOD) return rc; rc = usb_stor_bulk_transfer_buf(us, info->wr_ep, data, (MEDIA_INFO(us).pagesize + 64) * MEDIA_INFO(us).blocksize, NULL); if (rc != USB_STOR_XFER_GOOD) return rc; return alauda_check_status2(us); } /* * Write some data to a specific LBA. */ static int alauda_write_lba(struct us_data *us, u16 lba, unsigned int page, unsigned int pages, unsigned char *ptr, unsigned char *blockbuffer) { u16 pba, lbap, new_pba; unsigned char *bptr, *cptr, *xptr; unsigned char ecc[3]; int i, result; unsigned int uzonesize = MEDIA_INFO(us).uzonesize; unsigned int zonesize = MEDIA_INFO(us).zonesize; unsigned int pagesize = MEDIA_INFO(us).pagesize; unsigned int blocksize = MEDIA_INFO(us).blocksize; unsigned int lba_offset = lba % uzonesize; unsigned int new_pba_offset; unsigned int zone = lba / uzonesize; alauda_ensure_map_for_zone(us, zone); pba = MEDIA_INFO(us).lba_to_pba[zone][lba_offset]; if (pba == 1) { /* * Maybe it is impossible to write to PBA 1. * Fake success, but don't do anything. */ printk(KERN_WARNING "alauda_write_lba: avoid writing to pba 1\n"); return USB_STOR_TRANSPORT_GOOD; } new_pba = alauda_find_unused_pba(&MEDIA_INFO(us), zone); if (!new_pba) { printk(KERN_WARNING "alauda_write_lba: Out of unused blocks\n"); return USB_STOR_TRANSPORT_ERROR; } /* read old contents */ if (pba != UNDEF) { result = alauda_read_block_raw(us, pba, 0, blocksize, blockbuffer); if (result != USB_STOR_XFER_GOOD) return result; } else { memset(blockbuffer, 0, blocksize * (pagesize + 64)); } lbap = (lba_offset << 1) | 0x1000; if (parity[MSB_of(lbap) ^ LSB_of(lbap)]) lbap ^= 1; /* check old contents and fill lba */ for (i = 0; i < blocksize; i++) { bptr = blockbuffer + (i * (pagesize + 64)); cptr = bptr + pagesize; nand_compute_ecc(bptr, ecc); if (!nand_compare_ecc(cptr+13, ecc)) { usb_stor_dbg(us, "Warning: bad ecc in page %d- of pba %d\n", i, pba); nand_store_ecc(cptr+13, ecc); } nand_compute_ecc(bptr + (pagesize / 2), ecc); if (!nand_compare_ecc(cptr+8, ecc)) { usb_stor_dbg(us, "Warning: bad ecc in page %d+ of pba %d\n", i, pba); nand_store_ecc(cptr+8, ecc); } cptr[6] = cptr[11] = MSB_of(lbap); cptr[7] = cptr[12] = LSB_of(lbap); } /* copy in new stuff and compute ECC */ xptr = ptr; for (i = page; i < page+pages; i++) { bptr = blockbuffer + (i * (pagesize + 64)); cptr = bptr + pagesize; memcpy(bptr, xptr, pagesize); xptr += pagesize; nand_compute_ecc(bptr, ecc); nand_store_ecc(cptr+13, ecc); nand_compute_ecc(bptr + (pagesize / 2), ecc); nand_store_ecc(cptr+8, ecc); } result = alauda_write_block(us, new_pba, blockbuffer); if (result != USB_STOR_XFER_GOOD) return result; new_pba_offset = new_pba - (zone * zonesize); MEDIA_INFO(us).pba_to_lba[zone][new_pba_offset] = lba; MEDIA_INFO(us).lba_to_pba[zone][lba_offset] = new_pba; usb_stor_dbg(us, "Remapped LBA %d to PBA %d\n", lba, new_pba); if (pba != UNDEF) { unsigned int pba_offset = pba - (zone * zonesize); result = alauda_erase_block(us, pba); if (result != USB_STOR_XFER_GOOD) return result; MEDIA_INFO(us).pba_to_lba[zone][pba_offset] = UNDEF; } return USB_STOR_TRANSPORT_GOOD; } /* * Read data from a specific sector address */ static int alauda_read_data(struct us_data *us, unsigned long address, unsigned int sectors) { unsigned char *buffer; u16 lba, max_lba; unsigned int page, len, offset; unsigned int blockshift = MEDIA_INFO(us).blockshift; unsigned int pageshift = MEDIA_INFO(us).pageshift; unsigned int blocksize = MEDIA_INFO(us).blocksize; unsigned int pagesize = MEDIA_INFO(us).pagesize; unsigned int uzonesize = MEDIA_INFO(us).uzonesize; struct scatterlist *sg; int result; /* * Since we only read in one block at a time, we have to create * a bounce buffer and move the data a piece at a time between the * bounce buffer and the actual transfer buffer. * We make this buffer big enough to hold temporary redundancy data, * which we use when reading the data blocks. */ len = min(sectors, blocksize) * (pagesize + 64); buffer = kmalloc(len, GFP_NOIO); if (!buffer) return USB_STOR_TRANSPORT_ERROR; /* Figure out the initial LBA and page */ lba = address >> blockshift; page = (address & MEDIA_INFO(us).blockmask); max_lba = MEDIA_INFO(us).capacity >> (blockshift + pageshift); result = USB_STOR_TRANSPORT_GOOD; offset = 0; sg = NULL; while (sectors > 0) { unsigned int zone = lba / uzonesize; /* integer division */ unsigned int lba_offset = lba - (zone * uzonesize); unsigned int pages; u16 pba; alauda_ensure_map_for_zone(us, zone); /* Not overflowing capacity? */ if (lba >= max_lba) { usb_stor_dbg(us, "Error: Requested lba %u exceeds maximum %u\n", lba, max_lba); result = USB_STOR_TRANSPORT_ERROR; break; } /* Find number of pages we can read in this block */ pages = min(sectors, blocksize - page); len = pages << pageshift; /* Find where this lba lives on disk */ pba = MEDIA_INFO(us).lba_to_pba[zone][lba_offset]; if (pba == UNDEF) { /* this lba was never written */ usb_stor_dbg(us, "Read %d zero pages (LBA %d) page %d\n", pages, lba, page); /* * This is not really an error. It just means * that the block has never been written. * Instead of returning USB_STOR_TRANSPORT_ERROR * it is better to return all zero data. */ memset(buffer, 0, len); } else { usb_stor_dbg(us, "Read %d pages, from PBA %d (LBA %d) page %d\n", pages, pba, lba, page); result = alauda_read_block(us, pba, page, pages, buffer); if (result != USB_STOR_TRANSPORT_GOOD) break; } /* Store the data in the transfer buffer */ usb_stor_access_xfer_buf(buffer, len, us->srb, &sg, &offset, TO_XFER_BUF); page = 0; lba++; sectors -= pages; } kfree(buffer); return result; } /* * Write data to a specific sector address */ static int alauda_write_data(struct us_data *us, unsigned long address, unsigned int sectors) { unsigned char *buffer, *blockbuffer; unsigned int page, len, offset; unsigned int blockshift = MEDIA_INFO(us).blockshift; unsigned int pageshift = MEDIA_INFO(us).pageshift; unsigned int blocksize = MEDIA_INFO(us).blocksize; unsigned int pagesize = MEDIA_INFO(us).pagesize; struct scatterlist *sg; u16 lba, max_lba; int result; /* * Since we don't write the user data directly to the device, * we have to create a bounce buffer and move the data a piece * at a time between the bounce buffer and the actual transfer buffer. */ len = min(sectors, blocksize) * pagesize; buffer = kmalloc(len, GFP_NOIO); if (!buffer) return USB_STOR_TRANSPORT_ERROR; /* * We also need a temporary block buffer, where we read in the old data, * overwrite parts with the new data, and manipulate the redundancy data */ blockbuffer = kmalloc_array(pagesize + 64, blocksize, GFP_NOIO); if (!blockbuffer) { kfree(buffer); return USB_STOR_TRANSPORT_ERROR; } /* Figure out the initial LBA and page */ lba = address >> blockshift; page = (address & MEDIA_INFO(us).blockmask); max_lba = MEDIA_INFO(us).capacity >> (pageshift + blockshift); result = USB_STOR_TRANSPORT_GOOD; offset = 0; sg = NULL; while (sectors > 0) { /* Write as many sectors as possible in this block */ unsigned int pages = min(sectors, blocksize - page); len = pages << pageshift; /* Not overflowing capacity? */ if (lba >= max_lba) { usb_stor_dbg(us, "Requested lba %u exceeds maximum %u\n", lba, max_lba); result = USB_STOR_TRANSPORT_ERROR; break; } /* Get the data from the transfer buffer */ usb_stor_access_xfer_buf(buffer, len, us->srb, &sg, &offset, FROM_XFER_BUF); result = alauda_write_lba(us, lba, page, pages, buffer, blockbuffer); if (result != USB_STOR_TRANSPORT_GOOD) break; page = 0; lba++; sectors -= pages; } kfree(buffer); kfree(blockbuffer); return result; } /* * Our interface with the rest of the world */ static void alauda_info_destructor(void *extra) { struct alauda_info *info = (struct alauda_info *) extra; int port; if (!info) return; for (port = 0; port < 2; port++) { struct alauda_media_info *media_info = &info->port[port]; alauda_free_maps(media_info); kfree(media_info->lba_to_pba); kfree(media_info->pba_to_lba); } } /* * Initialize alauda_info struct and find the data-write endpoint */ static int init_alauda(struct us_data *us) { struct alauda_info *info; struct usb_host_interface *altsetting = us->pusb_intf->cur_altsetting; nand_init_ecc(); us->extra = kzalloc(sizeof(struct alauda_info), GFP_NOIO); if (!us->extra) return -ENOMEM; info = (struct alauda_info *) us->extra; us->extra_destructor = alauda_info_destructor; info->wr_ep = usb_sndbulkpipe(us->pusb_dev, altsetting->endpoint[0].desc.bEndpointAddress & USB_ENDPOINT_NUMBER_MASK); return 0; } static int alauda_transport(struct scsi_cmnd *srb, struct us_data *us) { int rc; struct alauda_info *info = (struct alauda_info *) us->extra; unsigned char *ptr = us->iobuf; static unsigned char inquiry_response[36] = { 0x00, 0x80, 0x00, 0x01, 0x1F, 0x00, 0x00, 0x00 }; if (srb->cmnd[0] == INQUIRY) { usb_stor_dbg(us, "INQUIRY - Returning bogus response\n"); memcpy(ptr, inquiry_response, sizeof(inquiry_response)); fill_inquiry_response(us, ptr, 36); return USB_STOR_TRANSPORT_GOOD; } if (srb->cmnd[0] == TEST_UNIT_READY) { usb_stor_dbg(us, "TEST_UNIT_READY\n"); return alauda_check_media(us); } if (srb->cmnd[0] == READ_CAPACITY) { unsigned int num_zones; unsigned long capacity; rc = alauda_check_media(us); if (rc != USB_STOR_TRANSPORT_GOOD) return rc; num_zones = MEDIA_INFO(us).capacity >> (MEDIA_INFO(us).zoneshift + MEDIA_INFO(us).blockshift + MEDIA_INFO(us).pageshift); capacity = num_zones * MEDIA_INFO(us).uzonesize * MEDIA_INFO(us).blocksize; /* Report capacity and page size */ ((__be32 *) ptr)[0] = cpu_to_be32(capacity - 1); ((__be32 *) ptr)[1] = cpu_to_be32(512); usb_stor_set_xfer_buf(ptr, 8, srb); return USB_STOR_TRANSPORT_GOOD; } if (srb->cmnd[0] == READ_10) { unsigned int page, pages; rc = alauda_check_media(us); if (rc != USB_STOR_TRANSPORT_GOOD) return rc; page = short_pack(srb->cmnd[3], srb->cmnd[2]); page <<= 16; page |= short_pack(srb->cmnd[5], srb->cmnd[4]); pages = short_pack(srb->cmnd[8], srb->cmnd[7]); usb_stor_dbg(us, "READ_10: page %d pagect %d\n", page, pages); return alauda_read_data(us, page, pages); } if (srb->cmnd[0] == WRITE_10) { unsigned int page, pages; rc = alauda_check_media(us); if (rc != USB_STOR_TRANSPORT_GOOD) return rc; page = short_pack(srb->cmnd[3], srb->cmnd[2]); page <<= 16; page |= short_pack(srb->cmnd[5], srb->cmnd[4]); pages = short_pack(srb->cmnd[8], srb->cmnd[7]); usb_stor_dbg(us, "WRITE_10: page %d pagect %d\n", page, pages); return alauda_write_data(us, page, pages); } if (srb->cmnd[0] == REQUEST_SENSE) { usb_stor_dbg(us, "REQUEST_SENSE\n"); memset(ptr, 0, 18); ptr[0] = 0xF0; ptr[2] = info->sense_key; ptr[7] = 11; ptr[12] = info->sense_asc; ptr[13] = info->sense_ascq; usb_stor_set_xfer_buf(ptr, 18, srb); return USB_STOR_TRANSPORT_GOOD; } if (srb->cmnd[0] == ALLOW_MEDIUM_REMOVAL) { /* * sure. whatever. not like we can stop the user from popping * the media out of the device (no locking doors, etc) */ return USB_STOR_TRANSPORT_GOOD; } usb_stor_dbg(us, "Gah! Unknown command: %d (0x%x)\n", srb->cmnd[0], srb->cmnd[0]); info->sense_key = 0x05; info->sense_asc = 0x20; info->sense_ascq = 0x00; return USB_STOR_TRANSPORT_FAILED; } static struct scsi_host_template alauda_host_template; static int alauda_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct us_data *us; int result; result = usb_stor_probe1(&us, intf, id, (id - alauda_usb_ids) + alauda_unusual_dev_list, &alauda_host_template); if (result) return result; us->transport_name = "Alauda Control/Bulk"; us->transport = alauda_transport; us->transport_reset = usb_stor_Bulk_reset; us->max_lun = 1; result = usb_stor_probe2(us); return result; } static struct usb_driver alauda_driver = { .name = DRV_NAME, .probe = alauda_probe, .disconnect = usb_stor_disconnect, .suspend = usb_stor_suspend, .resume = usb_stor_resume, .reset_resume = usb_stor_reset_resume, .pre_reset = usb_stor_pre_reset, .post_reset = usb_stor_post_reset, .id_table = alauda_usb_ids, .soft_unbind = 1, .no_dynamic_id = 1, }; module_usb_stor_driver(alauda_driver, alauda_host_template, DRV_NAME); |
| 2 2 2 2 2 2 2 2 18 18 18 18 18 18 18 394 26 26 26 394 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Force feedback support for Linux input subsystem * * Copyright (c) 2006 Anssi Hannula <anssi.hannula@gmail.com> * Copyright (c) 2006 Dmitry Torokhov <dtor@mail.ru> */ /* #define DEBUG */ #include <linux/input.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/sched.h> #include <linux/slab.h> /* * Check that the effect_id is a valid effect and whether the user * is the owner */ static int check_effect_access(struct ff_device *ff, int effect_id, struct file *file) { if (effect_id < 0 || effect_id >= ff->max_effects || !ff->effect_owners[effect_id]) return -EINVAL; if (file && ff->effect_owners[effect_id] != file) return -EACCES; return 0; } /* * Checks whether 2 effects can be combined together */ static inline int check_effects_compatible(struct ff_effect *e1, struct ff_effect *e2) { return e1->type == e2->type && (e1->type != FF_PERIODIC || e1->u.periodic.waveform == e2->u.periodic.waveform); } /* * Convert an effect into compatible one */ static int compat_effect(struct ff_device *ff, struct ff_effect *effect) { int magnitude; switch (effect->type) { case FF_RUMBLE: if (!test_bit(FF_PERIODIC, ff->ffbit)) return -EINVAL; /* * calculate magnitude of sine wave as average of rumble's * 2/3 of strong magnitude and 1/3 of weak magnitude */ magnitude = effect->u.rumble.strong_magnitude / 3 + effect->u.rumble.weak_magnitude / 6; effect->type = FF_PERIODIC; effect->u.periodic.waveform = FF_SINE; effect->u.periodic.period = 50; effect->u.periodic.magnitude = magnitude; effect->u.periodic.offset = 0; effect->u.periodic.phase = 0; effect->u.periodic.envelope.attack_length = 0; effect->u.periodic.envelope.attack_level = 0; effect->u.periodic.envelope.fade_length = 0; effect->u.periodic.envelope.fade_level = 0; return 0; default: /* Let driver handle conversion */ return 0; } } /** * input_ff_upload() - upload effect into force-feedback device * @dev: input device * @effect: effect to be uploaded * @file: owner of the effect */ int input_ff_upload(struct input_dev *dev, struct ff_effect *effect, struct file *file) { struct ff_device *ff = dev->ff; struct ff_effect *old; int ret = 0; int id; if (!test_bit(EV_FF, dev->evbit)) return -ENOSYS; if (effect->type < FF_EFFECT_MIN || effect->type > FF_EFFECT_MAX || !test_bit(effect->type, dev->ffbit)) { dev_dbg(&dev->dev, "invalid or not supported effect type in upload\n"); return -EINVAL; } if (effect->type == FF_PERIODIC && (effect->u.periodic.waveform < FF_WAVEFORM_MIN || effect->u.periodic.waveform > FF_WAVEFORM_MAX || !test_bit(effect->u.periodic.waveform, dev->ffbit))) { dev_dbg(&dev->dev, "invalid or not supported wave form in upload\n"); return -EINVAL; } if (!test_bit(effect->type, ff->ffbit)) { ret = compat_effect(ff, effect); if (ret) return ret; } mutex_lock(&ff->mutex); if (effect->id == -1) { for (id = 0; id < ff->max_effects; id++) if (!ff->effect_owners[id]) break; if (id >= ff->max_effects) { ret = -ENOSPC; goto out; } effect->id = id; old = NULL; } else { id = effect->id; ret = check_effect_access(ff, id, file); if (ret) goto out; old = &ff->effects[id]; if (!check_effects_compatible(effect, old)) { ret = -EINVAL; goto out; } } ret = ff->upload(dev, effect, old); if (ret) goto out; spin_lock_irq(&dev->event_lock); ff->effects[id] = *effect; ff->effect_owners[id] = file; spin_unlock_irq(&dev->event_lock); out: mutex_unlock(&ff->mutex); return ret; } EXPORT_SYMBOL_GPL(input_ff_upload); /* * Erases the effect if the requester is also the effect owner. The mutex * should already be locked before calling this function. */ static int erase_effect(struct input_dev *dev, int effect_id, struct file *file) { struct ff_device *ff = dev->ff; int error; error = check_effect_access(ff, effect_id, file); if (error) return error; spin_lock_irq(&dev->event_lock); ff->playback(dev, effect_id, 0); ff->effect_owners[effect_id] = NULL; spin_unlock_irq(&dev->event_lock); if (ff->erase) { error = ff->erase(dev, effect_id); if (error) { spin_lock_irq(&dev->event_lock); ff->effect_owners[effect_id] = file; spin_unlock_irq(&dev->event_lock); return error; } } return 0; } /** * input_ff_erase - erase a force-feedback effect from device * @dev: input device to erase effect from * @effect_id: id of the effect to be erased * @file: purported owner of the request * * This function erases a force-feedback effect from specified device. * The effect will only be erased if it was uploaded through the same * file handle that is requesting erase. */ int input_ff_erase(struct input_dev *dev, int effect_id, struct file *file) { struct ff_device *ff = dev->ff; int ret; if (!test_bit(EV_FF, dev->evbit)) return -ENOSYS; mutex_lock(&ff->mutex); ret = erase_effect(dev, effect_id, file); mutex_unlock(&ff->mutex); return ret; } EXPORT_SYMBOL_GPL(input_ff_erase); /* * input_ff_flush - erase all effects owned by a file handle * @dev: input device to erase effect from * @file: purported owner of the effects * * This function erases all force-feedback effects associated with * the given owner from specified device. Note that @file may be %NULL, * in which case all effects will be erased. */ int input_ff_flush(struct input_dev *dev, struct file *file) { struct ff_device *ff = dev->ff; int i; dev_dbg(&dev->dev, "flushing now\n"); mutex_lock(&ff->mutex); for (i = 0; i < ff->max_effects; i++) erase_effect(dev, i, file); mutex_unlock(&ff->mutex); return 0; } EXPORT_SYMBOL_GPL(input_ff_flush); /** * input_ff_event() - generic handler for force-feedback events * @dev: input device to send the effect to * @type: event type (anything but EV_FF is ignored) * @code: event code * @value: event value */ int input_ff_event(struct input_dev *dev, unsigned int type, unsigned int code, int value) { struct ff_device *ff = dev->ff; if (type != EV_FF) return 0; switch (code) { case FF_GAIN: if (!test_bit(FF_GAIN, dev->ffbit) || value > 0xffffU) break; ff->set_gain(dev, value); break; case FF_AUTOCENTER: if (!test_bit(FF_AUTOCENTER, dev->ffbit) || value > 0xffffU) break; ff->set_autocenter(dev, value); break; default: if (check_effect_access(ff, code, NULL) == 0) ff->playback(dev, code, value); break; } return 0; } EXPORT_SYMBOL_GPL(input_ff_event); /** * input_ff_create() - create force-feedback device * @dev: input device supporting force-feedback * @max_effects: maximum number of effects supported by the device * * This function allocates all necessary memory for a force feedback * portion of an input device and installs all default handlers. * @dev->ffbit should be already set up before calling this function. * Once ff device is created you need to setup its upload, erase, * playback and other handlers before registering input device */ int input_ff_create(struct input_dev *dev, unsigned int max_effects) { struct ff_device *ff; size_t ff_dev_size; int i; if (!max_effects) { dev_err(&dev->dev, "cannot allocate device without any effects\n"); return -EINVAL; } if (max_effects > FF_MAX_EFFECTS) { dev_err(&dev->dev, "cannot allocate more than FF_MAX_EFFECTS effects\n"); return -EINVAL; } ff_dev_size = sizeof(struct ff_device) + max_effects * sizeof(struct file *); if (ff_dev_size < max_effects) /* overflow */ return -EINVAL; ff = kzalloc(ff_dev_size, GFP_KERNEL); if (!ff) return -ENOMEM; ff->effects = kcalloc(max_effects, sizeof(struct ff_effect), GFP_KERNEL); if (!ff->effects) { kfree(ff); return -ENOMEM; } ff->max_effects = max_effects; mutex_init(&ff->mutex); dev->ff = ff; dev->flush = input_ff_flush; dev->event = input_ff_event; __set_bit(EV_FF, dev->evbit); /* Copy "true" bits into ff device bitmap */ for_each_set_bit(i, dev->ffbit, FF_CNT) __set_bit(i, ff->ffbit); /* we can emulate RUMBLE with periodic effects */ if (test_bit(FF_PERIODIC, ff->ffbit)) __set_bit(FF_RUMBLE, dev->ffbit); return 0; } EXPORT_SYMBOL_GPL(input_ff_create); /** * input_ff_destroy() - frees force feedback portion of input device * @dev: input device supporting force feedback * * This function is only needed in error path as input core will * automatically free force feedback structures when device is * destroyed. */ void input_ff_destroy(struct input_dev *dev) { struct ff_device *ff = dev->ff; __clear_bit(EV_FF, dev->evbit); if (ff) { if (ff->destroy) ff->destroy(ff); kfree(ff->private); kfree(ff->effects); kfree(ff); dev->ff = NULL; } } EXPORT_SYMBOL_GPL(input_ff_destroy); |
| 296 296 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 | // SPDX-License-Identifier: GPL-2.0 #include "cgroup-internal.h" #include <linux/sched/task.h> #include <linux/slab.h> #include <linux/nsproxy.h> #include <linux/proc_ns.h> /* cgroup namespaces */ static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES); } static void dec_cgroup_namespaces(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES); } static struct cgroup_namespace *alloc_cgroup_ns(void) { struct cgroup_namespace *new_ns; int ret; new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) return ERR_PTR(-ENOMEM); ret = ns_alloc_inum(&new_ns->ns); if (ret) { kfree(new_ns); return ERR_PTR(ret); } refcount_set(&new_ns->ns.count, 1); new_ns->ns.ops = &cgroupns_operations; return new_ns; } void free_cgroup_ns(struct cgroup_namespace *ns) { put_css_set(ns->root_cset); dec_cgroup_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); kfree(ns); } EXPORT_SYMBOL(free_cgroup_ns); struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns) { struct cgroup_namespace *new_ns; struct ucounts *ucounts; struct css_set *cset; BUG_ON(!old_ns); if (!(flags & CLONE_NEWCGROUP)) { get_cgroup_ns(old_ns); return old_ns; } /* Allow only sysadmin to create cgroup namespace. */ if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); ucounts = inc_cgroup_namespaces(user_ns); if (!ucounts) return ERR_PTR(-ENOSPC); /* It is not safe to take cgroup_mutex here */ spin_lock_irq(&css_set_lock); cset = task_css_set(current); get_css_set(cset); spin_unlock_irq(&css_set_lock); new_ns = alloc_cgroup_ns(); if (IS_ERR(new_ns)) { put_css_set(cset); dec_cgroup_namespaces(ucounts); return new_ns; } new_ns->user_ns = get_user_ns(user_ns); new_ns->ucounts = ucounts; new_ns->root_cset = cset; return new_ns; } static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) { return container_of(ns, struct cgroup_namespace, ns); } static int cgroupns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; struct cgroup_namespace *cgroup_ns = to_cg_ns(ns); if (!ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN) || !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; /* Don't need to do anything if we are attaching to our own cgroupns. */ if (cgroup_ns == nsproxy->cgroup_ns) return 0; get_cgroup_ns(cgroup_ns); put_cgroup_ns(nsproxy->cgroup_ns); nsproxy->cgroup_ns = cgroup_ns; return 0; } static struct ns_common *cgroupns_get(struct task_struct *task) { struct cgroup_namespace *ns = NULL; struct nsproxy *nsproxy; task_lock(task); nsproxy = task->nsproxy; if (nsproxy) { ns = nsproxy->cgroup_ns; get_cgroup_ns(ns); } task_unlock(task); return ns ? &ns->ns : NULL; } static void cgroupns_put(struct ns_common *ns) { put_cgroup_ns(to_cg_ns(ns)); } static struct user_namespace *cgroupns_owner(struct ns_common *ns) { return to_cg_ns(ns)->user_ns; } const struct proc_ns_operations cgroupns_operations = { .name = "cgroup", .type = CLONE_NEWCGROUP, .get = cgroupns_get, .put = cgroupns_put, .install = cgroupns_install, .owner = cgroupns_owner, }; |
| 21 21 21 3 3 74 74 19 18 19 74 23 23 74 74 74 74 74 230 184 184 229 28 2 2 28 1 28 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 | // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * IPv4 Forwarding Information Base: semantics. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/proc_fs.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/netlink.h> #include <linux/hash.h> #include <linux/nospec.h> #include <net/arp.h> #include <net/inet_dscp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> #include <net/tcp.h> #include <net/sock.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> #include <net/nexthop.h> #include <net/netlink.h> #include <net/rtnh.h> #include <net/lwtunnel.h> #include <net/fib_notifier.h> #include <net/addrconf.h> #include "fib_lookup.h" static DEFINE_SPINLOCK(fib_info_lock); static struct hlist_head *fib_info_hash; static struct hlist_head *fib_info_laddrhash; static unsigned int fib_info_hash_size; static unsigned int fib_info_hash_bits; static unsigned int fib_info_cnt; #define DEVINDEX_HASHBITS 8 #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS) static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE]; /* for_nexthops and change_nexthops only used when nexthop object * is not set in a fib_info. The logic within can reference fib_nh. */ #ifdef CONFIG_IP_ROUTE_MULTIPATH #define for_nexthops(fi) { \ int nhsel; const struct fib_nh *nh; \ for (nhsel = 0, nh = (fi)->fib_nh; \ nhsel < fib_info_num_path((fi)); \ nh++, nhsel++) #define change_nexthops(fi) { \ int nhsel; struct fib_nh *nexthop_nh; \ for (nhsel = 0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \ nhsel < fib_info_num_path((fi)); \ nexthop_nh++, nhsel++) #else /* CONFIG_IP_ROUTE_MULTIPATH */ /* Hope, that gcc will optimize it to get rid of dummy loop */ #define for_nexthops(fi) { \ int nhsel; const struct fib_nh *nh = (fi)->fib_nh; \ for (nhsel = 0; nhsel < 1; nhsel++) #define change_nexthops(fi) { \ int nhsel; \ struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \ for (nhsel = 0; nhsel < 1; nhsel++) #endif /* CONFIG_IP_ROUTE_MULTIPATH */ #define endfor_nexthops(fi) } const struct fib_prop fib_props[RTN_MAX + 1] = { [RTN_UNSPEC] = { .error = 0, .scope = RT_SCOPE_NOWHERE, }, [RTN_UNICAST] = { .error = 0, .scope = RT_SCOPE_UNIVERSE, }, [RTN_LOCAL] = { .error = 0, .scope = RT_SCOPE_HOST, }, [RTN_BROADCAST] = { .error = 0, .scope = RT_SCOPE_LINK, }, [RTN_ANYCAST] = { .error = 0, .scope = RT_SCOPE_LINK, }, [RTN_MULTICAST] = { .error = 0, .scope = RT_SCOPE_UNIVERSE, }, [RTN_BLACKHOLE] = { .error = -EINVAL, .scope = RT_SCOPE_UNIVERSE, }, [RTN_UNREACHABLE] = { .error = -EHOSTUNREACH, .scope = RT_SCOPE_UNIVERSE, }, [RTN_PROHIBIT] = { .error = -EACCES, .scope = RT_SCOPE_UNIVERSE, }, [RTN_THROW] = { .error = -EAGAIN, .scope = RT_SCOPE_UNIVERSE, }, [RTN_NAT] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE, }, [RTN_XRESOLVE] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE, }, }; static void rt_fibinfo_free(struct rtable __rcu **rtp) { struct rtable *rt = rcu_dereference_protected(*rtp, 1); if (!rt) return; /* Not even needed : RCU_INIT_POINTER(*rtp, NULL); * because we waited an RCU grace period before calling * free_fib_info_rcu() */ dst_dev_put(&rt->dst); dst_release_immediate(&rt->dst); } static void free_nh_exceptions(struct fib_nh_common *nhc) { struct fnhe_hash_bucket *hash; int i; hash = rcu_dereference_protected(nhc->nhc_exceptions, 1); if (!hash) return; for (i = 0; i < FNHE_HASH_SIZE; i++) { struct fib_nh_exception *fnhe; fnhe = rcu_dereference_protected(hash[i].chain, 1); while (fnhe) { struct fib_nh_exception *next; next = rcu_dereference_protected(fnhe->fnhe_next, 1); rt_fibinfo_free(&fnhe->fnhe_rth_input); rt_fibinfo_free(&fnhe->fnhe_rth_output); kfree(fnhe); fnhe = next; } } kfree(hash); } static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp) { int cpu; if (!rtp) return; for_each_possible_cpu(cpu) { struct rtable *rt; rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1); if (rt) { dst_dev_put(&rt->dst); dst_release_immediate(&rt->dst); } } free_percpu(rtp); } void fib_nh_common_release(struct fib_nh_common *nhc) { netdev_put(nhc->nhc_dev, &nhc->nhc_dev_tracker); lwtstate_put(nhc->nhc_lwtstate); rt_fibinfo_free_cpus(nhc->nhc_pcpu_rth_output); rt_fibinfo_free(&nhc->nhc_rth_input); free_nh_exceptions(nhc); } EXPORT_SYMBOL_GPL(fib_nh_common_release); void fib_nh_release(struct net *net, struct fib_nh *fib_nh) { #ifdef CONFIG_IP_ROUTE_CLASSID if (fib_nh->nh_tclassid) atomic_dec(&net->ipv4.fib_num_tclassid_users); #endif fib_nh_common_release(&fib_nh->nh_common); } /* Release a nexthop info record */ static void free_fib_info_rcu(struct rcu_head *head) { struct fib_info *fi = container_of(head, struct fib_info, rcu); if (fi->nh) { nexthop_put(fi->nh); } else { change_nexthops(fi) { fib_nh_release(fi->fib_net, nexthop_nh); } endfor_nexthops(fi); } ip_fib_metrics_put(fi->fib_metrics); kfree(fi); } void free_fib_info(struct fib_info *fi) { if (fi->fib_dead == 0) { pr_warn("Freeing alive fib_info %p\n", fi); return; } call_rcu(&fi->rcu, free_fib_info_rcu); } EXPORT_SYMBOL_GPL(free_fib_info); void fib_release_info(struct fib_info *fi) { spin_lock_bh(&fib_info_lock); if (fi && refcount_dec_and_test(&fi->fib_treeref)) { hlist_del(&fi->fib_hash); /* Paired with READ_ONCE() in fib_create_info(). */ WRITE_ONCE(fib_info_cnt, fib_info_cnt - 1); if (fi->fib_prefsrc) hlist_del(&fi->fib_lhash); if (fi->nh) { list_del(&fi->nh_list); } else { change_nexthops(fi) { if (!nexthop_nh->fib_nh_dev) continue; hlist_del(&nexthop_nh->nh_hash); } endfor_nexthops(fi) } /* Paired with READ_ONCE() from fib_table_lookup() */ WRITE_ONCE(fi->fib_dead, 1); fib_info_put(fi); } spin_unlock_bh(&fib_info_lock); } static inline int nh_comp(struct fib_info *fi, struct fib_info *ofi) { const struct fib_nh *onh; if (fi->nh || ofi->nh) return nexthop_cmp(fi->nh, ofi->nh) ? 0 : -1; if (ofi->fib_nhs == 0) return 0; for_nexthops(fi) { onh = fib_info_nh(ofi, nhsel); if (nh->fib_nh_oif != onh->fib_nh_oif || nh->fib_nh_gw_family != onh->fib_nh_gw_family || nh->fib_nh_scope != onh->fib_nh_scope || #ifdef CONFIG_IP_ROUTE_MULTIPATH nh->fib_nh_weight != onh->fib_nh_weight || #endif #ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid != onh->nh_tclassid || #endif lwtunnel_cmp_encap(nh->fib_nh_lws, onh->fib_nh_lws) || ((nh->fib_nh_flags ^ onh->fib_nh_flags) & ~RTNH_COMPARE_MASK)) return -1; if (nh->fib_nh_gw_family == AF_INET && nh->fib_nh_gw4 != onh->fib_nh_gw4) return -1; if (nh->fib_nh_gw_family == AF_INET6 && ipv6_addr_cmp(&nh->fib_nh_gw6, &onh->fib_nh_gw6)) return -1; } endfor_nexthops(fi); return 0; } static inline unsigned int fib_devindex_hashfn(unsigned int val) { return hash_32(val, DEVINDEX_HASHBITS); } static struct hlist_head * fib_info_devhash_bucket(const struct net_device *dev) { u32 val = net_hash_mix(dev_net(dev)) ^ dev->ifindex; return &fib_info_devhash[fib_devindex_hashfn(val)]; } static unsigned int fib_info_hashfn_1(int init_val, u8 protocol, u8 scope, u32 prefsrc, u32 priority) { unsigned int val = init_val; val ^= (protocol << 8) | scope; val ^= prefsrc; val ^= priority; return val; } static unsigned int fib_info_hashfn_result(unsigned int val) { unsigned int mask = (fib_info_hash_size - 1); return (val ^ (val >> 7) ^ (val >> 12)) & mask; } static inline unsigned int fib_info_hashfn(struct fib_info *fi) { unsigned int val; val = fib_info_hashfn_1(fi->fib_nhs, fi->fib_protocol, fi->fib_scope, (__force u32)fi->fib_prefsrc, fi->fib_priority); if (fi->nh) { val ^= fib_devindex_hashfn(fi->nh->id); } else { for_nexthops(fi) { val ^= fib_devindex_hashfn(nh->fib_nh_oif); } endfor_nexthops(fi) } return fib_info_hashfn_result(val); } /* no metrics, only nexthop id */ static struct fib_info *fib_find_info_nh(struct net *net, const struct fib_config *cfg) { struct hlist_head *head; struct fib_info *fi; unsigned int hash; hash = fib_info_hashfn_1(fib_devindex_hashfn(cfg->fc_nh_id), cfg->fc_protocol, cfg->fc_scope, (__force u32)cfg->fc_prefsrc, cfg->fc_priority); hash = fib_info_hashfn_result(hash); head = &fib_info_hash[hash]; hlist_for_each_entry(fi, head, fib_hash) { if (!net_eq(fi->fib_net, net)) continue; if (!fi->nh || fi->nh->id != cfg->fc_nh_id) continue; if (cfg->fc_protocol == fi->fib_protocol && cfg->fc_scope == fi->fib_scope && cfg->fc_prefsrc == fi->fib_prefsrc && cfg->fc_priority == fi->fib_priority && cfg->fc_type == fi->fib_type && cfg->fc_table == fi->fib_tb_id && !((cfg->fc_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK)) return fi; } return NULL; } static struct fib_info *fib_find_info(struct fib_info *nfi) { struct hlist_head *head; struct fib_info *fi; unsigned int hash; hash = fib_info_hashfn(nfi); head = &fib_info_hash[hash]; hlist_for_each_entry(fi, head, fib_hash) { if (!net_eq(fi->fib_net, nfi->fib_net)) continue; if (fi->fib_nhs != nfi->fib_nhs) continue; if (nfi->fib_protocol == fi->fib_protocol && nfi->fib_scope == fi->fib_scope && nfi->fib_prefsrc == fi->fib_prefsrc && nfi->fib_priority == fi->fib_priority && nfi->fib_type == fi->fib_type && nfi->fib_tb_id == fi->fib_tb_id && memcmp(nfi->fib_metrics, fi->fib_metrics, sizeof(u32) * RTAX_MAX) == 0 && !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) && nh_comp(fi, nfi) == 0) return fi; } return NULL; } /* Check, that the gateway is already configured. * Used only by redirect accept routine. */ int ip_fib_check_default(__be32 gw, struct net_device *dev) { struct hlist_head *head; struct fib_nh *nh; spin_lock(&fib_info_lock); head = fib_info_devhash_bucket(dev); hlist_for_each_entry(nh, head, nh_hash) { if (nh->fib_nh_dev == dev && nh->fib_nh_gw4 == gw && !(nh->fib_nh_flags & RTNH_F_DEAD)) { spin_unlock(&fib_info_lock); return 0; } } spin_unlock(&fib_info_lock); return -1; } size_t fib_nlmsg_size(struct fib_info *fi) { size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg)) + nla_total_size(4) /* RTA_TABLE */ + nla_total_size(4) /* RTA_DST */ + nla_total_size(4) /* RTA_PRIORITY */ + nla_total_size(4) /* RTA_PREFSRC */ + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */ unsigned int nhs = fib_info_num_path(fi); /* space for nested metrics */ payload += nla_total_size((RTAX_MAX * nla_total_size(4))); if (fi->nh) payload += nla_total_size(4); /* RTA_NH_ID */ if (nhs) { size_t nh_encapsize = 0; /* Also handles the special case nhs == 1 */ /* each nexthop is packed in an attribute */ size_t nhsize = nla_total_size(sizeof(struct rtnexthop)); unsigned int i; /* may contain flow and gateway attribute */ nhsize += 2 * nla_total_size(4); /* grab encap info */ for (i = 0; i < fib_info_num_path(fi); i++) { struct fib_nh_common *nhc = fib_info_nhc(fi, i); if (nhc->nhc_lwtstate) { /* RTA_ENCAP_TYPE */ nh_encapsize += lwtunnel_get_encap_size( nhc->nhc_lwtstate); /* RTA_ENCAP */ nh_encapsize += nla_total_size(2); } } /* all nexthops are packed in a nested attribute */ payload += nla_total_size((nhs * nhsize) + nh_encapsize); } return payload; } void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len, u32 tb_id, const struct nl_info *info, unsigned int nlm_flags) { struct fib_rt_info fri; struct sk_buff *skb; u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; int err = -ENOBUFS; skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL); if (!skb) goto errout; fri.fi = fa->fa_info; fri.tb_id = tb_id; fri.dst = key; fri.dst_len = dst_len; fri.dscp = fa->fa_dscp; fri.type = fa->fa_type; fri.offload = READ_ONCE(fa->offload); fri.trap = READ_ONCE(fa->trap); fri.offload_failed = READ_ONCE(fa->offload_failed); err = fib_dump_info(skb, info->portid, seq, event, &fri, nlm_flags); if (err < 0) { /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV4_ROUTE, info->nlh, GFP_KERNEL); return; errout: if (err < 0) rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err); } static int fib_detect_death(struct fib_info *fi, int order, struct fib_info **last_resort, int *last_idx, int dflt) { const struct fib_nh_common *nhc = fib_info_nhc(fi, 0); struct neighbour *n; int state = NUD_NONE; if (likely(nhc->nhc_gw_family == AF_INET)) n = neigh_lookup(&arp_tbl, &nhc->nhc_gw.ipv4, nhc->nhc_dev); else if (nhc->nhc_gw_family == AF_INET6) n = neigh_lookup(ipv6_stub->nd_tbl, &nhc->nhc_gw.ipv6, nhc->nhc_dev); else n = NULL; if (n) { state = READ_ONCE(n->nud_state); neigh_release(n); } else { return 0; } if (state == NUD_REACHABLE) return 0; if ((state & NUD_VALID) && order != dflt) return 0; if ((state & NUD_VALID) || (*last_idx < 0 && order > dflt && state != NUD_INCOMPLETE)) { *last_resort = fi; *last_idx = order; } return 1; } int fib_nh_common_init(struct net *net, struct fib_nh_common *nhc, struct nlattr *encap, u16 encap_type, void *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { int err; nhc->nhc_pcpu_rth_output = alloc_percpu_gfp(struct rtable __rcu *, gfp_flags); if (!nhc->nhc_pcpu_rth_output) return -ENOMEM; if (encap) { struct lwtunnel_state *lwtstate; if (encap_type == LWTUNNEL_ENCAP_NONE) { NL_SET_ERR_MSG(extack, "LWT encap type not specified"); err = -EINVAL; goto lwt_failure; } err = lwtunnel_build_state(net, encap_type, encap, nhc->nhc_family, cfg, &lwtstate, extack); if (err) goto lwt_failure; nhc->nhc_lwtstate = lwtstate_get(lwtstate); } return 0; lwt_failure: rt_fibinfo_free_cpus(nhc->nhc_pcpu_rth_output); nhc->nhc_pcpu_rth_output = NULL; return err; } EXPORT_SYMBOL_GPL(fib_nh_common_init); int fib_nh_init(struct net *net, struct fib_nh *nh, struct fib_config *cfg, int nh_weight, struct netlink_ext_ack *extack) { int err; nh->fib_nh_family = AF_INET; err = fib_nh_common_init(net, &nh->nh_common, cfg->fc_encap, cfg->fc_encap_type, cfg, GFP_KERNEL, extack); if (err) return err; nh->fib_nh_oif = cfg->fc_oif; nh->fib_nh_gw_family = cfg->fc_gw_family; if (cfg->fc_gw_family == AF_INET) nh->fib_nh_gw4 = cfg->fc_gw4; else if (cfg->fc_gw_family == AF_INET6) nh->fib_nh_gw6 = cfg->fc_gw6; nh->fib_nh_flags = cfg->fc_flags; #ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid = cfg->fc_flow; if (nh->nh_tclassid) atomic_inc(&net->ipv4.fib_num_tclassid_users); #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH nh->fib_nh_weight = nh_weight; #endif return 0; } #ifdef CONFIG_IP_ROUTE_MULTIPATH static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining, struct netlink_ext_ack *extack) { int nhs = 0; while (rtnh_ok(rtnh, remaining)) { nhs++; rtnh = rtnh_next(rtnh, &remaining); } /* leftover implies invalid nexthop configuration, discard it */ if (remaining > 0) { NL_SET_ERR_MSG(extack, "Invalid nexthop configuration - extra data after nexthops"); nhs = 0; } return nhs; } static int fib_gw_from_attr(__be32 *gw, struct nlattr *nla, struct netlink_ext_ack *extack) { if (nla_len(nla) < sizeof(*gw)) { NL_SET_ERR_MSG(extack, "Invalid IPv4 address in RTA_GATEWAY"); return -EINVAL; } *gw = nla_get_in_addr(nla); return 0; } /* only called when fib_nh is integrated into fib_info */ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, int remaining, struct fib_config *cfg, struct netlink_ext_ack *extack) { struct net *net = fi->fib_net; struct fib_config fib_cfg; struct fib_nh *nh; int ret; change_nexthops(fi) { int attrlen; memset(&fib_cfg, 0, sizeof(fib_cfg)); if (!rtnh_ok(rtnh, remaining)) { NL_SET_ERR_MSG(extack, "Invalid nexthop configuration - extra data after nexthop"); return -EINVAL; } if (rtnh->rtnh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) { NL_SET_ERR_MSG(extack, "Invalid flags for nexthop - can not contain DEAD or LINKDOWN"); return -EINVAL; } fib_cfg.fc_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags; fib_cfg.fc_oif = rtnh->rtnh_ifindex; attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { struct nlattr *nla, *nlav, *attrs = rtnh_attrs(rtnh); nla = nla_find(attrs, attrlen, RTA_GATEWAY); nlav = nla_find(attrs, attrlen, RTA_VIA); if (nla && nlav) { NL_SET_ERR_MSG(extack, "Nexthop configuration can not contain both GATEWAY and VIA"); return -EINVAL; } if (nla) { ret = fib_gw_from_attr(&fib_cfg.fc_gw4, nla, extack); if (ret) goto errout; if (fib_cfg.fc_gw4) fib_cfg.fc_gw_family = AF_INET; } else if (nlav) { ret = fib_gw_from_via(&fib_cfg, nlav, extack); if (ret) goto errout; } nla = nla_find(attrs, attrlen, RTA_FLOW); if (nla) { if (nla_len(nla) < sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid RTA_FLOW"); return -EINVAL; } fib_cfg.fc_flow = nla_get_u32(nla); } fib_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); /* RTA_ENCAP_TYPE length checked in * lwtunnel_valid_encap_type_attr */ nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); if (nla) fib_cfg.fc_encap_type = nla_get_u16(nla); } ret = fib_nh_init(net, nexthop_nh, &fib_cfg, rtnh->rtnh_hops + 1, extack); if (ret) goto errout; rtnh = rtnh_next(rtnh, &remaining); } endfor_nexthops(fi); ret = -EINVAL; nh = fib_info_nh(fi, 0); if (cfg->fc_oif && nh->fib_nh_oif != cfg->fc_oif) { NL_SET_ERR_MSG(extack, "Nexthop device index does not match RTA_OIF"); goto errout; } if (cfg->fc_gw_family) { if (cfg->fc_gw_family != nh->fib_nh_gw_family || (cfg->fc_gw_family == AF_INET && nh->fib_nh_gw4 != cfg->fc_gw4) || (cfg->fc_gw_family == AF_INET6 && ipv6_addr_cmp(&nh->fib_nh_gw6, &cfg->fc_gw6))) { NL_SET_ERR_MSG(extack, "Nexthop gateway does not match RTA_GATEWAY or RTA_VIA"); goto errout; } } #ifdef CONFIG_IP_ROUTE_CLASSID if (cfg->fc_flow && nh->nh_tclassid != cfg->fc_flow) { NL_SET_ERR_MSG(extack, "Nexthop class id does not match RTA_FLOW"); goto errout; } #endif ret = 0; errout: return ret; } /* only called when fib_nh is integrated into fib_info */ static void fib_rebalance(struct fib_info *fi) { int total; int w; if (fib_info_num_path(fi) < 2) return; total = 0; for_nexthops(fi) { if (nh->fib_nh_flags & RTNH_F_DEAD) continue; if (ip_ignore_linkdown(nh->fib_nh_dev) && nh->fib_nh_flags & RTNH_F_LINKDOWN) continue; total += nh->fib_nh_weight; } endfor_nexthops(fi); w = 0; change_nexthops(fi) { int upper_bound; if (nexthop_nh->fib_nh_flags & RTNH_F_DEAD) { upper_bound = -1; } else if (ip_ignore_linkdown(nexthop_nh->fib_nh_dev) && nexthop_nh->fib_nh_flags & RTNH_F_LINKDOWN) { upper_bound = -1; } else { w += nexthop_nh->fib_nh_weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1; } atomic_set(&nexthop_nh->fib_nh_upper_bound, upper_bound); } endfor_nexthops(fi); } #else /* CONFIG_IP_ROUTE_MULTIPATH */ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, int remaining, struct fib_config *cfg, struct netlink_ext_ack *extack) { NL_SET_ERR_MSG(extack, "Multipath support not enabled in kernel"); return -EINVAL; } #define fib_rebalance(fi) do { } while (0) #endif /* CONFIG_IP_ROUTE_MULTIPATH */ static int fib_encap_match(struct net *net, u16 encap_type, struct nlattr *encap, const struct fib_nh *nh, const struct fib_config *cfg, struct netlink_ext_ack *extack) { struct lwtunnel_state *lwtstate; int ret, result = 0; if (encap_type == LWTUNNEL_ENCAP_NONE) return 0; ret = lwtunnel_build_state(net, encap_type, encap, AF_INET, cfg, &lwtstate, extack); if (!ret) { result = lwtunnel_cmp_encap(lwtstate, nh->fib_nh_lws); lwtstate_free(lwtstate); } return result; } int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi, struct netlink_ext_ack *extack) { #ifdef CONFIG_IP_ROUTE_MULTIPATH struct rtnexthop *rtnh; int remaining; #endif if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority) return 1; if (cfg->fc_nh_id) { if (fi->nh && cfg->fc_nh_id == fi->nh->id) return 0; return 1; } if (fi->nh) { if (cfg->fc_oif || cfg->fc_gw_family || cfg->fc_mp) return 1; return 0; } if (cfg->fc_oif || cfg->fc_gw_family) { struct fib_nh *nh; nh = fib_info_nh(fi, 0); if (cfg->fc_encap) { if (fib_encap_match(net, cfg->fc_encap_type, cfg->fc_encap, nh, cfg, extack)) return 1; } #ifdef CONFIG_IP_ROUTE_CLASSID if (cfg->fc_flow && cfg->fc_flow != nh->nh_tclassid) return 1; #endif if ((cfg->fc_oif && cfg->fc_oif != nh->fib_nh_oif) || (cfg->fc_gw_family && cfg->fc_gw_family != nh->fib_nh_gw_family)) return 1; if (cfg->fc_gw_family == AF_INET && cfg->fc_gw4 != nh->fib_nh_gw4) return 1; if (cfg->fc_gw_family == AF_INET6 && ipv6_addr_cmp(&cfg->fc_gw6, &nh->fib_nh_gw6)) return 1; return 0; } #ifdef CONFIG_IP_ROUTE_MULTIPATH if (!cfg->fc_mp) return 0; rtnh = cfg->fc_mp; remaining = cfg->fc_mp_len; for_nexthops(fi) { int attrlen; if (!rtnh_ok(rtnh, remaining)) return -EINVAL; if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->fib_nh_oif) return 1; attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { struct nlattr *nla, *nlav, *attrs = rtnh_attrs(rtnh); int err; nla = nla_find(attrs, attrlen, RTA_GATEWAY); nlav = nla_find(attrs, attrlen, RTA_VIA); if (nla && nlav) { NL_SET_ERR_MSG(extack, "Nexthop configuration can not contain both GATEWAY and VIA"); return -EINVAL; } if (nla) { __be32 gw; err = fib_gw_from_attr(&gw, nla, extack); if (err) return err; if (nh->fib_nh_gw_family != AF_INET || gw != nh->fib_nh_gw4) return 1; } else if (nlav) { struct fib_config cfg2; err = fib_gw_from_via(&cfg2, nlav, extack); if (err) return err; switch (nh->fib_nh_gw_family) { case AF_INET: if (cfg2.fc_gw_family != AF_INET || cfg2.fc_gw4 != nh->fib_nh_gw4) return 1; break; case AF_INET6: if (cfg2.fc_gw_family != AF_INET6 || ipv6_addr_cmp(&cfg2.fc_gw6, &nh->fib_nh_gw6)) return 1; break; } } #ifdef CONFIG_IP_ROUTE_CLASSID nla = nla_find(attrs, attrlen, RTA_FLOW); if (nla) { if (nla_len(nla) < sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid RTA_FLOW"); return -EINVAL; } if (nla_get_u32(nla) != nh->nh_tclassid) return 1; } #endif } rtnh = rtnh_next(rtnh, &remaining); } endfor_nexthops(fi); #endif return 0; } bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) { struct nlattr *nla; int remaining; if (!cfg->fc_mx) return true; nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { int type = nla_type(nla); u32 fi_val, val; if (!type) continue; if (type > RTAX_MAX) return false; type = array_index_nospec(type, RTAX_MAX + 1); if (type == RTAX_CC_ALGO) { char tmp[TCP_CA_NAME_MAX]; bool ecn_ca = false; nla_strscpy(tmp, nla, sizeof(tmp)); val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca); } else { if (nla_len(nla) != sizeof(u32)) return false; val = nla_get_u32(nla); } fi_val = fi->fib_metrics->metrics[type - 1]; if (type == RTAX_FEATURES) fi_val &= ~DST_FEATURE_ECN_CA; if (fi_val != val) return false; } return true; } static int fib_check_nh_v6_gw(struct net *net, struct fib_nh *nh, u32 table, struct netlink_ext_ack *extack) { struct fib6_config cfg = { .fc_table = table, .fc_flags = nh->fib_nh_flags | RTF_GATEWAY, .fc_ifindex = nh->fib_nh_oif, .fc_gateway = nh->fib_nh_gw6, }; struct fib6_nh fib6_nh = {}; int err; err = ipv6_stub->fib6_nh_init(net, &fib6_nh, &cfg, GFP_KERNEL, extack); if (!err) { nh->fib_nh_dev = fib6_nh.fib_nh_dev; netdev_hold(nh->fib_nh_dev, &nh->fib_nh_dev_tracker, GFP_KERNEL); nh->fib_nh_oif = nh->fib_nh_dev->ifindex; nh->fib_nh_scope = RT_SCOPE_LINK; ipv6_stub->fib6_nh_release(&fib6_nh); } return err; } /* * Picture * ------- * * Semantics of nexthop is very messy by historical reasons. * We have to take into account, that: * a) gateway can be actually local interface address, * so that gatewayed route is direct. * b) gateway must be on-link address, possibly * described not by an ifaddr, but also by a direct route. * c) If both gateway and interface are specified, they should not * contradict. * d) If we use tunnel routes, gateway could be not on-link. * * Attempt to reconcile all of these (alas, self-contradictory) conditions * results in pretty ugly and hairy code with obscure logic. * * I chose to generalized it instead, so that the size * of code does not increase practically, but it becomes * much more general. * Every prefix is assigned a "scope" value: "host" is local address, * "link" is direct route, * [ ... "site" ... "interior" ... ] * and "universe" is true gateway route with global meaning. * * Every prefix refers to a set of "nexthop"s (gw, oif), * where gw must have narrower scope. This recursion stops * when gw has LOCAL scope or if "nexthop" is declared ONLINK, * which means that gw is forced to be on link. * * Code is still hairy, but now it is apparently logically * consistent and very flexible. F.e. as by-product it allows * to co-exists in peace independent exterior and interior * routing processes. * * Normally it looks as following. * * {universe prefix} -> (gw, oif) [scope link] * | * |-> {link prefix} -> (gw, oif) [scope local] * | * |-> {local prefix} (terminal node) */ static int fib_check_nh_v4_gw(struct net *net, struct fib_nh *nh, u32 table, u8 scope, struct netlink_ext_ack *extack) { struct net_device *dev; struct fib_result res; int err = 0; if (nh->fib_nh_flags & RTNH_F_ONLINK) { unsigned int addr_type; if (scope >= RT_SCOPE_LINK) { NL_SET_ERR_MSG(extack, "Nexthop has invalid scope"); return -EINVAL; } dev = __dev_get_by_index(net, nh->fib_nh_oif); if (!dev) { NL_SET_ERR_MSG(extack, "Nexthop device required for onlink"); return -ENODEV; } if (!(dev->flags & IFF_UP)) { NL_SET_ERR_MSG(extack, "Nexthop device is not up"); return -ENETDOWN; } addr_type = inet_addr_type_dev_table(net, dev, nh->fib_nh_gw4); if (addr_type != RTN_UNICAST) { NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); return -EINVAL; } if (!netif_carrier_ok(dev)) nh->fib_nh_flags |= RTNH_F_LINKDOWN; nh->fib_nh_dev = dev; netdev_hold(dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC); nh->fib_nh_scope = RT_SCOPE_LINK; return 0; } rcu_read_lock(); { struct fib_table *tbl = NULL; struct flowi4 fl4 = { .daddr = nh->fib_nh_gw4, .flowi4_scope = scope + 1, .flowi4_oif = nh->fib_nh_oif, .flowi4_iif = LOOPBACK_IFINDEX, }; /* It is not necessary, but requires a bit of thinking */ if (fl4.flowi4_scope < RT_SCOPE_LINK) fl4.flowi4_scope = RT_SCOPE_LINK; if (table && table != RT_TABLE_MAIN) tbl = fib_get_table(net, table); if (tbl) err = fib_table_lookup(tbl, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE | FIB_LOOKUP_NOREF); /* on error or if no table given do full lookup. This * is needed for example when nexthops are in the local * table rather than the given table */ if (!tbl || err) { err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE); } if (err) { NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); goto out; } } err = -EINVAL; if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) { NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); goto out; } nh->fib_nh_scope = res.scope; nh->fib_nh_oif = FIB_RES_OIF(res); nh->fib_nh_dev = dev = FIB_RES_DEV(res); if (!dev) { NL_SET_ERR_MSG(extack, "No egress device for nexthop gateway"); goto out; } netdev_hold(dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC); if (!netif_carrier_ok(dev)) nh->fib_nh_flags |= RTNH_F_LINKDOWN; err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN; out: rcu_read_unlock(); return err; } static int fib_check_nh_nongw(struct net *net, struct fib_nh *nh, struct netlink_ext_ack *extack) { struct in_device *in_dev; int err; if (nh->fib_nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) { NL_SET_ERR_MSG(extack, "Invalid flags for nexthop - PERVASIVE and ONLINK can not be set"); return -EINVAL; } rcu_read_lock(); err = -ENODEV; in_dev = inetdev_by_index(net, nh->fib_nh_oif); if (!in_dev) goto out; err = -ENETDOWN; if (!(in_dev->dev->flags & IFF_UP)) { NL_SET_ERR_MSG(extack, "Device for nexthop is not up"); goto out; } nh->fib_nh_dev = in_dev->dev; netdev_hold(nh->fib_nh_dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC); nh->fib_nh_scope = RT_SCOPE_HOST; if (!netif_carrier_ok(nh->fib_nh_dev)) nh->fib_nh_flags |= RTNH_F_LINKDOWN; err = 0; out: rcu_read_unlock(); return err; } int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope, struct netlink_ext_ack *extack) { int err; if (nh->fib_nh_gw_family == AF_INET) err = fib_check_nh_v4_gw(net, nh, table, scope, extack); else if (nh->fib_nh_gw_family == AF_INET6) err = fib_check_nh_v6_gw(net, nh, table, extack); else err = fib_check_nh_nongw(net, nh, extack); return err; } static struct hlist_head * fib_info_laddrhash_bucket(const struct net *net, __be32 val) { u32 slot = hash_32(net_hash_mix(net) ^ (__force u32)val, fib_info_hash_bits); return &fib_info_laddrhash[slot]; } static void fib_info_hash_move(struct hlist_head *new_info_hash, struct hlist_head *new_laddrhash, unsigned int new_size) { struct hlist_head *old_info_hash, *old_laddrhash; unsigned int old_size = fib_info_hash_size; unsigned int i; spin_lock_bh(&fib_info_lock); old_info_hash = fib_info_hash; old_laddrhash = fib_info_laddrhash; fib_info_hash_size = new_size; fib_info_hash_bits = ilog2(new_size); for (i = 0; i < old_size; i++) { struct hlist_head *head = &fib_info_hash[i]; struct hlist_node *n; struct fib_info *fi; hlist_for_each_entry_safe(fi, n, head, fib_hash) { struct hlist_head *dest; unsigned int new_hash; new_hash = fib_info_hashfn(fi); dest = &new_info_hash[new_hash]; hlist_add_head(&fi->fib_hash, dest); } } fib_info_hash = new_info_hash; fib_info_laddrhash = new_laddrhash; for (i = 0; i < old_size; i++) { struct hlist_head *lhead = &old_laddrhash[i]; struct hlist_node *n; struct fib_info *fi; hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) { struct hlist_head *ldest; ldest = fib_info_laddrhash_bucket(fi->fib_net, fi->fib_prefsrc); hlist_add_head(&fi->fib_lhash, ldest); } } spin_unlock_bh(&fib_info_lock); kvfree(old_info_hash); kvfree(old_laddrhash); } __be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc, unsigned char scope) { struct fib_nh *nh; if (nhc->nhc_family != AF_INET) return inet_select_addr(nhc->nhc_dev, 0, scope); nh = container_of(nhc, struct fib_nh, nh_common); nh->nh_saddr = inet_select_addr(nh->fib_nh_dev, nh->fib_nh_gw4, scope); nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid); return nh->nh_saddr; } __be32 fib_result_prefsrc(struct net *net, struct fib_result *res) { struct fib_nh_common *nhc = res->nhc; if (res->fi->fib_prefsrc) return res->fi->fib_prefsrc; if (nhc->nhc_family == AF_INET) { struct fib_nh *nh; nh = container_of(nhc, struct fib_nh, nh_common); if (nh->nh_saddr_genid == atomic_read(&net->ipv4.dev_addr_genid)) return nh->nh_saddr; } return fib_info_update_nhc_saddr(net, nhc, res->fi->fib_scope); } static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc) { if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst || fib_prefsrc != cfg->fc_dst) { u32 tb_id = cfg->fc_table; int rc; if (tb_id == RT_TABLE_MAIN) tb_id = RT_TABLE_LOCAL; rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net, fib_prefsrc, tb_id); if (rc != RTN_LOCAL && tb_id != RT_TABLE_LOCAL) { rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net, fib_prefsrc, RT_TABLE_LOCAL); } if (rc != RTN_LOCAL) return false; } return true; } struct fib_info *fib_create_info(struct fib_config *cfg, struct netlink_ext_ack *extack) { int err; struct fib_info *fi = NULL; struct nexthop *nh = NULL; struct fib_info *ofi; int nhs = 1; struct net *net = cfg->fc_nlinfo.nl_net; if (cfg->fc_type > RTN_MAX) goto err_inval; /* Fast check to catch the most weird cases */ if (fib_props[cfg->fc_type].scope > cfg->fc_scope) { NL_SET_ERR_MSG(extack, "Invalid scope"); goto err_inval; } if (cfg->fc_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) { NL_SET_ERR_MSG(extack, "Invalid rtm_flags - can not contain DEAD or LINKDOWN"); goto err_inval; } if (cfg->fc_nh_id) { if (!cfg->fc_mx) { fi = fib_find_info_nh(net, cfg); if (fi) { refcount_inc(&fi->fib_treeref); return fi; } } nh = nexthop_find_by_id(net, cfg->fc_nh_id); if (!nh) { NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); goto err_inval; } nhs = 0; } #ifdef CONFIG_IP_ROUTE_MULTIPATH if (cfg->fc_mp) { nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len, extack); if (nhs == 0) goto err_inval; } #endif err = -ENOBUFS; /* Paired with WRITE_ONCE() in fib_release_info() */ if (READ_ONCE(fib_info_cnt) >= fib_info_hash_size) { unsigned int new_size = fib_info_hash_size << 1; struct hlist_head *new_info_hash; struct hlist_head *new_laddrhash; size_t bytes; if (!new_size) new_size = 16; bytes = (size_t)new_size * sizeof(struct hlist_head *); new_info_hash = kvzalloc(bytes, GFP_KERNEL); new_laddrhash = kvzalloc(bytes, GFP_KERNEL); if (!new_info_hash || !new_laddrhash) { kvfree(new_info_hash); kvfree(new_laddrhash); } else { fib_info_hash_move(new_info_hash, new_laddrhash, new_size); } if (!fib_info_hash_size) goto failure; } fi = kzalloc(struct_size(fi, fib_nh, nhs), GFP_KERNEL); if (!fi) goto failure; fi->fib_metrics = ip_fib_metrics_init(fi->fib_net, cfg->fc_mx, cfg->fc_mx_len, extack); if (IS_ERR(fi->fib_metrics)) { err = PTR_ERR(fi->fib_metrics); kfree(fi); return ERR_PTR(err); } fi->fib_net = net; fi->fib_protocol = cfg->fc_protocol; fi->fib_scope = cfg->fc_scope; fi->fib_flags = cfg->fc_flags; fi->fib_priority = cfg->fc_priority; fi->fib_prefsrc = cfg->fc_prefsrc; fi->fib_type = cfg->fc_type; fi->fib_tb_id = cfg->fc_table; fi->fib_nhs = nhs; if (nh) { if (!nexthop_get(nh)) { NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); err = -EINVAL; } else { err = 0; fi->nh = nh; } } else { change_nexthops(fi) { nexthop_nh->nh_parent = fi; } endfor_nexthops(fi) if (cfg->fc_mp) err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, extack); else err = fib_nh_init(net, fi->fib_nh, cfg, 1, extack); } if (err != 0) goto failure; if (fib_props[cfg->fc_type].error) { if (cfg->fc_gw_family || cfg->fc_oif || cfg->fc_mp) { NL_SET_ERR_MSG(extack, "Gateway, device and multipath can not be specified for this route type"); goto err_inval; } goto link_it; } else { switch (cfg->fc_type) { case RTN_UNICAST: case RTN_LOCAL: case RTN_BROADCAST: case RTN_ANYCAST: case RTN_MULTICAST: break; default: NL_SET_ERR_MSG(extack, "Invalid route type"); goto err_inval; } } if (cfg->fc_scope > RT_SCOPE_HOST) { NL_SET_ERR_MSG(extack, "Invalid scope"); goto err_inval; } if (fi->nh) { err = fib_check_nexthop(fi->nh, cfg->fc_scope, extack); if (err) goto failure; } else if (cfg->fc_scope == RT_SCOPE_HOST) { struct fib_nh *nh = fi->fib_nh; /* Local address is added. */ if (nhs != 1) { NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops"); goto err_inval; } if (nh->fib_nh_gw_family) { NL_SET_ERR_MSG(extack, "Route with host scope can not have a gateway"); goto err_inval; } nh->fib_nh_scope = RT_SCOPE_NOWHERE; nh->fib_nh_dev = dev_get_by_index(net, nh->fib_nh_oif); err = -ENODEV; if (!nh->fib_nh_dev) goto failure; netdev_tracker_alloc(nh->fib_nh_dev, &nh->fib_nh_dev_tracker, GFP_KERNEL); } else { int linkdown = 0; change_nexthops(fi) { err = fib_check_nh(cfg->fc_nlinfo.nl_net, nexthop_nh, cfg->fc_table, cfg->fc_scope, extack); if (err != 0) goto failure; if (nexthop_nh->fib_nh_flags & RTNH_F_LINKDOWN) linkdown++; } endfor_nexthops(fi) if (linkdown == fi->fib_nhs) fi->fib_flags |= RTNH_F_LINKDOWN; } if (fi->fib_prefsrc && !fib_valid_prefsrc(cfg, fi->fib_prefsrc)) { NL_SET_ERR_MSG(extack, "Invalid prefsrc address"); goto err_inval; } if (!fi->nh) { change_nexthops(fi) { fib_info_update_nhc_saddr(net, &nexthop_nh->nh_common, fi->fib_scope); if (nexthop_nh->fib_nh_gw_family == AF_INET6) fi->fib_nh_is_v6 = true; } endfor_nexthops(fi) fib_rebalance(fi); } link_it: ofi = fib_find_info(fi); if (ofi) { /* fib_table_lookup() should not see @fi yet. */ fi->fib_dead = 1; free_fib_info(fi); refcount_inc(&ofi->fib_treeref); return ofi; } refcount_set(&fi->fib_treeref, 1); refcount_set(&fi->fib_clntref, 1); spin_lock_bh(&fib_info_lock); fib_info_cnt++; hlist_add_head(&fi->fib_hash, &fib_info_hash[fib_info_hashfn(fi)]); if (fi->fib_prefsrc) { struct hlist_head *head; head = fib_info_laddrhash_bucket(net, fi->fib_prefsrc); hlist_add_head(&fi->fib_lhash, head); } if (fi->nh) { list_add(&fi->nh_list, &nh->fi_list); } else { change_nexthops(fi) { struct hlist_head *head; if (!nexthop_nh->fib_nh_dev) continue; head = fib_info_devhash_bucket(nexthop_nh->fib_nh_dev); hlist_add_head(&nexthop_nh->nh_hash, head); } endfor_nexthops(fi) } spin_unlock_bh(&fib_info_lock); return fi; err_inval: err = -EINVAL; failure: if (fi) { /* fib_table_lookup() should not see @fi yet. */ fi->fib_dead = 1; free_fib_info(fi); } return ERR_PTR(err); } int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc, u8 rt_family, unsigned char *flags, bool skip_oif) { if (nhc->nhc_flags & RTNH_F_DEAD) *flags |= RTNH_F_DEAD; if (nhc->nhc_flags & RTNH_F_LINKDOWN) { *flags |= RTNH_F_LINKDOWN; rcu_read_lock(); switch (nhc->nhc_family) { case AF_INET: if (ip_ignore_linkdown(nhc->nhc_dev)) *flags |= RTNH_F_DEAD; break; case AF_INET6: if (ip6_ignore_linkdown(nhc->nhc_dev)) *flags |= RTNH_F_DEAD; break; } rcu_read_unlock(); } switch (nhc->nhc_gw_family) { case AF_INET: if (nla_put_in_addr(skb, RTA_GATEWAY, nhc->nhc_gw.ipv4)) goto nla_put_failure; break; case AF_INET6: /* if gateway family does not match nexthop family * gateway is encoded as RTA_VIA */ if (rt_family != nhc->nhc_gw_family) { int alen = sizeof(struct in6_addr); struct nlattr *nla; struct rtvia *via; nla = nla_reserve(skb, RTA_VIA, alen + 2); if (!nla) goto nla_put_failure; via = nla_data(nla); via->rtvia_family = AF_INET6; memcpy(via->rtvia_addr, &nhc->nhc_gw.ipv6, alen); } else if (nla_put_in6_addr(skb, RTA_GATEWAY, &nhc->nhc_gw.ipv6) < 0) { goto nla_put_failure; } break; } *flags |= (nhc->nhc_flags & (RTNH_F_ONLINK | RTNH_F_OFFLOAD | RTNH_F_TRAP)); if (!skip_oif && nhc->nhc_dev && nla_put_u32(skb, RTA_OIF, nhc->nhc_dev->ifindex)) goto nla_put_failure; if (nhc->nhc_lwtstate && lwtunnel_fill_encap(skb, nhc->nhc_lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } EXPORT_SYMBOL_GPL(fib_nexthop_info); #if IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) || IS_ENABLED(CONFIG_IPV6) int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc, int nh_weight, u8 rt_family, u32 nh_tclassid) { const struct net_device *dev = nhc->nhc_dev; struct rtnexthop *rtnh; unsigned char flags = 0; rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); if (!rtnh) goto nla_put_failure; rtnh->rtnh_hops = nh_weight - 1; rtnh->rtnh_ifindex = dev ? dev->ifindex : 0; if (fib_nexthop_info(skb, nhc, rt_family, &flags, true) < 0) goto nla_put_failure; rtnh->rtnh_flags = flags; if (nh_tclassid && nla_put_u32(skb, RTA_FLOW, nh_tclassid)) goto nla_put_failure; /* length of rtnetlink header + attributes */ rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; return 0; nla_put_failure: return -EMSGSIZE; } EXPORT_SYMBOL_GPL(fib_add_nexthop); #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) { struct nlattr *mp; mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); if (!mp) goto nla_put_failure; if (unlikely(fi->nh)) { if (nexthop_mpath_fill_node(skb, fi->nh, AF_INET) < 0) goto nla_put_failure; goto mp_end; } for_nexthops(fi) { u32 nh_tclassid = 0; #ifdef CONFIG_IP_ROUTE_CLASSID nh_tclassid = nh->nh_tclassid; #endif if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight, AF_INET, nh_tclassid) < 0) goto nla_put_failure; } endfor_nexthops(fi); mp_end: nla_nest_end(skb, mp); return 0; nla_put_failure: return -EMSGSIZE; } #else static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) { return 0; } #endif int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, const struct fib_rt_info *fri, unsigned int flags) { unsigned int nhs = fib_info_num_path(fri->fi); struct fib_info *fi = fri->fi; u32 tb_id = fri->tb_id; struct nlmsghdr *nlh; struct rtmsg *rtm; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags); if (!nlh) return -EMSGSIZE; rtm = nlmsg_data(nlh); rtm->rtm_family = AF_INET; rtm->rtm_dst_len = fri->dst_len; rtm->rtm_src_len = 0; rtm->rtm_tos = inet_dscp_to_dsfield(fri->dscp); if (tb_id < 256) rtm->rtm_table = tb_id; else rtm->rtm_table = RT_TABLE_COMPAT; if (nla_put_u32(skb, RTA_TABLE, tb_id)) goto nla_put_failure; rtm->rtm_type = fri->type; rtm->rtm_flags = fi->fib_flags; rtm->rtm_scope = fi->fib_scope; rtm->rtm_protocol = fi->fib_protocol; if (rtm->rtm_dst_len && nla_put_in_addr(skb, RTA_DST, fri->dst)) goto nla_put_failure; if (fi->fib_priority && nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority)) goto nla_put_failure; if (rtnetlink_put_metrics(skb, fi->fib_metrics->metrics) < 0) goto nla_put_failure; if (fi->fib_prefsrc && nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc)) goto nla_put_failure; if (fi->nh) { if (nla_put_u32(skb, RTA_NH_ID, fi->nh->id)) goto nla_put_failure; if (nexthop_is_blackhole(fi->nh)) rtm->rtm_type = RTN_BLACKHOLE; if (!READ_ONCE(fi->fib_net->ipv4.sysctl_nexthop_compat_mode)) goto offload; } if (nhs == 1) { const struct fib_nh_common *nhc = fib_info_nhc(fi, 0); unsigned char flags = 0; if (fib_nexthop_info(skb, nhc, AF_INET, &flags, false) < 0) goto nla_put_failure; rtm->rtm_flags = flags; #ifdef CONFIG_IP_ROUTE_CLASSID if (nhc->nhc_family == AF_INET) { struct fib_nh *nh; nh = container_of(nhc, struct fib_nh, nh_common); if (nh->nh_tclassid && nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) goto nla_put_failure; } #endif } else { if (fib_add_multipath(skb, fi) < 0) goto nla_put_failure; } offload: if (fri->offload) rtm->rtm_flags |= RTM_F_OFFLOAD; if (fri->trap) rtm->rtm_flags |= RTM_F_TRAP; if (fri->offload_failed) rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } /* * Update FIB if: * - local address disappeared -> we must delete all the entries * referring to it. * - device went down -> we must shutdown all nexthops going via it. */ int fib_sync_down_addr(struct net_device *dev, __be32 local) { int tb_id = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; struct net *net = dev_net(dev); struct hlist_head *head; struct fib_info *fi; int ret = 0; if (!fib_info_laddrhash || local == 0) return 0; head = fib_info_laddrhash_bucket(net, local); hlist_for_each_entry(fi, head, fib_lhash) { if (!net_eq(fi->fib_net, net) || fi->fib_tb_id != tb_id) continue; if (fi->fib_prefsrc == local) { fi->fib_flags |= RTNH_F_DEAD; ret++; } } return ret; } static int call_fib_nh_notifiers(struct fib_nh *nh, enum fib_event_type event_type) { bool ignore_link_down = ip_ignore_linkdown(nh->fib_nh_dev); struct fib_nh_notifier_info info = { .fib_nh = nh, }; switch (event_type) { case FIB_EVENT_NH_ADD: if (nh->fib_nh_flags & RTNH_F_DEAD) break; if (ignore_link_down && nh->fib_nh_flags & RTNH_F_LINKDOWN) break; return call_fib4_notifiers(dev_net(nh->fib_nh_dev), event_type, &info.info); case FIB_EVENT_NH_DEL: if ((ignore_link_down && nh->fib_nh_flags & RTNH_F_LINKDOWN) || (nh->fib_nh_flags & RTNH_F_DEAD)) return call_fib4_notifiers(dev_net(nh->fib_nh_dev), event_type, &info.info); break; default: break; } return NOTIFY_DONE; } /* Update the PMTU of exceptions when: * - the new MTU of the first hop becomes smaller than the PMTU * - the old MTU was the same as the PMTU, and it limited discovery of * larger MTUs on the path. With that limit raised, we can now * discover larger MTUs * A special case is locked exceptions, for which the PMTU is smaller * than the minimal accepted PMTU: * - if the new MTU is greater than the PMTU, don't make any change * - otherwise, unlock and set PMTU */ void fib_nhc_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig) { struct fnhe_hash_bucket *bucket; int i; bucket = rcu_dereference_protected(nhc->nhc_exceptions, 1); if (!bucket) return; for (i = 0; i < FNHE_HASH_SIZE; i++) { struct fib_nh_exception *fnhe; for (fnhe = rcu_dereference_protected(bucket[i].chain, 1); fnhe; fnhe = rcu_dereference_protected(fnhe->fnhe_next, 1)) { if (fnhe->fnhe_mtu_locked) { if (new <= fnhe->fnhe_pmtu) { fnhe->fnhe_pmtu = new; fnhe->fnhe_mtu_locked = false; } } else if (new < fnhe->fnhe_pmtu || orig == fnhe->fnhe_pmtu) { fnhe->fnhe_pmtu = new; } } } } void fib_sync_mtu(struct net_device *dev, u32 orig_mtu) { struct hlist_head *head = fib_info_devhash_bucket(dev); struct fib_nh *nh; hlist_for_each_entry(nh, head, nh_hash) { if (nh->fib_nh_dev == dev) fib_nhc_update_mtu(&nh->nh_common, dev->mtu, orig_mtu); } } /* Event force Flags Description * NETDEV_CHANGE 0 LINKDOWN Carrier OFF, not for scope host * NETDEV_DOWN 0 LINKDOWN|DEAD Link down, not for scope host * NETDEV_DOWN 1 LINKDOWN|DEAD Last address removed * NETDEV_UNREGISTER 1 LINKDOWN|DEAD Device removed * * only used when fib_nh is built into fib_info */ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force) { struct hlist_head *head = fib_info_devhash_bucket(dev); struct fib_info *prev_fi = NULL; int scope = RT_SCOPE_NOWHERE; struct fib_nh *nh; int ret = 0; if (force) scope = -1; hlist_for_each_entry(nh, head, nh_hash) { struct fib_info *fi = nh->nh_parent; int dead; BUG_ON(!fi->fib_nhs); if (nh->fib_nh_dev != dev || fi == prev_fi) continue; prev_fi = fi; dead = 0; change_nexthops(fi) { if (nexthop_nh->fib_nh_flags & RTNH_F_DEAD) dead++; else if (nexthop_nh->fib_nh_dev == dev && nexthop_nh->fib_nh_scope != scope) { switch (event) { case NETDEV_DOWN: case NETDEV_UNREGISTER: nexthop_nh->fib_nh_flags |= RTNH_F_DEAD; fallthrough; case NETDEV_CHANGE: nexthop_nh->fib_nh_flags |= RTNH_F_LINKDOWN; break; } call_fib_nh_notifiers(nexthop_nh, FIB_EVENT_NH_DEL); dead++; } #ifdef CONFIG_IP_ROUTE_MULTIPATH if (event == NETDEV_UNREGISTER && nexthop_nh->fib_nh_dev == dev) { dead = fi->fib_nhs; break; } #endif } endfor_nexthops(fi) if (dead == fi->fib_nhs) { switch (event) { case NETDEV_DOWN: case NETDEV_UNREGISTER: fi->fib_flags |= RTNH_F_DEAD; fallthrough; case NETDEV_CHANGE: fi->fib_flags |= RTNH_F_LINKDOWN; break; } ret++; } fib_rebalance(fi); } return ret; } /* Must be invoked inside of an RCU protected region. */ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res) { struct fib_info *fi = NULL, *last_resort = NULL; struct hlist_head *fa_head = res->fa_head; struct fib_table *tb = res->table; u8 slen = 32 - res->prefixlen; int order = -1, last_idx = -1; struct fib_alias *fa, *fa1 = NULL; u32 last_prio = res->fi->fib_priority; dscp_t last_dscp = 0; hlist_for_each_entry_rcu(fa, fa_head, fa_list) { struct fib_info *next_fi = fa->fa_info; struct fib_nh_common *nhc; if (fa->fa_slen != slen) continue; if (fa->fa_dscp && fa->fa_dscp != inet_dsfield_to_dscp(flp->flowi4_tos)) continue; if (fa->tb_id != tb->tb_id) continue; if (next_fi->fib_priority > last_prio && fa->fa_dscp == last_dscp) { if (last_dscp) continue; break; } if (next_fi->fib_flags & RTNH_F_DEAD) continue; last_dscp = fa->fa_dscp; last_prio = next_fi->fib_priority; if (next_fi->fib_scope != res->scope || fa->fa_type != RTN_UNICAST) continue; nhc = fib_info_nhc(next_fi, 0); if (!nhc->nhc_gw_family || nhc->nhc_scope != RT_SCOPE_LINK) continue; fib_alias_accessed(fa); if (!fi) { if (next_fi != res->fi) break; fa1 = fa; } else if (!fib_detect_death(fi, order, &last_resort, &last_idx, fa1->fa_default)) { fib_result_assign(res, fi); fa1->fa_default = order; goto out; } fi = next_fi; order++; } if (order <= 0 || !fi) { if (fa1) fa1->fa_default = -1; goto out; } if (!fib_detect_death(fi, order, &last_resort, &last_idx, fa1->fa_default)) { fib_result_assign(res, fi); fa1->fa_default = order; goto out; } if (last_idx >= 0) fib_result_assign(res, last_resort); fa1->fa_default = last_idx; out: return; } /* * Dead device goes up. We wake up dead nexthops. * It takes sense only on multipath routes. * * only used when fib_nh is built into fib_info */ int fib_sync_up(struct net_device *dev, unsigned char nh_flags) { struct fib_info *prev_fi; struct hlist_head *head; struct fib_nh *nh; int ret; if (!(dev->flags & IFF_UP)) return 0; if (nh_flags & RTNH_F_DEAD) { unsigned int flags = dev_get_flags(dev); if (flags & (IFF_RUNNING | IFF_LOWER_UP)) nh_flags |= RTNH_F_LINKDOWN; } prev_fi = NULL; head = fib_info_devhash_bucket(dev); ret = 0; hlist_for_each_entry(nh, head, nh_hash) { struct fib_info *fi = nh->nh_parent; int alive; BUG_ON(!fi->fib_nhs); if (nh->fib_nh_dev != dev || fi == prev_fi) continue; prev_fi = fi; alive = 0; change_nexthops(fi) { if (!(nexthop_nh->fib_nh_flags & nh_flags)) { alive++; continue; } if (!nexthop_nh->fib_nh_dev || !(nexthop_nh->fib_nh_dev->flags & IFF_UP)) continue; if (nexthop_nh->fib_nh_dev != dev || !__in_dev_get_rtnl(dev)) continue; alive++; nexthop_nh->fib_nh_flags &= ~nh_flags; call_fib_nh_notifiers(nexthop_nh, FIB_EVENT_NH_ADD); } endfor_nexthops(fi) if (alive > 0) { fi->fib_flags &= ~nh_flags; ret++; } fib_rebalance(fi); } return ret; } #ifdef CONFIG_IP_ROUTE_MULTIPATH static bool fib_good_nh(const struct fib_nh *nh) { int state = NUD_REACHABLE; if (nh->fib_nh_scope == RT_SCOPE_LINK) { struct neighbour *n; rcu_read_lock(); if (likely(nh->fib_nh_gw_family == AF_INET)) n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev, (__force u32)nh->fib_nh_gw4); else if (nh->fib_nh_gw_family == AF_INET6) n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, &nh->fib_nh_gw6); else n = NULL; if (n) state = READ_ONCE(n->nud_state); rcu_read_unlock(); } return !!(state & NUD_VALID); } void fib_select_multipath(struct fib_result *res, int hash) { struct fib_info *fi = res->fi; struct net *net = fi->fib_net; bool first = false; if (unlikely(res->fi->nh)) { nexthop_path_fib_result(res, hash); return; } change_nexthops(fi) { if (READ_ONCE(net->ipv4.sysctl_fib_multipath_use_neigh)) { if (!fib_good_nh(nexthop_nh)) continue; if (!first) { res->nh_sel = nhsel; res->nhc = &nexthop_nh->nh_common; first = true; } } if (hash > atomic_read(&nexthop_nh->fib_nh_upper_bound)) continue; res->nh_sel = nhsel; res->nhc = &nexthop_nh->nh_common; return; } endfor_nexthops(fi); } #endif void fib_select_path(struct net *net, struct fib_result *res, struct flowi4 *fl4, const struct sk_buff *skb) { if (fl4->flowi4_oif) goto check_saddr; #ifdef CONFIG_IP_ROUTE_MULTIPATH if (fib_info_num_path(res->fi) > 1) { int h = fib_multipath_hash(net, fl4, skb, NULL); fib_select_multipath(res, h); } else #endif if (!res->prefixlen && res->table->tb_num_default > 1 && res->type == RTN_UNICAST) fib_select_default(fl4, res); check_saddr: if (!fl4->saddr) fl4->saddr = fib_result_prefsrc(net, res); } |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Events for filesystem locks * * Copyright 2013 Jeff Layton <jlayton@poochiereds.net> */ #undef TRACE_SYSTEM #define TRACE_SYSTEM filelock #if !defined(_TRACE_FILELOCK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_FILELOCK_H #include <linux/tracepoint.h> #include <linux/fs.h> #include <linux/device.h> #include <linux/kdev_t.h> #define show_fl_flags(val) \ __print_flags(val, "|", \ { FL_POSIX, "FL_POSIX" }, \ { FL_FLOCK, "FL_FLOCK" }, \ { FL_DELEG, "FL_DELEG" }, \ { FL_ACCESS, "FL_ACCESS" }, \ { FL_EXISTS, "FL_EXISTS" }, \ { FL_LEASE, "FL_LEASE" }, \ { FL_CLOSE, "FL_CLOSE" }, \ { FL_SLEEP, "FL_SLEEP" }, \ { FL_DOWNGRADE_PENDING, "FL_DOWNGRADE_PENDING" }, \ { FL_UNLOCK_PENDING, "FL_UNLOCK_PENDING" }, \ { FL_OFDLCK, "FL_OFDLCK" }) #define show_fl_type(val) \ __print_symbolic(val, \ { F_RDLCK, "F_RDLCK" }, \ { F_WRLCK, "F_WRLCK" }, \ { F_UNLCK, "F_UNLCK" }) TRACE_EVENT(locks_get_lock_context, TP_PROTO(struct inode *inode, int type, struct file_lock_context *ctx), TP_ARGS(inode, type, ctx), TP_STRUCT__entry( __field(unsigned long, i_ino) __field(dev_t, s_dev) __field(unsigned char, type) __field(struct file_lock_context *, ctx) ), TP_fast_assign( __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->type = type; __entry->ctx = ctx; ), TP_printk("dev=0x%x:0x%x ino=0x%lx type=%s ctx=%p", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, show_fl_type(__entry->type), __entry->ctx) ); DECLARE_EVENT_CLASS(filelock_lock, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret), TP_STRUCT__entry( __field(struct file_lock *, fl) __field(unsigned long, i_ino) __field(dev_t, s_dev) __field(struct file_lock *, fl_blocker) __field(fl_owner_t, fl_owner) __field(unsigned int, fl_pid) __field(unsigned int, fl_flags) __field(unsigned char, fl_type) __field(loff_t, fl_start) __field(loff_t, fl_end) __field(int, ret) ), TP_fast_assign( __entry->fl = fl ? fl : NULL; __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->fl_blocker = fl ? fl->fl_blocker : NULL; __entry->fl_owner = fl ? fl->fl_owner : NULL; __entry->fl_pid = fl ? fl->fl_pid : 0; __entry->fl_flags = fl ? fl->fl_flags : 0; __entry->fl_type = fl ? fl->fl_type : 0; __entry->fl_start = fl ? fl->fl_start : 0; __entry->fl_end = fl ? fl->fl_end : 0; __entry->ret = ret; ), TP_printk("fl=%p dev=0x%x:0x%x ino=0x%lx fl_blocker=%p fl_owner=%p fl_pid=%u fl_flags=%s fl_type=%s fl_start=%lld fl_end=%lld ret=%d", __entry->fl, MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->fl_blocker, __entry->fl_owner, __entry->fl_pid, show_fl_flags(__entry->fl_flags), show_fl_type(__entry->fl_type), __entry->fl_start, __entry->fl_end, __entry->ret) ); DEFINE_EVENT(filelock_lock, posix_lock_inode, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DEFINE_EVENT(filelock_lock, fcntl_setlk, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DEFINE_EVENT(filelock_lock, locks_remove_posix, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DEFINE_EVENT(filelock_lock, flock_lock_inode, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DECLARE_EVENT_CLASS(filelock_lease, TP_PROTO(struct inode *inode, struct file_lock *fl), TP_ARGS(inode, fl), TP_STRUCT__entry( __field(struct file_lock *, fl) __field(unsigned long, i_ino) __field(dev_t, s_dev) __field(struct file_lock *, fl_blocker) __field(fl_owner_t, fl_owner) __field(unsigned int, fl_flags) __field(unsigned char, fl_type) __field(unsigned long, fl_break_time) __field(unsigned long, fl_downgrade_time) ), TP_fast_assign( __entry->fl = fl ? fl : NULL; __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->fl_blocker = fl ? fl->fl_blocker : NULL; __entry->fl_owner = fl ? fl->fl_owner : NULL; __entry->fl_flags = fl ? fl->fl_flags : 0; __entry->fl_type = fl ? fl->fl_type : 0; __entry->fl_break_time = fl ? fl->fl_break_time : 0; __entry->fl_downgrade_time = fl ? fl->fl_downgrade_time : 0; ), TP_printk("fl=%p dev=0x%x:0x%x ino=0x%lx fl_blocker=%p fl_owner=%p fl_flags=%s fl_type=%s fl_break_time=%lu fl_downgrade_time=%lu", __entry->fl, MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->fl_blocker, __entry->fl_owner, show_fl_flags(__entry->fl_flags), show_fl_type(__entry->fl_type), __entry->fl_break_time, __entry->fl_downgrade_time) ); DEFINE_EVENT(filelock_lease, break_lease_noblock, TP_PROTO(struct inode *inode, struct file_lock *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, break_lease_block, TP_PROTO(struct inode *inode, struct file_lock *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, break_lease_unblock, TP_PROTO(struct inode *inode, struct file_lock *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, generic_delete_lease, TP_PROTO(struct inode *inode, struct file_lock *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, time_out_leases, TP_PROTO(struct inode *inode, struct file_lock *fl), TP_ARGS(inode, fl)); TRACE_EVENT(generic_add_lease, TP_PROTO(struct inode *inode, struct file_lock *fl), TP_ARGS(inode, fl), TP_STRUCT__entry( __field(unsigned long, i_ino) __field(int, wcount) __field(int, rcount) __field(int, icount) __field(dev_t, s_dev) __field(fl_owner_t, fl_owner) __field(unsigned int, fl_flags) __field(unsigned char, fl_type) ), TP_fast_assign( __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->wcount = atomic_read(&inode->i_writecount); __entry->rcount = atomic_read(&inode->i_readcount); __entry->icount = atomic_read(&inode->i_count); __entry->fl_owner = fl->fl_owner; __entry->fl_flags = fl->fl_flags; __entry->fl_type = fl->fl_type; ), TP_printk("dev=0x%x:0x%x ino=0x%lx wcount=%d rcount=%d icount=%d fl_owner=%p fl_flags=%s fl_type=%s", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->wcount, __entry->rcount, __entry->icount, __entry->fl_owner, show_fl_flags(__entry->fl_flags), show_fl_type(__entry->fl_type)) ); TRACE_EVENT(leases_conflict, TP_PROTO(bool conflict, struct file_lock *lease, struct file_lock *breaker), TP_ARGS(conflict, lease, breaker), TP_STRUCT__entry( __field(void *, lease) __field(void *, breaker) __field(unsigned int, l_fl_flags) __field(unsigned int, b_fl_flags) __field(unsigned char, l_fl_type) __field(unsigned char, b_fl_type) __field(bool, conflict) ), TP_fast_assign( __entry->lease = lease; __entry->l_fl_flags = lease->fl_flags; __entry->l_fl_type = lease->fl_type; __entry->breaker = breaker; __entry->b_fl_flags = breaker->fl_flags; __entry->b_fl_type = breaker->fl_type; __entry->conflict = conflict; ), TP_printk("conflict %d: lease=%p fl_flags=%s fl_type=%s; breaker=%p fl_flags=%s fl_type=%s", __entry->conflict, __entry->lease, show_fl_flags(__entry->l_fl_flags), show_fl_type(__entry->l_fl_type), __entry->breaker, show_fl_flags(__entry->b_fl_flags), show_fl_type(__entry->b_fl_type)) ); #endif /* _TRACE_FILELOCK_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
| 285 285 285 285 285 285 285 285 285 285 285 285 285 285 285 285 285 285 285 285 285 285 285 285 285 285 285 285 285 285 285 285 285 285 285 285 285 285 285 285 285 284 285 285 285 285 6 6 285 285 285 285 285 285 285 285 285 284 285 285 285 285 285 285 285 285 285 285 285 285 285 285 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/ialloc.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * BSD ufs-inspired inode and directory allocation by * Stephen Tweedie (sct@redhat.com), 1993 * Big-endian to little-endian byte-swapping/bitmaps by * David S. Miller (davem@caip.rutgers.edu), 1995 */ #include <linux/time.h> #include <linux/fs.h> #include <linux/stat.h> #include <linux/string.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/random.h> #include <linux/bitops.h> #include <linux/blkdev.h> #include <linux/cred.h> #include <asm/byteorder.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include <trace/events/ext4.h> /* * ialloc.c contains the inodes allocation and deallocation routines */ /* * The free inodes are managed by bitmaps. A file system contains several * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap * block for inodes, N blocks for the inode table and data blocks. * * The file system contains group descriptors which are located after the * super block. Each descriptor contains the number of the bitmap block and * the free blocks count in the block. */ /* * To avoid calling the atomic setbit hundreds or thousands of times, we only * need to use it within a single byte (to ensure we get endianness right). * We can use memset for the rest of the bitmap as there are no other users. */ void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap) { int i; if (start_bit >= end_bit) return; ext4_debug("mark end bits +%d through +%d used\n", start_bit, end_bit); for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++) ext4_set_bit(i, bitmap); if (i < end_bit) memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3); } void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate) { if (uptodate) { set_buffer_uptodate(bh); set_bitmap_uptodate(bh); } unlock_buffer(bh); put_bh(bh); } static int ext4_validate_inode_bitmap(struct super_block *sb, struct ext4_group_desc *desc, ext4_group_t block_group, struct buffer_head *bh) { ext4_fsblk_t blk; struct ext4_group_info *grp; if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) return 0; grp = ext4_get_group_info(sb, block_group); if (buffer_verified(bh)) return 0; if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) return -EFSCORRUPTED; ext4_lock_group(sb, block_group); if (buffer_verified(bh)) goto verified; blk = ext4_inode_bitmap(sb, desc); if (!ext4_inode_bitmap_csum_verify(sb, desc, bh, EXT4_INODES_PER_GROUP(sb) / 8) || ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) { ext4_unlock_group(sb, block_group); ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " "inode_bitmap = %llu", block_group, blk); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_IBITMAP_CORRUPT); return -EFSBADCRC; } set_buffer_verified(bh); verified: ext4_unlock_group(sb, block_group); return 0; } /* * Read the inode allocation bitmap for a given block_group, reading * into the specified slot in the superblock's bitmap cache. * * Return buffer_head of bitmap on success, or an ERR_PTR on error. */ static struct buffer_head * ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) { struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh = NULL; ext4_fsblk_t bitmap_blk; int err; desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) return ERR_PTR(-EFSCORRUPTED); bitmap_blk = ext4_inode_bitmap(sb, desc); if ((bitmap_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || (bitmap_blk >= ext4_blocks_count(sbi->s_es))) { ext4_error(sb, "Invalid inode bitmap blk %llu in " "block_group %u", bitmap_blk, block_group); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_IBITMAP_CORRUPT); return ERR_PTR(-EFSCORRUPTED); } bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { ext4_warning(sb, "Cannot read inode bitmap - " "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); return ERR_PTR(-ENOMEM); } if (bitmap_uptodate(bh)) goto verify; lock_buffer(bh); if (bitmap_uptodate(bh)) { unlock_buffer(bh); goto verify; } ext4_lock_group(sb, block_group); if (ext4_has_group_desc_csum(sb) && (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) { if (block_group == 0) { ext4_unlock_group(sb, block_group); unlock_buffer(bh); ext4_error(sb, "Inode bitmap for bg 0 marked " "uninitialized"); err = -EFSCORRUPTED; goto out; } memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, bh->b_data); set_bitmap_uptodate(bh); set_buffer_uptodate(bh); set_buffer_verified(bh); ext4_unlock_group(sb, block_group); unlock_buffer(bh); return bh; } ext4_unlock_group(sb, block_group); if (buffer_uptodate(bh)) { /* * if not uninit if bh is uptodate, * bitmap is also uptodate */ set_bitmap_uptodate(bh); unlock_buffer(bh); goto verify; } /* * submit the buffer_head for reading */ trace_ext4_load_inode_bitmap(sb, block_group); ext4_read_bh(bh, REQ_META | REQ_PRIO, ext4_end_bitmap_read); ext4_simulate_fail_bh(sb, bh, EXT4_SIM_IBITMAP_EIO); if (!buffer_uptodate(bh)) { put_bh(bh); ext4_error_err(sb, EIO, "Cannot read inode bitmap - " "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_IBITMAP_CORRUPT); return ERR_PTR(-EIO); } verify: err = ext4_validate_inode_bitmap(sb, desc, block_group, bh); if (err) goto out; return bh; out: put_bh(bh); return ERR_PTR(err); } /* * NOTE! When we get the inode, we're the only people * that have access to it, and as such there are no * race conditions we have to worry about. The inode * is not on the hash-lists, and it cannot be reached * through the filesystem because the directory entry * has been deleted earlier. * * HOWEVER: we must make sure that we get no aliases, * which means that we have to call "clear_inode()" * _before_ we mark the inode not in use in the inode * bitmaps. Otherwise a newly created file might use * the same inode number (not actually the same pointer * though), and then we'd have two inodes sharing the * same inode number and space on the harddisk. */ void ext4_free_inode(handle_t *handle, struct inode *inode) { struct super_block *sb = inode->i_sb; int is_directory; unsigned long ino; struct buffer_head *bitmap_bh = NULL; struct buffer_head *bh2; ext4_group_t block_group; unsigned long bit; struct ext4_group_desc *gdp; struct ext4_super_block *es; struct ext4_sb_info *sbi; int fatal = 0, err, count, cleared; struct ext4_group_info *grp; if (!sb) { printk(KERN_ERR "EXT4-fs: %s:%d: inode on " "nonexistent device\n", __func__, __LINE__); return; } if (atomic_read(&inode->i_count) > 1) { ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d", __func__, __LINE__, inode->i_ino, atomic_read(&inode->i_count)); return; } if (inode->i_nlink) { ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d\n", __func__, __LINE__, inode->i_ino, inode->i_nlink); return; } sbi = EXT4_SB(sb); ino = inode->i_ino; ext4_debug("freeing inode %lu\n", ino); trace_ext4_free_inode(inode); dquot_initialize(inode); dquot_free_inode(inode); is_directory = S_ISDIR(inode->i_mode); /* Do this BEFORE marking the inode not in use or returning an error */ ext4_clear_inode(inode); es = sbi->s_es; if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { ext4_error(sb, "reserved or nonexistent inode %lu", ino); goto error_return; } block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); bitmap_bh = ext4_read_inode_bitmap(sb, block_group); /* Don't bother if the inode bitmap is corrupt. */ if (IS_ERR(bitmap_bh)) { fatal = PTR_ERR(bitmap_bh); bitmap_bh = NULL; goto error_return; } if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { grp = ext4_get_group_info(sb, block_group); if (!grp || unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) { fatal = -EFSCORRUPTED; goto error_return; } } BUFFER_TRACE(bitmap_bh, "get_write_access"); fatal = ext4_journal_get_write_access(handle, sb, bitmap_bh, EXT4_JTR_NONE); if (fatal) goto error_return; fatal = -ESRCH; gdp = ext4_get_group_desc(sb, block_group, &bh2); if (gdp) { BUFFER_TRACE(bh2, "get_write_access"); fatal = ext4_journal_get_write_access(handle, sb, bh2, EXT4_JTR_NONE); } ext4_lock_group(sb, block_group); cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data); if (fatal || !cleared) { ext4_unlock_group(sb, block_group); goto out; } count = ext4_free_inodes_count(sb, gdp) + 1; ext4_free_inodes_set(sb, gdp, count); if (is_directory) { count = ext4_used_dirs_count(sb, gdp) - 1; ext4_used_dirs_set(sb, gdp, count); if (percpu_counter_initialized(&sbi->s_dirs_counter)) percpu_counter_dec(&sbi->s_dirs_counter); } ext4_inode_bitmap_csum_set(sb, gdp, bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); ext4_group_desc_csum_set(sb, block_group, gdp); ext4_unlock_group(sb, block_group); if (percpu_counter_initialized(&sbi->s_freeinodes_counter)) percpu_counter_inc(&sbi->s_freeinodes_counter); if (sbi->s_log_groups_per_flex) { struct flex_groups *fg; fg = sbi_array_rcu_deref(sbi, s_flex_groups, ext4_flex_group(sbi, block_group)); atomic_inc(&fg->free_inodes); if (is_directory) atomic_dec(&fg->used_dirs); } BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); fatal = ext4_handle_dirty_metadata(handle, NULL, bh2); out: if (cleared) { BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!fatal) fatal = err; } else { ext4_error(sb, "bit already cleared for inode %lu", ino); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_IBITMAP_CORRUPT); } error_return: brelse(bitmap_bh); ext4_std_error(sb, fatal); } struct orlov_stats { __u64 free_clusters; __u32 free_inodes; __u32 used_dirs; }; /* * Helper function for Orlov's allocator; returns critical information * for a particular block group or flex_bg. If flex_size is 1, then g * is a block group number; otherwise it is flex_bg number. */ static void get_orlov_stats(struct super_block *sb, ext4_group_t g, int flex_size, struct orlov_stats *stats) { struct ext4_group_desc *desc; if (flex_size > 1) { struct flex_groups *fg = sbi_array_rcu_deref(EXT4_SB(sb), s_flex_groups, g); stats->free_inodes = atomic_read(&fg->free_inodes); stats->free_clusters = atomic64_read(&fg->free_clusters); stats->used_dirs = atomic_read(&fg->used_dirs); return; } desc = ext4_get_group_desc(sb, g, NULL); if (desc) { stats->free_inodes = ext4_free_inodes_count(sb, desc); stats->free_clusters = ext4_free_group_clusters(sb, desc); stats->used_dirs = ext4_used_dirs_count(sb, desc); } else { stats->free_inodes = 0; stats->free_clusters = 0; stats->used_dirs = 0; } } /* * Orlov's allocator for directories. * * We always try to spread first-level directories. * * If there are blockgroups with both free inodes and free clusters counts * not worse than average we return one with smallest directory count. * Otherwise we simply return a random group. * * For the rest rules look so: * * It's OK to put directory into a group unless * it has too many directories already (max_dirs) or * it has too few free inodes left (min_inodes) or * it has too few free clusters left (min_clusters) or * Parent's group is preferred, if it doesn't satisfy these * conditions we search cyclically through the rest. If none * of the groups look good we just look for a group with more * free inodes than average (starting at parent's group). */ static int find_group_orlov(struct super_block *sb, struct inode *parent, ext4_group_t *group, umode_t mode, const struct qstr *qstr) { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t real_ngroups = ext4_get_groups_count(sb); int inodes_per_group = EXT4_INODES_PER_GROUP(sb); unsigned int freei, avefreei, grp_free; ext4_fsblk_t freec, avefreec; unsigned int ndirs; int max_dirs, min_inodes; ext4_grpblk_t min_clusters; ext4_group_t i, grp, g, ngroups; struct ext4_group_desc *desc; struct orlov_stats stats; int flex_size = ext4_flex_bg_size(sbi); struct dx_hash_info hinfo; ngroups = real_ngroups; if (flex_size > 1) { ngroups = (real_ngroups + flex_size - 1) >> sbi->s_log_groups_per_flex; parent_group >>= sbi->s_log_groups_per_flex; } freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); avefreei = freei / ngroups; freec = percpu_counter_read_positive(&sbi->s_freeclusters_counter); avefreec = freec; do_div(avefreec, ngroups); ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); if (S_ISDIR(mode) && ((parent == d_inode(sb->s_root)) || (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) { int best_ndir = inodes_per_group; int ret = -1; if (qstr) { hinfo.hash_version = DX_HASH_HALF_MD4; hinfo.seed = sbi->s_hash_seed; ext4fs_dirhash(parent, qstr->name, qstr->len, &hinfo); parent_group = hinfo.hash % ngroups; } else parent_group = get_random_u32_below(ngroups); for (i = 0; i < ngroups; i++) { g = (parent_group + i) % ngroups; get_orlov_stats(sb, g, flex_size, &stats); if (!stats.free_inodes) continue; if (stats.used_dirs >= best_ndir) continue; if (stats.free_inodes < avefreei) continue; if (stats.free_clusters < avefreec) continue; grp = g; ret = 0; best_ndir = stats.used_dirs; } if (ret) goto fallback; found_flex_bg: if (flex_size == 1) { *group = grp; return 0; } /* * We pack inodes at the beginning of the flexgroup's * inode tables. Block allocation decisions will do * something similar, although regular files will * start at 2nd block group of the flexgroup. See * ext4_ext_find_goal() and ext4_find_near(). */ grp *= flex_size; for (i = 0; i < flex_size; i++) { if (grp+i >= real_ngroups) break; desc = ext4_get_group_desc(sb, grp+i, NULL); if (desc && ext4_free_inodes_count(sb, desc)) { *group = grp+i; return 0; } } goto fallback; } max_dirs = ndirs / ngroups + inodes_per_group*flex_size / 16; min_inodes = avefreei - inodes_per_group*flex_size / 4; if (min_inodes < 1) min_inodes = 1; min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4; /* * Start looking in the flex group where we last allocated an * inode for this parent directory */ if (EXT4_I(parent)->i_last_alloc_group != ~0) { parent_group = EXT4_I(parent)->i_last_alloc_group; if (flex_size > 1) parent_group >>= sbi->s_log_groups_per_flex; } for (i = 0; i < ngroups; i++) { grp = (parent_group + i) % ngroups; get_orlov_stats(sb, grp, flex_size, &stats); if (stats.used_dirs >= max_dirs) continue; if (stats.free_inodes < min_inodes) continue; if (stats.free_clusters < min_clusters) continue; goto found_flex_bg; } fallback: ngroups = real_ngroups; avefreei = freei / ngroups; fallback_retry: parent_group = EXT4_I(parent)->i_block_group; for (i = 0; i < ngroups; i++) { grp = (parent_group + i) % ngroups; desc = ext4_get_group_desc(sb, grp, NULL); if (desc) { grp_free = ext4_free_inodes_count(sb, desc); if (grp_free && grp_free >= avefreei) { *group = grp; return 0; } } } if (avefreei) { /* * The free-inodes counter is approximate, and for really small * filesystems the above test can fail to find any blockgroups */ avefreei = 0; goto fallback_retry; } return -1; } static int find_group_other(struct super_block *sb, struct inode *parent, ext4_group_t *group, umode_t mode) { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; ext4_group_t i, last, ngroups = ext4_get_groups_count(sb); struct ext4_group_desc *desc; int flex_size = ext4_flex_bg_size(EXT4_SB(sb)); /* * Try to place the inode is the same flex group as its * parent. If we can't find space, use the Orlov algorithm to * find another flex group, and store that information in the * parent directory's inode information so that use that flex * group for future allocations. */ if (flex_size > 1) { int retry = 0; try_again: parent_group &= ~(flex_size-1); last = parent_group + flex_size; if (last > ngroups) last = ngroups; for (i = parent_group; i < last; i++) { desc = ext4_get_group_desc(sb, i, NULL); if (desc && ext4_free_inodes_count(sb, desc)) { *group = i; return 0; } } if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) { retry = 1; parent_group = EXT4_I(parent)->i_last_alloc_group; goto try_again; } /* * If this didn't work, use the Orlov search algorithm * to find a new flex group; we pass in the mode to * avoid the topdir algorithms. */ *group = parent_group + flex_size; if (*group > ngroups) *group = 0; return find_group_orlov(sb, parent, group, mode, NULL); } /* * Try to place the inode in its parent directory */ *group = parent_group; desc = ext4_get_group_desc(sb, *group, NULL); if (desc && ext4_free_inodes_count(sb, desc) && ext4_free_group_clusters(sb, desc)) return 0; /* * We're going to place this inode in a different blockgroup from its * parent. We want to cause files in a common directory to all land in * the same blockgroup. But we want files which are in a different * directory which shares a blockgroup with our parent to land in a * different blockgroup. * * So add our directory's i_ino into the starting point for the hash. */ *group = (*group + parent->i_ino) % ngroups; /* * Use a quadratic hash to find a group with a free inode and some free * blocks. */ for (i = 1; i < ngroups; i <<= 1) { *group += i; if (*group >= ngroups) *group -= ngroups; desc = ext4_get_group_desc(sb, *group, NULL); if (desc && ext4_free_inodes_count(sb, desc) && ext4_free_group_clusters(sb, desc)) return 0; } /* * That failed: try linear search for a free inode, even if that group * has no free blocks. */ *group = parent_group; for (i = 0; i < ngroups; i++) { if (++*group >= ngroups) *group = 0; desc = ext4_get_group_desc(sb, *group, NULL); if (desc && ext4_free_inodes_count(sb, desc)) return 0; } return -1; } /* * In no journal mode, if an inode has recently been deleted, we want * to avoid reusing it until we're reasonably sure the inode table * block has been written back to disk. (Yes, these values are * somewhat arbitrary...) */ #define RECENTCY_MIN 60 #define RECENTCY_DIRTY 300 static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino) { struct ext4_group_desc *gdp; struct ext4_inode *raw_inode; struct buffer_head *bh; int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; int offset, ret = 0; int recentcy = RECENTCY_MIN; u32 dtime, now; gdp = ext4_get_group_desc(sb, group, NULL); if (unlikely(!gdp)) return 0; bh = sb_find_get_block(sb, ext4_inode_table(sb, gdp) + (ino / inodes_per_block)); if (!bh || !buffer_uptodate(bh)) /* * If the block is not in the buffer cache, then it * must have been written out. */ goto out; offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb); raw_inode = (struct ext4_inode *) (bh->b_data + offset); /* i_dtime is only 32 bits on disk, but we only care about relative * times in the range of a few minutes (i.e. long enough to sync a * recently-deleted inode to disk), so using the low 32 bits of the * clock (a 68 year range) is enough, see time_before32() */ dtime = le32_to_cpu(raw_inode->i_dtime); now = ktime_get_real_seconds(); if (buffer_dirty(bh)) recentcy += RECENTCY_DIRTY; if (dtime && time_before32(dtime, now) && time_before32(now, dtime + recentcy)) ret = 1; out: brelse(bh); return ret; } static int find_inode_bit(struct super_block *sb, ext4_group_t group, struct buffer_head *bitmap, unsigned long *ino) { bool check_recently_deleted = EXT4_SB(sb)->s_journal == NULL; unsigned long recently_deleted_ino = EXT4_INODES_PER_GROUP(sb); next: *ino = ext4_find_next_zero_bit((unsigned long *) bitmap->b_data, EXT4_INODES_PER_GROUP(sb), *ino); if (*ino >= EXT4_INODES_PER_GROUP(sb)) goto not_found; if (check_recently_deleted && recently_deleted(sb, group, *ino)) { recently_deleted_ino = *ino; *ino = *ino + 1; if (*ino < EXT4_INODES_PER_GROUP(sb)) goto next; goto not_found; } return 1; not_found: if (recently_deleted_ino >= EXT4_INODES_PER_GROUP(sb)) return 0; /* * Not reusing recently deleted inodes is mostly a preference. We don't * want to report ENOSPC or skew allocation patterns because of that. * So return even recently deleted inode if we could find better in the * given range. */ *ino = recently_deleted_ino; return 1; } int ext4_mark_inode_used(struct super_block *sb, int ino) { unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count); struct buffer_head *inode_bitmap_bh = NULL, *group_desc_bh = NULL; struct ext4_group_desc *gdp; ext4_group_t group; int bit; int err = -EFSCORRUPTED; if (ino < EXT4_FIRST_INO(sb) || ino > max_ino) goto out; group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); if (IS_ERR(inode_bitmap_bh)) return PTR_ERR(inode_bitmap_bh); if (ext4_test_bit(bit, inode_bitmap_bh->b_data)) { err = 0; goto out; } gdp = ext4_get_group_desc(sb, group, &group_desc_bh); if (!gdp || !group_desc_bh) { err = -EINVAL; goto out; } ext4_set_bit(bit, inode_bitmap_bh->b_data); BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(NULL, NULL, inode_bitmap_bh); if (err) { ext4_std_error(sb, err); goto out; } err = sync_dirty_buffer(inode_bitmap_bh); if (err) { ext4_std_error(sb, err); goto out; } /* We may have to initialize the block bitmap if it isn't already */ if (ext4_has_group_desc_csum(sb) && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { struct buffer_head *block_bitmap_bh; block_bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(block_bitmap_bh)) { err = PTR_ERR(block_bitmap_bh); goto out; } BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); err = ext4_handle_dirty_metadata(NULL, NULL, block_bitmap_bh); sync_dirty_buffer(block_bitmap_bh); /* recheck and clear flag under lock if we still need to */ ext4_lock_group(sb, group); if (ext4_has_group_desc_csum(sb) && (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_group_clusters_set(sb, gdp, ext4_free_clusters_after_init(sb, group, gdp)); ext4_block_bitmap_csum_set(sb, gdp, block_bitmap_bh); ext4_group_desc_csum_set(sb, group, gdp); } ext4_unlock_group(sb, group); brelse(block_bitmap_bh); if (err) { ext4_std_error(sb, err); goto out; } } /* Update the relevant bg descriptor fields */ if (ext4_has_group_desc_csum(sb)) { int free; ext4_lock_group(sb, group); /* while we modify the bg desc */ free = EXT4_INODES_PER_GROUP(sb) - ext4_itable_unused_count(sb, gdp); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); free = 0; } /* * Check the relative inode number against the last used * relative inode number in this group. if it is greater * we need to update the bg_itable_unused count */ if (bit >= free) ext4_itable_unused_set(sb, gdp, (EXT4_INODES_PER_GROUP(sb) - bit - 1)); } else { ext4_lock_group(sb, group); } ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); if (ext4_has_group_desc_csum(sb)) { ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); ext4_group_desc_csum_set(sb, group, gdp); } ext4_unlock_group(sb, group); err = ext4_handle_dirty_metadata(NULL, NULL, group_desc_bh); sync_dirty_buffer(group_desc_bh); out: return err; } static int ext4_xattr_credits_for_new_inode(struct inode *dir, mode_t mode, bool encrypt) { struct super_block *sb = dir->i_sb; int nblocks = 0; #ifdef CONFIG_EXT4_FS_POSIX_ACL struct posix_acl *p = get_inode_acl(dir, ACL_TYPE_DEFAULT); if (IS_ERR(p)) return PTR_ERR(p); if (p) { int acl_size = p->a_count * sizeof(ext4_acl_entry); nblocks += (S_ISDIR(mode) ? 2 : 1) * __ext4_xattr_set_credits(sb, NULL /* inode */, NULL /* block_bh */, acl_size, true /* is_create */); posix_acl_release(p); } #endif #ifdef CONFIG_SECURITY { int num_security_xattrs = 1; #ifdef CONFIG_INTEGRITY num_security_xattrs++; #endif /* * We assume that security xattrs are never more than 1k. * In practice they are under 128 bytes. */ nblocks += num_security_xattrs * __ext4_xattr_set_credits(sb, NULL /* inode */, NULL /* block_bh */, 1024, true /* is_create */); } #endif if (encrypt) nblocks += __ext4_xattr_set_credits(sb, NULL /* inode */, NULL /* block_bh */, FSCRYPT_SET_CONTEXT_MAX_SIZE, true /* is_create */); return nblocks; } /* * There are two policies for allocating an inode. If the new inode is * a directory, then a forward search is made for a block group with both * free space and a low directory-to-inode ratio; if that fails, then of * the groups with above-average free space, that group with the fewest * directories already is chosen. * * For other inodes, search forward from the parent directory's block * group to find a free inode. */ struct inode *__ext4_new_inode(struct mnt_idmap *idmap, handle_t *handle, struct inode *dir, umode_t mode, const struct qstr *qstr, __u32 goal, uid_t *owner, __u32 i_flags, int handle_type, unsigned int line_no, int nblocks) { struct super_block *sb; struct buffer_head *inode_bitmap_bh = NULL; struct buffer_head *group_desc_bh; ext4_group_t ngroups, group = 0; unsigned long ino = 0; struct inode *inode; struct ext4_group_desc *gdp = NULL; struct ext4_inode_info *ei; struct ext4_sb_info *sbi; int ret2, err; struct inode *ret; ext4_group_t i; ext4_group_t flex_group; struct ext4_group_info *grp = NULL; bool encrypt = false; /* Cannot create files in a deleted directory */ if (!dir || !dir->i_nlink) return ERR_PTR(-EPERM); sb = dir->i_sb; sbi = EXT4_SB(sb); if (unlikely(ext4_forced_shutdown(sb))) return ERR_PTR(-EIO); ngroups = ext4_get_groups_count(sb); trace_ext4_request_inode(dir, mode); inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); ei = EXT4_I(inode); /* * Initialize owners and quota early so that we don't have to account * for quota initialization worst case in standard inode creating * transaction */ if (owner) { inode->i_mode = mode; i_uid_write(inode, owner[0]); i_gid_write(inode, owner[1]); } else if (test_opt(sb, GRPID)) { inode->i_mode = mode; inode_fsuid_set(inode, idmap); inode->i_gid = dir->i_gid; } else inode_init_owner(idmap, inode, dir, mode); if (ext4_has_feature_project(sb) && ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) ei->i_projid = EXT4_I(dir)->i_projid; else ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID); if (!(i_flags & EXT4_EA_INODE_FL)) { err = fscrypt_prepare_new_inode(dir, inode, &encrypt); if (err) goto out; } err = dquot_initialize(inode); if (err) goto out; if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) { ret2 = ext4_xattr_credits_for_new_inode(dir, mode, encrypt); if (ret2 < 0) { err = ret2; goto out; } nblocks += ret2; } if (!goal) goal = sbi->s_inode_goal; if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) { group = (goal - 1) / EXT4_INODES_PER_GROUP(sb); ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb); ret2 = 0; goto got_group; } if (S_ISDIR(mode)) ret2 = find_group_orlov(sb, dir, &group, mode, qstr); else ret2 = find_group_other(sb, dir, &group, mode); got_group: EXT4_I(dir)->i_last_alloc_group = group; err = -ENOSPC; if (ret2 == -1) goto out; /* * Normally we will only go through one pass of this loop, * unless we get unlucky and it turns out the group we selected * had its last inode grabbed by someone else. */ for (i = 0; i < ngroups; i++, ino = 0) { err = -EIO; gdp = ext4_get_group_desc(sb, group, &group_desc_bh); if (!gdp) goto out; /* * Check free inodes count before loading bitmap. */ if (ext4_free_inodes_count(sb, gdp) == 0) goto next_group; if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { grp = ext4_get_group_info(sb, group); /* * Skip groups with already-known suspicious inode * tables */ if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) goto next_group; } brelse(inode_bitmap_bh); inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); /* Skip groups with suspicious inode tables */ if (((!(sbi->s_mount_state & EXT4_FC_REPLAY)) && EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) || IS_ERR(inode_bitmap_bh)) { inode_bitmap_bh = NULL; goto next_group; } repeat_in_this_group: ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino); if (!ret2) goto next_group; if (group == 0 && (ino + 1) < EXT4_FIRST_INO(sb)) { ext4_error(sb, "reserved inode found cleared - " "inode=%lu", ino + 1); ext4_mark_group_bitmap_corrupted(sb, group, EXT4_GROUP_INFO_IBITMAP_CORRUPT); goto next_group; } if ((!(sbi->s_mount_state & EXT4_FC_REPLAY)) && !handle) { BUG_ON(nblocks <= 0); handle = __ext4_journal_start_sb(NULL, dir->i_sb, line_no, handle_type, nblocks, 0, ext4_trans_default_revoke_credits(sb)); if (IS_ERR(handle)) { err = PTR_ERR(handle); ext4_std_error(sb, err); goto out; } } BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, inode_bitmap_bh, EXT4_JTR_NONE); if (err) { ext4_std_error(sb, err); goto out; } ext4_lock_group(sb, group); ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); if (ret2) { /* Someone already took the bit. Repeat the search * with lock held. */ ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino); if (ret2) { ext4_set_bit(ino, inode_bitmap_bh->b_data); ret2 = 0; } else { ret2 = 1; /* we didn't grab the inode */ } } ext4_unlock_group(sb, group); ino++; /* the inode bitmap is zero-based */ if (!ret2) goto got; /* we grabbed the inode! */ if (ino < EXT4_INODES_PER_GROUP(sb)) goto repeat_in_this_group; next_group: if (++group == ngroups) group = 0; } err = -ENOSPC; goto out; got: BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); if (err) { ext4_std_error(sb, err); goto out; } BUFFER_TRACE(group_desc_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, group_desc_bh, EXT4_JTR_NONE); if (err) { ext4_std_error(sb, err); goto out; } /* We may have to initialize the block bitmap if it isn't already */ if (ext4_has_group_desc_csum(sb) && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { struct buffer_head *block_bitmap_bh; block_bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(block_bitmap_bh)) { err = PTR_ERR(block_bitmap_bh); goto out; } BUFFER_TRACE(block_bitmap_bh, "get block bitmap access"); err = ext4_journal_get_write_access(handle, sb, block_bitmap_bh, EXT4_JTR_NONE); if (err) { brelse(block_bitmap_bh); ext4_std_error(sb, err); goto out; } BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh); /* recheck and clear flag under lock if we still need to */ ext4_lock_group(sb, group); if (ext4_has_group_desc_csum(sb) && (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_group_clusters_set(sb, gdp, ext4_free_clusters_after_init(sb, group, gdp)); ext4_block_bitmap_csum_set(sb, gdp, block_bitmap_bh); ext4_group_desc_csum_set(sb, group, gdp); } ext4_unlock_group(sb, group); brelse(block_bitmap_bh); if (err) { ext4_std_error(sb, err); goto out; } } /* Update the relevant bg descriptor fields */ if (ext4_has_group_desc_csum(sb)) { int free; struct ext4_group_info *grp = NULL; if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { grp = ext4_get_group_info(sb, group); if (!grp) { err = -EFSCORRUPTED; goto out; } down_read(&grp->alloc_sem); /* * protect vs itable * lazyinit */ } ext4_lock_group(sb, group); /* while we modify the bg desc */ free = EXT4_INODES_PER_GROUP(sb) - ext4_itable_unused_count(sb, gdp); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); free = 0; } /* * Check the relative inode number against the last used * relative inode number in this group. if it is greater * we need to update the bg_itable_unused count */ if (ino > free) ext4_itable_unused_set(sb, gdp, (EXT4_INODES_PER_GROUP(sb) - ino)); if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) up_read(&grp->alloc_sem); } else { ext4_lock_group(sb, group); } ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); if (S_ISDIR(mode)) { ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1); if (sbi->s_log_groups_per_flex) { ext4_group_t f = ext4_flex_group(sbi, group); atomic_inc(&sbi_array_rcu_deref(sbi, s_flex_groups, f)->used_dirs); } } if (ext4_has_group_desc_csum(sb)) { ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); ext4_group_desc_csum_set(sb, group, gdp); } ext4_unlock_group(sb, group); BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); if (err) { ext4_std_error(sb, err); goto out; } percpu_counter_dec(&sbi->s_freeinodes_counter); if (S_ISDIR(mode)) percpu_counter_inc(&sbi->s_dirs_counter); if (sbi->s_log_groups_per_flex) { flex_group = ext4_flex_group(sbi, group); atomic_dec(&sbi_array_rcu_deref(sbi, s_flex_groups, flex_group)->free_inodes); } inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); /* This is the optimal IO size (for stat), not the fs block size */ inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); ei->i_crtime = inode->i_mtime; memset(ei->i_data, 0, sizeof(ei->i_data)); ei->i_dir_start_lookup = 0; ei->i_disksize = 0; /* Don't inherit extent flag from directory, amongst others. */ ei->i_flags = ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED); ei->i_flags |= i_flags; ei->i_file_acl = 0; ei->i_dtime = 0; ei->i_block_group = group; ei->i_last_alloc_group = ~0; ext4_set_inode_flags(inode, true); if (IS_DIRSYNC(inode)) ext4_handle_sync(handle); if (insert_inode_locked(inode) < 0) { /* * Likely a bitmap corruption causing inode to be allocated * twice. */ err = -EIO; ext4_error(sb, "failed to insert inode %lu: doubly allocated?", inode->i_ino); ext4_mark_group_bitmap_corrupted(sb, group, EXT4_GROUP_INFO_IBITMAP_CORRUPT); goto out; } inode->i_generation = get_random_u32(); /* Precompute checksum seed for inode metadata */ if (ext4_has_metadata_csum(sb)) { __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = cpu_to_le32(inode->i_generation); csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen)); } ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ ext4_set_inode_state(inode, EXT4_STATE_NEW); ei->i_extra_isize = sbi->s_want_extra_isize; ei->i_inline_off = 0; if (ext4_has_feature_inline_data(sb) && (!(ei->i_flags & EXT4_DAX_FL) || S_ISDIR(mode))) ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); ret = inode; err = dquot_alloc_inode(inode); if (err) goto fail_drop; /* * Since the encryption xattr will always be unique, create it first so * that it's less likely to end up in an external xattr block and * prevent its deduplication. */ if (encrypt) { err = fscrypt_set_context(inode, handle); if (err) goto fail_free_drop; } if (!(ei->i_flags & EXT4_EA_INODE_FL)) { err = ext4_init_acl(handle, inode, dir); if (err) goto fail_free_drop; err = ext4_init_security(handle, inode, dir, qstr); if (err) goto fail_free_drop; } if (ext4_has_feature_extents(sb)) { /* set extent flag only for directory, file and normal symlink*/ if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); ext4_ext_tree_init(handle, inode); } } if (ext4_handle_valid(handle)) { ei->i_sync_tid = handle->h_transaction->t_tid; ei->i_datasync_tid = handle->h_transaction->t_tid; } err = ext4_mark_inode_dirty(handle, inode); if (err) { ext4_std_error(sb, err); goto fail_free_drop; } ext4_debug("allocating inode %lu\n", inode->i_ino); trace_ext4_allocate_inode(inode, dir, mode); brelse(inode_bitmap_bh); return ret; fail_free_drop: dquot_free_inode(inode); fail_drop: clear_nlink(inode); unlock_new_inode(inode); out: dquot_drop(inode); inode->i_flags |= S_NOQUOTA; iput(inode); brelse(inode_bitmap_bh); return ERR_PTR(err); } /* Verify that we are loading a valid orphan from disk */ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) { unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count); ext4_group_t block_group; int bit; struct buffer_head *bitmap_bh = NULL; struct inode *inode = NULL; int err = -EFSCORRUPTED; if (ino < EXT4_FIRST_INO(sb) || ino > max_ino) goto bad_orphan; block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); bitmap_bh = ext4_read_inode_bitmap(sb, block_group); if (IS_ERR(bitmap_bh)) return ERR_CAST(bitmap_bh); /* Having the inode bit set should be a 100% indicator that this * is a valid orphan (no e2fsck run on fs). Orphans also include * inodes that were being truncated, so we can't check i_nlink==0. */ if (!ext4_test_bit(bit, bitmap_bh->b_data)) goto bad_orphan; inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { err = PTR_ERR(inode); ext4_error_err(sb, -err, "couldn't read orphan inode %lu (err %d)", ino, err); brelse(bitmap_bh); return inode; } /* * If the orphans has i_nlinks > 0 then it should be able to * be truncated, otherwise it won't be removed from the orphan * list during processing and an infinite loop will result. * Similarly, it must not be a bad inode. */ if ((inode->i_nlink && !ext4_can_truncate(inode)) || is_bad_inode(inode)) goto bad_orphan; if (NEXT_ORPHAN(inode) > max_ino) goto bad_orphan; brelse(bitmap_bh); return inode; bad_orphan: ext4_error(sb, "bad orphan inode %lu", ino); if (bitmap_bh) printk(KERN_ERR "ext4_test_bit(bit=%d, block=%llu) = %d\n", bit, (unsigned long long)bitmap_bh->b_blocknr, ext4_test_bit(bit, bitmap_bh->b_data)); if (inode) { printk(KERN_ERR "is_bad_inode(inode)=%d\n", is_bad_inode(inode)); printk(KERN_ERR "NEXT_ORPHAN(inode)=%u\n", NEXT_ORPHAN(inode)); printk(KERN_ERR "max_ino=%lu\n", max_ino); printk(KERN_ERR "i_nlink=%u\n", inode->i_nlink); /* Avoid freeing blocks if we got a bad deleted inode */ if (inode->i_nlink == 0) inode->i_blocks = 0; iput(inode); } brelse(bitmap_bh); return ERR_PTR(err); } unsigned long ext4_count_free_inodes(struct super_block *sb) { unsigned long desc_count; struct ext4_group_desc *gdp; ext4_group_t i, ngroups = ext4_get_groups_count(sb); #ifdef EXT4FS_DEBUG struct ext4_super_block *es; unsigned long bitmap_count, x; struct buffer_head *bitmap_bh = NULL; es = EXT4_SB(sb)->s_es; desc_count = 0; bitmap_count = 0; gdp = NULL; for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; desc_count += ext4_free_inodes_count(sb, gdp); brelse(bitmap_bh); bitmap_bh = ext4_read_inode_bitmap(sb, i); if (IS_ERR(bitmap_bh)) { bitmap_bh = NULL; continue; } x = ext4_count_free(bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", (unsigned long) i, ext4_free_inodes_count(sb, gdp), x); bitmap_count += x; } brelse(bitmap_bh); printk(KERN_DEBUG "ext4_count_free_inodes: " "stored = %u, computed = %lu, %lu\n", le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); return desc_count; #else desc_count = 0; for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; desc_count += ext4_free_inodes_count(sb, gdp); cond_resched(); } return desc_count; #endif } /* Called at mount-time, super-block is locked */ unsigned long ext4_count_dirs(struct super_block * sb) { unsigned long count = 0; ext4_group_t i, ngroups = ext4_get_groups_count(sb); for (i = 0; i < ngroups; i++) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; count += ext4_used_dirs_count(sb, gdp); } return count; } /* * Zeroes not yet zeroed inode table - just write zeroes through the whole * inode table. Must be called without any spinlock held. The only place * where it is called from on active part of filesystem is ext4lazyinit * thread, so we do not need any special locks, however we have to prevent * inode allocation from the current group, so we take alloc_sem lock, to * block ext4_new_inode() until we are finished. */ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int barrier) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *gdp = NULL; struct buffer_head *group_desc_bh; handle_t *handle; ext4_fsblk_t blk; int num, ret = 0, used_blks = 0; unsigned long used_inos = 0; gdp = ext4_get_group_desc(sb, group, &group_desc_bh); if (!gdp || !grp) goto out; /* * We do not need to lock this, because we are the only one * handling this flag. */ if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)) goto out; handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; } down_write(&grp->alloc_sem); /* * If inode bitmap was already initialized there may be some * used inodes so we need to skip blocks with used inodes in * inode table. */ if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) { used_inos = EXT4_INODES_PER_GROUP(sb) - ext4_itable_unused_count(sb, gdp); used_blks = DIV_ROUND_UP(used_inos, sbi->s_inodes_per_block); /* Bogus inode unused count? */ if (used_blks < 0 || used_blks > sbi->s_itb_per_group) { ext4_error(sb, "Something is wrong with group %u: " "used itable blocks: %d; " "itable unused count: %u", group, used_blks, ext4_itable_unused_count(sb, gdp)); ret = 1; goto err_out; } used_inos += group * EXT4_INODES_PER_GROUP(sb); /* * Are there some uninitialized inodes in the inode table * before the first normal inode? */ if ((used_blks != sbi->s_itb_per_group) && (used_inos < EXT4_FIRST_INO(sb))) { ext4_error(sb, "Something is wrong with group %u: " "itable unused count: %u; " "itables initialized count: %ld", group, ext4_itable_unused_count(sb, gdp), used_inos); ret = 1; goto err_out; } } blk = ext4_inode_table(sb, gdp) + used_blks; num = sbi->s_itb_per_group - used_blks; BUFFER_TRACE(group_desc_bh, "get_write_access"); ret = ext4_journal_get_write_access(handle, sb, group_desc_bh, EXT4_JTR_NONE); if (ret) goto err_out; /* * Skip zeroout if the inode table is full. But we set the ZEROED * flag anyway, because obviously, when it is full it does not need * further zeroing. */ if (unlikely(num == 0)) goto skip_zeroout; ext4_debug("going to zero out inode table in group %d\n", group); ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS); if (ret < 0) goto err_out; if (barrier) blkdev_issue_flush(sb->s_bdev); skip_zeroout: ext4_lock_group(sb, group); gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED); ext4_group_desc_csum_set(sb, group, gdp); ext4_unlock_group(sb, group); BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); ret = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); err_out: up_write(&grp->alloc_sem); ext4_journal_stop(handle); out: return ret; } |
| 3898 1554 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Landlock LSM - Filesystem management and hooks * * Copyright © 2017-2020 Mickaël Salaün <mic@digikod.net> * Copyright © 2018-2020 ANSSI */ #ifndef _SECURITY_LANDLOCK_FS_H #define _SECURITY_LANDLOCK_FS_H #include <linux/fs.h> #include <linux/init.h> #include <linux/rcupdate.h> #include "ruleset.h" #include "setup.h" /** * struct landlock_inode_security - Inode security blob * * Enable to reference a &struct landlock_object tied to an inode (i.e. * underlying object). */ struct landlock_inode_security { /** * @object: Weak pointer to an allocated object. All assignments of a * new object are protected by the underlying inode->i_lock. However, * atomically disassociating @object from the inode is only protected * by @object->lock, from the time @object's usage refcount drops to * zero to the time this pointer is nulled out (cf. release_inode() and * hook_sb_delete()). Indeed, such disassociation doesn't require * inode->i_lock thanks to the careful rcu_access_pointer() check * performed by get_inode_object(). */ struct landlock_object __rcu *object; }; /** * struct landlock_file_security - File security blob * * This information is populated when opening a file in hook_file_open, and * tracks the relevant Landlock access rights that were available at the time * of opening the file. Other LSM hooks use these rights in order to authorize * operations on already opened files. */ struct landlock_file_security { /** * @allowed_access: Access rights that were available at the time of * opening the file. This is not necessarily the full set of access * rights available at that time, but it's the necessary subset as * needed to authorize later operations on the open file. */ access_mask_t allowed_access; }; /** * struct landlock_superblock_security - Superblock security blob * * Enable hook_sb_delete() to wait for concurrent calls to release_inode(). */ struct landlock_superblock_security { /** * @inode_refs: Number of pending inodes (from this superblock) that * are being released by release_inode(). * Cf. struct super_block->s_fsnotify_inode_refs . */ atomic_long_t inode_refs; }; static inline struct landlock_file_security * landlock_file(const struct file *const file) { return file->f_security + landlock_blob_sizes.lbs_file; } static inline struct landlock_inode_security * landlock_inode(const struct inode *const inode) { return inode->i_security + landlock_blob_sizes.lbs_inode; } static inline struct landlock_superblock_security * landlock_superblock(const struct super_block *const superblock) { return superblock->s_security + landlock_blob_sizes.lbs_superblock; } __init void landlock_add_fs_hooks(void); int landlock_append_fs_rule(struct landlock_ruleset *const ruleset, const struct path *const path, access_mask_t access_hierarchy); #endif /* _SECURITY_LANDLOCK_FS_H */ |
| 2054 142 3597 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_UACCESS_H__ #define __LINUX_UACCESS_H__ #include <linux/fault-inject-usercopy.h> #include <linux/instrumented.h> #include <linux/minmax.h> #include <linux/sched.h> #include <linux/thread_info.h> #include <asm/uaccess.h> /* * Architectures that support memory tagging (assigning tags to memory regions, * embedding these tags into addresses that point to these memory regions, and * checking that the memory and the pointer tags match on memory accesses) * redefine this macro to strip tags from pointers. * * Passing down mm_struct allows to define untagging rules on per-process * basis. * * It's defined as noop for architectures that don't support memory tagging. */ #ifndef untagged_addr #define untagged_addr(addr) (addr) #endif #ifndef untagged_addr_remote #define untagged_addr_remote(mm, addr) ({ \ mmap_assert_locked(mm); \ untagged_addr(addr); \ }) #endif /* * Architectures should provide two primitives (raw_copy_{to,from}_user()) * and get rid of their private instances of copy_{to,from}_user() and * __copy_{to,from}_user{,_inatomic}(). * * raw_copy_{to,from}_user(to, from, size) should copy up to size bytes and * return the amount left to copy. They should assume that access_ok() has * already been checked (and succeeded); they should *not* zero-pad anything. * No KASAN or object size checks either - those belong here. * * Both of these functions should attempt to copy size bytes starting at from * into the area starting at to. They must not fetch or store anything * outside of those areas. Return value must be between 0 (everything * copied successfully) and size (nothing copied). * * If raw_copy_{to,from}_user(to, from, size) returns N, size - N bytes starting * at to must become equal to the bytes fetched from the corresponding area * starting at from. All data past to + size - N must be left unmodified. * * If copying succeeds, the return value must be 0. If some data cannot be * fetched, it is permitted to copy less than had been fetched; the only * hard requirement is that not storing anything at all (i.e. returning size) * should happen only when nothing could be copied. In other words, you don't * have to squeeze as much as possible - it is allowed, but not necessary. * * For raw_copy_from_user() to always points to kernel memory and no faults * on store should happen. Interpretation of from is affected by set_fs(). * For raw_copy_to_user() it's the other way round. * * Both can be inlined - it's up to architectures whether it wants to bother * with that. They should not be used directly; they are used to implement * the 6 functions (copy_{to,from}_user(), __copy_{to,from}_user_inatomic()) * that are used instead. Out of those, __... ones are inlined. Plain * copy_{to,from}_user() might or might not be inlined. If you want them * inlined, have asm/uaccess.h define INLINE_COPY_{TO,FROM}_USER. * * NOTE: only copy_from_user() zero-pads the destination in case of short copy. * Neither __copy_from_user() nor __copy_from_user_inatomic() zero anything * at all; their callers absolutely must check the return value. * * Biarch ones should also provide raw_copy_in_user() - similar to the above, * but both source and destination are __user pointers (affected by set_fs() * as usual) and both source and destination can trigger faults. */ static __always_inline __must_check unsigned long __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) { unsigned long res; instrument_copy_from_user_before(to, from, n); check_object_size(to, n, false); res = raw_copy_from_user(to, from, n); instrument_copy_from_user_after(to, from, n, res); return res; } static __always_inline __must_check unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { unsigned long res; might_fault(); instrument_copy_from_user_before(to, from, n); if (should_fail_usercopy()) return n; check_object_size(to, n, false); res = raw_copy_from_user(to, from, n); instrument_copy_from_user_after(to, from, n, res); return res; } /** * __copy_to_user_inatomic: - Copy a block of data into user space, with less checking. * @to: Destination address, in user space. * @from: Source address, in kernel space. * @n: Number of bytes to copy. * * Context: User context only. * * Copy data from kernel space to user space. Caller must check * the specified block with access_ok() before calling this function. * The caller should also make sure he pins the user space address * so that we don't result in page fault and sleep. */ static __always_inline __must_check unsigned long __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) { if (should_fail_usercopy()) return n; instrument_copy_to_user(to, from, n); check_object_size(from, n, true); return raw_copy_to_user(to, from, n); } static __always_inline __must_check unsigned long __copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); if (should_fail_usercopy()) return n; instrument_copy_to_user(to, from, n); check_object_size(from, n, true); return raw_copy_to_user(to, from, n); } #ifdef INLINE_COPY_FROM_USER static inline __must_check unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n) { unsigned long res = n; might_fault(); if (!should_fail_usercopy() && likely(access_ok(from, n))) { instrument_copy_from_user_before(to, from, n); res = raw_copy_from_user(to, from, n); instrument_copy_from_user_after(to, from, n, res); } if (unlikely(res)) memset(to + (n - res), 0, res); return res; } #else extern __must_check unsigned long _copy_from_user(void *, const void __user *, unsigned long); #endif #ifdef INLINE_COPY_TO_USER static inline __must_check unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); if (should_fail_usercopy()) return n; if (access_ok(to, n)) { instrument_copy_to_user(to, from, n); n = raw_copy_to_user(to, from, n); } return n; } #else extern __must_check unsigned long _copy_to_user(void __user *, const void *, unsigned long); #endif static __always_inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) { if (check_copy_size(to, n, false)) n = _copy_from_user(to, from, n); return n; } static __always_inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n) { if (check_copy_size(from, n, true)) n = _copy_to_user(to, from, n); return n; } #ifndef copy_mc_to_kernel /* * Without arch opt-in this generic copy_mc_to_kernel() will not handle * #MC (or arch equivalent) during source read. */ static inline unsigned long __must_check copy_mc_to_kernel(void *dst, const void *src, size_t cnt) { memcpy(dst, src, cnt); return 0; } #endif static __always_inline void pagefault_disabled_inc(void) { current->pagefault_disabled++; } static __always_inline void pagefault_disabled_dec(void) { current->pagefault_disabled--; } /* * These routines enable/disable the pagefault handler. If disabled, it will * not take any locks and go straight to the fixup table. * * User access methods will not sleep when called from a pagefault_disabled() * environment. */ static inline void pagefault_disable(void) { pagefault_disabled_inc(); /* * make sure to have issued the store before a pagefault * can hit. */ barrier(); } static inline void pagefault_enable(void) { /* * make sure to issue those last loads/stores before enabling * the pagefault handler again. */ barrier(); pagefault_disabled_dec(); } /* * Is the pagefault handler disabled? If so, user access methods will not sleep. */ static inline bool pagefault_disabled(void) { return current->pagefault_disabled != 0; } /* * The pagefault handler is in general disabled by pagefault_disable() or * when in irq context (via in_atomic()). * * This function should only be used by the fault handlers. Other users should * stick to pagefault_disabled(). * Please NEVER use preempt_disable() to disable the fault handler. With * !CONFIG_PREEMPT_COUNT, this is like a NOP. So the handler won't be disabled. * in_atomic() will report different values based on !CONFIG_PREEMPT_COUNT. */ #define faulthandler_disabled() (pagefault_disabled() || in_atomic()) #ifndef CONFIG_ARCH_HAS_SUBPAGE_FAULTS /** * probe_subpage_writeable: probe the user range for write faults at sub-page * granularity (e.g. arm64 MTE) * @uaddr: start of address range * @size: size of address range * * Returns 0 on success, the number of bytes not probed on fault. * * It is expected that the caller checked for the write permission of each * page in the range either by put_user() or GUP. The architecture port can * implement a more efficient get_user() probing if the same sub-page faults * are triggered by either a read or a write. */ static inline size_t probe_subpage_writeable(char __user *uaddr, size_t size) { return 0; } #endif /* CONFIG_ARCH_HAS_SUBPAGE_FAULTS */ #ifndef ARCH_HAS_NOCACHE_UACCESS static inline __must_check unsigned long __copy_from_user_inatomic_nocache(void *to, const void __user *from, unsigned long n) { return __copy_from_user_inatomic(to, from, n); } #endif /* ARCH_HAS_NOCACHE_UACCESS */ extern __must_check int check_zeroed_user(const void __user *from, size_t size); /** * copy_struct_from_user: copy a struct from userspace * @dst: Destination address, in kernel space. This buffer must be @ksize * bytes long. * @ksize: Size of @dst struct. * @src: Source address, in userspace. * @usize: (Alleged) size of @src struct. * * Copies a struct from userspace to kernel space, in a way that guarantees * backwards-compatibility for struct syscall arguments (as long as future * struct extensions are made such that all new fields are *appended* to the * old struct, and zeroed-out new fields have the same meaning as the old * struct). * * @ksize is just sizeof(*dst), and @usize should've been passed by userspace. * The recommended usage is something like the following: * * SYSCALL_DEFINE2(foobar, const struct foo __user *, uarg, size_t, usize) * { * int err; * struct foo karg = {}; * * if (usize > PAGE_SIZE) * return -E2BIG; * if (usize < FOO_SIZE_VER0) * return -EINVAL; * * err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize); * if (err) * return err; * * // ... * } * * There are three cases to consider: * * If @usize == @ksize, then it's copied verbatim. * * If @usize < @ksize, then the userspace has passed an old struct to a * newer kernel. The rest of the trailing bytes in @dst (@ksize - @usize) * are to be zero-filled. * * If @usize > @ksize, then the userspace has passed a new struct to an * older kernel. The trailing bytes unknown to the kernel (@usize - @ksize) * are checked to ensure they are zeroed, otherwise -E2BIG is returned. * * Returns (in all cases, some data may have been copied): * * -E2BIG: (@usize > @ksize) and there are non-zero trailing bytes in @src. * * -EFAULT: access to userspace failed. */ static __always_inline __must_check int copy_struct_from_user(void *dst, size_t ksize, const void __user *src, size_t usize) { size_t size = min(ksize, usize); size_t rest = max(ksize, usize) - size; /* Double check if ksize is larger than a known object size. */ if (WARN_ON_ONCE(ksize > __builtin_object_size(dst, 1))) return -E2BIG; /* Deal with trailing bytes. */ if (usize < ksize) { memset(dst + size, 0, rest); } else if (usize > ksize) { int ret = check_zeroed_user(src + size, rest); if (ret <= 0) return ret ?: -E2BIG; } /* Copy the interoperable parts of the struct. */ if (copy_from_user(dst, src, size)) return -EFAULT; return 0; } bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size); long copy_from_kernel_nofault(void *dst, const void *src, size_t size); long notrace copy_to_kernel_nofault(void *dst, const void *src, size_t size); long copy_from_user_nofault(void *dst, const void __user *src, size_t size); long notrace copy_to_user_nofault(void __user *dst, const void *src, size_t size); long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count); long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr, long count); long strnlen_user_nofault(const void __user *unsafe_addr, long count); #ifndef __get_kernel_nofault #define __get_kernel_nofault(dst, src, type, label) \ do { \ type __user *p = (type __force __user *)(src); \ type data; \ if (__get_user(data, p)) \ goto label; \ *(type *)dst = data; \ } while (0) #define __put_kernel_nofault(dst, src, type, label) \ do { \ type __user *p = (type __force __user *)(dst); \ type data = *(type *)src; \ if (__put_user(data, p)) \ goto label; \ } while (0) #endif /** * get_kernel_nofault(): safely attempt to read from a location * @val: read into this variable * @ptr: address to read from * * Returns 0 on success, or -EFAULT. */ #define get_kernel_nofault(val, ptr) ({ \ const typeof(val) *__gk_ptr = (ptr); \ copy_from_kernel_nofault(&(val), __gk_ptr, sizeof(val));\ }) #ifndef user_access_begin #define user_access_begin(ptr,len) access_ok(ptr, len) #define user_access_end() do { } while (0) #define unsafe_op_wrap(op, err) do { if (unlikely(op)) goto err; } while (0) #define unsafe_get_user(x,p,e) unsafe_op_wrap(__get_user(x,p),e) #define unsafe_put_user(x,p,e) unsafe_op_wrap(__put_user(x,p),e) #define unsafe_copy_to_user(d,s,l,e) unsafe_op_wrap(__copy_to_user(d,s,l),e) #define unsafe_copy_from_user(d,s,l,e) unsafe_op_wrap(__copy_from_user(d,s,l),e) static inline unsigned long user_access_save(void) { return 0UL; } static inline void user_access_restore(unsigned long flags) { } #endif #ifndef user_write_access_begin #define user_write_access_begin user_access_begin #define user_write_access_end user_access_end #endif #ifndef user_read_access_begin #define user_read_access_begin user_access_begin #define user_read_access_end user_access_end #endif #ifdef CONFIG_HARDENED_USERCOPY void __noreturn usercopy_abort(const char *name, const char *detail, bool to_user, unsigned long offset, unsigned long len); #endif #endif /* __LINUX_UACCESS_H__ */ |
| 12 12 12 12 12 12 3257 3257 3257 3257 3257 3256 3256 2894 2894 1 2893 2894 2892 2894 3257 3257 3257 3257 2753 2751 2753 2751 3157 2850 2847 2849 2850 2893 2894 2892 2892 2849 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 | // SPDX-License-Identifier: GPL-2.0 /* * fs/sysfs/file.c - sysfs regular (text) file implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * * Please see Documentation/filesystems/sysfs.rst for more information. */ #include <linux/module.h> #include <linux/kobject.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/seq_file.h> #include <linux/mm.h> #include "sysfs.h" /* * Determine ktype->sysfs_ops for the given kernfs_node. This function * must be called while holding an active reference. */ static const struct sysfs_ops *sysfs_file_ops(struct kernfs_node *kn) { struct kobject *kobj = kn->parent->priv; if (kn->flags & KERNFS_LOCKDEP) lockdep_assert_held(kn); return kobj->ktype ? kobj->ktype->sysfs_ops : NULL; } /* * Reads on sysfs are handled through seq_file, which takes care of hairy * details like buffering and seeking. The following function pipes * sysfs_ops->show() result through seq_file. */ static int sysfs_kf_seq_show(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; struct kobject *kobj = of->kn->parent->priv; const struct sysfs_ops *ops = sysfs_file_ops(of->kn); ssize_t count; char *buf; if (WARN_ON_ONCE(!ops->show)) return -EINVAL; /* acquire buffer and ensure that it's >= PAGE_SIZE and clear */ count = seq_get_buf(sf, &buf); if (count < PAGE_SIZE) { seq_commit(sf, -1); return 0; } memset(buf, 0, PAGE_SIZE); count = ops->show(kobj, of->kn->priv, buf); if (count < 0) return count; /* * The code works fine with PAGE_SIZE return but it's likely to * indicate truncated result or overflow in normal use cases. */ if (count >= (ssize_t)PAGE_SIZE) { printk("fill_read_buffer: %pS returned bad count\n", ops->show); /* Try to struggle along */ count = PAGE_SIZE - 1; } seq_commit(sf, count); return 0; } static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf, size_t count, loff_t pos) { struct bin_attribute *battr = of->kn->priv; struct kobject *kobj = of->kn->parent->priv; loff_t size = file_inode(of->file)->i_size; if (!count) return 0; if (size) { if (pos >= size) return 0; if (pos + count > size) count = size - pos; } if (!battr->read) return -EIO; return battr->read(of->file, kobj, battr, buf, pos, count); } /* kernfs read callback for regular sysfs files with pre-alloc */ static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf, size_t count, loff_t pos) { const struct sysfs_ops *ops = sysfs_file_ops(of->kn); struct kobject *kobj = of->kn->parent->priv; ssize_t len; /* * If buf != of->prealloc_buf, we don't know how * large it is, so cannot safely pass it to ->show */ if (WARN_ON_ONCE(buf != of->prealloc_buf)) return 0; len = ops->show(kobj, of->kn->priv, buf); if (len < 0) return len; if (pos) { if (len <= pos) return 0; len -= pos; memmove(buf, buf + pos, len); } return min_t(ssize_t, count, len); } /* kernfs write callback for regular sysfs files */ static ssize_t sysfs_kf_write(struct kernfs_open_file *of, char *buf, size_t count, loff_t pos) { const struct sysfs_ops *ops = sysfs_file_ops(of->kn); struct kobject *kobj = of->kn->parent->priv; if (!count) return 0; return ops->store(kobj, of->kn->priv, buf, count); } /* kernfs write callback for bin sysfs files */ static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf, size_t count, loff_t pos) { struct bin_attribute *battr = of->kn->priv; struct kobject *kobj = of->kn->parent->priv; loff_t size = file_inode(of->file)->i_size; if (size) { if (size <= pos) return -EFBIG; count = min_t(ssize_t, count, size - pos); } if (!count) return 0; if (!battr->write) return -EIO; return battr->write(of->file, kobj, battr, buf, pos, count); } static int sysfs_kf_bin_mmap(struct kernfs_open_file *of, struct vm_area_struct *vma) { struct bin_attribute *battr = of->kn->priv; struct kobject *kobj = of->kn->parent->priv; return battr->mmap(of->file, kobj, battr, vma); } static int sysfs_kf_bin_open(struct kernfs_open_file *of) { struct bin_attribute *battr = of->kn->priv; if (battr->f_mapping) of->file->f_mapping = battr->f_mapping(); return 0; } void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr) { struct kernfs_node *kn = kobj->sd, *tmp; if (kn && dir) kn = kernfs_find_and_get(kn, dir); else kernfs_get(kn); if (kn && attr) { tmp = kernfs_find_and_get(kn, attr); kernfs_put(kn); kn = tmp; } if (kn) { kernfs_notify(kn); kernfs_put(kn); } } EXPORT_SYMBOL_GPL(sysfs_notify); static const struct kernfs_ops sysfs_file_kfops_empty = { }; static const struct kernfs_ops sysfs_file_kfops_ro = { .seq_show = sysfs_kf_seq_show, }; static const struct kernfs_ops sysfs_file_kfops_wo = { .write = sysfs_kf_write, }; static const struct kernfs_ops sysfs_file_kfops_rw = { .seq_show = sysfs_kf_seq_show, .write = sysfs_kf_write, }; static const struct kernfs_ops sysfs_prealloc_kfops_ro = { .read = sysfs_kf_read, .prealloc = true, }; static const struct kernfs_ops sysfs_prealloc_kfops_wo = { .write = sysfs_kf_write, .prealloc = true, }; static const struct kernfs_ops sysfs_prealloc_kfops_rw = { .read = sysfs_kf_read, .write = sysfs_kf_write, .prealloc = true, }; static const struct kernfs_ops sysfs_bin_kfops_ro = { .read = sysfs_kf_bin_read, }; static const struct kernfs_ops sysfs_bin_kfops_wo = { .write = sysfs_kf_bin_write, }; static const struct kernfs_ops sysfs_bin_kfops_rw = { .read = sysfs_kf_bin_read, .write = sysfs_kf_bin_write, }; static const struct kernfs_ops sysfs_bin_kfops_mmap = { .read = sysfs_kf_bin_read, .write = sysfs_kf_bin_write, .mmap = sysfs_kf_bin_mmap, .open = sysfs_kf_bin_open, }; int sysfs_add_file_mode_ns(struct kernfs_node *parent, const struct attribute *attr, umode_t mode, kuid_t uid, kgid_t gid, const void *ns) { struct kobject *kobj = parent->priv; const struct sysfs_ops *sysfs_ops = kobj->ktype->sysfs_ops; struct lock_class_key *key = NULL; const struct kernfs_ops *ops = NULL; struct kernfs_node *kn; /* every kobject with an attribute needs a ktype assigned */ if (WARN(!sysfs_ops, KERN_ERR "missing sysfs attribute operations for kobject: %s\n", kobject_name(kobj))) return -EINVAL; if (mode & SYSFS_PREALLOC) { if (sysfs_ops->show && sysfs_ops->store) ops = &sysfs_prealloc_kfops_rw; else if (sysfs_ops->show) ops = &sysfs_prealloc_kfops_ro; else if (sysfs_ops->store) ops = &sysfs_prealloc_kfops_wo; } else { if (sysfs_ops->show && sysfs_ops->store) ops = &sysfs_file_kfops_rw; else if (sysfs_ops->show) ops = &sysfs_file_kfops_ro; else if (sysfs_ops->store) ops = &sysfs_file_kfops_wo; } if (!ops) ops = &sysfs_file_kfops_empty; #ifdef CONFIG_DEBUG_LOCK_ALLOC if (!attr->ignore_lockdep) key = attr->key ?: (struct lock_class_key *)&attr->skey; #endif kn = __kernfs_create_file(parent, attr->name, mode & 0777, uid, gid, PAGE_SIZE, ops, (void *)attr, ns, key); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, attr->name); return PTR_ERR(kn); } return 0; } int sysfs_add_bin_file_mode_ns(struct kernfs_node *parent, const struct bin_attribute *battr, umode_t mode, kuid_t uid, kgid_t gid, const void *ns) { const struct attribute *attr = &battr->attr; struct lock_class_key *key = NULL; const struct kernfs_ops *ops; struct kernfs_node *kn; if (battr->mmap) ops = &sysfs_bin_kfops_mmap; else if (battr->read && battr->write) ops = &sysfs_bin_kfops_rw; else if (battr->read) ops = &sysfs_bin_kfops_ro; else if (battr->write) ops = &sysfs_bin_kfops_wo; else ops = &sysfs_file_kfops_empty; #ifdef CONFIG_DEBUG_LOCK_ALLOC if (!attr->ignore_lockdep) key = attr->key ?: (struct lock_class_key *)&attr->skey; #endif kn = __kernfs_create_file(parent, attr->name, mode & 0777, uid, gid, battr->size, ops, (void *)attr, ns, key); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, attr->name); return PTR_ERR(kn); } return 0; } /** * sysfs_create_file_ns - create an attribute file for an object with custom ns * @kobj: object we're creating for * @attr: attribute descriptor * @ns: namespace the new file should belong to */ int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns) { kuid_t uid; kgid_t gid; if (WARN_ON(!kobj || !kobj->sd || !attr)) return -EINVAL; kobject_get_ownership(kobj, &uid, &gid); return sysfs_add_file_mode_ns(kobj->sd, attr, attr->mode, uid, gid, ns); } EXPORT_SYMBOL_GPL(sysfs_create_file_ns); int sysfs_create_files(struct kobject *kobj, const struct attribute * const *ptr) { int err = 0; int i; for (i = 0; ptr[i] && !err; i++) err = sysfs_create_file(kobj, ptr[i]); if (err) while (--i >= 0) sysfs_remove_file(kobj, ptr[i]); return err; } EXPORT_SYMBOL_GPL(sysfs_create_files); /** * sysfs_add_file_to_group - add an attribute file to a pre-existing group. * @kobj: object we're acting for. * @attr: attribute descriptor. * @group: group name. */ int sysfs_add_file_to_group(struct kobject *kobj, const struct attribute *attr, const char *group) { struct kernfs_node *parent; kuid_t uid; kgid_t gid; int error; if (group) { parent = kernfs_find_and_get(kobj->sd, group); } else { parent = kobj->sd; kernfs_get(parent); } if (!parent) return -ENOENT; kobject_get_ownership(kobj, &uid, &gid); error = sysfs_add_file_mode_ns(parent, attr, attr->mode, uid, gid, NULL); kernfs_put(parent); return error; } EXPORT_SYMBOL_GPL(sysfs_add_file_to_group); /** * sysfs_chmod_file - update the modified mode value on an object attribute. * @kobj: object we're acting for. * @attr: attribute descriptor. * @mode: file permissions. * */ int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, umode_t mode) { struct kernfs_node *kn; struct iattr newattrs; int rc; kn = kernfs_find_and_get(kobj->sd, attr->name); if (!kn) return -ENOENT; newattrs.ia_mode = (mode & S_IALLUGO) | (kn->mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE; rc = kernfs_setattr(kn, &newattrs); kernfs_put(kn); return rc; } EXPORT_SYMBOL_GPL(sysfs_chmod_file); /** * sysfs_break_active_protection - break "active" protection * @kobj: The kernel object @attr is associated with. * @attr: The attribute to break the "active" protection for. * * With sysfs, just like kernfs, deletion of an attribute is postponed until * all active .show() and .store() callbacks have finished unless this function * is called. Hence this function is useful in methods that implement self * deletion. */ struct kernfs_node *sysfs_break_active_protection(struct kobject *kobj, const struct attribute *attr) { struct kernfs_node *kn; kobject_get(kobj); kn = kernfs_find_and_get(kobj->sd, attr->name); if (kn) kernfs_break_active_protection(kn); return kn; } EXPORT_SYMBOL_GPL(sysfs_break_active_protection); /** * sysfs_unbreak_active_protection - restore "active" protection * @kn: Pointer returned by sysfs_break_active_protection(). * * Undo the effects of sysfs_break_active_protection(). Since this function * calls kernfs_put() on the kernfs node that corresponds to the 'attr' * argument passed to sysfs_break_active_protection() that attribute may have * been removed between the sysfs_break_active_protection() and * sysfs_unbreak_active_protection() calls, it is not safe to access @kn after * this function has returned. */ void sysfs_unbreak_active_protection(struct kernfs_node *kn) { struct kobject *kobj = kn->parent->priv; kernfs_unbreak_active_protection(kn); kernfs_put(kn); kobject_put(kobj); } EXPORT_SYMBOL_GPL(sysfs_unbreak_active_protection); /** * sysfs_remove_file_ns - remove an object attribute with a custom ns tag * @kobj: object we're acting for * @attr: attribute descriptor * @ns: namespace tag of the file to remove * * Hash the attribute name and namespace tag and kill the victim. */ void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns) { struct kernfs_node *parent = kobj->sd; kernfs_remove_by_name_ns(parent, attr->name, ns); } EXPORT_SYMBOL_GPL(sysfs_remove_file_ns); /** * sysfs_remove_file_self - remove an object attribute from its own method * @kobj: object we're acting for * @attr: attribute descriptor * * See kernfs_remove_self() for details. */ bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr) { struct kernfs_node *parent = kobj->sd; struct kernfs_node *kn; bool ret; kn = kernfs_find_and_get(parent, attr->name); if (WARN_ON_ONCE(!kn)) return false; ret = kernfs_remove_self(kn); kernfs_put(kn); return ret; } EXPORT_SYMBOL_GPL(sysfs_remove_file_self); void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *ptr) { int i; for (i = 0; ptr[i]; i++) sysfs_remove_file(kobj, ptr[i]); } EXPORT_SYMBOL_GPL(sysfs_remove_files); /** * sysfs_remove_file_from_group - remove an attribute file from a group. * @kobj: object we're acting for. * @attr: attribute descriptor. * @group: group name. */ void sysfs_remove_file_from_group(struct kobject *kobj, const struct attribute *attr, const char *group) { struct kernfs_node *parent; if (group) { parent = kernfs_find_and_get(kobj->sd, group); } else { parent = kobj->sd; kernfs_get(parent); } if (parent) { kernfs_remove_by_name(parent, attr->name); kernfs_put(parent); } } EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group); /** * sysfs_create_bin_file - create binary file for object. * @kobj: object. * @attr: attribute descriptor. */ int sysfs_create_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { kuid_t uid; kgid_t gid; if (WARN_ON(!kobj || !kobj->sd || !attr)) return -EINVAL; kobject_get_ownership(kobj, &uid, &gid); return sysfs_add_bin_file_mode_ns(kobj->sd, attr, attr->attr.mode, uid, gid, NULL); } EXPORT_SYMBOL_GPL(sysfs_create_bin_file); /** * sysfs_remove_bin_file - remove binary file for object. * @kobj: object. * @attr: attribute descriptor. */ void sysfs_remove_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { kernfs_remove_by_name(kobj->sd, attr->attr.name); } EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); static int internal_change_owner(struct kernfs_node *kn, kuid_t kuid, kgid_t kgid) { struct iattr newattrs = { .ia_valid = ATTR_UID | ATTR_GID, .ia_uid = kuid, .ia_gid = kgid, }; return kernfs_setattr(kn, &newattrs); } /** * sysfs_link_change_owner - change owner of a sysfs file. * @kobj: object of the kernfs_node the symlink is located in. * @targ: object of the kernfs_node the symlink points to. * @name: name of the link. * @kuid: new owner's kuid * @kgid: new owner's kgid * * This function looks up the sysfs symlink entry @name under @kobj and changes * the ownership to @kuid/@kgid. The symlink is looked up in the namespace of * @targ. * * Returns 0 on success or error code on failure. */ int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ, const char *name, kuid_t kuid, kgid_t kgid) { struct kernfs_node *kn = NULL; int error; if (!name || !kobj->state_in_sysfs || !targ->state_in_sysfs) return -EINVAL; error = -ENOENT; kn = kernfs_find_and_get_ns(kobj->sd, name, targ->sd->ns); if (!kn) goto out; error = -EINVAL; if (kernfs_type(kn) != KERNFS_LINK) goto out; if (kn->symlink.target_kn->priv != targ) goto out; error = internal_change_owner(kn, kuid, kgid); out: kernfs_put(kn); return error; } /** * sysfs_file_change_owner - change owner of a sysfs file. * @kobj: object. * @name: name of the file to change. * @kuid: new owner's kuid * @kgid: new owner's kgid * * This function looks up the sysfs entry @name under @kobj and changes the * ownership to @kuid/@kgid. * * Returns 0 on success or error code on failure. */ int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid, kgid_t kgid) { struct kernfs_node *kn; int error; if (!name) return -EINVAL; if (!kobj->state_in_sysfs) return -EINVAL; kn = kernfs_find_and_get(kobj->sd, name); if (!kn) return -ENOENT; error = internal_change_owner(kn, kuid, kgid); kernfs_put(kn); return error; } EXPORT_SYMBOL_GPL(sysfs_file_change_owner); /** * sysfs_change_owner - change owner of the given object. * @kobj: object. * @kuid: new owner's kuid * @kgid: new owner's kgid * * Change the owner of the default directory, files, groups, and attributes of * @kobj to @kuid/@kgid. Note that sysfs_change_owner mirrors how the sysfs * entries for a kobject are added by driver core. In summary, * sysfs_change_owner() takes care of the default directory entry for @kobj, * the default attributes associated with the ktype of @kobj and the default * attributes associated with the ktype of @kobj. * Additional properties not added by driver core have to be changed by the * driver or subsystem which created them. This is similar to how * driver/subsystem specific entries are removed. * * Returns 0 on success or error code on failure. */ int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid) { int error; const struct kobj_type *ktype; if (!kobj->state_in_sysfs) return -EINVAL; /* Change the owner of the kobject itself. */ error = internal_change_owner(kobj->sd, kuid, kgid); if (error) return error; ktype = get_ktype(kobj); if (ktype) { /* * Change owner of the default groups associated with the * ktype of @kobj. */ error = sysfs_groups_change_owner(kobj, ktype->default_groups, kuid, kgid); if (error) return error; } return 0; } EXPORT_SYMBOL_GPL(sysfs_change_owner); /** * sysfs_emit - scnprintf equivalent, aware of PAGE_SIZE buffer. * @buf: start of PAGE_SIZE buffer. * @fmt: format * @...: optional arguments to @format * * * Returns number of characters written to @buf. */ int sysfs_emit(char *buf, const char *fmt, ...) { va_list args; int len; if (WARN(!buf || offset_in_page(buf), "invalid sysfs_emit: buf:%p\n", buf)) return 0; va_start(args, fmt); len = vscnprintf(buf, PAGE_SIZE, fmt, args); va_end(args); return len; } EXPORT_SYMBOL_GPL(sysfs_emit); /** * sysfs_emit_at - scnprintf equivalent, aware of PAGE_SIZE buffer. * @buf: start of PAGE_SIZE buffer. * @at: offset in @buf to start write in bytes * @at must be >= 0 && < PAGE_SIZE * @fmt: format * @...: optional arguments to @fmt * * * Returns number of characters written starting at &@buf[@at]. */ int sysfs_emit_at(char *buf, int at, const char *fmt, ...) { va_list args; int len; if (WARN(!buf || offset_in_page(buf) || at < 0 || at >= PAGE_SIZE, "invalid sysfs_emit_at: buf:%p at:%d\n", buf, at)) return 0; va_start(args, fmt); len = vscnprintf(buf + at, PAGE_SIZE - at, fmt, args); va_end(args); return len; } EXPORT_SYMBOL_GPL(sysfs_emit_at); |
| 3 1 3 1 1 4 1 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Roccat KonePure driver for Linux * * Copyright (c) 2012 Stefan Achatz <erazor_de@users.sourceforge.net> */ /* */ /* * Roccat KonePure is a smaller version of KoneXTD with less buttons and lights. */ #include <linux/types.h> #include <linux/device.h> #include <linux/input.h> #include <linux/hid.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/hid-roccat.h> #include "hid-ids.h" #include "hid-roccat-common.h" enum { KONEPURE_MOUSE_REPORT_NUMBER_BUTTON = 3, }; struct konepure_mouse_report_button { uint8_t report_number; /* always KONEPURE_MOUSE_REPORT_NUMBER_BUTTON */ uint8_t zero; uint8_t type; uint8_t data1; uint8_t data2; uint8_t zero2; uint8_t unknown[2]; } __packed; ROCCAT_COMMON2_BIN_ATTRIBUTE_W(control, 0x04, 0x03); ROCCAT_COMMON2_BIN_ATTRIBUTE_RW(actual_profile, 0x05, 0x03); ROCCAT_COMMON2_BIN_ATTRIBUTE_RW(profile_settings, 0x06, 0x1f); ROCCAT_COMMON2_BIN_ATTRIBUTE_RW(profile_buttons, 0x07, 0x3b); ROCCAT_COMMON2_BIN_ATTRIBUTE_W(macro, 0x08, 0x0822); ROCCAT_COMMON2_BIN_ATTRIBUTE_RW(info, 0x09, 0x06); ROCCAT_COMMON2_BIN_ATTRIBUTE_RW(tcu, 0x0c, 0x04); ROCCAT_COMMON2_BIN_ATTRIBUTE_R(tcu_image, 0x0c, 0x0404); ROCCAT_COMMON2_BIN_ATTRIBUTE_RW(sensor, 0x0f, 0x06); ROCCAT_COMMON2_BIN_ATTRIBUTE_W(talk, 0x10, 0x10); static struct bin_attribute *konepure_bin_attrs[] = { &bin_attr_actual_profile, &bin_attr_control, &bin_attr_info, &bin_attr_talk, &bin_attr_macro, &bin_attr_sensor, &bin_attr_tcu, &bin_attr_tcu_image, &bin_attr_profile_settings, &bin_attr_profile_buttons, NULL, }; static const struct attribute_group konepure_group = { .bin_attrs = konepure_bin_attrs, }; static const struct attribute_group *konepure_groups[] = { &konepure_group, NULL, }; static const struct class konepure_class = { .name = "konepure", .dev_groups = konepure_groups, }; static int konepure_init_specials(struct hid_device *hdev) { struct usb_interface *intf = to_usb_interface(hdev->dev.parent); struct usb_device *usb_dev = interface_to_usbdev(intf); struct roccat_common2_device *konepure; int retval; if (intf->cur_altsetting->desc.bInterfaceProtocol != USB_INTERFACE_PROTOCOL_MOUSE) { hid_set_drvdata(hdev, NULL); return 0; } konepure = kzalloc(sizeof(*konepure), GFP_KERNEL); if (!konepure) { hid_err(hdev, "can't alloc device descriptor\n"); return -ENOMEM; } hid_set_drvdata(hdev, konepure); retval = roccat_common2_device_init_struct(usb_dev, konepure); if (retval) { hid_err(hdev, "couldn't init KonePure device\n"); goto exit_free; } retval = roccat_connect(&konepure_class, hdev, sizeof(struct konepure_mouse_report_button)); if (retval < 0) { hid_err(hdev, "couldn't init char dev\n"); } else { konepure->chrdev_minor = retval; konepure->roccat_claimed = 1; } return 0; exit_free: kfree(konepure); return retval; } static void konepure_remove_specials(struct hid_device *hdev) { struct usb_interface *intf = to_usb_interface(hdev->dev.parent); struct roccat_common2_device *konepure; if (intf->cur_altsetting->desc.bInterfaceProtocol != USB_INTERFACE_PROTOCOL_MOUSE) return; konepure = hid_get_drvdata(hdev); if (konepure->roccat_claimed) roccat_disconnect(konepure->chrdev_minor); kfree(konepure); } static int konepure_probe(struct hid_device *hdev, const struct hid_device_id *id) { int retval; if (!hid_is_usb(hdev)) return -EINVAL; retval = hid_parse(hdev); if (retval) { hid_err(hdev, "parse failed\n"); goto exit; } retval = hid_hw_start(hdev, HID_CONNECT_DEFAULT); if (retval) { hid_err(hdev, "hw start failed\n"); goto exit; } retval = konepure_init_specials(hdev); if (retval) { hid_err(hdev, "couldn't install mouse\n"); goto exit_stop; } return 0; exit_stop: hid_hw_stop(hdev); exit: return retval; } static void konepure_remove(struct hid_device *hdev) { konepure_remove_specials(hdev); hid_hw_stop(hdev); } static int konepure_raw_event(struct hid_device *hdev, struct hid_report *report, u8 *data, int size) { struct usb_interface *intf = to_usb_interface(hdev->dev.parent); struct roccat_common2_device *konepure = hid_get_drvdata(hdev); if (intf->cur_altsetting->desc.bInterfaceProtocol != USB_INTERFACE_PROTOCOL_MOUSE) return 0; if (data[0] != KONEPURE_MOUSE_REPORT_NUMBER_BUTTON) return 0; if (konepure != NULL && konepure->roccat_claimed) roccat_report_event(konepure->chrdev_minor, data); return 0; } static const struct hid_device_id konepure_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_ROCCAT, USB_DEVICE_ID_ROCCAT_KONEPURE) }, { HID_USB_DEVICE(USB_VENDOR_ID_ROCCAT, USB_DEVICE_ID_ROCCAT_KONEPURE_OPTICAL) }, { } }; MODULE_DEVICE_TABLE(hid, konepure_devices); static struct hid_driver konepure_driver = { .name = "konepure", .id_table = konepure_devices, .probe = konepure_probe, .remove = konepure_remove, .raw_event = konepure_raw_event }; static int __init konepure_init(void) { int retval; retval = class_register(&konepure_class); if (retval) return retval; retval = hid_register_driver(&konepure_driver); if (retval) class_unregister(&konepure_class); return retval; } static void __exit konepure_exit(void) { hid_unregister_driver(&konepure_driver); class_unregister(&konepure_class); } module_init(konepure_init); module_exit(konepure_exit); MODULE_AUTHOR("Stefan Achatz"); MODULE_DESCRIPTION("USB Roccat KonePure/Optical driver"); MODULE_LICENSE("GPL v2"); |
| 53 2 8 335 333 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 | /* SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause) */ /* Copyright (C) 2016-2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. * * SipHash: a fast short-input PRF * https://131002.net/siphash/ * * This implementation is specifically for SipHash2-4 for a secure PRF * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for * hashtables. */ #ifndef _LINUX_SIPHASH_H #define _LINUX_SIPHASH_H #include <linux/types.h> #include <linux/kernel.h> #define SIPHASH_ALIGNMENT __alignof__(u64) typedef struct { u64 key[2]; } siphash_key_t; #define siphash_aligned_key_t siphash_key_t __aligned(16) static inline bool siphash_key_is_zero(const siphash_key_t *key) { return !(key->key[0] | key->key[1]); } u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key); u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key); u64 siphash_1u64(const u64 a, const siphash_key_t *key); u64 siphash_2u64(const u64 a, const u64 b, const siphash_key_t *key); u64 siphash_3u64(const u64 a, const u64 b, const u64 c, const siphash_key_t *key); u64 siphash_4u64(const u64 a, const u64 b, const u64 c, const u64 d, const siphash_key_t *key); u64 siphash_1u32(const u32 a, const siphash_key_t *key); u64 siphash_3u32(const u32 a, const u32 b, const u32 c, const siphash_key_t *key); static inline u64 siphash_2u32(const u32 a, const u32 b, const siphash_key_t *key) { return siphash_1u64((u64)b << 32 | a, key); } static inline u64 siphash_4u32(const u32 a, const u32 b, const u32 c, const u32 d, const siphash_key_t *key) { return siphash_2u64((u64)b << 32 | a, (u64)d << 32 | c, key); } static inline u64 ___siphash_aligned(const __le64 *data, size_t len, const siphash_key_t *key) { if (__builtin_constant_p(len) && len == 4) return siphash_1u32(le32_to_cpup((const __le32 *)data), key); if (__builtin_constant_p(len) && len == 8) return siphash_1u64(le64_to_cpu(data[0]), key); if (__builtin_constant_p(len) && len == 16) return siphash_2u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]), key); if (__builtin_constant_p(len) && len == 24) return siphash_3u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]), le64_to_cpu(data[2]), key); if (__builtin_constant_p(len) && len == 32) return siphash_4u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]), le64_to_cpu(data[2]), le64_to_cpu(data[3]), key); return __siphash_aligned(data, len, key); } /** * siphash - compute 64-bit siphash PRF value * @data: buffer to hash * @size: size of @data * @key: the siphash key */ static inline u64 siphash(const void *data, size_t len, const siphash_key_t *key) { if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || !IS_ALIGNED((unsigned long)data, SIPHASH_ALIGNMENT)) return __siphash_unaligned(data, len, key); return ___siphash_aligned(data, len, key); } #define HSIPHASH_ALIGNMENT __alignof__(unsigned long) typedef struct { unsigned long key[2]; } hsiphash_key_t; u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key); u32 __hsiphash_unaligned(const void *data, size_t len, const hsiphash_key_t *key); u32 hsiphash_1u32(const u32 a, const hsiphash_key_t *key); u32 hsiphash_2u32(const u32 a, const u32 b, const hsiphash_key_t *key); u32 hsiphash_3u32(const u32 a, const u32 b, const u32 c, const hsiphash_key_t *key); u32 hsiphash_4u32(const u32 a, const u32 b, const u32 c, const u32 d, const hsiphash_key_t *key); static inline u32 ___hsiphash_aligned(const __le32 *data, size_t len, const hsiphash_key_t *key) { if (__builtin_constant_p(len) && len == 4) return hsiphash_1u32(le32_to_cpu(data[0]), key); if (__builtin_constant_p(len) && len == 8) return hsiphash_2u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]), key); if (__builtin_constant_p(len) && len == 12) return hsiphash_3u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]), le32_to_cpu(data[2]), key); if (__builtin_constant_p(len) && len == 16) return hsiphash_4u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]), le32_to_cpu(data[2]), le32_to_cpu(data[3]), key); return __hsiphash_aligned(data, len, key); } /** * hsiphash - compute 32-bit hsiphash PRF value * @data: buffer to hash * @size: size of @data * @key: the hsiphash key */ static inline u32 hsiphash(const void *data, size_t len, const hsiphash_key_t *key) { if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || !IS_ALIGNED((unsigned long)data, HSIPHASH_ALIGNMENT)) return __hsiphash_unaligned(data, len, key); return ___hsiphash_aligned(data, len, key); } /* * These macros expose the raw SipHash and HalfSipHash permutations. * Do not use them directly! If you think you have a use for them, * be sure to CC the maintainer of this file explaining why. */ #define SIPHASH_PERMUTATION(a, b, c, d) ( \ (a) += (b), (b) = rol64((b), 13), (b) ^= (a), (a) = rol64((a), 32), \ (c) += (d), (d) = rol64((d), 16), (d) ^= (c), \ (a) += (d), (d) = rol64((d), 21), (d) ^= (a), \ (c) += (b), (b) = rol64((b), 17), (b) ^= (c), (c) = rol64((c), 32)) #define SIPHASH_CONST_0 0x736f6d6570736575ULL #define SIPHASH_CONST_1 0x646f72616e646f6dULL #define SIPHASH_CONST_2 0x6c7967656e657261ULL #define SIPHASH_CONST_3 0x7465646279746573ULL #define HSIPHASH_PERMUTATION(a, b, c, d) ( \ (a) += (b), (b) = rol32((b), 5), (b) ^= (a), (a) = rol32((a), 16), \ (c) += (d), (d) = rol32((d), 8), (d) ^= (c), \ (a) += (d), (d) = rol32((d), 7), (d) ^= (a), \ (c) += (b), (b) = rol32((b), 13), (b) ^= (c), (c) = rol32((c), 16)) #define HSIPHASH_CONST_0 0U #define HSIPHASH_CONST_1 0U #define HSIPHASH_CONST_2 0x6c796765U #define HSIPHASH_CONST_3 0x74656462U #endif /* _LINUX_SIPHASH_H */ |
| 13 1 1 1 30 13 24 24 11 30 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2011 Instituto Nokia de Tecnologia * * Authors: * Aloisio Almeida Jr <aloisio.almeida@openbossa.org> * Lauro Ramos Venancio <lauro.venancio@openbossa.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": %s: " fmt, __func__ #include <net/tcp_states.h> #include <linux/nfc.h> #include <linux/export.h> #include <linux/kcov.h> #include "nfc.h" static struct nfc_sock_list raw_sk_list = { .lock = __RW_LOCK_UNLOCKED(raw_sk_list.lock) }; static void nfc_sock_link(struct nfc_sock_list *l, struct sock *sk) { write_lock(&l->lock); sk_add_node(sk, &l->head); write_unlock(&l->lock); } static void nfc_sock_unlink(struct nfc_sock_list *l, struct sock *sk) { write_lock(&l->lock); sk_del_node_init(sk); write_unlock(&l->lock); } static void rawsock_write_queue_purge(struct sock *sk) { pr_debug("sk=%p\n", sk); spin_lock_bh(&sk->sk_write_queue.lock); __skb_queue_purge(&sk->sk_write_queue); nfc_rawsock(sk)->tx_work_scheduled = false; spin_unlock_bh(&sk->sk_write_queue.lock); } static void rawsock_report_error(struct sock *sk, int err) { pr_debug("sk=%p err=%d\n", sk, err); sk->sk_shutdown = SHUTDOWN_MASK; sk->sk_err = -err; sk_error_report(sk); rawsock_write_queue_purge(sk); } static int rawsock_release(struct socket *sock) { struct sock *sk = sock->sk; pr_debug("sock=%p sk=%p\n", sock, sk); if (!sk) return 0; if (sock->type == SOCK_RAW) nfc_sock_unlink(&raw_sk_list, sk); sock_orphan(sk); sock_put(sk); return 0; } static int rawsock_connect(struct socket *sock, struct sockaddr *_addr, int len, int flags) { struct sock *sk = sock->sk; struct sockaddr_nfc *addr = (struct sockaddr_nfc *)_addr; struct nfc_dev *dev; int rc = 0; pr_debug("sock=%p sk=%p flags=%d\n", sock, sk, flags); if (!addr || len < sizeof(struct sockaddr_nfc) || addr->sa_family != AF_NFC) return -EINVAL; pr_debug("addr dev_idx=%u target_idx=%u protocol=%u\n", addr->dev_idx, addr->target_idx, addr->nfc_protocol); lock_sock(sk); if (sock->state == SS_CONNECTED) { rc = -EISCONN; goto error; } dev = nfc_get_device(addr->dev_idx); if (!dev) { rc = -ENODEV; goto error; } if (addr->target_idx > dev->target_next_idx - 1 || addr->target_idx < dev->target_next_idx - dev->n_targets) { rc = -EINVAL; goto put_dev; } rc = nfc_activate_target(dev, addr->target_idx, addr->nfc_protocol); if (rc) goto put_dev; nfc_rawsock(sk)->dev = dev; nfc_rawsock(sk)->target_idx = addr->target_idx; sock->state = SS_CONNECTED; sk->sk_state = TCP_ESTABLISHED; sk->sk_state_change(sk); release_sock(sk); return 0; put_dev: nfc_put_device(dev); error: release_sock(sk); return rc; } static int rawsock_add_header(struct sk_buff *skb) { *(u8 *)skb_push(skb, NFC_HEADER_SIZE) = 0; return 0; } static void rawsock_data_exchange_complete(void *context, struct sk_buff *skb, int err) { struct sock *sk = (struct sock *) context; BUG_ON(in_hardirq()); pr_debug("sk=%p err=%d\n", sk, err); if (err) goto error; err = rawsock_add_header(skb); if (err) goto error_skb; err = sock_queue_rcv_skb(sk, skb); if (err) goto error_skb; spin_lock_bh(&sk->sk_write_queue.lock); if (!skb_queue_empty(&sk->sk_write_queue)) schedule_work(&nfc_rawsock(sk)->tx_work); else nfc_rawsock(sk)->tx_work_scheduled = false; spin_unlock_bh(&sk->sk_write_queue.lock); sock_put(sk); return; error_skb: kfree_skb(skb); error: rawsock_report_error(sk, err); sock_put(sk); } static void rawsock_tx_work(struct work_struct *work) { struct sock *sk = to_rawsock_sk(work); struct nfc_dev *dev = nfc_rawsock(sk)->dev; u32 target_idx = nfc_rawsock(sk)->target_idx; struct sk_buff *skb; int rc; pr_debug("sk=%p target_idx=%u\n", sk, target_idx); if (sk->sk_shutdown & SEND_SHUTDOWN) { rawsock_write_queue_purge(sk); return; } skb = skb_dequeue(&sk->sk_write_queue); kcov_remote_start_common(skb_get_kcov_handle(skb)); sock_hold(sk); rc = nfc_data_exchange(dev, target_idx, skb, rawsock_data_exchange_complete, sk); if (rc) { rawsock_report_error(sk, rc); sock_put(sk); } kcov_remote_stop(); } static int rawsock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct nfc_dev *dev = nfc_rawsock(sk)->dev; struct sk_buff *skb; int rc; pr_debug("sock=%p sk=%p len=%zu\n", sock, sk, len); if (msg->msg_namelen) return -EOPNOTSUPP; if (sock->state != SS_CONNECTED) return -ENOTCONN; skb = nfc_alloc_send_skb(dev, sk, msg->msg_flags, len, &rc); if (skb == NULL) return rc; rc = memcpy_from_msg(skb_put(skb, len), msg, len); if (rc < 0) { kfree_skb(skb); return rc; } spin_lock_bh(&sk->sk_write_queue.lock); __skb_queue_tail(&sk->sk_write_queue, skb); if (!nfc_rawsock(sk)->tx_work_scheduled) { schedule_work(&nfc_rawsock(sk)->tx_work); nfc_rawsock(sk)->tx_work_scheduled = true; } spin_unlock_bh(&sk->sk_write_queue.lock); return len; } static int rawsock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; int copied; int rc; pr_debug("sock=%p sk=%p len=%zu flags=%d\n", sock, sk, len, flags); skb = skb_recv_datagram(sk, flags, &rc); if (!skb) return rc; copied = skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; copied = len; } rc = skb_copy_datagram_msg(skb, 0, msg, copied); skb_free_datagram(sk, skb); return rc ? : copied; } static const struct proto_ops rawsock_ops = { .family = PF_NFC, .owner = THIS_MODULE, .release = rawsock_release, .bind = sock_no_bind, .connect = rawsock_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = datagram_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .sendmsg = rawsock_sendmsg, .recvmsg = rawsock_recvmsg, .mmap = sock_no_mmap, }; static const struct proto_ops rawsock_raw_ops = { .family = PF_NFC, .owner = THIS_MODULE, .release = rawsock_release, .bind = sock_no_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = datagram_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .sendmsg = sock_no_sendmsg, .recvmsg = rawsock_recvmsg, .mmap = sock_no_mmap, }; static void rawsock_destruct(struct sock *sk) { pr_debug("sk=%p\n", sk); if (sk->sk_state == TCP_ESTABLISHED) { nfc_deactivate_target(nfc_rawsock(sk)->dev, nfc_rawsock(sk)->target_idx, NFC_TARGET_MODE_IDLE); nfc_put_device(nfc_rawsock(sk)->dev); } skb_queue_purge(&sk->sk_receive_queue); if (!sock_flag(sk, SOCK_DEAD)) { pr_err("Freeing alive NFC raw socket %p\n", sk); return; } } static int rawsock_create(struct net *net, struct socket *sock, const struct nfc_protocol *nfc_proto, int kern) { struct sock *sk; pr_debug("sock=%p\n", sock); if ((sock->type != SOCK_SEQPACKET) && (sock->type != SOCK_RAW)) return -ESOCKTNOSUPPORT; if (sock->type == SOCK_RAW) { if (!ns_capable(net->user_ns, CAP_NET_RAW)) return -EPERM; sock->ops = &rawsock_raw_ops; } else { sock->ops = &rawsock_ops; } sk = sk_alloc(net, PF_NFC, GFP_ATOMIC, nfc_proto->proto, kern); if (!sk) return -ENOMEM; sock_init_data(sock, sk); sk->sk_protocol = nfc_proto->id; sk->sk_destruct = rawsock_destruct; sock->state = SS_UNCONNECTED; if (sock->type == SOCK_RAW) nfc_sock_link(&raw_sk_list, sk); else { INIT_WORK(&nfc_rawsock(sk)->tx_work, rawsock_tx_work); nfc_rawsock(sk)->tx_work_scheduled = false; } return 0; } void nfc_send_to_raw_sock(struct nfc_dev *dev, struct sk_buff *skb, u8 payload_type, u8 direction) { struct sk_buff *skb_copy = NULL, *nskb; struct sock *sk; u8 *data; read_lock(&raw_sk_list.lock); sk_for_each(sk, &raw_sk_list.head) { if (!skb_copy) { skb_copy = __pskb_copy_fclone(skb, NFC_RAW_HEADER_SIZE, GFP_ATOMIC, true); if (!skb_copy) continue; data = skb_push(skb_copy, NFC_RAW_HEADER_SIZE); data[0] = dev ? dev->idx : 0xFF; data[1] = direction & 0x01; data[1] |= (payload_type << 1); } nskb = skb_clone(skb_copy, GFP_ATOMIC); if (!nskb) continue; if (sock_queue_rcv_skb(sk, nskb)) kfree_skb(nskb); } read_unlock(&raw_sk_list.lock); kfree_skb(skb_copy); } EXPORT_SYMBOL(nfc_send_to_raw_sock); static struct proto rawsock_proto = { .name = "NFC_RAW", .owner = THIS_MODULE, .obj_size = sizeof(struct nfc_rawsock), }; static const struct nfc_protocol rawsock_nfc_proto = { .id = NFC_SOCKPROTO_RAW, .proto = &rawsock_proto, .owner = THIS_MODULE, .create = rawsock_create }; int __init rawsock_init(void) { int rc; rc = nfc_proto_register(&rawsock_nfc_proto); return rc; } void rawsock_exit(void) { nfc_proto_unregister(&rawsock_nfc_proto); } |
| 169 2926 2061 2036 2103 1 2 1 2910 2946 2943 2917 3185 20 3185 2917 143 2911 3150 1995 3150 3149 2912 2911 2912 3184 3185 3150 3150 3185 3185 3185 3185 2909 2910 2910 2911 19 3185 2944 196 2910 2945 3185 519 519 519 235 235 235 207 23 381 381 30 6 346 369 369 369 519 41 41 1986 1986 1986 169 169 2 2053 2053 2053 2053 19 19 18 3 2053 6 6 6 6 6 1979 143 1979 1979 143 143 28 2914 28 2913 2912 1 2912 2915 326 2914 2914 2912 2914 2915 2915 2914 2915 169 166 169 3179 3177 3179 3171 3171 1970 3178 2927 2911 19 19 19 20 20 64 2912 2926 2926 19 19 2924 2912 2910 2911 2911 2745 39 2927 2911 45 45 31 20 20 2911 2912 2912 2910 31 2925 2910 12 34 34 34 34 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 | // SPDX-License-Identifier: GPL-2.0+ /* * dummy_hcd.c -- Dummy/Loopback USB host and device emulator driver. * * Maintainer: Alan Stern <stern@rowland.harvard.edu> * * Copyright (C) 2003 David Brownell * Copyright (C) 2003-2005 Alan Stern */ /* * This exposes a device side "USB gadget" API, driven by requests to a * Linux-USB host controller driver. USB traffic is simulated; there's * no need for USB hardware. Use this with two other drivers: * * - Gadget driver, responding to requests (device); * - Host-side device driver, as already familiar in Linux. * * Having this all in one kernel can help some stages of development, * bypassing some hardware (and driver) issues. UML could help too. * * Note: The emulation does not include isochronous transfers! */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/delay.h> #include <linux/ioport.h> #include <linux/slab.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/timer.h> #include <linux/list.h> #include <linux/interrupt.h> #include <linux/platform_device.h> #include <linux/usb.h> #include <linux/usb/gadget.h> #include <linux/usb/hcd.h> #include <linux/scatterlist.h> #include <asm/byteorder.h> #include <linux/io.h> #include <asm/irq.h> #include <asm/unaligned.h> #define DRIVER_DESC "USB Host+Gadget Emulator" #define DRIVER_VERSION "02 May 2005" #define POWER_BUDGET 500 /* in mA; use 8 for low-power port testing */ #define POWER_BUDGET_3 900 /* in mA */ static const char driver_name[] = "dummy_hcd"; static const char driver_desc[] = "USB Host+Gadget Emulator"; static const char gadget_name[] = "dummy_udc"; MODULE_DESCRIPTION(DRIVER_DESC); MODULE_AUTHOR("David Brownell"); MODULE_LICENSE("GPL"); struct dummy_hcd_module_parameters { bool is_super_speed; bool is_high_speed; unsigned int num; }; static struct dummy_hcd_module_parameters mod_data = { .is_super_speed = false, .is_high_speed = true, .num = 1, }; module_param_named(is_super_speed, mod_data.is_super_speed, bool, S_IRUGO); MODULE_PARM_DESC(is_super_speed, "true to simulate SuperSpeed connection"); module_param_named(is_high_speed, mod_data.is_high_speed, bool, S_IRUGO); MODULE_PARM_DESC(is_high_speed, "true to simulate HighSpeed connection"); module_param_named(num, mod_data.num, uint, S_IRUGO); MODULE_PARM_DESC(num, "number of emulated controllers"); /*-------------------------------------------------------------------------*/ /* gadget side driver data structres */ struct dummy_ep { struct list_head queue; unsigned long last_io; /* jiffies timestamp */ struct usb_gadget *gadget; const struct usb_endpoint_descriptor *desc; struct usb_ep ep; unsigned halted:1; unsigned wedged:1; unsigned already_seen:1; unsigned setup_stage:1; unsigned stream_en:1; }; struct dummy_request { struct list_head queue; /* ep's requests */ struct usb_request req; }; static inline struct dummy_ep *usb_ep_to_dummy_ep(struct usb_ep *_ep) { return container_of(_ep, struct dummy_ep, ep); } static inline struct dummy_request *usb_request_to_dummy_request (struct usb_request *_req) { return container_of(_req, struct dummy_request, req); } /*-------------------------------------------------------------------------*/ /* * Every device has ep0 for control requests, plus up to 30 more endpoints, * in one of two types: * * - Configurable: direction (in/out), type (bulk, iso, etc), and endpoint * number can be changed. Names like "ep-a" are used for this type. * * - Fixed Function: in other cases. some characteristics may be mutable; * that'd be hardware-specific. Names like "ep12out-bulk" are used. * * Gadget drivers are responsible for not setting up conflicting endpoint * configurations, illegal or unsupported packet lengths, and so on. */ static const char ep0name[] = "ep0"; static const struct { const char *name; const struct usb_ep_caps caps; } ep_info[] = { #define EP_INFO(_name, _caps) \ { \ .name = _name, \ .caps = _caps, \ } /* we don't provide isochronous endpoints since we don't support them */ #define TYPE_BULK_OR_INT (USB_EP_CAPS_TYPE_BULK | USB_EP_CAPS_TYPE_INT) /* everyone has ep0 */ EP_INFO(ep0name, USB_EP_CAPS(USB_EP_CAPS_TYPE_CONTROL, USB_EP_CAPS_DIR_ALL)), /* act like a pxa250: fifteen fixed function endpoints */ EP_INFO("ep1in-bulk", USB_EP_CAPS(USB_EP_CAPS_TYPE_BULK, USB_EP_CAPS_DIR_IN)), EP_INFO("ep2out-bulk", USB_EP_CAPS(USB_EP_CAPS_TYPE_BULK, USB_EP_CAPS_DIR_OUT)), /* EP_INFO("ep3in-iso", USB_EP_CAPS(USB_EP_CAPS_TYPE_ISO, USB_EP_CAPS_DIR_IN)), EP_INFO("ep4out-iso", USB_EP_CAPS(USB_EP_CAPS_TYPE_ISO, USB_EP_CAPS_DIR_OUT)), */ EP_INFO("ep5in-int", USB_EP_CAPS(USB_EP_CAPS_TYPE_INT, USB_EP_CAPS_DIR_IN)), EP_INFO("ep6in-bulk", USB_EP_CAPS(USB_EP_CAPS_TYPE_BULK, USB_EP_CAPS_DIR_IN)), EP_INFO("ep7out-bulk", USB_EP_CAPS(USB_EP_CAPS_TYPE_BULK, USB_EP_CAPS_DIR_OUT)), /* EP_INFO("ep8in-iso", USB_EP_CAPS(USB_EP_CAPS_TYPE_ISO, USB_EP_CAPS_DIR_IN)), EP_INFO("ep9out-iso", USB_EP_CAPS(USB_EP_CAPS_TYPE_ISO, USB_EP_CAPS_DIR_OUT)), */ EP_INFO("ep10in-int", USB_EP_CAPS(USB_EP_CAPS_TYPE_INT, USB_EP_CAPS_DIR_IN)), EP_INFO("ep11in-bulk", USB_EP_CAPS(USB_EP_CAPS_TYPE_BULK, USB_EP_CAPS_DIR_IN)), EP_INFO("ep12out-bulk", USB_EP_CAPS(USB_EP_CAPS_TYPE_BULK, USB_EP_CAPS_DIR_OUT)), /* EP_INFO("ep13in-iso", USB_EP_CAPS(USB_EP_CAPS_TYPE_ISO, USB_EP_CAPS_DIR_IN)), EP_INFO("ep14out-iso", USB_EP_CAPS(USB_EP_CAPS_TYPE_ISO, USB_EP_CAPS_DIR_OUT)), */ EP_INFO("ep15in-int", USB_EP_CAPS(USB_EP_CAPS_TYPE_INT, USB_EP_CAPS_DIR_IN)), /* or like sa1100: two fixed function endpoints */ EP_INFO("ep1out-bulk", USB_EP_CAPS(USB_EP_CAPS_TYPE_BULK, USB_EP_CAPS_DIR_OUT)), EP_INFO("ep2in-bulk", USB_EP_CAPS(USB_EP_CAPS_TYPE_BULK, USB_EP_CAPS_DIR_IN)), /* and now some generic EPs so we have enough in multi config */ EP_INFO("ep-aout", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_OUT)), EP_INFO("ep-bin", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_IN)), EP_INFO("ep-cout", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_OUT)), EP_INFO("ep-dout", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_OUT)), EP_INFO("ep-ein", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_IN)), EP_INFO("ep-fout", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_OUT)), EP_INFO("ep-gin", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_IN)), EP_INFO("ep-hout", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_OUT)), EP_INFO("ep-iout", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_OUT)), EP_INFO("ep-jin", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_IN)), EP_INFO("ep-kout", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_OUT)), EP_INFO("ep-lin", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_IN)), EP_INFO("ep-mout", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_OUT)), #undef EP_INFO }; #define DUMMY_ENDPOINTS ARRAY_SIZE(ep_info) /*-------------------------------------------------------------------------*/ #define FIFO_SIZE 64 struct urbp { struct urb *urb; struct list_head urbp_list; struct sg_mapping_iter miter; u32 miter_started; }; enum dummy_rh_state { DUMMY_RH_RESET, DUMMY_RH_SUSPENDED, DUMMY_RH_RUNNING }; struct dummy_hcd { struct dummy *dum; enum dummy_rh_state rh_state; struct timer_list timer; u32 port_status; u32 old_status; unsigned long re_timeout; struct usb_device *udev; struct list_head urbp_list; struct urbp *next_frame_urbp; u32 stream_en_ep; u8 num_stream[30 / 2]; unsigned active:1; unsigned old_active:1; unsigned resuming:1; }; struct dummy { spinlock_t lock; /* * DEVICE/GADGET side support */ struct dummy_ep ep[DUMMY_ENDPOINTS]; int address; int callback_usage; struct usb_gadget gadget; struct usb_gadget_driver *driver; struct dummy_request fifo_req; u8 fifo_buf[FIFO_SIZE]; u16 devstatus; unsigned ints_enabled:1; unsigned udc_suspended:1; unsigned pullup:1; /* * HOST side support */ struct dummy_hcd *hs_hcd; struct dummy_hcd *ss_hcd; }; static inline struct dummy_hcd *hcd_to_dummy_hcd(struct usb_hcd *hcd) { return (struct dummy_hcd *) (hcd->hcd_priv); } static inline struct usb_hcd *dummy_hcd_to_hcd(struct dummy_hcd *dum) { return container_of((void *) dum, struct usb_hcd, hcd_priv); } static inline struct device *dummy_dev(struct dummy_hcd *dum) { return dummy_hcd_to_hcd(dum)->self.controller; } static inline struct device *udc_dev(struct dummy *dum) { return dum->gadget.dev.parent; } static inline struct dummy *ep_to_dummy(struct dummy_ep *ep) { return container_of(ep->gadget, struct dummy, gadget); } static inline struct dummy_hcd *gadget_to_dummy_hcd(struct usb_gadget *gadget) { struct dummy *dum = container_of(gadget, struct dummy, gadget); if (dum->gadget.speed == USB_SPEED_SUPER) return dum->ss_hcd; else return dum->hs_hcd; } static inline struct dummy *gadget_dev_to_dummy(struct device *dev) { return container_of(dev, struct dummy, gadget.dev); } /*-------------------------------------------------------------------------*/ /* DEVICE/GADGET SIDE UTILITY ROUTINES */ /* called with spinlock held */ static void nuke(struct dummy *dum, struct dummy_ep *ep) { while (!list_empty(&ep->queue)) { struct dummy_request *req; req = list_entry(ep->queue.next, struct dummy_request, queue); list_del_init(&req->queue); req->req.status = -ESHUTDOWN; spin_unlock(&dum->lock); usb_gadget_giveback_request(&ep->ep, &req->req); spin_lock(&dum->lock); } } /* caller must hold lock */ static void stop_activity(struct dummy *dum) { int i; /* prevent any more requests */ dum->address = 0; /* The timer is left running so that outstanding URBs can fail */ /* nuke any pending requests first, so driver i/o is quiesced */ for (i = 0; i < DUMMY_ENDPOINTS; ++i) nuke(dum, &dum->ep[i]); /* driver now does any non-usb quiescing necessary */ } /** * set_link_state_by_speed() - Sets the current state of the link according to * the hcd speed * @dum_hcd: pointer to the dummy_hcd structure to update the link state for * * This function updates the port_status according to the link state and the * speed of the hcd. */ static void set_link_state_by_speed(struct dummy_hcd *dum_hcd) { struct dummy *dum = dum_hcd->dum; if (dummy_hcd_to_hcd(dum_hcd)->speed == HCD_USB3) { if ((dum_hcd->port_status & USB_SS_PORT_STAT_POWER) == 0) { dum_hcd->port_status = 0; } else if (!dum->pullup || dum->udc_suspended) { /* UDC suspend must cause a disconnect */ dum_hcd->port_status &= ~(USB_PORT_STAT_CONNECTION | USB_PORT_STAT_ENABLE); if ((dum_hcd->old_status & USB_PORT_STAT_CONNECTION) != 0) dum_hcd->port_status |= (USB_PORT_STAT_C_CONNECTION << 16); } else { /* device is connected and not suspended */ dum_hcd->port_status |= (USB_PORT_STAT_CONNECTION | USB_PORT_STAT_SPEED_5GBPS) ; if ((dum_hcd->old_status & USB_PORT_STAT_CONNECTION) == 0) dum_hcd->port_status |= (USB_PORT_STAT_C_CONNECTION << 16); if ((dum_hcd->port_status & USB_PORT_STAT_ENABLE) && (dum_hcd->port_status & USB_PORT_STAT_LINK_STATE) == USB_SS_PORT_LS_U0 && dum_hcd->rh_state != DUMMY_RH_SUSPENDED) dum_hcd->active = 1; } } else { if ((dum_hcd->port_status & USB_PORT_STAT_POWER) == 0) { dum_hcd->port_status = 0; } else if (!dum->pullup || dum->udc_suspended) { /* UDC suspend must cause a disconnect */ dum_hcd->port_status &= ~(USB_PORT_STAT_CONNECTION | USB_PORT_STAT_ENABLE | USB_PORT_STAT_LOW_SPEED | USB_PORT_STAT_HIGH_SPEED | USB_PORT_STAT_SUSPEND); if ((dum_hcd->old_status & USB_PORT_STAT_CONNECTION) != 0) dum_hcd->port_status |= (USB_PORT_STAT_C_CONNECTION << 16); } else { dum_hcd->port_status |= USB_PORT_STAT_CONNECTION; if ((dum_hcd->old_status & USB_PORT_STAT_CONNECTION) == 0) dum_hcd->port_status |= (USB_PORT_STAT_C_CONNECTION << 16); if ((dum_hcd->port_status & USB_PORT_STAT_ENABLE) == 0) dum_hcd->port_status &= ~USB_PORT_STAT_SUSPEND; else if ((dum_hcd->port_status & USB_PORT_STAT_SUSPEND) == 0 && dum_hcd->rh_state != DUMMY_RH_SUSPENDED) dum_hcd->active = 1; } } } /* caller must hold lock */ static void set_link_state(struct dummy_hcd *dum_hcd) __must_hold(&dum->lock) { struct dummy *dum = dum_hcd->dum; unsigned int power_bit; dum_hcd->active = 0; if (dum->pullup) if ((dummy_hcd_to_hcd(dum_hcd)->speed == HCD_USB3 && dum->gadget.speed != USB_SPEED_SUPER) || (dummy_hcd_to_hcd(dum_hcd)->speed != HCD_USB3 && dum->gadget.speed == USB_SPEED_SUPER)) return; set_link_state_by_speed(dum_hcd); power_bit = (dummy_hcd_to_hcd(dum_hcd)->speed == HCD_USB3 ? USB_SS_PORT_STAT_POWER : USB_PORT_STAT_POWER); if ((dum_hcd->port_status & USB_PORT_STAT_ENABLE) == 0 || dum_hcd->active) dum_hcd->resuming = 0; /* Currently !connected or in reset */ if ((dum_hcd->port_status & power_bit) == 0 || (dum_hcd->port_status & USB_PORT_STAT_RESET) != 0) { unsigned int disconnect = power_bit & dum_hcd->old_status & (~dum_hcd->port_status); unsigned int reset = USB_PORT_STAT_RESET & (~dum_hcd->old_status) & dum_hcd->port_status; /* Report reset and disconnect events to the driver */ if (dum->ints_enabled && (disconnect || reset)) { stop_activity(dum); ++dum->callback_usage; spin_unlock(&dum->lock); if (reset) usb_gadget_udc_reset(&dum->gadget, dum->driver); else dum->driver->disconnect(&dum->gadget); spin_lock(&dum->lock); --dum->callback_usage; } } else if (dum_hcd->active != dum_hcd->old_active && dum->ints_enabled) { ++dum->callback_usage; spin_unlock(&dum->lock); if (dum_hcd->old_active && dum->driver->suspend) dum->driver->suspend(&dum->gadget); else if (!dum_hcd->old_active && dum->driver->resume) dum->driver->resume(&dum->gadget); spin_lock(&dum->lock); --dum->callback_usage; } dum_hcd->old_status = dum_hcd->port_status; dum_hcd->old_active = dum_hcd->active; } /*-------------------------------------------------------------------------*/ /* DEVICE/GADGET SIDE DRIVER * * This only tracks gadget state. All the work is done when the host * side tries some (emulated) i/o operation. Real device controller * drivers would do real i/o using dma, fifos, irqs, timers, etc. */ #define is_enabled(dum) \ (dum->port_status & USB_PORT_STAT_ENABLE) static int dummy_enable(struct usb_ep *_ep, const struct usb_endpoint_descriptor *desc) { struct dummy *dum; struct dummy_hcd *dum_hcd; struct dummy_ep *ep; unsigned max; int retval; ep = usb_ep_to_dummy_ep(_ep); if (!_ep || !desc || ep->desc || _ep->name == ep0name || desc->bDescriptorType != USB_DT_ENDPOINT) return -EINVAL; dum = ep_to_dummy(ep); if (!dum->driver) return -ESHUTDOWN; dum_hcd = gadget_to_dummy_hcd(&dum->gadget); if (!is_enabled(dum_hcd)) return -ESHUTDOWN; /* * For HS/FS devices only bits 0..10 of the wMaxPacketSize represent the * maximum packet size. * For SS devices the wMaxPacketSize is limited by 1024. */ max = usb_endpoint_maxp(desc); /* drivers must not request bad settings, since lower levels * (hardware or its drivers) may not check. some endpoints * can't do iso, many have maxpacket limitations, etc. * * since this "hardware" driver is here to help debugging, we * have some extra sanity checks. (there could be more though, * especially for "ep9out" style fixed function ones.) */ retval = -EINVAL; switch (usb_endpoint_type(desc)) { case USB_ENDPOINT_XFER_BULK: if (strstr(ep->ep.name, "-iso") || strstr(ep->ep.name, "-int")) { goto done; } switch (dum->gadget.speed) { case USB_SPEED_SUPER: if (max == 1024) break; goto done; case USB_SPEED_HIGH: if (max == 512) break; goto done; case USB_SPEED_FULL: if (max == 8 || max == 16 || max == 32 || max == 64) /* we'll fake any legal size */ break; /* save a return statement */ fallthrough; default: goto done; } break; case USB_ENDPOINT_XFER_INT: if (strstr(ep->ep.name, "-iso")) /* bulk is ok */ goto done; /* real hardware might not handle all packet sizes */ switch (dum->gadget.speed) { case USB_SPEED_SUPER: case USB_SPEED_HIGH: if (max <= 1024) break; /* save a return statement */ fallthrough; case USB_SPEED_FULL: if (max <= 64) break; /* save a return statement */ fallthrough; default: if (max <= 8) break; goto done; } break; case USB_ENDPOINT_XFER_ISOC: if (strstr(ep->ep.name, "-bulk") || strstr(ep->ep.name, "-int")) goto done; /* real hardware might not handle all packet sizes */ switch (dum->gadget.speed) { case USB_SPEED_SUPER: case USB_SPEED_HIGH: if (max <= 1024) break; /* save a return statement */ fallthrough; case USB_SPEED_FULL: if (max <= 1023) break; /* save a return statement */ fallthrough; default: goto done; } break; default: /* few chips support control except on ep0 */ goto done; } _ep->maxpacket = max; if (usb_ss_max_streams(_ep->comp_desc)) { if (!usb_endpoint_xfer_bulk(desc)) { dev_err(udc_dev(dum), "Can't enable stream support on " "non-bulk ep %s\n", _ep->name); return -EINVAL; } ep->stream_en = 1; } ep->desc = desc; dev_dbg(udc_dev(dum), "enabled %s (ep%d%s-%s) maxpacket %d stream %s\n", _ep->name, desc->bEndpointAddress & 0x0f, (desc->bEndpointAddress & USB_DIR_IN) ? "in" : "out", usb_ep_type_string(usb_endpoint_type(desc)), max, ep->stream_en ? "enabled" : "disabled"); /* at this point real hardware should be NAKing transfers * to that endpoint, until a buffer is queued to it. */ ep->halted = ep->wedged = 0; retval = 0; done: return retval; } static int dummy_disable(struct usb_ep *_ep) { struct dummy_ep *ep; struct dummy *dum; unsigned long flags; ep = usb_ep_to_dummy_ep(_ep); if (!_ep || !ep->desc || _ep->name == ep0name) return -EINVAL; dum = ep_to_dummy(ep); spin_lock_irqsave(&dum->lock, flags); ep->desc = NULL; ep->stream_en = 0; nuke(dum, ep); spin_unlock_irqrestore(&dum->lock, flags); dev_dbg(udc_dev(dum), "disabled %s\n", _ep->name); return 0; } static struct usb_request *dummy_alloc_request(struct usb_ep *_ep, gfp_t mem_flags) { struct dummy_request *req; if (!_ep) return NULL; req = kzalloc(sizeof(*req), mem_flags); if (!req) return NULL; INIT_LIST_HEAD(&req->queue); return &req->req; } static void dummy_free_request(struct usb_ep *_ep, struct usb_request *_req) { struct dummy_request *req; if (!_ep || !_req) { WARN_ON(1); return; } req = usb_request_to_dummy_request(_req); WARN_ON(!list_empty(&req->queue)); kfree(req); } static void fifo_complete(struct usb_ep *ep, struct usb_request *req) { } static int dummy_queue(struct usb_ep *_ep, struct usb_request *_req, gfp_t mem_flags) { struct dummy_ep *ep; struct dummy_request *req; struct dummy *dum; struct dummy_hcd *dum_hcd; unsigned long flags; req = usb_request_to_dummy_request(_req); if (!_req || !list_empty(&req->queue) || !_req->complete) return -EINVAL; ep = usb_ep_to_dummy_ep(_ep); if (!_ep || (!ep->desc && _ep->name != ep0name)) return -EINVAL; dum = ep_to_dummy(ep); dum_hcd = gadget_to_dummy_hcd(&dum->gadget); if (!dum->driver || !is_enabled(dum_hcd)) return -ESHUTDOWN; #if 0 dev_dbg(udc_dev(dum), "ep %p queue req %p to %s, len %d buf %p\n", ep, _req, _ep->name, _req->length, _req->buf); #endif _req->status = -EINPROGRESS; _req->actual = 0; spin_lock_irqsave(&dum->lock, flags); /* implement an emulated single-request FIFO */ if (ep->desc && (ep->desc->bEndpointAddress & USB_DIR_IN) && list_empty(&dum->fifo_req.queue) && list_empty(&ep->queue) && _req->length <= FIFO_SIZE) { req = &dum->fifo_req; req->req = *_req; req->req.buf = dum->fifo_buf; memcpy(dum->fifo_buf, _req->buf, _req->length); req->req.context = dum; req->req.complete = fifo_complete; list_add_tail(&req->queue, &ep->queue); spin_unlock(&dum->lock); _req->actual = _req->length; _req->status = 0; usb_gadget_giveback_request(_ep, _req); spin_lock(&dum->lock); } else list_add_tail(&req->queue, &ep->queue); spin_unlock_irqrestore(&dum->lock, flags); /* real hardware would likely enable transfers here, in case * it'd been left NAKing. */ return 0; } static int dummy_dequeue(struct usb_ep *_ep, struct usb_request *_req) { struct dummy_ep *ep; struct dummy *dum; int retval = -EINVAL; unsigned long flags; struct dummy_request *req = NULL, *iter; if (!_ep || !_req) return retval; ep = usb_ep_to_dummy_ep(_ep); dum = ep_to_dummy(ep); if (!dum->driver) return -ESHUTDOWN; local_irq_save(flags); spin_lock(&dum->lock); list_for_each_entry(iter, &ep->queue, queue) { if (&iter->req != _req) continue; list_del_init(&iter->queue); _req->status = -ECONNRESET; req = iter; retval = 0; break; } spin_unlock(&dum->lock); if (retval == 0) { dev_dbg(udc_dev(dum), "dequeued req %p from %s, len %d buf %p\n", req, _ep->name, _req->length, _req->buf); usb_gadget_giveback_request(_ep, _req); } local_irq_restore(flags); return retval; } static int dummy_set_halt_and_wedge(struct usb_ep *_ep, int value, int wedged) { struct dummy_ep *ep; struct dummy *dum; if (!_ep) return -EINVAL; ep = usb_ep_to_dummy_ep(_ep); dum = ep_to_dummy(ep); if (!dum->driver) return -ESHUTDOWN; if (!value) ep->halted = ep->wedged = 0; else if (ep->desc && (ep->desc->bEndpointAddress & USB_DIR_IN) && !list_empty(&ep->queue)) return -EAGAIN; else { ep->halted = 1; if (wedged) ep->wedged = 1; } /* FIXME clear emulated data toggle too */ return 0; } static int dummy_set_halt(struct usb_ep *_ep, int value) { return dummy_set_halt_and_wedge(_ep, value, 0); } static int dummy_set_wedge(struct usb_ep *_ep) { if (!_ep || _ep->name == ep0name) return -EINVAL; return dummy_set_halt_and_wedge(_ep, 1, 1); } static const struct usb_ep_ops dummy_ep_ops = { .enable = dummy_enable, .disable = dummy_disable, .alloc_request = dummy_alloc_request, .free_request = dummy_free_request, .queue = dummy_queue, .dequeue = dummy_dequeue, .set_halt = dummy_set_halt, .set_wedge = dummy_set_wedge, }; /*-------------------------------------------------------------------------*/ /* there are both host and device side versions of this call ... */ static int dummy_g_get_frame(struct usb_gadget *_gadget) { struct timespec64 ts64; ktime_get_ts64(&ts64); return ts64.tv_nsec / NSEC_PER_MSEC; } static int dummy_wakeup(struct usb_gadget *_gadget) { struct dummy_hcd *dum_hcd; dum_hcd = gadget_to_dummy_hcd(_gadget); if (!(dum_hcd->dum->devstatus & ((1 << USB_DEVICE_B_HNP_ENABLE) | (1 << USB_DEVICE_REMOTE_WAKEUP)))) return -EINVAL; if ((dum_hcd->port_status & USB_PORT_STAT_CONNECTION) == 0) return -ENOLINK; if ((dum_hcd->port_status & USB_PORT_STAT_SUSPEND) == 0 && dum_hcd->rh_state != DUMMY_RH_SUSPENDED) return -EIO; /* FIXME: What if the root hub is suspended but the port isn't? */ /* hub notices our request, issues downstream resume, etc */ dum_hcd->resuming = 1; dum_hcd->re_timeout = jiffies + msecs_to_jiffies(20); mod_timer(&dummy_hcd_to_hcd(dum_hcd)->rh_timer, dum_hcd->re_timeout); return 0; } static int dummy_set_selfpowered(struct usb_gadget *_gadget, int value) { struct dummy *dum; _gadget->is_selfpowered = (value != 0); dum = gadget_to_dummy_hcd(_gadget)->dum; if (value) dum->devstatus |= (1 << USB_DEVICE_SELF_POWERED); else dum->devstatus &= ~(1 << USB_DEVICE_SELF_POWERED); return 0; } static void dummy_udc_update_ep0(struct dummy *dum) { if (dum->gadget.speed == USB_SPEED_SUPER) dum->ep[0].ep.maxpacket = 9; else dum->ep[0].ep.maxpacket = 64; } static int dummy_pullup(struct usb_gadget *_gadget, int value) { struct dummy_hcd *dum_hcd; struct dummy *dum; unsigned long flags; dum = gadget_dev_to_dummy(&_gadget->dev); dum_hcd = gadget_to_dummy_hcd(_gadget); spin_lock_irqsave(&dum->lock, flags); dum->pullup = (value != 0); set_link_state(dum_hcd); if (value == 0) { /* * Emulate synchronize_irq(): wait for callbacks to finish. * This seems to be the best place to emulate the call to * synchronize_irq() that's in usb_gadget_remove_driver(). * Doing it in dummy_udc_stop() would be too late since it * is called after the unbind callback and unbind shouldn't * be invoked until all the other callbacks are finished. */ while (dum->callback_usage > 0) { spin_unlock_irqrestore(&dum->lock, flags); usleep_range(1000, 2000); spin_lock_irqsave(&dum->lock, flags); } } spin_unlock_irqrestore(&dum->lock, flags); usb_hcd_poll_rh_status(dummy_hcd_to_hcd(dum_hcd)); return 0; } static void dummy_udc_set_speed(struct usb_gadget *_gadget, enum usb_device_speed speed) { struct dummy *dum; dum = gadget_dev_to_dummy(&_gadget->dev); dum->gadget.speed = speed; dummy_udc_update_ep0(dum); } static void dummy_udc_async_callbacks(struct usb_gadget *_gadget, bool enable) { struct dummy *dum = gadget_dev_to_dummy(&_gadget->dev); spin_lock_irq(&dum->lock); dum->ints_enabled = enable; spin_unlock_irq(&dum->lock); } static int dummy_udc_start(struct usb_gadget *g, struct usb_gadget_driver *driver); static int dummy_udc_stop(struct usb_gadget *g); static const struct usb_gadget_ops dummy_ops = { .get_frame = dummy_g_get_frame, .wakeup = dummy_wakeup, .set_selfpowered = dummy_set_selfpowered, .pullup = dummy_pullup, .udc_start = dummy_udc_start, .udc_stop = dummy_udc_stop, .udc_set_speed = dummy_udc_set_speed, .udc_async_callbacks = dummy_udc_async_callbacks, }; /*-------------------------------------------------------------------------*/ /* "function" sysfs attribute */ static ssize_t function_show(struct device *dev, struct device_attribute *attr, char *buf) { struct dummy *dum = gadget_dev_to_dummy(dev); if (!dum->driver || !dum->driver->function) return 0; return scnprintf(buf, PAGE_SIZE, "%s\n", dum->driver->function); } static DEVICE_ATTR_RO(function); /*-------------------------------------------------------------------------*/ /* * Driver registration/unregistration. * * This is basically hardware-specific; there's usually only one real USB * device (not host) controller since that's how USB devices are intended * to work. So most implementations of these api calls will rely on the * fact that only one driver will ever bind to the hardware. But curious * hardware can be built with discrete components, so the gadget API doesn't * require that assumption. * * For this emulator, it might be convenient to create a usb device * for each driver that registers: just add to a big root hub. */ static int dummy_udc_start(struct usb_gadget *g, struct usb_gadget_driver *driver) { struct dummy_hcd *dum_hcd = gadget_to_dummy_hcd(g); struct dummy *dum = dum_hcd->dum; switch (g->speed) { /* All the speeds we support */ case USB_SPEED_LOW: case USB_SPEED_FULL: case USB_SPEED_HIGH: case USB_SPEED_SUPER: break; default: dev_err(dummy_dev(dum_hcd), "Unsupported driver max speed %d\n", driver->max_speed); return -EINVAL; } /* * DEVICE side init ... the layer above hardware, which * can't enumerate without help from the driver we're binding. */ spin_lock_irq(&dum->lock); dum->devstatus = 0; dum->driver = driver; spin_unlock_irq(&dum->lock); return 0; } static int dummy_udc_stop(struct usb_gadget *g) { struct dummy_hcd *dum_hcd = gadget_to_dummy_hcd(g); struct dummy *dum = dum_hcd->dum; spin_lock_irq(&dum->lock); dum->ints_enabled = 0; stop_activity(dum); dum->driver = NULL; spin_unlock_irq(&dum->lock); return 0; } #undef is_enabled /* The gadget structure is stored inside the hcd structure and will be * released along with it. */ static void init_dummy_udc_hw(struct dummy *dum) { int i; INIT_LIST_HEAD(&dum->gadget.ep_list); for (i = 0; i < DUMMY_ENDPOINTS; i++) { struct dummy_ep *ep = &dum->ep[i]; if (!ep_info[i].name) break; ep->ep.name = ep_info[i].name; ep->ep.caps = ep_info[i].caps; ep->ep.ops = &dummy_ep_ops; list_add_tail(&ep->ep.ep_list, &dum->gadget.ep_list); ep->halted = ep->wedged = ep->already_seen = ep->setup_stage = 0; usb_ep_set_maxpacket_limit(&ep->ep, ~0); ep->ep.max_streams = 16; ep->last_io = jiffies; ep->gadget = &dum->gadget; ep->desc = NULL; INIT_LIST_HEAD(&ep->queue); } dum->gadget.ep0 = &dum->ep[0].ep; list_del_init(&dum->ep[0].ep.ep_list); INIT_LIST_HEAD(&dum->fifo_req.queue); #ifdef CONFIG_USB_OTG dum->gadget.is_otg = 1; #endif } static int dummy_udc_probe(struct platform_device *pdev) { struct dummy *dum; int rc; dum = *((void **)dev_get_platdata(&pdev->dev)); /* Clear usb_gadget region for new registration to udc-core */ memzero_explicit(&dum->gadget, sizeof(struct usb_gadget)); dum->gadget.name = gadget_name; dum->gadget.ops = &dummy_ops; if (mod_data.is_super_speed) dum->gadget.max_speed = USB_SPEED_SUPER; else if (mod_data.is_high_speed) dum->gadget.max_speed = USB_SPEED_HIGH; else dum->gadget.max_speed = USB_SPEED_FULL; dum->gadget.dev.parent = &pdev->dev; init_dummy_udc_hw(dum); rc = usb_add_gadget_udc(&pdev->dev, &dum->gadget); if (rc < 0) goto err_udc; rc = device_create_file(&dum->gadget.dev, &dev_attr_function); if (rc < 0) goto err_dev; platform_set_drvdata(pdev, dum); return rc; err_dev: usb_del_gadget_udc(&dum->gadget); err_udc: return rc; } static void dummy_udc_remove(struct platform_device *pdev) { struct dummy *dum = platform_get_drvdata(pdev); device_remove_file(&dum->gadget.dev, &dev_attr_function); usb_del_gadget_udc(&dum->gadget); } static void dummy_udc_pm(struct dummy *dum, struct dummy_hcd *dum_hcd, int suspend) { spin_lock_irq(&dum->lock); dum->udc_suspended = suspend; set_link_state(dum_hcd); spin_unlock_irq(&dum->lock); } static int dummy_udc_suspend(struct platform_device *pdev, pm_message_t state) { struct dummy *dum = platform_get_drvdata(pdev); struct dummy_hcd *dum_hcd = gadget_to_dummy_hcd(&dum->gadget); dev_dbg(&pdev->dev, "%s\n", __func__); dummy_udc_pm(dum, dum_hcd, 1); usb_hcd_poll_rh_status(dummy_hcd_to_hcd(dum_hcd)); return 0; } static int dummy_udc_resume(struct platform_device *pdev) { struct dummy *dum = platform_get_drvdata(pdev); struct dummy_hcd *dum_hcd = gadget_to_dummy_hcd(&dum->gadget); dev_dbg(&pdev->dev, "%s\n", __func__); dummy_udc_pm(dum, dum_hcd, 0); usb_hcd_poll_rh_status(dummy_hcd_to_hcd(dum_hcd)); return 0; } static struct platform_driver dummy_udc_driver = { .probe = dummy_udc_probe, .remove_new = dummy_udc_remove, .suspend = dummy_udc_suspend, .resume = dummy_udc_resume, .driver = { .name = gadget_name, }, }; /*-------------------------------------------------------------------------*/ static unsigned int dummy_get_ep_idx(const struct usb_endpoint_descriptor *desc) { unsigned int index; index = usb_endpoint_num(desc) << 1; if (usb_endpoint_dir_in(desc)) index |= 1; return index; } /* HOST SIDE DRIVER * * this uses the hcd framework to hook up to host side drivers. * its root hub will only have one device, otherwise it acts like * a normal host controller. * * when urbs are queued, they're just stuck on a list that we * scan in a timer callback. that callback connects writes from * the host with reads from the device, and so on, based on the * usb 2.0 rules. */ static int dummy_ep_stream_en(struct dummy_hcd *dum_hcd, struct urb *urb) { const struct usb_endpoint_descriptor *desc = &urb->ep->desc; u32 index; if (!usb_endpoint_xfer_bulk(desc)) return 0; index = dummy_get_ep_idx(desc); return (1 << index) & dum_hcd->stream_en_ep; } /* * The max stream number is saved as a nibble so for the 30 possible endpoints * we only 15 bytes of memory. Therefore we are limited to max 16 streams (0 * means we use only 1 stream). The maximum according to the spec is 16bit so * if the 16 stream limit is about to go, the array size should be incremented * to 30 elements of type u16. */ static int get_max_streams_for_pipe(struct dummy_hcd *dum_hcd, unsigned int pipe) { int max_streams; max_streams = dum_hcd->num_stream[usb_pipeendpoint(pipe)]; if (usb_pipeout(pipe)) max_streams >>= 4; else max_streams &= 0xf; max_streams++; return max_streams; } static void set_max_streams_for_pipe(struct dummy_hcd *dum_hcd, unsigned int pipe, unsigned int streams) { int max_streams; streams--; max_streams = dum_hcd->num_stream[usb_pipeendpoint(pipe)]; if (usb_pipeout(pipe)) { streams <<= 4; max_streams &= 0xf; } else { max_streams &= 0xf0; } max_streams |= streams; dum_hcd->num_stream[usb_pipeendpoint(pipe)] = max_streams; } static int dummy_validate_stream(struct dummy_hcd *dum_hcd, struct urb *urb) { unsigned int max_streams; int enabled; enabled = dummy_ep_stream_en(dum_hcd, urb); if (!urb->stream_id) { if (enabled) return -EINVAL; return 0; } if (!enabled) return -EINVAL; max_streams = get_max_streams_for_pipe(dum_hcd, usb_pipeendpoint(urb->pipe)); if (urb->stream_id > max_streams) { dev_err(dummy_dev(dum_hcd), "Stream id %d is out of range.\n", urb->stream_id); BUG(); return -EINVAL; } return 0; } static int dummy_urb_enqueue( struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flags ) { struct dummy_hcd *dum_hcd; struct urbp *urbp; unsigned long flags; int rc; urbp = kmalloc(sizeof *urbp, mem_flags); if (!urbp) return -ENOMEM; urbp->urb = urb; urbp->miter_started = 0; dum_hcd = hcd_to_dummy_hcd(hcd); spin_lock_irqsave(&dum_hcd->dum->lock, flags); rc = dummy_validate_stream(dum_hcd, urb); if (rc) { kfree(urbp); goto done; } rc = usb_hcd_link_urb_to_ep(hcd, urb); if (rc) { kfree(urbp); goto done; } if (!dum_hcd->udev) { dum_hcd->udev = urb->dev; usb_get_dev(dum_hcd->udev); } else if (unlikely(dum_hcd->udev != urb->dev)) dev_err(dummy_dev(dum_hcd), "usb_device address has changed!\n"); list_add_tail(&urbp->urbp_list, &dum_hcd->urbp_list); urb->hcpriv = urbp; if (!dum_hcd->next_frame_urbp) dum_hcd->next_frame_urbp = urbp; if (usb_pipetype(urb->pipe) == PIPE_CONTROL) urb->error_count = 1; /* mark as a new urb */ /* kick the scheduler, it'll do the rest */ if (!timer_pending(&dum_hcd->timer)) mod_timer(&dum_hcd->timer, jiffies + 1); done: spin_unlock_irqrestore(&dum_hcd->dum->lock, flags); return rc; } static int dummy_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status) { struct dummy_hcd *dum_hcd; unsigned long flags; int rc; /* giveback happens automatically in timer callback, * so make sure the callback happens */ dum_hcd = hcd_to_dummy_hcd(hcd); spin_lock_irqsave(&dum_hcd->dum->lock, flags); rc = usb_hcd_check_unlink_urb(hcd, urb, status); if (!rc && dum_hcd->rh_state != DUMMY_RH_RUNNING && !list_empty(&dum_hcd->urbp_list)) mod_timer(&dum_hcd->timer, jiffies); spin_unlock_irqrestore(&dum_hcd->dum->lock, flags); return rc; } static int dummy_perform_transfer(struct urb *urb, struct dummy_request *req, u32 len) { void *ubuf, *rbuf; struct urbp *urbp = urb->hcpriv; int to_host; struct sg_mapping_iter *miter = &urbp->miter; u32 trans = 0; u32 this_sg; bool next_sg; to_host = usb_urb_dir_in(urb); rbuf = req->req.buf + req->req.actual; if (!urb->num_sgs) { ubuf = urb->transfer_buffer + urb->actual_length; if (to_host) memcpy(ubuf, rbuf, len); else memcpy(rbuf, ubuf, len); return len; } if (!urbp->miter_started) { u32 flags = SG_MITER_ATOMIC; if (to_host) flags |= SG_MITER_TO_SG; else flags |= SG_MITER_FROM_SG; sg_miter_start(miter, urb->sg, urb->num_sgs, flags); urbp->miter_started = 1; } next_sg = sg_miter_next(miter); if (next_sg == false) { WARN_ON_ONCE(1); return -EINVAL; } do { ubuf = miter->addr; this_sg = min_t(u32, len, miter->length); miter->consumed = this_sg; trans += this_sg; if (to_host) memcpy(ubuf, rbuf, this_sg); else memcpy(rbuf, ubuf, this_sg); len -= this_sg; if (!len) break; next_sg = sg_miter_next(miter); if (next_sg == false) { WARN_ON_ONCE(1); return -EINVAL; } rbuf += this_sg; } while (1); sg_miter_stop(miter); return trans; } /* transfer up to a frame's worth; caller must own lock */ static int transfer(struct dummy_hcd *dum_hcd, struct urb *urb, struct dummy_ep *ep, int limit, int *status) { struct dummy *dum = dum_hcd->dum; struct dummy_request *req; int sent = 0; top: /* if there's no request queued, the device is NAKing; return */ list_for_each_entry(req, &ep->queue, queue) { unsigned host_len, dev_len, len; int is_short, to_host; int rescan = 0; if (dummy_ep_stream_en(dum_hcd, urb)) { if ((urb->stream_id != req->req.stream_id)) continue; } /* 1..N packets of ep->ep.maxpacket each ... the last one * may be short (including zero length). * * writer can send a zlp explicitly (length 0) or implicitly * (length mod maxpacket zero, and 'zero' flag); they always * terminate reads. */ host_len = urb->transfer_buffer_length - urb->actual_length; dev_len = req->req.length - req->req.actual; len = min(host_len, dev_len); /* FIXME update emulated data toggle too */ to_host = usb_urb_dir_in(urb); if (unlikely(len == 0)) is_short = 1; else { /* not enough bandwidth left? */ if (limit < ep->ep.maxpacket && limit < len) break; len = min_t(unsigned, len, limit); if (len == 0) break; /* send multiple of maxpacket first, then remainder */ if (len >= ep->ep.maxpacket) { is_short = 0; if (len % ep->ep.maxpacket) rescan = 1; len -= len % ep->ep.maxpacket; } else { is_short = 1; } len = dummy_perform_transfer(urb, req, len); ep->last_io = jiffies; if ((int)len < 0) { req->req.status = len; } else { limit -= len; sent += len; urb->actual_length += len; req->req.actual += len; } } /* short packets terminate, maybe with overflow/underflow. * it's only really an error to write too much. * * partially filling a buffer optionally blocks queue advances * (so completion handlers can clean up the queue) but we don't * need to emulate such data-in-flight. */ if (is_short) { if (host_len == dev_len) { req->req.status = 0; *status = 0; } else if (to_host) { req->req.status = 0; if (dev_len > host_len) *status = -EOVERFLOW; else *status = 0; } else { *status = 0; if (host_len > dev_len) req->req.status = -EOVERFLOW; else req->req.status = 0; } /* * many requests terminate without a short packet. * send a zlp if demanded by flags. */ } else { if (req->req.length == req->req.actual) { if (req->req.zero && to_host) rescan = 1; else req->req.status = 0; } if (urb->transfer_buffer_length == urb->actual_length) { if (urb->transfer_flags & URB_ZERO_PACKET && !to_host) rescan = 1; else *status = 0; } } /* device side completion --> continuable */ if (req->req.status != -EINPROGRESS) { list_del_init(&req->queue); spin_unlock(&dum->lock); usb_gadget_giveback_request(&ep->ep, &req->req); spin_lock(&dum->lock); /* requests might have been unlinked... */ rescan = 1; } /* host side completion --> terminate */ if (*status != -EINPROGRESS) break; /* rescan to continue with any other queued i/o */ if (rescan) goto top; } return sent; } static int periodic_bytes(struct dummy *dum, struct dummy_ep *ep) { int limit = ep->ep.maxpacket; if (dum->gadget.speed == USB_SPEED_HIGH) { int tmp; /* high bandwidth mode */ tmp = usb_endpoint_maxp_mult(ep->desc); tmp *= 8 /* applies to entire frame */; limit += limit * tmp; } if (dum->gadget.speed == USB_SPEED_SUPER) { switch (usb_endpoint_type(ep->desc)) { case USB_ENDPOINT_XFER_ISOC: /* Sec. 4.4.8.2 USB3.0 Spec */ limit = 3 * 16 * 1024 * 8; break; case USB_ENDPOINT_XFER_INT: /* Sec. 4.4.7.2 USB3.0 Spec */ limit = 3 * 1024 * 8; break; case USB_ENDPOINT_XFER_BULK: default: break; } } return limit; } #define is_active(dum_hcd) ((dum_hcd->port_status & \ (USB_PORT_STAT_CONNECTION | USB_PORT_STAT_ENABLE | \ USB_PORT_STAT_SUSPEND)) \ == (USB_PORT_STAT_CONNECTION | USB_PORT_STAT_ENABLE)) static struct dummy_ep *find_endpoint(struct dummy *dum, u8 address) { int i; if (!is_active((dum->gadget.speed == USB_SPEED_SUPER ? dum->ss_hcd : dum->hs_hcd))) return NULL; if (!dum->ints_enabled) return NULL; if ((address & ~USB_DIR_IN) == 0) return &dum->ep[0]; for (i = 1; i < DUMMY_ENDPOINTS; i++) { struct dummy_ep *ep = &dum->ep[i]; if (!ep->desc) continue; if (ep->desc->bEndpointAddress == address) return ep; } return NULL; } #undef is_active #define Dev_Request (USB_TYPE_STANDARD | USB_RECIP_DEVICE) #define Dev_InRequest (Dev_Request | USB_DIR_IN) #define Intf_Request (USB_TYPE_STANDARD | USB_RECIP_INTERFACE) #define Intf_InRequest (Intf_Request | USB_DIR_IN) #define Ep_Request (USB_TYPE_STANDARD | USB_RECIP_ENDPOINT) #define Ep_InRequest (Ep_Request | USB_DIR_IN) /** * handle_control_request() - handles all control transfers * @dum_hcd: pointer to dummy (the_controller) * @urb: the urb request to handle * @setup: pointer to the setup data for a USB device control * request * @status: pointer to request handling status * * Return 0 - if the request was handled * 1 - if the request wasn't handles * error code on error */ static int handle_control_request(struct dummy_hcd *dum_hcd, struct urb *urb, struct usb_ctrlrequest *setup, int *status) { struct dummy_ep *ep2; struct dummy *dum = dum_hcd->dum; int ret_val = 1; unsigned w_index; unsigned w_value; w_index = le16_to_cpu(setup->wIndex); w_value = le16_to_cpu(setup->wValue); switch (setup->bRequest) { case USB_REQ_SET_ADDRESS: if (setup->bRequestType != Dev_Request) break; dum->address = w_value; *status = 0; dev_dbg(udc_dev(dum), "set_address = %d\n", w_value); ret_val = 0; break; case USB_REQ_SET_FEATURE: if (setup->bRequestType == Dev_Request) { ret_val = 0; switch (w_value) { case USB_DEVICE_REMOTE_WAKEUP: break; case USB_DEVICE_B_HNP_ENABLE: dum->gadget.b_hnp_enable = 1; break; case USB_DEVICE_A_HNP_SUPPORT: dum->gadget.a_hnp_support = 1; break; case USB_DEVICE_A_ALT_HNP_SUPPORT: dum->gadget.a_alt_hnp_support = 1; break; case USB_DEVICE_U1_ENABLE: if (dummy_hcd_to_hcd(dum_hcd)->speed == HCD_USB3) w_value = USB_DEV_STAT_U1_ENABLED; else ret_val = -EOPNOTSUPP; break; case USB_DEVICE_U2_ENABLE: if (dummy_hcd_to_hcd(dum_hcd)->speed == HCD_USB3) w_value = USB_DEV_STAT_U2_ENABLED; else ret_val = -EOPNOTSUPP; break; case USB_DEVICE_LTM_ENABLE: if (dummy_hcd_to_hcd(dum_hcd)->speed == HCD_USB3) w_value = USB_DEV_STAT_LTM_ENABLED; else ret_val = -EOPNOTSUPP; break; default: ret_val = -EOPNOTSUPP; } if (ret_val == 0) { dum->devstatus |= (1 << w_value); *status = 0; } } else if (setup->bRequestType == Ep_Request) { /* endpoint halt */ ep2 = find_endpoint(dum, w_index); if (!ep2 || ep2->ep.name == ep0name) { ret_val = -EOPNOTSUPP; break; } ep2->halted = 1; ret_val = 0; *status = 0; } break; case USB_REQ_CLEAR_FEATURE: if (setup->bRequestType == Dev_Request) { ret_val = 0; switch (w_value) { case USB_DEVICE_REMOTE_WAKEUP: w_value = USB_DEVICE_REMOTE_WAKEUP; break; case USB_DEVICE_U1_ENABLE: if (dummy_hcd_to_hcd(dum_hcd)->speed == HCD_USB3) w_value = USB_DEV_STAT_U1_ENABLED; else ret_val = -EOPNOTSUPP; break; case USB_DEVICE_U2_ENABLE: if (dummy_hcd_to_hcd(dum_hcd)->speed == HCD_USB3) w_value = USB_DEV_STAT_U2_ENABLED; else ret_val = -EOPNOTSUPP; break; case USB_DEVICE_LTM_ENABLE: if (dummy_hcd_to_hcd(dum_hcd)->speed == HCD_USB3) w_value = USB_DEV_STAT_LTM_ENABLED; else ret_val = -EOPNOTSUPP; break; default: ret_val = -EOPNOTSUPP; break; } if (ret_val == 0) { dum->devstatus &= ~(1 << w_value); *status = 0; } } else if (setup->bRequestType == Ep_Request) { /* endpoint halt */ ep2 = find_endpoint(dum, w_index); if (!ep2) { ret_val = -EOPNOTSUPP; break; } if (!ep2->wedged) ep2->halted = 0; ret_val = 0; *status = 0; } break; case USB_REQ_GET_STATUS: if (setup->bRequestType == Dev_InRequest || setup->bRequestType == Intf_InRequest || setup->bRequestType == Ep_InRequest) { char *buf; /* * device: remote wakeup, selfpowered * interface: nothing * endpoint: halt */ buf = (char *)urb->transfer_buffer; if (urb->transfer_buffer_length > 0) { if (setup->bRequestType == Ep_InRequest) { ep2 = find_endpoint(dum, w_index); if (!ep2) { ret_val = -EOPNOTSUPP; break; } buf[0] = ep2->halted; } else if (setup->bRequestType == Dev_InRequest) { buf[0] = (u8)dum->devstatus; } else buf[0] = 0; } if (urb->transfer_buffer_length > 1) buf[1] = 0; urb->actual_length = min_t(u32, 2, urb->transfer_buffer_length); ret_val = 0; *status = 0; } break; } return ret_val; } /* * Drive both sides of the transfers; looks like irq handlers to both * drivers except that the callbacks are invoked from soft interrupt * context. */ static void dummy_timer(struct timer_list *t) { struct dummy_hcd *dum_hcd = from_timer(dum_hcd, t, timer); struct dummy *dum = dum_hcd->dum; struct urbp *urbp, *tmp; unsigned long flags; int limit, total; int i; /* simplistic model for one frame's bandwidth */ /* FIXME: account for transaction and packet overhead */ switch (dum->gadget.speed) { case USB_SPEED_LOW: total = 8/*bytes*/ * 12/*packets*/; break; case USB_SPEED_FULL: total = 64/*bytes*/ * 19/*packets*/; break; case USB_SPEED_HIGH: total = 512/*bytes*/ * 13/*packets*/ * 8/*uframes*/; break; case USB_SPEED_SUPER: /* Bus speed is 500000 bytes/ms, so use a little less */ total = 490000; break; default: /* Can't happen */ dev_err(dummy_dev(dum_hcd), "bogus device speed\n"); total = 0; break; } /* FIXME if HZ != 1000 this will probably misbehave ... */ /* look at each urb queued by the host side driver */ spin_lock_irqsave(&dum->lock, flags); if (!dum_hcd->udev) { dev_err(dummy_dev(dum_hcd), "timer fired with no URBs pending?\n"); spin_unlock_irqrestore(&dum->lock, flags); return; } dum_hcd->next_frame_urbp = NULL; for (i = 0; i < DUMMY_ENDPOINTS; i++) { if (!ep_info[i].name) break; dum->ep[i].already_seen = 0; } restart: list_for_each_entry_safe(urbp, tmp, &dum_hcd->urbp_list, urbp_list) { struct urb *urb; struct dummy_request *req; u8 address; struct dummy_ep *ep = NULL; int status = -EINPROGRESS; /* stop when we reach URBs queued after the timer interrupt */ if (urbp == dum_hcd->next_frame_urbp) break; urb = urbp->urb; if (urb->unlinked) goto return_urb; else if (dum_hcd->rh_state != DUMMY_RH_RUNNING) continue; /* Used up this frame's bandwidth? */ if (total <= 0) continue; /* find the gadget's ep for this request (if configured) */ address = usb_pipeendpoint (urb->pipe); if (usb_urb_dir_in(urb)) address |= USB_DIR_IN; ep = find_endpoint(dum, address); if (!ep) { /* set_configuration() disagreement */ dev_dbg(dummy_dev(dum_hcd), "no ep configured for urb %p\n", urb); status = -EPROTO; goto return_urb; } if (ep->already_seen) continue; ep->already_seen = 1; if (ep == &dum->ep[0] && urb->error_count) { ep->setup_stage = 1; /* a new urb */ urb->error_count = 0; } if (ep->halted && !ep->setup_stage) { /* NOTE: must not be iso! */ dev_dbg(dummy_dev(dum_hcd), "ep %s halted, urb %p\n", ep->ep.name, urb); status = -EPIPE; goto return_urb; } /* FIXME make sure both ends agree on maxpacket */ /* handle control requests */ if (ep == &dum->ep[0] && ep->setup_stage) { struct usb_ctrlrequest setup; int value; setup = *(struct usb_ctrlrequest *) urb->setup_packet; /* paranoia, in case of stale queued data */ list_for_each_entry(req, &ep->queue, queue) { list_del_init(&req->queue); req->req.status = -EOVERFLOW; dev_dbg(udc_dev(dum), "stale req = %p\n", req); spin_unlock(&dum->lock); usb_gadget_giveback_request(&ep->ep, &req->req); spin_lock(&dum->lock); ep->already_seen = 0; goto restart; } /* gadget driver never sees set_address or operations * on standard feature flags. some hardware doesn't * even expose them. */ ep->last_io = jiffies; ep->setup_stage = 0; ep->halted = 0; value = handle_control_request(dum_hcd, urb, &setup, &status); /* gadget driver handles all other requests. block * until setup() returns; no reentrancy issues etc. */ if (value > 0) { ++dum->callback_usage; spin_unlock(&dum->lock); value = dum->driver->setup(&dum->gadget, &setup); spin_lock(&dum->lock); --dum->callback_usage; if (value >= 0) { /* no delays (max 64KB data stage) */ limit = 64*1024; goto treat_control_like_bulk; } /* error, see below */ } if (value < 0) { if (value != -EOPNOTSUPP) dev_dbg(udc_dev(dum), "setup --> %d\n", value); status = -EPIPE; urb->actual_length = 0; } goto return_urb; } /* non-control requests */ limit = total; switch (usb_pipetype(urb->pipe)) { case PIPE_ISOCHRONOUS: /* * We don't support isochronous. But if we did, * here are some of the issues we'd have to face: * * Is it urb->interval since the last xfer? * Use urb->iso_frame_desc[i]. * Complete whether or not ep has requests queued. * Report random errors, to debug drivers. */ limit = max(limit, periodic_bytes(dum, ep)); status = -EINVAL; /* fail all xfers */ break; case PIPE_INTERRUPT: /* FIXME is it urb->interval since the last xfer? * this almost certainly polls too fast. */ limit = max(limit, periodic_bytes(dum, ep)); fallthrough; default: treat_control_like_bulk: ep->last_io = jiffies; total -= transfer(dum_hcd, urb, ep, limit, &status); break; } /* incomplete transfer? */ if (status == -EINPROGRESS) continue; return_urb: list_del(&urbp->urbp_list); kfree(urbp); if (ep) ep->already_seen = ep->setup_stage = 0; usb_hcd_unlink_urb_from_ep(dummy_hcd_to_hcd(dum_hcd), urb); spin_unlock(&dum->lock); usb_hcd_giveback_urb(dummy_hcd_to_hcd(dum_hcd), urb, status); spin_lock(&dum->lock); goto restart; } if (list_empty(&dum_hcd->urbp_list)) { usb_put_dev(dum_hcd->udev); dum_hcd->udev = NULL; } else if (dum_hcd->rh_state == DUMMY_RH_RUNNING) { /* want a 1 msec delay here */ mod_timer(&dum_hcd->timer, jiffies + msecs_to_jiffies(1)); } spin_unlock_irqrestore(&dum->lock, flags); } /*-------------------------------------------------------------------------*/ #define PORT_C_MASK \ ((USB_PORT_STAT_C_CONNECTION \ | USB_PORT_STAT_C_ENABLE \ | USB_PORT_STAT_C_SUSPEND \ | USB_PORT_STAT_C_OVERCURRENT \ | USB_PORT_STAT_C_RESET) << 16) static int dummy_hub_status(struct usb_hcd *hcd, char *buf) { struct dummy_hcd *dum_hcd; unsigned long flags; int retval = 0; dum_hcd = hcd_to_dummy_hcd(hcd); spin_lock_irqsave(&dum_hcd->dum->lock, flags); if (!HCD_HW_ACCESSIBLE(hcd)) goto done; if (dum_hcd->resuming && time_after_eq(jiffies, dum_hcd->re_timeout)) { dum_hcd->port_status |= (USB_PORT_STAT_C_SUSPEND << 16); dum_hcd->port_status &= ~USB_PORT_STAT_SUSPEND; set_link_state(dum_hcd); } if ((dum_hcd->port_status & PORT_C_MASK) != 0) { *buf = (1 << 1); dev_dbg(dummy_dev(dum_hcd), "port status 0x%08x has changes\n", dum_hcd->port_status); retval = 1; if (dum_hcd->rh_state == DUMMY_RH_SUSPENDED) usb_hcd_resume_root_hub(hcd); } done: spin_unlock_irqrestore(&dum_hcd->dum->lock, flags); return retval; } /* usb 3.0 root hub device descriptor */ static struct { struct usb_bos_descriptor bos; struct usb_ss_cap_descriptor ss_cap; } __packed usb3_bos_desc = { .bos = { .bLength = USB_DT_BOS_SIZE, .bDescriptorType = USB_DT_BOS, .wTotalLength = cpu_to_le16(sizeof(usb3_bos_desc)), .bNumDeviceCaps = 1, }, .ss_cap = { .bLength = USB_DT_USB_SS_CAP_SIZE, .bDescriptorType = USB_DT_DEVICE_CAPABILITY, .bDevCapabilityType = USB_SS_CAP_TYPE, .wSpeedSupported = cpu_to_le16(USB_5GBPS_OPERATION), .bFunctionalitySupport = ilog2(USB_5GBPS_OPERATION), }, }; static inline void ss_hub_descriptor(struct usb_hub_descriptor *desc) { memset(desc, 0, sizeof *desc); desc->bDescriptorType = USB_DT_SS_HUB; desc->bDescLength = 12; desc->wHubCharacteristics = cpu_to_le16( HUB_CHAR_INDV_PORT_LPSM | HUB_CHAR_COMMON_OCPM); desc->bNbrPorts = 1; desc->u.ss.bHubHdrDecLat = 0x04; /* Worst case: 0.4 micro sec*/ desc->u.ss.DeviceRemovable = 0; } static inline void hub_descriptor(struct usb_hub_descriptor *desc) { memset(desc, 0, sizeof *desc); desc->bDescriptorType = USB_DT_HUB; desc->bDescLength = 9; desc->wHubCharacteristics = cpu_to_le16( HUB_CHAR_INDV_PORT_LPSM | HUB_CHAR_COMMON_OCPM); desc->bNbrPorts = 1; desc->u.hs.DeviceRemovable[0] = 0; desc->u.hs.DeviceRemovable[1] = 0xff; /* PortPwrCtrlMask */ } static int dummy_hub_control( struct usb_hcd *hcd, u16 typeReq, u16 wValue, u16 wIndex, char *buf, u16 wLength ) { struct dummy_hcd *dum_hcd; int retval = 0; unsigned long flags; if (!HCD_HW_ACCESSIBLE(hcd)) return -ETIMEDOUT; dum_hcd = hcd_to_dummy_hcd(hcd); spin_lock_irqsave(&dum_hcd->dum->lock, flags); switch (typeReq) { case ClearHubFeature: break; case ClearPortFeature: switch (wValue) { case USB_PORT_FEAT_SUSPEND: if (hcd->speed == HCD_USB3) { dev_dbg(dummy_dev(dum_hcd), "USB_PORT_FEAT_SUSPEND req not " "supported for USB 3.0 roothub\n"); goto error; } if (dum_hcd->port_status & USB_PORT_STAT_SUSPEND) { /* 20msec resume signaling */ dum_hcd->resuming = 1; dum_hcd->re_timeout = jiffies + msecs_to_jiffies(20); } break; case USB_PORT_FEAT_POWER: dev_dbg(dummy_dev(dum_hcd), "power-off\n"); if (hcd->speed == HCD_USB3) dum_hcd->port_status &= ~USB_SS_PORT_STAT_POWER; else dum_hcd->port_status &= ~USB_PORT_STAT_POWER; set_link_state(dum_hcd); break; case USB_PORT_FEAT_ENABLE: case USB_PORT_FEAT_C_ENABLE: case USB_PORT_FEAT_C_SUSPEND: /* Not allowed for USB-3 */ if (hcd->speed == HCD_USB3) goto error; fallthrough; case USB_PORT_FEAT_C_CONNECTION: case USB_PORT_FEAT_C_RESET: dum_hcd->port_status &= ~(1 << wValue); set_link_state(dum_hcd); break; default: /* Disallow INDICATOR and C_OVER_CURRENT */ goto error; } break; case GetHubDescriptor: if (hcd->speed == HCD_USB3 && (wLength < USB_DT_SS_HUB_SIZE || wValue != (USB_DT_SS_HUB << 8))) { dev_dbg(dummy_dev(dum_hcd), "Wrong hub descriptor type for " "USB 3.0 roothub.\n"); goto error; } if (hcd->speed == HCD_USB3) ss_hub_descriptor((struct usb_hub_descriptor *) buf); else hub_descriptor((struct usb_hub_descriptor *) buf); break; case DeviceRequest | USB_REQ_GET_DESCRIPTOR: if (hcd->speed != HCD_USB3) goto error; if ((wValue >> 8) != USB_DT_BOS) goto error; memcpy(buf, &usb3_bos_desc, sizeof(usb3_bos_desc)); retval = sizeof(usb3_bos_desc); break; case GetHubStatus: *(__le32 *) buf = cpu_to_le32(0); break; case GetPortStatus: if (wIndex != 1) retval = -EPIPE; /* whoever resets or resumes must GetPortStatus to * complete it!! */ if (dum_hcd->resuming && time_after_eq(jiffies, dum_hcd->re_timeout)) { dum_hcd->port_status |= (USB_PORT_STAT_C_SUSPEND << 16); dum_hcd->port_status &= ~USB_PORT_STAT_SUSPEND; } if ((dum_hcd->port_status & USB_PORT_STAT_RESET) != 0 && time_after_eq(jiffies, dum_hcd->re_timeout)) { dum_hcd->port_status |= (USB_PORT_STAT_C_RESET << 16); dum_hcd->port_status &= ~USB_PORT_STAT_RESET; if (dum_hcd->dum->pullup) { dum_hcd->port_status |= USB_PORT_STAT_ENABLE; if (hcd->speed < HCD_USB3) { switch (dum_hcd->dum->gadget.speed) { case USB_SPEED_HIGH: dum_hcd->port_status |= USB_PORT_STAT_HIGH_SPEED; break; case USB_SPEED_LOW: dum_hcd->dum->gadget.ep0-> maxpacket = 8; dum_hcd->port_status |= USB_PORT_STAT_LOW_SPEED; break; default: break; } } } } set_link_state(dum_hcd); ((__le16 *) buf)[0] = cpu_to_le16(dum_hcd->port_status); ((__le16 *) buf)[1] = cpu_to_le16(dum_hcd->port_status >> 16); break; case SetHubFeature: retval = -EPIPE; break; case SetPortFeature: switch (wValue) { case USB_PORT_FEAT_LINK_STATE: if (hcd->speed != HCD_USB3) { dev_dbg(dummy_dev(dum_hcd), "USB_PORT_FEAT_LINK_STATE req not " "supported for USB 2.0 roothub\n"); goto error; } /* * Since this is dummy we don't have an actual link so * there is nothing to do for the SET_LINK_STATE cmd */ break; case USB_PORT_FEAT_U1_TIMEOUT: case USB_PORT_FEAT_U2_TIMEOUT: /* TODO: add suspend/resume support! */ if (hcd->speed != HCD_USB3) { dev_dbg(dummy_dev(dum_hcd), "USB_PORT_FEAT_U1/2_TIMEOUT req not " "supported for USB 2.0 roothub\n"); goto error; } break; case USB_PORT_FEAT_SUSPEND: /* Applicable only for USB2.0 hub */ if (hcd->speed == HCD_USB3) { dev_dbg(dummy_dev(dum_hcd), "USB_PORT_FEAT_SUSPEND req not " "supported for USB 3.0 roothub\n"); goto error; } if (dum_hcd->active) { dum_hcd->port_status |= USB_PORT_STAT_SUSPEND; /* HNP would happen here; for now we * assume b_bus_req is always true. */ set_link_state(dum_hcd); if (((1 << USB_DEVICE_B_HNP_ENABLE) & dum_hcd->dum->devstatus) != 0) dev_dbg(dummy_dev(dum_hcd), "no HNP yet!\n"); } break; case USB_PORT_FEAT_POWER: if (hcd->speed == HCD_USB3) dum_hcd->port_status |= USB_SS_PORT_STAT_POWER; else dum_hcd->port_status |= USB_PORT_STAT_POWER; set_link_state(dum_hcd); break; case USB_PORT_FEAT_BH_PORT_RESET: /* Applicable only for USB3.0 hub */ if (hcd->speed != HCD_USB3) { dev_dbg(dummy_dev(dum_hcd), "USB_PORT_FEAT_BH_PORT_RESET req not " "supported for USB 2.0 roothub\n"); goto error; } fallthrough; case USB_PORT_FEAT_RESET: if (!(dum_hcd->port_status & USB_PORT_STAT_CONNECTION)) break; /* if it's already enabled, disable */ if (hcd->speed == HCD_USB3) { dum_hcd->port_status = (USB_SS_PORT_STAT_POWER | USB_PORT_STAT_CONNECTION | USB_PORT_STAT_RESET); } else { dum_hcd->port_status &= ~(USB_PORT_STAT_ENABLE | USB_PORT_STAT_LOW_SPEED | USB_PORT_STAT_HIGH_SPEED); dum_hcd->port_status |= USB_PORT_STAT_RESET; } /* * We want to reset device status. All but the * Self powered feature */ dum_hcd->dum->devstatus &= (1 << USB_DEVICE_SELF_POWERED); /* * FIXME USB3.0: what is the correct reset signaling * interval? Is it still 50msec as for HS? */ dum_hcd->re_timeout = jiffies + msecs_to_jiffies(50); set_link_state(dum_hcd); break; case USB_PORT_FEAT_C_CONNECTION: case USB_PORT_FEAT_C_RESET: case USB_PORT_FEAT_C_ENABLE: case USB_PORT_FEAT_C_SUSPEND: /* Not allowed for USB-3, and ignored for USB-2 */ if (hcd->speed == HCD_USB3) goto error; break; default: /* Disallow TEST, INDICATOR, and C_OVER_CURRENT */ goto error; } break; case GetPortErrorCount: if (hcd->speed != HCD_USB3) { dev_dbg(dummy_dev(dum_hcd), "GetPortErrorCount req not " "supported for USB 2.0 roothub\n"); goto error; } /* We'll always return 0 since this is a dummy hub */ *(__le32 *) buf = cpu_to_le32(0); break; case SetHubDepth: if (hcd->speed != HCD_USB3) { dev_dbg(dummy_dev(dum_hcd), "SetHubDepth req not supported for " "USB 2.0 roothub\n"); goto error; } break; default: dev_dbg(dummy_dev(dum_hcd), "hub control req%04x v%04x i%04x l%d\n", typeReq, wValue, wIndex, wLength); error: /* "protocol stall" on error */ retval = -EPIPE; } spin_unlock_irqrestore(&dum_hcd->dum->lock, flags); if ((dum_hcd->port_status & PORT_C_MASK) != 0) usb_hcd_poll_rh_status(hcd); return retval; } static int dummy_bus_suspend(struct usb_hcd *hcd) { struct dummy_hcd *dum_hcd = hcd_to_dummy_hcd(hcd); dev_dbg(&hcd->self.root_hub->dev, "%s\n", __func__); spin_lock_irq(&dum_hcd->dum->lock); dum_hcd->rh_state = DUMMY_RH_SUSPENDED; set_link_state(dum_hcd); hcd->state = HC_STATE_SUSPENDED; spin_unlock_irq(&dum_hcd->dum->lock); return 0; } static int dummy_bus_resume(struct usb_hcd *hcd) { struct dummy_hcd *dum_hcd = hcd_to_dummy_hcd(hcd); int rc = 0; dev_dbg(&hcd->self.root_hub->dev, "%s\n", __func__); spin_lock_irq(&dum_hcd->dum->lock); if (!HCD_HW_ACCESSIBLE(hcd)) { rc = -ESHUTDOWN; } else { dum_hcd->rh_state = DUMMY_RH_RUNNING; set_link_state(dum_hcd); if (!list_empty(&dum_hcd->urbp_list)) mod_timer(&dum_hcd->timer, jiffies); hcd->state = HC_STATE_RUNNING; } spin_unlock_irq(&dum_hcd->dum->lock); return rc; } /*-------------------------------------------------------------------------*/ static inline ssize_t show_urb(char *buf, size_t size, struct urb *urb) { int ep = usb_pipeendpoint(urb->pipe); return scnprintf(buf, size, "urb/%p %s ep%d%s%s len %d/%d\n", urb, ({ char *s; switch (urb->dev->speed) { case USB_SPEED_LOW: s = "ls"; break; case USB_SPEED_FULL: s = "fs"; break; case USB_SPEED_HIGH: s = "hs"; break; case USB_SPEED_SUPER: s = "ss"; break; default: s = "?"; break; } s; }), ep, ep ? (usb_urb_dir_in(urb) ? "in" : "out") : "", ({ char *s; \ switch (usb_pipetype(urb->pipe)) { \ case PIPE_CONTROL: \ s = ""; \ break; \ case PIPE_BULK: \ s = "-bulk"; \ break; \ case PIPE_INTERRUPT: \ s = "-int"; \ break; \ default: \ s = "-iso"; \ break; \ } s; }), urb->actual_length, urb->transfer_buffer_length); } static ssize_t urbs_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_hcd *hcd = dev_get_drvdata(dev); struct dummy_hcd *dum_hcd = hcd_to_dummy_hcd(hcd); struct urbp *urbp; size_t size = 0; unsigned long flags; spin_lock_irqsave(&dum_hcd->dum->lock, flags); list_for_each_entry(urbp, &dum_hcd->urbp_list, urbp_list) { size_t temp; temp = show_urb(buf, PAGE_SIZE - size, urbp->urb); buf += temp; size += temp; } spin_unlock_irqrestore(&dum_hcd->dum->lock, flags); return size; } static DEVICE_ATTR_RO(urbs); static int dummy_start_ss(struct dummy_hcd *dum_hcd) { timer_setup(&dum_hcd->timer, dummy_timer, 0); dum_hcd->rh_state = DUMMY_RH_RUNNING; dum_hcd->stream_en_ep = 0; INIT_LIST_HEAD(&dum_hcd->urbp_list); dummy_hcd_to_hcd(dum_hcd)->power_budget = POWER_BUDGET_3; dummy_hcd_to_hcd(dum_hcd)->state = HC_STATE_RUNNING; dummy_hcd_to_hcd(dum_hcd)->uses_new_polling = 1; #ifdef CONFIG_USB_OTG dummy_hcd_to_hcd(dum_hcd)->self.otg_port = 1; #endif return 0; /* FIXME 'urbs' should be a per-device thing, maybe in usbcore */ return device_create_file(dummy_dev(dum_hcd), &dev_attr_urbs); } static int dummy_start(struct usb_hcd *hcd) { struct dummy_hcd *dum_hcd = hcd_to_dummy_hcd(hcd); /* * HOST side init ... we emulate a root hub that'll only ever * talk to one device (the gadget side). Also appears in sysfs, * just like more familiar pci-based HCDs. */ if (!usb_hcd_is_primary_hcd(hcd)) return dummy_start_ss(dum_hcd); spin_lock_init(&dum_hcd->dum->lock); timer_setup(&dum_hcd->timer, dummy_timer, 0); dum_hcd->rh_state = DUMMY_RH_RUNNING; INIT_LIST_HEAD(&dum_hcd->urbp_list); hcd->power_budget = POWER_BUDGET; hcd->state = HC_STATE_RUNNING; hcd->uses_new_polling = 1; #ifdef CONFIG_USB_OTG hcd->self.otg_port = 1; #endif /* FIXME 'urbs' should be a per-device thing, maybe in usbcore */ return device_create_file(dummy_dev(dum_hcd), &dev_attr_urbs); } static void dummy_stop(struct usb_hcd *hcd) { device_remove_file(dummy_dev(hcd_to_dummy_hcd(hcd)), &dev_attr_urbs); dev_info(dummy_dev(hcd_to_dummy_hcd(hcd)), "stopped\n"); } /*-------------------------------------------------------------------------*/ static int dummy_h_get_frame(struct usb_hcd *hcd) { return dummy_g_get_frame(NULL); } static int dummy_setup(struct usb_hcd *hcd) { struct dummy *dum; dum = *((void **)dev_get_platdata(hcd->self.controller)); hcd->self.sg_tablesize = ~0; if (usb_hcd_is_primary_hcd(hcd)) { dum->hs_hcd = hcd_to_dummy_hcd(hcd); dum->hs_hcd->dum = dum; /* * Mark the first roothub as being USB 2.0. * The USB 3.0 roothub will be registered later by * dummy_hcd_probe() */ hcd->speed = HCD_USB2; hcd->self.root_hub->speed = USB_SPEED_HIGH; } else { dum->ss_hcd = hcd_to_dummy_hcd(hcd); dum->ss_hcd->dum = dum; hcd->speed = HCD_USB3; hcd->self.root_hub->speed = USB_SPEED_SUPER; } return 0; } /* Change a group of bulk endpoints to support multiple stream IDs */ static int dummy_alloc_streams(struct usb_hcd *hcd, struct usb_device *udev, struct usb_host_endpoint **eps, unsigned int num_eps, unsigned int num_streams, gfp_t mem_flags) { struct dummy_hcd *dum_hcd = hcd_to_dummy_hcd(hcd); unsigned long flags; int max_stream; int ret_streams = num_streams; unsigned int index; unsigned int i; if (!num_eps) return -EINVAL; spin_lock_irqsave(&dum_hcd->dum->lock, flags); for (i = 0; i < num_eps; i++) { index = dummy_get_ep_idx(&eps[i]->desc); if ((1 << index) & dum_hcd->stream_en_ep) { ret_streams = -EINVAL; goto out; } max_stream = usb_ss_max_streams(&eps[i]->ss_ep_comp); if (!max_stream) { ret_streams = -EINVAL; goto out; } if (max_stream < ret_streams) { dev_dbg(dummy_dev(dum_hcd), "Ep 0x%x only supports %u " "stream IDs.\n", eps[i]->desc.bEndpointAddress, max_stream); ret_streams = max_stream; } } for (i = 0; i < num_eps; i++) { index = dummy_get_ep_idx(&eps[i]->desc); dum_hcd->stream_en_ep |= 1 << index; set_max_streams_for_pipe(dum_hcd, usb_endpoint_num(&eps[i]->desc), ret_streams); } out: spin_unlock_irqrestore(&dum_hcd->dum->lock, flags); return ret_streams; } /* Reverts a group of bulk endpoints back to not using stream IDs. */ static int dummy_free_streams(struct usb_hcd *hcd, struct usb_device *udev, struct usb_host_endpoint **eps, unsigned int num_eps, gfp_t mem_flags) { struct dummy_hcd *dum_hcd = hcd_to_dummy_hcd(hcd); unsigned long flags; int ret; unsigned int index; unsigned int i; spin_lock_irqsave(&dum_hcd->dum->lock, flags); for (i = 0; i < num_eps; i++) { index = dummy_get_ep_idx(&eps[i]->desc); if (!((1 << index) & dum_hcd->stream_en_ep)) { ret = -EINVAL; goto out; } } for (i = 0; i < num_eps; i++) { index = dummy_get_ep_idx(&eps[i]->desc); dum_hcd->stream_en_ep &= ~(1 << index); set_max_streams_for_pipe(dum_hcd, usb_endpoint_num(&eps[i]->desc), 0); } ret = 0; out: spin_unlock_irqrestore(&dum_hcd->dum->lock, flags); return ret; } static struct hc_driver dummy_hcd = { .description = (char *) driver_name, .product_desc = "Dummy host controller", .hcd_priv_size = sizeof(struct dummy_hcd), .reset = dummy_setup, .start = dummy_start, .stop = dummy_stop, .urb_enqueue = dummy_urb_enqueue, .urb_dequeue = dummy_urb_dequeue, .get_frame_number = dummy_h_get_frame, .hub_status_data = dummy_hub_status, .hub_control = dummy_hub_control, .bus_suspend = dummy_bus_suspend, .bus_resume = dummy_bus_resume, .alloc_streams = dummy_alloc_streams, .free_streams = dummy_free_streams, }; static int dummy_hcd_probe(struct platform_device *pdev) { struct dummy *dum; struct usb_hcd *hs_hcd; struct usb_hcd *ss_hcd; int retval; dev_info(&pdev->dev, "%s, driver " DRIVER_VERSION "\n", driver_desc); dum = *((void **)dev_get_platdata(&pdev->dev)); if (mod_data.is_super_speed) dummy_hcd.flags = HCD_USB3 | HCD_SHARED; else if (mod_data.is_high_speed) dummy_hcd.flags = HCD_USB2; else dummy_hcd.flags = HCD_USB11; hs_hcd = usb_create_hcd(&dummy_hcd, &pdev->dev, dev_name(&pdev->dev)); if (!hs_hcd) return -ENOMEM; hs_hcd->has_tt = 1; retval = usb_add_hcd(hs_hcd, 0, 0); if (retval) goto put_usb2_hcd; if (mod_data.is_super_speed) { ss_hcd = usb_create_shared_hcd(&dummy_hcd, &pdev->dev, dev_name(&pdev->dev), hs_hcd); if (!ss_hcd) { retval = -ENOMEM; goto dealloc_usb2_hcd; } retval = usb_add_hcd(ss_hcd, 0, 0); if (retval) goto put_usb3_hcd; } return 0; put_usb3_hcd: usb_put_hcd(ss_hcd); dealloc_usb2_hcd: usb_remove_hcd(hs_hcd); put_usb2_hcd: usb_put_hcd(hs_hcd); dum->hs_hcd = dum->ss_hcd = NULL; return retval; } static void dummy_hcd_remove(struct platform_device *pdev) { struct dummy *dum; dum = hcd_to_dummy_hcd(platform_get_drvdata(pdev))->dum; if (dum->ss_hcd) { usb_remove_hcd(dummy_hcd_to_hcd(dum->ss_hcd)); usb_put_hcd(dummy_hcd_to_hcd(dum->ss_hcd)); } usb_remove_hcd(dummy_hcd_to_hcd(dum->hs_hcd)); usb_put_hcd(dummy_hcd_to_hcd(dum->hs_hcd)); dum->hs_hcd = NULL; dum->ss_hcd = NULL; } static int dummy_hcd_suspend(struct platform_device *pdev, pm_message_t state) { struct usb_hcd *hcd; struct dummy_hcd *dum_hcd; int rc = 0; dev_dbg(&pdev->dev, "%s\n", __func__); hcd = platform_get_drvdata(pdev); dum_hcd = hcd_to_dummy_hcd(hcd); if (dum_hcd->rh_state == DUMMY_RH_RUNNING) { dev_warn(&pdev->dev, "Root hub isn't suspended!\n"); rc = -EBUSY; } else clear_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags); return rc; } static int dummy_hcd_resume(struct platform_device *pdev) { struct usb_hcd *hcd; dev_dbg(&pdev->dev, "%s\n", __func__); hcd = platform_get_drvdata(pdev); set_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags); usb_hcd_poll_rh_status(hcd); return 0; } static struct platform_driver dummy_hcd_driver = { .probe = dummy_hcd_probe, .remove_new = dummy_hcd_remove, .suspend = dummy_hcd_suspend, .resume = dummy_hcd_resume, .driver = { .name = driver_name, }, }; /*-------------------------------------------------------------------------*/ #define MAX_NUM_UDC 32 static struct platform_device *the_udc_pdev[MAX_NUM_UDC]; static struct platform_device *the_hcd_pdev[MAX_NUM_UDC]; static int __init dummy_hcd_init(void) { int retval = -ENOMEM; int i; struct dummy *dum[MAX_NUM_UDC] = {}; if (usb_disabled()) return -ENODEV; if (!mod_data.is_high_speed && mod_data.is_super_speed) return -EINVAL; if (mod_data.num < 1 || mod_data.num > MAX_NUM_UDC) { pr_err("Number of emulated UDC must be in range of 1...%d\n", MAX_NUM_UDC); return -EINVAL; } for (i = 0; i < mod_data.num; i++) { the_hcd_pdev[i] = platform_device_alloc(driver_name, i); if (!the_hcd_pdev[i]) { i--; while (i >= 0) platform_device_put(the_hcd_pdev[i--]); return retval; } } for (i = 0; i < mod_data.num; i++) { the_udc_pdev[i] = platform_device_alloc(gadget_name, i); if (!the_udc_pdev[i]) { i--; while (i >= 0) platform_device_put(the_udc_pdev[i--]); goto err_alloc_udc; } } for (i = 0; i < mod_data.num; i++) { dum[i] = kzalloc(sizeof(struct dummy), GFP_KERNEL); if (!dum[i]) { retval = -ENOMEM; goto err_add_pdata; } retval = platform_device_add_data(the_hcd_pdev[i], &dum[i], sizeof(void *)); if (retval) goto err_add_pdata; retval = platform_device_add_data(the_udc_pdev[i], &dum[i], sizeof(void *)); if (retval) goto err_add_pdata; } retval = platform_driver_register(&dummy_hcd_driver); if (retval < 0) goto err_add_pdata; retval = platform_driver_register(&dummy_udc_driver); if (retval < 0) goto err_register_udc_driver; for (i = 0; i < mod_data.num; i++) { retval = platform_device_add(the_hcd_pdev[i]); if (retval < 0) { i--; while (i >= 0) platform_device_del(the_hcd_pdev[i--]); goto err_add_hcd; } } for (i = 0; i < mod_data.num; i++) { if (!dum[i]->hs_hcd || (!dum[i]->ss_hcd && mod_data.is_super_speed)) { /* * The hcd was added successfully but its probe * function failed for some reason. */ retval = -EINVAL; goto err_add_udc; } } for (i = 0; i < mod_data.num; i++) { retval = platform_device_add(the_udc_pdev[i]); if (retval < 0) { i--; while (i >= 0) platform_device_del(the_udc_pdev[i--]); goto err_add_udc; } } for (i = 0; i < mod_data.num; i++) { if (!platform_get_drvdata(the_udc_pdev[i])) { /* * The udc was added successfully but its probe * function failed for some reason. */ retval = -EINVAL; goto err_probe_udc; } } return retval; err_probe_udc: for (i = 0; i < mod_data.num; i++) platform_device_del(the_udc_pdev[i]); err_add_udc: for (i = 0; i < mod_data.num; i++) platform_device_del(the_hcd_pdev[i]); err_add_hcd: platform_driver_unregister(&dummy_udc_driver); err_register_udc_driver: platform_driver_unregister(&dummy_hcd_driver); err_add_pdata: for (i = 0; i < mod_data.num; i++) kfree(dum[i]); for (i = 0; i < mod_data.num; i++) platform_device_put(the_udc_pdev[i]); err_alloc_udc: for (i = 0; i < mod_data.num; i++) platform_device_put(the_hcd_pdev[i]); return retval; } module_init(dummy_hcd_init); static void __exit dummy_hcd_cleanup(void) { int i; for (i = 0; i < mod_data.num; i++) { struct dummy *dum; dum = *((void **)dev_get_platdata(&the_udc_pdev[i]->dev)); platform_device_unregister(the_udc_pdev[i]); platform_device_unregister(the_hcd_pdev[i]); kfree(dum); } platform_driver_unregister(&dummy_udc_driver); platform_driver_unregister(&dummy_hcd_driver); } module_exit(dummy_hcd_cleanup); |
| 145 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * inet6 interface/address list definitions * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> */ #ifndef _NET_IF_INET6_H #define _NET_IF_INET6_H #include <net/snmp.h> #include <linux/ipv6.h> #include <linux/refcount.h> /* inet6_dev.if_flags */ #define IF_RA_OTHERCONF 0x80 #define IF_RA_MANAGED 0x40 #define IF_RA_RCVD 0x20 #define IF_RS_SENT 0x10 #define IF_READY 0x80000000 /* prefix flags */ #define IF_PREFIX_ONLINK 0x01 #define IF_PREFIX_AUTOCONF 0x02 enum { INET6_IFADDR_STATE_PREDAD, INET6_IFADDR_STATE_DAD, INET6_IFADDR_STATE_POSTDAD, INET6_IFADDR_STATE_ERRDAD, INET6_IFADDR_STATE_DEAD, }; struct inet6_ifaddr { struct in6_addr addr; __u32 prefix_len; __u32 rt_priority; /* In seconds, relative to tstamp. Expiry is at tstamp + HZ * lft. */ __u32 valid_lft; __u32 prefered_lft; refcount_t refcnt; spinlock_t lock; int state; __u32 flags; __u8 dad_probes; __u8 stable_privacy_retry; __u16 scope; __u64 dad_nonce; unsigned long cstamp; /* created timestamp */ unsigned long tstamp; /* updated timestamp */ struct delayed_work dad_work; struct inet6_dev *idev; struct fib6_info *rt; struct hlist_node addr_lst; struct list_head if_list; /* * Used to safely traverse idev->addr_list in process context * if the idev->lock needed to protect idev->addr_list cannot be held. * In that case, add the items to this list temporarily and iterate * without holding idev->lock. * See addrconf_ifdown and dev_forward_change. */ struct list_head if_list_aux; struct list_head tmp_list; struct inet6_ifaddr *ifpub; int regen_count; bool tokenized; u8 ifa_proto; struct rcu_head rcu; struct in6_addr peer_addr; }; struct ip6_sf_socklist { unsigned int sl_max; unsigned int sl_count; struct rcu_head rcu; struct in6_addr sl_addr[]; }; #define IP6_SFBLOCK 10 /* allocate this many at once */ struct ipv6_mc_socklist { struct in6_addr addr; int ifindex; unsigned int sfmode; /* MCAST_{INCLUDE,EXCLUDE} */ struct ipv6_mc_socklist __rcu *next; struct ip6_sf_socklist __rcu *sflist; struct rcu_head rcu; }; struct ip6_sf_list { struct ip6_sf_list __rcu *sf_next; struct in6_addr sf_addr; unsigned long sf_count[2]; /* include/exclude counts */ unsigned char sf_gsresp; /* include in g & s response? */ unsigned char sf_oldin; /* change state */ unsigned char sf_crcount; /* retrans. left to send */ struct rcu_head rcu; }; #define MAF_TIMER_RUNNING 0x01 #define MAF_LAST_REPORTER 0x02 #define MAF_LOADED 0x04 #define MAF_NOREPORT 0x08 #define MAF_GSQUERY 0x10 struct ifmcaddr6 { struct in6_addr mca_addr; struct inet6_dev *idev; struct ifmcaddr6 __rcu *next; struct ip6_sf_list __rcu *mca_sources; struct ip6_sf_list __rcu *mca_tomb; unsigned int mca_sfmode; unsigned char mca_crcount; unsigned long mca_sfcount[2]; struct delayed_work mca_work; unsigned int mca_flags; int mca_users; refcount_t mca_refcnt; unsigned long mca_cstamp; unsigned long mca_tstamp; struct rcu_head rcu; }; /* Anycast stuff */ struct ipv6_ac_socklist { struct in6_addr acl_addr; int acl_ifindex; struct ipv6_ac_socklist *acl_next; }; struct ifacaddr6 { struct in6_addr aca_addr; struct fib6_info *aca_rt; struct ifacaddr6 *aca_next; struct hlist_node aca_addr_lst; int aca_users; refcount_t aca_refcnt; unsigned long aca_cstamp; unsigned long aca_tstamp; struct rcu_head rcu; }; #define IFA_HOST IPV6_ADDR_LOOPBACK #define IFA_LINK IPV6_ADDR_LINKLOCAL #define IFA_SITE IPV6_ADDR_SITELOCAL struct ipv6_devstat { struct proc_dir_entry *proc_dir_entry; DEFINE_SNMP_STAT(struct ipstats_mib, ipv6); DEFINE_SNMP_STAT_ATOMIC(struct icmpv6_mib_device, icmpv6dev); DEFINE_SNMP_STAT_ATOMIC(struct icmpv6msg_mib_device, icmpv6msgdev); }; struct inet6_dev { struct net_device *dev; netdevice_tracker dev_tracker; struct list_head addr_list; struct ifmcaddr6 __rcu *mc_list; struct ifmcaddr6 __rcu *mc_tomb; unsigned char mc_qrv; /* Query Robustness Variable */ unsigned char mc_gq_running; unsigned char mc_ifc_count; unsigned char mc_dad_count; unsigned long mc_v1_seen; /* Max time we stay in MLDv1 mode */ unsigned long mc_qi; /* Query Interval */ unsigned long mc_qri; /* Query Response Interval */ unsigned long mc_maxdelay; struct delayed_work mc_gq_work; /* general query work */ struct delayed_work mc_ifc_work; /* interface change work */ struct delayed_work mc_dad_work; /* dad complete mc work */ struct delayed_work mc_query_work; /* mld query work */ struct delayed_work mc_report_work; /* mld report work */ struct sk_buff_head mc_query_queue; /* mld query queue */ struct sk_buff_head mc_report_queue; /* mld report queue */ spinlock_t mc_query_lock; /* mld query queue lock */ spinlock_t mc_report_lock; /* mld query report lock */ struct mutex mc_lock; /* mld global lock */ struct ifacaddr6 *ac_list; rwlock_t lock; refcount_t refcnt; __u32 if_flags; int dead; u32 desync_factor; struct list_head tempaddr_list; struct in6_addr token; struct neigh_parms *nd_parms; struct ipv6_devconf cnf; struct ipv6_devstat stats; struct timer_list rs_timer; __s32 rs_interval; /* in jiffies */ __u8 rs_probes; unsigned long tstamp; /* ipv6InterfaceTable update timestamp */ struct rcu_head rcu; unsigned int ra_mtu; }; static inline void ipv6_eth_mc_map(const struct in6_addr *addr, char *buf) { /* * +-------+-------+-------+-------+-------+-------+ * | 33 | 33 | DST13 | DST14 | DST15 | DST16 | * +-------+-------+-------+-------+-------+-------+ */ buf[0]= 0x33; buf[1]= 0x33; memcpy(buf + 2, &addr->s6_addr32[3], sizeof(__u32)); } static inline void ipv6_arcnet_mc_map(const struct in6_addr *addr, char *buf) { buf[0] = 0x00; } static inline void ipv6_ib_mc_map(const struct in6_addr *addr, const unsigned char *broadcast, char *buf) { unsigned char scope = broadcast[5] & 0xF; buf[0] = 0; /* Reserved */ buf[1] = 0xff; /* Multicast QPN */ buf[2] = 0xff; buf[3] = 0xff; buf[4] = 0xff; buf[5] = 0x10 | scope; /* scope from broadcast address */ buf[6] = 0x60; /* IPv6 signature */ buf[7] = 0x1b; buf[8] = broadcast[8]; /* P_Key */ buf[9] = broadcast[9]; memcpy(buf + 10, addr->s6_addr + 6, 10); } static inline int ipv6_ipgre_mc_map(const struct in6_addr *addr, const unsigned char *broadcast, char *buf) { if ((broadcast[0] | broadcast[1] | broadcast[2] | broadcast[3]) != 0) { memcpy(buf, broadcast, 4); } else { /* v4mapped? */ if ((addr->s6_addr32[0] | addr->s6_addr32[1] | (addr->s6_addr32[2] ^ htonl(0x0000ffff))) != 0) return -EINVAL; memcpy(buf, &addr->s6_addr32[3], 4); } return 0; } #endif |
| 2 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 | // SPDX-License-Identifier: GPL-2.0-only /* * Driver for the Diolan DLN-2 USB adapter * * Copyright (c) 2014 Intel Corporation * * Derived from: * i2c-diolan-u2c.c * Copyright (c) 2010-2011 Ericsson AB */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/usb.h> #include <linux/mutex.h> #include <linux/platform_device.h> #include <linux/mfd/core.h> #include <linux/mfd/dln2.h> #include <linux/rculist.h> struct dln2_header { __le16 size; __le16 id; __le16 echo; __le16 handle; }; struct dln2_response { struct dln2_header hdr; __le16 result; }; #define DLN2_GENERIC_MODULE_ID 0x00 #define DLN2_GENERIC_CMD(cmd) DLN2_CMD(cmd, DLN2_GENERIC_MODULE_ID) #define CMD_GET_DEVICE_VER DLN2_GENERIC_CMD(0x30) #define CMD_GET_DEVICE_SN DLN2_GENERIC_CMD(0x31) #define DLN2_HW_ID 0x200 #define DLN2_USB_TIMEOUT 200 /* in ms */ #define DLN2_MAX_RX_SLOTS 16 #define DLN2_MAX_URBS 16 #define DLN2_RX_BUF_SIZE 512 enum dln2_handle { DLN2_HANDLE_EVENT = 0, /* don't change, hardware defined */ DLN2_HANDLE_CTRL, DLN2_HANDLE_GPIO, DLN2_HANDLE_I2C, DLN2_HANDLE_SPI, DLN2_HANDLE_ADC, DLN2_HANDLES }; /* * Receive context used between the receive demultiplexer and the transfer * routine. While sending a request the transfer routine will look for a free * receive context and use it to wait for a response and to receive the URB and * thus the response data. */ struct dln2_rx_context { /* completion used to wait for a response */ struct completion done; /* if non-NULL the URB contains the response */ struct urb *urb; /* if true then this context is used to wait for a response */ bool in_use; }; /* * Receive contexts for a particular DLN2 module (i2c, gpio, etc.). We use the * handle header field to identify the module in dln2_dev.mod_rx_slots and then * the echo header field to index the slots field and find the receive context * for a particular request. */ struct dln2_mod_rx_slots { /* RX slots bitmap */ DECLARE_BITMAP(bmap, DLN2_MAX_RX_SLOTS); /* used to wait for a free RX slot */ wait_queue_head_t wq; /* used to wait for an RX operation to complete */ struct dln2_rx_context slots[DLN2_MAX_RX_SLOTS]; /* avoid races between alloc/free_rx_slot and dln2_rx_transfer */ spinlock_t lock; }; struct dln2_dev { struct usb_device *usb_dev; struct usb_interface *interface; u8 ep_in; u8 ep_out; struct urb *rx_urb[DLN2_MAX_URBS]; void *rx_buf[DLN2_MAX_URBS]; struct dln2_mod_rx_slots mod_rx_slots[DLN2_HANDLES]; struct list_head event_cb_list; spinlock_t event_cb_lock; bool disconnect; int active_transfers; wait_queue_head_t disconnect_wq; spinlock_t disconnect_lock; }; struct dln2_event_cb_entry { struct list_head list; u16 id; struct platform_device *pdev; dln2_event_cb_t callback; }; int dln2_register_event_cb(struct platform_device *pdev, u16 id, dln2_event_cb_t event_cb) { struct dln2_dev *dln2 = dev_get_drvdata(pdev->dev.parent); struct dln2_event_cb_entry *i, *entry; unsigned long flags; int ret = 0; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; entry->id = id; entry->callback = event_cb; entry->pdev = pdev; spin_lock_irqsave(&dln2->event_cb_lock, flags); list_for_each_entry(i, &dln2->event_cb_list, list) { if (i->id == id) { ret = -EBUSY; break; } } if (!ret) list_add_rcu(&entry->list, &dln2->event_cb_list); spin_unlock_irqrestore(&dln2->event_cb_lock, flags); if (ret) kfree(entry); return ret; } EXPORT_SYMBOL(dln2_register_event_cb); void dln2_unregister_event_cb(struct platform_device *pdev, u16 id) { struct dln2_dev *dln2 = dev_get_drvdata(pdev->dev.parent); struct dln2_event_cb_entry *i; unsigned long flags; bool found = false; spin_lock_irqsave(&dln2->event_cb_lock, flags); list_for_each_entry(i, &dln2->event_cb_list, list) { if (i->id == id) { list_del_rcu(&i->list); found = true; break; } } spin_unlock_irqrestore(&dln2->event_cb_lock, flags); if (found) { synchronize_rcu(); kfree(i); } } EXPORT_SYMBOL(dln2_unregister_event_cb); /* * Returns true if a valid transfer slot is found. In this case the URB must not * be resubmitted immediately in dln2_rx as we need the data when dln2_transfer * is woke up. It will be resubmitted there. */ static bool dln2_transfer_complete(struct dln2_dev *dln2, struct urb *urb, u16 handle, u16 rx_slot) { struct device *dev = &dln2->interface->dev; struct dln2_mod_rx_slots *rxs = &dln2->mod_rx_slots[handle]; struct dln2_rx_context *rxc; unsigned long flags; bool valid_slot = false; if (rx_slot >= DLN2_MAX_RX_SLOTS) goto out; rxc = &rxs->slots[rx_slot]; spin_lock_irqsave(&rxs->lock, flags); if (rxc->in_use && !rxc->urb) { rxc->urb = urb; complete(&rxc->done); valid_slot = true; } spin_unlock_irqrestore(&rxs->lock, flags); out: if (!valid_slot) dev_warn(dev, "bad/late response %d/%d\n", handle, rx_slot); return valid_slot; } static void dln2_run_event_callbacks(struct dln2_dev *dln2, u16 id, u16 echo, void *data, int len) { struct dln2_event_cb_entry *i; rcu_read_lock(); list_for_each_entry_rcu(i, &dln2->event_cb_list, list) { if (i->id == id) { i->callback(i->pdev, echo, data, len); break; } } rcu_read_unlock(); } static void dln2_rx(struct urb *urb) { struct dln2_dev *dln2 = urb->context; struct dln2_header *hdr = urb->transfer_buffer; struct device *dev = &dln2->interface->dev; u16 id, echo, handle, size; u8 *data; int len; int err; switch (urb->status) { case 0: /* success */ break; case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: case -EPIPE: /* this urb is terminated, clean up */ dev_dbg(dev, "urb shutting down with status %d\n", urb->status); return; default: dev_dbg(dev, "nonzero urb status received %d\n", urb->status); goto out; } if (urb->actual_length < sizeof(struct dln2_header)) { dev_err(dev, "short response: %d\n", urb->actual_length); goto out; } handle = le16_to_cpu(hdr->handle); id = le16_to_cpu(hdr->id); echo = le16_to_cpu(hdr->echo); size = le16_to_cpu(hdr->size); if (size != urb->actual_length) { dev_err(dev, "size mismatch: handle %x cmd %x echo %x size %d actual %d\n", handle, id, echo, size, urb->actual_length); goto out; } if (handle >= DLN2_HANDLES) { dev_warn(dev, "invalid handle %d\n", handle); goto out; } data = urb->transfer_buffer + sizeof(struct dln2_header); len = urb->actual_length - sizeof(struct dln2_header); if (handle == DLN2_HANDLE_EVENT) { unsigned long flags; spin_lock_irqsave(&dln2->event_cb_lock, flags); dln2_run_event_callbacks(dln2, id, echo, data, len); spin_unlock_irqrestore(&dln2->event_cb_lock, flags); } else { /* URB will be re-submitted in _dln2_transfer (free_rx_slot) */ if (dln2_transfer_complete(dln2, urb, handle, echo)) return; } out: err = usb_submit_urb(urb, GFP_ATOMIC); if (err < 0) dev_err(dev, "failed to resubmit RX URB: %d\n", err); } static void *dln2_prep_buf(u16 handle, u16 cmd, u16 echo, const void *obuf, int *obuf_len, gfp_t gfp) { int len; void *buf; struct dln2_header *hdr; len = *obuf_len + sizeof(*hdr); buf = kmalloc(len, gfp); if (!buf) return NULL; hdr = (struct dln2_header *)buf; hdr->id = cpu_to_le16(cmd); hdr->size = cpu_to_le16(len); hdr->echo = cpu_to_le16(echo); hdr->handle = cpu_to_le16(handle); memcpy(buf + sizeof(*hdr), obuf, *obuf_len); *obuf_len = len; return buf; } static int dln2_send_wait(struct dln2_dev *dln2, u16 handle, u16 cmd, u16 echo, const void *obuf, int obuf_len) { int ret = 0; int len = obuf_len; void *buf; int actual; buf = dln2_prep_buf(handle, cmd, echo, obuf, &len, GFP_KERNEL); if (!buf) return -ENOMEM; ret = usb_bulk_msg(dln2->usb_dev, usb_sndbulkpipe(dln2->usb_dev, dln2->ep_out), buf, len, &actual, DLN2_USB_TIMEOUT); kfree(buf); return ret; } static bool find_free_slot(struct dln2_dev *dln2, u16 handle, int *slot) { struct dln2_mod_rx_slots *rxs; unsigned long flags; if (dln2->disconnect) { *slot = -ENODEV; return true; } rxs = &dln2->mod_rx_slots[handle]; spin_lock_irqsave(&rxs->lock, flags); *slot = find_first_zero_bit(rxs->bmap, DLN2_MAX_RX_SLOTS); if (*slot < DLN2_MAX_RX_SLOTS) { struct dln2_rx_context *rxc = &rxs->slots[*slot]; set_bit(*slot, rxs->bmap); rxc->in_use = true; } spin_unlock_irqrestore(&rxs->lock, flags); return *slot < DLN2_MAX_RX_SLOTS; } static int alloc_rx_slot(struct dln2_dev *dln2, u16 handle) { int ret; int slot; /* * No need to timeout here, the wait is bounded by the timeout in * _dln2_transfer. */ ret = wait_event_interruptible(dln2->mod_rx_slots[handle].wq, find_free_slot(dln2, handle, &slot)); if (ret < 0) return ret; return slot; } static void free_rx_slot(struct dln2_dev *dln2, u16 handle, int slot) { struct dln2_mod_rx_slots *rxs; struct urb *urb = NULL; unsigned long flags; struct dln2_rx_context *rxc; rxs = &dln2->mod_rx_slots[handle]; spin_lock_irqsave(&rxs->lock, flags); clear_bit(slot, rxs->bmap); rxc = &rxs->slots[slot]; rxc->in_use = false; urb = rxc->urb; rxc->urb = NULL; reinit_completion(&rxc->done); spin_unlock_irqrestore(&rxs->lock, flags); if (urb) { int err; struct device *dev = &dln2->interface->dev; err = usb_submit_urb(urb, GFP_KERNEL); if (err < 0) dev_err(dev, "failed to resubmit RX URB: %d\n", err); } wake_up_interruptible(&rxs->wq); } static int _dln2_transfer(struct dln2_dev *dln2, u16 handle, u16 cmd, const void *obuf, unsigned obuf_len, void *ibuf, unsigned *ibuf_len) { int ret = 0; int rx_slot; struct dln2_response *rsp; struct dln2_rx_context *rxc; struct device *dev = &dln2->interface->dev; const unsigned long timeout = msecs_to_jiffies(DLN2_USB_TIMEOUT); struct dln2_mod_rx_slots *rxs = &dln2->mod_rx_slots[handle]; int size; spin_lock(&dln2->disconnect_lock); if (!dln2->disconnect) dln2->active_transfers++; else ret = -ENODEV; spin_unlock(&dln2->disconnect_lock); if (ret) return ret; rx_slot = alloc_rx_slot(dln2, handle); if (rx_slot < 0) { ret = rx_slot; goto out_decr; } ret = dln2_send_wait(dln2, handle, cmd, rx_slot, obuf, obuf_len); if (ret < 0) { dev_err(dev, "USB write failed: %d\n", ret); goto out_free_rx_slot; } rxc = &rxs->slots[rx_slot]; ret = wait_for_completion_interruptible_timeout(&rxc->done, timeout); if (ret <= 0) { if (!ret) ret = -ETIMEDOUT; goto out_free_rx_slot; } else { ret = 0; } if (dln2->disconnect) { ret = -ENODEV; goto out_free_rx_slot; } /* if we got here we know that the response header has been checked */ rsp = rxc->urb->transfer_buffer; size = le16_to_cpu(rsp->hdr.size); if (size < sizeof(*rsp)) { ret = -EPROTO; goto out_free_rx_slot; } if (le16_to_cpu(rsp->result) > 0x80) { dev_dbg(dev, "%d received response with error %d\n", handle, le16_to_cpu(rsp->result)); ret = -EREMOTEIO; goto out_free_rx_slot; } if (!ibuf) goto out_free_rx_slot; if (*ibuf_len > size - sizeof(*rsp)) *ibuf_len = size - sizeof(*rsp); memcpy(ibuf, rsp + 1, *ibuf_len); out_free_rx_slot: free_rx_slot(dln2, handle, rx_slot); out_decr: spin_lock(&dln2->disconnect_lock); dln2->active_transfers--; spin_unlock(&dln2->disconnect_lock); if (dln2->disconnect) wake_up(&dln2->disconnect_wq); return ret; } int dln2_transfer(struct platform_device *pdev, u16 cmd, const void *obuf, unsigned obuf_len, void *ibuf, unsigned *ibuf_len) { struct dln2_platform_data *dln2_pdata; struct dln2_dev *dln2; u16 handle; dln2 = dev_get_drvdata(pdev->dev.parent); dln2_pdata = dev_get_platdata(&pdev->dev); handle = dln2_pdata->handle; return _dln2_transfer(dln2, handle, cmd, obuf, obuf_len, ibuf, ibuf_len); } EXPORT_SYMBOL(dln2_transfer); static int dln2_check_hw(struct dln2_dev *dln2) { int ret; __le32 hw_type; int len = sizeof(hw_type); ret = _dln2_transfer(dln2, DLN2_HANDLE_CTRL, CMD_GET_DEVICE_VER, NULL, 0, &hw_type, &len); if (ret < 0) return ret; if (len < sizeof(hw_type)) return -EREMOTEIO; if (le32_to_cpu(hw_type) != DLN2_HW_ID) { dev_err(&dln2->interface->dev, "Device ID 0x%x not supported\n", le32_to_cpu(hw_type)); return -ENODEV; } return 0; } static int dln2_print_serialno(struct dln2_dev *dln2) { int ret; __le32 serial_no; int len = sizeof(serial_no); struct device *dev = &dln2->interface->dev; ret = _dln2_transfer(dln2, DLN2_HANDLE_CTRL, CMD_GET_DEVICE_SN, NULL, 0, &serial_no, &len); if (ret < 0) return ret; if (len < sizeof(serial_no)) return -EREMOTEIO; dev_info(dev, "Diolan DLN2 serial %u\n", le32_to_cpu(serial_no)); return 0; } static int dln2_hw_init(struct dln2_dev *dln2) { int ret; ret = dln2_check_hw(dln2); if (ret < 0) return ret; return dln2_print_serialno(dln2); } static void dln2_free_rx_urbs(struct dln2_dev *dln2) { int i; for (i = 0; i < DLN2_MAX_URBS; i++) { usb_free_urb(dln2->rx_urb[i]); kfree(dln2->rx_buf[i]); } } static void dln2_stop_rx_urbs(struct dln2_dev *dln2) { int i; for (i = 0; i < DLN2_MAX_URBS; i++) usb_kill_urb(dln2->rx_urb[i]); } static void dln2_free(struct dln2_dev *dln2) { dln2_free_rx_urbs(dln2); usb_put_dev(dln2->usb_dev); kfree(dln2); } static int dln2_setup_rx_urbs(struct dln2_dev *dln2, struct usb_host_interface *hostif) { int i; const int rx_max_size = DLN2_RX_BUF_SIZE; for (i = 0; i < DLN2_MAX_URBS; i++) { dln2->rx_buf[i] = kmalloc(rx_max_size, GFP_KERNEL); if (!dln2->rx_buf[i]) return -ENOMEM; dln2->rx_urb[i] = usb_alloc_urb(0, GFP_KERNEL); if (!dln2->rx_urb[i]) return -ENOMEM; usb_fill_bulk_urb(dln2->rx_urb[i], dln2->usb_dev, usb_rcvbulkpipe(dln2->usb_dev, dln2->ep_in), dln2->rx_buf[i], rx_max_size, dln2_rx, dln2); } return 0; } static int dln2_start_rx_urbs(struct dln2_dev *dln2, gfp_t gfp) { struct device *dev = &dln2->interface->dev; int ret; int i; for (i = 0; i < DLN2_MAX_URBS; i++) { ret = usb_submit_urb(dln2->rx_urb[i], gfp); if (ret < 0) { dev_err(dev, "failed to submit RX URB: %d\n", ret); return ret; } } return 0; } enum { DLN2_ACPI_MATCH_GPIO = 0, DLN2_ACPI_MATCH_I2C = 1, DLN2_ACPI_MATCH_SPI = 2, DLN2_ACPI_MATCH_ADC = 3, }; static struct dln2_platform_data dln2_pdata_gpio = { .handle = DLN2_HANDLE_GPIO, }; static struct mfd_cell_acpi_match dln2_acpi_match_gpio = { .adr = DLN2_ACPI_MATCH_GPIO, }; /* Only one I2C port seems to be supported on current hardware */ static struct dln2_platform_data dln2_pdata_i2c = { .handle = DLN2_HANDLE_I2C, .port = 0, }; static struct mfd_cell_acpi_match dln2_acpi_match_i2c = { .adr = DLN2_ACPI_MATCH_I2C, }; /* Only one SPI port supported */ static struct dln2_platform_data dln2_pdata_spi = { .handle = DLN2_HANDLE_SPI, .port = 0, }; static struct mfd_cell_acpi_match dln2_acpi_match_spi = { .adr = DLN2_ACPI_MATCH_SPI, }; /* Only one ADC port supported */ static struct dln2_platform_data dln2_pdata_adc = { .handle = DLN2_HANDLE_ADC, .port = 0, }; static struct mfd_cell_acpi_match dln2_acpi_match_adc = { .adr = DLN2_ACPI_MATCH_ADC, }; static const struct mfd_cell dln2_devs[] = { { .name = "dln2-gpio", .acpi_match = &dln2_acpi_match_gpio, .platform_data = &dln2_pdata_gpio, .pdata_size = sizeof(struct dln2_platform_data), }, { .name = "dln2-i2c", .acpi_match = &dln2_acpi_match_i2c, .platform_data = &dln2_pdata_i2c, .pdata_size = sizeof(struct dln2_platform_data), }, { .name = "dln2-spi", .acpi_match = &dln2_acpi_match_spi, .platform_data = &dln2_pdata_spi, .pdata_size = sizeof(struct dln2_platform_data), }, { .name = "dln2-adc", .acpi_match = &dln2_acpi_match_adc, .platform_data = &dln2_pdata_adc, .pdata_size = sizeof(struct dln2_platform_data), }, }; static void dln2_stop(struct dln2_dev *dln2) { int i, j; /* don't allow starting new transfers */ spin_lock(&dln2->disconnect_lock); dln2->disconnect = true; spin_unlock(&dln2->disconnect_lock); /* cancel in progress transfers */ for (i = 0; i < DLN2_HANDLES; i++) { struct dln2_mod_rx_slots *rxs = &dln2->mod_rx_slots[i]; unsigned long flags; spin_lock_irqsave(&rxs->lock, flags); /* cancel all response waiters */ for (j = 0; j < DLN2_MAX_RX_SLOTS; j++) { struct dln2_rx_context *rxc = &rxs->slots[j]; if (rxc->in_use) complete(&rxc->done); } spin_unlock_irqrestore(&rxs->lock, flags); } /* wait for transfers to end */ wait_event(dln2->disconnect_wq, !dln2->active_transfers); dln2_stop_rx_urbs(dln2); } static void dln2_disconnect(struct usb_interface *interface) { struct dln2_dev *dln2 = usb_get_intfdata(interface); dln2_stop(dln2); mfd_remove_devices(&interface->dev); dln2_free(dln2); } static int dln2_probe(struct usb_interface *interface, const struct usb_device_id *usb_id) { struct usb_host_interface *hostif = interface->cur_altsetting; struct usb_endpoint_descriptor *epin; struct usb_endpoint_descriptor *epout; struct device *dev = &interface->dev; struct dln2_dev *dln2; int ret; int i, j; if (hostif->desc.bInterfaceNumber != 0) return -ENODEV; ret = usb_find_common_endpoints(hostif, &epin, &epout, NULL, NULL); if (ret) return ret; dln2 = kzalloc(sizeof(*dln2), GFP_KERNEL); if (!dln2) return -ENOMEM; dln2->ep_out = epout->bEndpointAddress; dln2->ep_in = epin->bEndpointAddress; dln2->usb_dev = usb_get_dev(interface_to_usbdev(interface)); dln2->interface = interface; usb_set_intfdata(interface, dln2); init_waitqueue_head(&dln2->disconnect_wq); for (i = 0; i < DLN2_HANDLES; i++) { init_waitqueue_head(&dln2->mod_rx_slots[i].wq); spin_lock_init(&dln2->mod_rx_slots[i].lock); for (j = 0; j < DLN2_MAX_RX_SLOTS; j++) init_completion(&dln2->mod_rx_slots[i].slots[j].done); } spin_lock_init(&dln2->event_cb_lock); spin_lock_init(&dln2->disconnect_lock); INIT_LIST_HEAD(&dln2->event_cb_list); ret = dln2_setup_rx_urbs(dln2, hostif); if (ret) goto out_free; ret = dln2_start_rx_urbs(dln2, GFP_KERNEL); if (ret) goto out_stop_rx; ret = dln2_hw_init(dln2); if (ret < 0) { dev_err(dev, "failed to initialize hardware\n"); goto out_stop_rx; } ret = mfd_add_hotplug_devices(dev, dln2_devs, ARRAY_SIZE(dln2_devs)); if (ret != 0) { dev_err(dev, "failed to add mfd devices to core\n"); goto out_stop_rx; } return 0; out_stop_rx: dln2_stop_rx_urbs(dln2); out_free: usb_put_dev(dln2->usb_dev); dln2_free(dln2); return ret; } static int dln2_suspend(struct usb_interface *iface, pm_message_t message) { struct dln2_dev *dln2 = usb_get_intfdata(iface); dln2_stop(dln2); return 0; } static int dln2_resume(struct usb_interface *iface) { struct dln2_dev *dln2 = usb_get_intfdata(iface); dln2->disconnect = false; return dln2_start_rx_urbs(dln2, GFP_NOIO); } static const struct usb_device_id dln2_table[] = { { USB_DEVICE(0xa257, 0x2013) }, { } }; MODULE_DEVICE_TABLE(usb, dln2_table); static struct usb_driver dln2_driver = { .name = "dln2", .probe = dln2_probe, .disconnect = dln2_disconnect, .id_table = dln2_table, .suspend = dln2_suspend, .resume = dln2_resume, }; module_usb_driver(dln2_driver); MODULE_AUTHOR("Octavian Purdila <octavian.purdila@intel.com>"); MODULE_DESCRIPTION("Core driver for the Diolan DLN2 interface adapter"); MODULE_LICENSE("GPL v2"); |
| 235 235 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 | // SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ /* Copyright (c) 2008-2019, IBM Corporation */ #include <linux/init.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <net/net_namespace.h> #include <linux/rtnetlink.h> #include <linux/if_arp.h> #include <linux/list.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/module.h> #include <linux/dma-mapping.h> #include <net/addrconf.h> #include <rdma/ib_verbs.h> #include <rdma/ib_user_verbs.h> #include <rdma/rdma_netlink.h> #include <linux/kthread.h> #include "siw.h" #include "siw_verbs.h" MODULE_AUTHOR("Bernard Metzler"); MODULE_DESCRIPTION("Software iWARP Driver"); MODULE_LICENSE("Dual BSD/GPL"); /* transmit from user buffer, if possible */ const bool zcopy_tx = true; /* Restrict usage of GSO, if hardware peer iwarp is unable to process * large packets. try_gso = true lets siw try to use local GSO, * if peer agrees. Not using GSO severly limits siw maximum tx bandwidth. */ const bool try_gso; /* Attach siw also with loopback devices */ const bool loopback_enabled = true; /* We try to negotiate CRC on, if true */ const bool mpa_crc_required; /* MPA CRC on/off enforced */ const bool mpa_crc_strict; /* Control TCP_NODELAY socket option */ const bool siw_tcp_nagle; /* Select MPA version to be used during connection setup */ u_char mpa_version = MPA_REVISION_2; /* Selects MPA P2P mode (additional handshake during connection * setup, if true. */ const bool peer_to_peer; struct task_struct *siw_tx_thread[NR_CPUS]; struct crypto_shash *siw_crypto_shash; static int siw_device_register(struct siw_device *sdev, const char *name) { struct ib_device *base_dev = &sdev->base_dev; static int dev_id = 1; int rv; sdev->vendor_part_id = dev_id++; rv = ib_register_device(base_dev, name, NULL); if (rv) { pr_warn("siw: device registration error %d\n", rv); return rv; } siw_dbg(base_dev, "HWaddr=%pM\n", sdev->raw_gid); return 0; } static void siw_device_cleanup(struct ib_device *base_dev) { struct siw_device *sdev = to_siw_dev(base_dev); xa_destroy(&sdev->qp_xa); xa_destroy(&sdev->mem_xa); } static int siw_dev_qualified(struct net_device *netdev) { /* * Additional hardware support can be added here * (e.g. ARPHRD_FDDI, ARPHRD_ATM, ...) - see * <linux/if_arp.h> for type identifiers. */ if (netdev->type == ARPHRD_ETHER || netdev->type == ARPHRD_IEEE802 || netdev->type == ARPHRD_NONE || (netdev->type == ARPHRD_LOOPBACK && loopback_enabled)) return 1; return 0; } static DEFINE_PER_CPU(atomic_t, siw_use_cnt); static struct { struct cpumask **tx_valid_cpus; int num_nodes; } siw_cpu_info; static int siw_init_cpulist(void) { int i, num_nodes = nr_node_ids; memset(siw_tx_thread, 0, sizeof(siw_tx_thread)); siw_cpu_info.num_nodes = num_nodes; siw_cpu_info.tx_valid_cpus = kcalloc(num_nodes, sizeof(struct cpumask *), GFP_KERNEL); if (!siw_cpu_info.tx_valid_cpus) { siw_cpu_info.num_nodes = 0; return -ENOMEM; } for (i = 0; i < siw_cpu_info.num_nodes; i++) { siw_cpu_info.tx_valid_cpus[i] = kzalloc(sizeof(struct cpumask), GFP_KERNEL); if (!siw_cpu_info.tx_valid_cpus[i]) goto out_err; cpumask_clear(siw_cpu_info.tx_valid_cpus[i]); } for_each_possible_cpu(i) cpumask_set_cpu(i, siw_cpu_info.tx_valid_cpus[cpu_to_node(i)]); return 0; out_err: siw_cpu_info.num_nodes = 0; while (--i >= 0) kfree(siw_cpu_info.tx_valid_cpus[i]); kfree(siw_cpu_info.tx_valid_cpus); siw_cpu_info.tx_valid_cpus = NULL; return -ENOMEM; } static void siw_destroy_cpulist(void) { int i = 0; while (i < siw_cpu_info.num_nodes) kfree(siw_cpu_info.tx_valid_cpus[i++]); kfree(siw_cpu_info.tx_valid_cpus); } /* * Choose CPU with least number of active QP's from NUMA node of * TX interface. */ int siw_get_tx_cpu(struct siw_device *sdev) { const struct cpumask *tx_cpumask; int i, num_cpus, cpu, min_use, node = sdev->numa_node, tx_cpu = -1; if (node < 0) tx_cpumask = cpu_online_mask; else tx_cpumask = siw_cpu_info.tx_valid_cpus[node]; num_cpus = cpumask_weight(tx_cpumask); if (!num_cpus) { /* no CPU on this NUMA node */ tx_cpumask = cpu_online_mask; num_cpus = cpumask_weight(tx_cpumask); } if (!num_cpus) goto out; cpu = cpumask_first(tx_cpumask); for (i = 0, min_use = SIW_MAX_QP; i < num_cpus; i++, cpu = cpumask_next(cpu, tx_cpumask)) { int usage; /* Skip any cores which have no TX thread */ if (!siw_tx_thread[cpu]) continue; usage = atomic_read(&per_cpu(siw_use_cnt, cpu)); if (usage <= min_use) { tx_cpu = cpu; min_use = usage; } } siw_dbg(&sdev->base_dev, "tx cpu %d, node %d, %d qp's\n", tx_cpu, node, min_use); out: if (tx_cpu >= 0) atomic_inc(&per_cpu(siw_use_cnt, tx_cpu)); else pr_warn("siw: no tx cpu found\n"); return tx_cpu; } void siw_put_tx_cpu(int cpu) { atomic_dec(&per_cpu(siw_use_cnt, cpu)); } static struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id) { struct siw_qp *qp = siw_qp_id2obj(to_siw_dev(base_dev), id); if (qp) { /* * siw_qp_id2obj() increments object reference count */ siw_qp_put(qp); return &qp->base_qp; } return NULL; } static const struct ib_device_ops siw_device_ops = { .owner = THIS_MODULE, .uverbs_abi_ver = SIW_ABI_VERSION, .driver_id = RDMA_DRIVER_SIW, .alloc_mr = siw_alloc_mr, .alloc_pd = siw_alloc_pd, .alloc_ucontext = siw_alloc_ucontext, .create_cq = siw_create_cq, .create_qp = siw_create_qp, .create_srq = siw_create_srq, .dealloc_driver = siw_device_cleanup, .dealloc_pd = siw_dealloc_pd, .dealloc_ucontext = siw_dealloc_ucontext, .dereg_mr = siw_dereg_mr, .destroy_cq = siw_destroy_cq, .destroy_qp = siw_destroy_qp, .destroy_srq = siw_destroy_srq, .get_dma_mr = siw_get_dma_mr, .get_port_immutable = siw_get_port_immutable, .iw_accept = siw_accept, .iw_add_ref = siw_qp_get_ref, .iw_connect = siw_connect, .iw_create_listen = siw_create_listen, .iw_destroy_listen = siw_destroy_listen, .iw_get_qp = siw_get_base_qp, .iw_reject = siw_reject, .iw_rem_ref = siw_qp_put_ref, .map_mr_sg = siw_map_mr_sg, .mmap = siw_mmap, .mmap_free = siw_mmap_free, .modify_qp = siw_verbs_modify_qp, .modify_srq = siw_modify_srq, .poll_cq = siw_poll_cq, .post_recv = siw_post_receive, .post_send = siw_post_send, .post_srq_recv = siw_post_srq_recv, .query_device = siw_query_device, .query_gid = siw_query_gid, .query_port = siw_query_port, .query_qp = siw_query_qp, .query_srq = siw_query_srq, .req_notify_cq = siw_req_notify_cq, .reg_user_mr = siw_reg_user_mr, INIT_RDMA_OBJ_SIZE(ib_cq, siw_cq, base_cq), INIT_RDMA_OBJ_SIZE(ib_pd, siw_pd, base_pd), INIT_RDMA_OBJ_SIZE(ib_qp, siw_qp, base_qp), INIT_RDMA_OBJ_SIZE(ib_srq, siw_srq, base_srq), INIT_RDMA_OBJ_SIZE(ib_ucontext, siw_ucontext, base_ucontext), }; static struct siw_device *siw_device_create(struct net_device *netdev) { struct siw_device *sdev = NULL; struct ib_device *base_dev; int rv; sdev = ib_alloc_device(siw_device, base_dev); if (!sdev) return NULL; base_dev = &sdev->base_dev; sdev->netdev = netdev; if (netdev->addr_len) { memcpy(sdev->raw_gid, netdev->dev_addr, min_t(unsigned int, netdev->addr_len, ETH_ALEN)); } else { /* * This device does not have a HW address, but * connection mangagement requires a unique gid. */ eth_random_addr(sdev->raw_gid); } addrconf_addr_eui48((u8 *)&base_dev->node_guid, sdev->raw_gid); base_dev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POST_SEND); base_dev->node_type = RDMA_NODE_RNIC; memcpy(base_dev->node_desc, SIW_NODE_DESC_COMMON, sizeof(SIW_NODE_DESC_COMMON)); /* * Current model (one-to-one device association): * One Softiwarp device per net_device or, equivalently, * per physical port. */ base_dev->phys_port_cnt = 1; base_dev->num_comp_vectors = num_possible_cpus(); xa_init_flags(&sdev->qp_xa, XA_FLAGS_ALLOC1); xa_init_flags(&sdev->mem_xa, XA_FLAGS_ALLOC1); ib_set_device_ops(base_dev, &siw_device_ops); rv = ib_device_set_netdev(base_dev, netdev, 1); if (rv) goto error; memcpy(base_dev->iw_ifname, netdev->name, sizeof(base_dev->iw_ifname)); /* Disable TCP port mapping */ base_dev->iw_driver_flags = IW_F_NO_PORT_MAP; sdev->attrs.max_qp = SIW_MAX_QP; sdev->attrs.max_qp_wr = SIW_MAX_QP_WR; sdev->attrs.max_ord = SIW_MAX_ORD_QP; sdev->attrs.max_ird = SIW_MAX_IRD_QP; sdev->attrs.max_sge = SIW_MAX_SGE; sdev->attrs.max_sge_rd = SIW_MAX_SGE_RD; sdev->attrs.max_cq = SIW_MAX_CQ; sdev->attrs.max_cqe = SIW_MAX_CQE; sdev->attrs.max_mr = SIW_MAX_MR; sdev->attrs.max_pd = SIW_MAX_PD; sdev->attrs.max_mw = SIW_MAX_MW; sdev->attrs.max_srq = SIW_MAX_SRQ; sdev->attrs.max_srq_wr = SIW_MAX_SRQ_WR; sdev->attrs.max_srq_sge = SIW_MAX_SGE; INIT_LIST_HEAD(&sdev->cep_list); INIT_LIST_HEAD(&sdev->qp_list); atomic_set(&sdev->num_ctx, 0); atomic_set(&sdev->num_srq, 0); atomic_set(&sdev->num_qp, 0); atomic_set(&sdev->num_cq, 0); atomic_set(&sdev->num_mr, 0); atomic_set(&sdev->num_pd, 0); sdev->numa_node = dev_to_node(&netdev->dev); spin_lock_init(&sdev->lock); return sdev; error: ib_dealloc_device(base_dev); return NULL; } /* * Network link becomes unavailable. Mark all * affected QP's accordingly. */ static void siw_netdev_down(struct work_struct *work) { struct siw_device *sdev = container_of(work, struct siw_device, netdev_down); struct siw_qp_attrs qp_attrs; struct list_head *pos, *tmp; memset(&qp_attrs, 0, sizeof(qp_attrs)); qp_attrs.state = SIW_QP_STATE_ERROR; list_for_each_safe(pos, tmp, &sdev->qp_list) { struct siw_qp *qp = list_entry(pos, struct siw_qp, devq); down_write(&qp->state_lock); WARN_ON(siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE)); up_write(&qp->state_lock); } ib_device_put(&sdev->base_dev); } static void siw_device_goes_down(struct siw_device *sdev) { if (ib_device_try_get(&sdev->base_dev)) { INIT_WORK(&sdev->netdev_down, siw_netdev_down); schedule_work(&sdev->netdev_down); } } static int siw_netdev_event(struct notifier_block *nb, unsigned long event, void *arg) { struct net_device *netdev = netdev_notifier_info_to_dev(arg); struct ib_device *base_dev; struct siw_device *sdev; dev_dbg(&netdev->dev, "siw: event %lu\n", event); base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW); if (!base_dev) return NOTIFY_OK; sdev = to_siw_dev(base_dev); switch (event) { case NETDEV_UP: sdev->state = IB_PORT_ACTIVE; siw_port_event(sdev, 1, IB_EVENT_PORT_ACTIVE); break; case NETDEV_GOING_DOWN: siw_device_goes_down(sdev); break; case NETDEV_DOWN: sdev->state = IB_PORT_DOWN; siw_port_event(sdev, 1, IB_EVENT_PORT_ERR); break; case NETDEV_REGISTER: /* * Device registration now handled only by * rdma netlink commands. So it shall be impossible * to end up here with a valid siw device. */ siw_dbg(base_dev, "unexpected NETDEV_REGISTER event\n"); break; case NETDEV_UNREGISTER: ib_unregister_device_queued(&sdev->base_dev); break; case NETDEV_CHANGEADDR: siw_port_event(sdev, 1, IB_EVENT_LID_CHANGE); break; /* * Todo: Below netdev events are currently not handled. */ case NETDEV_CHANGEMTU: case NETDEV_CHANGE: break; default: break; } ib_device_put(&sdev->base_dev); return NOTIFY_OK; } static struct notifier_block siw_netdev_nb = { .notifier_call = siw_netdev_event, }; static int siw_newlink(const char *basedev_name, struct net_device *netdev) { struct ib_device *base_dev; struct siw_device *sdev = NULL; int rv = -ENOMEM; if (!siw_dev_qualified(netdev)) return -EINVAL; base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW); if (base_dev) { ib_device_put(base_dev); return -EEXIST; } sdev = siw_device_create(netdev); if (sdev) { dev_dbg(&netdev->dev, "siw: new device\n"); if (netif_running(netdev) && netif_carrier_ok(netdev)) sdev->state = IB_PORT_ACTIVE; else sdev->state = IB_PORT_DOWN; rv = siw_device_register(sdev, basedev_name); if (rv) ib_dealloc_device(&sdev->base_dev); } return rv; } static struct rdma_link_ops siw_link_ops = { .type = "siw", .newlink = siw_newlink, }; /* * siw_init_module - Initialize Softiwarp module and register with netdev * subsystem. */ static __init int siw_init_module(void) { int rv; if (SENDPAGE_THRESH < SIW_MAX_INLINE) { pr_info("siw: sendpage threshold too small: %u\n", (int)SENDPAGE_THRESH); rv = -EINVAL; goto out_error; } rv = siw_init_cpulist(); if (rv) goto out_error; rv = siw_cm_init(); if (rv) goto out_error; if (!siw_create_tx_threads()) { pr_info("siw: Could not start any TX thread\n"); rv = -ENOMEM; goto out_error; } /* * Locate CRC32 algorithm. If unsuccessful, fail * loading siw only, if CRC is required. */ siw_crypto_shash = crypto_alloc_shash("crc32c", 0, 0); if (IS_ERR(siw_crypto_shash)) { pr_info("siw: Loading CRC32c failed: %ld\n", PTR_ERR(siw_crypto_shash)); siw_crypto_shash = NULL; if (mpa_crc_required) { rv = -EOPNOTSUPP; goto out_error; } } rv = register_netdevice_notifier(&siw_netdev_nb); if (rv) goto out_error; rdma_link_register(&siw_link_ops); pr_info("SoftiWARP attached\n"); return 0; out_error: siw_stop_tx_threads(); if (siw_crypto_shash) crypto_free_shash(siw_crypto_shash); pr_info("SoftIWARP attach failed. Error: %d\n", rv); siw_cm_exit(); siw_destroy_cpulist(); return rv; } static void __exit siw_exit_module(void) { siw_stop_tx_threads(); unregister_netdevice_notifier(&siw_netdev_nb); rdma_link_unregister(&siw_link_ops); ib_unregister_driver(RDMA_DRIVER_SIW); siw_cm_exit(); siw_destroy_cpulist(); if (siw_crypto_shash) crypto_free_shash(siw_crypto_shash); pr_info("SoftiWARP detached\n"); } module_init(siw_init_module); module_exit(siw_exit_module); MODULE_ALIAS_RDMA_LINK("siw"); |
| 18 919 1051 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_WAIT_H #define _LINUX_WAIT_H /* * Linux wait queue related types and methods */ #include <linux/list.h> #include <linux/stddef.h> #include <linux/spinlock.h> #include <asm/current.h> #include <uapi/linux/wait.h> typedef struct wait_queue_entry wait_queue_entry_t; typedef int (*wait_queue_func_t)(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key); int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key); /* wait_queue_entry::flags */ #define WQ_FLAG_EXCLUSIVE 0x01 #define WQ_FLAG_WOKEN 0x02 #define WQ_FLAG_BOOKMARK 0x04 #define WQ_FLAG_CUSTOM 0x08 #define WQ_FLAG_DONE 0x10 #define WQ_FLAG_PRIORITY 0x20 /* * A single wait-queue entry structure: */ struct wait_queue_entry { unsigned int flags; void *private; wait_queue_func_t func; struct list_head entry; }; struct wait_queue_head { spinlock_t lock; struct list_head head; }; typedef struct wait_queue_head wait_queue_head_t; struct task_struct; /* * Macros for declaration and initialisaton of the datatypes */ #define __WAITQUEUE_INITIALIZER(name, tsk) { \ .private = tsk, \ .func = default_wake_function, \ .entry = { NULL, NULL } } #define DECLARE_WAITQUEUE(name, tsk) \ struct wait_queue_entry name = __WAITQUEUE_INITIALIZER(name, tsk) #define __WAIT_QUEUE_HEAD_INITIALIZER(name) { \ .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ .head = LIST_HEAD_INIT(name.head) } #define DECLARE_WAIT_QUEUE_HEAD(name) \ struct wait_queue_head name = __WAIT_QUEUE_HEAD_INITIALIZER(name) extern void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *); #define init_waitqueue_head(wq_head) \ do { \ static struct lock_class_key __key; \ \ __init_waitqueue_head((wq_head), #wq_head, &__key); \ } while (0) #ifdef CONFIG_LOCKDEP # define __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) \ ({ init_waitqueue_head(&name); name; }) # define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) \ struct wait_queue_head name = __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) #else # define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) DECLARE_WAIT_QUEUE_HEAD(name) #endif static inline void init_waitqueue_entry(struct wait_queue_entry *wq_entry, struct task_struct *p) { wq_entry->flags = 0; wq_entry->private = p; wq_entry->func = default_wake_function; } static inline void init_waitqueue_func_entry(struct wait_queue_entry *wq_entry, wait_queue_func_t func) { wq_entry->flags = 0; wq_entry->private = NULL; wq_entry->func = func; } /** * waitqueue_active -- locklessly test for waiters on the queue * @wq_head: the waitqueue to test for waiters * * returns true if the wait list is not empty * * NOTE: this function is lockless and requires care, incorrect usage _will_ * lead to sporadic and non-obvious failure. * * Use either while holding wait_queue_head::lock or when used for wakeups * with an extra smp_mb() like:: * * CPU0 - waker CPU1 - waiter * * for (;;) { * @cond = true; prepare_to_wait(&wq_head, &wait, state); * smp_mb(); // smp_mb() from set_current_state() * if (waitqueue_active(wq_head)) if (@cond) * wake_up(wq_head); break; * schedule(); * } * finish_wait(&wq_head, &wait); * * Because without the explicit smp_mb() it's possible for the * waitqueue_active() load to get hoisted over the @cond store such that we'll * observe an empty wait list while the waiter might not observe @cond. * * Also note that this 'optimization' trades a spin_lock() for an smp_mb(), * which (when the lock is uncontended) are of roughly equal cost. */ static inline int waitqueue_active(struct wait_queue_head *wq_head) { return !list_empty(&wq_head->head); } /** * wq_has_single_sleeper - check if there is only one sleeper * @wq_head: wait queue head * * Returns true of wq_head has only one sleeper on the list. * * Please refer to the comment for waitqueue_active. */ static inline bool wq_has_single_sleeper(struct wait_queue_head *wq_head) { return list_is_singular(&wq_head->head); } /** * wq_has_sleeper - check if there are any waiting processes * @wq_head: wait queue head * * Returns true if wq_head has waiting processes * * Please refer to the comment for waitqueue_active. */ static inline bool wq_has_sleeper(struct wait_queue_head *wq_head) { /* * We need to be sure we are in sync with the * add_wait_queue modifications to the wait queue. * * This memory barrier should be paired with one on the * waiting side. */ smp_mb(); return waitqueue_active(wq_head); } extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { struct list_head *head = &wq_head->head; struct wait_queue_entry *wq; list_for_each_entry(wq, &wq_head->head, entry) { if (!(wq->flags & WQ_FLAG_PRIORITY)) break; head = &wq->entry; } list_add(&wq_entry->entry, head); } /* * Used for wake-one threads: */ static inline void __add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { wq_entry->flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue(wq_head, wq_entry); } static inline void __add_wait_queue_entry_tail(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { list_add_tail(&wq_entry->entry, &wq_head->head); } static inline void __add_wait_queue_entry_tail_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { wq_entry->flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue_entry_tail(wq_head, wq_entry); } static inline void __remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { list_del(&wq_entry->entry); } int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key); void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head, unsigned int mode, void *key, wait_queue_entry_t *bookmark); void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr); void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode); void __wake_up_pollfree(struct wait_queue_head *wq_head); #define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL) #define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL) #define wake_up_all(x) __wake_up(x, TASK_NORMAL, 0, NULL) #define wake_up_locked(x) __wake_up_locked((x), TASK_NORMAL, 1) #define wake_up_all_locked(x) __wake_up_locked((x), TASK_NORMAL, 0) #define wake_up_interruptible(x) __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL) #define wake_up_interruptible_nr(x, nr) __wake_up(x, TASK_INTERRUPTIBLE, nr, NULL) #define wake_up_interruptible_all(x) __wake_up(x, TASK_INTERRUPTIBLE, 0, NULL) #define wake_up_interruptible_sync(x) __wake_up_sync((x), TASK_INTERRUPTIBLE) /* * Wakeup macros to be used to report events to the targets. */ #define poll_to_key(m) ((void *)(__force uintptr_t)(__poll_t)(m)) #define key_to_poll(m) ((__force __poll_t)(uintptr_t)(void *)(m)) #define wake_up_poll(x, m) \ __wake_up(x, TASK_NORMAL, 1, poll_to_key(m)) #define wake_up_poll_on_current_cpu(x, m) \ __wake_up_on_current_cpu(x, TASK_NORMAL, poll_to_key(m)) #define wake_up_locked_poll(x, m) \ __wake_up_locked_key((x), TASK_NORMAL, poll_to_key(m)) #define wake_up_interruptible_poll(x, m) \ __wake_up(x, TASK_INTERRUPTIBLE, 1, poll_to_key(m)) #define wake_up_interruptible_sync_poll(x, m) \ __wake_up_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m)) #define wake_up_interruptible_sync_poll_locked(x, m) \ __wake_up_locked_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m)) /** * wake_up_pollfree - signal that a polled waitqueue is going away * @wq_head: the wait queue head * * In the very rare cases where a ->poll() implementation uses a waitqueue whose * lifetime is tied to a task rather than to the 'struct file' being polled, * this function must be called before the waitqueue is freed so that * non-blocking polls (e.g. epoll) are notified that the queue is going away. * * The caller must also RCU-delay the freeing of the wait_queue_head, e.g. via * an explicit synchronize_rcu() or call_rcu(), or via SLAB_TYPESAFE_BY_RCU. */ static inline void wake_up_pollfree(struct wait_queue_head *wq_head) { /* * For performance reasons, we don't always take the queue lock here. * Therefore, we might race with someone removing the last entry from * the queue, and proceed while they still hold the queue lock. * However, rcu_read_lock() is required to be held in such cases, so we * can safely proceed with an RCU-delayed free. */ if (waitqueue_active(wq_head)) __wake_up_pollfree(wq_head); } #define ___wait_cond_timeout(condition) \ ({ \ bool __cond = (condition); \ if (__cond && !__ret) \ __ret = 1; \ __cond || !__ret; \ }) #define ___wait_is_interruptible(state) \ (!__builtin_constant_p(state) || \ (state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL))) extern void init_wait_entry(struct wait_queue_entry *wq_entry, int flags); /* * The below macro ___wait_event() has an explicit shadow of the __ret * variable when used from the wait_event_*() macros. * * This is so that both can use the ___wait_cond_timeout() construct * to wrap the condition. * * The type inconsistency of the wait_event_*() __ret variable is also * on purpose; we use long where we can return timeout values and int * otherwise. */ #define ___wait_event(wq_head, condition, state, exclusive, ret, cmd) \ ({ \ __label__ __out; \ struct wait_queue_entry __wq_entry; \ long __ret = ret; /* explicit shadow */ \ \ init_wait_entry(&__wq_entry, exclusive ? WQ_FLAG_EXCLUSIVE : 0); \ for (;;) { \ long __int = prepare_to_wait_event(&wq_head, &__wq_entry, state);\ \ if (condition) \ break; \ \ if (___wait_is_interruptible(state) && __int) { \ __ret = __int; \ goto __out; \ } \ \ cmd; \ } \ finish_wait(&wq_head, &__wq_entry); \ __out: __ret; \ }) #define __wait_event(wq_head, condition) \ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ schedule()) /** * wait_event - sleep until a condition gets true * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. */ #define wait_event(wq_head, condition) \ do { \ might_sleep(); \ if (condition) \ break; \ __wait_event(wq_head, condition); \ } while (0) #define __io_wait_event(wq_head, condition) \ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ io_schedule()) /* * io_wait_event() -- like wait_event() but with io_schedule() */ #define io_wait_event(wq_head, condition) \ do { \ might_sleep(); \ if (condition) \ break; \ __io_wait_event(wq_head, condition); \ } while (0) #define __wait_event_freezable(wq_head, condition) \ ___wait_event(wq_head, condition, (TASK_INTERRUPTIBLE|TASK_FREEZABLE), \ 0, 0, schedule()) /** * wait_event_freezable - sleep (or freeze) until a condition gets true * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE -- so as not to contribute * to system load) until the @condition evaluates to true. The * @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. */ #define wait_event_freezable(wq_head, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_freezable(wq_head, condition); \ __ret; \ }) #define __wait_event_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ TASK_UNINTERRUPTIBLE, 0, timeout, \ __ret = schedule_timeout(__ret)) /** * wait_event_timeout - sleep until a condition gets true or a timeout elapses * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * Returns: * 0 if the @condition evaluated to %false after the @timeout elapsed, * 1 if the @condition evaluated to %true after the @timeout elapsed, * or the remaining jiffies (at least 1) if the @condition evaluated * to %true before the @timeout elapsed. */ #define wait_event_timeout(wq_head, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_timeout(wq_head, condition, timeout); \ __ret; \ }) #define __wait_event_freezable_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ (TASK_INTERRUPTIBLE|TASK_FREEZABLE), 0, timeout, \ __ret = schedule_timeout(__ret)) /* * like wait_event_timeout() -- except it uses TASK_INTERRUPTIBLE to avoid * increasing load and is freezable. */ #define wait_event_freezable_timeout(wq_head, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_freezable_timeout(wq_head, condition, timeout); \ __ret; \ }) #define __wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2) \ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 1, 0, \ cmd1; schedule(); cmd2) /* * Just like wait_event_cmd(), except it sets exclusive flag */ #define wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2) \ do { \ if (condition) \ break; \ __wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2); \ } while (0) #define __wait_event_cmd(wq_head, condition, cmd1, cmd2) \ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ cmd1; schedule(); cmd2) /** * wait_event_cmd - sleep until a condition gets true * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @cmd1: the command will be executed before sleep * @cmd2: the command will be executed after sleep * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. */ #define wait_event_cmd(wq_head, condition, cmd1, cmd2) \ do { \ if (condition) \ break; \ __wait_event_cmd(wq_head, condition, cmd1, cmd2); \ } while (0) #define __wait_event_interruptible(wq_head, condition) \ ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0, \ schedule()) /** * wait_event_interruptible - sleep until a condition gets true * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_interruptible(wq_head, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_interruptible(wq_head, condition); \ __ret; \ }) #define __wait_event_interruptible_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ TASK_INTERRUPTIBLE, 0, timeout, \ __ret = schedule_timeout(__ret)) /** * wait_event_interruptible_timeout - sleep until a condition gets true or a timeout elapses * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * Returns: * 0 if the @condition evaluated to %false after the @timeout elapsed, * 1 if the @condition evaluated to %true after the @timeout elapsed, * the remaining jiffies (at least 1) if the @condition evaluated * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was * interrupted by a signal. */ #define wait_event_interruptible_timeout(wq_head, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_interruptible_timeout(wq_head, \ condition, timeout); \ __ret; \ }) #define __wait_event_hrtimeout(wq_head, condition, timeout, state) \ ({ \ int __ret = 0; \ struct hrtimer_sleeper __t; \ \ hrtimer_init_sleeper_on_stack(&__t, CLOCK_MONOTONIC, \ HRTIMER_MODE_REL); \ if ((timeout) != KTIME_MAX) { \ hrtimer_set_expires_range_ns(&__t.timer, timeout, \ current->timer_slack_ns); \ hrtimer_sleeper_start_expires(&__t, HRTIMER_MODE_REL); \ } \ \ __ret = ___wait_event(wq_head, condition, state, 0, 0, \ if (!__t.task) { \ __ret = -ETIME; \ break; \ } \ schedule()); \ \ hrtimer_cancel(&__t.timer); \ destroy_hrtimer_on_stack(&__t.timer); \ __ret; \ }) /** * wait_event_hrtimeout - sleep until a condition gets true or a timeout elapses * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, as a ktime_t * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * The function returns 0 if @condition became true, or -ETIME if the timeout * elapsed. */ #define wait_event_hrtimeout(wq_head, condition, timeout) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_hrtimeout(wq_head, condition, timeout, \ TASK_UNINTERRUPTIBLE); \ __ret; \ }) /** * wait_event_interruptible_hrtimeout - sleep until a condition gets true or a timeout elapses * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, as a ktime_t * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * The function returns 0 if @condition became true, -ERESTARTSYS if it was * interrupted by a signal, or -ETIME if the timeout elapsed. */ #define wait_event_interruptible_hrtimeout(wq, condition, timeout) \ ({ \ long __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_hrtimeout(wq, condition, timeout, \ TASK_INTERRUPTIBLE); \ __ret; \ }) #define __wait_event_interruptible_exclusive(wq, condition) \ ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \ schedule()) #define wait_event_interruptible_exclusive(wq, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_interruptible_exclusive(wq, condition); \ __ret; \ }) #define __wait_event_killable_exclusive(wq, condition) \ ___wait_event(wq, condition, TASK_KILLABLE, 1, 0, \ schedule()) #define wait_event_killable_exclusive(wq, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_killable_exclusive(wq, condition); \ __ret; \ }) #define __wait_event_freezable_exclusive(wq, condition) \ ___wait_event(wq, condition, (TASK_INTERRUPTIBLE|TASK_FREEZABLE), 1, 0,\ schedule()) #define wait_event_freezable_exclusive(wq, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_freezable_exclusive(wq, condition); \ __ret; \ }) /** * wait_event_idle - wait for a condition without contributing to system load * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_IDLE) until the * @condition evaluates to true. * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * */ #define wait_event_idle(wq_head, condition) \ do { \ might_sleep(); \ if (!(condition)) \ ___wait_event(wq_head, condition, TASK_IDLE, 0, 0, schedule()); \ } while (0) /** * wait_event_idle_exclusive - wait for a condition with contributing to system load * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_IDLE) until the * @condition evaluates to true. * The @condition is checked each time the waitqueue @wq_head is woken up. * * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag * set thus if other processes wait on the same list, when this * process is woken further processes are not considered. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * */ #define wait_event_idle_exclusive(wq_head, condition) \ do { \ might_sleep(); \ if (!(condition)) \ ___wait_event(wq_head, condition, TASK_IDLE, 1, 0, schedule()); \ } while (0) #define __wait_event_idle_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ TASK_IDLE, 0, timeout, \ __ret = schedule_timeout(__ret)) /** * wait_event_idle_timeout - sleep without load until a condition becomes true or a timeout elapses * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_IDLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * Returns: * 0 if the @condition evaluated to %false after the @timeout elapsed, * 1 if the @condition evaluated to %true after the @timeout elapsed, * or the remaining jiffies (at least 1) if the @condition evaluated * to %true before the @timeout elapsed. */ #define wait_event_idle_timeout(wq_head, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_idle_timeout(wq_head, condition, timeout); \ __ret; \ }) #define __wait_event_idle_exclusive_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ TASK_IDLE, 1, timeout, \ __ret = schedule_timeout(__ret)) /** * wait_event_idle_exclusive_timeout - sleep without load until a condition becomes true or a timeout elapses * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_IDLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag * set thus if other processes wait on the same list, when this * process is woken further processes are not considered. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * Returns: * 0 if the @condition evaluated to %false after the @timeout elapsed, * 1 if the @condition evaluated to %true after the @timeout elapsed, * or the remaining jiffies (at least 1) if the @condition evaluated * to %true before the @timeout elapsed. */ #define wait_event_idle_exclusive_timeout(wq_head, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_idle_exclusive_timeout(wq_head, condition, timeout);\ __ret; \ }) extern int do_wait_intr(wait_queue_head_t *, wait_queue_entry_t *); extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *); #define __wait_event_interruptible_locked(wq, condition, exclusive, fn) \ ({ \ int __ret; \ DEFINE_WAIT(__wait); \ if (exclusive) \ __wait.flags |= WQ_FLAG_EXCLUSIVE; \ do { \ __ret = fn(&(wq), &__wait); \ if (__ret) \ break; \ } while (!(condition)); \ __remove_wait_queue(&(wq), &__wait); \ __set_current_state(TASK_RUNNING); \ __ret; \ }) /** * wait_event_interruptible_locked - sleep until a condition gets true * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq is woken up. * * It must be called with wq.lock being held. This spinlock is * unlocked while sleeping but @condition testing is done while lock * is held and when this macro exits the lock is held. * * The lock is locked/unlocked using spin_lock()/spin_unlock() * functions which must match the way they are locked/unlocked outside * of this macro. * * wake_up_locked() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_interruptible_locked(wq, condition) \ ((condition) \ ? 0 : __wait_event_interruptible_locked(wq, condition, 0, do_wait_intr)) /** * wait_event_interruptible_locked_irq - sleep until a condition gets true * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq is woken up. * * It must be called with wq.lock being held. This spinlock is * unlocked while sleeping but @condition testing is done while lock * is held and when this macro exits the lock is held. * * The lock is locked/unlocked using spin_lock_irq()/spin_unlock_irq() * functions which must match the way they are locked/unlocked outside * of this macro. * * wake_up_locked() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_interruptible_locked_irq(wq, condition) \ ((condition) \ ? 0 : __wait_event_interruptible_locked(wq, condition, 0, do_wait_intr_irq)) /** * wait_event_interruptible_exclusive_locked - sleep exclusively until a condition gets true * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq is woken up. * * It must be called with wq.lock being held. This spinlock is * unlocked while sleeping but @condition testing is done while lock * is held and when this macro exits the lock is held. * * The lock is locked/unlocked using spin_lock()/spin_unlock() * functions which must match the way they are locked/unlocked outside * of this macro. * * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag * set thus when other process waits process on the list if this * process is awaken further processes are not considered. * * wake_up_locked() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_interruptible_exclusive_locked(wq, condition) \ ((condition) \ ? 0 : __wait_event_interruptible_locked(wq, condition, 1, do_wait_intr)) /** * wait_event_interruptible_exclusive_locked_irq - sleep until a condition gets true * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq is woken up. * * It must be called with wq.lock being held. This spinlock is * unlocked while sleeping but @condition testing is done while lock * is held and when this macro exits the lock is held. * * The lock is locked/unlocked using spin_lock_irq()/spin_unlock_irq() * functions which must match the way they are locked/unlocked outside * of this macro. * * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag * set thus when other process waits process on the list if this * process is awaken further processes are not considered. * * wake_up_locked() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_interruptible_exclusive_locked_irq(wq, condition) \ ((condition) \ ? 0 : __wait_event_interruptible_locked(wq, condition, 1, do_wait_intr_irq)) #define __wait_event_killable(wq, condition) \ ___wait_event(wq, condition, TASK_KILLABLE, 0, 0, schedule()) /** * wait_event_killable - sleep until a condition gets true * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_KILLABLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_killable(wq_head, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_killable(wq_head, condition); \ __ret; \ }) #define __wait_event_state(wq, condition, state) \ ___wait_event(wq, condition, state, 0, 0, schedule()) /** * wait_event_state - sleep until a condition gets true * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @state: state to sleep in * * The process is put to sleep (@state) until the @condition evaluates to true * or a signal is received (when allowed by @state). The @condition is checked * each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a signal * (when allowed by @state) and 0 if @condition evaluated to true. */ #define wait_event_state(wq_head, condition, state) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_state(wq_head, condition, state); \ __ret; \ }) #define __wait_event_killable_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ TASK_KILLABLE, 0, timeout, \ __ret = schedule_timeout(__ret)) /** * wait_event_killable_timeout - sleep until a condition gets true or a timeout elapses * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_KILLABLE) until the * @condition evaluates to true or a kill signal is received. * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * Returns: * 0 if the @condition evaluated to %false after the @timeout elapsed, * 1 if the @condition evaluated to %true after the @timeout elapsed, * the remaining jiffies (at least 1) if the @condition evaluated * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was * interrupted by a kill signal. * * Only kill signals interrupt this process. */ #define wait_event_killable_timeout(wq_head, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_killable_timeout(wq_head, \ condition, timeout); \ __ret; \ }) #define __wait_event_lock_irq(wq_head, condition, lock, cmd) \ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ spin_unlock_irq(&lock); \ cmd; \ schedule(); \ spin_lock_irq(&lock)) /** * wait_event_lock_irq_cmd - sleep until a condition gets true. The * condition is checked under the lock. This * is expected to be called with the lock * taken. * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before cmd * and schedule() and reacquired afterwards. * @cmd: a command which is invoked outside the critical section before * sleep * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * This is supposed to be called while holding the lock. The lock is * dropped before invoking the cmd and going to sleep and is reacquired * afterwards. */ #define wait_event_lock_irq_cmd(wq_head, condition, lock, cmd) \ do { \ if (condition) \ break; \ __wait_event_lock_irq(wq_head, condition, lock, cmd); \ } while (0) /** * wait_event_lock_irq - sleep until a condition gets true. The * condition is checked under the lock. This * is expected to be called with the lock * taken. * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before schedule() * and reacquired afterwards. * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * This is supposed to be called while holding the lock. The lock is * dropped before going to sleep and is reacquired afterwards. */ #define wait_event_lock_irq(wq_head, condition, lock) \ do { \ if (condition) \ break; \ __wait_event_lock_irq(wq_head, condition, lock, ); \ } while (0) #define __wait_event_interruptible_lock_irq(wq_head, condition, lock, cmd) \ ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0, \ spin_unlock_irq(&lock); \ cmd; \ schedule(); \ spin_lock_irq(&lock)) /** * wait_event_interruptible_lock_irq_cmd - sleep until a condition gets true. * The condition is checked under the lock. This is expected to * be called with the lock taken. * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before cmd and * schedule() and reacquired afterwards. * @cmd: a command which is invoked outside the critical section before * sleep * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. The @condition is * checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * This is supposed to be called while holding the lock. The lock is * dropped before invoking the cmd and going to sleep and is reacquired * afterwards. * * The macro will return -ERESTARTSYS if it was interrupted by a signal * and 0 if @condition evaluated to true. */ #define wait_event_interruptible_lock_irq_cmd(wq_head, condition, lock, cmd) \ ({ \ int __ret = 0; \ if (!(condition)) \ __ret = __wait_event_interruptible_lock_irq(wq_head, \ condition, lock, cmd); \ __ret; \ }) /** * wait_event_interruptible_lock_irq - sleep until a condition gets true. * The condition is checked under the lock. This is expected * to be called with the lock taken. * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before schedule() * and reacquired afterwards. * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or signal is received. The @condition is * checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * This is supposed to be called while holding the lock. The lock is * dropped before going to sleep and is reacquired afterwards. * * The macro will return -ERESTARTSYS if it was interrupted by a signal * and 0 if @condition evaluated to true. */ #define wait_event_interruptible_lock_irq(wq_head, condition, lock) \ ({ \ int __ret = 0; \ if (!(condition)) \ __ret = __wait_event_interruptible_lock_irq(wq_head, \ condition, lock,); \ __ret; \ }) #define __wait_event_lock_irq_timeout(wq_head, condition, lock, timeout, state) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ state, 0, timeout, \ spin_unlock_irq(&lock); \ __ret = schedule_timeout(__ret); \ spin_lock_irq(&lock)); /** * wait_event_interruptible_lock_irq_timeout - sleep until a condition gets * true or a timeout elapses. The condition is checked under * the lock. This is expected to be called with the lock taken. * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before schedule() * and reacquired afterwards. * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or signal is received. The @condition is * checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * This is supposed to be called while holding the lock. The lock is * dropped before going to sleep and is reacquired afterwards. * * The function returns 0 if the @timeout elapsed, -ERESTARTSYS if it * was interrupted by a signal, and the remaining jiffies otherwise * if the condition evaluated to true before the timeout elapsed. */ #define wait_event_interruptible_lock_irq_timeout(wq_head, condition, lock, \ timeout) \ ({ \ long __ret = timeout; \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_lock_irq_timeout( \ wq_head, condition, lock, timeout, \ TASK_INTERRUPTIBLE); \ __ret; \ }) #define wait_event_lock_irq_timeout(wq_head, condition, lock, timeout) \ ({ \ long __ret = timeout; \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_lock_irq_timeout( \ wq_head, condition, lock, timeout, \ TASK_UNINTERRUPTIBLE); \ __ret; \ }) /* * Waitqueues which are removed from the waitqueue_head at wakeup time */ void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout); int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key); int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key); #define DEFINE_WAIT_FUNC(name, function) \ struct wait_queue_entry name = { \ .private = current, \ .func = function, \ .entry = LIST_HEAD_INIT((name).entry), \ } #define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function) #define init_wait(wait) \ do { \ (wait)->private = current; \ (wait)->func = autoremove_wake_function; \ INIT_LIST_HEAD(&(wait)->entry); \ (wait)->flags = 0; \ } while (0) typedef int (*task_call_f)(struct task_struct *p, void *arg); extern int task_call_func(struct task_struct *p, task_call_f func, void *arg); #endif /* _LINUX_WAIT_H */ |
| 1 4 4 3 4 4 14 14 14 14 14 14 14 103 134 113 2 2 21 17 130 100 5 98 100 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2001 Paul Stewart * Copyright (c) 2001 Vojtech Pavlik * * HID char devices, giving access to raw HID device events. */ /* * * Should you need to contact me, the author, you can do so either by * e-mail - mail your message to Paul Stewart <stewart@wetlogic.net> */ #include <linux/poll.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/module.h> #include <linux/init.h> #include <linux/input.h> #include <linux/usb.h> #include <linux/hid.h> #include <linux/hiddev.h> #include <linux/compat.h> #include <linux/vmalloc.h> #include <linux/nospec.h> #include "usbhid.h" #ifdef CONFIG_USB_DYNAMIC_MINORS #define HIDDEV_MINOR_BASE 0 #define HIDDEV_MINORS 256 #else #define HIDDEV_MINOR_BASE 96 #define HIDDEV_MINORS 16 #endif #define HIDDEV_BUFFER_SIZE 2048 struct hiddev_list { struct hiddev_usage_ref buffer[HIDDEV_BUFFER_SIZE]; int head; int tail; unsigned flags; struct fasync_struct *fasync; struct hiddev *hiddev; struct list_head node; struct mutex thread_lock; }; /* * Find a report, given the report's type and ID. The ID can be specified * indirectly by REPORT_ID_FIRST (which returns the first report of the given * type) or by (REPORT_ID_NEXT | old_id), which returns the next report of the * given type which follows old_id. */ static struct hid_report * hiddev_lookup_report(struct hid_device *hid, struct hiddev_report_info *rinfo) { unsigned int flags = rinfo->report_id & ~HID_REPORT_ID_MASK; unsigned int rid = rinfo->report_id & HID_REPORT_ID_MASK; struct hid_report_enum *report_enum; struct hid_report *report; struct list_head *list; if (rinfo->report_type < HID_REPORT_TYPE_MIN || rinfo->report_type > HID_REPORT_TYPE_MAX) return NULL; report_enum = hid->report_enum + (rinfo->report_type - HID_REPORT_TYPE_MIN); switch (flags) { case 0: /* Nothing to do -- report_id is already set correctly */ break; case HID_REPORT_ID_FIRST: if (list_empty(&report_enum->report_list)) return NULL; list = report_enum->report_list.next; report = list_entry(list, struct hid_report, list); rinfo->report_id = report->id; break; case HID_REPORT_ID_NEXT: report = report_enum->report_id_hash[rid]; if (!report) return NULL; list = report->list.next; if (list == &report_enum->report_list) return NULL; report = list_entry(list, struct hid_report, list); rinfo->report_id = report->id; break; default: return NULL; } return report_enum->report_id_hash[rinfo->report_id]; } /* * Perform an exhaustive search of the report table for a usage, given its * type and usage id. */ static struct hid_field * hiddev_lookup_usage(struct hid_device *hid, struct hiddev_usage_ref *uref) { int i, j; struct hid_report *report; struct hid_report_enum *report_enum; struct hid_field *field; if (uref->report_type < HID_REPORT_TYPE_MIN || uref->report_type > HID_REPORT_TYPE_MAX) return NULL; report_enum = hid->report_enum + (uref->report_type - HID_REPORT_TYPE_MIN); list_for_each_entry(report, &report_enum->report_list, list) { for (i = 0; i < report->maxfield; i++) { field = report->field[i]; for (j = 0; j < field->maxusage; j++) { if (field->usage[j].hid == uref->usage_code) { uref->report_id = report->id; uref->field_index = i; uref->usage_index = j; return field; } } } } return NULL; } static void hiddev_send_event(struct hid_device *hid, struct hiddev_usage_ref *uref) { struct hiddev *hiddev = hid->hiddev; struct hiddev_list *list; unsigned long flags; spin_lock_irqsave(&hiddev->list_lock, flags); list_for_each_entry(list, &hiddev->list, node) { if (uref->field_index != HID_FIELD_INDEX_NONE || (list->flags & HIDDEV_FLAG_REPORT) != 0) { list->buffer[list->head] = *uref; list->head = (list->head + 1) & (HIDDEV_BUFFER_SIZE - 1); kill_fasync(&list->fasync, SIGIO, POLL_IN); } } spin_unlock_irqrestore(&hiddev->list_lock, flags); wake_up_interruptible(&hiddev->wait); } /* * This is where hid.c calls into hiddev to pass an event that occurred over * the interrupt pipe */ void hiddev_hid_event(struct hid_device *hid, struct hid_field *field, struct hid_usage *usage, __s32 value) { unsigned type = field->report_type; struct hiddev_usage_ref uref; uref.report_type = (type == HID_INPUT_REPORT) ? HID_REPORT_TYPE_INPUT : ((type == HID_OUTPUT_REPORT) ? HID_REPORT_TYPE_OUTPUT : ((type == HID_FEATURE_REPORT) ? HID_REPORT_TYPE_FEATURE : 0)); uref.report_id = field->report->id; uref.field_index = field->index; uref.usage_index = (usage - field->usage); uref.usage_code = usage->hid; uref.value = value; hiddev_send_event(hid, &uref); } EXPORT_SYMBOL_GPL(hiddev_hid_event); void hiddev_report_event(struct hid_device *hid, struct hid_report *report) { unsigned type = report->type; struct hiddev_usage_ref uref; memset(&uref, 0, sizeof(uref)); uref.report_type = (type == HID_INPUT_REPORT) ? HID_REPORT_TYPE_INPUT : ((type == HID_OUTPUT_REPORT) ? HID_REPORT_TYPE_OUTPUT : ((type == HID_FEATURE_REPORT) ? HID_REPORT_TYPE_FEATURE : 0)); uref.report_id = report->id; uref.field_index = HID_FIELD_INDEX_NONE; hiddev_send_event(hid, &uref); } /* * fasync file op */ static int hiddev_fasync(int fd, struct file *file, int on) { struct hiddev_list *list = file->private_data; return fasync_helper(fd, file, on, &list->fasync); } /* * release file op */ static int hiddev_release(struct inode * inode, struct file * file) { struct hiddev_list *list = file->private_data; unsigned long flags; spin_lock_irqsave(&list->hiddev->list_lock, flags); list_del(&list->node); spin_unlock_irqrestore(&list->hiddev->list_lock, flags); mutex_lock(&list->hiddev->existancelock); if (!--list->hiddev->open) { if (list->hiddev->exist) { hid_hw_close(list->hiddev->hid); hid_hw_power(list->hiddev->hid, PM_HINT_NORMAL); } else { mutex_unlock(&list->hiddev->existancelock); kfree(list->hiddev); vfree(list); return 0; } } mutex_unlock(&list->hiddev->existancelock); vfree(list); return 0; } static int __hiddev_open(struct hiddev *hiddev, struct file *file) { struct hiddev_list *list; int error; lockdep_assert_held(&hiddev->existancelock); list = vzalloc(sizeof(*list)); if (!list) return -ENOMEM; mutex_init(&list->thread_lock); list->hiddev = hiddev; if (!hiddev->open++) { error = hid_hw_power(hiddev->hid, PM_HINT_FULLON); if (error < 0) goto err_drop_count; error = hid_hw_open(hiddev->hid); if (error < 0) goto err_normal_power; } spin_lock_irq(&hiddev->list_lock); list_add_tail(&list->node, &hiddev->list); spin_unlock_irq(&hiddev->list_lock); file->private_data = list; return 0; err_normal_power: hid_hw_power(hiddev->hid, PM_HINT_NORMAL); err_drop_count: hiddev->open--; vfree(list); return error; } /* * open file op */ static int hiddev_open(struct inode *inode, struct file *file) { struct usb_interface *intf; struct hid_device *hid; struct hiddev *hiddev; int res; intf = usbhid_find_interface(iminor(inode)); if (!intf) return -ENODEV; hid = usb_get_intfdata(intf); hiddev = hid->hiddev; mutex_lock(&hiddev->existancelock); res = hiddev->exist ? __hiddev_open(hiddev, file) : -ENODEV; mutex_unlock(&hiddev->existancelock); return res; } /* * "write" file op */ static ssize_t hiddev_write(struct file * file, const char __user * buffer, size_t count, loff_t *ppos) { return -EINVAL; } /* * "read" file op */ static ssize_t hiddev_read(struct file * file, char __user * buffer, size_t count, loff_t *ppos) { DEFINE_WAIT(wait); struct hiddev_list *list = file->private_data; int event_size; int retval; event_size = ((list->flags & HIDDEV_FLAG_UREF) != 0) ? sizeof(struct hiddev_usage_ref) : sizeof(struct hiddev_event); if (count < event_size) return 0; /* lock against other threads */ retval = mutex_lock_interruptible(&list->thread_lock); if (retval) return -ERESTARTSYS; while (retval == 0) { if (list->head == list->tail) { prepare_to_wait(&list->hiddev->wait, &wait, TASK_INTERRUPTIBLE); while (list->head == list->tail) { if (signal_pending(current)) { retval = -ERESTARTSYS; break; } if (!list->hiddev->exist) { retval = -EIO; break; } if (file->f_flags & O_NONBLOCK) { retval = -EAGAIN; break; } /* let O_NONBLOCK tasks run */ mutex_unlock(&list->thread_lock); schedule(); if (mutex_lock_interruptible(&list->thread_lock)) { finish_wait(&list->hiddev->wait, &wait); return -EINTR; } set_current_state(TASK_INTERRUPTIBLE); } finish_wait(&list->hiddev->wait, &wait); } if (retval) { mutex_unlock(&list->thread_lock); return retval; } while (list->head != list->tail && retval + event_size <= count) { if ((list->flags & HIDDEV_FLAG_UREF) == 0) { if (list->buffer[list->tail].field_index != HID_FIELD_INDEX_NONE) { struct hiddev_event event; event.hid = list->buffer[list->tail].usage_code; event.value = list->buffer[list->tail].value; if (copy_to_user(buffer + retval, &event, sizeof(struct hiddev_event))) { mutex_unlock(&list->thread_lock); return -EFAULT; } retval += sizeof(struct hiddev_event); } } else { if (list->buffer[list->tail].field_index != HID_FIELD_INDEX_NONE || (list->flags & HIDDEV_FLAG_REPORT) != 0) { if (copy_to_user(buffer + retval, list->buffer + list->tail, sizeof(struct hiddev_usage_ref))) { mutex_unlock(&list->thread_lock); return -EFAULT; } retval += sizeof(struct hiddev_usage_ref); } } list->tail = (list->tail + 1) & (HIDDEV_BUFFER_SIZE - 1); } } mutex_unlock(&list->thread_lock); return retval; } /* * "poll" file op * No kernel lock - fine */ static __poll_t hiddev_poll(struct file *file, poll_table *wait) { struct hiddev_list *list = file->private_data; poll_wait(file, &list->hiddev->wait, wait); if (list->head != list->tail) return EPOLLIN | EPOLLRDNORM | EPOLLOUT; if (!list->hiddev->exist) return EPOLLERR | EPOLLHUP; return 0; } /* * "ioctl" file op */ static noinline int hiddev_ioctl_usage(struct hiddev *hiddev, unsigned int cmd, void __user *user_arg) { struct hid_device *hid = hiddev->hid; struct hiddev_report_info rinfo; struct hiddev_usage_ref_multi *uref_multi = NULL; struct hiddev_usage_ref *uref; struct hid_report *report; struct hid_field *field; int i; uref_multi = kmalloc(sizeof(struct hiddev_usage_ref_multi), GFP_KERNEL); if (!uref_multi) return -ENOMEM; uref = &uref_multi->uref; if (cmd == HIDIOCGUSAGES || cmd == HIDIOCSUSAGES) { if (copy_from_user(uref_multi, user_arg, sizeof(*uref_multi))) goto fault; } else { if (copy_from_user(uref, user_arg, sizeof(*uref))) goto fault; } switch (cmd) { case HIDIOCGUCODE: rinfo.report_type = uref->report_type; rinfo.report_id = uref->report_id; if ((report = hiddev_lookup_report(hid, &rinfo)) == NULL) goto inval; if (uref->field_index >= report->maxfield) goto inval; uref->field_index = array_index_nospec(uref->field_index, report->maxfield); field = report->field[uref->field_index]; if (uref->usage_index >= field->maxusage) goto inval; uref->usage_index = array_index_nospec(uref->usage_index, field->maxusage); uref->usage_code = field->usage[uref->usage_index].hid; if (copy_to_user(user_arg, uref, sizeof(*uref))) goto fault; goto goodreturn; default: if (cmd != HIDIOCGUSAGE && cmd != HIDIOCGUSAGES && uref->report_type == HID_REPORT_TYPE_INPUT) goto inval; if (uref->report_id == HID_REPORT_ID_UNKNOWN) { field = hiddev_lookup_usage(hid, uref); if (field == NULL) goto inval; } else { rinfo.report_type = uref->report_type; rinfo.report_id = uref->report_id; if ((report = hiddev_lookup_report(hid, &rinfo)) == NULL) goto inval; if (uref->field_index >= report->maxfield) goto inval; uref->field_index = array_index_nospec(uref->field_index, report->maxfield); field = report->field[uref->field_index]; if (cmd == HIDIOCGCOLLECTIONINDEX) { if (uref->usage_index >= field->maxusage) goto inval; uref->usage_index = array_index_nospec(uref->usage_index, field->maxusage); } else if (uref->usage_index >= field->report_count) goto inval; } if (cmd == HIDIOCGUSAGES || cmd == HIDIOCSUSAGES) { if (uref_multi->num_values > HID_MAX_MULTI_USAGES || uref->usage_index + uref_multi->num_values > field->report_count) goto inval; uref->usage_index = array_index_nospec(uref->usage_index, field->report_count - uref_multi->num_values); } switch (cmd) { case HIDIOCGUSAGE: if (uref->usage_index >= field->report_count) goto inval; uref->value = field->value[uref->usage_index]; if (copy_to_user(user_arg, uref, sizeof(*uref))) goto fault; goto goodreturn; case HIDIOCSUSAGE: if (uref->usage_index >= field->report_count) goto inval; field->value[uref->usage_index] = uref->value; goto goodreturn; case HIDIOCGCOLLECTIONINDEX: i = field->usage[uref->usage_index].collection_index; kfree(uref_multi); return i; case HIDIOCGUSAGES: for (i = 0; i < uref_multi->num_values; i++) uref_multi->values[i] = field->value[uref->usage_index + i]; if (copy_to_user(user_arg, uref_multi, sizeof(*uref_multi))) goto fault; goto goodreturn; case HIDIOCSUSAGES: for (i = 0; i < uref_multi->num_values; i++) field->value[uref->usage_index + i] = uref_multi->values[i]; goto goodreturn; } goodreturn: kfree(uref_multi); return 0; fault: kfree(uref_multi); return -EFAULT; inval: kfree(uref_multi); return -EINVAL; } } static noinline int hiddev_ioctl_string(struct hiddev *hiddev, unsigned int cmd, void __user *user_arg) { struct hid_device *hid = hiddev->hid; struct usb_device *dev = hid_to_usb_dev(hid); int idx, len; char *buf; if (get_user(idx, (int __user *)user_arg)) return -EFAULT; if ((buf = kmalloc(HID_STRING_SIZE, GFP_KERNEL)) == NULL) return -ENOMEM; if ((len = usb_string(dev, idx, buf, HID_STRING_SIZE-1)) < 0) { kfree(buf); return -EINVAL; } if (copy_to_user(user_arg+sizeof(int), buf, len+1)) { kfree(buf); return -EFAULT; } kfree(buf); return len; } static long hiddev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct hiddev_list *list = file->private_data; struct hiddev *hiddev = list->hiddev; struct hid_device *hid; struct hiddev_collection_info cinfo; struct hiddev_report_info rinfo; struct hiddev_field_info finfo; struct hiddev_devinfo dinfo; struct hid_report *report; struct hid_field *field; void __user *user_arg = (void __user *)arg; int i, r = -EINVAL; /* Called without BKL by compat methods so no BKL taken */ mutex_lock(&hiddev->existancelock); if (!hiddev->exist) { r = -ENODEV; goto ret_unlock; } hid = hiddev->hid; switch (cmd) { case HIDIOCGVERSION: r = put_user(HID_VERSION, (int __user *)arg) ? -EFAULT : 0; break; case HIDIOCAPPLICATION: if (arg >= hid->maxapplication) break; for (i = 0; i < hid->maxcollection; i++) if (hid->collection[i].type == HID_COLLECTION_APPLICATION && arg-- == 0) break; if (i < hid->maxcollection) r = hid->collection[i].usage; break; case HIDIOCGDEVINFO: { struct usb_device *dev = hid_to_usb_dev(hid); struct usbhid_device *usbhid = hid->driver_data; memset(&dinfo, 0, sizeof(dinfo)); dinfo.bustype = BUS_USB; dinfo.busnum = dev->bus->busnum; dinfo.devnum = dev->devnum; dinfo.ifnum = usbhid->ifnum; dinfo.vendor = le16_to_cpu(dev->descriptor.idVendor); dinfo.product = le16_to_cpu(dev->descriptor.idProduct); dinfo.version = le16_to_cpu(dev->descriptor.bcdDevice); dinfo.num_applications = hid->maxapplication; r = copy_to_user(user_arg, &dinfo, sizeof(dinfo)) ? -EFAULT : 0; break; } case HIDIOCGFLAG: r = put_user(list->flags, (int __user *)arg) ? -EFAULT : 0; break; case HIDIOCSFLAG: { int newflags; if (get_user(newflags, (int __user *)arg)) { r = -EFAULT; break; } if ((newflags & ~HIDDEV_FLAGS) != 0 || ((newflags & HIDDEV_FLAG_REPORT) != 0 && (newflags & HIDDEV_FLAG_UREF) == 0)) break; list->flags = newflags; r = 0; break; } case HIDIOCGSTRING: r = hiddev_ioctl_string(hiddev, cmd, user_arg); break; case HIDIOCINITREPORT: usbhid_init_reports(hid); hiddev->initialized = true; r = 0; break; case HIDIOCGREPORT: if (copy_from_user(&rinfo, user_arg, sizeof(rinfo))) { r = -EFAULT; break; } if (rinfo.report_type == HID_REPORT_TYPE_OUTPUT) break; report = hiddev_lookup_report(hid, &rinfo); if (report == NULL) break; hid_hw_request(hid, report, HID_REQ_GET_REPORT); hid_hw_wait(hid); r = 0; break; case HIDIOCSREPORT: if (copy_from_user(&rinfo, user_arg, sizeof(rinfo))) { r = -EFAULT; break; } if (rinfo.report_type == HID_REPORT_TYPE_INPUT) break; report = hiddev_lookup_report(hid, &rinfo); if (report == NULL) break; hid_hw_request(hid, report, HID_REQ_SET_REPORT); hid_hw_wait(hid); r = 0; break; case HIDIOCGREPORTINFO: if (copy_from_user(&rinfo, user_arg, sizeof(rinfo))) { r = -EFAULT; break; } report = hiddev_lookup_report(hid, &rinfo); if (report == NULL) break; rinfo.num_fields = report->maxfield; r = copy_to_user(user_arg, &rinfo, sizeof(rinfo)) ? -EFAULT : 0; break; case HIDIOCGFIELDINFO: if (copy_from_user(&finfo, user_arg, sizeof(finfo))) { r = -EFAULT; break; } rinfo.report_type = finfo.report_type; rinfo.report_id = finfo.report_id; report = hiddev_lookup_report(hid, &rinfo); if (report == NULL) break; if (finfo.field_index >= report->maxfield) break; finfo.field_index = array_index_nospec(finfo.field_index, report->maxfield); field = report->field[finfo.field_index]; memset(&finfo, 0, sizeof(finfo)); finfo.report_type = rinfo.report_type; finfo.report_id = rinfo.report_id; finfo.field_index = field->report_count - 1; finfo.maxusage = field->maxusage; finfo.flags = field->flags; finfo.physical = field->physical; finfo.logical = field->logical; finfo.application = field->application; finfo.logical_minimum = field->logical_minimum; finfo.logical_maximum = field->logical_maximum; finfo.physical_minimum = field->physical_minimum; finfo.physical_maximum = field->physical_maximum; finfo.unit_exponent = field->unit_exponent; finfo.unit = field->unit; r = copy_to_user(user_arg, &finfo, sizeof(finfo)) ? -EFAULT : 0; break; case HIDIOCGUCODE: case HIDIOCGUSAGE: case HIDIOCSUSAGE: case HIDIOCGUSAGES: case HIDIOCSUSAGES: case HIDIOCGCOLLECTIONINDEX: if (!hiddev->initialized) { usbhid_init_reports(hid); hiddev->initialized = true; } r = hiddev_ioctl_usage(hiddev, cmd, user_arg); break; case HIDIOCGCOLLECTIONINFO: if (copy_from_user(&cinfo, user_arg, sizeof(cinfo))) { r = -EFAULT; break; } if (cinfo.index >= hid->maxcollection) break; cinfo.index = array_index_nospec(cinfo.index, hid->maxcollection); cinfo.type = hid->collection[cinfo.index].type; cinfo.usage = hid->collection[cinfo.index].usage; cinfo.level = hid->collection[cinfo.index].level; r = copy_to_user(user_arg, &cinfo, sizeof(cinfo)) ? -EFAULT : 0; break; default: if (_IOC_TYPE(cmd) != 'H' || _IOC_DIR(cmd) != _IOC_READ) break; if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGNAME(0))) { int len = strlen(hid->name) + 1; if (len > _IOC_SIZE(cmd)) len = _IOC_SIZE(cmd); r = copy_to_user(user_arg, hid->name, len) ? -EFAULT : len; break; } if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGPHYS(0))) { int len = strlen(hid->phys) + 1; if (len > _IOC_SIZE(cmd)) len = _IOC_SIZE(cmd); r = copy_to_user(user_arg, hid->phys, len) ? -EFAULT : len; break; } } ret_unlock: mutex_unlock(&hiddev->existancelock); return r; } static const struct file_operations hiddev_fops = { .owner = THIS_MODULE, .read = hiddev_read, .write = hiddev_write, .poll = hiddev_poll, .open = hiddev_open, .release = hiddev_release, .unlocked_ioctl = hiddev_ioctl, .fasync = hiddev_fasync, .compat_ioctl = compat_ptr_ioctl, .llseek = noop_llseek, }; static char *hiddev_devnode(const struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "usb/%s", dev_name(dev)); } static struct usb_class_driver hiddev_class = { .name = "hiddev%d", .devnode = hiddev_devnode, .fops = &hiddev_fops, .minor_base = HIDDEV_MINOR_BASE, }; /* * This is where hid.c calls us to connect a hid device to the hiddev driver */ int hiddev_connect(struct hid_device *hid, unsigned int force) { struct hiddev *hiddev; struct usbhid_device *usbhid = hid->driver_data; int retval; if (!force) { unsigned int i; for (i = 0; i < hid->maxcollection; i++) if (hid->collection[i].type == HID_COLLECTION_APPLICATION && !IS_INPUT_APPLICATION(hid->collection[i].usage)) break; if (i == hid->maxcollection) return -EINVAL; } if (!(hiddev = kzalloc(sizeof(struct hiddev), GFP_KERNEL))) return -ENOMEM; init_waitqueue_head(&hiddev->wait); INIT_LIST_HEAD(&hiddev->list); spin_lock_init(&hiddev->list_lock); mutex_init(&hiddev->existancelock); hid->hiddev = hiddev; hiddev->hid = hid; hiddev->exist = 1; retval = usb_register_dev(usbhid->intf, &hiddev_class); if (retval) { hid_err(hid, "Not able to get a minor for this device\n"); hid->hiddev = NULL; kfree(hiddev); return retval; } /* * If HID_QUIRK_NO_INIT_REPORTS is set, make sure we don't initialize * the reports. */ hiddev->initialized = hid->quirks & HID_QUIRK_NO_INIT_REPORTS; hiddev->minor = usbhid->intf->minor; return 0; } /* * This is where hid.c calls us to disconnect a hiddev device from the * corresponding hid device (usually because the usb device has disconnected) */ static struct usb_class_driver hiddev_class; void hiddev_disconnect(struct hid_device *hid) { struct hiddev *hiddev = hid->hiddev; struct usbhid_device *usbhid = hid->driver_data; usb_deregister_dev(usbhid->intf, &hiddev_class); mutex_lock(&hiddev->existancelock); hiddev->exist = 0; if (hiddev->open) { hid_hw_close(hiddev->hid); wake_up_interruptible(&hiddev->wait); mutex_unlock(&hiddev->existancelock); } else { mutex_unlock(&hiddev->existancelock); kfree(hiddev); } } |
| 2713 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2009 Chen Liqin <liqin.chen@sunplusct.com> * Copyright (C) 2012 Regents of the University of California */ #ifndef _ASM_RISCV_TLBFLUSH_H #define _ASM_RISCV_TLBFLUSH_H #include <linux/mm_types.h> #include <asm/smp.h> #include <asm/errata_list.h> #ifdef CONFIG_MMU extern unsigned long asid_mask; static inline void local_flush_tlb_all(void) { __asm__ __volatile__ ("sfence.vma" : : : "memory"); } /* Flush one page from local TLB */ static inline void local_flush_tlb_page(unsigned long addr) { ALT_FLUSH_TLB_PAGE(__asm__ __volatile__ ("sfence.vma %0" : : "r" (addr) : "memory")); } #else /* CONFIG_MMU */ #define local_flush_tlb_all() do { } while (0) #define local_flush_tlb_page(addr) do { } while (0) #endif /* CONFIG_MMU */ #if defined(CONFIG_SMP) && defined(CONFIG_MMU) void flush_tlb_all(void); void flush_tlb_mm(struct mm_struct *mm); void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr); void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE void flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); #endif #else /* CONFIG_SMP && CONFIG_MMU */ #define flush_tlb_all() local_flush_tlb_all() #define flush_tlb_page(vma, addr) local_flush_tlb_page(addr) static inline void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { local_flush_tlb_all(); } #define flush_tlb_mm(mm) flush_tlb_all() #endif /* !CONFIG_SMP || !CONFIG_MMU */ /* Flush a range of kernel pages */ static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end) { flush_tlb_all(); } #endif /* _ASM_RISCV_TLBFLUSH_H */ |
| 2 2 81 7 74 74 74 74 74 2 74 72 72 74 84 1 1 1 84 4 9 2 1 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 | // SPDX-License-Identifier: GPL-2.0-or-later /* * HID driver for Kye/Genius devices not fully compliant with HID standard * * Copyright (c) 2009 Jiri Kosina * Copyright (c) 2009 Tomas Hanak * Copyright (c) 2012 Nikolai Kondrashov * Copyright (c) 2023 David Yang */ #include <asm-generic/unaligned.h> #include <linux/device.h> #include <linux/hid.h> #include <linux/module.h> #include "hid-ids.h" /* Data gathered from Database/VID0458_PID????/Vista/TBoard/default.xml in ioTablet driver * * TODO: * - Add battery and sleep support for EasyPen M406W and MousePen M508WX * - Investigate ScrollZ.MiceFMT buttons of EasyPen M406 */ static const __u8 easypen_m406_control_rdesc[] = { 0x05, 0x0C, /* Usage Page (Consumer), */ 0x09, 0x01, /* Usage (Consumer Control), */ 0xA1, 0x01, /* Collection (Application), */ 0x85, 0x12, /* Report ID (18), */ 0x0A, 0x45, 0x02, /* Usage (AC Rotate), */ 0x09, 0x40, /* Usage (Menu), */ 0x0A, 0x2F, 0x02, /* Usage (AC Zoom), */ 0x0A, 0x46, 0x02, /* Usage (AC Resize), */ 0x0A, 0x1A, 0x02, /* Usage (AC Undo), */ 0x0A, 0x6A, 0x02, /* Usage (AC Delete), */ 0x0A, 0x24, 0x02, /* Usage (AC Back), */ 0x0A, 0x25, 0x02, /* Usage (AC Forward), */ 0x14, /* Logical Minimum (0), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x75, 0x01, /* Report Size (1), */ 0x95, 0x08, /* Report Count (8), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x30, /* Report Count (48), */ 0x81, 0x01, /* Input (Constant), */ 0xC0 /* End Collection */ }; static const __u8 easypen_m506_control_rdesc[] = { 0x05, 0x0C, /* Usage Page (Consumer), */ 0x09, 0x01, /* Usage (Consumer Control), */ 0xA1, 0x01, /* Collection (Application), */ 0x85, 0x12, /* Report ID (18), */ 0x0A, 0x6A, 0x02, /* Usage (AC Delete), */ 0x0A, 0x1A, 0x02, /* Usage (AC Undo), */ 0x0A, 0x2D, 0x02, /* Usage (AC Zoom In), */ 0x0A, 0x2E, 0x02, /* Usage (AC Zoom Out), */ 0x14, /* Logical Minimum (0), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x75, 0x01, /* Report Size (1), */ 0x95, 0x04, /* Report Count (4), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x34, /* Report Count (52), */ 0x81, 0x01, /* Input (Constant), */ 0xC0 /* End Collection */ }; static const __u8 easypen_m406w_control_rdesc[] = { 0x05, 0x0C, /* Usage Page (Consumer), */ 0x09, 0x01, /* Usage (Consumer Control), */ 0xA1, 0x01, /* Collection (Application), */ 0x85, 0x12, /* Report ID (18), */ 0x0A, 0x6A, 0x02, /* Usage (AC Delete), */ 0x0A, 0x1A, 0x02, /* Usage (AC Undo), */ 0x0A, 0x01, 0x02, /* Usage (AC New), */ 0x09, 0x40, /* Usage (Menu), */ 0x14, /* Logical Minimum (0), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x75, 0x01, /* Report Size (1), */ 0x95, 0x04, /* Report Count (4), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x34, /* Report Count (52), */ 0x81, 0x01, /* Input (Constant), */ 0xC0 /* End Collection */ }; static const __u8 easypen_m610x_control_rdesc[] = { 0x05, 0x0C, /* Usage Page (Consumer), */ 0x09, 0x01, /* Usage (Consumer Control), */ 0xA1, 0x01, /* Collection (Application), */ 0x85, 0x12, /* Report ID (18), */ 0x0A, 0x1A, 0x02, /* Usage (AC Undo), */ 0x0A, 0x79, 0x02, /* Usage (AC Redo Or Repeat), */ 0x0A, 0x2D, 0x02, /* Usage (AC Zoom In), */ 0x0A, 0x2E, 0x02, /* Usage (AC Zoom Out), */ 0x14, /* Logical Minimum (0), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x75, 0x01, /* Report Size (1), */ 0x95, 0x04, /* Report Count (4), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x34, /* Report Count (52), */ 0x81, 0x01, /* Input (Constant), */ 0xC0 /* End Collection */ }; static const __u8 pensketch_m912_control_rdesc[] = { 0x05, 0x0C, /* Usage Page (Consumer), */ 0x09, 0x01, /* Usage (Consumer Control), */ 0xA1, 0x01, /* Collection (Application), */ 0x85, 0x12, /* Report ID (18), */ 0x14, /* Logical Minimum (0), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x75, 0x01, /* Report Size (1), */ 0x95, 0x08, /* Report Count (8), */ 0x05, 0x0C, /* Usage Page (Consumer), */ 0x0A, 0x6A, 0x02, /* Usage (AC Delete), */ 0x0A, 0x1A, 0x02, /* Usage (AC Undo), */ 0x0A, 0x01, 0x02, /* Usage (AC New), */ 0x0A, 0x2F, 0x02, /* Usage (AC Zoom), */ 0x0A, 0x25, 0x02, /* Usage (AC Forward), */ 0x0A, 0x24, 0x02, /* Usage (AC Back), */ 0x0A, 0x2D, 0x02, /* Usage (AC Zoom In), */ 0x0A, 0x2E, 0x02, /* Usage (AC Zoom Out), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x30, /* Report Count (48), */ 0x81, 0x03, /* Input (Constant, Variable), */ 0xC0 /* End Collection */ }; static const __u8 mousepen_m508wx_control_rdesc[] = { 0x05, 0x0C, /* Usage Page (Consumer), */ 0x09, 0x01, /* Usage (Consumer Control), */ 0xA1, 0x01, /* Collection (Application), */ 0x85, 0x12, /* Report ID (18), */ 0x0A, 0x1A, 0x02, /* Usage (AC Undo), */ 0x0A, 0x6A, 0x02, /* Usage (AC Delete), */ 0x0A, 0x2D, 0x02, /* Usage (AC Zoom In), */ 0x0A, 0x2E, 0x02, /* Usage (AC Zoom Out), */ 0x14, /* Logical Minimum (0), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x75, 0x01, /* Report Size (1), */ 0x95, 0x04, /* Report Count (4), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x34, /* Report Count (52), */ 0x81, 0x01, /* Input (Constant), */ 0xC0 /* End Collection */ }; static const __u8 mousepen_m508x_control_rdesc[] = { 0x05, 0x0C, /* Usage Page (Consumer), */ 0x09, 0x01, /* Usage (Consumer Control), */ 0xA1, 0x01, /* Collection (Application), */ 0x85, 0x12, /* Report ID (18), */ 0x0A, 0x01, 0x02, /* Usage (AC New), */ 0x09, 0x40, /* Usage (Menu), */ 0x0A, 0x6A, 0x02, /* Usage (AC Delete), */ 0x0A, 0x1A, 0x02, /* Usage (AC Undo), */ 0x14, /* Logical Minimum (0), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x75, 0x01, /* Report Size (1), */ 0x95, 0x04, /* Report Count (4), */ 0x81, 0x02, /* Input (Variable), */ 0x81, 0x01, /* Input (Constant), */ 0x15, 0xFF, /* Logical Minimum (-1), */ 0x95, 0x10, /* Report Count (16), */ 0x81, 0x01, /* Input (Constant), */ 0x0A, 0x35, 0x02, /* Usage (AC Scroll), */ 0x0A, 0x2F, 0x02, /* Usage (AC Zoom), */ 0x0A, 0x38, 0x02, /* Usage (AC Pan), */ 0x75, 0x08, /* Report Size (8), */ 0x95, 0x03, /* Report Count (3), */ 0x81, 0x06, /* Input (Variable, Relative), */ 0x95, 0x01, /* Report Count (1), */ 0x81, 0x01, /* Input (Constant), */ 0xC0 /* End Collection */ }; static const __u8 easypen_m406xe_control_rdesc[] = { 0x05, 0x0C, /* Usage Page (Consumer), */ 0x09, 0x01, /* Usage (Consumer Control), */ 0xA1, 0x01, /* Collection (Application), */ 0x85, 0x12, /* Report ID (18), */ 0x14, /* Logical Minimum (0), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x75, 0x01, /* Report Size (1), */ 0x95, 0x04, /* Report Count (4), */ 0x0A, 0x79, 0x02, /* Usage (AC Redo Or Repeat), */ 0x0A, 0x1A, 0x02, /* Usage (AC Undo), */ 0x0A, 0x2D, 0x02, /* Usage (AC Zoom In), */ 0x0A, 0x2E, 0x02, /* Usage (AC Zoom Out), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x34, /* Report Count (52), */ 0x81, 0x03, /* Input (Constant, Variable), */ 0xC0 /* End Collection */ }; static const __u8 pensketch_t609a_control_rdesc[] = { 0x05, 0x0C, /* Usage Page (Consumer), */ 0x09, 0x01, /* Usage (Consumer Control), */ 0xA1, 0x01, /* Collection (Application), */ 0x85, 0x12, /* Report ID (18), */ 0x0A, 0x6A, 0x02, /* Usage (AC Delete), */ 0x14, /* Logical Minimum (0), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x75, 0x01, /* Report Size (1), */ 0x95, 0x08, /* Report Count (8), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x37, /* Report Count (55), */ 0x81, 0x01, /* Input (Constant), */ 0xC0 /* End Collection */ }; /* Fix indexes in kye_tablet_fixup if you change this */ static const __u8 kye_tablet_rdesc[] = { 0x06, 0x00, 0xFF, /* Usage Page (FF00h), */ 0x09, 0x01, /* Usage (01h), */ 0xA1, 0x01, /* Collection (Application), */ 0x85, 0x05, /* Report ID (5), */ 0x09, 0x01, /* Usage (01h), */ 0x15, 0x81, /* Logical Minimum (-127), */ 0x25, 0x7F, /* Logical Maximum (127), */ 0x75, 0x08, /* Report Size (8), */ 0x95, 0x07, /* Report Count (7), */ 0xB1, 0x02, /* Feature (Variable), */ 0xC0, /* End Collection, */ 0x05, 0x0D, /* Usage Page (Digitizer), */ 0x09, 0x01, /* Usage (Digitizer), */ 0xA1, 0x01, /* Collection (Application), */ 0x85, 0x10, /* Report ID (16), */ 0x09, 0x20, /* Usage (Stylus), */ 0xA0, /* Collection (Physical), */ 0x09, 0x42, /* Usage (Tip Switch), */ 0x09, 0x44, /* Usage (Barrel Switch), */ 0x09, 0x46, /* Usage (Tablet Pick), */ 0x14, /* Logical Minimum (0), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x75, 0x01, /* Report Size (1), */ 0x95, 0x03, /* Report Count (3), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x04, /* Report Count (4), */ 0x81, 0x01, /* Input (Constant), */ 0x09, 0x32, /* Usage (In Range), */ 0x95, 0x01, /* Report Count (1), */ 0x81, 0x02, /* Input (Variable), */ 0x75, 0x10, /* Report Size (16), */ 0xA4, /* Push, */ 0x05, 0x01, /* Usage Page (Desktop), */ 0x09, 0x30, /* Usage (X), */ 0x27, 0xFF, 0x7F, 0x00, 0x00, /* Logical Maximum (32767), */ 0x34, /* Physical Minimum (0), */ 0x47, 0x00, 0x00, 0x00, 0x00, /* Physical Maximum (0), */ 0x65, 0x11, /* Unit (Centimeter), */ 0x55, 0x00, /* Unit Exponent (0), */ 0x75, 0x10, /* Report Size (16), */ 0x81, 0x02, /* Input (Variable), */ 0x09, 0x31, /* Usage (Y), */ 0x27, 0xFF, 0x7F, 0x00, 0x00, /* Logical Maximum (32767), */ 0x47, 0x00, 0x00, 0x00, 0x00, /* Physical Maximum (0), */ 0x81, 0x02, /* Input (Variable), */ 0xB4, /* Pop, */ 0x05, 0x0D, /* Usage Page (Digitizer), */ 0x09, 0x30, /* Usage (Tip Pressure), */ 0x27, 0xFF, 0x07, 0x00, 0x00, /* Logical Maximum (2047), */ 0x81, 0x02, /* Input (Variable), */ 0xC0, /* End Collection, */ 0xC0, /* End Collection, */ 0x05, 0x0D, /* Usage Page (Digitizer), */ 0x09, 0x21, /* Usage (Puck), */ 0xA1, 0x01, /* Collection (Application), */ 0x85, 0x11, /* Report ID (17), */ 0x09, 0x21, /* Usage (Puck), */ 0xA0, /* Collection (Physical), */ 0x05, 0x09, /* Usage Page (Button), */ 0x19, 0x01, /* Usage Minimum (01h), */ 0x29, 0x03, /* Usage Maximum (03h), */ 0x14, /* Logical Minimum (0), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x75, 0x01, /* Report Size (1), */ 0x95, 0x03, /* Report Count (3), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x04, /* Report Count (4), */ 0x81, 0x01, /* Input (Constant), */ 0x05, 0x0D, /* Usage Page (Digitizer), */ 0x09, 0x32, /* Usage (In Range), */ 0x95, 0x01, /* Report Count (1), */ 0x81, 0x02, /* Input (Variable), */ 0x05, 0x01, /* Usage Page (Desktop), */ 0xA4, /* Push, */ 0x09, 0x30, /* Usage (X), */ 0x27, 0xFF, 0x7F, 0x00, 0x00, /* Logical Maximum (32767), */ 0x34, /* Physical Minimum (0), */ 0x47, 0x00, 0x00, 0x00, 0x00, /* Physical Maximum (0), */ 0x65, 0x11, /* Unit (Centimeter), */ 0x55, 0x00, /* Unit Exponent (0), */ 0x75, 0x10, /* Report Size (16), */ 0x81, 0x02, /* Input (Variable), */ 0x09, 0x31, /* Usage (Y), */ 0x27, 0xFF, 0x7F, 0x00, 0x00, /* Logical Maximum (32767), */ 0x47, 0x00, 0x00, 0x00, 0x00, /* Physical Maximum (0), */ 0x81, 0x02, /* Input (Variable), */ 0xB4, /* Pop, */ 0x09, 0x38, /* Usage (Wheel), */ 0x15, 0xFF, /* Logical Minimum (-1), */ 0x75, 0x08, /* Report Size (8), */ 0x95, 0x01, /* Report Count (1), */ 0x81, 0x06, /* Input (Variable, Relative), */ 0x81, 0x01, /* Input (Constant), */ 0xC0, /* End Collection, */ 0xC0 /* End Collection */ }; static const struct kye_tablet_info { __u32 product; __s32 x_logical_maximum; __s32 y_logical_maximum; __s32 pressure_logical_maximum; __s32 x_physical_maximum; __s32 y_physical_maximum; __s8 unit_exponent; __s8 unit; bool has_punk; unsigned int control_rsize; const __u8 *control_rdesc; } kye_tablets_info[] = { {USB_DEVICE_ID_KYE_EASYPEN_M406, /* 0x5005 */ 15360, 10240, 1023, 6, 4, 0, 0x13, false, sizeof(easypen_m406_control_rdesc), easypen_m406_control_rdesc}, {USB_DEVICE_ID_KYE_EASYPEN_M506, /* 0x500F */ 24576, 20480, 1023, 6, 5, 0, 0x13, false, sizeof(easypen_m506_control_rdesc), easypen_m506_control_rdesc}, {USB_DEVICE_ID_KYE_EASYPEN_I405X, /* 0x5010 */ 14080, 10240, 1023, 55, 40, -1, 0x13, false}, {USB_DEVICE_ID_KYE_MOUSEPEN_I608X, /* 0x5011 */ 20480, 15360, 2047, 8, 6, 0, 0x13, true}, {USB_DEVICE_ID_KYE_EASYPEN_M406W, /* 0x5012 */ 15360, 10240, 1023, 6, 4, 0, 0x13, false, sizeof(easypen_m406w_control_rdesc), easypen_m406w_control_rdesc}, {USB_DEVICE_ID_KYE_EASYPEN_M610X, /* 0x5013 */ 40960, 25600, 1023, 1000, 625, -2, 0x13, false, sizeof(easypen_m610x_control_rdesc), easypen_m610x_control_rdesc}, {USB_DEVICE_ID_KYE_EASYPEN_340, /* 0x5014 */ 10240, 7680, 1023, 4, 3, 0, 0x13, false}, {USB_DEVICE_ID_KYE_PENSKETCH_M912, /* 0x5015 */ 61440, 46080, 2047, 12, 9, 0, 0x13, true, sizeof(pensketch_m912_control_rdesc), pensketch_m912_control_rdesc}, {USB_DEVICE_ID_KYE_MOUSEPEN_M508WX, /* 0x5016 */ 40960, 25600, 2047, 8, 5, 0, 0x13, true, sizeof(mousepen_m508wx_control_rdesc), mousepen_m508wx_control_rdesc}, {USB_DEVICE_ID_KYE_MOUSEPEN_M508X, /* 0x5017 */ 40960, 25600, 2047, 8, 5, 0, 0x13, true, sizeof(mousepen_m508x_control_rdesc), mousepen_m508x_control_rdesc}, {USB_DEVICE_ID_KYE_EASYPEN_M406XE, /* 0x5019 */ 15360, 10240, 1023, 6, 4, 0, 0x13, false, sizeof(easypen_m406xe_control_rdesc), easypen_m406xe_control_rdesc}, {USB_DEVICE_ID_KYE_MOUSEPEN_I608X_V2, /* 0x501A */ 40960, 30720, 2047, 8, 6, 0, 0x13, true}, {USB_DEVICE_ID_KYE_PENSKETCH_T609A, /* 0x501B */ 43520, 28160, 1023, 85, 55, -1, 0x13, false, sizeof(pensketch_t609a_control_rdesc), pensketch_t609a_control_rdesc}, {} }; static __u8 *kye_consumer_control_fixup(struct hid_device *hdev, __u8 *rdesc, unsigned int *rsize, int offset, const char *device_name) { /* * the fixup that need to be done: * - change Usage Maximum in the Consumer Control * (report ID 3) to a reasonable value */ if (*rsize >= offset + 31 && /* Usage Page (Consumer Devices) */ rdesc[offset] == 0x05 && rdesc[offset + 1] == 0x0c && /* Usage (Consumer Control) */ rdesc[offset + 2] == 0x09 && rdesc[offset + 3] == 0x01 && /* Usage Maximum > 12287 */ rdesc[offset + 10] == 0x2a && rdesc[offset + 12] > 0x2f) { hid_info(hdev, "fixing up %s report descriptor\n", device_name); rdesc[offset + 12] = 0x2f; } return rdesc; } /* * Fix tablet descriptor of so-called "DataFormat 2". * * Though we may achieve a usable descriptor from original vendor-defined one, * some problems exist: * - Their Logical Maximum never exceed 32767 (7F FF), though device do report * values greater than that; * - Physical Maximums are arbitrarily filled (always equal to Logical * Maximum); * - Detail for control buttons are not provided (a vendor-defined Usage Page * with fixed content). * * Thus we use a pre-defined parameter table rather than digging it from * original descriptor. * * We may as well write a fallback routine for unrecognized kye tablet, but it's * clear kye are unlikely to produce new models in the foreseeable future, so we * simply enumerate all possible models. */ static __u8 *kye_tablet_fixup(struct hid_device *hdev, __u8 *rdesc, unsigned int *rsize) { const struct kye_tablet_info *info; unsigned int newsize; if (*rsize < sizeof(kye_tablet_rdesc)) { hid_warn(hdev, "tablet report size too small, or kye_tablet_rdesc unexpectedly large\n"); return rdesc; } for (info = kye_tablets_info; info->product; info++) { if (hdev->product == info->product) break; } if (!info->product) { hid_err(hdev, "tablet unknown, someone forget to add kye_tablet_info entry?\n"); return rdesc; } newsize = info->has_punk ? sizeof(kye_tablet_rdesc) : 112; memcpy(rdesc, kye_tablet_rdesc, newsize); put_unaligned_le32(info->x_logical_maximum, rdesc + 66); put_unaligned_le32(info->x_physical_maximum, rdesc + 72); rdesc[77] = info->unit; rdesc[79] = info->unit_exponent; put_unaligned_le32(info->y_logical_maximum, rdesc + 87); put_unaligned_le32(info->y_physical_maximum, rdesc + 92); put_unaligned_le32(info->pressure_logical_maximum, rdesc + 104); if (info->has_punk) { put_unaligned_le32(info->x_logical_maximum, rdesc + 156); put_unaligned_le32(info->x_physical_maximum, rdesc + 162); rdesc[167] = info->unit; rdesc[169] = info->unit_exponent; put_unaligned_le32(info->y_logical_maximum, rdesc + 177); put_unaligned_le32(info->y_physical_maximum, rdesc + 182); } if (info->control_rsize) { if (newsize + info->control_rsize > *rsize) hid_err(hdev, "control rdesc unexpectedly large"); else { memcpy(rdesc + newsize, info->control_rdesc, info->control_rsize); newsize += info->control_rsize; } } *rsize = newsize; return rdesc; } static __u8 *kye_report_fixup(struct hid_device *hdev, __u8 *rdesc, unsigned int *rsize) { switch (hdev->product) { case USB_DEVICE_ID_KYE_ERGO_525V: /* the fixups that need to be done: * - change led usage page to button for extra buttons * - report size 8 count 1 must be size 1 count 8 for button * bitfield * - change the button usage range to 4-7 for the extra * buttons */ if (*rsize >= 75 && rdesc[61] == 0x05 && rdesc[62] == 0x08 && rdesc[63] == 0x19 && rdesc[64] == 0x08 && rdesc[65] == 0x29 && rdesc[66] == 0x0f && rdesc[71] == 0x75 && rdesc[72] == 0x08 && rdesc[73] == 0x95 && rdesc[74] == 0x01) { hid_info(hdev, "fixing up Kye/Genius Ergo Mouse " "report descriptor\n"); rdesc[62] = 0x09; rdesc[64] = 0x04; rdesc[66] = 0x07; rdesc[72] = 0x01; rdesc[74] = 0x08; } break; case USB_DEVICE_ID_GENIUS_GILA_GAMING_MOUSE: rdesc = kye_consumer_control_fixup(hdev, rdesc, rsize, 104, "Genius Gila Gaming Mouse"); break; case USB_DEVICE_ID_GENIUS_MANTICORE: rdesc = kye_consumer_control_fixup(hdev, rdesc, rsize, 104, "Genius Manticore Keyboard"); break; case USB_DEVICE_ID_GENIUS_GX_IMPERATOR: rdesc = kye_consumer_control_fixup(hdev, rdesc, rsize, 83, "Genius Gx Imperator Keyboard"); break; case USB_DEVICE_ID_KYE_EASYPEN_M406: case USB_DEVICE_ID_KYE_EASYPEN_M506: case USB_DEVICE_ID_KYE_EASYPEN_I405X: case USB_DEVICE_ID_KYE_MOUSEPEN_I608X: case USB_DEVICE_ID_KYE_EASYPEN_M406W: case USB_DEVICE_ID_KYE_EASYPEN_M610X: case USB_DEVICE_ID_KYE_EASYPEN_340: case USB_DEVICE_ID_KYE_PENSKETCH_M912: case USB_DEVICE_ID_KYE_MOUSEPEN_M508WX: case USB_DEVICE_ID_KYE_MOUSEPEN_M508X: case USB_DEVICE_ID_KYE_EASYPEN_M406XE: case USB_DEVICE_ID_KYE_MOUSEPEN_I608X_V2: case USB_DEVICE_ID_KYE_PENSKETCH_T609A: rdesc = kye_tablet_fixup(hdev, rdesc, rsize); break; } return rdesc; } static int kye_tablet_enable(struct hid_device *hdev) { struct list_head *list; struct list_head *head; struct hid_report *report; __s32 *value; list = &hdev->report_enum[HID_FEATURE_REPORT].report_list; list_for_each(head, list) { report = list_entry(head, struct hid_report, list); if (report->id == 5) break; } if (head == list) { hid_err(hdev, "tablet-enabling feature report not found\n"); return -ENODEV; } if (report->maxfield < 1 || report->field[0]->report_count < 7) { hid_err(hdev, "invalid tablet-enabling feature report\n"); return -ENODEV; } value = report->field[0]->value; /* * The code is for DataFormat 2 of config xml. They have no obvious * meaning (at least not configurable in Windows driver) except enabling * fully-functional tablet mode (absolute positioning). Otherwise, the * tablet acts like a relative mouse. * * Though there're magic codes for DataFormat 3 and 4, no devices use * these DataFormats. */ value[0] = 0x12; value[1] = 0x10; value[2] = 0x11; value[3] = 0x12; value[4] = 0x00; value[5] = 0x00; value[6] = 0x00; hid_hw_request(hdev, report, HID_REQ_SET_REPORT); return 0; } static int kye_probe(struct hid_device *hdev, const struct hid_device_id *id) { int ret; ret = hid_parse(hdev); if (ret) { hid_err(hdev, "parse failed\n"); goto err; } ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT); if (ret) { hid_err(hdev, "hw start failed\n"); goto err; } switch (id->product) { case USB_DEVICE_ID_GENIUS_MANTICORE: /* * The manticore keyboard needs to have all the interfaces * opened at least once to be fully functional. */ if (hid_hw_open(hdev)) hid_hw_close(hdev); break; case USB_DEVICE_ID_KYE_EASYPEN_M406: case USB_DEVICE_ID_KYE_EASYPEN_M506: case USB_DEVICE_ID_KYE_EASYPEN_I405X: case USB_DEVICE_ID_KYE_MOUSEPEN_I608X: case USB_DEVICE_ID_KYE_EASYPEN_M406W: case USB_DEVICE_ID_KYE_EASYPEN_M610X: case USB_DEVICE_ID_KYE_EASYPEN_340: case USB_DEVICE_ID_KYE_PENSKETCH_M912: case USB_DEVICE_ID_KYE_MOUSEPEN_M508WX: case USB_DEVICE_ID_KYE_MOUSEPEN_M508X: case USB_DEVICE_ID_KYE_EASYPEN_M406XE: case USB_DEVICE_ID_KYE_MOUSEPEN_I608X_V2: case USB_DEVICE_ID_KYE_PENSKETCH_T609A: ret = kye_tablet_enable(hdev); if (ret) { hid_err(hdev, "tablet enabling failed\n"); goto enabling_err; } break; } return 0; enabling_err: hid_hw_stop(hdev); err: return ret; } static const struct hid_device_id kye_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_KYE_ERGO_525V) }, { HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_GENIUS_GILA_GAMING_MOUSE) }, { HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_GENIUS_MANTICORE) }, { HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_GENIUS_GX_IMPERATOR) }, { HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_KYE_EASYPEN_M406) }, { HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_KYE_EASYPEN_M506) }, { HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_KYE_EASYPEN_I405X) }, { HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_KYE_MOUSEPEN_I608X) }, { HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_KYE_EASYPEN_M406W) }, { HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_KYE_EASYPEN_M610X) }, { HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_KYE_EASYPEN_340) }, { HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_KYE_PENSKETCH_M912) }, { HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_KYE_MOUSEPEN_M508WX) }, { HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_KYE_MOUSEPEN_M508X) }, { HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_KYE_EASYPEN_M406XE) }, { HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_KYE_MOUSEPEN_I608X_V2) }, { HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_KYE_PENSKETCH_T609A) }, { } }; MODULE_DEVICE_TABLE(hid, kye_devices); static struct hid_driver kye_driver = { .name = "kye", .id_table = kye_devices, .probe = kye_probe, .report_fixup = kye_report_fixup, }; module_hid_driver(kye_driver); MODULE_LICENSE("GPL"); |
| 17 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 | #include <linux/notifier.h> #include <linux/socket.h> #include <linux/kernel.h> #include <linux/export.h> #include <net/net_namespace.h> #include <net/fib_notifier.h> #include <net/netns/ipv6.h> #include <net/ip6_fib.h> int call_fib6_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info) { info->family = AF_INET6; return call_fib_notifier(nb, event_type, info); } int call_fib6_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info) { info->family = AF_INET6; return call_fib_notifiers(net, event_type, info); } static unsigned int fib6_seq_read(struct net *net) { return fib6_tables_seq_read(net) + fib6_rules_seq_read(net); } static int fib6_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { int err; err = fib6_rules_dump(net, nb, extack); if (err) return err; return fib6_tables_dump(net, nb, extack); } static const struct fib_notifier_ops fib6_notifier_ops_template = { .family = AF_INET6, .fib_seq_read = fib6_seq_read, .fib_dump = fib6_dump, .owner = THIS_MODULE, }; int __net_init fib6_notifier_init(struct net *net) { struct fib_notifier_ops *ops; ops = fib_notifier_ops_register(&fib6_notifier_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); net->ipv6.notifier_ops = ops; return 0; } void __net_exit fib6_notifier_exit(struct net *net) { fib_notifier_ops_unregister(net->ipv6.notifier_ops); } |
| 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Force feedback support for EMS Trio Linker Plus II * * Copyright (c) 2010 Ignaz Forster <ignaz.forster@gmx.de> */ /* */ #include <linux/hid.h> #include <linux/input.h> #include <linux/module.h> #include "hid-ids.h" struct emsff_device { struct hid_report *report; }; static int emsff_play(struct input_dev *dev, void *data, struct ff_effect *effect) { struct hid_device *hid = input_get_drvdata(dev); struct emsff_device *emsff = data; int weak, strong; weak = effect->u.rumble.weak_magnitude; strong = effect->u.rumble.strong_magnitude; dbg_hid("called with 0x%04x 0x%04x\n", strong, weak); weak = weak * 0xff / 0xffff; strong = strong * 0xff / 0xffff; emsff->report->field[0]->value[1] = weak; emsff->report->field[0]->value[2] = strong; dbg_hid("running with 0x%02x 0x%02x\n", strong, weak); hid_hw_request(hid, emsff->report, HID_REQ_SET_REPORT); return 0; } static int emsff_init(struct hid_device *hid) { struct emsff_device *emsff; struct hid_report *report; struct hid_input *hidinput; struct list_head *report_list = &hid->report_enum[HID_OUTPUT_REPORT].report_list; struct input_dev *dev; int error; if (list_empty(&hid->inputs)) { hid_err(hid, "no inputs found\n"); return -ENODEV; } hidinput = list_first_entry(&hid->inputs, struct hid_input, list); dev = hidinput->input; if (list_empty(report_list)) { hid_err(hid, "no output reports found\n"); return -ENODEV; } report = list_first_entry(report_list, struct hid_report, list); if (report->maxfield < 1) { hid_err(hid, "no fields in the report\n"); return -ENODEV; } if (report->field[0]->report_count < 7) { hid_err(hid, "not enough values in the field\n"); return -ENODEV; } emsff = kzalloc(sizeof(struct emsff_device), GFP_KERNEL); if (!emsff) return -ENOMEM; set_bit(FF_RUMBLE, dev->ffbit); error = input_ff_create_memless(dev, emsff, emsff_play); if (error) { kfree(emsff); return error; } emsff->report = report; emsff->report->field[0]->value[0] = 0x01; emsff->report->field[0]->value[1] = 0x00; emsff->report->field[0]->value[2] = 0x00; emsff->report->field[0]->value[3] = 0x00; emsff->report->field[0]->value[4] = 0x00; emsff->report->field[0]->value[5] = 0x00; emsff->report->field[0]->value[6] = 0x00; hid_hw_request(hid, emsff->report, HID_REQ_SET_REPORT); hid_info(hid, "force feedback for EMS based devices by Ignaz Forster <ignaz.forster@gmx.de>\n"); return 0; } static int ems_probe(struct hid_device *hdev, const struct hid_device_id *id) { int ret; ret = hid_parse(hdev); if (ret) { hid_err(hdev, "parse failed\n"); goto err; } ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT & ~HID_CONNECT_FF); if (ret) { hid_err(hdev, "hw start failed\n"); goto err; } ret = emsff_init(hdev); if (ret) { dev_err(&hdev->dev, "force feedback init failed\n"); hid_hw_stop(hdev); goto err; } return 0; err: return ret; } static const struct hid_device_id ems_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_EMS, USB_DEVICE_ID_EMS_TRIO_LINKER_PLUS_II) }, { } }; MODULE_DEVICE_TABLE(hid, ems_devices); static struct hid_driver ems_driver = { .name = "hkems", .id_table = ems_devices, .probe = ems_probe, }; module_hid_driver(ems_driver); MODULE_LICENSE("GPL"); |
| 12 12 12 12 45 7 29 29 29 3 29 2 29 10 7 7 7 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 | // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Support for INET connection oriented protocols. * * Authors: See the TCP sources */ #include <linux/module.h> #include <linux/jhash.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> #include <net/inet_timewait_sock.h> #include <net/ip.h> #include <net/route.h> #include <net/tcp_states.h> #include <net/xfrm.h> #include <net/tcp.h> #include <net/sock_reuseport.h> #include <net/addrconf.h> #if IS_ENABLED(CONFIG_IPV6) /* match_sk*_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses * if IPv6 only, and any IPv4 addresses * if not IPv6 only * match_sk*_wildcard == false: addresses must be exactly the same, i.e. * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY, * and 0.0.0.0 equals to 0.0.0.0 only */ static bool ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6, const struct in6_addr *sk2_rcv_saddr6, __be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, bool sk1_ipv6only, bool sk2_ipv6only, bool match_sk1_wildcard, bool match_sk2_wildcard) { int addr_type = ipv6_addr_type(sk1_rcv_saddr6); int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; /* if both are mapped, treat as IPv4 */ if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) { if (!sk2_ipv6only) { if (sk1_rcv_saddr == sk2_rcv_saddr) return true; return (match_sk1_wildcard && !sk1_rcv_saddr) || (match_sk2_wildcard && !sk2_rcv_saddr); } return false; } if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY) return true; if (addr_type2 == IPV6_ADDR_ANY && match_sk2_wildcard && !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) return true; if (addr_type == IPV6_ADDR_ANY && match_sk1_wildcard && !(sk1_ipv6only && addr_type2 == IPV6_ADDR_MAPPED)) return true; if (sk2_rcv_saddr6 && ipv6_addr_equal(sk1_rcv_saddr6, sk2_rcv_saddr6)) return true; return false; } #endif /* match_sk*_wildcard == true: 0.0.0.0 equals to any IPv4 addresses * match_sk*_wildcard == false: addresses must be exactly the same, i.e. * 0.0.0.0 only equals to 0.0.0.0 */ static bool ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, bool sk2_ipv6only, bool match_sk1_wildcard, bool match_sk2_wildcard) { if (!sk2_ipv6only) { if (sk1_rcv_saddr == sk2_rcv_saddr) return true; return (match_sk1_wildcard && !sk1_rcv_saddr) || (match_sk2_wildcard && !sk2_rcv_saddr); } return false; } bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, bool match_wildcard) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) return ipv6_rcv_saddr_equal(&sk->sk_v6_rcv_saddr, inet6_rcv_saddr(sk2), sk->sk_rcv_saddr, sk2->sk_rcv_saddr, ipv6_only_sock(sk), ipv6_only_sock(sk2), match_wildcard, match_wildcard); #endif return ipv4_rcv_saddr_equal(sk->sk_rcv_saddr, sk2->sk_rcv_saddr, ipv6_only_sock(sk2), match_wildcard, match_wildcard); } EXPORT_SYMBOL(inet_rcv_saddr_equal); bool inet_rcv_saddr_any(const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) return ipv6_addr_any(&sk->sk_v6_rcv_saddr); #endif return !sk->sk_rcv_saddr; } void inet_get_local_port_range(const struct net *net, int *low, int *high) { unsigned int seq; do { seq = read_seqbegin(&net->ipv4.ip_local_ports.lock); *low = net->ipv4.ip_local_ports.range[0]; *high = net->ipv4.ip_local_ports.range[1]; } while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq)); } EXPORT_SYMBOL(inet_get_local_port_range); void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high) { const struct inet_sock *inet = inet_sk(sk); const struct net *net = sock_net(sk); int lo, hi, sk_lo, sk_hi; inet_get_local_port_range(net, &lo, &hi); sk_lo = inet->local_port_range.lo; sk_hi = inet->local_port_range.hi; if (unlikely(lo <= sk_lo && sk_lo <= hi)) lo = sk_lo; if (unlikely(lo <= sk_hi && sk_hi <= hi)) hi = sk_hi; *low = lo; *high = hi; } EXPORT_SYMBOL(inet_sk_get_local_port_range); static bool inet_use_bhash2_on_bind(const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) { int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); return addr_type != IPV6_ADDR_ANY && addr_type != IPV6_ADDR_MAPPED; } #endif return sk->sk_rcv_saddr != htonl(INADDR_ANY); } static bool inet_bind_conflict(const struct sock *sk, struct sock *sk2, kuid_t sk_uid, bool relax, bool reuseport_cb_ok, bool reuseport_ok) { int bound_dev_if2; if (sk == sk2) return false; bound_dev_if2 = READ_ONCE(sk2->sk_bound_dev_if); if (!sk->sk_bound_dev_if || !bound_dev_if2 || sk->sk_bound_dev_if == bound_dev_if2) { if (sk->sk_reuse && sk2->sk_reuse && sk2->sk_state != TCP_LISTEN) { if (!relax || (!reuseport_ok && sk->sk_reuseport && sk2->sk_reuseport && reuseport_cb_ok && (sk2->sk_state == TCP_TIME_WAIT || uid_eq(sk_uid, sock_i_uid(sk2))))) return true; } else if (!reuseport_ok || !sk->sk_reuseport || !sk2->sk_reuseport || !reuseport_cb_ok || (sk2->sk_state != TCP_TIME_WAIT && !uid_eq(sk_uid, sock_i_uid(sk2)))) { return true; } } return false; } static bool __inet_bhash2_conflict(const struct sock *sk, struct sock *sk2, kuid_t sk_uid, bool relax, bool reuseport_cb_ok, bool reuseport_ok) { if (sk->sk_family == AF_INET && ipv6_only_sock(sk2)) return false; return inet_bind_conflict(sk, sk2, sk_uid, relax, reuseport_cb_ok, reuseport_ok); } static bool inet_bhash2_conflict(const struct sock *sk, const struct inet_bind2_bucket *tb2, kuid_t sk_uid, bool relax, bool reuseport_cb_ok, bool reuseport_ok) { struct inet_timewait_sock *tw2; struct sock *sk2; sk_for_each_bound_bhash2(sk2, &tb2->owners) { if (__inet_bhash2_conflict(sk, sk2, sk_uid, relax, reuseport_cb_ok, reuseport_ok)) return true; } twsk_for_each_bound_bhash2(tw2, &tb2->deathrow) { sk2 = (struct sock *)tw2; if (__inet_bhash2_conflict(sk, sk2, sk_uid, relax, reuseport_cb_ok, reuseport_ok)) return true; } return false; } /* This should be called only when the tb and tb2 hashbuckets' locks are held */ static int inet_csk_bind_conflict(const struct sock *sk, const struct inet_bind_bucket *tb, const struct inet_bind2_bucket *tb2, /* may be null */ bool relax, bool reuseport_ok) { bool reuseport_cb_ok; struct sock_reuseport *reuseport_cb; kuid_t uid = sock_i_uid((struct sock *)sk); rcu_read_lock(); reuseport_cb = rcu_dereference(sk->sk_reuseport_cb); /* paired with WRITE_ONCE() in __reuseport_(add|detach)_closed_sock */ reuseport_cb_ok = !reuseport_cb || READ_ONCE(reuseport_cb->num_closed_socks); rcu_read_unlock(); /* * Unlike other sk lookup places we do not check * for sk_net here, since _all_ the socks listed * in tb->owners and tb2->owners list belong * to the same net - the one this bucket belongs to. */ if (!inet_use_bhash2_on_bind(sk)) { struct sock *sk2; sk_for_each_bound(sk2, &tb->owners) if (inet_bind_conflict(sk, sk2, uid, relax, reuseport_cb_ok, reuseport_ok) && inet_rcv_saddr_equal(sk, sk2, true)) return true; return false; } /* Conflicts with an existing IPV6_ADDR_ANY (if ipv6) or INADDR_ANY (if * ipv4) should have been checked already. We need to do these two * checks separately because their spinlocks have to be acquired/released * independently of each other, to prevent possible deadlocks */ return tb2 && inet_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok, reuseport_ok); } /* Determine if there is a bind conflict with an existing IPV6_ADDR_ANY (if ipv6) or * INADDR_ANY (if ipv4) socket. * * Caller must hold bhash hashbucket lock with local bh disabled, to protect * against concurrent binds on the port for addr any */ static bool inet_bhash2_addr_any_conflict(const struct sock *sk, int port, int l3mdev, bool relax, bool reuseport_ok) { kuid_t uid = sock_i_uid((struct sock *)sk); const struct net *net = sock_net(sk); struct sock_reuseport *reuseport_cb; struct inet_bind_hashbucket *head2; struct inet_bind2_bucket *tb2; bool reuseport_cb_ok; rcu_read_lock(); reuseport_cb = rcu_dereference(sk->sk_reuseport_cb); /* paired with WRITE_ONCE() in __reuseport_(add|detach)_closed_sock */ reuseport_cb_ok = !reuseport_cb || READ_ONCE(reuseport_cb->num_closed_socks); rcu_read_unlock(); head2 = inet_bhash2_addr_any_hashbucket(sk, net, port); spin_lock(&head2->lock); inet_bind_bucket_for_each(tb2, &head2->chain) if (inet_bind2_bucket_match_addr_any(tb2, net, port, l3mdev, sk)) break; if (tb2 && inet_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok, reuseport_ok)) { spin_unlock(&head2->lock); return true; } spin_unlock(&head2->lock); return false; } /* * Find an open port number for the socket. Returns with the * inet_bind_hashbucket locks held if successful. */ static struct inet_bind_hashbucket * inet_csk_find_open_port(const struct sock *sk, struct inet_bind_bucket **tb_ret, struct inet_bind2_bucket **tb2_ret, struct inet_bind_hashbucket **head2_ret, int *port_ret) { struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); int i, low, high, attempt_half, port, l3mdev; struct inet_bind_hashbucket *head, *head2; struct net *net = sock_net(sk); struct inet_bind2_bucket *tb2; struct inet_bind_bucket *tb; u32 remaining, offset; bool relax = false; l3mdev = inet_sk_bound_l3mdev(sk); ports_exhausted: attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0; other_half_scan: inet_sk_get_local_port_range(sk, &low, &high); high++; /* [32768, 60999] -> [32768, 61000[ */ if (high - low < 4) attempt_half = 0; if (attempt_half) { int half = low + (((high - low) >> 2) << 1); if (attempt_half == 1) high = half; else low = half; } remaining = high - low; if (likely(remaining > 1)) remaining &= ~1U; offset = get_random_u32_below(remaining); /* __inet_hash_connect() favors ports having @low parity * We do the opposite to not pollute connect() users. */ offset |= 1U; other_parity_scan: port = low + offset; for (i = 0; i < remaining; i += 2, port += 2) { if (unlikely(port >= high)) port -= remaining; if (inet_is_local_reserved_port(net, port)) continue; head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)]; spin_lock_bh(&head->lock); if (inet_use_bhash2_on_bind(sk)) { if (inet_bhash2_addr_any_conflict(sk, port, l3mdev, relax, false)) goto next_port; } head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); spin_lock(&head2->lock); tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); inet_bind_bucket_for_each(tb, &head->chain) if (inet_bind_bucket_match(tb, net, port, l3mdev)) { if (!inet_csk_bind_conflict(sk, tb, tb2, relax, false)) goto success; spin_unlock(&head2->lock); goto next_port; } tb = NULL; goto success; next_port: spin_unlock_bh(&head->lock); cond_resched(); } offset--; if (!(offset & 1)) goto other_parity_scan; if (attempt_half == 1) { /* OK we now try the upper half of the range */ attempt_half = 2; goto other_half_scan; } if (READ_ONCE(net->ipv4.sysctl_ip_autobind_reuse) && !relax) { /* We still have a chance to connect to different destinations */ relax = true; goto ports_exhausted; } return NULL; success: *port_ret = port; *tb_ret = tb; *tb2_ret = tb2; *head2_ret = head2; return head; } static inline int sk_reuseport_match(struct inet_bind_bucket *tb, struct sock *sk) { kuid_t uid = sock_i_uid(sk); if (tb->fastreuseport <= 0) return 0; if (!sk->sk_reuseport) return 0; if (rcu_access_pointer(sk->sk_reuseport_cb)) return 0; if (!uid_eq(tb->fastuid, uid)) return 0; /* We only need to check the rcv_saddr if this tb was once marked * without fastreuseport and then was reset, as we can only know that * the fast_*rcv_saddr doesn't have any conflicts with the socks on the * owners list. */ if (tb->fastreuseport == FASTREUSEPORT_ANY) return 1; #if IS_ENABLED(CONFIG_IPV6) if (tb->fast_sk_family == AF_INET6) return ipv6_rcv_saddr_equal(&tb->fast_v6_rcv_saddr, inet6_rcv_saddr(sk), tb->fast_rcv_saddr, sk->sk_rcv_saddr, tb->fast_ipv6_only, ipv6_only_sock(sk), true, false); #endif return ipv4_rcv_saddr_equal(tb->fast_rcv_saddr, sk->sk_rcv_saddr, ipv6_only_sock(sk), true, false); } void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, struct sock *sk) { kuid_t uid = sock_i_uid(sk); bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; if (hlist_empty(&tb->owners)) { tb->fastreuse = reuse; if (sk->sk_reuseport) { tb->fastreuseport = FASTREUSEPORT_ANY; tb->fastuid = uid; tb->fast_rcv_saddr = sk->sk_rcv_saddr; tb->fast_ipv6_only = ipv6_only_sock(sk); tb->fast_sk_family = sk->sk_family; #if IS_ENABLED(CONFIG_IPV6) tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; #endif } else { tb->fastreuseport = 0; } } else { if (!reuse) tb->fastreuse = 0; if (sk->sk_reuseport) { /* We didn't match or we don't have fastreuseport set on * the tb, but we have sk_reuseport set on this socket * and we know that there are no bind conflicts with * this socket in this tb, so reset our tb's reuseport * settings so that any subsequent sockets that match * our current socket will be put on the fast path. * * If we reset we need to set FASTREUSEPORT_STRICT so we * do extra checking for all subsequent sk_reuseport * socks. */ if (!sk_reuseport_match(tb, sk)) { tb->fastreuseport = FASTREUSEPORT_STRICT; tb->fastuid = uid; tb->fast_rcv_saddr = sk->sk_rcv_saddr; tb->fast_ipv6_only = ipv6_only_sock(sk); tb->fast_sk_family = sk->sk_family; #if IS_ENABLED(CONFIG_IPV6) tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; #endif } } else { tb->fastreuseport = 0; } } } /* Obtain a reference to a local port for the given sock, * if snum is zero it means select any available local port. * We try to allocate an odd port (and leave even ports for connect()) */ int inet_csk_get_port(struct sock *sk, unsigned short snum) { struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; bool found_port = false, check_bind_conflict = true; bool bhash_created = false, bhash2_created = false; int ret = -EADDRINUSE, port = snum, l3mdev; struct inet_bind_hashbucket *head, *head2; struct inet_bind2_bucket *tb2 = NULL; struct inet_bind_bucket *tb = NULL; bool head2_lock_acquired = false; struct net *net = sock_net(sk); l3mdev = inet_sk_bound_l3mdev(sk); if (!port) { head = inet_csk_find_open_port(sk, &tb, &tb2, &head2, &port); if (!head) return ret; head2_lock_acquired = true; if (tb && tb2) goto success; found_port = true; } else { head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)]; spin_lock_bh(&head->lock); inet_bind_bucket_for_each(tb, &head->chain) if (inet_bind_bucket_match(tb, net, port, l3mdev)) break; } if (!tb) { tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, net, head, port, l3mdev); if (!tb) goto fail_unlock; bhash_created = true; } if (!found_port) { if (!hlist_empty(&tb->owners)) { if (sk->sk_reuse == SK_FORCE_REUSE || (tb->fastreuse > 0 && reuse) || sk_reuseport_match(tb, sk)) check_bind_conflict = false; } if (check_bind_conflict && inet_use_bhash2_on_bind(sk)) { if (inet_bhash2_addr_any_conflict(sk, port, l3mdev, true, true)) goto fail_unlock; } head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); spin_lock(&head2->lock); head2_lock_acquired = true; tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); } if (!tb2) { tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net, head2, port, l3mdev, sk); if (!tb2) goto fail_unlock; bhash2_created = true; } if (!found_port && check_bind_conflict) { if (inet_csk_bind_conflict(sk, tb, tb2, true, true)) goto fail_unlock; } success: inet_csk_update_fastreuse(tb, sk); if (!inet_csk(sk)->icsk_bind_hash) inet_bind_hash(sk, tb, tb2, port); WARN_ON(inet_csk(sk)->icsk_bind_hash != tb); WARN_ON(inet_csk(sk)->icsk_bind2_hash != tb2); ret = 0; fail_unlock: if (ret) { if (bhash_created) inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb); if (bhash2_created) inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, tb2); } if (head2_lock_acquired) spin_unlock(&head2->lock); spin_unlock_bh(&head->lock); return ret; } EXPORT_SYMBOL_GPL(inet_csk_get_port); /* * Wait for an incoming connection, avoid race conditions. This must be called * with the socket locked. */ static int inet_csk_wait_for_connect(struct sock *sk, long timeo) { struct inet_connection_sock *icsk = inet_csk(sk); DEFINE_WAIT(wait); int err; /* * True wake-one mechanism for incoming connections: only * one process gets woken up, not the 'whole herd'. * Since we do not 'race & poll' for established sockets * anymore, the common case will execute the loop only once. * * Subtle issue: "add_wait_queue_exclusive()" will be added * after any current non-exclusive waiters, and we know that * it will always _stay_ after any new non-exclusive waiters * because all non-exclusive waiters are added at the * beginning of the wait-queue. As such, it's ok to "drop" * our exclusiveness temporarily when we get woken up without * having to remove and re-insert us on the wait queue. */ for (;;) { prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); release_sock(sk); if (reqsk_queue_empty(&icsk->icsk_accept_queue)) timeo = schedule_timeout(timeo); sched_annotate_sleep(); lock_sock(sk); err = 0; if (!reqsk_queue_empty(&icsk->icsk_accept_queue)) break; err = -EINVAL; if (sk->sk_state != TCP_LISTEN) break; err = sock_intr_errno(timeo); if (signal_pending(current)) break; err = -EAGAIN; if (!timeo) break; } finish_wait(sk_sleep(sk), &wait); return err; } /* * This will accept the next outstanding connection. */ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern) { struct inet_connection_sock *icsk = inet_csk(sk); struct request_sock_queue *queue = &icsk->icsk_accept_queue; struct request_sock *req; struct sock *newsk; int error; lock_sock(sk); /* We need to make sure that this socket is listening, * and that it has something pending. */ error = -EINVAL; if (sk->sk_state != TCP_LISTEN) goto out_err; /* Find already established connection */ if (reqsk_queue_empty(queue)) { long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); /* If this is a non blocking socket don't sleep */ error = -EAGAIN; if (!timeo) goto out_err; error = inet_csk_wait_for_connect(sk, timeo); if (error) goto out_err; } req = reqsk_queue_remove(queue, sk); newsk = req->sk; if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) { spin_lock_bh(&queue->fastopenq.lock); if (tcp_rsk(req)->tfo_listener) { /* We are still waiting for the final ACK from 3WHS * so can't free req now. Instead, we set req->sk to * NULL to signify that the child socket is taken * so reqsk_fastopen_remove() will free the req * when 3WHS finishes (or is aborted). */ req->sk = NULL; req = NULL; } spin_unlock_bh(&queue->fastopenq.lock); } out: release_sock(sk); if (newsk && mem_cgroup_sockets_enabled) { int amt = 0; /* atomically get the memory usage, set and charge the * newsk->sk_memcg. */ lock_sock(newsk); mem_cgroup_sk_alloc(newsk); if (newsk->sk_memcg) { /* The socket has not been accepted yet, no need * to look at newsk->sk_wmem_queued. */ amt = sk_mem_pages(newsk->sk_forward_alloc + atomic_read(&newsk->sk_rmem_alloc)); } if (amt) mem_cgroup_charge_skmem(newsk->sk_memcg, amt, GFP_KERNEL | __GFP_NOFAIL); release_sock(newsk); } if (req) reqsk_put(req); return newsk; out_err: newsk = NULL; req = NULL; *err = error; goto out; } EXPORT_SYMBOL(inet_csk_accept); /* * Using different timers for retransmit, delayed acks and probes * We may wish use just one timer maintaining a list of expire jiffies * to optimize. */ void inet_csk_init_xmit_timers(struct sock *sk, void (*retransmit_handler)(struct timer_list *t), void (*delack_handler)(struct timer_list *t), void (*keepalive_handler)(struct timer_list *t)) { struct inet_connection_sock *icsk = inet_csk(sk); timer_setup(&icsk->icsk_retransmit_timer, retransmit_handler, 0); timer_setup(&icsk->icsk_delack_timer, delack_handler, 0); timer_setup(&sk->sk_timer, keepalive_handler, 0); icsk->icsk_pending = icsk->icsk_ack.pending = 0; } EXPORT_SYMBOL(inet_csk_init_xmit_timers); void inet_csk_clear_xmit_timers(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_pending = icsk->icsk_ack.pending = 0; sk_stop_timer(sk, &icsk->icsk_retransmit_timer); sk_stop_timer(sk, &icsk->icsk_delack_timer); sk_stop_timer(sk, &sk->sk_timer); } EXPORT_SYMBOL(inet_csk_clear_xmit_timers); void inet_csk_delete_keepalive_timer(struct sock *sk) { sk_stop_timer(sk, &sk->sk_timer); } EXPORT_SYMBOL(inet_csk_delete_keepalive_timer); void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len) { sk_reset_timer(sk, &sk->sk_timer, jiffies + len); } EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); struct dst_entry *inet_csk_route_req(const struct sock *sk, struct flowi4 *fl4, const struct request_sock *req) { const struct inet_request_sock *ireq = inet_rsk(req); struct net *net = read_pnet(&ireq->ireq_net); struct ip_options_rcu *opt; struct rtable *rt; rcu_read_lock(); opt = rcu_dereference(ireq->ireq_opt); flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, htons(ireq->ir_num), sk->sk_uid); security_req_classify_flow(req, flowi4_to_flowi_common(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) goto route_err; rcu_read_unlock(); return &rt->dst; route_err: ip_rt_put(rt); no_route: rcu_read_unlock(); __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } EXPORT_SYMBOL_GPL(inet_csk_route_req); struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, struct sock *newsk, const struct request_sock *req) { const struct inet_request_sock *ireq = inet_rsk(req); struct net *net = read_pnet(&ireq->ireq_net); struct inet_sock *newinet = inet_sk(newsk); struct ip_options_rcu *opt; struct flowi4 *fl4; struct rtable *rt; opt = rcu_dereference(ireq->ireq_opt); fl4 = &newinet->cork.fl.u.ip4; flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, htons(ireq->ir_num), sk->sk_uid); security_req_classify_flow(req, flowi4_to_flowi_common(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) goto route_err; return &rt->dst; route_err: ip_rt_put(rt); no_route: __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } EXPORT_SYMBOL_GPL(inet_csk_route_child_sock); /* Decide when to expire the request and when to resend SYN-ACK */ static void syn_ack_recalc(struct request_sock *req, const int max_syn_ack_retries, const u8 rskq_defer_accept, int *expire, int *resend) { if (!rskq_defer_accept) { *expire = req->num_timeout >= max_syn_ack_retries; *resend = 1; return; } *expire = req->num_timeout >= max_syn_ack_retries && (!inet_rsk(req)->acked || req->num_timeout >= rskq_defer_accept); /* Do not resend while waiting for data after ACK, * start to resend on end of deferring period to give * last chance for data or ACK to create established socket. */ *resend = !inet_rsk(req)->acked || req->num_timeout >= rskq_defer_accept - 1; } int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) { int err = req->rsk_ops->rtx_syn_ack(parent, req); if (!err) req->num_retrans++; return err; } EXPORT_SYMBOL(inet_rtx_syn_ack); static struct request_sock *inet_reqsk_clone(struct request_sock *req, struct sock *sk) { struct sock *req_sk, *nreq_sk; struct request_sock *nreq; nreq = kmem_cache_alloc(req->rsk_ops->slab, GFP_ATOMIC | __GFP_NOWARN); if (!nreq) { __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); /* paired with refcount_inc_not_zero() in reuseport_migrate_sock() */ sock_put(sk); return NULL; } req_sk = req_to_sk(req); nreq_sk = req_to_sk(nreq); memcpy(nreq_sk, req_sk, offsetof(struct sock, sk_dontcopy_begin)); memcpy(&nreq_sk->sk_dontcopy_end, &req_sk->sk_dontcopy_end, req->rsk_ops->obj_size - offsetof(struct sock, sk_dontcopy_end)); sk_node_init(&nreq_sk->sk_node); nreq_sk->sk_tx_queue_mapping = req_sk->sk_tx_queue_mapping; #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING nreq_sk->sk_rx_queue_mapping = req_sk->sk_rx_queue_mapping; #endif nreq_sk->sk_incoming_cpu = req_sk->sk_incoming_cpu; nreq->rsk_listener = sk; /* We need not acquire fastopenq->lock * because the child socket is locked in inet_csk_listen_stop(). */ if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(nreq)->tfo_listener) rcu_assign_pointer(tcp_sk(nreq->sk)->fastopen_rsk, nreq); return nreq; } static void reqsk_queue_migrated(struct request_sock_queue *queue, const struct request_sock *req) { if (req->num_timeout == 0) atomic_inc(&queue->young); atomic_inc(&queue->qlen); } static void reqsk_migrate_reset(struct request_sock *req) { req->saved_syn = NULL; #if IS_ENABLED(CONFIG_IPV6) inet_rsk(req)->ipv6_opt = NULL; inet_rsk(req)->pktopts = NULL; #else inet_rsk(req)->ireq_opt = NULL; #endif } /* return true if req was found in the ehash table */ static bool reqsk_queue_unlink(struct request_sock *req) { struct sock *sk = req_to_sk(req); bool found = false; if (sk_hashed(sk)) { struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); spinlock_t *lock = inet_ehash_lockp(hashinfo, req->rsk_hash); spin_lock(lock); found = __sk_nulls_del_node_init_rcu(sk); spin_unlock(lock); } if (timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer)) reqsk_put(req); return found; } bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req) { bool unlinked = reqsk_queue_unlink(req); if (unlinked) { reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); reqsk_put(req); } return unlinked; } EXPORT_SYMBOL(inet_csk_reqsk_queue_drop); void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req) { inet_csk_reqsk_queue_drop(sk, req); reqsk_put(req); } EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put); static void reqsk_timer_handler(struct timer_list *t) { struct request_sock *req = from_timer(req, t, rsk_timer); struct request_sock *nreq = NULL, *oreq = req; struct sock *sk_listener = req->rsk_listener; struct inet_connection_sock *icsk; struct request_sock_queue *queue; struct net *net; int max_syn_ack_retries, qlen, expire = 0, resend = 0; if (inet_sk_state_load(sk_listener) != TCP_LISTEN) { struct sock *nsk; nsk = reuseport_migrate_sock(sk_listener, req_to_sk(req), NULL); if (!nsk) goto drop; nreq = inet_reqsk_clone(req, nsk); if (!nreq) goto drop; /* The new timer for the cloned req can decrease the 2 * by calling inet_csk_reqsk_queue_drop_and_put(), so * hold another count to prevent use-after-free and * call reqsk_put() just before return. */ refcount_set(&nreq->rsk_refcnt, 2 + 1); timer_setup(&nreq->rsk_timer, reqsk_timer_handler, TIMER_PINNED); reqsk_queue_migrated(&inet_csk(nsk)->icsk_accept_queue, req); req = nreq; sk_listener = nsk; } icsk = inet_csk(sk_listener); net = sock_net(sk_listener); max_syn_ack_retries = READ_ONCE(icsk->icsk_syn_retries) ? : READ_ONCE(net->ipv4.sysctl_tcp_synack_retries); /* Normally all the openreqs are young and become mature * (i.e. converted to established socket) for first timeout. * If synack was not acknowledged for 1 second, it means * one of the following things: synack was lost, ack was lost, * rtt is high or nobody planned to ack (i.e. synflood). * When server is a bit loaded, queue is populated with old * open requests, reducing effective size of queue. * When server is well loaded, queue size reduces to zero * after several minutes of work. It is not synflood, * it is normal operation. The solution is pruning * too old entries overriding normal timeout, when * situation becomes dangerous. * * Essentially, we reserve half of room for young * embrions; and abort old ones without pity, if old * ones are about to clog our table. */ queue = &icsk->icsk_accept_queue; qlen = reqsk_queue_len(queue); if ((qlen << 1) > max(8U, READ_ONCE(sk_listener->sk_max_ack_backlog))) { int young = reqsk_queue_len_young(queue) << 1; while (max_syn_ack_retries > 2) { if (qlen < young) break; max_syn_ack_retries--; young <<= 1; } } syn_ack_recalc(req, max_syn_ack_retries, READ_ONCE(queue->rskq_defer_accept), &expire, &resend); req->rsk_ops->syn_ack_timeout(req); if (!expire && (!resend || !inet_rtx_syn_ack(sk_listener, req) || inet_rsk(req)->acked)) { if (req->num_timeout++ == 0) atomic_dec(&queue->young); mod_timer(&req->rsk_timer, jiffies + reqsk_timeout(req, TCP_RTO_MAX)); if (!nreq) return; if (!inet_ehash_insert(req_to_sk(nreq), req_to_sk(oreq), NULL)) { /* delete timer */ inet_csk_reqsk_queue_drop(sk_listener, nreq); goto no_ownership; } __NET_INC_STATS(net, LINUX_MIB_TCPMIGRATEREQSUCCESS); reqsk_migrate_reset(oreq); reqsk_queue_removed(&inet_csk(oreq->rsk_listener)->icsk_accept_queue, oreq); reqsk_put(oreq); reqsk_put(nreq); return; } /* Even if we can clone the req, we may need not retransmit any more * SYN+ACKs (nreq->num_timeout > max_syn_ack_retries, etc), or another * CPU may win the "own_req" race so that inet_ehash_insert() fails. */ if (nreq) { __NET_INC_STATS(net, LINUX_MIB_TCPMIGRATEREQFAILURE); no_ownership: reqsk_migrate_reset(nreq); reqsk_queue_removed(queue, nreq); __reqsk_free(nreq); } drop: inet_csk_reqsk_queue_drop_and_put(oreq->rsk_listener, oreq); } static void reqsk_queue_hash_req(struct request_sock *req, unsigned long timeout) { timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED); mod_timer(&req->rsk_timer, jiffies + timeout); inet_ehash_insert(req_to_sk(req), NULL, NULL); /* before letting lookups find us, make sure all req fields * are committed to memory and refcnt initialized. */ smp_wmb(); refcount_set(&req->rsk_refcnt, 2 + 1); } void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, unsigned long timeout) { reqsk_queue_hash_req(req, timeout); inet_csk_reqsk_queue_added(sk); } EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); static void inet_clone_ulp(const struct request_sock *req, struct sock *newsk, const gfp_t priority) { struct inet_connection_sock *icsk = inet_csk(newsk); if (!icsk->icsk_ulp_ops) return; icsk->icsk_ulp_ops->clone(req, newsk, priority); } /** * inet_csk_clone_lock - clone an inet socket, and lock its clone * @sk: the socket to clone * @req: request_sock * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) * * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) */ struct sock *inet_csk_clone_lock(const struct sock *sk, const struct request_sock *req, const gfp_t priority) { struct sock *newsk = sk_clone_lock(sk, priority); if (newsk) { struct inet_connection_sock *newicsk = inet_csk(newsk); newsk->sk_wait_pending = 0; inet_sk_set_state(newsk, TCP_SYN_RECV); newicsk->icsk_bind_hash = NULL; newicsk->icsk_bind2_hash = NULL; inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port; inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num; inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num); /* listeners have SOCK_RCU_FREE, not the children */ sock_reset_flag(newsk, SOCK_RCU_FREE); inet_sk(newsk)->mc_list = NULL; newsk->sk_mark = inet_rsk(req)->ir_mark; atomic64_set(&newsk->sk_cookie, atomic64_read(&inet_rsk(req)->ir_cookie)); newicsk->icsk_retransmits = 0; newicsk->icsk_backoff = 0; newicsk->icsk_probes_out = 0; newicsk->icsk_probes_tstamp = 0; /* Deinitialize accept_queue to trap illegal accesses. */ memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue)); inet_clone_ulp(req, newsk, priority); security_inet_csk_clone(newsk, req); } return newsk; } EXPORT_SYMBOL_GPL(inet_csk_clone_lock); /* * At this point, there should be no process reference to this * socket, and thus no user references at all. Therefore we * can assume the socket waitqueue is inactive and nobody will * try to jump onto it. */ void inet_csk_destroy_sock(struct sock *sk) { WARN_ON(sk->sk_state != TCP_CLOSE); WARN_ON(!sock_flag(sk, SOCK_DEAD)); /* It cannot be in hash table! */ WARN_ON(!sk_unhashed(sk)); /* If it has not 0 inet_sk(sk)->inet_num, it must be bound */ WARN_ON(inet_sk(sk)->inet_num && !inet_csk(sk)->icsk_bind_hash); sk->sk_prot->destroy(sk); sk_stream_kill_queues(sk); xfrm_sk_free_policy(sk); this_cpu_dec(*sk->sk_prot->orphan_count); sock_put(sk); } EXPORT_SYMBOL(inet_csk_destroy_sock); /* This function allows to force a closure of a socket after the call to * tcp/dccp_create_openreq_child(). */ void inet_csk_prepare_forced_close(struct sock *sk) __releases(&sk->sk_lock.slock) { /* sk_clone_lock locked the socket and set refcnt to 2 */ bh_unlock_sock(sk); sock_put(sk); inet_csk_prepare_for_destroy_sock(sk); inet_sk(sk)->inet_num = 0; } EXPORT_SYMBOL(inet_csk_prepare_forced_close); static int inet_ulp_can_listen(const struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ulp_ops && !icsk->icsk_ulp_ops->clone) return -EINVAL; return 0; } int inet_csk_listen_start(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); int err; err = inet_ulp_can_listen(sk); if (unlikely(err)) return err; reqsk_queue_alloc(&icsk->icsk_accept_queue); sk->sk_ack_backlog = 0; inet_csk_delack_init(sk); /* There is race window here: we announce ourselves listening, * but this transition is still not validated by get_port(). * It is OK, because this socket enters to hash table only * after validation is complete. */ inet_sk_state_store(sk, TCP_LISTEN); err = sk->sk_prot->get_port(sk, inet->inet_num); if (!err) { inet->inet_sport = htons(inet->inet_num); sk_dst_reset(sk); err = sk->sk_prot->hash(sk); if (likely(!err)) return 0; } inet_sk_set_state(sk, TCP_CLOSE); return err; } EXPORT_SYMBOL_GPL(inet_csk_listen_start); static void inet_child_forget(struct sock *sk, struct request_sock *req, struct sock *child) { sk->sk_prot->disconnect(child, O_NONBLOCK); sock_orphan(child); this_cpu_inc(*sk->sk_prot->orphan_count); if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) { BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req); BUG_ON(sk != req->rsk_listener); /* Paranoid, to prevent race condition if * an inbound pkt destined for child is * blocked by sock lock in tcp_v4_rcv(). * Also to satisfy an assertion in * tcp_v4_destroy_sock(). */ RCU_INIT_POINTER(tcp_sk(child)->fastopen_rsk, NULL); } inet_csk_destroy_sock(child); } struct sock *inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req, struct sock *child) { struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; spin_lock(&queue->rskq_lock); if (unlikely(sk->sk_state != TCP_LISTEN)) { inet_child_forget(sk, req, child); child = NULL; } else { req->sk = child; req->dl_next = NULL; if (queue->rskq_accept_head == NULL) WRITE_ONCE(queue->rskq_accept_head, req); else queue->rskq_accept_tail->dl_next = req; queue->rskq_accept_tail = req; sk_acceptq_added(sk); } spin_unlock(&queue->rskq_lock); return child; } EXPORT_SYMBOL(inet_csk_reqsk_queue_add); struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child, struct request_sock *req, bool own_req) { if (own_req) { inet_csk_reqsk_queue_drop(req->rsk_listener, req); reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req); if (sk != req->rsk_listener) { /* another listening sk has been selected, * migrate the req to it. */ struct request_sock *nreq; /* hold a refcnt for the nreq->rsk_listener * which is assigned in inet_reqsk_clone() */ sock_hold(sk); nreq = inet_reqsk_clone(req, sk); if (!nreq) { inet_child_forget(sk, req, child); goto child_put; } refcount_set(&nreq->rsk_refcnt, 1); if (inet_csk_reqsk_queue_add(sk, nreq, child)) { __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQSUCCESS); reqsk_migrate_reset(req); reqsk_put(req); return child; } __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); reqsk_migrate_reset(nreq); __reqsk_free(nreq); } else if (inet_csk_reqsk_queue_add(sk, req, child)) { return child; } } /* Too bad, another child took ownership of the request, undo. */ child_put: bh_unlock_sock(child); sock_put(child); return NULL; } EXPORT_SYMBOL(inet_csk_complete_hashdance); /* * This routine closes sockets which have been at least partially * opened, but not yet accepted. */ void inet_csk_listen_stop(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct request_sock_queue *queue = &icsk->icsk_accept_queue; struct request_sock *next, *req; /* Following specs, it would be better either to send FIN * (and enter FIN-WAIT-1, it is normal close) * or to send active reset (abort). * Certainly, it is pretty dangerous while synflood, but it is * bad justification for our negligence 8) * To be honest, we are not able to make either * of the variants now. --ANK */ while ((req = reqsk_queue_remove(queue, sk)) != NULL) { struct sock *child = req->sk, *nsk; struct request_sock *nreq; local_bh_disable(); bh_lock_sock(child); WARN_ON(sock_owned_by_user(child)); sock_hold(child); nsk = reuseport_migrate_sock(sk, child, NULL); if (nsk) { nreq = inet_reqsk_clone(req, nsk); if (nreq) { refcount_set(&nreq->rsk_refcnt, 1); if (inet_csk_reqsk_queue_add(nsk, nreq, child)) { __NET_INC_STATS(sock_net(nsk), LINUX_MIB_TCPMIGRATEREQSUCCESS); reqsk_migrate_reset(req); } else { __NET_INC_STATS(sock_net(nsk), LINUX_MIB_TCPMIGRATEREQFAILURE); reqsk_migrate_reset(nreq); __reqsk_free(nreq); } /* inet_csk_reqsk_queue_add() has already * called inet_child_forget() on failure case. */ goto skip_child_forget; } } inet_child_forget(sk, req, child); skip_child_forget: reqsk_put(req); bh_unlock_sock(child); local_bh_enable(); sock_put(child); cond_resched(); } if (queue->fastopenq.rskq_rst_head) { /* Free all the reqs queued in rskq_rst_head. */ spin_lock_bh(&queue->fastopenq.lock); req = queue->fastopenq.rskq_rst_head; queue->fastopenq.rskq_rst_head = NULL; spin_unlock_bh(&queue->fastopenq.lock); while (req != NULL) { next = req->dl_next; reqsk_put(req); req = next; } } WARN_ON_ONCE(sk->sk_ack_backlog); } EXPORT_SYMBOL_GPL(inet_csk_listen_stop); void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) { struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; const struct inet_sock *inet = inet_sk(sk); sin->sin_family = AF_INET; sin->sin_addr.s_addr = inet->inet_daddr; sin->sin_port = inet->inet_dport; } EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr); static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl) { const struct inet_sock *inet = inet_sk(sk); const struct ip_options_rcu *inet_opt; __be32 daddr = inet->inet_daddr; struct flowi4 *fl4; struct rtable *rt; rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt && inet_opt->opt.srr) daddr = inet_opt->opt.faddr; fl4 = &fl->u.ip4; rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr, inet->inet_dport, inet->inet_sport, sk->sk_protocol, RT_CONN_FLAGS(sk), sk->sk_bound_dev_if); if (IS_ERR(rt)) rt = NULL; if (rt) sk_setup_caps(sk, &rt->dst); rcu_read_unlock(); return &rt->dst; } struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu) { struct dst_entry *dst = __sk_dst_check(sk, 0); struct inet_sock *inet = inet_sk(sk); if (!dst) { dst = inet_csk_rebuild_route(sk, &inet->cork.fl); if (!dst) goto out; } dst->ops->update_pmtu(dst, sk, NULL, mtu, true); dst = __sk_dst_check(sk, 0); if (!dst) dst = inet_csk_rebuild_route(sk, &inet->cork.fl); out: return dst; } EXPORT_SYMBOL_GPL(inet_csk_update_pmtu); |
| 2984 2985 2984 2983 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 | // SPDX-License-Identifier: GPL-2.0 /* * device_cgroup.c - device cgroup subsystem * * Copyright 2007 IBM Corp */ #include <linux/bpf-cgroup.h> #include <linux/device_cgroup.h> #include <linux/cgroup.h> #include <linux/ctype.h> #include <linux/list.h> #include <linux/uaccess.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/rcupdate.h> #include <linux/mutex.h> #ifdef CONFIG_CGROUP_DEVICE static DEFINE_MUTEX(devcgroup_mutex); enum devcg_behavior { DEVCG_DEFAULT_NONE, DEVCG_DEFAULT_ALLOW, DEVCG_DEFAULT_DENY, }; /* * exception list locking rules: * hold devcgroup_mutex for update/read. * hold rcu_read_lock() for read. */ struct dev_exception_item { u32 major, minor; short type; short access; struct list_head list; struct rcu_head rcu; }; struct dev_cgroup { struct cgroup_subsys_state css; struct list_head exceptions; enum devcg_behavior behavior; }; static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s) { return s ? container_of(s, struct dev_cgroup, css) : NULL; } static inline struct dev_cgroup *task_devcgroup(struct task_struct *task) { return css_to_devcgroup(task_css(task, devices_cgrp_id)); } /* * called under devcgroup_mutex */ static int dev_exceptions_copy(struct list_head *dest, struct list_head *orig) { struct dev_exception_item *ex, *tmp, *new; lockdep_assert_held(&devcgroup_mutex); list_for_each_entry(ex, orig, list) { new = kmemdup(ex, sizeof(*ex), GFP_KERNEL); if (!new) goto free_and_exit; list_add_tail(&new->list, dest); } return 0; free_and_exit: list_for_each_entry_safe(ex, tmp, dest, list) { list_del(&ex->list); kfree(ex); } return -ENOMEM; } static void dev_exceptions_move(struct list_head *dest, struct list_head *orig) { struct dev_exception_item *ex, *tmp; lockdep_assert_held(&devcgroup_mutex); list_for_each_entry_safe(ex, tmp, orig, list) { list_move_tail(&ex->list, dest); } } /* * called under devcgroup_mutex */ static int dev_exception_add(struct dev_cgroup *dev_cgroup, struct dev_exception_item *ex) { struct dev_exception_item *excopy, *walk; lockdep_assert_held(&devcgroup_mutex); excopy = kmemdup(ex, sizeof(*ex), GFP_KERNEL); if (!excopy) return -ENOMEM; list_for_each_entry(walk, &dev_cgroup->exceptions, list) { if (walk->type != ex->type) continue; if (walk->major != ex->major) continue; if (walk->minor != ex->minor) continue; walk->access |= ex->access; kfree(excopy); excopy = NULL; } if (excopy != NULL) list_add_tail_rcu(&excopy->list, &dev_cgroup->exceptions); return 0; } /* * called under devcgroup_mutex */ static void dev_exception_rm(struct dev_cgroup *dev_cgroup, struct dev_exception_item *ex) { struct dev_exception_item *walk, *tmp; lockdep_assert_held(&devcgroup_mutex); list_for_each_entry_safe(walk, tmp, &dev_cgroup->exceptions, list) { if (walk->type != ex->type) continue; if (walk->major != ex->major) continue; if (walk->minor != ex->minor) continue; walk->access &= ~ex->access; if (!walk->access) { list_del_rcu(&walk->list); kfree_rcu(walk, rcu); } } } static void __dev_exception_clean(struct dev_cgroup *dev_cgroup) { struct dev_exception_item *ex, *tmp; list_for_each_entry_safe(ex, tmp, &dev_cgroup->exceptions, list) { list_del_rcu(&ex->list); kfree_rcu(ex, rcu); } } /** * dev_exception_clean - frees all entries of the exception list * @dev_cgroup: dev_cgroup with the exception list to be cleaned * * called under devcgroup_mutex */ static void dev_exception_clean(struct dev_cgroup *dev_cgroup) { lockdep_assert_held(&devcgroup_mutex); __dev_exception_clean(dev_cgroup); } static inline bool is_devcg_online(const struct dev_cgroup *devcg) { return (devcg->behavior != DEVCG_DEFAULT_NONE); } /** * devcgroup_online - initializes devcgroup's behavior and exceptions based on * parent's * @css: css getting online * returns 0 in case of success, error code otherwise */ static int devcgroup_online(struct cgroup_subsys_state *css) { struct dev_cgroup *dev_cgroup = css_to_devcgroup(css); struct dev_cgroup *parent_dev_cgroup = css_to_devcgroup(css->parent); int ret = 0; mutex_lock(&devcgroup_mutex); if (parent_dev_cgroup == NULL) dev_cgroup->behavior = DEVCG_DEFAULT_ALLOW; else { ret = dev_exceptions_copy(&dev_cgroup->exceptions, &parent_dev_cgroup->exceptions); if (!ret) dev_cgroup->behavior = parent_dev_cgroup->behavior; } mutex_unlock(&devcgroup_mutex); return ret; } static void devcgroup_offline(struct cgroup_subsys_state *css) { struct dev_cgroup *dev_cgroup = css_to_devcgroup(css); mutex_lock(&devcgroup_mutex); dev_cgroup->behavior = DEVCG_DEFAULT_NONE; mutex_unlock(&devcgroup_mutex); } /* * called from kernel/cgroup/cgroup.c with cgroup_lock() held. */ static struct cgroup_subsys_state * devcgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct dev_cgroup *dev_cgroup; dev_cgroup = kzalloc(sizeof(*dev_cgroup), GFP_KERNEL); if (!dev_cgroup) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&dev_cgroup->exceptions); dev_cgroup->behavior = DEVCG_DEFAULT_NONE; return &dev_cgroup->css; } static void devcgroup_css_free(struct cgroup_subsys_state *css) { struct dev_cgroup *dev_cgroup = css_to_devcgroup(css); __dev_exception_clean(dev_cgroup); kfree(dev_cgroup); } #define DEVCG_ALLOW 1 #define DEVCG_DENY 2 #define DEVCG_LIST 3 #define MAJMINLEN 13 #define ACCLEN 4 static void set_access(char *acc, short access) { int idx = 0; memset(acc, 0, ACCLEN); if (access & DEVCG_ACC_READ) acc[idx++] = 'r'; if (access & DEVCG_ACC_WRITE) acc[idx++] = 'w'; if (access & DEVCG_ACC_MKNOD) acc[idx++] = 'm'; } static char type_to_char(short type) { if (type == DEVCG_DEV_ALL) return 'a'; if (type == DEVCG_DEV_CHAR) return 'c'; if (type == DEVCG_DEV_BLOCK) return 'b'; return 'X'; } static void set_majmin(char *str, unsigned m) { if (m == ~0) strcpy(str, "*"); else sprintf(str, "%u", m); } static int devcgroup_seq_show(struct seq_file *m, void *v) { struct dev_cgroup *devcgroup = css_to_devcgroup(seq_css(m)); struct dev_exception_item *ex; char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN]; rcu_read_lock(); /* * To preserve the compatibility: * - Only show the "all devices" when the default policy is to allow * - List the exceptions in case the default policy is to deny * This way, the file remains as a "whitelist of devices" */ if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) { set_access(acc, DEVCG_ACC_MASK); set_majmin(maj, ~0); set_majmin(min, ~0); seq_printf(m, "%c %s:%s %s\n", type_to_char(DEVCG_DEV_ALL), maj, min, acc); } else { list_for_each_entry_rcu(ex, &devcgroup->exceptions, list) { set_access(acc, ex->access); set_majmin(maj, ex->major); set_majmin(min, ex->minor); seq_printf(m, "%c %s:%s %s\n", type_to_char(ex->type), maj, min, acc); } } rcu_read_unlock(); return 0; } /** * match_exception - iterates the exception list trying to find a complete match * @exceptions: list of exceptions * @type: device type (DEVCG_DEV_BLOCK or DEVCG_DEV_CHAR) * @major: device file major number, ~0 to match all * @minor: device file minor number, ~0 to match all * @access: permission mask (DEVCG_ACC_READ, DEVCG_ACC_WRITE, DEVCG_ACC_MKNOD) * * It is considered a complete match if an exception is found that will * contain the entire range of provided parameters. * * Return: true in case it matches an exception completely */ static bool match_exception(struct list_head *exceptions, short type, u32 major, u32 minor, short access) { struct dev_exception_item *ex; list_for_each_entry_rcu(ex, exceptions, list) { if ((type & DEVCG_DEV_BLOCK) && !(ex->type & DEVCG_DEV_BLOCK)) continue; if ((type & DEVCG_DEV_CHAR) && !(ex->type & DEVCG_DEV_CHAR)) continue; if (ex->major != ~0 && ex->major != major) continue; if (ex->minor != ~0 && ex->minor != minor) continue; /* provided access cannot have more than the exception rule */ if (access & (~ex->access)) continue; return true; } return false; } /** * match_exception_partial - iterates the exception list trying to find a partial match * @exceptions: list of exceptions * @type: device type (DEVCG_DEV_BLOCK or DEVCG_DEV_CHAR) * @major: device file major number, ~0 to match all * @minor: device file minor number, ~0 to match all * @access: permission mask (DEVCG_ACC_READ, DEVCG_ACC_WRITE, DEVCG_ACC_MKNOD) * * It is considered a partial match if an exception's range is found to * contain *any* of the devices specified by provided parameters. This is * used to make sure no extra access is being granted that is forbidden by * any of the exception list. * * Return: true in case the provided range mat matches an exception completely */ static bool match_exception_partial(struct list_head *exceptions, short type, u32 major, u32 minor, short access) { struct dev_exception_item *ex; list_for_each_entry_rcu(ex, exceptions, list, lockdep_is_held(&devcgroup_mutex)) { if ((type & DEVCG_DEV_BLOCK) && !(ex->type & DEVCG_DEV_BLOCK)) continue; if ((type & DEVCG_DEV_CHAR) && !(ex->type & DEVCG_DEV_CHAR)) continue; /* * We must be sure that both the exception and the provided * range aren't masking all devices */ if (ex->major != ~0 && major != ~0 && ex->major != major) continue; if (ex->minor != ~0 && minor != ~0 && ex->minor != minor) continue; /* * In order to make sure the provided range isn't matching * an exception, all its access bits shouldn't match the * exception's access bits */ if (!(access & ex->access)) continue; return true; } return false; } /** * verify_new_ex - verifies if a new exception is allowed by parent cgroup's permissions * @dev_cgroup: dev cgroup to be tested against * @refex: new exception * @behavior: behavior of the exception's dev_cgroup * * This is used to make sure a child cgroup won't have more privileges * than its parent */ static bool verify_new_ex(struct dev_cgroup *dev_cgroup, struct dev_exception_item *refex, enum devcg_behavior behavior) { bool match = false; RCU_LOCKDEP_WARN(!rcu_read_lock_held() && !lockdep_is_held(&devcgroup_mutex), "device_cgroup:verify_new_ex called without proper synchronization"); if (dev_cgroup->behavior == DEVCG_DEFAULT_ALLOW) { if (behavior == DEVCG_DEFAULT_ALLOW) { /* * new exception in the child doesn't matter, only * adding extra restrictions */ return true; } else { /* * new exception in the child will add more devices * that can be accessed, so it can't match any of * parent's exceptions, even slightly */ match = match_exception_partial(&dev_cgroup->exceptions, refex->type, refex->major, refex->minor, refex->access); if (match) return false; return true; } } else { /* * Only behavior == DEVCG_DEFAULT_DENY allowed here, therefore * the new exception will add access to more devices and must * be contained completely in an parent's exception to be * allowed */ match = match_exception(&dev_cgroup->exceptions, refex->type, refex->major, refex->minor, refex->access); if (match) /* parent has an exception that matches the proposed */ return true; else return false; } return false; } /* * parent_has_perm: * when adding a new allow rule to a device exception list, the rule * must be allowed in the parent device */ static int parent_has_perm(struct dev_cgroup *childcg, struct dev_exception_item *ex) { struct dev_cgroup *parent = css_to_devcgroup(childcg->css.parent); if (!parent) return 1; return verify_new_ex(parent, ex, childcg->behavior); } /** * parent_allows_removal - verify if it's ok to remove an exception * @childcg: child cgroup from where the exception will be removed * @ex: exception being removed * * When removing an exception in cgroups with default ALLOW policy, it must * be checked if removing it will give the child cgroup more access than the * parent. * * Return: true if it's ok to remove exception, false otherwise */ static bool parent_allows_removal(struct dev_cgroup *childcg, struct dev_exception_item *ex) { struct dev_cgroup *parent = css_to_devcgroup(childcg->css.parent); if (!parent) return true; /* It's always allowed to remove access to devices */ if (childcg->behavior == DEVCG_DEFAULT_DENY) return true; /* * Make sure you're not removing part or a whole exception existing in * the parent cgroup */ return !match_exception_partial(&parent->exceptions, ex->type, ex->major, ex->minor, ex->access); } /** * may_allow_all - checks if it's possible to change the behavior to * allow based on parent's rules. * @parent: device cgroup's parent * returns: != 0 in case it's allowed, 0 otherwise */ static inline int may_allow_all(struct dev_cgroup *parent) { if (!parent) return 1; return parent->behavior == DEVCG_DEFAULT_ALLOW; } /** * revalidate_active_exceptions - walks through the active exception list and * revalidates the exceptions based on parent's * behavior and exceptions. The exceptions that * are no longer valid will be removed. * Called with devcgroup_mutex held. * @devcg: cgroup which exceptions will be checked * * This is one of the three key functions for hierarchy implementation. * This function is responsible for re-evaluating all the cgroup's active * exceptions due to a parent's exception change. * Refer to Documentation/admin-guide/cgroup-v1/devices.rst for more details. */ static void revalidate_active_exceptions(struct dev_cgroup *devcg) { struct dev_exception_item *ex; struct list_head *this, *tmp; list_for_each_safe(this, tmp, &devcg->exceptions) { ex = container_of(this, struct dev_exception_item, list); if (!parent_has_perm(devcg, ex)) dev_exception_rm(devcg, ex); } } /** * propagate_exception - propagates a new exception to the children * @devcg_root: device cgroup that added a new exception * @ex: new exception to be propagated * * returns: 0 in case of success, != 0 in case of error */ static int propagate_exception(struct dev_cgroup *devcg_root, struct dev_exception_item *ex) { struct cgroup_subsys_state *pos; int rc = 0; rcu_read_lock(); css_for_each_descendant_pre(pos, &devcg_root->css) { struct dev_cgroup *devcg = css_to_devcgroup(pos); /* * Because devcgroup_mutex is held, no devcg will become * online or offline during the tree walk (see on/offline * methods), and online ones are safe to access outside RCU * read lock without bumping refcnt. */ if (pos == &devcg_root->css || !is_devcg_online(devcg)) continue; rcu_read_unlock(); /* * in case both root's behavior and devcg is allow, a new * restriction means adding to the exception list */ if (devcg_root->behavior == DEVCG_DEFAULT_ALLOW && devcg->behavior == DEVCG_DEFAULT_ALLOW) { rc = dev_exception_add(devcg, ex); if (rc) return rc; } else { /* * in the other possible cases: * root's behavior: allow, devcg's: deny * root's behavior: deny, devcg's: deny * the exception will be removed */ dev_exception_rm(devcg, ex); } revalidate_active_exceptions(devcg); rcu_read_lock(); } rcu_read_unlock(); return rc; } /* * Modify the exception list using allow/deny rules. * CAP_SYS_ADMIN is needed for this. It's at least separate from CAP_MKNOD * so we can give a container CAP_MKNOD to let it create devices but not * modify the exception list. * It seems likely we'll want to add a CAP_CONTAINER capability to allow * us to also grant CAP_SYS_ADMIN to containers without giving away the * device exception list controls, but for now we'll stick with CAP_SYS_ADMIN * * Taking rules away is always allowed (given CAP_SYS_ADMIN). Granting * new access is only allowed if you're in the top-level cgroup, or your * parent cgroup has the access you're asking for. */ static int devcgroup_update_access(struct dev_cgroup *devcgroup, int filetype, char *buffer) { const char *b; char temp[12]; /* 11 + 1 characters needed for a u32 */ int count, rc = 0; struct dev_exception_item ex; struct dev_cgroup *parent = css_to_devcgroup(devcgroup->css.parent); struct dev_cgroup tmp_devcgrp; if (!capable(CAP_SYS_ADMIN)) return -EPERM; memset(&ex, 0, sizeof(ex)); memset(&tmp_devcgrp, 0, sizeof(tmp_devcgrp)); b = buffer; switch (*b) { case 'a': switch (filetype) { case DEVCG_ALLOW: if (css_has_online_children(&devcgroup->css)) return -EINVAL; if (!may_allow_all(parent)) return -EPERM; if (!parent) { devcgroup->behavior = DEVCG_DEFAULT_ALLOW; dev_exception_clean(devcgroup); break; } INIT_LIST_HEAD(&tmp_devcgrp.exceptions); rc = dev_exceptions_copy(&tmp_devcgrp.exceptions, &devcgroup->exceptions); if (rc) return rc; dev_exception_clean(devcgroup); rc = dev_exceptions_copy(&devcgroup->exceptions, &parent->exceptions); if (rc) { dev_exceptions_move(&devcgroup->exceptions, &tmp_devcgrp.exceptions); return rc; } devcgroup->behavior = DEVCG_DEFAULT_ALLOW; dev_exception_clean(&tmp_devcgrp); break; case DEVCG_DENY: if (css_has_online_children(&devcgroup->css)) return -EINVAL; dev_exception_clean(devcgroup); devcgroup->behavior = DEVCG_DEFAULT_DENY; break; default: return -EINVAL; } return 0; case 'b': ex.type = DEVCG_DEV_BLOCK; break; case 'c': ex.type = DEVCG_DEV_CHAR; break; default: return -EINVAL; } b++; if (!isspace(*b)) return -EINVAL; b++; if (*b == '*') { ex.major = ~0; b++; } else if (isdigit(*b)) { memset(temp, 0, sizeof(temp)); for (count = 0; count < sizeof(temp) - 1; count++) { temp[count] = *b; b++; if (!isdigit(*b)) break; } rc = kstrtou32(temp, 10, &ex.major); if (rc) return -EINVAL; } else { return -EINVAL; } if (*b != ':') return -EINVAL; b++; /* read minor */ if (*b == '*') { ex.minor = ~0; b++; } else if (isdigit(*b)) { memset(temp, 0, sizeof(temp)); for (count = 0; count < sizeof(temp) - 1; count++) { temp[count] = *b; b++; if (!isdigit(*b)) break; } rc = kstrtou32(temp, 10, &ex.minor); if (rc) return -EINVAL; } else { return -EINVAL; } if (!isspace(*b)) return -EINVAL; for (b++, count = 0; count < 3; count++, b++) { switch (*b) { case 'r': ex.access |= DEVCG_ACC_READ; break; case 'w': ex.access |= DEVCG_ACC_WRITE; break; case 'm': ex.access |= DEVCG_ACC_MKNOD; break; case '\n': case '\0': count = 3; break; default: return -EINVAL; } } switch (filetype) { case DEVCG_ALLOW: /* * If the default policy is to allow by default, try to remove * an matching exception instead. And be silent about it: we * don't want to break compatibility */ if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) { /* Check if the parent allows removing it first */ if (!parent_allows_removal(devcgroup, &ex)) return -EPERM; dev_exception_rm(devcgroup, &ex); break; } if (!parent_has_perm(devcgroup, &ex)) return -EPERM; rc = dev_exception_add(devcgroup, &ex); break; case DEVCG_DENY: /* * If the default policy is to deny by default, try to remove * an matching exception instead. And be silent about it: we * don't want to break compatibility */ if (devcgroup->behavior == DEVCG_DEFAULT_DENY) dev_exception_rm(devcgroup, &ex); else rc = dev_exception_add(devcgroup, &ex); if (rc) break; /* we only propagate new restrictions */ rc = propagate_exception(devcgroup, &ex); break; default: rc = -EINVAL; } return rc; } static ssize_t devcgroup_access_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { int retval; mutex_lock(&devcgroup_mutex); retval = devcgroup_update_access(css_to_devcgroup(of_css(of)), of_cft(of)->private, strstrip(buf)); mutex_unlock(&devcgroup_mutex); return retval ?: nbytes; } static struct cftype dev_cgroup_files[] = { { .name = "allow", .write = devcgroup_access_write, .private = DEVCG_ALLOW, }, { .name = "deny", .write = devcgroup_access_write, .private = DEVCG_DENY, }, { .name = "list", .seq_show = devcgroup_seq_show, .private = DEVCG_LIST, }, { } /* terminate */ }; struct cgroup_subsys devices_cgrp_subsys = { .css_alloc = devcgroup_css_alloc, .css_free = devcgroup_css_free, .css_online = devcgroup_online, .css_offline = devcgroup_offline, .legacy_cftypes = dev_cgroup_files, }; /** * devcgroup_legacy_check_permission - checks if an inode operation is permitted * @type: device type * @major: device major number * @minor: device minor number * @access: combination of DEVCG_ACC_WRITE, DEVCG_ACC_READ and DEVCG_ACC_MKNOD * * returns 0 on success, -EPERM case the operation is not permitted */ static int devcgroup_legacy_check_permission(short type, u32 major, u32 minor, short access) { struct dev_cgroup *dev_cgroup; bool rc; rcu_read_lock(); dev_cgroup = task_devcgroup(current); if (dev_cgroup->behavior == DEVCG_DEFAULT_ALLOW) /* Can't match any of the exceptions, even partially */ rc = !match_exception_partial(&dev_cgroup->exceptions, type, major, minor, access); else /* Need to match completely one exception to be allowed */ rc = match_exception(&dev_cgroup->exceptions, type, major, minor, access); rcu_read_unlock(); if (!rc) return -EPERM; return 0; } #endif /* CONFIG_CGROUP_DEVICE */ #if defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF) int devcgroup_check_permission(short type, u32 major, u32 minor, short access) { int rc = BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access); if (rc) return rc; #ifdef CONFIG_CGROUP_DEVICE return devcgroup_legacy_check_permission(type, major, minor, access); #else /* CONFIG_CGROUP_DEVICE */ return 0; #endif /* CONFIG_CGROUP_DEVICE */ } EXPORT_SYMBOL(devcgroup_check_permission); #endif /* defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF) */ |
| 172 172 172 171 171 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 | // SPDX-License-Identifier: GPL-2.0-only /* * lib/hexdump.c */ #include <linux/types.h> #include <linux/ctype.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/minmax.h> #include <linux/export.h> #include <asm/unaligned.h> const char hex_asc[] = "0123456789abcdef"; EXPORT_SYMBOL(hex_asc); const char hex_asc_upper[] = "0123456789ABCDEF"; EXPORT_SYMBOL(hex_asc_upper); /** * hex_to_bin - convert a hex digit to its real value * @ch: ascii character represents hex digit * * hex_to_bin() converts one hex digit to its actual value or -1 in case of bad * input. * * This function is used to load cryptographic keys, so it is coded in such a * way that there are no conditions or memory accesses that depend on data. * * Explanation of the logic: * (ch - '9' - 1) is negative if ch <= '9' * ('0' - 1 - ch) is negative if ch >= '0' * we "and" these two values, so the result is negative if ch is in the range * '0' ... '9' * we are only interested in the sign, so we do a shift ">> 8"; note that right * shift of a negative value is implementation-defined, so we cast the * value to (unsigned) before the shift --- we have 0xffffff if ch is in * the range '0' ... '9', 0 otherwise * we "and" this value with (ch - '0' + 1) --- we have a value 1 ... 10 if ch is * in the range '0' ... '9', 0 otherwise * we add this value to -1 --- we have a value 0 ... 9 if ch is in the range '0' * ... '9', -1 otherwise * the next line is similar to the previous one, but we need to decode both * uppercase and lowercase letters, so we use (ch & 0xdf), which converts * lowercase to uppercase */ int hex_to_bin(unsigned char ch) { unsigned char cu = ch & 0xdf; return -1 + ((ch - '0' + 1) & (unsigned)((ch - '9' - 1) & ('0' - 1 - ch)) >> 8) + ((cu - 'A' + 11) & (unsigned)((cu - 'F' - 1) & ('A' - 1 - cu)) >> 8); } EXPORT_SYMBOL(hex_to_bin); /** * hex2bin - convert an ascii hexadecimal string to its binary representation * @dst: binary result * @src: ascii hexadecimal string * @count: result length * * Return 0 on success, -EINVAL in case of bad input. */ int hex2bin(u8 *dst, const char *src, size_t count) { while (count--) { int hi, lo; hi = hex_to_bin(*src++); if (unlikely(hi < 0)) return -EINVAL; lo = hex_to_bin(*src++); if (unlikely(lo < 0)) return -EINVAL; *dst++ = (hi << 4) | lo; } return 0; } EXPORT_SYMBOL(hex2bin); /** * bin2hex - convert binary data to an ascii hexadecimal string * @dst: ascii hexadecimal result * @src: binary data * @count: binary data length */ char *bin2hex(char *dst, const void *src, size_t count) { const unsigned char *_src = src; while (count--) dst = hex_byte_pack(dst, *_src++); return dst; } EXPORT_SYMBOL(bin2hex); /** * hex_dump_to_buffer - convert a blob of data to "hex ASCII" in memory * @buf: data blob to dump * @len: number of bytes in the @buf * @rowsize: number of bytes to print per line; must be 16 or 32 * @groupsize: number of bytes to print at a time (1, 2, 4, 8; default = 1) * @linebuf: where to put the converted data * @linebuflen: total size of @linebuf, including space for terminating NUL * @ascii: include ASCII after the hex output * * hex_dump_to_buffer() works on one "line" of output at a time, i.e., * 16 or 32 bytes of input data converted to hex + ASCII output. * * Given a buffer of u8 data, hex_dump_to_buffer() converts the input data * to a hex + ASCII dump at the supplied memory location. * The converted output is always NUL-terminated. * * E.g.: * hex_dump_to_buffer(frame->data, frame->len, 16, 1, * linebuf, sizeof(linebuf), true); * * example output buffer: * 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f @ABCDEFGHIJKLMNO * * Return: * The amount of bytes placed in the buffer without terminating NUL. If the * output was truncated, then the return value is the number of bytes * (excluding the terminating NUL) which would have been written to the final * string if enough space had been available. */ int hex_dump_to_buffer(const void *buf, size_t len, int rowsize, int groupsize, char *linebuf, size_t linebuflen, bool ascii) { const u8 *ptr = buf; int ngroups; u8 ch; int j, lx = 0; int ascii_column; int ret; if (rowsize != 16 && rowsize != 32) rowsize = 16; if (len > rowsize) /* limit to one line at a time */ len = rowsize; if (!is_power_of_2(groupsize) || groupsize > 8) groupsize = 1; if ((len % groupsize) != 0) /* no mixed size output */ groupsize = 1; ngroups = len / groupsize; ascii_column = rowsize * 2 + rowsize / groupsize + 1; if (!linebuflen) goto overflow1; if (!len) goto nil; if (groupsize == 8) { const u64 *ptr8 = buf; for (j = 0; j < ngroups; j++) { ret = snprintf(linebuf + lx, linebuflen - lx, "%s%16.16llx", j ? " " : "", get_unaligned(ptr8 + j)); if (ret >= linebuflen - lx) goto overflow1; lx += ret; } } else if (groupsize == 4) { const u32 *ptr4 = buf; for (j = 0; j < ngroups; j++) { ret = snprintf(linebuf + lx, linebuflen - lx, "%s%8.8x", j ? " " : "", get_unaligned(ptr4 + j)); if (ret >= linebuflen - lx) goto overflow1; lx += ret; } } else if (groupsize == 2) { const u16 *ptr2 = buf; for (j = 0; j < ngroups; j++) { ret = snprintf(linebuf + lx, linebuflen - lx, "%s%4.4x", j ? " " : "", get_unaligned(ptr2 + j)); if (ret >= linebuflen - lx) goto overflow1; lx += ret; } } else { for (j = 0; j < len; j++) { if (linebuflen < lx + 2) goto overflow2; ch = ptr[j]; linebuf[lx++] = hex_asc_hi(ch); if (linebuflen < lx + 2) goto overflow2; linebuf[lx++] = hex_asc_lo(ch); if (linebuflen < lx + 2) goto overflow2; linebuf[lx++] = ' '; } if (j) lx--; } if (!ascii) goto nil; while (lx < ascii_column) { if (linebuflen < lx + 2) goto overflow2; linebuf[lx++] = ' '; } for (j = 0; j < len; j++) { if (linebuflen < lx + 2) goto overflow2; ch = ptr[j]; linebuf[lx++] = (isascii(ch) && isprint(ch)) ? ch : '.'; } nil: linebuf[lx] = '\0'; return lx; overflow2: linebuf[lx++] = '\0'; overflow1: return ascii ? ascii_column + len : (groupsize * 2 + 1) * ngroups - 1; } EXPORT_SYMBOL(hex_dump_to_buffer); #ifdef CONFIG_PRINTK /** * print_hex_dump - print a text hex dump to syslog for a binary blob of data * @level: kernel log level (e.g. KERN_DEBUG) * @prefix_str: string to prefix each line with; * caller supplies trailing spaces for alignment if desired * @prefix_type: controls whether prefix of an offset, address, or none * is printed (%DUMP_PREFIX_OFFSET, %DUMP_PREFIX_ADDRESS, %DUMP_PREFIX_NONE) * @rowsize: number of bytes to print per line; must be 16 or 32 * @groupsize: number of bytes to print at a time (1, 2, 4, 8; default = 1) * @buf: data blob to dump * @len: number of bytes in the @buf * @ascii: include ASCII after the hex output * * Given a buffer of u8 data, print_hex_dump() prints a hex + ASCII dump * to the kernel log at the specified kernel log level, with an optional * leading prefix. * * print_hex_dump() works on one "line" of output at a time, i.e., * 16 or 32 bytes of input data converted to hex + ASCII output. * print_hex_dump() iterates over the entire input @buf, breaking it into * "line size" chunks to format and print. * * E.g.: * print_hex_dump(KERN_DEBUG, "raw data: ", DUMP_PREFIX_ADDRESS, * 16, 1, frame->data, frame->len, true); * * Example output using %DUMP_PREFIX_OFFSET and 1-byte mode: * 0009ab42: 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f @ABCDEFGHIJKLMNO * Example output using %DUMP_PREFIX_ADDRESS and 4-byte mode: * ffffffff88089af0: 73727170 77767574 7b7a7978 7f7e7d7c pqrstuvwxyz{|}~. */ void print_hex_dump(const char *level, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii) { const u8 *ptr = buf; int i, linelen, remaining = len; unsigned char linebuf[32 * 3 + 2 + 32 + 1]; if (rowsize != 16 && rowsize != 32) rowsize = 16; for (i = 0; i < len; i += rowsize) { linelen = min(remaining, rowsize); remaining -= rowsize; hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize, linebuf, sizeof(linebuf), ascii); switch (prefix_type) { case DUMP_PREFIX_ADDRESS: printk("%s%s%p: %s\n", level, prefix_str, ptr + i, linebuf); break; case DUMP_PREFIX_OFFSET: printk("%s%s%.8x: %s\n", level, prefix_str, i, linebuf); break; default: printk("%s%s%s\n", level, prefix_str, linebuf); break; } } } EXPORT_SYMBOL(print_hex_dump); #endif /* defined(CONFIG_PRINTK) */ |
| 142 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2012 Regents of the University of California */ #include <linux/cpu.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/sched/debug.h> #include <linux/sched/signal.h> #include <linux/signal.h> #include <linux/kdebug.h> #include <linux/uaccess.h> #include <linux/kprobes.h> #include <linux/uprobes.h> #include <asm/uprobes.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/irq.h> #include <linux/kexec.h> #include <linux/entry-common.h> #include <asm/asm-prototypes.h> #include <asm/bug.h> #include <asm/cfi.h> #include <asm/csr.h> #include <asm/processor.h> #include <asm/ptrace.h> #include <asm/syscall.h> #include <asm/thread_info.h> #include <asm/vector.h> #include <asm/irq_stack.h> int show_unhandled_signals = 1; static DEFINE_SPINLOCK(die_lock); static void dump_kernel_instr(const char *loglvl, struct pt_regs *regs) { char str[sizeof("0000 ") * 12 + 2 + 1], *p = str; const u16 *insns = (u16 *)instruction_pointer(regs); long bad; u16 val; int i; for (i = -10; i < 2; i++) { bad = get_kernel_nofault(val, &insns[i]); if (!bad) { p += sprintf(p, i == 0 ? "(%04hx) " : "%04hx ", val); } else { printk("%sCode: Unable to access instruction at 0x%px.\n", loglvl, &insns[i]); return; } } printk("%sCode: %s\n", loglvl, str); } void die(struct pt_regs *regs, const char *str) { static int die_counter; int ret; long cause; unsigned long flags; oops_enter(); spin_lock_irqsave(&die_lock, flags); console_verbose(); bust_spinlocks(1); pr_emerg("%s [#%d]\n", str, ++die_counter); print_modules(); if (regs) { show_regs(regs); dump_kernel_instr(KERN_EMERG, regs); } cause = regs ? regs->cause : -1; ret = notify_die(DIE_OOPS, str, regs, 0, cause, SIGSEGV); if (kexec_should_crash(current)) crash_kexec(regs); bust_spinlocks(0); add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); spin_unlock_irqrestore(&die_lock, flags); oops_exit(); if (in_interrupt()) panic("Fatal exception in interrupt"); if (panic_on_oops) panic("Fatal exception"); if (ret != NOTIFY_STOP) make_task_dead(SIGSEGV); } void do_trap(struct pt_regs *regs, int signo, int code, unsigned long addr) { struct task_struct *tsk = current; if (show_unhandled_signals && unhandled_signal(tsk, signo) && printk_ratelimit()) { pr_info("%s[%d]: unhandled signal %d code 0x%x at 0x" REG_FMT, tsk->comm, task_pid_nr(tsk), signo, code, addr); print_vma_addr(KERN_CONT " in ", instruction_pointer(regs)); pr_cont("\n"); __show_regs(regs); } force_sig_fault(signo, code, (void __user *)addr); } static void do_trap_error(struct pt_regs *regs, int signo, int code, unsigned long addr, const char *str) { current->thread.bad_cause = regs->cause; if (user_mode(regs)) { do_trap(regs, signo, code, addr); } else { if (!fixup_exception(regs)) die(regs, str); } } #if defined(CONFIG_XIP_KERNEL) && defined(CONFIG_RISCV_ALTERNATIVE) #define __trap_section __noinstr_section(".xip.traps") #else #define __trap_section noinstr #endif #define DO_ERROR_INFO(name, signo, code, str) \ asmlinkage __visible __trap_section void name(struct pt_regs *regs) \ { \ if (user_mode(regs)) { \ irqentry_enter_from_user_mode(regs); \ do_trap_error(regs, signo, code, regs->epc, "Oops - " str); \ irqentry_exit_to_user_mode(regs); \ } else { \ irqentry_state_t state = irqentry_nmi_enter(regs); \ do_trap_error(regs, signo, code, regs->epc, "Oops - " str); \ irqentry_nmi_exit(regs, state); \ } \ } DO_ERROR_INFO(do_trap_unknown, SIGILL, ILL_ILLTRP, "unknown exception"); DO_ERROR_INFO(do_trap_insn_misaligned, SIGBUS, BUS_ADRALN, "instruction address misaligned"); DO_ERROR_INFO(do_trap_insn_fault, SIGSEGV, SEGV_ACCERR, "instruction access fault"); asmlinkage __visible __trap_section void do_trap_insn_illegal(struct pt_regs *regs) { bool handled; if (user_mode(regs)) { irqentry_enter_from_user_mode(regs); local_irq_enable(); handled = riscv_v_first_use_handler(regs); local_irq_disable(); if (!handled) do_trap_error(regs, SIGILL, ILL_ILLOPC, regs->epc, "Oops - illegal instruction"); irqentry_exit_to_user_mode(regs); } else { irqentry_state_t state = irqentry_nmi_enter(regs); do_trap_error(regs, SIGILL, ILL_ILLOPC, regs->epc, "Oops - illegal instruction"); irqentry_nmi_exit(regs, state); } } DO_ERROR_INFO(do_trap_load_fault, SIGSEGV, SEGV_ACCERR, "load access fault"); #ifndef CONFIG_RISCV_M_MODE DO_ERROR_INFO(do_trap_load_misaligned, SIGBUS, BUS_ADRALN, "Oops - load address misaligned"); DO_ERROR_INFO(do_trap_store_misaligned, SIGBUS, BUS_ADRALN, "Oops - store (or AMO) address misaligned"); #else int handle_misaligned_load(struct pt_regs *regs); int handle_misaligned_store(struct pt_regs *regs); asmlinkage __visible __trap_section void do_trap_load_misaligned(struct pt_regs *regs) { if (user_mode(regs)) { irqentry_enter_from_user_mode(regs); if (handle_misaligned_load(regs)) do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc, "Oops - load address misaligned"); irqentry_exit_to_user_mode(regs); } else { irqentry_state_t state = irqentry_nmi_enter(regs); if (handle_misaligned_load(regs)) do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc, "Oops - load address misaligned"); irqentry_nmi_exit(regs, state); } } asmlinkage __visible __trap_section void do_trap_store_misaligned(struct pt_regs *regs) { if (user_mode(regs)) { irqentry_enter_from_user_mode(regs); if (handle_misaligned_store(regs)) do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc, "Oops - store (or AMO) address misaligned"); irqentry_exit_to_user_mode(regs); } else { irqentry_state_t state = irqentry_nmi_enter(regs); if (handle_misaligned_store(regs)) do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc, "Oops - store (or AMO) address misaligned"); irqentry_nmi_exit(regs, state); } } #endif DO_ERROR_INFO(do_trap_store_fault, SIGSEGV, SEGV_ACCERR, "store (or AMO) access fault"); DO_ERROR_INFO(do_trap_ecall_s, SIGILL, ILL_ILLTRP, "environment call from S-mode"); DO_ERROR_INFO(do_trap_ecall_m, SIGILL, ILL_ILLTRP, "environment call from M-mode"); static inline unsigned long get_break_insn_length(unsigned long pc) { bug_insn_t insn; if (get_kernel_nofault(insn, (bug_insn_t *)pc)) return 0; return GET_INSN_LENGTH(insn); } static bool probe_single_step_handler(struct pt_regs *regs) { bool user = user_mode(regs); return user ? uprobe_single_step_handler(regs) : kprobe_single_step_handler(regs); } static bool probe_breakpoint_handler(struct pt_regs *regs) { bool user = user_mode(regs); return user ? uprobe_breakpoint_handler(regs) : kprobe_breakpoint_handler(regs); } void handle_break(struct pt_regs *regs) { if (probe_single_step_handler(regs)) return; if (probe_breakpoint_handler(regs)) return; current->thread.bad_cause = regs->cause; if (user_mode(regs)) force_sig_fault(SIGTRAP, TRAP_BRKPT, (void __user *)regs->epc); #ifdef CONFIG_KGDB else if (notify_die(DIE_TRAP, "EBREAK", regs, 0, regs->cause, SIGTRAP) == NOTIFY_STOP) return; #endif else if (report_bug(regs->epc, regs) == BUG_TRAP_TYPE_WARN || handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) regs->epc += get_break_insn_length(regs->epc); else die(regs, "Kernel BUG"); } asmlinkage __visible __trap_section void do_trap_break(struct pt_regs *regs) { if (user_mode(regs)) { irqentry_enter_from_user_mode(regs); handle_break(regs); irqentry_exit_to_user_mode(regs); } else { irqentry_state_t state = irqentry_nmi_enter(regs); handle_break(regs); irqentry_nmi_exit(regs, state); } } asmlinkage __visible __trap_section void do_trap_ecall_u(struct pt_regs *regs) { if (user_mode(regs)) { long syscall = regs->a7; regs->epc += 4; regs->orig_a0 = regs->a0; riscv_v_vstate_discard(regs); syscall = syscall_enter_from_user_mode(regs, syscall); if (syscall >= 0 && syscall < NR_syscalls) syscall_handler(regs, syscall); else if (syscall != -1) regs->a0 = -ENOSYS; syscall_exit_to_user_mode(regs); } else { irqentry_state_t state = irqentry_nmi_enter(regs); do_trap_error(regs, SIGILL, ILL_ILLTRP, regs->epc, "Oops - environment call from U-mode"); irqentry_nmi_exit(regs, state); } } #ifdef CONFIG_MMU asmlinkage __visible noinstr void do_page_fault(struct pt_regs *regs) { irqentry_state_t state = irqentry_enter(regs); handle_page_fault(regs); local_irq_disable(); irqentry_exit(regs, state); } #endif static void noinstr handle_riscv_irq(struct pt_regs *regs) { struct pt_regs *old_regs; irq_enter_rcu(); old_regs = set_irq_regs(regs); handle_arch_irq(regs); set_irq_regs(old_regs); irq_exit_rcu(); } asmlinkage void noinstr do_irq(struct pt_regs *regs) { irqentry_state_t state = irqentry_enter(regs); #ifdef CONFIG_IRQ_STACKS if (on_thread_stack()) { ulong *sp = per_cpu(irq_stack_ptr, smp_processor_id()) + IRQ_STACK_SIZE/sizeof(ulong); __asm__ __volatile( "addi sp, sp, -"RISCV_SZPTR "\n" REG_S" ra, (sp) \n" "addi sp, sp, -"RISCV_SZPTR "\n" REG_S" s0, (sp) \n" "addi s0, sp, 2*"RISCV_SZPTR "\n" "move sp, %[sp] \n" "move a0, %[regs] \n" "call handle_riscv_irq \n" "addi sp, s0, -2*"RISCV_SZPTR"\n" REG_L" s0, (sp) \n" "addi sp, sp, "RISCV_SZPTR "\n" REG_L" ra, (sp) \n" "addi sp, sp, "RISCV_SZPTR "\n" : : [sp] "r" (sp), [regs] "r" (regs) : "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t0", "t1", "t2", "t3", "t4", "t5", "t6", #ifndef CONFIG_FRAME_POINTER "s0", #endif "memory"); } else #endif handle_riscv_irq(regs); irqentry_exit(regs, state); } #ifdef CONFIG_GENERIC_BUG int is_valid_bugaddr(unsigned long pc) { bug_insn_t insn; if (pc < VMALLOC_START) return 0; if (get_kernel_nofault(insn, (bug_insn_t *)pc)) return 0; if ((insn & __INSN_LENGTH_MASK) == __INSN_LENGTH_32) return (insn == __BUG_INSN_32); else return ((insn & __COMPRESSED_INSN_MASK) == __BUG_INSN_16); } #endif /* CONFIG_GENERIC_BUG */ #ifdef CONFIG_VMAP_STACK /* * Extra stack space that allows us to provide panic messages when the kernel * has overflowed its stack. */ static DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack)__aligned(16); /* * A temporary stack for use by handle_kernel_stack_overflow. This is used so * we can call into C code to get the per-hart overflow stack. Usage of this * stack must be protected by spin_shadow_stack. */ long shadow_stack[SHADOW_OVERFLOW_STACK_SIZE/sizeof(long)] __aligned(16); /* * A pseudo spinlock to protect the shadow stack from being used by multiple * harts concurrently. This isn't a real spinlock because the lock side must * be taken without a valid stack and only a single register, it's only taken * while in the process of panicing anyway so the performance and error * checking a proper spinlock gives us doesn't matter. */ unsigned long spin_shadow_stack; asmlinkage unsigned long get_overflow_stack(void) { return (unsigned long)this_cpu_ptr(overflow_stack) + OVERFLOW_STACK_SIZE; } asmlinkage void handle_bad_stack(struct pt_regs *regs) { unsigned long tsk_stk = (unsigned long)current->stack; unsigned long ovf_stk = (unsigned long)this_cpu_ptr(overflow_stack); /* * We're done with the shadow stack by this point, as we're on the * overflow stack. Tell any other concurrent overflowing harts that * they can proceed with panicing by releasing the pseudo-spinlock. * * This pairs with an amoswap.aq in handle_kernel_stack_overflow. */ smp_store_release(&spin_shadow_stack, 0); console_verbose(); pr_emerg("Insufficient stack space to handle exception!\n"); pr_emerg("Task stack: [0x%016lx..0x%016lx]\n", tsk_stk, tsk_stk + THREAD_SIZE); pr_emerg("Overflow stack: [0x%016lx..0x%016lx]\n", ovf_stk, ovf_stk + OVERFLOW_STACK_SIZE); __show_regs(regs); panic("Kernel stack overflow"); for (;;) wait_for_interrupt(); } #endif |
| 42 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 | /************************************************************************** * * Copyright 2006-2008 Tungsten Graphics, Inc., Cedar Park, TX. USA. * Copyright 2016 Intel Corporation * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sub license, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice (including the * next paragraph) shall be included in all copies or substantial portions * of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE * USE OR OTHER DEALINGS IN THE SOFTWARE. * * **************************************************************************/ /* * Authors: * Thomas Hellstrom <thomas-at-tungstengraphics-dot-com> */ #ifndef _DRM_MM_H_ #define _DRM_MM_H_ /* * Generic range manager structs */ #include <linux/bug.h> #include <linux/rbtree.h> #include <linux/limits.h> #include <linux/mm_types.h> #include <linux/list.h> #include <linux/spinlock.h> #ifdef CONFIG_DRM_DEBUG_MM #include <linux/stackdepot.h> #endif #include <linux/types.h> #include <drm/drm_print.h> #ifdef CONFIG_DRM_DEBUG_MM #define DRM_MM_BUG_ON(expr) BUG_ON(expr) #else #define DRM_MM_BUG_ON(expr) BUILD_BUG_ON_INVALID(expr) #endif /** * enum drm_mm_insert_mode - control search and allocation behaviour * * The &struct drm_mm range manager supports finding a suitable modes using * a number of search trees. These trees are oranised by size, by address and * in most recent eviction order. This allows the user to find either the * smallest hole to reuse, the lowest or highest address to reuse, or simply * reuse the most recent eviction that fits. When allocating the &drm_mm_node * from within the hole, the &drm_mm_insert_mode also dictate whether to * allocate the lowest matching address or the highest. */ enum drm_mm_insert_mode { /** * @DRM_MM_INSERT_BEST: * * Search for the smallest hole (within the search range) that fits * the desired node. * * Allocates the node from the bottom of the found hole. */ DRM_MM_INSERT_BEST = 0, /** * @DRM_MM_INSERT_LOW: * * Search for the lowest hole (address closest to 0, within the search * range) that fits the desired node. * * Allocates the node from the bottom of the found hole. */ DRM_MM_INSERT_LOW, /** * @DRM_MM_INSERT_HIGH: * * Search for the highest hole (address closest to U64_MAX, within the * search range) that fits the desired node. * * Allocates the node from the *top* of the found hole. The specified * alignment for the node is applied to the base of the node * (&drm_mm_node.start). */ DRM_MM_INSERT_HIGH, /** * @DRM_MM_INSERT_EVICT: * * Search for the most recently evicted hole (within the search range) * that fits the desired node. This is appropriate for use immediately * after performing an eviction scan (see drm_mm_scan_init()) and * removing the selected nodes to form a hole. * * Allocates the node from the bottom of the found hole. */ DRM_MM_INSERT_EVICT, /** * @DRM_MM_INSERT_ONCE: * * Only check the first hole for suitablity and report -ENOSPC * immediately otherwise, rather than check every hole until a * suitable one is found. Can only be used in conjunction with another * search method such as DRM_MM_INSERT_HIGH or DRM_MM_INSERT_LOW. */ DRM_MM_INSERT_ONCE = BIT(31), /** * @DRM_MM_INSERT_HIGHEST: * * Only check the highest hole (the hole with the largest address) and * insert the node at the top of the hole or report -ENOSPC if * unsuitable. * * Does not search all holes. */ DRM_MM_INSERT_HIGHEST = DRM_MM_INSERT_HIGH | DRM_MM_INSERT_ONCE, /** * @DRM_MM_INSERT_LOWEST: * * Only check the lowest hole (the hole with the smallest address) and * insert the node at the bottom of the hole or report -ENOSPC if * unsuitable. * * Does not search all holes. */ DRM_MM_INSERT_LOWEST = DRM_MM_INSERT_LOW | DRM_MM_INSERT_ONCE, }; /** * struct drm_mm_node - allocated block in the DRM allocator * * This represents an allocated block in a &drm_mm allocator. Except for * pre-reserved nodes inserted using drm_mm_reserve_node() the structure is * entirely opaque and should only be accessed through the provided funcions. * Since allocation of these nodes is entirely handled by the driver they can be * embedded. */ struct drm_mm_node { /** @color: Opaque driver-private tag. */ unsigned long color; /** @start: Start address of the allocated block. */ u64 start; /** @size: Size of the allocated block. */ u64 size; /* private: */ struct drm_mm *mm; struct list_head node_list; struct list_head hole_stack; struct rb_node rb; struct rb_node rb_hole_size; struct rb_node rb_hole_addr; u64 __subtree_last; u64 hole_size; u64 subtree_max_hole; unsigned long flags; #define DRM_MM_NODE_ALLOCATED_BIT 0 #define DRM_MM_NODE_SCANNED_BIT 1 #ifdef CONFIG_DRM_DEBUG_MM depot_stack_handle_t stack; #endif }; /** * struct drm_mm - DRM allocator * * DRM range allocator with a few special functions and features geared towards * managing GPU memory. Except for the @color_adjust callback the structure is * entirely opaque and should only be accessed through the provided functions * and macros. This structure can be embedded into larger driver structures. */ struct drm_mm { /** * @color_adjust: * * Optional driver callback to further apply restrictions on a hole. The * node argument points at the node containing the hole from which the * block would be allocated (see drm_mm_hole_follows() and friends). The * other arguments are the size of the block to be allocated. The driver * can adjust the start and end as needed to e.g. insert guard pages. */ void (*color_adjust)(const struct drm_mm_node *node, unsigned long color, u64 *start, u64 *end); /* private: */ /* List of all memory nodes that immediately precede a free hole. */ struct list_head hole_stack; /* head_node.node_list is the list of all memory nodes, ordered * according to the (increasing) start address of the memory node. */ struct drm_mm_node head_node; /* Keep an interval_tree for fast lookup of drm_mm_nodes by address. */ struct rb_root_cached interval_tree; struct rb_root_cached holes_size; struct rb_root holes_addr; unsigned long scan_active; }; /** * struct drm_mm_scan - DRM allocator eviction roaster data * * This structure tracks data needed for the eviction roaster set up using * drm_mm_scan_init(), and used with drm_mm_scan_add_block() and * drm_mm_scan_remove_block(). The structure is entirely opaque and should only * be accessed through the provided functions and macros. It is meant to be * allocated temporarily by the driver on the stack. */ struct drm_mm_scan { /* private: */ struct drm_mm *mm; u64 size; u64 alignment; u64 remainder_mask; u64 range_start; u64 range_end; u64 hit_start; u64 hit_end; unsigned long color; enum drm_mm_insert_mode mode; }; /** * drm_mm_node_allocated - checks whether a node is allocated * @node: drm_mm_node to check * * Drivers are required to clear a node prior to using it with the * drm_mm range manager. * * Drivers should use this helper for proper encapsulation of drm_mm * internals. * * Returns: * True if the @node is allocated. */ static inline bool drm_mm_node_allocated(const struct drm_mm_node *node) { return test_bit(DRM_MM_NODE_ALLOCATED_BIT, &node->flags); } /** * drm_mm_initialized - checks whether an allocator is initialized * @mm: drm_mm to check * * Drivers should clear the struct drm_mm prior to initialisation if they * want to use this function. * * Drivers should use this helper for proper encapsulation of drm_mm * internals. * * Returns: * True if the @mm is initialized. */ static inline bool drm_mm_initialized(const struct drm_mm *mm) { return READ_ONCE(mm->hole_stack.next); } /** * drm_mm_hole_follows - checks whether a hole follows this node * @node: drm_mm_node to check * * Holes are embedded into the drm_mm using the tail of a drm_mm_node. * If you wish to know whether a hole follows this particular node, * query this function. See also drm_mm_hole_node_start() and * drm_mm_hole_node_end(). * * Returns: * True if a hole follows the @node. */ static inline bool drm_mm_hole_follows(const struct drm_mm_node *node) { return node->hole_size; } static inline u64 __drm_mm_hole_node_start(const struct drm_mm_node *hole_node) { return hole_node->start + hole_node->size; } /** * drm_mm_hole_node_start - computes the start of the hole following @node * @hole_node: drm_mm_node which implicitly tracks the following hole * * This is useful for driver-specific debug dumpers. Otherwise drivers should * not inspect holes themselves. Drivers must check first whether a hole indeed * follows by looking at drm_mm_hole_follows() * * Returns: * Start of the subsequent hole. */ static inline u64 drm_mm_hole_node_start(const struct drm_mm_node *hole_node) { DRM_MM_BUG_ON(!drm_mm_hole_follows(hole_node)); return __drm_mm_hole_node_start(hole_node); } static inline u64 __drm_mm_hole_node_end(const struct drm_mm_node *hole_node) { return list_next_entry(hole_node, node_list)->start; } /** * drm_mm_hole_node_end - computes the end of the hole following @node * @hole_node: drm_mm_node which implicitly tracks the following hole * * This is useful for driver-specific debug dumpers. Otherwise drivers should * not inspect holes themselves. Drivers must check first whether a hole indeed * follows by looking at drm_mm_hole_follows(). * * Returns: * End of the subsequent hole. */ static inline u64 drm_mm_hole_node_end(const struct drm_mm_node *hole_node) { return __drm_mm_hole_node_end(hole_node); } /** * drm_mm_nodes - list of nodes under the drm_mm range manager * @mm: the struct drm_mm range manager * * As the drm_mm range manager hides its node_list deep with its * structure, extracting it looks painful and repetitive. This is * not expected to be used outside of the drm_mm_for_each_node() * macros and similar internal functions. * * Returns: * The node list, may be empty. */ #define drm_mm_nodes(mm) (&(mm)->head_node.node_list) /** * drm_mm_for_each_node - iterator to walk over all allocated nodes * @entry: &struct drm_mm_node to assign to in each iteration step * @mm: &drm_mm allocator to walk * * This iterator walks over all nodes in the range allocator. It is implemented * with list_for_each(), so not save against removal of elements. */ #define drm_mm_for_each_node(entry, mm) \ list_for_each_entry(entry, drm_mm_nodes(mm), node_list) /** * drm_mm_for_each_node_safe - iterator to walk over all allocated nodes * @entry: &struct drm_mm_node to assign to in each iteration step * @next: &struct drm_mm_node to store the next step * @mm: &drm_mm allocator to walk * * This iterator walks over all nodes in the range allocator. It is implemented * with list_for_each_safe(), so save against removal of elements. */ #define drm_mm_for_each_node_safe(entry, next, mm) \ list_for_each_entry_safe(entry, next, drm_mm_nodes(mm), node_list) /** * drm_mm_for_each_hole - iterator to walk over all holes * @pos: &drm_mm_node used internally to track progress * @mm: &drm_mm allocator to walk * @hole_start: ulong variable to assign the hole start to on each iteration * @hole_end: ulong variable to assign the hole end to on each iteration * * This iterator walks over all holes in the range allocator. It is implemented * with list_for_each(), so not save against removal of elements. @entry is used * internally and will not reflect a real drm_mm_node for the very first hole. * Hence users of this iterator may not access it. * * Implementation Note: * We need to inline list_for_each_entry in order to be able to set hole_start * and hole_end on each iteration while keeping the macro sane. */ #define drm_mm_for_each_hole(pos, mm, hole_start, hole_end) \ for (pos = list_first_entry(&(mm)->hole_stack, \ typeof(*pos), hole_stack); \ &pos->hole_stack != &(mm)->hole_stack ? \ hole_start = drm_mm_hole_node_start(pos), \ hole_end = hole_start + pos->hole_size, \ 1 : 0; \ pos = list_next_entry(pos, hole_stack)) /* * Basic range manager support (drm_mm.c) */ int drm_mm_reserve_node(struct drm_mm *mm, struct drm_mm_node *node); int drm_mm_insert_node_in_range(struct drm_mm *mm, struct drm_mm_node *node, u64 size, u64 alignment, unsigned long color, u64 start, u64 end, enum drm_mm_insert_mode mode); /** * drm_mm_insert_node_generic - search for space and insert @node * @mm: drm_mm to allocate from * @node: preallocate node to insert * @size: size of the allocation * @alignment: alignment of the allocation * @color: opaque tag value to use for this node * @mode: fine-tune the allocation search and placement * * This is a simplified version of drm_mm_insert_node_in_range() with no * range restrictions applied. * * The preallocated node must be cleared to 0. * * Returns: * 0 on success, -ENOSPC if there's no suitable hole. */ static inline int drm_mm_insert_node_generic(struct drm_mm *mm, struct drm_mm_node *node, u64 size, u64 alignment, unsigned long color, enum drm_mm_insert_mode mode) { return drm_mm_insert_node_in_range(mm, node, size, alignment, color, 0, U64_MAX, mode); } /** * drm_mm_insert_node - search for space and insert @node * @mm: drm_mm to allocate from * @node: preallocate node to insert * @size: size of the allocation * * This is a simplified version of drm_mm_insert_node_generic() with @color set * to 0. * * The preallocated node must be cleared to 0. * * Returns: * 0 on success, -ENOSPC if there's no suitable hole. */ static inline int drm_mm_insert_node(struct drm_mm *mm, struct drm_mm_node *node, u64 size) { return drm_mm_insert_node_generic(mm, node, size, 0, 0, 0); } void drm_mm_remove_node(struct drm_mm_node *node); void drm_mm_replace_node(struct drm_mm_node *old, struct drm_mm_node *new); void drm_mm_init(struct drm_mm *mm, u64 start, u64 size); void drm_mm_takedown(struct drm_mm *mm); /** * drm_mm_clean - checks whether an allocator is clean * @mm: drm_mm allocator to check * * Returns: * True if the allocator is completely free, false if there's still a node * allocated in it. */ static inline bool drm_mm_clean(const struct drm_mm *mm) { return list_empty(drm_mm_nodes(mm)); } struct drm_mm_node * __drm_mm_interval_first(const struct drm_mm *mm, u64 start, u64 last); /** * drm_mm_for_each_node_in_range - iterator to walk over a range of * allocated nodes * @node__: drm_mm_node structure to assign to in each iteration step * @mm__: drm_mm allocator to walk * @start__: starting offset, the first node will overlap this * @end__: ending offset, the last node will start before this (but may overlap) * * This iterator walks over all nodes in the range allocator that lie * between @start and @end. It is implemented similarly to list_for_each(), * but using the internal interval tree to accelerate the search for the * starting node, and so not safe against removal of elements. It assumes * that @end is within (or is the upper limit of) the drm_mm allocator. * If [@start, @end] are beyond the range of the drm_mm, the iterator may walk * over the special _unallocated_ &drm_mm.head_node, and may even continue * indefinitely. */ #define drm_mm_for_each_node_in_range(node__, mm__, start__, end__) \ for (node__ = __drm_mm_interval_first((mm__), (start__), (end__)-1); \ node__->start < (end__); \ node__ = list_next_entry(node__, node_list)) void drm_mm_scan_init_with_range(struct drm_mm_scan *scan, struct drm_mm *mm, u64 size, u64 alignment, unsigned long color, u64 start, u64 end, enum drm_mm_insert_mode mode); /** * drm_mm_scan_init - initialize lru scanning * @scan: scan state * @mm: drm_mm to scan * @size: size of the allocation * @alignment: alignment of the allocation * @color: opaque tag value to use for the allocation * @mode: fine-tune the allocation search and placement * * This is a simplified version of drm_mm_scan_init_with_range() with no range * restrictions applied. * * This simply sets up the scanning routines with the parameters for the desired * hole. * * Warning: * As long as the scan list is non-empty, no other operations than * adding/removing nodes to/from the scan list are allowed. */ static inline void drm_mm_scan_init(struct drm_mm_scan *scan, struct drm_mm *mm, u64 size, u64 alignment, unsigned long color, enum drm_mm_insert_mode mode) { drm_mm_scan_init_with_range(scan, mm, size, alignment, color, 0, U64_MAX, mode); } bool drm_mm_scan_add_block(struct drm_mm_scan *scan, struct drm_mm_node *node); bool drm_mm_scan_remove_block(struct drm_mm_scan *scan, struct drm_mm_node *node); struct drm_mm_node *drm_mm_scan_color_evict(struct drm_mm_scan *scan); void drm_mm_print(const struct drm_mm *mm, struct drm_printer *p); #endif |
| 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 | // SPDX-License-Identifier: GPL-2.0-or-later /* * GeneSys GL620USB-A based links * Copyright (C) 2001 by Jiun-Jie Huang <huangjj@genesyslogic.com.tw> * Copyright (C) 2001 by Stanislav Brabec <utx@penguin.cz> */ // #define DEBUG // error path messages, extra info // #define VERBOSE // more; success messages #include <linux/module.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/workqueue.h> #include <linux/mii.h> #include <linux/usb.h> #include <linux/usb/usbnet.h> #include <linux/gfp.h> /* * GeneSys GL620USB-A (www.genesyslogic.com.tw) * * ... should partially interop with the Win32 driver for this hardware. * The GeneSys docs imply there's some NDIS issue motivating this framing. * * Some info from GeneSys: * - GL620USB-A is full duplex; GL620USB is only half duplex for bulk. * (Some cables, like the BAFO-100c, use the half duplex version.) * - For the full duplex model, the low bit of the version code says * which side is which ("left/right"). * - For the half duplex type, a control/interrupt handshake settles * the transfer direction. (That's disabled here, partially coded.) * A control URB would block until other side writes an interrupt. * * Original code from Jiun-Jie Huang <huangjj@genesyslogic.com.tw> * and merged into "usbnet" by Stanislav Brabec <utx@penguin.cz>. */ // control msg write command #define GENELINK_CONNECT_WRITE 0xF0 // interrupt pipe index #define GENELINK_INTERRUPT_PIPE 0x03 // interrupt read buffer size #define INTERRUPT_BUFSIZE 0x08 // interrupt pipe interval value #define GENELINK_INTERRUPT_INTERVAL 0x10 // max transmit packet number per transmit #define GL_MAX_TRANSMIT_PACKETS 32 // max packet length #define GL_MAX_PACKET_LEN 1514 // max receive buffer size #define GL_RCV_BUF_SIZE \ (((GL_MAX_PACKET_LEN + 4) * GL_MAX_TRANSMIT_PACKETS) + 4) struct gl_packet { __le32 packet_length; char packet_data[]; }; struct gl_header { __le32 packet_count; struct gl_packet packets; }; static int genelink_rx_fixup(struct usbnet *dev, struct sk_buff *skb) { struct gl_header *header; struct gl_packet *packet; struct sk_buff *gl_skb; u32 size; u32 count; /* This check is no longer done by usbnet */ if (skb->len < dev->net->hard_header_len) return 0; header = (struct gl_header *) skb->data; // get the packet count of the received skb count = le32_to_cpu(header->packet_count); if (count > GL_MAX_TRANSMIT_PACKETS) { netdev_dbg(dev->net, "genelink: invalid received packet count %u\n", count); return 0; } // set the current packet pointer to the first packet packet = &header->packets; // decrement the length for the packet count size 4 bytes skb_pull(skb, 4); while (count > 1) { // get the packet length size = le32_to_cpu(packet->packet_length); // this may be a broken packet if (size > GL_MAX_PACKET_LEN) { netdev_dbg(dev->net, "genelink: invalid rx length %d\n", size); return 0; } // allocate the skb for the individual packet gl_skb = alloc_skb(size, GFP_ATOMIC); if (gl_skb) { // copy the packet data to the new skb skb_put_data(gl_skb, packet->packet_data, size); usbnet_skb_return(dev, gl_skb); } // advance to the next packet packet = (struct gl_packet *)&packet->packet_data[size]; count--; // shift the data pointer to the next gl_packet skb_pull(skb, size + 4); } // skip the packet length field 4 bytes skb_pull(skb, 4); if (skb->len > GL_MAX_PACKET_LEN) { netdev_dbg(dev->net, "genelink: invalid rx length %d\n", skb->len); return 0; } return 1; } static struct sk_buff * genelink_tx_fixup(struct usbnet *dev, struct sk_buff *skb, gfp_t flags) { int padlen; int length = skb->len; int headroom = skb_headroom(skb); int tailroom = skb_tailroom(skb); __le32 *packet_count; __le32 *packet_len; // FIXME: magic numbers, bleech padlen = ((skb->len + (4 + 4*1)) % 64) ? 0 : 1; if ((!skb_cloned(skb)) && ((headroom + tailroom) >= (padlen + (4 + 4*1)))) { if ((headroom < (4 + 4*1)) || (tailroom < padlen)) { skb->data = memmove(skb->head + (4 + 4*1), skb->data, skb->len); skb_set_tail_pointer(skb, skb->len); } } else { struct sk_buff *skb2; skb2 = skb_copy_expand(skb, (4 + 4*1) , padlen, flags); dev_kfree_skb_any(skb); skb = skb2; if (!skb) return NULL; } // attach the packet count to the header packet_count = skb_push(skb, (4 + 4 * 1)); packet_len = packet_count + 1; *packet_count = cpu_to_le32(1); *packet_len = cpu_to_le32(length); // add padding byte if ((skb->len % dev->maxpacket) == 0) skb_put(skb, 1); return skb; } static int genelink_bind(struct usbnet *dev, struct usb_interface *intf) { dev->hard_mtu = GL_RCV_BUF_SIZE; dev->net->hard_header_len += 4; dev->in = usb_rcvbulkpipe(dev->udev, dev->driver_info->in); dev->out = usb_sndbulkpipe(dev->udev, dev->driver_info->out); return 0; } static const struct driver_info genelink_info = { .description = "Genesys GeneLink", .flags = FLAG_POINTTOPOINT | FLAG_FRAMING_GL | FLAG_NO_SETINT, .bind = genelink_bind, .rx_fixup = genelink_rx_fixup, .tx_fixup = genelink_tx_fixup, .in = 1, .out = 2, #ifdef GENELINK_ACK .check_connect =genelink_check_connect, #endif }; static const struct usb_device_id products [] = { { USB_DEVICE(0x05e3, 0x0502), // GL620USB-A .driver_info = (unsigned long) &genelink_info, }, /* NOT: USB_DEVICE(0x05e3, 0x0501), // GL620USB * that's half duplex, not currently supported */ { }, // END }; MODULE_DEVICE_TABLE(usb, products); static struct usb_driver gl620a_driver = { .name = "gl620a", .id_table = products, .probe = usbnet_probe, .disconnect = usbnet_disconnect, .suspend = usbnet_suspend, .resume = usbnet_resume, .disable_hub_initiated_lpm = 1, }; module_usb_driver(gl620a_driver); MODULE_AUTHOR("Jiun-Jie Huang"); MODULE_DESCRIPTION("GL620-USB-A Host-to-Host Link cables"); MODULE_LICENSE("GPL"); |
| 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 | // SPDX-License-Identifier: GPL-2.0-only /* * llc_output.c - LLC minimal output path * * Copyright (c) 1997 by Procom Technology, Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> */ #include <linux/if_arp.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/export.h> #include <net/llc.h> #include <net/llc_pdu.h> /** * llc_mac_hdr_init - fills MAC header fields * @skb: Address of the frame to initialize its MAC header * @sa: The MAC source address * @da: The MAC destination address * * Fills MAC header fields, depending on MAC type. Returns 0, If MAC type * is a valid type and initialization completes correctly 1, otherwise. */ int llc_mac_hdr_init(struct sk_buff *skb, const unsigned char *sa, const unsigned char *da) { int rc = -EINVAL; switch (skb->dev->type) { case ARPHRD_ETHER: case ARPHRD_LOOPBACK: rc = dev_hard_header(skb, skb->dev, ETH_P_802_2, da, sa, skb->len); if (rc > 0) rc = 0; break; default: break; } return rc; } /** * llc_build_and_send_ui_pkt - unitdata request interface for upper layers * @sap: sap to use * @skb: packet to send * @dmac: destination mac address * @dsap: destination sap * * Upper layers calls this function when upper layer wants to send data * using connection-less mode communication (UI pdu). * * Accept data frame from network layer to be sent using connection- * less mode communication; timeout/retries handled by network layer; * package primitive as an event and send to SAP event handler */ int llc_build_and_send_ui_pkt(struct llc_sap *sap, struct sk_buff *skb, const unsigned char *dmac, unsigned char dsap) { int rc; llc_pdu_header_init(skb, LLC_PDU_TYPE_U, sap->laddr.lsap, dsap, LLC_PDU_CMD); llc_pdu_init_as_ui_cmd(skb); rc = llc_mac_hdr_init(skb, skb->dev->dev_addr, dmac); if (likely(!rc)) rc = dev_queue_xmit(skb); else kfree_skb(skb); return rc; } EXPORT_SYMBOL(llc_mac_hdr_init); EXPORT_SYMBOL(llc_build_and_send_ui_pkt); |
| 43 43 43 14 40 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2012 Red Hat */ #include <linux/module.h> #include <drm/drm_drv.h> #include <drm/drm_fbdev_generic.h> #include <drm/drm_file.h> #include <drm/drm_gem_shmem_helper.h> #include <drm/drm_managed.h> #include <drm/drm_modeset_helper.h> #include <drm/drm_ioctl.h> #include <drm/drm_probe_helper.h> #include <drm/drm_print.h> #include "udl_drv.h" static int udl_usb_suspend(struct usb_interface *interface, pm_message_t message) { struct drm_device *dev = usb_get_intfdata(interface); int ret; ret = drm_mode_config_helper_suspend(dev); if (ret) return ret; udl_sync_pending_urbs(dev); return 0; } static int udl_usb_resume(struct usb_interface *interface) { struct drm_device *dev = usb_get_intfdata(interface); return drm_mode_config_helper_resume(dev); } static int udl_usb_reset_resume(struct usb_interface *interface) { struct drm_device *dev = usb_get_intfdata(interface); struct udl_device *udl = to_udl(dev); udl_select_std_channel(udl); return drm_mode_config_helper_resume(dev); } /* * FIXME: Dma-buf sharing requires DMA support by the importing device. * This function is a workaround to make USB devices work as well. * See todo.rst for how to fix the issue in the dma-buf framework. */ static struct drm_gem_object *udl_driver_gem_prime_import(struct drm_device *dev, struct dma_buf *dma_buf) { struct udl_device *udl = to_udl(dev); if (!udl->dmadev) return ERR_PTR(-ENODEV); return drm_gem_prime_import_dev(dev, dma_buf, udl->dmadev); } DEFINE_DRM_GEM_FOPS(udl_driver_fops); static const struct drm_driver driver = { .driver_features = DRIVER_ATOMIC | DRIVER_GEM | DRIVER_MODESET, /* GEM hooks */ .fops = &udl_driver_fops, DRM_GEM_SHMEM_DRIVER_OPS, .gem_prime_import = udl_driver_gem_prime_import, .name = DRIVER_NAME, .desc = DRIVER_DESC, .date = DRIVER_DATE, .major = DRIVER_MAJOR, .minor = DRIVER_MINOR, .patchlevel = DRIVER_PATCHLEVEL, }; static struct udl_device *udl_driver_create(struct usb_interface *interface) { struct udl_device *udl; int r; udl = devm_drm_dev_alloc(&interface->dev, &driver, struct udl_device, drm); if (IS_ERR(udl)) return udl; r = udl_init(udl); if (r) return ERR_PTR(r); usb_set_intfdata(interface, udl); return udl; } static int udl_usb_probe(struct usb_interface *interface, const struct usb_device_id *id) { int r; struct udl_device *udl; udl = udl_driver_create(interface); if (IS_ERR(udl)) return PTR_ERR(udl); r = drm_dev_register(&udl->drm, 0); if (r) return r; DRM_INFO("Initialized udl on minor %d\n", udl->drm.primary->index); drm_fbdev_generic_setup(&udl->drm, 0); return 0; } static void udl_usb_disconnect(struct usb_interface *interface) { struct drm_device *dev = usb_get_intfdata(interface); drm_kms_helper_poll_fini(dev); udl_drop_usb(dev); drm_dev_unplug(dev); } /* * There are many DisplayLink-based graphics products, all with unique PIDs. * So we match on DisplayLink's VID + Vendor-Defined Interface Class (0xff) * We also require a match on SubClass (0x00) and Protocol (0x00), * which is compatible with all known USB 2.0 era graphics chips and firmware, * but allows DisplayLink to increment those for any future incompatible chips */ static const struct usb_device_id id_table[] = { {.idVendor = 0x17e9, .bInterfaceClass = 0xff, .bInterfaceSubClass = 0x00, .bInterfaceProtocol = 0x00, .match_flags = USB_DEVICE_ID_MATCH_VENDOR | USB_DEVICE_ID_MATCH_INT_CLASS | USB_DEVICE_ID_MATCH_INT_SUBCLASS | USB_DEVICE_ID_MATCH_INT_PROTOCOL,}, {}, }; MODULE_DEVICE_TABLE(usb, id_table); static struct usb_driver udl_driver = { .name = "udl", .probe = udl_usb_probe, .disconnect = udl_usb_disconnect, .suspend = udl_usb_suspend, .resume = udl_usb_resume, .reset_resume = udl_usb_reset_resume, .id_table = id_table, }; module_usb_driver(udl_driver); MODULE_LICENSE("GPL"); |
| 42 42 42 19 19 42 29 30 30 27 29 5 3 1 5 3 3 4 2 3 1 1 1 30 26 19 19 12 5 12 7 2 7 19 4 19 19 19 3 1 2 2 3 1 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 | // SPDX-License-Identifier: GPL-2.0-or-later /* * ASIX AX8817X based USB 2.0 Ethernet Devices * Copyright (C) 2003-2006 David Hollis <dhollis@davehollis.com> * Copyright (C) 2005 Phil Chang <pchang23@sbcglobal.net> * Copyright (C) 2006 James Painter <jamie.painter@iname.com> * Copyright (c) 2002-2003 TiVo Inc. */ #include "asix.h" #define AX_HOST_EN_RETRIES 30 int __must_check asix_read_cmd(struct usbnet *dev, u8 cmd, u16 value, u16 index, u16 size, void *data, int in_pm) { int ret; int (*fn)(struct usbnet *, u8, u8, u16, u16, void *, u16); BUG_ON(!dev); if (!in_pm) fn = usbnet_read_cmd; else fn = usbnet_read_cmd_nopm; ret = fn(dev, cmd, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, value, index, data, size); if (unlikely(ret < size)) { ret = ret < 0 ? ret : -ENODATA; netdev_warn(dev->net, "Failed to read reg index 0x%04x: %d\n", index, ret); } return ret; } int asix_write_cmd(struct usbnet *dev, u8 cmd, u16 value, u16 index, u16 size, void *data, int in_pm) { int ret; int (*fn)(struct usbnet *, u8, u8, u16, u16, const void *, u16); BUG_ON(!dev); if (!in_pm) fn = usbnet_write_cmd; else fn = usbnet_write_cmd_nopm; ret = fn(dev, cmd, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, value, index, data, size); if (unlikely(ret < 0)) netdev_warn(dev->net, "Failed to write reg index 0x%04x: %d\n", index, ret); return ret; } void asix_write_cmd_async(struct usbnet *dev, u8 cmd, u16 value, u16 index, u16 size, void *data) { usbnet_write_cmd_async(dev, cmd, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, value, index, data, size); } static int asix_set_sw_mii(struct usbnet *dev, int in_pm) { int ret; ret = asix_write_cmd(dev, AX_CMD_SET_SW_MII, 0x0000, 0, 0, NULL, in_pm); if (ret < 0) netdev_err(dev->net, "Failed to enable software MII access\n"); return ret; } static int asix_set_hw_mii(struct usbnet *dev, int in_pm) { int ret; ret = asix_write_cmd(dev, AX_CMD_SET_HW_MII, 0x0000, 0, 0, NULL, in_pm); if (ret < 0) netdev_err(dev->net, "Failed to enable hardware MII access\n"); return ret; } static int asix_check_host_enable(struct usbnet *dev, int in_pm) { int i, ret; u8 smsr; for (i = 0; i < AX_HOST_EN_RETRIES; ++i) { ret = asix_set_sw_mii(dev, in_pm); if (ret == -ENODEV || ret == -ETIMEDOUT) break; usleep_range(1000, 1100); ret = asix_read_cmd(dev, AX_CMD_STATMNGSTS_REG, 0, 0, 1, &smsr, in_pm); if (ret == -ENODEV) break; else if (ret < 0) continue; else if (smsr & AX_HOST_EN) break; } return i >= AX_HOST_EN_RETRIES ? -ETIMEDOUT : ret; } static void reset_asix_rx_fixup_info(struct asix_rx_fixup_info *rx) { /* Reset the variables that have a lifetime outside of * asix_rx_fixup_internal() so that future processing starts from a * known set of initial conditions. */ if (rx->ax_skb) { /* Discard any incomplete Ethernet frame in the netdev buffer */ kfree_skb(rx->ax_skb); rx->ax_skb = NULL; } /* Assume the Data header 32-bit word is at the start of the current * or next URB socket buffer so reset all the state variables. */ rx->remaining = 0; rx->split_head = false; rx->header = 0; } int asix_rx_fixup_internal(struct usbnet *dev, struct sk_buff *skb, struct asix_rx_fixup_info *rx) { int offset = 0; u16 size; /* When an Ethernet frame spans multiple URB socket buffers, * do a sanity test for the Data header synchronisation. * Attempt to detect the situation of the previous socket buffer having * been truncated or a socket buffer was missing. These situations * cause a discontinuity in the data stream and therefore need to avoid * appending bad data to the end of the current netdev socket buffer. * Also avoid unnecessarily discarding a good current netdev socket * buffer. */ if (rx->remaining && (rx->remaining + sizeof(u32) <= skb->len)) { offset = ((rx->remaining + 1) & 0xfffe); rx->header = get_unaligned_le32(skb->data + offset); offset = 0; size = (u16)(rx->header & 0x7ff); if (size != ((~rx->header >> 16) & 0x7ff)) { netdev_err(dev->net, "asix_rx_fixup() Data Header synchronisation was lost, remaining %d\n", rx->remaining); reset_asix_rx_fixup_info(rx); } } while (offset + sizeof(u16) <= skb->len) { u16 copy_length; if (!rx->remaining) { if (skb->len - offset == sizeof(u16)) { rx->header = get_unaligned_le16( skb->data + offset); rx->split_head = true; offset += sizeof(u16); break; } if (rx->split_head == true) { rx->header |= (get_unaligned_le16( skb->data + offset) << 16); rx->split_head = false; offset += sizeof(u16); } else { rx->header = get_unaligned_le32(skb->data + offset); offset += sizeof(u32); } /* take frame length from Data header 32-bit word */ size = (u16)(rx->header & 0x7ff); if (size != ((~rx->header >> 16) & 0x7ff)) { netdev_err(dev->net, "asix_rx_fixup() Bad Header Length 0x%x, offset %d\n", rx->header, offset); reset_asix_rx_fixup_info(rx); return 0; } if (size > dev->net->mtu + ETH_HLEN + VLAN_HLEN) { netdev_dbg(dev->net, "asix_rx_fixup() Bad RX Length %d\n", size); reset_asix_rx_fixup_info(rx); return 0; } /* Sometimes may fail to get a netdev socket buffer but * continue to process the URB socket buffer so that * synchronisation of the Ethernet frame Data header * word is maintained. */ rx->ax_skb = netdev_alloc_skb_ip_align(dev->net, size); rx->remaining = size; } if (rx->remaining > skb->len - offset) { copy_length = skb->len - offset; rx->remaining -= copy_length; } else { copy_length = rx->remaining; rx->remaining = 0; } if (rx->ax_skb) { skb_put_data(rx->ax_skb, skb->data + offset, copy_length); if (!rx->remaining) { usbnet_skb_return(dev, rx->ax_skb); rx->ax_skb = NULL; } } offset += (copy_length + 1) & 0xfffe; } if (skb->len != offset) { netdev_err(dev->net, "asix_rx_fixup() Bad SKB Length %d, %d\n", skb->len, offset); reset_asix_rx_fixup_info(rx); return 0; } return 1; } int asix_rx_fixup_common(struct usbnet *dev, struct sk_buff *skb) { struct asix_common_private *dp = dev->driver_priv; struct asix_rx_fixup_info *rx = &dp->rx_fixup_info; return asix_rx_fixup_internal(dev, skb, rx); } void asix_rx_fixup_common_free(struct asix_common_private *dp) { struct asix_rx_fixup_info *rx; if (!dp) return; rx = &dp->rx_fixup_info; if (rx->ax_skb) { kfree_skb(rx->ax_skb); rx->ax_skb = NULL; } } struct sk_buff *asix_tx_fixup(struct usbnet *dev, struct sk_buff *skb, gfp_t flags) { int padlen; int headroom = skb_headroom(skb); int tailroom = skb_tailroom(skb); u32 packet_len; u32 padbytes = 0xffff0000; void *ptr; padlen = ((skb->len + 4) & (dev->maxpacket - 1)) ? 0 : 4; /* We need to push 4 bytes in front of frame (packet_len) * and maybe add 4 bytes after the end (if padlen is 4) * * Avoid skb_copy_expand() expensive call, using following rules : * - We are allowed to push 4 bytes in headroom if skb_header_cloned() * is false (and if we have 4 bytes of headroom) * - We are allowed to put 4 bytes at tail if skb_cloned() * is false (and if we have 4 bytes of tailroom) * * TCP packets for example are cloned, but __skb_header_release() * was called in tcp stack, allowing us to use headroom for our needs. */ if (!skb_header_cloned(skb) && !(padlen && skb_cloned(skb)) && headroom + tailroom >= 4 + padlen) { /* following should not happen, but better be safe */ if (headroom < 4 || tailroom < padlen) { skb->data = memmove(skb->head + 4, skb->data, skb->len); skb_set_tail_pointer(skb, skb->len); } } else { struct sk_buff *skb2; skb2 = skb_copy_expand(skb, 4, padlen, flags); dev_kfree_skb_any(skb); skb = skb2; if (!skb) return NULL; } packet_len = ((skb->len ^ 0x0000ffff) << 16) + skb->len; ptr = skb_push(skb, 4); put_unaligned_le32(packet_len, ptr); if (padlen) { put_unaligned_le32(padbytes, skb_tail_pointer(skb)); skb_put(skb, sizeof(padbytes)); } usbnet_set_skb_tx_stats(skb, 1, 0); return skb; } int asix_read_phy_addr(struct usbnet *dev, bool internal) { int ret, offset; u8 buf[2]; ret = asix_read_cmd(dev, AX_CMD_READ_PHY_ID, 0, 0, 2, buf, 0); if (ret < 0) goto error; if (ret < 2) { ret = -EIO; goto error; } offset = (internal ? 1 : 0); ret = buf[offset]; netdev_dbg(dev->net, "%s PHY address 0x%x\n", internal ? "internal" : "external", ret); return ret; error: netdev_err(dev->net, "Error reading PHY_ID register: %02x\n", ret); return ret; } int asix_sw_reset(struct usbnet *dev, u8 flags, int in_pm) { int ret; ret = asix_write_cmd(dev, AX_CMD_SW_RESET, flags, 0, 0, NULL, in_pm); if (ret < 0) netdev_err(dev->net, "Failed to send software reset: %02x\n", ret); return ret; } u16 asix_read_rx_ctl(struct usbnet *dev, int in_pm) { __le16 v; int ret = asix_read_cmd(dev, AX_CMD_READ_RX_CTL, 0, 0, 2, &v, in_pm); if (ret < 0) { netdev_err(dev->net, "Error reading RX_CTL register: %02x\n", ret); goto out; } ret = le16_to_cpu(v); out: return ret; } int asix_write_rx_ctl(struct usbnet *dev, u16 mode, int in_pm) { int ret; netdev_dbg(dev->net, "asix_write_rx_ctl() - mode = 0x%04x\n", mode); ret = asix_write_cmd(dev, AX_CMD_WRITE_RX_CTL, mode, 0, 0, NULL, in_pm); if (ret < 0) netdev_err(dev->net, "Failed to write RX_CTL mode to 0x%04x: %02x\n", mode, ret); return ret; } u16 asix_read_medium_status(struct usbnet *dev, int in_pm) { __le16 v; int ret = asix_read_cmd(dev, AX_CMD_READ_MEDIUM_STATUS, 0, 0, 2, &v, in_pm); if (ret < 0) { netdev_err(dev->net, "Error reading Medium Status register: %02x\n", ret); return ret; /* TODO: callers not checking for error ret */ } return le16_to_cpu(v); } int asix_write_medium_mode(struct usbnet *dev, u16 mode, int in_pm) { int ret; netdev_dbg(dev->net, "asix_write_medium_mode() - mode = 0x%04x\n", mode); ret = asix_write_cmd(dev, AX_CMD_WRITE_MEDIUM_MODE, mode, 0, 0, NULL, in_pm); if (ret < 0) netdev_err(dev->net, "Failed to write Medium Mode mode to 0x%04x: %02x\n", mode, ret); return ret; } /* set MAC link settings according to information from phylib */ void asix_adjust_link(struct net_device *netdev) { struct phy_device *phydev = netdev->phydev; struct usbnet *dev = netdev_priv(netdev); u16 mode = 0; if (phydev->link) { mode = AX88772_MEDIUM_DEFAULT; if (phydev->duplex == DUPLEX_HALF) mode &= ~AX_MEDIUM_FD; if (phydev->speed != SPEED_100) mode &= ~AX_MEDIUM_PS; } asix_write_medium_mode(dev, mode, 0); phy_print_status(phydev); usbnet_link_change(dev, phydev->link, 0); } int asix_write_gpio(struct usbnet *dev, u16 value, int sleep, int in_pm) { int ret; netdev_dbg(dev->net, "asix_write_gpio() - value = 0x%04x\n", value); ret = asix_write_cmd(dev, AX_CMD_WRITE_GPIOS, value, 0, 0, NULL, in_pm); if (ret < 0) netdev_err(dev->net, "Failed to write GPIO value 0x%04x: %02x\n", value, ret); if (sleep) msleep(sleep); return ret; } /* * AX88772 & AX88178 have a 16-bit RX_CTL value */ void asix_set_multicast(struct net_device *net) { struct usbnet *dev = netdev_priv(net); struct asix_data *data = (struct asix_data *)&dev->data; u16 rx_ctl = AX_DEFAULT_RX_CTL; if (net->flags & IFF_PROMISC) { rx_ctl |= AX_RX_CTL_PRO; } else if (net->flags & IFF_ALLMULTI || netdev_mc_count(net) > AX_MAX_MCAST) { rx_ctl |= AX_RX_CTL_AMALL; } else if (netdev_mc_empty(net)) { /* just broadcast and directed */ } else { /* We use the 20 byte dev->data * for our 8 byte filter buffer * to avoid allocating memory that * is tricky to free later */ struct netdev_hw_addr *ha; u32 crc_bits; memset(data->multi_filter, 0, AX_MCAST_FILTER_SIZE); /* Build the multicast hash filter. */ netdev_for_each_mc_addr(ha, net) { crc_bits = ether_crc(ETH_ALEN, ha->addr) >> 26; data->multi_filter[crc_bits >> 3] |= 1 << (crc_bits & 7); } asix_write_cmd_async(dev, AX_CMD_WRITE_MULTI_FILTER, 0, 0, AX_MCAST_FILTER_SIZE, data->multi_filter); rx_ctl |= AX_RX_CTL_AM; } asix_write_cmd_async(dev, AX_CMD_WRITE_RX_CTL, rx_ctl, 0, 0, NULL); } static int __asix_mdio_read(struct net_device *netdev, int phy_id, int loc, bool in_pm) { struct usbnet *dev = netdev_priv(netdev); __le16 res; int ret; mutex_lock(&dev->phy_mutex); ret = asix_check_host_enable(dev, in_pm); if (ret == -ENODEV || ret == -ETIMEDOUT) { mutex_unlock(&dev->phy_mutex); return ret; } ret = asix_read_cmd(dev, AX_CMD_READ_MII_REG, phy_id, (__u16)loc, 2, &res, in_pm); if (ret < 0) goto out; ret = asix_set_hw_mii(dev, in_pm); out: mutex_unlock(&dev->phy_mutex); netdev_dbg(dev->net, "asix_mdio_read() phy_id=0x%02x, loc=0x%02x, returns=0x%04x\n", phy_id, loc, le16_to_cpu(res)); return ret < 0 ? ret : le16_to_cpu(res); } int asix_mdio_read(struct net_device *netdev, int phy_id, int loc) { return __asix_mdio_read(netdev, phy_id, loc, false); } static int __asix_mdio_write(struct net_device *netdev, int phy_id, int loc, int val, bool in_pm) { struct usbnet *dev = netdev_priv(netdev); __le16 res = cpu_to_le16(val); int ret; netdev_dbg(dev->net, "asix_mdio_write() phy_id=0x%02x, loc=0x%02x, val=0x%04x\n", phy_id, loc, val); mutex_lock(&dev->phy_mutex); ret = asix_check_host_enable(dev, in_pm); if (ret == -ENODEV) goto out; ret = asix_write_cmd(dev, AX_CMD_WRITE_MII_REG, phy_id, (__u16)loc, 2, &res, in_pm); if (ret < 0) goto out; ret = asix_set_hw_mii(dev, in_pm); out: mutex_unlock(&dev->phy_mutex); return ret < 0 ? ret : 0; } void asix_mdio_write(struct net_device *netdev, int phy_id, int loc, int val) { __asix_mdio_write(netdev, phy_id, loc, val, false); } /* MDIO read and write wrappers for phylib */ int asix_mdio_bus_read(struct mii_bus *bus, int phy_id, int regnum) { struct usbnet *priv = bus->priv; return __asix_mdio_read(priv->net, phy_id, regnum, false); } int asix_mdio_bus_write(struct mii_bus *bus, int phy_id, int regnum, u16 val) { struct usbnet *priv = bus->priv; return __asix_mdio_write(priv->net, phy_id, regnum, val, false); } int asix_mdio_read_nopm(struct net_device *netdev, int phy_id, int loc) { return __asix_mdio_read(netdev, phy_id, loc, true); } void asix_mdio_write_nopm(struct net_device *netdev, int phy_id, int loc, int val) { __asix_mdio_write(netdev, phy_id, loc, val, true); } void asix_get_wol(struct net_device *net, struct ethtool_wolinfo *wolinfo) { struct usbnet *dev = netdev_priv(net); u8 opt; if (asix_read_cmd(dev, AX_CMD_READ_MONITOR_MODE, 0, 0, 1, &opt, 0) < 0) { wolinfo->supported = 0; wolinfo->wolopts = 0; return; } wolinfo->supported = WAKE_PHY | WAKE_MAGIC; wolinfo->wolopts = 0; if (opt & AX_MONITOR_LINK) wolinfo->wolopts |= WAKE_PHY; if (opt & AX_MONITOR_MAGIC) wolinfo->wolopts |= WAKE_MAGIC; } int asix_set_wol(struct net_device *net, struct ethtool_wolinfo *wolinfo) { struct usbnet *dev = netdev_priv(net); u8 opt = 0; if (wolinfo->wolopts & ~(WAKE_PHY | WAKE_MAGIC)) return -EINVAL; if (wolinfo->wolopts & WAKE_PHY) opt |= AX_MONITOR_LINK; if (wolinfo->wolopts & WAKE_MAGIC) opt |= AX_MONITOR_MAGIC; if (asix_write_cmd(dev, AX_CMD_WRITE_MONITOR_MODE, opt, 0, 0, NULL, 0) < 0) return -EINVAL; return 0; } int asix_get_eeprom_len(struct net_device *net) { return AX_EEPROM_LEN; } int asix_get_eeprom(struct net_device *net, struct ethtool_eeprom *eeprom, u8 *data) { struct usbnet *dev = netdev_priv(net); u16 *eeprom_buff; int first_word, last_word; int i; if (eeprom->len == 0) return -EINVAL; eeprom->magic = AX_EEPROM_MAGIC; first_word = eeprom->offset >> 1; last_word = (eeprom->offset + eeprom->len - 1) >> 1; eeprom_buff = kmalloc_array(last_word - first_word + 1, sizeof(u16), GFP_KERNEL); if (!eeprom_buff) return -ENOMEM; /* ax8817x returns 2 bytes from eeprom on read */ for (i = first_word; i <= last_word; i++) { if (asix_read_cmd(dev, AX_CMD_READ_EEPROM, i, 0, 2, &eeprom_buff[i - first_word], 0) < 0) { kfree(eeprom_buff); return -EIO; } } memcpy(data, (u8 *)eeprom_buff + (eeprom->offset & 1), eeprom->len); kfree(eeprom_buff); return 0; } int asix_set_eeprom(struct net_device *net, struct ethtool_eeprom *eeprom, u8 *data) { struct usbnet *dev = netdev_priv(net); u16 *eeprom_buff; int first_word, last_word; int i; int ret; netdev_dbg(net, "write EEPROM len %d, offset %d, magic 0x%x\n", eeprom->len, eeprom->offset, eeprom->magic); if (eeprom->len == 0) return -EINVAL; if (eeprom->magic != AX_EEPROM_MAGIC) return -EINVAL; first_word = eeprom->offset >> 1; last_word = (eeprom->offset + eeprom->len - 1) >> 1; eeprom_buff = kmalloc_array(last_word - first_word + 1, sizeof(u16), GFP_KERNEL); if (!eeprom_buff) return -ENOMEM; /* align data to 16 bit boundaries, read the missing data from the EEPROM */ if (eeprom->offset & 1) { ret = asix_read_cmd(dev, AX_CMD_READ_EEPROM, first_word, 0, 2, &eeprom_buff[0], 0); if (ret < 0) { netdev_err(net, "Failed to read EEPROM at offset 0x%02x.\n", first_word); goto free; } } if ((eeprom->offset + eeprom->len) & 1) { ret = asix_read_cmd(dev, AX_CMD_READ_EEPROM, last_word, 0, 2, &eeprom_buff[last_word - first_word], 0); if (ret < 0) { netdev_err(net, "Failed to read EEPROM at offset 0x%02x.\n", last_word); goto free; } } memcpy((u8 *)eeprom_buff + (eeprom->offset & 1), data, eeprom->len); /* write data to EEPROM */ ret = asix_write_cmd(dev, AX_CMD_WRITE_ENABLE, 0x0000, 0, 0, NULL, 0); if (ret < 0) { netdev_err(net, "Failed to enable EEPROM write\n"); goto free; } msleep(20); for (i = first_word; i <= last_word; i++) { netdev_dbg(net, "write to EEPROM at offset 0x%02x, data 0x%04x\n", i, eeprom_buff[i - first_word]); ret = asix_write_cmd(dev, AX_CMD_WRITE_EEPROM, i, eeprom_buff[i - first_word], 0, NULL, 0); if (ret < 0) { netdev_err(net, "Failed to write EEPROM at offset 0x%02x.\n", i); goto free; } msleep(20); } ret = asix_write_cmd(dev, AX_CMD_WRITE_DISABLE, 0x0000, 0, 0, NULL, 0); if (ret < 0) { netdev_err(net, "Failed to disable EEPROM write\n"); goto free; } ret = 0; free: kfree(eeprom_buff); return ret; } void asix_get_drvinfo(struct net_device *net, struct ethtool_drvinfo *info) { /* Inherit standard device info */ usbnet_get_drvinfo(net, info); strscpy(info->driver, DRIVER_NAME, sizeof(info->driver)); strscpy(info->version, DRIVER_VERSION, sizeof(info->version)); } int asix_set_mac_address(struct net_device *net, void *p) { struct usbnet *dev = netdev_priv(net); struct asix_data *data = (struct asix_data *)&dev->data; struct sockaddr *addr = p; if (netif_running(net)) return -EBUSY; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; eth_hw_addr_set(net, addr->sa_data); /* We use the 20 byte dev->data * for our 6 byte mac buffer * to avoid allocating memory that * is tricky to free later */ memcpy(data->mac_addr, addr->sa_data, ETH_ALEN); asix_write_cmd_async(dev, AX_CMD_WRITE_NODE_ID, 0, 0, ETH_ALEN, data->mac_addr); return 0; } |
| 2 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 | // SPDX-License-Identifier: GPL-2.0-or-later /* * HID driver for Retrode 2 controller adapter and plug-in extensions * * Copyright (c) 2017 Bastien Nocera <hadess@hadess.net> */ /* */ #include <linux/input.h> #include <linux/slab.h> #include <linux/hid.h> #include <linux/module.h> #include "hid-ids.h" #define CONTROLLER_NAME_BASE "Retrode" static int retrode_input_configured(struct hid_device *hdev, struct hid_input *hi) { struct hid_field *field = hi->report->field[0]; const char *suffix; int number = 0; char *name; switch (field->report->id) { case 0: suffix = "SNES Mouse"; break; case 1: case 2: suffix = "SNES / N64"; number = field->report->id; break; case 3: case 4: suffix = "Mega Drive"; number = field->report->id - 2; break; default: hid_err(hdev, "Got unhandled report id %d\n", field->report->id); suffix = "Unknown"; } if (number) name = devm_kasprintf(&hdev->dev, GFP_KERNEL, "%s %s #%d", CONTROLLER_NAME_BASE, suffix, number); else name = devm_kasprintf(&hdev->dev, GFP_KERNEL, "%s %s", CONTROLLER_NAME_BASE, suffix); if (!name) return -ENOMEM; hi->input->name = name; return 0; } static int retrode_probe(struct hid_device *hdev, const struct hid_device_id *id) { int ret; /* Has no effect on the mouse device */ hdev->quirks |= HID_QUIRK_MULTI_INPUT; ret = hid_parse(hdev); if (ret) return ret; ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT); if (ret) return ret; return 0; } static const struct hid_device_id retrode_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_FUTURE_TECHNOLOGY, USB_DEVICE_ID_RETRODE2) }, { } }; MODULE_DEVICE_TABLE(hid, retrode_devices); static struct hid_driver retrode_driver = { .name = "hid-retrode", .id_table = retrode_devices, .input_configured = retrode_input_configured, .probe = retrode_probe, }; module_hid_driver(retrode_driver); MODULE_LICENSE("GPL"); |
| 138 138 138 138 138 138 138 138 138 138 138 138 138 138 138 138 138 138 138 138 138 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 | // SPDX-License-Identifier: GPL-2.0-only /* * AppArmor security module * * This file contains AppArmor label definitions * * Copyright 2017 Canonical Ltd. */ #include <linux/audit.h> #include <linux/seq_file.h> #include <linux/sort.h> #include "include/apparmor.h" #include "include/cred.h" #include "include/label.h" #include "include/policy.h" #include "include/secid.h" /* * the aa_label represents the set of profiles confining an object * * Labels maintain a reference count to the set of pointers they reference * Labels are ref counted by * tasks and object via the security field/security context off the field * code - will take a ref count on a label if it needs the label * beyond what is possible with an rcu_read_lock. * profiles - each profile is a label * secids - a pinned secid will keep a refcount of the label it is * referencing * objects - inode, files, sockets, ... * * Labels are not ref counted by the label set, so they maybe removed and * freed when no longer in use. * */ #define PROXY_POISON 97 #define LABEL_POISON 100 static void free_proxy(struct aa_proxy *proxy) { if (proxy) { /* p->label will not updated any more as p is dead */ aa_put_label(rcu_dereference_protected(proxy->label, true)); memset(proxy, 0, sizeof(*proxy)); RCU_INIT_POINTER(proxy->label, (struct aa_label *)PROXY_POISON); kfree(proxy); } } void aa_proxy_kref(struct kref *kref) { struct aa_proxy *proxy = container_of(kref, struct aa_proxy, count); free_proxy(proxy); } struct aa_proxy *aa_alloc_proxy(struct aa_label *label, gfp_t gfp) { struct aa_proxy *new; new = kzalloc(sizeof(struct aa_proxy), gfp); if (new) { kref_init(&new->count); rcu_assign_pointer(new->label, aa_get_label(label)); } return new; } /* requires profile list write lock held */ void __aa_proxy_redirect(struct aa_label *orig, struct aa_label *new) { struct aa_label *tmp; AA_BUG(!orig); AA_BUG(!new); lockdep_assert_held_write(&labels_set(orig)->lock); tmp = rcu_dereference_protected(orig->proxy->label, &labels_ns(orig)->lock); rcu_assign_pointer(orig->proxy->label, aa_get_label(new)); orig->flags |= FLAG_STALE; aa_put_label(tmp); } static void __proxy_share(struct aa_label *old, struct aa_label *new) { struct aa_proxy *proxy = new->proxy; new->proxy = aa_get_proxy(old->proxy); __aa_proxy_redirect(old, new); aa_put_proxy(proxy); } /** * ns_cmp - compare ns for label set ordering * @a: ns to compare (NOT NULL) * @b: ns to compare (NOT NULL) * * Returns: <0 if a < b * ==0 if a == b * >0 if a > b */ static int ns_cmp(struct aa_ns *a, struct aa_ns *b) { int res; AA_BUG(!a); AA_BUG(!b); AA_BUG(!a->base.hname); AA_BUG(!b->base.hname); if (a == b) return 0; res = a->level - b->level; if (res) return res; return strcmp(a->base.hname, b->base.hname); } /** * profile_cmp - profile comparison for set ordering * @a: profile to compare (NOT NULL) * @b: profile to compare (NOT NULL) * * Returns: <0 if a < b * ==0 if a == b * >0 if a > b */ static int profile_cmp(struct aa_profile *a, struct aa_profile *b) { int res; AA_BUG(!a); AA_BUG(!b); AA_BUG(!a->ns); AA_BUG(!b->ns); AA_BUG(!a->base.hname); AA_BUG(!b->base.hname); if (a == b || a->base.hname == b->base.hname) return 0; res = ns_cmp(a->ns, b->ns); if (res) return res; return strcmp(a->base.hname, b->base.hname); } /** * vec_cmp - label comparison for set ordering * @a: label to compare (NOT NULL) * @vec: vector of profiles to compare (NOT NULL) * @n: length of @vec * * Returns: <0 if a < vec * ==0 if a == vec * >0 if a > vec */ static int vec_cmp(struct aa_profile **a, int an, struct aa_profile **b, int bn) { int i; AA_BUG(!a); AA_BUG(!*a); AA_BUG(!b); AA_BUG(!*b); AA_BUG(an <= 0); AA_BUG(bn <= 0); for (i = 0; i < an && i < bn; i++) { int res = profile_cmp(a[i], b[i]); if (res != 0) return res; } return an - bn; } static bool vec_is_stale(struct aa_profile **vec, int n) { int i; AA_BUG(!vec); for (i = 0; i < n; i++) { if (profile_is_stale(vec[i])) return true; } return false; } static long accum_vec_flags(struct aa_profile **vec, int n) { long u = FLAG_UNCONFINED; int i; AA_BUG(!vec); for (i = 0; i < n; i++) { u |= vec[i]->label.flags & (FLAG_DEBUG1 | FLAG_DEBUG2 | FLAG_STALE); if (!(u & vec[i]->label.flags & FLAG_UNCONFINED)) u &= ~FLAG_UNCONFINED; } return u; } static int sort_cmp(const void *a, const void *b) { return profile_cmp(*(struct aa_profile **)a, *(struct aa_profile **)b); } /* * assumes vec is sorted * Assumes @vec has null terminator at vec[n], and will null terminate * vec[n - dups] */ static inline int unique(struct aa_profile **vec, int n) { int i, pos, dups = 0; AA_BUG(n < 1); AA_BUG(!vec); pos = 0; for (i = 1; i < n; i++) { int res = profile_cmp(vec[pos], vec[i]); AA_BUG(res > 0, "vec not sorted"); if (res == 0) { /* drop duplicate */ aa_put_profile(vec[i]); dups++; continue; } pos++; if (dups) vec[pos] = vec[i]; } AA_BUG(dups < 0); return dups; } /** * aa_vec_unique - canonical sort and unique a list of profiles * @n: number of refcounted profiles in the list (@n > 0) * @vec: list of profiles to sort and merge * * Returns: the number of duplicates eliminated == references put * * If @flags & VEC_FLAG_TERMINATE @vec has null terminator at vec[n], and will * null terminate vec[n - dups] */ int aa_vec_unique(struct aa_profile **vec, int n, int flags) { int i, dups = 0; AA_BUG(n < 1); AA_BUG(!vec); /* vecs are usually small and inorder, have a fallback for larger */ if (n > 8) { sort(vec, n, sizeof(struct aa_profile *), sort_cmp, NULL); dups = unique(vec, n); goto out; } /* insertion sort + unique in one */ for (i = 1; i < n; i++) { struct aa_profile *tmp = vec[i]; int pos, j; for (pos = i - 1 - dups; pos >= 0; pos--) { int res = profile_cmp(vec[pos], tmp); if (res == 0) { /* drop duplicate entry */ aa_put_profile(tmp); dups++; goto continue_outer; } else if (res < 0) break; } /* pos is at entry < tmp, or index -1. Set to insert pos */ pos++; for (j = i - dups; j > pos; j--) vec[j] = vec[j - 1]; vec[pos] = tmp; continue_outer: ; } AA_BUG(dups < 0); out: if (flags & VEC_FLAG_TERMINATE) vec[n - dups] = NULL; return dups; } void aa_label_destroy(struct aa_label *label) { AA_BUG(!label); if (!label_isprofile(label)) { struct aa_profile *profile; struct label_it i; aa_put_str(label->hname); label_for_each(i, label, profile) { aa_put_profile(profile); label->vec[i.i] = (struct aa_profile *) (LABEL_POISON + (long) i.i); } } if (label->proxy) { if (rcu_dereference_protected(label->proxy->label, true) == label) rcu_assign_pointer(label->proxy->label, NULL); aa_put_proxy(label->proxy); } aa_free_secid(label->secid); label->proxy = (struct aa_proxy *) PROXY_POISON + 1; } void aa_label_free(struct aa_label *label) { if (!label) return; aa_label_destroy(label); kfree(label); } static void label_free_switch(struct aa_label *label) { if (label->flags & FLAG_NS_COUNT) aa_free_ns(labels_ns(label)); else if (label_isprofile(label)) aa_free_profile(labels_profile(label)); else aa_label_free(label); } static void label_free_rcu(struct rcu_head *head) { struct aa_label *label = container_of(head, struct aa_label, rcu); if (label->flags & FLAG_IN_TREE) (void) aa_label_remove(label); label_free_switch(label); } void aa_label_kref(struct kref *kref) { struct aa_label *label = container_of(kref, struct aa_label, count); struct aa_ns *ns = labels_ns(label); if (!ns) { /* never live, no rcu callback needed, just using the fn */ label_free_switch(label); return; } /* TODO: update labels_profile macro so it works here */ AA_BUG(label_isprofile(label) && on_list_rcu(&label->vec[0]->base.profiles)); AA_BUG(label_isprofile(label) && on_list_rcu(&label->vec[0]->base.list)); /* TODO: if compound label and not stale add to reclaim cache */ call_rcu(&label->rcu, label_free_rcu); } static void label_free_or_put_new(struct aa_label *label, struct aa_label *new) { if (label != new) /* need to free directly to break circular ref with proxy */ aa_label_free(new); else aa_put_label(new); } bool aa_label_init(struct aa_label *label, int size, gfp_t gfp) { AA_BUG(!label); AA_BUG(size < 1); if (aa_alloc_secid(label, gfp) < 0) return false; label->size = size; /* doesn't include null */ label->vec[size] = NULL; /* null terminate */ kref_init(&label->count); RB_CLEAR_NODE(&label->node); return true; } /** * aa_label_alloc - allocate a label with a profile vector of @size length * @size: size of profile vector in the label * @proxy: proxy to use OR null if to allocate a new one * @gfp: memory allocation type * * Returns: new label * else NULL if failed */ struct aa_label *aa_label_alloc(int size, struct aa_proxy *proxy, gfp_t gfp) { struct aa_label *new; AA_BUG(size < 1); /* + 1 for null terminator entry on vec */ new = kzalloc(struct_size(new, vec, size + 1), gfp); AA_DEBUG("%s (%p)\n", __func__, new); if (!new) goto fail; if (!aa_label_init(new, size, gfp)) goto fail; if (!proxy) { proxy = aa_alloc_proxy(new, gfp); if (!proxy) goto fail; } else aa_get_proxy(proxy); /* just set new's proxy, don't redirect proxy here if it was passed in*/ new->proxy = proxy; return new; fail: kfree(new); return NULL; } /** * label_cmp - label comparison for set ordering * @a: label to compare (NOT NULL) * @b: label to compare (NOT NULL) * * Returns: <0 if a < b * ==0 if a == b * >0 if a > b */ static int label_cmp(struct aa_label *a, struct aa_label *b) { AA_BUG(!b); if (a == b) return 0; return vec_cmp(a->vec, a->size, b->vec, b->size); } /* helper fn for label_for_each_confined */ int aa_label_next_confined(struct aa_label *label, int i) { AA_BUG(!label); AA_BUG(i < 0); for (; i < label->size; i++) { if (!profile_unconfined(label->vec[i])) return i; } return i; } /** * __aa_label_next_not_in_set - return the next profile of @sub not in @set * @I: label iterator * @set: label to test against * @sub: label to if is subset of @set * * Returns: profile in @sub that is not in @set, with iterator set pos after * else NULL if @sub is a subset of @set */ struct aa_profile *__aa_label_next_not_in_set(struct label_it *I, struct aa_label *set, struct aa_label *sub) { AA_BUG(!set); AA_BUG(!I); AA_BUG(I->i < 0); AA_BUG(I->i > set->size); AA_BUG(!sub); AA_BUG(I->j < 0); AA_BUG(I->j > sub->size); while (I->j < sub->size && I->i < set->size) { int res = profile_cmp(sub->vec[I->j], set->vec[I->i]); if (res == 0) { (I->j)++; (I->i)++; } else if (res > 0) (I->i)++; else return sub->vec[(I->j)++]; } if (I->j < sub->size) return sub->vec[(I->j)++]; return NULL; } /** * aa_label_is_subset - test if @sub is a subset of @set * @set: label to test against * @sub: label to test if is subset of @set * * Returns: true if @sub is subset of @set * else false */ bool aa_label_is_subset(struct aa_label *set, struct aa_label *sub) { struct label_it i = { }; AA_BUG(!set); AA_BUG(!sub); if (sub == set) return true; return __aa_label_next_not_in_set(&i, set, sub) == NULL; } /** * aa_label_is_unconfined_subset - test if @sub is a subset of @set * @set: label to test against * @sub: label to test if is subset of @set * * This checks for subset but taking into account unconfined. IF * @sub contains an unconfined profile that does not have a matching * unconfined in @set then this will not cause the test to fail. * Conversely we don't care about an unconfined in @set that is not in * @sub * * Returns: true if @sub is special_subset of @set * else false */ bool aa_label_is_unconfined_subset(struct aa_label *set, struct aa_label *sub) { struct label_it i = { }; struct aa_profile *p; AA_BUG(!set); AA_BUG(!sub); if (sub == set) return true; do { p = __aa_label_next_not_in_set(&i, set, sub); if (p && !profile_unconfined(p)) break; } while (p); return p == NULL; } /** * __label_remove - remove @label from the label set * @l: label to remove * @new: label to redirect to * * Requires: labels_set(@label)->lock write_lock * Returns: true if the label was in the tree and removed */ static bool __label_remove(struct aa_label *label, struct aa_label *new) { struct aa_labelset *ls = labels_set(label); AA_BUG(!ls); AA_BUG(!label); lockdep_assert_held_write(&ls->lock); if (new) __aa_proxy_redirect(label, new); if (!label_is_stale(label)) __label_make_stale(label); if (label->flags & FLAG_IN_TREE) { rb_erase(&label->node, &ls->root); label->flags &= ~FLAG_IN_TREE; return true; } return false; } /** * __label_replace - replace @old with @new in label set * @old: label to remove from label set * @new: label to replace @old with * * Requires: labels_set(@old)->lock write_lock * valid ref count be held on @new * Returns: true if @old was in set and replaced by @new * * Note: current implementation requires label set be order in such a way * that @new directly replaces @old position in the set (ie. * using pointer comparison of the label address would not work) */ static bool __label_replace(struct aa_label *old, struct aa_label *new) { struct aa_labelset *ls = labels_set(old); AA_BUG(!ls); AA_BUG(!old); AA_BUG(!new); lockdep_assert_held_write(&ls->lock); AA_BUG(new->flags & FLAG_IN_TREE); if (!label_is_stale(old)) __label_make_stale(old); if (old->flags & FLAG_IN_TREE) { rb_replace_node(&old->node, &new->node, &ls->root); old->flags &= ~FLAG_IN_TREE; new->flags |= FLAG_IN_TREE; return true; } return false; } /** * __label_insert - attempt to insert @l into a label set * @ls: set of labels to insert @l into (NOT NULL) * @label: new label to insert (NOT NULL) * @replace: whether insertion should replace existing entry that is not stale * * Requires: @ls->lock * caller to hold a valid ref on l * if @replace is true l has a preallocated proxy associated * Returns: @l if successful in inserting @l - with additional refcount * else ref counted equivalent label that is already in the set, * the else condition only happens if @replace is false */ static struct aa_label *__label_insert(struct aa_labelset *ls, struct aa_label *label, bool replace) { struct rb_node **new, *parent = NULL; AA_BUG(!ls); AA_BUG(!label); AA_BUG(labels_set(label) != ls); lockdep_assert_held_write(&ls->lock); AA_BUG(label->flags & FLAG_IN_TREE); /* Figure out where to put new node */ new = &ls->root.rb_node; while (*new) { struct aa_label *this = rb_entry(*new, struct aa_label, node); int result = label_cmp(label, this); parent = *new; if (result == 0) { /* !__aa_get_label means queued for destruction, * so replace in place, however the label has * died before the replacement so do not share * the proxy */ if (!replace && !label_is_stale(this)) { if (__aa_get_label(this)) return this; } else __proxy_share(this, label); AA_BUG(!__label_replace(this, label)); return aa_get_label(label); } else if (result < 0) new = &((*new)->rb_left); else /* (result > 0) */ new = &((*new)->rb_right); } /* Add new node and rebalance tree. */ rb_link_node(&label->node, parent, new); rb_insert_color(&label->node, &ls->root); label->flags |= FLAG_IN_TREE; return aa_get_label(label); } /** * __vec_find - find label that matches @vec in label set * @vec: vec of profiles to find matching label for (NOT NULL) * @n: length of @vec * * Requires: @vec_labelset(vec) lock held * caller to hold a valid ref on l * * Returns: ref counted @label if matching label is in tree * ref counted label that is equiv to @l in tree * else NULL if @vec equiv is not in tree */ static struct aa_label *__vec_find(struct aa_profile **vec, int n) { struct rb_node *node; AA_BUG(!vec); AA_BUG(!*vec); AA_BUG(n <= 0); node = vec_labelset(vec, n)->root.rb_node; while (node) { struct aa_label *this = rb_entry(node, struct aa_label, node); int result = vec_cmp(this->vec, this->size, vec, n); if (result > 0) node = node->rb_left; else if (result < 0) node = node->rb_right; else return __aa_get_label(this); } return NULL; } /** * __label_find - find label @label in label set * @label: label to find (NOT NULL) * * Requires: labels_set(@label)->lock held * caller to hold a valid ref on l * * Returns: ref counted @label if @label is in tree OR * ref counted label that is equiv to @label in tree * else NULL if @label or equiv is not in tree */ static struct aa_label *__label_find(struct aa_label *label) { AA_BUG(!label); return __vec_find(label->vec, label->size); } /** * aa_label_remove - remove a label from the labelset * @label: label to remove * * Returns: true if @label was removed from the tree * else @label was not in tree so it could not be removed */ bool aa_label_remove(struct aa_label *label) { struct aa_labelset *ls = labels_set(label); unsigned long flags; bool res; AA_BUG(!ls); write_lock_irqsave(&ls->lock, flags); res = __label_remove(label, ns_unconfined(labels_ns(label))); write_unlock_irqrestore(&ls->lock, flags); return res; } /** * aa_label_replace - replace a label @old with a new version @new * @old: label to replace * @new: label replacing @old * * Returns: true if @old was in tree and replaced * else @old was not in tree, and @new was not inserted */ bool aa_label_replace(struct aa_label *old, struct aa_label *new) { unsigned long flags; bool res; if (name_is_shared(old, new) && labels_ns(old) == labels_ns(new)) { write_lock_irqsave(&labels_set(old)->lock, flags); if (old->proxy != new->proxy) __proxy_share(old, new); else __aa_proxy_redirect(old, new); res = __label_replace(old, new); write_unlock_irqrestore(&labels_set(old)->lock, flags); } else { struct aa_label *l; struct aa_labelset *ls = labels_set(old); write_lock_irqsave(&ls->lock, flags); res = __label_remove(old, new); if (labels_ns(old) != labels_ns(new)) { write_unlock_irqrestore(&ls->lock, flags); ls = labels_set(new); write_lock_irqsave(&ls->lock, flags); } l = __label_insert(ls, new, true); res = (l == new); write_unlock_irqrestore(&ls->lock, flags); aa_put_label(l); } return res; } /** * vec_find - find label @l in label set * @vec: array of profiles to find equiv label for (NOT NULL) * @n: length of @vec * * Returns: refcounted label if @vec equiv is in tree * else NULL if @vec equiv is not in tree */ static struct aa_label *vec_find(struct aa_profile **vec, int n) { struct aa_labelset *ls; struct aa_label *label; unsigned long flags; AA_BUG(!vec); AA_BUG(!*vec); AA_BUG(n <= 0); ls = vec_labelset(vec, n); read_lock_irqsave(&ls->lock, flags); label = __vec_find(vec, n); read_unlock_irqrestore(&ls->lock, flags); return label; } /* requires sort and merge done first */ static struct aa_label *vec_create_and_insert_label(struct aa_profile **vec, int len, gfp_t gfp) { struct aa_label *label = NULL; struct aa_labelset *ls; unsigned long flags; struct aa_label *new; int i; AA_BUG(!vec); if (len == 1) return aa_get_label(&vec[0]->label); ls = labels_set(&vec[len - 1]->label); /* TODO: enable when read side is lockless * check if label exists before taking locks */ new = aa_label_alloc(len, NULL, gfp); if (!new) return NULL; for (i = 0; i < len; i++) new->vec[i] = aa_get_profile(vec[i]); write_lock_irqsave(&ls->lock, flags); label = __label_insert(ls, new, false); write_unlock_irqrestore(&ls->lock, flags); label_free_or_put_new(label, new); return label; } struct aa_label *aa_vec_find_or_create_label(struct aa_profile **vec, int len, gfp_t gfp) { struct aa_label *label = vec_find(vec, len); if (label) return label; return vec_create_and_insert_label(vec, len, gfp); } /** * aa_label_find - find label @label in label set * @label: label to find (NOT NULL) * * Requires: caller to hold a valid ref on l * * Returns: refcounted @label if @label is in tree * refcounted label that is equiv to @label in tree * else NULL if @label or equiv is not in tree */ struct aa_label *aa_label_find(struct aa_label *label) { AA_BUG(!label); return vec_find(label->vec, label->size); } /** * aa_label_insert - insert label @label into @ls or return existing label * @ls - labelset to insert @label into * @label - label to insert * * Requires: caller to hold a valid ref on @label * * Returns: ref counted @label if successful in inserting @label * else ref counted equivalent label that is already in the set */ struct aa_label *aa_label_insert(struct aa_labelset *ls, struct aa_label *label) { struct aa_label *l; unsigned long flags; AA_BUG(!ls); AA_BUG(!label); /* check if label exists before taking lock */ if (!label_is_stale(label)) { read_lock_irqsave(&ls->lock, flags); l = __label_find(label); read_unlock_irqrestore(&ls->lock, flags); if (l) return l; } write_lock_irqsave(&ls->lock, flags); l = __label_insert(ls, label, false); write_unlock_irqrestore(&ls->lock, flags); return l; } /** * aa_label_next_in_merge - find the next profile when merging @a and @b * @I: label iterator * @a: label to merge * @b: label to merge * * Returns: next profile * else null if no more profiles */ struct aa_profile *aa_label_next_in_merge(struct label_it *I, struct aa_label *a, struct aa_label *b) { AA_BUG(!a); AA_BUG(!b); AA_BUG(!I); AA_BUG(I->i < 0); AA_BUG(I->i > a->size); AA_BUG(I->j < 0); AA_BUG(I->j > b->size); if (I->i < a->size) { if (I->j < b->size) { int res = profile_cmp(a->vec[I->i], b->vec[I->j]); if (res > 0) return b->vec[(I->j)++]; if (res == 0) (I->j)++; } return a->vec[(I->i)++]; } if (I->j < b->size) return b->vec[(I->j)++]; return NULL; } /** * label_merge_cmp - cmp of @a merging with @b against @z for set ordering * @a: label to merge then compare (NOT NULL) * @b: label to merge then compare (NOT NULL) * @z: label to compare merge against (NOT NULL) * * Assumes: using the most recent versions of @a, @b, and @z * * Returns: <0 if a < b * ==0 if a == b * >0 if a > b */ static int label_merge_cmp(struct aa_label *a, struct aa_label *b, struct aa_label *z) { struct aa_profile *p = NULL; struct label_it i = { }; int k; AA_BUG(!a); AA_BUG(!b); AA_BUG(!z); for (k = 0; k < z->size && (p = aa_label_next_in_merge(&i, a, b)); k++) { int res = profile_cmp(p, z->vec[k]); if (res != 0) return res; } if (p) return 1; else if (k < z->size) return -1; return 0; } /** * label_merge_insert - create a new label by merging @a and @b * @new: preallocated label to merge into (NOT NULL) * @a: label to merge with @b (NOT NULL) * @b: label to merge with @a (NOT NULL) * * Requires: preallocated proxy * * Returns: ref counted label either @new if merge is unique * @a if @b is a subset of @a * @b if @a is a subset of @b * * NOTE: will not use @new if the merge results in @new == @a or @b * * Must be used within labelset write lock to avoid racing with * setting labels stale. */ static struct aa_label *label_merge_insert(struct aa_label *new, struct aa_label *a, struct aa_label *b) { struct aa_label *label; struct aa_labelset *ls; struct aa_profile *next; struct label_it i; unsigned long flags; int k = 0, invcount = 0; bool stale = false; AA_BUG(!a); AA_BUG(a->size < 0); AA_BUG(!b); AA_BUG(b->size < 0); AA_BUG(!new); AA_BUG(new->size < a->size + b->size); label_for_each_in_merge(i, a, b, next) { AA_BUG(!next); if (profile_is_stale(next)) { new->vec[k] = aa_get_newest_profile(next); AA_BUG(!new->vec[k]->label.proxy); AA_BUG(!new->vec[k]->label.proxy->label); if (next->label.proxy != new->vec[k]->label.proxy) invcount++; k++; stale = true; } else new->vec[k++] = aa_get_profile(next); } /* set to actual size which is <= allocated len */ new->size = k; new->vec[k] = NULL; if (invcount) { new->size -= aa_vec_unique(&new->vec[0], new->size, VEC_FLAG_TERMINATE); /* TODO: deal with reference labels */ if (new->size == 1) { label = aa_get_label(&new->vec[0]->label); return label; } } else if (!stale) { /* * merge could be same as a || b, note: it is not possible * for new->size == a->size == b->size unless a == b */ if (k == a->size) return aa_get_label(a); else if (k == b->size) return aa_get_label(b); } new->flags |= accum_vec_flags(new->vec, new->size); ls = labels_set(new); write_lock_irqsave(&ls->lock, flags); label = __label_insert(labels_set(new), new, false); write_unlock_irqrestore(&ls->lock, flags); return label; } /** * labelset_of_merge - find which labelset a merged label should be inserted * @a: label to merge and insert * @b: label to merge and insert * * Returns: labelset that the merged label should be inserted into */ static struct aa_labelset *labelset_of_merge(struct aa_label *a, struct aa_label *b) { struct aa_ns *nsa = labels_ns(a); struct aa_ns *nsb = labels_ns(b); if (ns_cmp(nsa, nsb) <= 0) return &nsa->labels; return &nsb->labels; } /** * __label_find_merge - find label that is equiv to merge of @a and @b * @ls: set of labels to search (NOT NULL) * @a: label to merge with @b (NOT NULL) * @b: label to merge with @a (NOT NULL) * * Requires: ls->lock read_lock held * * Returns: ref counted label that is equiv to merge of @a and @b * else NULL if merge of @a and @b is not in set */ static struct aa_label *__label_find_merge(struct aa_labelset *ls, struct aa_label *a, struct aa_label *b) { struct rb_node *node; AA_BUG(!ls); AA_BUG(!a); AA_BUG(!b); if (a == b) return __label_find(a); node = ls->root.rb_node; while (node) { struct aa_label *this = container_of(node, struct aa_label, node); int result = label_merge_cmp(a, b, this); if (result < 0) node = node->rb_left; else if (result > 0) node = node->rb_right; else return __aa_get_label(this); } return NULL; } /** * aa_label_find_merge - find label that is equiv to merge of @a and @b * @a: label to merge with @b (NOT NULL) * @b: label to merge with @a (NOT NULL) * * Requires: labels be fully constructed with a valid ns * * Returns: ref counted label that is equiv to merge of @a and @b * else NULL if merge of @a and @b is not in set */ struct aa_label *aa_label_find_merge(struct aa_label *a, struct aa_label *b) { struct aa_labelset *ls; struct aa_label *label, *ar = NULL, *br = NULL; unsigned long flags; AA_BUG(!a); AA_BUG(!b); if (label_is_stale(a)) a = ar = aa_get_newest_label(a); if (label_is_stale(b)) b = br = aa_get_newest_label(b); ls = labelset_of_merge(a, b); read_lock_irqsave(&ls->lock, flags); label = __label_find_merge(ls, a, b); read_unlock_irqrestore(&ls->lock, flags); aa_put_label(ar); aa_put_label(br); return label; } /** * aa_label_merge - attempt to insert new merged label of @a and @b * @ls: set of labels to insert label into (NOT NULL) * @a: label to merge with @b (NOT NULL) * @b: label to merge with @a (NOT NULL) * @gfp: memory allocation type * * Requires: caller to hold valid refs on @a and @b * labels be fully constructed with a valid ns * * Returns: ref counted new label if successful in inserting merge of a & b * else ref counted equivalent label that is already in the set. * else NULL if could not create label (-ENOMEM) */ struct aa_label *aa_label_merge(struct aa_label *a, struct aa_label *b, gfp_t gfp) { struct aa_label *label = NULL; AA_BUG(!a); AA_BUG(!b); if (a == b) return aa_get_newest_label(a); /* TODO: enable when read side is lockless * check if label exists before taking locks if (!label_is_stale(a) && !label_is_stale(b)) label = aa_label_find_merge(a, b); */ if (!label) { struct aa_label *new; a = aa_get_newest_label(a); b = aa_get_newest_label(b); /* could use label_merge_len(a, b), but requires double * comparison for small savings */ new = aa_label_alloc(a->size + b->size, NULL, gfp); if (!new) goto out; label = label_merge_insert(new, a, b); label_free_or_put_new(label, new); out: aa_put_label(a); aa_put_label(b); } return label; } /* match a profile and its associated ns component if needed * Assumes visibility test has already been done. * If a subns profile is not to be matched should be prescreened with * visibility test. */ static inline aa_state_t match_component(struct aa_profile *profile, struct aa_ruleset *rules, struct aa_profile *tp, aa_state_t state) { const char *ns_name; if (profile->ns == tp->ns) return aa_dfa_match(rules->policy.dfa, state, tp->base.hname); /* try matching with namespace name and then profile */ ns_name = aa_ns_name(profile->ns, tp->ns, true); state = aa_dfa_match_len(rules->policy.dfa, state, ":", 1); state = aa_dfa_match(rules->policy.dfa, state, ns_name); state = aa_dfa_match_len(rules->policy.dfa, state, ":", 1); return aa_dfa_match(rules->policy.dfa, state, tp->base.hname); } /** * label_compound_match - find perms for full compound label * @profile: profile to find perms for * @label: label to check access permissions for * @start: state to start match in * @subns: whether to do permission checks on components in a subns * @request: permissions to request * @perms: perms struct to set * * Returns: 0 on success else ERROR * * For the label A//&B//&C this does the perm match for A//&B//&C * @perms should be preinitialized with allperms OR a previous permission * check to be stacked. */ static int label_compound_match(struct aa_profile *profile, struct aa_ruleset *rules, struct aa_label *label, aa_state_t state, bool subns, u32 request, struct aa_perms *perms) { struct aa_profile *tp; struct label_it i; /* find first subcomponent that is visible */ label_for_each(i, label, tp) { if (!aa_ns_visible(profile->ns, tp->ns, subns)) continue; state = match_component(profile, rules, tp, state); if (!state) goto fail; goto next; } /* no component visible */ *perms = allperms; return 0; next: label_for_each_cont(i, label, tp) { if (!aa_ns_visible(profile->ns, tp->ns, subns)) continue; state = aa_dfa_match(rules->policy.dfa, state, "//&"); state = match_component(profile, rules, tp, state); if (!state) goto fail; } *perms = *aa_lookup_perms(&rules->policy, state); aa_apply_modes_to_perms(profile, perms); if ((perms->allow & request) != request) return -EACCES; return 0; fail: *perms = nullperms; return state; } /** * label_components_match - find perms for all subcomponents of a label * @profile: profile to find perms for * @rules: ruleset to search * @label: label to check access permissions for * @start: state to start match in * @subns: whether to do permission checks on components in a subns * @request: permissions to request * @perms: an initialized perms struct to add accumulation to * * Returns: 0 on success else ERROR * * For the label A//&B//&C this does the perm match for each of A and B and C * @perms should be preinitialized with allperms OR a previous permission * check to be stacked. */ static int label_components_match(struct aa_profile *profile, struct aa_ruleset *rules, struct aa_label *label, aa_state_t start, bool subns, u32 request, struct aa_perms *perms) { struct aa_profile *tp; struct label_it i; struct aa_perms tmp; aa_state_t state = 0; /* find first subcomponent to test */ label_for_each(i, label, tp) { if (!aa_ns_visible(profile->ns, tp->ns, subns)) continue; state = match_component(profile, rules, tp, start); if (!state) goto fail; goto next; } /* no subcomponents visible - no change in perms */ return 0; next: tmp = *aa_lookup_perms(&rules->policy, state); aa_apply_modes_to_perms(profile, &tmp); aa_perms_accum(perms, &tmp); label_for_each_cont(i, label, tp) { if (!aa_ns_visible(profile->ns, tp->ns, subns)) continue; state = match_component(profile, rules, tp, start); if (!state) goto fail; tmp = *aa_lookup_perms(&rules->policy, state); aa_apply_modes_to_perms(profile, &tmp); aa_perms_accum(perms, &tmp); } if ((perms->allow & request) != request) return -EACCES; return 0; fail: *perms = nullperms; return -EACCES; } /** * aa_label_match - do a multi-component label match * @profile: profile to match against (NOT NULL) * @rules: ruleset to search * @label: label to match (NOT NULL) * @state: state to start in * @subns: whether to match subns components * @request: permission request * @perms: Returns computed perms (NOT NULL) * * Returns: the state the match finished in, may be the none matching state */ int aa_label_match(struct aa_profile *profile, struct aa_ruleset *rules, struct aa_label *label, aa_state_t state, bool subns, u32 request, struct aa_perms *perms) { int error = label_compound_match(profile, rules, label, state, subns, request, perms); if (!error) return error; *perms = allperms; return label_components_match(profile, rules, label, state, subns, request, perms); } /** * aa_update_label_name - update a label to have a stored name * @ns: ns being viewed from (NOT NULL) * @label: label to update (NOT NULL) * @gfp: type of memory allocation * * Requires: labels_set(label) not locked in caller * * note: only updates the label name if it does not have a name already * and if it is in the labelset */ bool aa_update_label_name(struct aa_ns *ns, struct aa_label *label, gfp_t gfp) { struct aa_labelset *ls; unsigned long flags; char __counted *name; bool res = false; AA_BUG(!ns); AA_BUG(!label); if (label->hname || labels_ns(label) != ns) return res; if (aa_label_acntsxprint(&name, ns, label, FLAGS_NONE, gfp) < 0) return res; ls = labels_set(label); write_lock_irqsave(&ls->lock, flags); if (!label->hname && label->flags & FLAG_IN_TREE) { label->hname = name; res = true; } else aa_put_str(name); write_unlock_irqrestore(&ls->lock, flags); return res; } /* * cached label name is present and visible * @label->hname only exists if label is namespace hierachical */ static inline bool use_label_hname(struct aa_ns *ns, struct aa_label *label, int flags) { if (label->hname && (!ns || labels_ns(label) == ns) && !(flags & ~FLAG_SHOW_MODE)) return true; return false; } /* helper macro for snprint routines */ #define update_for_len(total, len, size, str) \ do { \ size_t ulen = len; \ \ AA_BUG(len < 0); \ total += ulen; \ ulen = min(ulen, size); \ size -= ulen; \ str += ulen; \ } while (0) /** * aa_profile_snxprint - print a profile name to a buffer * @str: buffer to write to. (MAY BE NULL if @size == 0) * @size: size of buffer * @view: namespace profile is being viewed from * @profile: profile to view (NOT NULL) * @flags: whether to include the mode string * @prev_ns: last ns printed when used in compound print * * Returns: size of name written or would be written if larger than * available buffer * * Note: will not print anything if the profile is not visible */ static int aa_profile_snxprint(char *str, size_t size, struct aa_ns *view, struct aa_profile *profile, int flags, struct aa_ns **prev_ns) { const char *ns_name = NULL; AA_BUG(!str && size != 0); AA_BUG(!profile); if (!view) view = profiles_ns(profile); if (view != profile->ns && (!prev_ns || (*prev_ns != profile->ns))) { if (prev_ns) *prev_ns = profile->ns; ns_name = aa_ns_name(view, profile->ns, flags & FLAG_VIEW_SUBNS); if (ns_name == aa_hidden_ns_name) { if (flags & FLAG_HIDDEN_UNCONFINED) return snprintf(str, size, "%s", "unconfined"); return snprintf(str, size, "%s", ns_name); } } if ((flags & FLAG_SHOW_MODE) && profile != profile->ns->unconfined) { const char *modestr = aa_profile_mode_names[profile->mode]; if (ns_name) return snprintf(str, size, ":%s:%s (%s)", ns_name, profile->base.hname, modestr); return snprintf(str, size, "%s (%s)", profile->base.hname, modestr); } if (ns_name) return snprintf(str, size, ":%s:%s", ns_name, profile->base.hname); return snprintf(str, size, "%s", profile->base.hname); } static const char *label_modename(struct aa_ns *ns, struct aa_label *label, int flags) { struct aa_profile *profile; struct label_it i; int mode = -1, count = 0; label_for_each(i, label, profile) { if (aa_ns_visible(ns, profile->ns, flags & FLAG_VIEW_SUBNS)) { count++; if (profile == profile->ns->unconfined) /* special case unconfined so stacks with * unconfined don't report as mixed. ie. * profile_foo//&:ns1:unconfined (mixed) */ continue; if (mode == -1) mode = profile->mode; else if (mode != profile->mode) return "mixed"; } } if (count == 0) return "-"; if (mode == -1) /* everything was unconfined */ mode = APPARMOR_UNCONFINED; return aa_profile_mode_names[mode]; } /* if any visible label is not unconfined the display_mode returns true */ static inline bool display_mode(struct aa_ns *ns, struct aa_label *label, int flags) { if ((flags & FLAG_SHOW_MODE)) { struct aa_profile *profile; struct label_it i; label_for_each(i, label, profile) { if (aa_ns_visible(ns, profile->ns, flags & FLAG_VIEW_SUBNS) && profile != profile->ns->unconfined) return true; } /* only ns->unconfined in set of profiles in ns */ return false; } return false; } /** * aa_label_snxprint - print a label name to a string buffer * @str: buffer to write to. (MAY BE NULL if @size == 0) * @size: size of buffer * @ns: namespace profile is being viewed from * @label: label to view (NOT NULL) * @flags: whether to include the mode string * * Returns: size of name written or would be written if larger than * available buffer * * Note: labels do not have to be strictly hierarchical to the ns as * objects may be shared across different namespaces and thus * pickup labeling from each ns. If a particular part of the * label is not visible it will just be excluded. And if none * of the label is visible "---" will be used. */ int aa_label_snxprint(char *str, size_t size, struct aa_ns *ns, struct aa_label *label, int flags) { struct aa_profile *profile; struct aa_ns *prev_ns = NULL; struct label_it i; int count = 0, total = 0; ssize_t len; AA_BUG(!str && size != 0); AA_BUG(!label); if (AA_DEBUG_LABEL && (flags & FLAG_ABS_ROOT)) { ns = root_ns; len = snprintf(str, size, "_"); update_for_len(total, len, size, str); } else if (!ns) { ns = labels_ns(label); } label_for_each(i, label, profile) { if (aa_ns_visible(ns, profile->ns, flags & FLAG_VIEW_SUBNS)) { if (count > 0) { len = snprintf(str, size, "//&"); update_for_len(total, len, size, str); } len = aa_profile_snxprint(str, size, ns, profile, flags & FLAG_VIEW_SUBNS, &prev_ns); update_for_len(total, len, size, str); count++; } } if (count == 0) { if (flags & FLAG_HIDDEN_UNCONFINED) return snprintf(str, size, "%s", "unconfined"); return snprintf(str, size, "%s", aa_hidden_ns_name); } /* count == 1 && ... is for backwards compat where the mode * is not displayed for 'unconfined' in the current ns */ if (display_mode(ns, label, flags)) { len = snprintf(str, size, " (%s)", label_modename(ns, label, flags)); update_for_len(total, len, size, str); } return total; } #undef update_for_len /** * aa_label_asxprint - allocate a string buffer and print label into it * @strp: Returns - the allocated buffer with the label name. (NOT NULL) * @ns: namespace profile is being viewed from * @label: label to view (NOT NULL) * @flags: flags controlling what label info is printed * @gfp: kernel memory allocation type * * Returns: size of name written or would be written if larger than * available buffer */ int aa_label_asxprint(char **strp, struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp) { int size; AA_BUG(!strp); AA_BUG(!label); size = aa_label_snxprint(NULL, 0, ns, label, flags); if (size < 0) return size; *strp = kmalloc(size + 1, gfp); if (!*strp) return -ENOMEM; return aa_label_snxprint(*strp, size + 1, ns, label, flags); } /** * aa_label_acntsxprint - allocate a __counted string buffer and print label * @strp: buffer to write to. * @ns: namespace profile is being viewed from * @label: label to view (NOT NULL) * @flags: flags controlling what label info is printed * @gfp: kernel memory allocation type * * Returns: size of name written or would be written if larger than * available buffer */ int aa_label_acntsxprint(char __counted **strp, struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp) { int size; AA_BUG(!strp); AA_BUG(!label); size = aa_label_snxprint(NULL, 0, ns, label, flags); if (size < 0) return size; *strp = aa_str_alloc(size + 1, gfp); if (!*strp) return -ENOMEM; return aa_label_snxprint(*strp, size + 1, ns, label, flags); } void aa_label_xaudit(struct audit_buffer *ab, struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp) { const char *str; char *name = NULL; int len; AA_BUG(!ab); AA_BUG(!label); if (!use_label_hname(ns, label, flags) || display_mode(ns, label, flags)) { len = aa_label_asxprint(&name, ns, label, flags, gfp); if (len < 0) { AA_DEBUG("label print error"); return; } str = name; } else { str = (char *) label->hname; len = strlen(str); } if (audit_string_contains_control(str, len)) audit_log_n_hex(ab, str, len); else audit_log_n_string(ab, str, len); kfree(name); } void aa_label_seq_xprint(struct seq_file *f, struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp) { AA_BUG(!f); AA_BUG(!label); if (!use_label_hname(ns, label, flags)) { char *str; int len; len = aa_label_asxprint(&str, ns, label, flags, gfp); if (len < 0) { AA_DEBUG("label print error"); return; } seq_puts(f, str); kfree(str); } else if (display_mode(ns, label, flags)) seq_printf(f, "%s (%s)", label->hname, label_modename(ns, label, flags)); else seq_puts(f, label->hname); } void aa_label_xprintk(struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp) { AA_BUG(!label); if (!use_label_hname(ns, label, flags)) { char *str; int len; len = aa_label_asxprint(&str, ns, label, flags, gfp); if (len < 0) { AA_DEBUG("label print error"); return; } pr_info("%s", str); kfree(str); } else if (display_mode(ns, label, flags)) pr_info("%s (%s)", label->hname, label_modename(ns, label, flags)); else pr_info("%s", label->hname); } void aa_label_audit(struct audit_buffer *ab, struct aa_label *label, gfp_t gfp) { struct aa_ns *ns = aa_get_current_ns(); aa_label_xaudit(ab, ns, label, FLAG_VIEW_SUBNS, gfp); aa_put_ns(ns); } void aa_label_seq_print(struct seq_file *f, struct aa_label *label, gfp_t gfp) { struct aa_ns *ns = aa_get_current_ns(); aa_label_seq_xprint(f, ns, label, FLAG_VIEW_SUBNS, gfp); aa_put_ns(ns); } void aa_label_printk(struct aa_label *label, gfp_t gfp) { struct aa_ns *ns = aa_get_current_ns(); aa_label_xprintk(ns, label, FLAG_VIEW_SUBNS, gfp); aa_put_ns(ns); } static int label_count_strn_entries(const char *str, size_t n) { const char *end = str + n; const char *split; int count = 1; AA_BUG(!str); for (split = aa_label_strn_split(str, end - str); split; split = aa_label_strn_split(str, end - str)) { count++; str = split + 3; } return count; } /* * ensure stacks with components like * :ns:A//&B * have :ns: applied to both 'A' and 'B' by making the lookup relative * to the base if the lookup specifies an ns, else making the stacked lookup * relative to the last embedded ns in the string. */ static struct aa_profile *fqlookupn_profile(struct aa_label *base, struct aa_label *currentbase, const char *str, size_t n) { const char *first = skipn_spaces(str, n); if (first && *first == ':') return aa_fqlookupn_profile(base, str, n); return aa_fqlookupn_profile(currentbase, str, n); } /** * aa_label_strn_parse - parse, validate and convert a text string to a label * @base: base label to use for lookups (NOT NULL) * @str: null terminated text string (NOT NULL) * @n: length of str to parse, will stop at \0 if encountered before n * @gfp: allocation type * @create: true if should create compound labels if they don't exist * @force_stack: true if should stack even if no leading & * * Returns: the matching refcounted label if present * else ERRPTR */ struct aa_label *aa_label_strn_parse(struct aa_label *base, const char *str, size_t n, gfp_t gfp, bool create, bool force_stack) { DEFINE_VEC(profile, vec); struct aa_label *label, *currbase = base; int i, len, stack = 0, error; const char *end = str + n; const char *split; AA_BUG(!base); AA_BUG(!str); str = skipn_spaces(str, n); if (str == NULL || (AA_DEBUG_LABEL && *str == '_' && base != &root_ns->unconfined->label)) return ERR_PTR(-EINVAL); len = label_count_strn_entries(str, end - str); if (*str == '&' || force_stack) { /* stack on top of base */ stack = base->size; len += stack; if (*str == '&') str++; } error = vec_setup(profile, vec, len, gfp); if (error) return ERR_PTR(error); for (i = 0; i < stack; i++) vec[i] = aa_get_profile(base->vec[i]); for (split = aa_label_strn_split(str, end - str), i = stack; split && i < len; i++) { vec[i] = fqlookupn_profile(base, currbase, str, split - str); if (!vec[i]) goto fail; /* * if component specified a new ns it becomes the new base * so that subsequent lookups are relative to it */ if (vec[i]->ns != labels_ns(currbase)) currbase = &vec[i]->label; str = split + 3; split = aa_label_strn_split(str, end - str); } /* last element doesn't have a split */ if (i < len) { vec[i] = fqlookupn_profile(base, currbase, str, end - str); if (!vec[i]) goto fail; } if (len == 1) /* no need to free vec as len < LOCAL_VEC_ENTRIES */ return &vec[0]->label; len -= aa_vec_unique(vec, len, VEC_FLAG_TERMINATE); /* TODO: deal with reference labels */ if (len == 1) { label = aa_get_label(&vec[0]->label); goto out; } if (create) label = aa_vec_find_or_create_label(vec, len, gfp); else label = vec_find(vec, len); if (!label) goto fail; out: /* use adjusted len from after vec_unique, not original */ vec_cleanup(profile, vec, len); return label; fail: label = ERR_PTR(-ENOENT); goto out; } struct aa_label *aa_label_parse(struct aa_label *base, const char *str, gfp_t gfp, bool create, bool force_stack) { return aa_label_strn_parse(base, str, strlen(str), gfp, create, force_stack); } /** * aa_labelset_destroy - remove all labels from the label set * @ls: label set to cleanup (NOT NULL) * * Labels that are removed from the set may still exist beyond the set * being destroyed depending on their reference counting */ void aa_labelset_destroy(struct aa_labelset *ls) { struct rb_node *node; unsigned long flags; AA_BUG(!ls); write_lock_irqsave(&ls->lock, flags); for (node = rb_first(&ls->root); node; node = rb_first(&ls->root)) { struct aa_label *this = rb_entry(node, struct aa_label, node); if (labels_ns(this) != root_ns) __label_remove(this, ns_unconfined(labels_ns(this)->parent)); else __label_remove(this, NULL); } write_unlock_irqrestore(&ls->lock, flags); } /* * @ls: labelset to init (NOT NULL) */ void aa_labelset_init(struct aa_labelset *ls) { AA_BUG(!ls); rwlock_init(&ls->lock); ls->root = RB_ROOT; } static struct aa_label *labelset_next_stale(struct aa_labelset *ls) { struct aa_label *label; struct rb_node *node; unsigned long flags; AA_BUG(!ls); read_lock_irqsave(&ls->lock, flags); __labelset_for_each(ls, node) { label = rb_entry(node, struct aa_label, node); if ((label_is_stale(label) || vec_is_stale(label->vec, label->size)) && __aa_get_label(label)) goto out; } label = NULL; out: read_unlock_irqrestore(&ls->lock, flags); return label; } /** * __label_update - insert updated version of @label into labelset * @label - the label to update/replace * * Returns: new label that is up to date * else NULL on failure * * Requires: @ns lock be held * * Note: worst case is the stale @label does not get updated and has * to be updated at a later time. */ static struct aa_label *__label_update(struct aa_label *label) { struct aa_label *new, *tmp; struct aa_labelset *ls; unsigned long flags; int i, invcount = 0; AA_BUG(!label); AA_BUG(!mutex_is_locked(&labels_ns(label)->lock)); new = aa_label_alloc(label->size, label->proxy, GFP_KERNEL); if (!new) return NULL; /* * while holding the ns_lock will stop profile replacement, removal, * and label updates, label merging and removal can be occurring */ ls = labels_set(label); write_lock_irqsave(&ls->lock, flags); for (i = 0; i < label->size; i++) { AA_BUG(!label->vec[i]); new->vec[i] = aa_get_newest_profile(label->vec[i]); AA_BUG(!new->vec[i]); AA_BUG(!new->vec[i]->label.proxy); AA_BUG(!new->vec[i]->label.proxy->label); if (new->vec[i]->label.proxy != label->vec[i]->label.proxy) invcount++; } /* updated stale label by being removed/renamed from labelset */ if (invcount) { new->size -= aa_vec_unique(&new->vec[0], new->size, VEC_FLAG_TERMINATE); /* TODO: deal with reference labels */ if (new->size == 1) { tmp = aa_get_label(&new->vec[0]->label); AA_BUG(tmp == label); goto remove; } if (labels_set(label) != labels_set(new)) { write_unlock_irqrestore(&ls->lock, flags); tmp = aa_label_insert(labels_set(new), new); write_lock_irqsave(&ls->lock, flags); goto remove; } } else AA_BUG(labels_ns(label) != labels_ns(new)); tmp = __label_insert(labels_set(label), new, true); remove: /* ensure label is removed, and redirected correctly */ __label_remove(label, tmp); write_unlock_irqrestore(&ls->lock, flags); label_free_or_put_new(tmp, new); return tmp; } /** * __labelset_update - update labels in @ns * @ns: namespace to update labels in (NOT NULL) * * Requires: @ns lock be held * * Walk the labelset ensuring that all labels are up to date and valid * Any label that has a stale component is marked stale and replaced and * by an updated version. * * If failures happen due to memory pressures then stale labels will * be left in place until the next pass. */ static void __labelset_update(struct aa_ns *ns) { struct aa_label *label; AA_BUG(!ns); AA_BUG(!mutex_is_locked(&ns->lock)); do { label = labelset_next_stale(&ns->labels); if (label) { struct aa_label *l = __label_update(label); aa_put_label(l); aa_put_label(label); } } while (label); } /** * __aa_labelset_update_subtree - update all labels with a stale component * @ns: ns to start update at (NOT NULL) * * Requires: @ns lock be held * * Invalidates labels based on @p in @ns and any children namespaces. */ void __aa_labelset_update_subtree(struct aa_ns *ns) { struct aa_ns *child; AA_BUG(!ns); AA_BUG(!mutex_is_locked(&ns->lock)); __labelset_update(ns); list_for_each_entry(child, &ns->sub_ns, base.list) { mutex_lock_nested(&child->lock, child->level); __aa_labelset_update_subtree(child); mutex_unlock(&child->lock); } } |
| 723 723 723 723 1043 1043 1042 20 969 969 969 969 928 928 723 737 723 737 34 1043 1042 738 674 26 674 738 737 738 146 2 723 722 682 678 17 17 17 17 16 17 1045 1044 1043 16 1042 1045 1042 1043 4 1043 1043 1037 995 995 995 116 22 1 970 114 971 972 972 928 972 107 102 106 106 106 102 94 94 68 68 68 38 107 107 737 737 723 723 723 723 1 723 723 41 737 682 682 682 682 682 678 678 2 682 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 | // SPDX-License-Identifier: GPL-2.0-or-later /* * NET An implementation of the SOCKET network access protocol. * * Version: @(#)socket.c 1.1.93 18/02/95 * * Authors: Orest Zborowski, <obz@Kodak.COM> * Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * * Fixes: * Anonymous : NOTSOCK/BADF cleanup. Error fix in * shutdown() * Alan Cox : verify_area() fixes * Alan Cox : Removed DDI * Jonathan Kamens : SOCK_DGRAM reconnect bug * Alan Cox : Moved a load of checks to the very * top level. * Alan Cox : Move address structures to/from user * mode above the protocol layers. * Rob Janssen : Allow 0 length sends. * Alan Cox : Asynchronous I/O support (cribbed from the * tty drivers). * Niibe Yutaka : Asynchronous I/O for writes (4.4BSD style) * Jeff Uphoff : Made max number of sockets command-line * configurable. * Matti Aarnio : Made the number of sockets dynamic, * to be allocated when needed, and mr. * Uphoff's max is used as max to be * allowed to allocate. * Linus : Argh. removed all the socket allocation * altogether: it's in the inode now. * Alan Cox : Made sock_alloc()/sock_release() public * for NetROM and future kernel nfsd type * stuff. * Alan Cox : sendmsg/recvmsg basics. * Tom Dyas : Export net symbols. * Marcin Dalecki : Fixed problems with CONFIG_NET="n". * Alan Cox : Added thread locking to sys_* calls * for sockets. May have errors at the * moment. * Kevin Buhr : Fixed the dumb errors in the above. * Andi Kleen : Some small cleanups, optimizations, * and fixed a copy_from_user() bug. * Tigran Aivazian : sys_send(args) calls sys_sendto(args, NULL, 0) * Tigran Aivazian : Made listen(2) backlog sanity checks * protocol-independent * * This module is effectively the top level interface to the BSD socket * paradigm. * * Based upon Swansea University Computer Society NET3.039 */ #include <linux/bpf-cgroup.h> #include <linux/ethtool.h> #include <linux/mm.h> #include <linux/socket.h> #include <linux/file.h> #include <linux/splice.h> #include <linux/net.h> #include <linux/interrupt.h> #include <linux/thread_info.h> #include <linux/rcupdate.h> #include <linux/netdevice.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/mutex.h> #include <linux/if_bridge.h> #include <linux/if_vlan.h> #include <linux/ptp_classify.h> #include <linux/init.h> #include <linux/poll.h> #include <linux/cache.h> #include <linux/module.h> #include <linux/highmem.h> #include <linux/mount.h> #include <linux/pseudo_fs.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/compat.h> #include <linux/kmod.h> #include <linux/audit.h> #include <linux/wireless.h> #include <linux/nsproxy.h> #include <linux/magic.h> #include <linux/slab.h> #include <linux/xattr.h> #include <linux/nospec.h> #include <linux/indirect_call_wrapper.h> #include <linux/io_uring.h> #include <linux/uaccess.h> #include <asm/unistd.h> #include <net/compat.h> #include <net/wext.h> #include <net/cls_cgroup.h> #include <net/sock.h> #include <linux/netfilter.h> #include <linux/if_tun.h> #include <linux/ipv6_route.h> #include <linux/route.h> #include <linux/termios.h> #include <linux/sockios.h> #include <net/busy_poll.h> #include <linux/errqueue.h> #include <linux/ptp_clock_kernel.h> #include <trace/events/sock.h> #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int sysctl_net_busy_read __read_mostly; unsigned int sysctl_net_busy_poll __read_mostly; #endif static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to); static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from); static int sock_mmap(struct file *file, struct vm_area_struct *vma); static int sock_close(struct inode *inode, struct file *file); static __poll_t sock_poll(struct file *file, struct poll_table_struct *wait); static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #ifdef CONFIG_COMPAT static long compat_sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #endif static int sock_fasync(int fd, struct file *filp, int on); static ssize_t sock_splice_read(struct file *file, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); static void sock_splice_eof(struct file *file); #ifdef CONFIG_PROC_FS static void sock_show_fdinfo(struct seq_file *m, struct file *f) { struct socket *sock = f->private_data; const struct proto_ops *ops = READ_ONCE(sock->ops); if (ops->show_fdinfo) ops->show_fdinfo(m, sock); } #else #define sock_show_fdinfo NULL #endif /* * Socket files have a set of 'special' operations as well as the generic file ones. These don't appear * in the operation structures but are done directly via the socketcall() multiplexor. */ static const struct file_operations socket_file_ops = { .owner = THIS_MODULE, .llseek = no_llseek, .read_iter = sock_read_iter, .write_iter = sock_write_iter, .poll = sock_poll, .unlocked_ioctl = sock_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = compat_sock_ioctl, #endif .uring_cmd = io_uring_cmd_sock, .mmap = sock_mmap, .release = sock_close, .fasync = sock_fasync, .splice_write = splice_to_socket, .splice_read = sock_splice_read, .splice_eof = sock_splice_eof, .show_fdinfo = sock_show_fdinfo, }; static const char * const pf_family_names[] = { [PF_UNSPEC] = "PF_UNSPEC", [PF_UNIX] = "PF_UNIX/PF_LOCAL", [PF_INET] = "PF_INET", [PF_AX25] = "PF_AX25", [PF_IPX] = "PF_IPX", [PF_APPLETALK] = "PF_APPLETALK", [PF_NETROM] = "PF_NETROM", [PF_BRIDGE] = "PF_BRIDGE", [PF_ATMPVC] = "PF_ATMPVC", [PF_X25] = "PF_X25", [PF_INET6] = "PF_INET6", [PF_ROSE] = "PF_ROSE", [PF_DECnet] = "PF_DECnet", [PF_NETBEUI] = "PF_NETBEUI", [PF_SECURITY] = "PF_SECURITY", [PF_KEY] = "PF_KEY", [PF_NETLINK] = "PF_NETLINK/PF_ROUTE", [PF_PACKET] = "PF_PACKET", [PF_ASH] = "PF_ASH", [PF_ECONET] = "PF_ECONET", [PF_ATMSVC] = "PF_ATMSVC", [PF_RDS] = "PF_RDS", [PF_SNA] = "PF_SNA", [PF_IRDA] = "PF_IRDA", [PF_PPPOX] = "PF_PPPOX", [PF_WANPIPE] = "PF_WANPIPE", [PF_LLC] = "PF_LLC", [PF_IB] = "PF_IB", [PF_MPLS] = "PF_MPLS", [PF_CAN] = "PF_CAN", [PF_TIPC] = "PF_TIPC", [PF_BLUETOOTH] = "PF_BLUETOOTH", [PF_IUCV] = "PF_IUCV", [PF_RXRPC] = "PF_RXRPC", [PF_ISDN] = "PF_ISDN", [PF_PHONET] = "PF_PHONET", [PF_IEEE802154] = "PF_IEEE802154", [PF_CAIF] = "PF_CAIF", [PF_ALG] = "PF_ALG", [PF_NFC] = "PF_NFC", [PF_VSOCK] = "PF_VSOCK", [PF_KCM] = "PF_KCM", [PF_QIPCRTR] = "PF_QIPCRTR", [PF_SMC] = "PF_SMC", [PF_XDP] = "PF_XDP", [PF_MCTP] = "PF_MCTP", }; /* * The protocol list. Each protocol is registered in here. */ static DEFINE_SPINLOCK(net_family_lock); static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly; /* * Support routines. * Move socket addresses back and forth across the kernel/user * divide and look after the messy bits. */ /** * move_addr_to_kernel - copy a socket address into kernel space * @uaddr: Address in user space * @kaddr: Address in kernel space * @ulen: Length in user space * * The address is copied into kernel space. If the provided address is * too long an error code of -EINVAL is returned. If the copy gives * invalid addresses -EFAULT is returned. On a success 0 is returned. */ int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr) { if (ulen < 0 || ulen > sizeof(struct sockaddr_storage)) return -EINVAL; if (ulen == 0) return 0; if (copy_from_user(kaddr, uaddr, ulen)) return -EFAULT; return audit_sockaddr(ulen, kaddr); } /** * move_addr_to_user - copy an address to user space * @kaddr: kernel space address * @klen: length of address in kernel * @uaddr: user space address * @ulen: pointer to user length field * * The value pointed to by ulen on entry is the buffer length available. * This is overwritten with the buffer space used. -EINVAL is returned * if an overlong buffer is specified or a negative buffer size. -EFAULT * is returned if either the buffer or the length field are not * accessible. * After copying the data up to the limit the user specifies, the true * length of the data is written over the length limit the user * specified. Zero is returned for a success. */ static int move_addr_to_user(struct sockaddr_storage *kaddr, int klen, void __user *uaddr, int __user *ulen) { int err; int len; BUG_ON(klen > sizeof(struct sockaddr_storage)); err = get_user(len, ulen); if (err) return err; if (len > klen) len = klen; if (len < 0) return -EINVAL; if (len) { if (audit_sockaddr(klen, kaddr)) return -ENOMEM; if (copy_to_user(uaddr, kaddr, len)) return -EFAULT; } /* * "fromlen shall refer to the value before truncation.." * 1003.1g */ return __put_user(klen, ulen); } static struct kmem_cache *sock_inode_cachep __ro_after_init; static struct inode *sock_alloc_inode(struct super_block *sb) { struct socket_alloc *ei; ei = alloc_inode_sb(sb, sock_inode_cachep, GFP_KERNEL); if (!ei) return NULL; init_waitqueue_head(&ei->socket.wq.wait); ei->socket.wq.fasync_list = NULL; ei->socket.wq.flags = 0; ei->socket.state = SS_UNCONNECTED; ei->socket.flags = 0; ei->socket.ops = NULL; ei->socket.sk = NULL; ei->socket.file = NULL; return &ei->vfs_inode; } static void sock_free_inode(struct inode *inode) { struct socket_alloc *ei; ei = container_of(inode, struct socket_alloc, vfs_inode); kmem_cache_free(sock_inode_cachep, ei); } static void init_once(void *foo) { struct socket_alloc *ei = (struct socket_alloc *)foo; inode_init_once(&ei->vfs_inode); } static void init_inodecache(void) { sock_inode_cachep = kmem_cache_create("sock_inode_cache", sizeof(struct socket_alloc), 0, (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT), init_once); BUG_ON(sock_inode_cachep == NULL); } static const struct super_operations sockfs_ops = { .alloc_inode = sock_alloc_inode, .free_inode = sock_free_inode, .statfs = simple_statfs, }; /* * sockfs_dname() is called from d_path(). */ static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen) { return dynamic_dname(buffer, buflen, "socket:[%lu]", d_inode(dentry)->i_ino); } static const struct dentry_operations sockfs_dentry_operations = { .d_dname = sockfs_dname, }; static int sockfs_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *suffix, void *value, size_t size) { if (value) { if (dentry->d_name.len + 1 > size) return -ERANGE; memcpy(value, dentry->d_name.name, dentry->d_name.len + 1); } return dentry->d_name.len + 1; } #define XATTR_SOCKPROTONAME_SUFFIX "sockprotoname" #define XATTR_NAME_SOCKPROTONAME (XATTR_SYSTEM_PREFIX XATTR_SOCKPROTONAME_SUFFIX) #define XATTR_NAME_SOCKPROTONAME_LEN (sizeof(XATTR_NAME_SOCKPROTONAME)-1) static const struct xattr_handler sockfs_xattr_handler = { .name = XATTR_NAME_SOCKPROTONAME, .get = sockfs_xattr_get, }; static int sockfs_security_xattr_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *suffix, const void *value, size_t size, int flags) { /* Handled by LSM. */ return -EAGAIN; } static const struct xattr_handler sockfs_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, .set = sockfs_security_xattr_set, }; static const struct xattr_handler *sockfs_xattr_handlers[] = { &sockfs_xattr_handler, &sockfs_security_xattr_handler, NULL }; static int sockfs_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx = init_pseudo(fc, SOCKFS_MAGIC); if (!ctx) return -ENOMEM; ctx->ops = &sockfs_ops; ctx->dops = &sockfs_dentry_operations; ctx->xattr = sockfs_xattr_handlers; return 0; } static struct vfsmount *sock_mnt __read_mostly; static struct file_system_type sock_fs_type = { .name = "sockfs", .init_fs_context = sockfs_init_fs_context, .kill_sb = kill_anon_super, }; /* * Obtains the first available file descriptor and sets it up for use. * * These functions create file structures and maps them to fd space * of the current process. On success it returns file descriptor * and file struct implicitly stored in sock->file. * Note that another thread may close file descriptor before we return * from this function. We use the fact that now we do not refer * to socket after mapping. If one day we will need it, this * function will increment ref. count on file by 1. * * In any case returned fd MAY BE not valid! * This race condition is unavoidable * with shared fd spaces, we cannot solve it inside kernel, * but we take care of internal coherence yet. */ /** * sock_alloc_file - Bind a &socket to a &file * @sock: socket * @flags: file status flags * @dname: protocol name * * Returns the &file bound with @sock, implicitly storing it * in sock->file. If dname is %NULL, sets to "". * * On failure @sock is released, and an ERR pointer is returned. * * This function uses GFP_KERNEL internally. */ struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname) { struct file *file; if (!dname) dname = sock->sk ? sock->sk->sk_prot_creator->name : ""; file = alloc_file_pseudo(SOCK_INODE(sock), sock_mnt, dname, O_RDWR | (flags & O_NONBLOCK), &socket_file_ops); if (IS_ERR(file)) { sock_release(sock); return file; } file->f_mode |= FMODE_NOWAIT; sock->file = file; file->private_data = sock; stream_open(SOCK_INODE(sock), file); return file; } EXPORT_SYMBOL(sock_alloc_file); static int sock_map_fd(struct socket *sock, int flags) { struct file *newfile; int fd = get_unused_fd_flags(flags); if (unlikely(fd < 0)) { sock_release(sock); return fd; } newfile = sock_alloc_file(sock, flags, NULL); if (!IS_ERR(newfile)) { fd_install(fd, newfile); return fd; } put_unused_fd(fd); return PTR_ERR(newfile); } /** * sock_from_file - Return the &socket bounded to @file. * @file: file * * On failure returns %NULL. */ struct socket *sock_from_file(struct file *file) { if (file->f_op == &socket_file_ops) return file->private_data; /* set in sock_alloc_file */ return NULL; } EXPORT_SYMBOL(sock_from_file); /** * sockfd_lookup - Go from a file number to its socket slot * @fd: file handle * @err: pointer to an error code return * * The file handle passed in is locked and the socket it is bound * to is returned. If an error occurs the err pointer is overwritten * with a negative errno code and NULL is returned. The function checks * for both invalid handles and passing a handle which is not a socket. * * On a success the socket object pointer is returned. */ struct socket *sockfd_lookup(int fd, int *err) { struct file *file; struct socket *sock; file = fget(fd); if (!file) { *err = -EBADF; return NULL; } sock = sock_from_file(file); if (!sock) { *err = -ENOTSOCK; fput(file); } return sock; } EXPORT_SYMBOL(sockfd_lookup); static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed) { struct fd f = fdget(fd); struct socket *sock; *err = -EBADF; if (f.file) { sock = sock_from_file(f.file); if (likely(sock)) { *fput_needed = f.flags & FDPUT_FPUT; return sock; } *err = -ENOTSOCK; fdput(f); } return NULL; } static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer, size_t size) { ssize_t len; ssize_t used = 0; len = security_inode_listsecurity(d_inode(dentry), buffer, size); if (len < 0) return len; used += len; if (buffer) { if (size < used) return -ERANGE; buffer += len; } len = (XATTR_NAME_SOCKPROTONAME_LEN + 1); used += len; if (buffer) { if (size < used) return -ERANGE; memcpy(buffer, XATTR_NAME_SOCKPROTONAME, len); buffer += len; } return used; } static int sockfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { int err = simple_setattr(&nop_mnt_idmap, dentry, iattr); if (!err && (iattr->ia_valid & ATTR_UID)) { struct socket *sock = SOCKET_I(d_inode(dentry)); if (sock->sk) sock->sk->sk_uid = iattr->ia_uid; else err = -ENOENT; } return err; } static const struct inode_operations sockfs_inode_ops = { .listxattr = sockfs_listxattr, .setattr = sockfs_setattr, }; /** * sock_alloc - allocate a socket * * Allocate a new inode and socket object. The two are bound together * and initialised. The socket is then returned. If we are out of inodes * NULL is returned. This functions uses GFP_KERNEL internally. */ struct socket *sock_alloc(void) { struct inode *inode; struct socket *sock; inode = new_inode_pseudo(sock_mnt->mnt_sb); if (!inode) return NULL; sock = SOCKET_I(inode); inode->i_ino = get_next_ino(); inode->i_mode = S_IFSOCK | S_IRWXUGO; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); inode->i_op = &sockfs_inode_ops; return sock; } EXPORT_SYMBOL(sock_alloc); static void __sock_release(struct socket *sock, struct inode *inode) { const struct proto_ops *ops = READ_ONCE(sock->ops); if (ops) { struct module *owner = ops->owner; if (inode) inode_lock(inode); ops->release(sock); sock->sk = NULL; if (inode) inode_unlock(inode); sock->ops = NULL; module_put(owner); } if (sock->wq.fasync_list) pr_err("%s: fasync list not empty!\n", __func__); if (!sock->file) { iput(SOCK_INODE(sock)); return; } sock->file = NULL; } /** * sock_release - close a socket * @sock: socket to close * * The socket is released from the protocol stack if it has a release * callback, and the inode is then released if the socket is bound to * an inode not a file. */ void sock_release(struct socket *sock) { __sock_release(sock, NULL); } EXPORT_SYMBOL(sock_release); void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags) { u8 flags = *tx_flags; if (tsflags & SOF_TIMESTAMPING_TX_HARDWARE) { flags |= SKBTX_HW_TSTAMP; /* PTP hardware clocks can provide a free running cycle counter * as a time base for virtual clocks. Tell driver to use the * free running cycle counter for timestamp if socket is bound * to virtual clock. */ if (tsflags & SOF_TIMESTAMPING_BIND_PHC) flags |= SKBTX_HW_TSTAMP_USE_CYCLES; } if (tsflags & SOF_TIMESTAMPING_TX_SOFTWARE) flags |= SKBTX_SW_TSTAMP; if (tsflags & SOF_TIMESTAMPING_TX_SCHED) flags |= SKBTX_SCHED_TSTAMP; *tx_flags = flags; } EXPORT_SYMBOL(__sock_tx_timestamp); INDIRECT_CALLABLE_DECLARE(int inet_sendmsg(struct socket *, struct msghdr *, size_t)); INDIRECT_CALLABLE_DECLARE(int inet6_sendmsg(struct socket *, struct msghdr *, size_t)); static noinline void call_trace_sock_send_length(struct sock *sk, int ret, int flags) { trace_sock_send_length(sk, ret, 0); } static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) { int ret = INDIRECT_CALL_INET(READ_ONCE(sock->ops)->sendmsg, inet6_sendmsg, inet_sendmsg, sock, msg, msg_data_left(msg)); BUG_ON(ret == -EIOCBQUEUED); if (trace_sock_send_length_enabled()) call_trace_sock_send_length(sock->sk, ret, 0); return ret; } /** * sock_sendmsg - send a message through @sock * @sock: socket * @msg: message to send * * Sends @msg through @sock, passing through LSM. * Returns the number of bytes sent, or an error code. */ int sock_sendmsg(struct socket *sock, struct msghdr *msg) { int err = security_socket_sendmsg(sock, msg, msg_data_left(msg)); return err ?: sock_sendmsg_nosec(sock, msg); } EXPORT_SYMBOL(sock_sendmsg); /** * kernel_sendmsg - send a message through @sock (kernel-space) * @sock: socket * @msg: message header * @vec: kernel vec * @num: vec array length * @size: total message data size * * Builds the message data with @vec and sends it through @sock. * Returns the number of bytes sent, or an error code. */ int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size) { iov_iter_kvec(&msg->msg_iter, ITER_SOURCE, vec, num, size); return sock_sendmsg(sock, msg); } EXPORT_SYMBOL(kernel_sendmsg); /** * kernel_sendmsg_locked - send a message through @sock (kernel-space) * @sk: sock * @msg: message header * @vec: output s/g array * @num: output s/g array length * @size: total message data size * * Builds the message data with @vec and sends it through @sock. * Returns the number of bytes sent, or an error code. * Caller must hold @sk. */ int kernel_sendmsg_locked(struct sock *sk, struct msghdr *msg, struct kvec *vec, size_t num, size_t size) { struct socket *sock = sk->sk_socket; const struct proto_ops *ops = READ_ONCE(sock->ops); if (!ops->sendmsg_locked) return sock_no_sendmsg_locked(sk, msg, size); iov_iter_kvec(&msg->msg_iter, ITER_SOURCE, vec, num, size); return ops->sendmsg_locked(sk, msg, msg_data_left(msg)); } EXPORT_SYMBOL(kernel_sendmsg_locked); static bool skb_is_err_queue(const struct sk_buff *skb) { /* pkt_type of skbs enqueued on the error queue are set to * PACKET_OUTGOING in skb_set_err_queue(). This is only safe to do * in recvmsg, since skbs received on a local socket will never * have a pkt_type of PACKET_OUTGOING. */ return skb->pkt_type == PACKET_OUTGOING; } /* On transmit, software and hardware timestamps are returned independently. * As the two skb clones share the hardware timestamp, which may be updated * before the software timestamp is received, a hardware TX timestamp may be * returned only if there is no software TX timestamp. Ignore false software * timestamps, which may be made in the __sock_recv_timestamp() call when the * option SO_TIMESTAMP_OLD(NS) is enabled on the socket, even when the skb has a * hardware timestamp. */ static bool skb_is_swtx_tstamp(const struct sk_buff *skb, int false_tstamp) { return skb->tstamp && !false_tstamp && skb_is_err_queue(skb); } static ktime_t get_timestamp(struct sock *sk, struct sk_buff *skb, int *if_index) { bool cycles = READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_BIND_PHC; struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); struct net_device *orig_dev; ktime_t hwtstamp; rcu_read_lock(); orig_dev = dev_get_by_napi_id(skb_napi_id(skb)); if (orig_dev) { *if_index = orig_dev->ifindex; hwtstamp = netdev_get_tstamp(orig_dev, shhwtstamps, cycles); } else { hwtstamp = shhwtstamps->hwtstamp; } rcu_read_unlock(); return hwtstamp; } static void put_ts_pktinfo(struct msghdr *msg, struct sk_buff *skb, int if_index) { struct scm_ts_pktinfo ts_pktinfo; struct net_device *orig_dev; if (!skb_mac_header_was_set(skb)) return; memset(&ts_pktinfo, 0, sizeof(ts_pktinfo)); if (!if_index) { rcu_read_lock(); orig_dev = dev_get_by_napi_id(skb_napi_id(skb)); if (orig_dev) if_index = orig_dev->ifindex; rcu_read_unlock(); } ts_pktinfo.if_index = if_index; ts_pktinfo.pkt_length = skb->len - skb_mac_offset(skb); put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_PKTINFO, sizeof(ts_pktinfo), &ts_pktinfo); } /* * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP) */ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP); int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW); struct scm_timestamping_internal tss; int empty = 1, false_tstamp = 0; struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); int if_index; ktime_t hwtstamp; u32 tsflags; /* Race occurred between timestamp enabling and packet receiving. Fill in the current time for now. */ if (need_software_tstamp && skb->tstamp == 0) { __net_timestamp(skb); false_tstamp = 1; } if (need_software_tstamp) { if (!sock_flag(sk, SOCK_RCVTSTAMPNS)) { if (new_tstamp) { struct __kernel_sock_timeval tv; skb_get_new_timestamp(skb, &tv); put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW, sizeof(tv), &tv); } else { struct __kernel_old_timeval tv; skb_get_timestamp(skb, &tv); put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD, sizeof(tv), &tv); } } else { if (new_tstamp) { struct __kernel_timespec ts; skb_get_new_timestampns(skb, &ts); put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW, sizeof(ts), &ts); } else { struct __kernel_old_timespec ts; skb_get_timestampns(skb, &ts); put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD, sizeof(ts), &ts); } } } memset(&tss, 0, sizeof(tss)); tsflags = READ_ONCE(sk->sk_tsflags); if ((tsflags & SOF_TIMESTAMPING_SOFTWARE) && ktime_to_timespec64_cond(skb->tstamp, tss.ts + 0)) empty = 0; if (shhwtstamps && (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) && !skb_is_swtx_tstamp(skb, false_tstamp)) { if_index = 0; if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NETDEV) hwtstamp = get_timestamp(sk, skb, &if_index); else hwtstamp = shhwtstamps->hwtstamp; if (tsflags & SOF_TIMESTAMPING_BIND_PHC) hwtstamp = ptp_convert_timestamp(&hwtstamp, READ_ONCE(sk->sk_bind_phc)); if (ktime_to_timespec64_cond(hwtstamp, tss.ts + 2)) { empty = 0; if ((tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) && !skb_is_err_queue(skb)) put_ts_pktinfo(msg, skb, if_index); } } if (!empty) { if (sock_flag(sk, SOCK_TSTAMP_NEW)) put_cmsg_scm_timestamping64(msg, &tss); else put_cmsg_scm_timestamping(msg, &tss); if (skb_is_err_queue(skb) && skb->len && SKB_EXT_ERR(skb)->opt_stats) put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_OPT_STATS, skb->len, skb->data); } } EXPORT_SYMBOL_GPL(__sock_recv_timestamp); #ifdef CONFIG_WIRELESS void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { int ack; if (!sock_flag(sk, SOCK_WIFI_STATUS)) return; if (!skb->wifi_acked_valid) return; ack = skb->wifi_acked; put_cmsg(msg, SOL_SOCKET, SCM_WIFI_STATUS, sizeof(ack), &ack); } EXPORT_SYMBOL_GPL(__sock_recv_wifi_status); #endif static inline void sock_recv_drops(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { if (sock_flag(sk, SOCK_RXQ_OVFL) && skb && SOCK_SKB_CB(skb)->dropcount) put_cmsg(msg, SOL_SOCKET, SO_RXQ_OVFL, sizeof(__u32), &SOCK_SKB_CB(skb)->dropcount); } static void sock_recv_mark(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { if (sock_flag(sk, SOCK_RCVMARK) && skb) { /* We must use a bounce buffer for CONFIG_HARDENED_USERCOPY=y */ __u32 mark = skb->mark; put_cmsg(msg, SOL_SOCKET, SO_MARK, sizeof(__u32), &mark); } } void __sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { sock_recv_timestamp(msg, sk, skb); sock_recv_drops(msg, sk, skb); sock_recv_mark(msg, sk, skb); } EXPORT_SYMBOL_GPL(__sock_recv_cmsgs); INDIRECT_CALLABLE_DECLARE(int inet_recvmsg(struct socket *, struct msghdr *, size_t, int)); INDIRECT_CALLABLE_DECLARE(int inet6_recvmsg(struct socket *, struct msghdr *, size_t, int)); static noinline void call_trace_sock_recv_length(struct sock *sk, int ret, int flags) { trace_sock_recv_length(sk, ret, flags); } static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg, int flags) { int ret = INDIRECT_CALL_INET(READ_ONCE(sock->ops)->recvmsg, inet6_recvmsg, inet_recvmsg, sock, msg, msg_data_left(msg), flags); if (trace_sock_recv_length_enabled()) call_trace_sock_recv_length(sock->sk, ret, flags); return ret; } /** * sock_recvmsg - receive a message from @sock * @sock: socket * @msg: message to receive * @flags: message flags * * Receives @msg from @sock, passing through LSM. Returns the total number * of bytes received, or an error. */ int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags) { int err = security_socket_recvmsg(sock, msg, msg_data_left(msg), flags); return err ?: sock_recvmsg_nosec(sock, msg, flags); } EXPORT_SYMBOL(sock_recvmsg); /** * kernel_recvmsg - Receive a message from a socket (kernel space) * @sock: The socket to receive the message from * @msg: Received message * @vec: Input s/g array for message data * @num: Size of input s/g array * @size: Number of bytes to read * @flags: Message flags (MSG_DONTWAIT, etc...) * * On return the msg structure contains the scatter/gather array passed in the * vec argument. The array is modified so that it consists of the unfilled * portion of the original array. * * The returned value is the total number of bytes received, or an error. */ int kernel_recvmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size, int flags) { msg->msg_control_is_user = false; iov_iter_kvec(&msg->msg_iter, ITER_DEST, vec, num, size); return sock_recvmsg(sock, msg, flags); } EXPORT_SYMBOL(kernel_recvmsg); static ssize_t sock_splice_read(struct file *file, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct socket *sock = file->private_data; const struct proto_ops *ops; ops = READ_ONCE(sock->ops); if (unlikely(!ops->splice_read)) return copy_splice_read(file, ppos, pipe, len, flags); return ops->splice_read(sock, ppos, pipe, len, flags); } static void sock_splice_eof(struct file *file) { struct socket *sock = file->private_data; const struct proto_ops *ops; ops = READ_ONCE(sock->ops); if (ops->splice_eof) ops->splice_eof(sock); } static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct socket *sock = file->private_data; struct msghdr msg = {.msg_iter = *to, .msg_iocb = iocb}; ssize_t res; if (file->f_flags & O_NONBLOCK || (iocb->ki_flags & IOCB_NOWAIT)) msg.msg_flags = MSG_DONTWAIT; if (iocb->ki_pos != 0) return -ESPIPE; if (!iov_iter_count(to)) /* Match SYS5 behaviour */ return 0; res = sock_recvmsg(sock, &msg, msg.msg_flags); *to = msg.msg_iter; return res; } static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct socket *sock = file->private_data; struct msghdr msg = {.msg_iter = *from, .msg_iocb = iocb}; ssize_t res; if (iocb->ki_pos != 0) return -ESPIPE; if (file->f_flags & O_NONBLOCK || (iocb->ki_flags & IOCB_NOWAIT)) msg.msg_flags = MSG_DONTWAIT; if (sock->type == SOCK_SEQPACKET) msg.msg_flags |= MSG_EOR; res = sock_sendmsg(sock, &msg); *from = msg.msg_iter; return res; } /* * Atomic setting of ioctl hooks to avoid race * with module unload. */ static DEFINE_MUTEX(br_ioctl_mutex); static int (*br_ioctl_hook)(struct net *net, struct net_bridge *br, unsigned int cmd, struct ifreq *ifr, void __user *uarg); void brioctl_set(int (*hook)(struct net *net, struct net_bridge *br, unsigned int cmd, struct ifreq *ifr, void __user *uarg)) { mutex_lock(&br_ioctl_mutex); br_ioctl_hook = hook; mutex_unlock(&br_ioctl_mutex); } EXPORT_SYMBOL(brioctl_set); int br_ioctl_call(struct net *net, struct net_bridge *br, unsigned int cmd, struct ifreq *ifr, void __user *uarg) { int err = -ENOPKG; if (!br_ioctl_hook) request_module("bridge"); mutex_lock(&br_ioctl_mutex); if (br_ioctl_hook) err = br_ioctl_hook(net, br, cmd, ifr, uarg); mutex_unlock(&br_ioctl_mutex); return err; } static DEFINE_MUTEX(vlan_ioctl_mutex); static int (*vlan_ioctl_hook) (struct net *, void __user *arg); void vlan_ioctl_set(int (*hook) (struct net *, void __user *)) { mutex_lock(&vlan_ioctl_mutex); vlan_ioctl_hook = hook; mutex_unlock(&vlan_ioctl_mutex); } EXPORT_SYMBOL(vlan_ioctl_set); static long sock_do_ioctl(struct net *net, struct socket *sock, unsigned int cmd, unsigned long arg) { const struct proto_ops *ops = READ_ONCE(sock->ops); struct ifreq ifr; bool need_copyout; int err; void __user *argp = (void __user *)arg; void __user *data; err = ops->ioctl(sock, cmd, arg); /* * If this ioctl is unknown try to hand it down * to the NIC driver. */ if (err != -ENOIOCTLCMD) return err; if (!is_socket_ioctl_cmd(cmd)) return -ENOTTY; if (get_user_ifreq(&ifr, &data, argp)) return -EFAULT; err = dev_ioctl(net, cmd, &ifr, data, &need_copyout); if (!err && need_copyout) if (put_user_ifreq(&ifr, argp)) return -EFAULT; return err; } /* * With an ioctl, arg may well be a user mode pointer, but we don't know * what to do with it - that's up to the protocol still. */ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) { const struct proto_ops *ops; struct socket *sock; struct sock *sk; void __user *argp = (void __user *)arg; int pid, err; struct net *net; sock = file->private_data; ops = READ_ONCE(sock->ops); sk = sock->sk; net = sock_net(sk); if (unlikely(cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15))) { struct ifreq ifr; void __user *data; bool need_copyout; if (get_user_ifreq(&ifr, &data, argp)) return -EFAULT; err = dev_ioctl(net, cmd, &ifr, data, &need_copyout); if (!err && need_copyout) if (put_user_ifreq(&ifr, argp)) return -EFAULT; } else #ifdef CONFIG_WEXT_CORE if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { err = wext_handle_ioctl(net, cmd, argp); } else #endif switch (cmd) { case FIOSETOWN: case SIOCSPGRP: err = -EFAULT; if (get_user(pid, (int __user *)argp)) break; err = f_setown(sock->file, pid, 1); break; case FIOGETOWN: case SIOCGPGRP: err = put_user(f_getown(sock->file), (int __user *)argp); break; case SIOCGIFBR: case SIOCSIFBR: case SIOCBRADDBR: case SIOCBRDELBR: err = br_ioctl_call(net, NULL, cmd, NULL, argp); break; case SIOCGIFVLAN: case SIOCSIFVLAN: err = -ENOPKG; if (!vlan_ioctl_hook) request_module("8021q"); mutex_lock(&vlan_ioctl_mutex); if (vlan_ioctl_hook) err = vlan_ioctl_hook(net, argp); mutex_unlock(&vlan_ioctl_mutex); break; case SIOCGSKNS: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = open_related_ns(&net->ns, get_net_ns); break; case SIOCGSTAMP_OLD: case SIOCGSTAMPNS_OLD: if (!ops->gettstamp) { err = -ENOIOCTLCMD; break; } err = ops->gettstamp(sock, argp, cmd == SIOCGSTAMP_OLD, !IS_ENABLED(CONFIG_64BIT)); break; case SIOCGSTAMP_NEW: case SIOCGSTAMPNS_NEW: if (!ops->gettstamp) { err = -ENOIOCTLCMD; break; } err = ops->gettstamp(sock, argp, cmd == SIOCGSTAMP_NEW, false); break; case SIOCGIFCONF: err = dev_ifconf(net, argp); break; default: err = sock_do_ioctl(net, sock, cmd, arg); break; } return err; } /** * sock_create_lite - creates a socket * @family: protocol family (AF_INET, ...) * @type: communication type (SOCK_STREAM, ...) * @protocol: protocol (0, ...) * @res: new socket * * Creates a new socket and assigns it to @res, passing through LSM. * The new socket initialization is not complete, see kernel_accept(). * Returns 0 or an error. On failure @res is set to %NULL. * This function internally uses GFP_KERNEL. */ int sock_create_lite(int family, int type, int protocol, struct socket **res) { int err; struct socket *sock = NULL; err = security_socket_create(family, type, protocol, 1); if (err) goto out; sock = sock_alloc(); if (!sock) { err = -ENOMEM; goto out; } sock->type = type; err = security_socket_post_create(sock, family, type, protocol, 1); if (err) goto out_release; out: *res = sock; return err; out_release: sock_release(sock); sock = NULL; goto out; } EXPORT_SYMBOL(sock_create_lite); /* No kernel lock held - perfect */ static __poll_t sock_poll(struct file *file, poll_table *wait) { struct socket *sock = file->private_data; const struct proto_ops *ops = READ_ONCE(sock->ops); __poll_t events = poll_requested_events(wait), flag = 0; if (!ops->poll) return 0; if (sk_can_busy_loop(sock->sk)) { /* poll once if requested by the syscall */ if (events & POLL_BUSY_LOOP) sk_busy_loop(sock->sk, 1); /* if this socket can poll_ll, tell the system call */ flag = POLL_BUSY_LOOP; } return ops->poll(file, sock, wait) | flag; } static int sock_mmap(struct file *file, struct vm_area_struct *vma) { struct socket *sock = file->private_data; return READ_ONCE(sock->ops)->mmap(file, sock, vma); } static int sock_close(struct inode *inode, struct file *filp) { __sock_release(SOCKET_I(inode), inode); return 0; } /* * Update the socket async list * * Fasync_list locking strategy. * * 1. fasync_list is modified only under process context socket lock * i.e. under semaphore. * 2. fasync_list is used under read_lock(&sk->sk_callback_lock) * or under socket lock */ static int sock_fasync(int fd, struct file *filp, int on) { struct socket *sock = filp->private_data; struct sock *sk = sock->sk; struct socket_wq *wq = &sock->wq; if (sk == NULL) return -EINVAL; lock_sock(sk); fasync_helper(fd, filp, on, &wq->fasync_list); if (!wq->fasync_list) sock_reset_flag(sk, SOCK_FASYNC); else sock_set_flag(sk, SOCK_FASYNC); release_sock(sk); return 0; } /* This function may be called only under rcu_lock */ int sock_wake_async(struct socket_wq *wq, int how, int band) { if (!wq || !wq->fasync_list) return -1; switch (how) { case SOCK_WAKE_WAITD: if (test_bit(SOCKWQ_ASYNC_WAITDATA, &wq->flags)) break; goto call_kill; case SOCK_WAKE_SPACE: if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags)) break; fallthrough; case SOCK_WAKE_IO: call_kill: kill_fasync(&wq->fasync_list, SIGIO, band); break; case SOCK_WAKE_URG: kill_fasync(&wq->fasync_list, SIGURG, band); } return 0; } EXPORT_SYMBOL(sock_wake_async); /** * __sock_create - creates a socket * @net: net namespace * @family: protocol family (AF_INET, ...) * @type: communication type (SOCK_STREAM, ...) * @protocol: protocol (0, ...) * @res: new socket * @kern: boolean for kernel space sockets * * Creates a new socket and assigns it to @res, passing through LSM. * Returns 0 or an error. On failure @res is set to %NULL. @kern must * be set to true if the socket resides in kernel space. * This function internally uses GFP_KERNEL. */ int __sock_create(struct net *net, int family, int type, int protocol, struct socket **res, int kern) { int err; struct socket *sock; const struct net_proto_family *pf; /* * Check protocol is in range */ if (family < 0 || family >= NPROTO) return -EAFNOSUPPORT; if (type < 0 || type >= SOCK_MAX) return -EINVAL; /* Compatibility. This uglymoron is moved from INET layer to here to avoid deadlock in module load. */ if (family == PF_INET && type == SOCK_PACKET) { pr_info_once("%s uses obsolete (PF_INET,SOCK_PACKET)\n", current->comm); family = PF_PACKET; } err = security_socket_create(family, type, protocol, kern); if (err) return err; /* * Allocate the socket and allow the family to set things up. if * the protocol is 0, the family is instructed to select an appropriate * default. */ sock = sock_alloc(); if (!sock) { net_warn_ratelimited("socket: no more sockets\n"); return -ENFILE; /* Not exactly a match, but its the closest posix thing */ } sock->type = type; #ifdef CONFIG_MODULES /* Attempt to load a protocol module if the find failed. * * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user * requested real, full-featured networking support upon configuration. * Otherwise module support will break! */ if (rcu_access_pointer(net_families[family]) == NULL) request_module("net-pf-%d", family); #endif rcu_read_lock(); pf = rcu_dereference(net_families[family]); err = -EAFNOSUPPORT; if (!pf) goto out_release; /* * We will call the ->create function, that possibly is in a loadable * module, so we have to bump that loadable module refcnt first. */ if (!try_module_get(pf->owner)) goto out_release; /* Now protected by module ref count */ rcu_read_unlock(); err = pf->create(net, sock, protocol, kern); if (err < 0) goto out_module_put; /* * Now to bump the refcnt of the [loadable] module that owns this * socket at sock_release time we decrement its refcnt. */ if (!try_module_get(sock->ops->owner)) goto out_module_busy; /* * Now that we're done with the ->create function, the [loadable] * module can have its refcnt decremented */ module_put(pf->owner); err = security_socket_post_create(sock, family, type, protocol, kern); if (err) goto out_sock_release; *res = sock; return 0; out_module_busy: err = -EAFNOSUPPORT; out_module_put: sock->ops = NULL; module_put(pf->owner); out_sock_release: sock_release(sock); return err; out_release: rcu_read_unlock(); goto out_sock_release; } EXPORT_SYMBOL(__sock_create); /** * sock_create - creates a socket * @family: protocol family (AF_INET, ...) * @type: communication type (SOCK_STREAM, ...) * @protocol: protocol (0, ...) * @res: new socket * * A wrapper around __sock_create(). * Returns 0 or an error. This function internally uses GFP_KERNEL. */ int sock_create(int family, int type, int protocol, struct socket **res) { return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0); } EXPORT_SYMBOL(sock_create); /** * sock_create_kern - creates a socket (kernel space) * @net: net namespace * @family: protocol family (AF_INET, ...) * @type: communication type (SOCK_STREAM, ...) * @protocol: protocol (0, ...) * @res: new socket * * A wrapper around __sock_create(). * Returns 0 or an error. This function internally uses GFP_KERNEL. */ int sock_create_kern(struct net *net, int family, int type, int protocol, struct socket **res) { return __sock_create(net, family, type, protocol, res, 1); } EXPORT_SYMBOL(sock_create_kern); static struct socket *__sys_socket_create(int family, int type, int protocol) { struct socket *sock; int retval; /* Check the SOCK_* constants for consistency. */ BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK); BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK); BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK); if ((type & ~SOCK_TYPE_MASK) & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return ERR_PTR(-EINVAL); type &= SOCK_TYPE_MASK; retval = sock_create(family, type, protocol, &sock); if (retval < 0) return ERR_PTR(retval); return sock; } struct file *__sys_socket_file(int family, int type, int protocol) { struct socket *sock; int flags; sock = __sys_socket_create(family, type, protocol); if (IS_ERR(sock)) return ERR_CAST(sock); flags = type & ~SOCK_TYPE_MASK; if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; return sock_alloc_file(sock, flags, NULL); } /* A hook for bpf progs to attach to and update socket protocol. * * A static noinline declaration here could cause the compiler to * optimize away the function. A global noinline declaration will * keep the definition, but may optimize away the callsite. * Therefore, __weak is needed to ensure that the call is still * emitted, by telling the compiler that we don't know what the * function might eventually be. * * __diag_* below are needed to dismiss the missing prototype warning. */ __diag_push(); __diag_ignore_all("-Wmissing-prototypes", "A fmod_ret entry point for BPF programs"); __weak noinline int update_socket_protocol(int family, int type, int protocol) { return protocol; } __diag_pop(); int __sys_socket(int family, int type, int protocol) { struct socket *sock; int flags; sock = __sys_socket_create(family, type, update_socket_protocol(family, type, protocol)); if (IS_ERR(sock)) return PTR_ERR(sock); flags = type & ~SOCK_TYPE_MASK; if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; return sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK)); } SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol) { return __sys_socket(family, type, protocol); } /* * Create a pair of connected sockets. */ int __sys_socketpair(int family, int type, int protocol, int __user *usockvec) { struct socket *sock1, *sock2; int fd1, fd2, err; struct file *newfile1, *newfile2; int flags; flags = type & ~SOCK_TYPE_MASK; if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return -EINVAL; type &= SOCK_TYPE_MASK; if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; /* * reserve descriptors and make sure we won't fail * to return them to userland. */ fd1 = get_unused_fd_flags(flags); if (unlikely(fd1 < 0)) return fd1; fd2 = get_unused_fd_flags(flags); if (unlikely(fd2 < 0)) { put_unused_fd(fd1); return fd2; } err = put_user(fd1, &usockvec[0]); if (err) goto out; err = put_user(fd2, &usockvec[1]); if (err) goto out; /* * Obtain the first socket and check if the underlying protocol * supports the socketpair call. */ err = sock_create(family, type, protocol, &sock1); if (unlikely(err < 0)) goto out; err = sock_create(family, type, protocol, &sock2); if (unlikely(err < 0)) { sock_release(sock1); goto out; } err = security_socket_socketpair(sock1, sock2); if (unlikely(err)) { sock_release(sock2); sock_release(sock1); goto out; } err = READ_ONCE(sock1->ops)->socketpair(sock1, sock2); if (unlikely(err < 0)) { sock_release(sock2); sock_release(sock1); goto out; } newfile1 = sock_alloc_file(sock1, flags, NULL); if (IS_ERR(newfile1)) { err = PTR_ERR(newfile1); sock_release(sock2); goto out; } newfile2 = sock_alloc_file(sock2, flags, NULL); if (IS_ERR(newfile2)) { err = PTR_ERR(newfile2); fput(newfile1); goto out; } audit_fd_pair(fd1, fd2); fd_install(fd1, newfile1); fd_install(fd2, newfile2); return 0; out: put_unused_fd(fd2); put_unused_fd(fd1); return err; } SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol, int __user *, usockvec) { return __sys_socketpair(family, type, protocol, usockvec); } /* * Bind a name to a socket. Nothing much to do here since it's * the protocol's responsibility to handle the local address. * * We move the socket address to kernel space before we call * the protocol layer (having also checked the address is ok). */ int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen) { struct socket *sock; struct sockaddr_storage address; int err, fput_needed; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock) { err = move_addr_to_kernel(umyaddr, addrlen, &address); if (!err) { err = security_socket_bind(sock, (struct sockaddr *)&address, addrlen); if (!err) err = READ_ONCE(sock->ops)->bind(sock, (struct sockaddr *) &address, addrlen); } fput_light(sock->file, fput_needed); } return err; } SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen) { return __sys_bind(fd, umyaddr, addrlen); } /* * Perform a listen. Basically, we allow the protocol to do anything * necessary for a listen, and if that works, we mark the socket as * ready for listening. */ int __sys_listen(int fd, int backlog) { struct socket *sock; int err, fput_needed; int somaxconn; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock) { somaxconn = READ_ONCE(sock_net(sock->sk)->core.sysctl_somaxconn); if ((unsigned int)backlog > somaxconn) backlog = somaxconn; err = security_socket_listen(sock, backlog); if (!err) err = READ_ONCE(sock->ops)->listen(sock, backlog); fput_light(sock->file, fput_needed); } return err; } SYSCALL_DEFINE2(listen, int, fd, int, backlog) { return __sys_listen(fd, backlog); } struct file *do_accept(struct file *file, unsigned file_flags, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags) { struct socket *sock, *newsock; struct file *newfile; int err, len; struct sockaddr_storage address; const struct proto_ops *ops; sock = sock_from_file(file); if (!sock) return ERR_PTR(-ENOTSOCK); newsock = sock_alloc(); if (!newsock) return ERR_PTR(-ENFILE); ops = READ_ONCE(sock->ops); newsock->type = sock->type; newsock->ops = ops; /* * We don't need try_module_get here, as the listening socket (sock) * has the protocol module (sock->ops->owner) held. */ __module_get(ops->owner); newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name); if (IS_ERR(newfile)) return newfile; err = security_socket_accept(sock, newsock); if (err) goto out_fd; err = ops->accept(sock, newsock, sock->file->f_flags | file_flags, false); if (err < 0) goto out_fd; if (upeer_sockaddr) { len = ops->getname(newsock, (struct sockaddr *)&address, 2); if (len < 0) { err = -ECONNABORTED; goto out_fd; } err = move_addr_to_user(&address, len, upeer_sockaddr, upeer_addrlen); if (err < 0) goto out_fd; } /* File flags are not inherited via accept() unlike another OSes. */ return newfile; out_fd: fput(newfile); return ERR_PTR(err); } static int __sys_accept4_file(struct file *file, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags) { struct file *newfile; int newfd; if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return -EINVAL; if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; newfd = get_unused_fd_flags(flags); if (unlikely(newfd < 0)) return newfd; newfile = do_accept(file, 0, upeer_sockaddr, upeer_addrlen, flags); if (IS_ERR(newfile)) { put_unused_fd(newfd); return PTR_ERR(newfile); } fd_install(newfd, newfile); return newfd; } /* * For accept, we attempt to create a new socket, set up the link * with the client, wake up the client, then return the new * connected fd. We collect the address of the connector in kernel * space and move it to user at the very end. This is unclean because * we open the socket then return an error. * * 1003.1g adds the ability to recvmsg() to query connection pending * status to recvmsg. We need to add that support in a way thats * clean when we restructure accept also. */ int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags) { int ret = -EBADF; struct fd f; f = fdget(fd); if (f.file) { ret = __sys_accept4_file(f.file, upeer_sockaddr, upeer_addrlen, flags); fdput(f); } return ret; } SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr, int __user *, upeer_addrlen, int, flags) { return __sys_accept4(fd, upeer_sockaddr, upeer_addrlen, flags); } SYSCALL_DEFINE3(accept, int, fd, struct sockaddr __user *, upeer_sockaddr, int __user *, upeer_addrlen) { return __sys_accept4(fd, upeer_sockaddr, upeer_addrlen, 0); } /* * Attempt to connect to a socket with the server address. The address * is in user space so we verify it is OK and move it to kernel space. * * For 1003.1g we need to add clean support for a bind to AF_UNSPEC to * break bindings * * NOTE: 1003.1g draft 6.3 is broken with respect to AX.25/NetROM and * other SEQPACKET protocols that take time to connect() as it doesn't * include the -EINPROGRESS status for such sockets. */ int __sys_connect_file(struct file *file, struct sockaddr_storage *address, int addrlen, int file_flags) { struct socket *sock; int err; sock = sock_from_file(file); if (!sock) { err = -ENOTSOCK; goto out; } err = security_socket_connect(sock, (struct sockaddr *)address, addrlen); if (err) goto out; err = READ_ONCE(sock->ops)->connect(sock, (struct sockaddr *)address, addrlen, sock->file->f_flags | file_flags); out: return err; } int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen) { int ret = -EBADF; struct fd f; f = fdget(fd); if (f.file) { struct sockaddr_storage address; ret = move_addr_to_kernel(uservaddr, addrlen, &address); if (!ret) ret = __sys_connect_file(f.file, &address, addrlen, 0); fdput(f); } return ret; } SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr, int, addrlen) { return __sys_connect(fd, uservaddr, addrlen); } /* * Get the local address ('name') of a socket object. Move the obtained * name to user space. */ int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, int __user *usockaddr_len) { struct socket *sock; struct sockaddr_storage address; int err, fput_needed; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; err = security_socket_getsockname(sock); if (err) goto out_put; err = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 0); if (err < 0) goto out_put; /* "err" is actually length in this case */ err = move_addr_to_user(&address, err, usockaddr, usockaddr_len); out_put: fput_light(sock->file, fput_needed); out: return err; } SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr, int __user *, usockaddr_len) { return __sys_getsockname(fd, usockaddr, usockaddr_len); } /* * Get the remote address ('name') of a socket object. Move the obtained * name to user space. */ int __sys_getpeername(int fd, struct sockaddr __user *usockaddr, int __user *usockaddr_len) { struct socket *sock; struct sockaddr_storage address; int err, fput_needed; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock != NULL) { const struct proto_ops *ops = READ_ONCE(sock->ops); err = security_socket_getpeername(sock); if (err) { fput_light(sock->file, fput_needed); return err; } err = ops->getname(sock, (struct sockaddr *)&address, 1); if (err >= 0) /* "err" is actually length in this case */ err = move_addr_to_user(&address, err, usockaddr, usockaddr_len); fput_light(sock->file, fput_needed); } return err; } SYSCALL_DEFINE3(getpeername, int, fd, struct sockaddr __user *, usockaddr, int __user *, usockaddr_len) { return __sys_getpeername(fd, usockaddr, usockaddr_len); } /* * Send a datagram to a given address. We move the address into kernel * space and check the user space data area is readable before invoking * the protocol. */ int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags, struct sockaddr __user *addr, int addr_len) { struct socket *sock; struct sockaddr_storage address; int err; struct msghdr msg; struct iovec iov; int fput_needed; err = import_single_range(ITER_SOURCE, buff, len, &iov, &msg.msg_iter); if (unlikely(err)) return err; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; msg.msg_name = NULL; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_namelen = 0; msg.msg_ubuf = NULL; if (addr) { err = move_addr_to_kernel(addr, addr_len, &address); if (err < 0) goto out_put; msg.msg_name = (struct sockaddr *)&address; msg.msg_namelen = addr_len; } flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; msg.msg_flags = flags; err = sock_sendmsg(sock, &msg); out_put: fput_light(sock->file, fput_needed); out: return err; } SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len, unsigned int, flags, struct sockaddr __user *, addr, int, addr_len) { return __sys_sendto(fd, buff, len, flags, addr, addr_len); } /* * Send a datagram down a socket. */ SYSCALL_DEFINE4(send, int, fd, void __user *, buff, size_t, len, unsigned int, flags) { return __sys_sendto(fd, buff, len, flags, NULL, 0); } /* * Receive a frame from the socket and optionally record the address of the * sender. We verify the buffers are writable and if needed move the * sender address from kernel to user space. */ int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags, struct sockaddr __user *addr, int __user *addr_len) { struct sockaddr_storage address; struct msghdr msg = { /* Save some cycles and don't copy the address if not needed */ .msg_name = addr ? (struct sockaddr *)&address : NULL, }; struct socket *sock; struct iovec iov; int err, err2; int fput_needed; err = import_single_range(ITER_DEST, ubuf, size, &iov, &msg.msg_iter); if (unlikely(err)) return err; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; err = sock_recvmsg(sock, &msg, flags); if (err >= 0 && addr != NULL) { err2 = move_addr_to_user(&address, msg.msg_namelen, addr, addr_len); if (err2 < 0) err = err2; } fput_light(sock->file, fput_needed); out: return err; } SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size, unsigned int, flags, struct sockaddr __user *, addr, int __user *, addr_len) { return __sys_recvfrom(fd, ubuf, size, flags, addr, addr_len); } /* * Receive a datagram from a socket. */ SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size, unsigned int, flags) { return __sys_recvfrom(fd, ubuf, size, flags, NULL, NULL); } static bool sock_use_custom_sol_socket(const struct socket *sock) { return test_bit(SOCK_CUSTOM_SOCKOPT, &sock->flags); } /* * Set a socket option. Because we don't know the option lengths we have * to pass the user mode parameter for the protocols to sort out. */ int __sys_setsockopt(int fd, int level, int optname, char __user *user_optval, int optlen) { sockptr_t optval = USER_SOCKPTR(user_optval); const struct proto_ops *ops; char *kernel_optval = NULL; int err, fput_needed; struct socket *sock; if (optlen < 0) return -EINVAL; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) return err; err = security_socket_setsockopt(sock, level, optname); if (err) goto out_put; if (!in_compat_syscall()) err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level, &optname, user_optval, &optlen, &kernel_optval); if (err < 0) goto out_put; if (err > 0) { err = 0; goto out_put; } if (kernel_optval) optval = KERNEL_SOCKPTR(kernel_optval); ops = READ_ONCE(sock->ops); if (level == SOL_SOCKET && !sock_use_custom_sol_socket(sock)) err = sock_setsockopt(sock, level, optname, optval, optlen); else if (unlikely(!ops->setsockopt)) err = -EOPNOTSUPP; else err = ops->setsockopt(sock, level, optname, optval, optlen); kfree(kernel_optval); out_put: fput_light(sock->file, fput_needed); return err; } SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname, char __user *, optval, int, optlen) { return __sys_setsockopt(fd, level, optname, optval, optlen); } INDIRECT_CALLABLE_DECLARE(bool tcp_bpf_bypass_getsockopt(int level, int optname)); /* * Get a socket option. Because we don't know the option lengths we have * to pass a user mode parameter for the protocols to sort out. */ int __sys_getsockopt(int fd, int level, int optname, char __user *optval, int __user *optlen) { int max_optlen __maybe_unused; const struct proto_ops *ops; int err, fput_needed; struct socket *sock; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) return err; err = security_socket_getsockopt(sock, level, optname); if (err) goto out_put; if (!in_compat_syscall()) max_optlen = BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen); ops = READ_ONCE(sock->ops); if (level == SOL_SOCKET) err = sock_getsockopt(sock, level, optname, optval, optlen); else if (unlikely(!ops->getsockopt)) err = -EOPNOTSUPP; else err = ops->getsockopt(sock, level, optname, optval, optlen); if (!in_compat_syscall()) err = BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname, optval, optlen, max_optlen, err); out_put: fput_light(sock->file, fput_needed); return err; } SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname, char __user *, optval, int __user *, optlen) { return __sys_getsockopt(fd, level, optname, optval, optlen); } /* * Shutdown a socket. */ int __sys_shutdown_sock(struct socket *sock, int how) { int err; err = security_socket_shutdown(sock, how); if (!err) err = READ_ONCE(sock->ops)->shutdown(sock, how); return err; } int __sys_shutdown(int fd, int how) { int err, fput_needed; struct socket *sock; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock != NULL) { err = __sys_shutdown_sock(sock, how); fput_light(sock->file, fput_needed); } return err; } SYSCALL_DEFINE2(shutdown, int, fd, int, how) { return __sys_shutdown(fd, how); } /* A couple of helpful macros for getting the address of the 32/64 bit * fields which are the same type (int / unsigned) on our platforms. */ #define COMPAT_MSG(msg, member) ((MSG_CMSG_COMPAT & flags) ? &msg##_compat->member : &msg->member) #define COMPAT_NAMELEN(msg) COMPAT_MSG(msg, msg_namelen) #define COMPAT_FLAGS(msg) COMPAT_MSG(msg, msg_flags) struct used_address { struct sockaddr_storage name; unsigned int name_len; }; int __copy_msghdr(struct msghdr *kmsg, struct user_msghdr *msg, struct sockaddr __user **save_addr) { ssize_t err; kmsg->msg_control_is_user = true; kmsg->msg_get_inq = 0; kmsg->msg_control_user = msg->msg_control; kmsg->msg_controllen = msg->msg_controllen; kmsg->msg_flags = msg->msg_flags; kmsg->msg_namelen = msg->msg_namelen; if (!msg->msg_name) kmsg->msg_namelen = 0; if (kmsg->msg_namelen < 0) return -EINVAL; if (kmsg->msg_namelen > sizeof(struct sockaddr_storage)) kmsg->msg_namelen = sizeof(struct sockaddr_storage); if (save_addr) *save_addr = msg->msg_name; if (msg->msg_name && kmsg->msg_namelen) { if (!save_addr) { err = move_addr_to_kernel(msg->msg_name, kmsg->msg_namelen, kmsg->msg_name); if (err < 0) return err; } } else { kmsg->msg_name = NULL; kmsg->msg_namelen = 0; } if (msg->msg_iovlen > UIO_MAXIOV) return -EMSGSIZE; kmsg->msg_iocb = NULL; kmsg->msg_ubuf = NULL; return 0; } static int copy_msghdr_from_user(struct msghdr *kmsg, struct user_msghdr __user *umsg, struct sockaddr __user **save_addr, struct iovec **iov) { struct user_msghdr msg; ssize_t err; if (copy_from_user(&msg, umsg, sizeof(*umsg))) return -EFAULT; err = __copy_msghdr(kmsg, &msg, save_addr); if (err) return err; err = import_iovec(save_addr ? ITER_DEST : ITER_SOURCE, msg.msg_iov, msg.msg_iovlen, UIO_FASTIOV, iov, &kmsg->msg_iter); return err < 0 ? err : 0; } static int ____sys_sendmsg(struct socket *sock, struct msghdr *msg_sys, unsigned int flags, struct used_address *used_address, unsigned int allowed_msghdr_flags) { unsigned char ctl[sizeof(struct cmsghdr) + 20] __aligned(sizeof(__kernel_size_t)); /* 20 is size of ipv6_pktinfo */ unsigned char *ctl_buf = ctl; int ctl_len; ssize_t err; err = -ENOBUFS; if (msg_sys->msg_controllen > INT_MAX) goto out; flags |= (msg_sys->msg_flags & allowed_msghdr_flags); ctl_len = msg_sys->msg_controllen; if ((MSG_CMSG_COMPAT & flags) && ctl_len) { err = cmsghdr_from_user_compat_to_kern(msg_sys, sock->sk, ctl, sizeof(ctl)); if (err) goto out; ctl_buf = msg_sys->msg_control; ctl_len = msg_sys->msg_controllen; } else if (ctl_len) { BUILD_BUG_ON(sizeof(struct cmsghdr) != CMSG_ALIGN(sizeof(struct cmsghdr))); if (ctl_len > sizeof(ctl)) { ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL); if (ctl_buf == NULL) goto out; } err = -EFAULT; if (copy_from_user(ctl_buf, msg_sys->msg_control_user, ctl_len)) goto out_freectl; msg_sys->msg_control = ctl_buf; msg_sys->msg_control_is_user = false; } flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; msg_sys->msg_flags = flags; if (sock->file->f_flags & O_NONBLOCK) msg_sys->msg_flags |= MSG_DONTWAIT; /* * If this is sendmmsg() and current destination address is same as * previously succeeded address, omit asking LSM's decision. * used_address->name_len is initialized to UINT_MAX so that the first * destination address never matches. */ if (used_address && msg_sys->msg_name && used_address->name_len == msg_sys->msg_namelen && !memcmp(&used_address->name, msg_sys->msg_name, used_address->name_len)) { err = sock_sendmsg_nosec(sock, msg_sys); goto out_freectl; } err = sock_sendmsg(sock, msg_sys); /* * If this is sendmmsg() and sending to current destination address was * successful, remember it. */ if (used_address && err >= 0) { used_address->name_len = msg_sys->msg_namelen; if (msg_sys->msg_name) memcpy(&used_address->name, msg_sys->msg_name, used_address->name_len); } out_freectl: if (ctl_buf != ctl) sock_kfree_s(sock->sk, ctl_buf, ctl_len); out: return err; } int sendmsg_copy_msghdr(struct msghdr *msg, struct user_msghdr __user *umsg, unsigned flags, struct iovec **iov) { int err; if (flags & MSG_CMSG_COMPAT) { struct compat_msghdr __user *msg_compat; msg_compat = (struct compat_msghdr __user *) umsg; err = get_compat_msghdr(msg, msg_compat, NULL, iov); } else { err = copy_msghdr_from_user(msg, umsg, NULL, iov); } if (err < 0) return err; return 0; } static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, struct msghdr *msg_sys, unsigned int flags, struct used_address *used_address, unsigned int allowed_msghdr_flags) { struct sockaddr_storage address; struct iovec iovstack[UIO_FASTIOV], *iov = iovstack; ssize_t err; msg_sys->msg_name = &address; err = sendmsg_copy_msghdr(msg_sys, msg, flags, &iov); if (err < 0) return err; err = ____sys_sendmsg(sock, msg_sys, flags, used_address, allowed_msghdr_flags); kfree(iov); return err; } /* * BSD sendmsg interface */ long __sys_sendmsg_sock(struct socket *sock, struct msghdr *msg, unsigned int flags) { return ____sys_sendmsg(sock, msg, flags, NULL, 0); } long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned int flags, bool forbid_cmsg_compat) { int fput_needed, err; struct msghdr msg_sys; struct socket *sock; if (forbid_cmsg_compat && (flags & MSG_CMSG_COMPAT)) return -EINVAL; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; err = ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL, 0); fput_light(sock->file, fput_needed); out: return err; } SYSCALL_DEFINE3(sendmsg, int, fd, struct user_msghdr __user *, msg, unsigned int, flags) { return __sys_sendmsg(fd, msg, flags, true); } /* * Linux sendmmsg interface */ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags, bool forbid_cmsg_compat) { int fput_needed, err, datagrams; struct socket *sock; struct mmsghdr __user *entry; struct compat_mmsghdr __user *compat_entry; struct msghdr msg_sys; struct used_address used_address; unsigned int oflags = flags; if (forbid_cmsg_compat && (flags & MSG_CMSG_COMPAT)) return -EINVAL; if (vlen > UIO_MAXIOV) vlen = UIO_MAXIOV; datagrams = 0; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) return err; used_address.name_len = UINT_MAX; entry = mmsg; compat_entry = (struct compat_mmsghdr __user *)mmsg; err = 0; flags |= MSG_BATCH; while (datagrams < vlen) { if (datagrams == vlen - 1) flags = oflags; if (MSG_CMSG_COMPAT & flags) { err = ___sys_sendmsg(sock, (struct user_msghdr __user *)compat_entry, &msg_sys, flags, &used_address, MSG_EOR); if (err < 0) break; err = __put_user(err, &compat_entry->msg_len); ++compat_entry; } else { err = ___sys_sendmsg(sock, (struct user_msghdr __user *)entry, &msg_sys, flags, &used_address, MSG_EOR); if (err < 0) break; err = put_user(err, &entry->msg_len); ++entry; } if (err) break; ++datagrams; if (msg_data_left(&msg_sys)) break; cond_resched(); } fput_light(sock->file, fput_needed); /* We only return an error if no datagrams were able to be sent */ if (datagrams != 0) return datagrams; return err; } SYSCALL_DEFINE4(sendmmsg, int, fd, struct mmsghdr __user *, mmsg, unsigned int, vlen, unsigned int, flags) { return __sys_sendmmsg(fd, mmsg, vlen, flags, true); } int recvmsg_copy_msghdr(struct msghdr *msg, struct user_msghdr __user *umsg, unsigned flags, struct sockaddr __user **uaddr, struct iovec **iov) { ssize_t err; if (MSG_CMSG_COMPAT & flags) { struct compat_msghdr __user *msg_compat; msg_compat = (struct compat_msghdr __user *) umsg; err = get_compat_msghdr(msg, msg_compat, uaddr, iov); } else { err = copy_msghdr_from_user(msg, umsg, uaddr, iov); } if (err < 0) return err; return 0; } static int ____sys_recvmsg(struct socket *sock, struct msghdr *msg_sys, struct user_msghdr __user *msg, struct sockaddr __user *uaddr, unsigned int flags, int nosec) { struct compat_msghdr __user *msg_compat = (struct compat_msghdr __user *) msg; int __user *uaddr_len = COMPAT_NAMELEN(msg); struct sockaddr_storage addr; unsigned long cmsg_ptr; int len; ssize_t err; msg_sys->msg_name = &addr; cmsg_ptr = (unsigned long)msg_sys->msg_control; msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT); /* We assume all kernel code knows the size of sockaddr_storage */ msg_sys->msg_namelen = 0; if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; if (unlikely(nosec)) err = sock_recvmsg_nosec(sock, msg_sys, flags); else err = sock_recvmsg(sock, msg_sys, flags); if (err < 0) goto out; len = err; if (uaddr != NULL) { err = move_addr_to_user(&addr, msg_sys->msg_namelen, uaddr, uaddr_len); if (err < 0) goto out; } err = __put_user((msg_sys->msg_flags & ~MSG_CMSG_COMPAT), COMPAT_FLAGS(msg)); if (err) goto out; if (MSG_CMSG_COMPAT & flags) err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr, &msg_compat->msg_controllen); else err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr, &msg->msg_controllen); if (err) goto out; err = len; out: return err; } static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg, struct msghdr *msg_sys, unsigned int flags, int nosec) { struct iovec iovstack[UIO_FASTIOV], *iov = iovstack; /* user mode address pointers */ struct sockaddr __user *uaddr; ssize_t err; err = recvmsg_copy_msghdr(msg_sys, msg, flags, &uaddr, &iov); if (err < 0) return err; err = ____sys_recvmsg(sock, msg_sys, msg, uaddr, flags, nosec); kfree(iov); return err; } /* * BSD recvmsg interface */ long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg, struct user_msghdr __user *umsg, struct sockaddr __user *uaddr, unsigned int flags) { return ____sys_recvmsg(sock, msg, umsg, uaddr, flags, 0); } long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned int flags, bool forbid_cmsg_compat) { int fput_needed, err; struct msghdr msg_sys; struct socket *sock; if (forbid_cmsg_compat && (flags & MSG_CMSG_COMPAT)) return -EINVAL; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; err = ___sys_recvmsg(sock, msg, &msg_sys, flags, 0); fput_light(sock->file, fput_needed); out: return err; } SYSCALL_DEFINE3(recvmsg, int, fd, struct user_msghdr __user *, msg, unsigned int, flags) { return __sys_recvmsg(fd, msg, flags, true); } /* * Linux recvmmsg interface */ static int do_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags, struct timespec64 *timeout) { int fput_needed, err, datagrams; struct socket *sock; struct mmsghdr __user *entry; struct compat_mmsghdr __user *compat_entry; struct msghdr msg_sys; struct timespec64 end_time; struct timespec64 timeout64; if (timeout && poll_select_set_timeout(&end_time, timeout->tv_sec, timeout->tv_nsec)) return -EINVAL; datagrams = 0; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) return err; if (likely(!(flags & MSG_ERRQUEUE))) { err = sock_error(sock->sk); if (err) { datagrams = err; goto out_put; } } entry = mmsg; compat_entry = (struct compat_mmsghdr __user *)mmsg; while (datagrams < vlen) { /* * No need to ask LSM for more than the first datagram. */ if (MSG_CMSG_COMPAT & flags) { err = ___sys_recvmsg(sock, (struct user_msghdr __user *)compat_entry, &msg_sys, flags & ~MSG_WAITFORONE, datagrams); if (err < 0) break; err = __put_user(err, &compat_entry->msg_len); ++compat_entry; } else { err = ___sys_recvmsg(sock, (struct user_msghdr __user *)entry, &msg_sys, flags & ~MSG_WAITFORONE, datagrams); if (err < 0) break; err = put_user(err, &entry->msg_len); ++entry; } if (err) break; ++datagrams; /* MSG_WAITFORONE turns on MSG_DONTWAIT after one packet */ if (flags & MSG_WAITFORONE) flags |= MSG_DONTWAIT; if (timeout) { ktime_get_ts64(&timeout64); *timeout = timespec64_sub(end_time, timeout64); if (timeout->tv_sec < 0) { timeout->tv_sec = timeout->tv_nsec = 0; break; } /* Timeout, return less than vlen datagrams */ if (timeout->tv_nsec == 0 && timeout->tv_sec == 0) break; } /* Out of band data, return right away */ if (msg_sys.msg_flags & MSG_OOB) break; cond_resched(); } if (err == 0) goto out_put; if (datagrams == 0) { datagrams = err; goto out_put; } /* * We may return less entries than requested (vlen) if the * sock is non block and there aren't enough datagrams... */ if (err != -EAGAIN) { /* * ... or if recvmsg returns an error after we * received some datagrams, where we record the * error to return on the next call or if the * app asks about it using getsockopt(SO_ERROR). */ WRITE_ONCE(sock->sk->sk_err, -err); } out_put: fput_light(sock->file, fput_needed); return datagrams; } int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags, struct __kernel_timespec __user *timeout, struct old_timespec32 __user *timeout32) { int datagrams; struct timespec64 timeout_sys; if (timeout && get_timespec64(&timeout_sys, timeout)) return -EFAULT; if (timeout32 && get_old_timespec32(&timeout_sys, timeout32)) return -EFAULT; if (!timeout && !timeout32) return do_recvmmsg(fd, mmsg, vlen, flags, NULL); datagrams = do_recvmmsg(fd, mmsg, vlen, flags, &timeout_sys); if (datagrams <= 0) return datagrams; if (timeout && put_timespec64(&timeout_sys, timeout)) datagrams = -EFAULT; if (timeout32 && put_old_timespec32(&timeout_sys, timeout32)) datagrams = -EFAULT; return datagrams; } SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg, unsigned int, vlen, unsigned int, flags, struct __kernel_timespec __user *, timeout) { if (flags & MSG_CMSG_COMPAT) return -EINVAL; return __sys_recvmmsg(fd, mmsg, vlen, flags, timeout, NULL); } #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE5(recvmmsg_time32, int, fd, struct mmsghdr __user *, mmsg, unsigned int, vlen, unsigned int, flags, struct old_timespec32 __user *, timeout) { if (flags & MSG_CMSG_COMPAT) return -EINVAL; return __sys_recvmmsg(fd, mmsg, vlen, flags, NULL, timeout); } #endif #ifdef __ARCH_WANT_SYS_SOCKETCALL /* Argument list sizes for sys_socketcall */ #define AL(x) ((x) * sizeof(unsigned long)) static const unsigned char nargs[21] = { AL(0), AL(3), AL(3), AL(3), AL(2), AL(3), AL(3), AL(3), AL(4), AL(4), AL(4), AL(6), AL(6), AL(2), AL(5), AL(5), AL(3), AL(3), AL(4), AL(5), AL(4) }; #undef AL /* * System call vectors. * * Argument checking cleaned up. Saved 20% in size. * This function doesn't need to set the kernel lock because * it is set by the callees. */ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args) { unsigned long a[AUDITSC_ARGS]; unsigned long a0, a1; int err; unsigned int len; if (call < 1 || call > SYS_SENDMMSG) return -EINVAL; call = array_index_nospec(call, SYS_SENDMMSG + 1); len = nargs[call]; if (len > sizeof(a)) return -EINVAL; /* copy_from_user should be SMP safe. */ if (copy_from_user(a, args, len)) return -EFAULT; err = audit_socketcall(nargs[call] / sizeof(unsigned long), a); if (err) return err; a0 = a[0]; a1 = a[1]; switch (call) { case SYS_SOCKET: err = __sys_socket(a0, a1, a[2]); break; case SYS_BIND: err = __sys_bind(a0, (struct sockaddr __user *)a1, a[2]); break; case SYS_CONNECT: err = __sys_connect(a0, (struct sockaddr __user *)a1, a[2]); break; case SYS_LISTEN: err = __sys_listen(a0, a1); break; case SYS_ACCEPT: err = __sys_accept4(a0, (struct sockaddr __user *)a1, (int __user *)a[2], 0); break; case SYS_GETSOCKNAME: err = __sys_getsockname(a0, (struct sockaddr __user *)a1, (int __user *)a[2]); break; case SYS_GETPEERNAME: err = __sys_getpeername(a0, (struct sockaddr __user *)a1, (int __user *)a[2]); break; case SYS_SOCKETPAIR: err = __sys_socketpair(a0, a1, a[2], (int __user *)a[3]); break; case SYS_SEND: err = __sys_sendto(a0, (void __user *)a1, a[2], a[3], NULL, 0); break; case SYS_SENDTO: err = __sys_sendto(a0, (void __user *)a1, a[2], a[3], (struct sockaddr __user *)a[4], a[5]); break; case SYS_RECV: err = __sys_recvfrom(a0, (void __user *)a1, a[2], a[3], NULL, NULL); break; case SYS_RECVFROM: err = __sys_recvfrom(a0, (void __user *)a1, a[2], a[3], (struct sockaddr __user *)a[4], (int __user *)a[5]); break; case SYS_SHUTDOWN: err = __sys_shutdown(a0, a1); break; case SYS_SETSOCKOPT: err = __sys_setsockopt(a0, a1, a[2], (char __user *)a[3], a[4]); break; case SYS_GETSOCKOPT: err = __sys_getsockopt(a0, a1, a[2], (char __user *)a[3], (int __user *)a[4]); break; case SYS_SENDMSG: err = __sys_sendmsg(a0, (struct user_msghdr __user *)a1, a[2], true); break; case SYS_SENDMMSG: err = __sys_sendmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3], true); break; case SYS_RECVMSG: err = __sys_recvmsg(a0, (struct user_msghdr __user *)a1, a[2], true); break; case SYS_RECVMMSG: if (IS_ENABLED(CONFIG_64BIT)) err = __sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3], (struct __kernel_timespec __user *)a[4], NULL); else err = __sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3], NULL, (struct old_timespec32 __user *)a[4]); break; case SYS_ACCEPT4: err = __sys_accept4(a0, (struct sockaddr __user *)a1, (int __user *)a[2], a[3]); break; default: err = -EINVAL; break; } return err; } #endif /* __ARCH_WANT_SYS_SOCKETCALL */ /** * sock_register - add a socket protocol handler * @ops: description of protocol * * This function is called by a protocol handler that wants to * advertise its address family, and have it linked into the * socket interface. The value ops->family corresponds to the * socket system call protocol family. */ int sock_register(const struct net_proto_family *ops) { int err; if (ops->family >= NPROTO) { pr_crit("protocol %d >= NPROTO(%d)\n", ops->family, NPROTO); return -ENOBUFS; } spin_lock(&net_family_lock); if (rcu_dereference_protected(net_families[ops->family], lockdep_is_held(&net_family_lock))) err = -EEXIST; else { rcu_assign_pointer(net_families[ops->family], ops); err = 0; } spin_unlock(&net_family_lock); pr_info("NET: Registered %s protocol family\n", pf_family_names[ops->family]); return err; } EXPORT_SYMBOL(sock_register); /** * sock_unregister - remove a protocol handler * @family: protocol family to remove * * This function is called by a protocol handler that wants to * remove its address family, and have it unlinked from the * new socket creation. * * If protocol handler is a module, then it can use module reference * counts to protect against new references. If protocol handler is not * a module then it needs to provide its own protection in * the ops->create routine. */ void sock_unregister(int family) { BUG_ON(family < 0 || family >= NPROTO); spin_lock(&net_family_lock); RCU_INIT_POINTER(net_families[family], NULL); spin_unlock(&net_family_lock); synchronize_rcu(); pr_info("NET: Unregistered %s protocol family\n", pf_family_names[family]); } EXPORT_SYMBOL(sock_unregister); bool sock_is_registered(int family) { return family < NPROTO && rcu_access_pointer(net_families[family]); } static int __init sock_init(void) { int err; /* * Initialize the network sysctl infrastructure. */ err = net_sysctl_init(); if (err) goto out; /* * Initialize skbuff SLAB cache */ skb_init(); /* * Initialize the protocols module. */ init_inodecache(); err = register_filesystem(&sock_fs_type); if (err) goto out; sock_mnt = kern_mount(&sock_fs_type); if (IS_ERR(sock_mnt)) { err = PTR_ERR(sock_mnt); goto out_mount; } /* The real protocol initialization is performed in later initcalls. */ #ifdef CONFIG_NETFILTER err = netfilter_init(); if (err) goto out; #endif ptp_classifier_init(); out: return err; out_mount: unregister_filesystem(&sock_fs_type); goto out; } core_initcall(sock_init); /* early initcall */ #ifdef CONFIG_PROC_FS void socket_seq_show(struct seq_file *seq) { seq_printf(seq, "sockets: used %d\n", sock_inuse_get(seq->private)); } #endif /* CONFIG_PROC_FS */ /* Handle the fact that while struct ifreq has the same *layout* on * 32/64 for everything but ifreq::ifru_ifmap and ifreq::ifru_data, * which are handled elsewhere, it still has different *size* due to * ifreq::ifru_ifmap (which is 16 bytes on 32 bit, 24 bytes on 64-bit, * resulting in struct ifreq being 32 and 40 bytes respectively). * As a result, if the struct happens to be at the end of a page and * the next page isn't readable/writable, we get a fault. To prevent * that, copy back and forth to the full size. */ int get_user_ifreq(struct ifreq *ifr, void __user **ifrdata, void __user *arg) { if (in_compat_syscall()) { struct compat_ifreq *ifr32 = (struct compat_ifreq *)ifr; memset(ifr, 0, sizeof(*ifr)); if (copy_from_user(ifr32, arg, sizeof(*ifr32))) return -EFAULT; if (ifrdata) *ifrdata = compat_ptr(ifr32->ifr_data); return 0; } if (copy_from_user(ifr, arg, sizeof(*ifr))) return -EFAULT; if (ifrdata) *ifrdata = ifr->ifr_data; return 0; } EXPORT_SYMBOL(get_user_ifreq); int put_user_ifreq(struct ifreq *ifr, void __user *arg) { size_t size = sizeof(*ifr); if (in_compat_syscall()) size = sizeof(struct compat_ifreq); if (copy_to_user(arg, ifr, size)) return -EFAULT; return 0; } EXPORT_SYMBOL(put_user_ifreq); #ifdef CONFIG_COMPAT static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32) { compat_uptr_t uptr32; struct ifreq ifr; void __user *saved; int err; if (get_user_ifreq(&ifr, NULL, uifr32)) return -EFAULT; if (get_user(uptr32, &uifr32->ifr_settings.ifs_ifsu)) return -EFAULT; saved = ifr.ifr_settings.ifs_ifsu.raw_hdlc; ifr.ifr_settings.ifs_ifsu.raw_hdlc = compat_ptr(uptr32); err = dev_ioctl(net, SIOCWANDEV, &ifr, NULL, NULL); if (!err) { ifr.ifr_settings.ifs_ifsu.raw_hdlc = saved; if (put_user_ifreq(&ifr, uifr32)) err = -EFAULT; } return err; } /* Handle ioctls that use ifreq::ifr_data and just need struct ifreq converted */ static int compat_ifr_data_ioctl(struct net *net, unsigned int cmd, struct compat_ifreq __user *u_ifreq32) { struct ifreq ifreq; void __user *data; if (!is_socket_ioctl_cmd(cmd)) return -ENOTTY; if (get_user_ifreq(&ifreq, &data, u_ifreq32)) return -EFAULT; ifreq.ifr_data = data; return dev_ioctl(net, cmd, &ifreq, data, NULL); } static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, unsigned int cmd, unsigned long arg) { void __user *argp = compat_ptr(arg); struct sock *sk = sock->sk; struct net *net = sock_net(sk); const struct proto_ops *ops; if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) return sock_ioctl(file, cmd, (unsigned long)argp); switch (cmd) { case SIOCWANDEV: return compat_siocwandev(net, argp); case SIOCGSTAMP_OLD: case SIOCGSTAMPNS_OLD: ops = READ_ONCE(sock->ops); if (!ops->gettstamp) return -ENOIOCTLCMD; return ops->gettstamp(sock, argp, cmd == SIOCGSTAMP_OLD, !COMPAT_USE_64BIT_TIME); case SIOCETHTOOL: case SIOCBONDSLAVEINFOQUERY: case SIOCBONDINFOQUERY: case SIOCSHWTSTAMP: case SIOCGHWTSTAMP: return compat_ifr_data_ioctl(net, cmd, argp); case FIOSETOWN: case SIOCSPGRP: case FIOGETOWN: case SIOCGPGRP: case SIOCBRADDBR: case SIOCBRDELBR: case SIOCGIFVLAN: case SIOCSIFVLAN: case SIOCGSKNS: case SIOCGSTAMP_NEW: case SIOCGSTAMPNS_NEW: case SIOCGIFCONF: case SIOCSIFBR: case SIOCGIFBR: return sock_ioctl(file, cmd, arg); case SIOCGIFFLAGS: case SIOCSIFFLAGS: case SIOCGIFMAP: case SIOCSIFMAP: case SIOCGIFMETRIC: case SIOCSIFMETRIC: case SIOCGIFMTU: case SIOCSIFMTU: case SIOCGIFMEM: case SIOCSIFMEM: case SIOCGIFHWADDR: case SIOCSIFHWADDR: case SIOCADDMULTI: case SIOCDELMULTI: case SIOCGIFINDEX: case SIOCGIFADDR: case SIOCSIFADDR: case SIOCSIFHWBROADCAST: case SIOCDIFADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCGIFNETMASK: case SIOCSIFNETMASK: case SIOCSIFPFLAGS: case SIOCGIFPFLAGS: case SIOCGIFTXQLEN: case SIOCSIFTXQLEN: case SIOCBRADDIF: case SIOCBRDELIF: case SIOCGIFNAME: case SIOCSIFNAME: case SIOCGMIIPHY: case SIOCGMIIREG: case SIOCSMIIREG: case SIOCBONDENSLAVE: case SIOCBONDRELEASE: case SIOCBONDSETHWADDR: case SIOCBONDCHANGEACTIVE: case SIOCSARP: case SIOCGARP: case SIOCDARP: case SIOCOUTQ: case SIOCOUTQNSD: case SIOCATMARK: return sock_do_ioctl(net, sock, cmd, arg); } return -ENOIOCTLCMD; } static long compat_sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct socket *sock = file->private_data; const struct proto_ops *ops = READ_ONCE(sock->ops); int ret = -ENOIOCTLCMD; struct sock *sk; struct net *net; sk = sock->sk; net = sock_net(sk); if (ops->compat_ioctl) ret = ops->compat_ioctl(sock, cmd, arg); if (ret == -ENOIOCTLCMD && (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)) ret = compat_wext_handle_ioctl(net, cmd, arg); if (ret == -ENOIOCTLCMD) ret = compat_sock_ioctl_trans(file, sock, cmd, arg); return ret; } #endif /** * kernel_bind - bind an address to a socket (kernel space) * @sock: socket * @addr: address * @addrlen: length of address * * Returns 0 or an error. */ int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen) { return READ_ONCE(sock->ops)->bind(sock, addr, addrlen); } EXPORT_SYMBOL(kernel_bind); /** * kernel_listen - move socket to listening state (kernel space) * @sock: socket * @backlog: pending connections queue size * * Returns 0 or an error. */ int kernel_listen(struct socket *sock, int backlog) { return READ_ONCE(sock->ops)->listen(sock, backlog); } EXPORT_SYMBOL(kernel_listen); /** * kernel_accept - accept a connection (kernel space) * @sock: listening socket * @newsock: new connected socket * @flags: flags * * @flags must be SOCK_CLOEXEC, SOCK_NONBLOCK or 0. * If it fails, @newsock is guaranteed to be %NULL. * Returns 0 or an error. */ int kernel_accept(struct socket *sock, struct socket **newsock, int flags) { struct sock *sk = sock->sk; const struct proto_ops *ops = READ_ONCE(sock->ops); int err; err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol, newsock); if (err < 0) goto done; err = ops->accept(sock, *newsock, flags, true); if (err < 0) { sock_release(*newsock); *newsock = NULL; goto done; } (*newsock)->ops = ops; __module_get(ops->owner); done: return err; } EXPORT_SYMBOL(kernel_accept); /** * kernel_connect - connect a socket (kernel space) * @sock: socket * @addr: address * @addrlen: address length * @flags: flags (O_NONBLOCK, ...) * * For datagram sockets, @addr is the address to which datagrams are sent * by default, and the only address from which datagrams are received. * For stream sockets, attempts to connect to @addr. * Returns 0 or an error code. */ int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, int flags) { struct sockaddr_storage address; memcpy(&address, addr, addrlen); return READ_ONCE(sock->ops)->connect(sock, (struct sockaddr *)&address, addrlen, flags); } EXPORT_SYMBOL(kernel_connect); /** * kernel_getsockname - get the address which the socket is bound (kernel space) * @sock: socket * @addr: address holder * * Fills the @addr pointer with the address which the socket is bound. * Returns the length of the address in bytes or an error code. */ int kernel_getsockname(struct socket *sock, struct sockaddr *addr) { return READ_ONCE(sock->ops)->getname(sock, addr, 0); } EXPORT_SYMBOL(kernel_getsockname); /** * kernel_getpeername - get the address which the socket is connected (kernel space) * @sock: socket * @addr: address holder * * Fills the @addr pointer with the address which the socket is connected. * Returns the length of the address in bytes or an error code. */ int kernel_getpeername(struct socket *sock, struct sockaddr *addr) { return READ_ONCE(sock->ops)->getname(sock, addr, 1); } EXPORT_SYMBOL(kernel_getpeername); /** * kernel_sock_shutdown - shut down part of a full-duplex connection (kernel space) * @sock: socket * @how: connection part * * Returns 0 or an error. */ int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how) { return READ_ONCE(sock->ops)->shutdown(sock, how); } EXPORT_SYMBOL(kernel_sock_shutdown); /** * kernel_sock_ip_overhead - returns the IP overhead imposed by a socket * @sk: socket * * This routine returns the IP overhead imposed by a socket i.e. * the length of the underlying IP header, depending on whether * this is an IPv4 or IPv6 socket and the length from IP options turned * on at the socket. Assumes that the caller has a lock on the socket. */ u32 kernel_sock_ip_overhead(struct sock *sk) { struct inet_sock *inet; struct ip_options_rcu *opt; u32 overhead = 0; #if IS_ENABLED(CONFIG_IPV6) struct ipv6_pinfo *np; struct ipv6_txoptions *optv6 = NULL; #endif /* IS_ENABLED(CONFIG_IPV6) */ if (!sk) return overhead; switch (sk->sk_family) { case AF_INET: inet = inet_sk(sk); overhead += sizeof(struct iphdr); opt = rcu_dereference_protected(inet->inet_opt, sock_owned_by_user(sk)); if (opt) overhead += opt->opt.optlen; return overhead; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: np = inet6_sk(sk); overhead += sizeof(struct ipv6hdr); if (np) optv6 = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); if (optv6) overhead += (optv6->opt_flen + optv6->opt_nflen); return overhead; #endif /* IS_ENABLED(CONFIG_IPV6) */ default: /* Returns 0 overhead if the socket is not ipv4 or ipv6 */ return overhead; } } EXPORT_SYMBOL(kernel_sock_ip_overhead); |
| 45 1 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 36 36 36 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 | // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Implementation of the Transmission Control Protocol(TCP). * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * Corey Minyard <wf-rch!minyard@relay.EU.net> * Florian La Roche, <flla@stud.uni-sb.de> * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> * Linus Torvalds, <torvalds@cs.helsinki.fi> * Alan Cox, <gw4pts@gw4pts.ampr.org> * Matthew Dillon, <dillon@apollo.west.oic.com> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Jorge Cwik, <jorge@laser.satlink.net> * * Fixes: * Alan Cox : Numerous verify_area() calls * Alan Cox : Set the ACK bit on a reset * Alan Cox : Stopped it crashing if it closed while * sk->inuse=1 and was trying to connect * (tcp_err()). * Alan Cox : All icmp error handling was broken * pointers passed where wrong and the * socket was looked up backwards. Nobody * tested any icmp error code obviously. * Alan Cox : tcp_err() now handled properly. It * wakes people on errors. poll * behaves and the icmp error race * has gone by moving it into sock.c * Alan Cox : tcp_send_reset() fixed to work for * everything not just packets for * unknown sockets. * Alan Cox : tcp option processing. * Alan Cox : Reset tweaked (still not 100%) [Had * syn rule wrong] * Herp Rosmanith : More reset fixes * Alan Cox : No longer acks invalid rst frames. * Acking any kind of RST is right out. * Alan Cox : Sets an ignore me flag on an rst * receive otherwise odd bits of prattle * escape still * Alan Cox : Fixed another acking RST frame bug. * Should stop LAN workplace lockups. * Alan Cox : Some tidyups using the new skb list * facilities * Alan Cox : sk->keepopen now seems to work * Alan Cox : Pulls options out correctly on accepts * Alan Cox : Fixed assorted sk->rqueue->next errors * Alan Cox : PSH doesn't end a TCP read. Switched a * bit to skb ops. * Alan Cox : Tidied tcp_data to avoid a potential * nasty. * Alan Cox : Added some better commenting, as the * tcp is hard to follow * Alan Cox : Removed incorrect check for 20 * psh * Michael O'Reilly : ack < copied bug fix. * Johannes Stille : Misc tcp fixes (not all in yet). * Alan Cox : FIN with no memory -> CRASH * Alan Cox : Added socket option proto entries. * Also added awareness of them to accept. * Alan Cox : Added TCP options (SOL_TCP) * Alan Cox : Switched wakeup calls to callbacks, * so the kernel can layer network * sockets. * Alan Cox : Use ip_tos/ip_ttl settings. * Alan Cox : Handle FIN (more) properly (we hope). * Alan Cox : RST frames sent on unsynchronised * state ack error. * Alan Cox : Put in missing check for SYN bit. * Alan Cox : Added tcp_select_window() aka NET2E * window non shrink trick. * Alan Cox : Added a couple of small NET2E timer * fixes * Charles Hedrick : TCP fixes * Toomas Tamm : TCP window fixes * Alan Cox : Small URG fix to rlogin ^C ack fight * Charles Hedrick : Rewrote most of it to actually work * Linus : Rewrote tcp_read() and URG handling * completely * Gerhard Koerting: Fixed some missing timer handling * Matthew Dillon : Reworked TCP machine states as per RFC * Gerhard Koerting: PC/TCP workarounds * Adam Caldwell : Assorted timer/timing errors * Matthew Dillon : Fixed another RST bug * Alan Cox : Move to kernel side addressing changes. * Alan Cox : Beginning work on TCP fastpathing * (not yet usable) * Arnt Gulbrandsen: Turbocharged tcp_check() routine. * Alan Cox : TCP fast path debugging * Alan Cox : Window clamping * Michael Riepe : Bug in tcp_check() * Matt Dillon : More TCP improvements and RST bug fixes * Matt Dillon : Yet more small nasties remove from the * TCP code (Be very nice to this man if * tcp finally works 100%) 8) * Alan Cox : BSD accept semantics. * Alan Cox : Reset on closedown bug. * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). * Michael Pall : Handle poll() after URG properly in * all cases. * Michael Pall : Undo the last fix in tcp_read_urg() * (multi URG PUSH broke rlogin). * Michael Pall : Fix the multi URG PUSH problem in * tcp_readable(), poll() after URG * works now. * Michael Pall : recv(...,MSG_OOB) never blocks in the * BSD api. * Alan Cox : Changed the semantics of sk->socket to * fix a race and a signal problem with * accept() and async I/O. * Alan Cox : Relaxed the rules on tcp_sendto(). * Yury Shevchuk : Really fixed accept() blocking problem. * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for * clients/servers which listen in on * fixed ports. * Alan Cox : Cleaned the above up and shrank it to * a sensible code size. * Alan Cox : Self connect lockup fix. * Alan Cox : No connect to multicast. * Ross Biro : Close unaccepted children on master * socket close. * Alan Cox : Reset tracing code. * Alan Cox : Spurious resets on shutdown. * Alan Cox : Giant 15 minute/60 second timer error * Alan Cox : Small whoops in polling before an * accept. * Alan Cox : Kept the state trace facility since * it's handy for debugging. * Alan Cox : More reset handler fixes. * Alan Cox : Started rewriting the code based on * the RFC's for other useful protocol * references see: Comer, KA9Q NOS, and * for a reference on the difference * between specifications and how BSD * works see the 4.4lite source. * A.N.Kuznetsov : Don't time wait on completion of tidy * close. * Linus Torvalds : Fin/Shutdown & copied_seq changes. * Linus Torvalds : Fixed BSD port reuse to work first syn * Alan Cox : Reimplemented timers as per the RFC * and using multiple timers for sanity. * Alan Cox : Small bug fixes, and a lot of new * comments. * Alan Cox : Fixed dual reader crash by locking * the buffers (much like datagram.c) * Alan Cox : Fixed stuck sockets in probe. A probe * now gets fed up of retrying without * (even a no space) answer. * Alan Cox : Extracted closing code better * Alan Cox : Fixed the closing state machine to * resemble the RFC. * Alan Cox : More 'per spec' fixes. * Jorge Cwik : Even faster checksumming. * Alan Cox : tcp_data() doesn't ack illegal PSH * only frames. At least one pc tcp stack * generates them. * Alan Cox : Cache last socket. * Alan Cox : Per route irtt. * Matt Day : poll()->select() match BSD precisely on error * Alan Cox : New buffers * Marc Tamsky : Various sk->prot->retransmits and * sk->retransmits misupdating fixed. * Fixed tcp_write_timeout: stuck close, * and TCP syn retries gets used now. * Mark Yarvis : In tcp_read_wakeup(), don't send an * ack if state is TCP_CLOSED. * Alan Cox : Look up device on a retransmit - routes may * change. Doesn't yet cope with MSS shrink right * but it's a start! * Marc Tamsky : Closing in closing fixes. * Mike Shaver : RFC1122 verifications. * Alan Cox : rcv_saddr errors. * Alan Cox : Block double connect(). * Alan Cox : Small hooks for enSKIP. * Alexey Kuznetsov: Path MTU discovery. * Alan Cox : Support soft errors. * Alan Cox : Fix MTU discovery pathological case * when the remote claims no mtu! * Marc Tamsky : TCP_CLOSE fix. * Colin (G3TNE) : Send a reset on syn ack replies in * window but wrong (fixes NT lpd problems) * Pedro Roque : Better TCP window handling, delayed ack. * Joerg Reuter : No modification of locked buffers in * tcp_do_retransmit() * Eric Schenk : Changed receiver side silly window * avoidance algorithm to BSD style * algorithm. This doubles throughput * against machines running Solaris, * and seems to result in general * improvement. * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD * Willy Konynenberg : Transparent proxying support. * Mike McLagan : Routing by source * Keith Owens : Do proper merging with partial SKB's in * tcp_do_sendmsg to avoid burstiness. * Eric Schenk : Fix fast close down bug with * shutdown() followed by close(). * Andi Kleen : Make poll agree with SIGIO * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and * lingertime == 0 (RFC 793 ABORT Call) * Hirokazu Takahashi : Use copy_from_user() instead of * csum_and_copy_from_user() if possible. * * Description of States: * * TCP_SYN_SENT sent a connection request, waiting for ack * * TCP_SYN_RECV received a connection request, sent ack, * waiting for final ack in three-way handshake. * * TCP_ESTABLISHED connection established * * TCP_FIN_WAIT1 our side has shutdown, waiting to complete * transmission of remaining buffered data * * TCP_FIN_WAIT2 all buffered data sent, waiting for remote * to shutdown * * TCP_CLOSING both sides have shutdown but we still have * data we have to finish sending * * TCP_TIME_WAIT timeout to catch resent junk before entering * closed, can only be entered from FIN_WAIT2 * or CLOSING. Required because the other end * may not have gotten our last ACK causing it * to retransmit the data packet (which we ignore) * * TCP_CLOSE_WAIT remote side has shutdown and is waiting for * us to finish writing our data and to shutdown * (we have to close() to move on to LAST_ACK) * * TCP_LAST_ACK out side has shutdown after remote has * shutdown. There may still be data in our * buffer that we have to finish sending * * TCP_CLOSE socket is finished */ #define pr_fmt(fmt) "TCP: " fmt #include <crypto/hash.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/types.h> #include <linux/fcntl.h> #include <linux/poll.h> #include <linux/inet_diag.h> #include <linux/init.h> #include <linux/fs.h> #include <linux/skbuff.h> #include <linux/scatterlist.h> #include <linux/splice.h> #include <linux/net.h> #include <linux/socket.h> #include <linux/random.h> #include <linux/memblock.h> #include <linux/highmem.h> #include <linux/cache.h> #include <linux/err.h> #include <linux/time.h> #include <linux/slab.h> #include <linux/errqueue.h> #include <linux/static_key.h> #include <linux/btf.h> #include <net/icmp.h> #include <net/inet_common.h> #include <net/tcp.h> #include <net/mptcp.h> #include <net/xfrm.h> #include <net/ip.h> #include <net/sock.h> #include <linux/uaccess.h> #include <asm/ioctls.h> #include <net/busy_poll.h> /* Track pending CMSGs. */ enum { TCP_CMSG_INQ = 1, TCP_CMSG_TS = 2 }; DEFINE_PER_CPU(unsigned int, tcp_orphan_count); EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count); long sysctl_tcp_mem[3] __read_mostly; EXPORT_SYMBOL(sysctl_tcp_mem); atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp; /* Current allocated memory. */ EXPORT_SYMBOL(tcp_memory_allocated); DEFINE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc); EXPORT_PER_CPU_SYMBOL_GPL(tcp_memory_per_cpu_fw_alloc); #if IS_ENABLED(CONFIG_SMC) DEFINE_STATIC_KEY_FALSE(tcp_have_smc); EXPORT_SYMBOL(tcp_have_smc); #endif /* * Current number of TCP sockets. */ struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp; EXPORT_SYMBOL(tcp_sockets_allocated); /* * TCP splice context */ struct tcp_splice_state { struct pipe_inode_info *pipe; size_t len; unsigned int flags; }; /* * Pressure flag: try to collapse. * Technical note: it is used by multiple contexts non atomically. * All the __sk_mem_schedule() is of this nature: accounting * is strict, actions are advisory and have some latency. */ unsigned long tcp_memory_pressure __read_mostly; EXPORT_SYMBOL_GPL(tcp_memory_pressure); void tcp_enter_memory_pressure(struct sock *sk) { unsigned long val; if (READ_ONCE(tcp_memory_pressure)) return; val = jiffies; if (!val) val--; if (!cmpxchg(&tcp_memory_pressure, 0, val)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES); } EXPORT_SYMBOL_GPL(tcp_enter_memory_pressure); void tcp_leave_memory_pressure(struct sock *sk) { unsigned long val; if (!READ_ONCE(tcp_memory_pressure)) return; val = xchg(&tcp_memory_pressure, 0); if (val) NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO, jiffies_to_msecs(jiffies - val)); } EXPORT_SYMBOL_GPL(tcp_leave_memory_pressure); /* Convert seconds to retransmits based on initial and max timeout */ static u8 secs_to_retrans(int seconds, int timeout, int rto_max) { u8 res = 0; if (seconds > 0) { int period = timeout; res = 1; while (seconds > period && res < 255) { res++; timeout <<= 1; if (timeout > rto_max) timeout = rto_max; period += timeout; } } return res; } /* Convert retransmits to seconds based on initial and max timeout */ static int retrans_to_secs(u8 retrans, int timeout, int rto_max) { int period = 0; if (retrans > 0) { period = timeout; while (--retrans) { timeout <<= 1; if (timeout > rto_max) timeout = rto_max; period += timeout; } } return period; } static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp) { u32 rate = READ_ONCE(tp->rate_delivered); u32 intv = READ_ONCE(tp->rate_interval_us); u64 rate64 = 0; if (rate && intv) { rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC; do_div(rate64, intv); } return rate64; } /* Address-family independent initialization for a tcp_sock. * * NOTE: A lot of things set to zero explicitly by call to * sk_alloc() so need not be done here. */ void tcp_init_sock(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); tp->out_of_order_queue = RB_ROOT; sk->tcp_rtx_queue = RB_ROOT; tcp_init_xmit_timers(sk); INIT_LIST_HEAD(&tp->tsq_node); INIT_LIST_HEAD(&tp->tsorted_sent_queue); icsk->icsk_rto = TCP_TIMEOUT_INIT; icsk->icsk_rto_min = TCP_RTO_MIN; icsk->icsk_delack_max = TCP_DELACK_MAX; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U); /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control * algorithms that we must have the following bandaid to talk * efficiently to them. -DaveM */ tcp_snd_cwnd_set(tp, TCP_INIT_CWND); /* There's a bubble in the pipe until at least the first ACK. */ tp->app_limited = ~0U; tp->rate_app_limited = 1; /* See draft-stevens-tcpca-spec-01 for discussion of the * initialization of these values. */ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd_clamp = ~0; tp->mss_cache = TCP_MSS_DEFAULT; tp->reordering = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering); tcp_assign_congestion_control(sk); tp->tsoffset = 0; tp->rack.reo_wnd_steps = 1; sk->sk_write_space = sk_stream_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); icsk->icsk_sync_mss = tcp_sync_mss; WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1])); WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1])); tcp_scaling_ratio_init(sk); set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); sk_sockets_allocated_inc(sk); } EXPORT_SYMBOL(tcp_init_sock); static void tcp_tx_timestamp(struct sock *sk, u16 tsflags) { struct sk_buff *skb = tcp_write_queue_tail(sk); if (tsflags && skb) { struct skb_shared_info *shinfo = skb_shinfo(skb); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags); if (tsflags & SOF_TIMESTAMPING_TX_ACK) tcb->txstamp_ack = 1; if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1; } } static bool tcp_stream_is_readable(struct sock *sk, int target) { if (tcp_epollin_ready(sk, target)) return true; return sk_is_readable(sk); } /* * Wait for a TCP event. * * Note that we don't need to lock the socket, as the upper poll layers * take care of normal races (between the test and the event) and we don't * go look at any of the socket buffers directly. */ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) { __poll_t mask; struct sock *sk = sock->sk; const struct tcp_sock *tp = tcp_sk(sk); u8 shutdown; int state; sock_poll_wait(file, sock, wait); state = inet_sk_state_load(sk); if (state == TCP_LISTEN) return inet_csk_listen_poll(sk); /* Socket is not locked. We are protected from async events * by poll logic and correct handling of state changes * made by other threads is impossible in any case. */ mask = 0; /* * EPOLLHUP is certainly not done right. But poll() doesn't * have a notion of HUP in just one direction, and for a * socket the read side is more interesting. * * Some poll() documentation says that EPOLLHUP is incompatible * with the EPOLLOUT/POLLWR flags, so somebody should check this * all. But careful, it tends to be safer to return too many * bits than too few, and you can easily break real applications * if you don't tell them that something has hung up! * * Check-me. * * Check number 1. EPOLLHUP is _UNMASKABLE_ event (see UNIX98 and * our fs/select.c). It means that after we received EOF, * poll always returns immediately, making impossible poll() on write() * in state CLOSE_WAIT. One solution is evident --- to set EPOLLHUP * if and only if shutdown has been made in both directions. * Actually, it is interesting to look how Solaris and DUX * solve this dilemma. I would prefer, if EPOLLHUP were maskable, * then we could set it on SND_SHUTDOWN. BTW examples given * in Stevens' books assume exactly this behaviour, it explains * why EPOLLHUP is incompatible with EPOLLOUT. --ANK * * NOTE. Check for TCP_CLOSE is added. The goal is to prevent * blocking on fresh not-connected or disconnected socket. --ANK */ shutdown = READ_ONCE(sk->sk_shutdown); if (shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) mask |= EPOLLHUP; if (shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; /* Connected or passive Fast Open socket? */ if (state != TCP_SYN_SENT && (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) { int target = sock_rcvlowat(sk, 0, INT_MAX); u16 urg_data = READ_ONCE(tp->urg_data); if (unlikely(urg_data) && READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) && !sock_flag(sk, SOCK_URGINLINE)) target++; if (tcp_stream_is_readable(sk, target)) mask |= EPOLLIN | EPOLLRDNORM; if (!(shutdown & SEND_SHUTDOWN)) { if (__sk_stream_is_writeable(sk, 1)) { mask |= EPOLLOUT | EPOLLWRNORM; } else { /* send SIGIO later */ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); /* Race breaker. If space is freed after * wspace test but before the flags are set, * IO signal will be lost. Memory barrier * pairs with the input side. */ smp_mb__after_atomic(); if (__sk_stream_is_writeable(sk, 1)) mask |= EPOLLOUT | EPOLLWRNORM; } } else mask |= EPOLLOUT | EPOLLWRNORM; if (urg_data & TCP_URG_VALID) mask |= EPOLLPRI; } else if (state == TCP_SYN_SENT && inet_test_bit(DEFER_CONNECT, sk)) { /* Active TCP fastopen socket with defer_connect * Return EPOLLOUT so application can call write() * in order for kernel to generate SYN+data */ mask |= EPOLLOUT | EPOLLWRNORM; } /* This barrier is coupled with smp_wmb() in tcp_reset() */ smp_rmb(); if (READ_ONCE(sk->sk_err) || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR; return mask; } EXPORT_SYMBOL(tcp_poll); int tcp_ioctl(struct sock *sk, int cmd, int *karg) { struct tcp_sock *tp = tcp_sk(sk); int answ; bool slow; switch (cmd) { case SIOCINQ: if (sk->sk_state == TCP_LISTEN) return -EINVAL; slow = lock_sock_fast(sk); answ = tcp_inq(sk); unlock_sock_fast(sk, slow); break; case SIOCATMARK: answ = READ_ONCE(tp->urg_data) && READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq); break; case SIOCOUTQ: if (sk->sk_state == TCP_LISTEN) return -EINVAL; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) answ = 0; else answ = READ_ONCE(tp->write_seq) - tp->snd_una; break; case SIOCOUTQNSD: if (sk->sk_state == TCP_LISTEN) return -EINVAL; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) answ = 0; else answ = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt); break; default: return -ENOIOCTLCMD; } *karg = answ; return 0; } EXPORT_SYMBOL(tcp_ioctl); void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) { TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; tp->pushed_seq = tp->write_seq; } static inline bool forced_push(const struct tcp_sock *tp) { return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); } void tcp_skb_entail(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); tcb->seq = tcb->end_seq = tp->write_seq; tcb->tcp_flags = TCPHDR_ACK; __skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); if (tp->nonagle & TCP_NAGLE_PUSH) tp->nonagle &= ~TCP_NAGLE_PUSH; tcp_slow_start_after_idle_check(sk); } static inline void tcp_mark_urg(struct tcp_sock *tp, int flags) { if (flags & MSG_OOB) tp->snd_up = tp->write_seq; } /* If a not yet filled skb is pushed, do not send it if * we have data packets in Qdisc or NIC queues : * Because TX completion will happen shortly, it gives a chance * to coalesce future sendmsg() payload into this skb, without * need for a timer, and with no latency trade off. * As packets containing data payload have a bigger truesize * than pure acks (dataless) packets, the last checks prevent * autocorking if we only have an ACK in Qdisc/NIC queues, * or if TX completion was delayed after we processed ACK packet. */ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, int size_goal) { return skb->len < size_goal && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_autocorking) && !tcp_rtx_queue_empty(sk) && refcount_read(&sk->sk_wmem_alloc) > skb->truesize && tcp_skb_can_collapse_to(skb); } void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle, int size_goal) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; skb = tcp_write_queue_tail(sk); if (!skb) return; if (!(flags & MSG_MORE) || forced_push(tp)) tcp_mark_push(tp, skb); tcp_mark_urg(tp, flags); if (tcp_should_autocork(sk, skb, size_goal)) { /* avoid atomic op if TSQ_THROTTLED bit is already set */ if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING); set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); } /* It is possible TX completion already happened * before we set TSQ_THROTTLED. */ if (refcount_read(&sk->sk_wmem_alloc) > skb->truesize) return; } if (flags & MSG_MORE) nonagle = TCP_NAGLE_CORK; __tcp_push_pending_frames(sk, mss_now, nonagle); } static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, unsigned int offset, size_t len) { struct tcp_splice_state *tss = rd_desc->arg.data; int ret; ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe, min(rd_desc->count, len), tss->flags); if (ret > 0) rd_desc->count -= ret; return ret; } static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss) { /* Store TCP splice context information in read_descriptor_t. */ read_descriptor_t rd_desc = { .arg.data = tss, .count = tss->len, }; return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv); } /** * tcp_splice_read - splice data from TCP socket to a pipe * @sock: socket to splice from * @ppos: position (not valid) * @pipe: pipe to splice to * @len: number of bytes to splice * @flags: splice modifier flags * * Description: * Will read pages from given socket and fill them into a pipe. * **/ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct sock *sk = sock->sk; struct tcp_splice_state tss = { .pipe = pipe, .len = len, .flags = flags, }; long timeo; ssize_t spliced; int ret; sock_rps_record_flow(sk); /* * We can't seek on a socket input */ if (unlikely(*ppos)) return -ESPIPE; ret = spliced = 0; lock_sock(sk); timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK); while (tss.len) { ret = __tcp_splice_read(sk, &tss); if (ret < 0) break; else if (!ret) { if (spliced) break; if (sock_flag(sk, SOCK_DONE)) break; if (sk->sk_err) { ret = sock_error(sk); break; } if (sk->sk_shutdown & RCV_SHUTDOWN) break; if (sk->sk_state == TCP_CLOSE) { /* * This occurs when user tries to read * from never connected socket. */ ret = -ENOTCONN; break; } if (!timeo) { ret = -EAGAIN; break; } /* if __tcp_splice_read() got nothing while we have * an skb in receive queue, we do not want to loop. * This might happen with URG data. */ if (!skb_queue_empty(&sk->sk_receive_queue)) break; sk_wait_data(sk, &timeo, NULL); if (signal_pending(current)) { ret = sock_intr_errno(timeo); break; } continue; } tss.len -= ret; spliced += ret; if (!tss.len || !timeo) break; release_sock(sk); lock_sock(sk); if (sk->sk_err || sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN) || signal_pending(current)) break; } release_sock(sk); if (spliced) return spliced; return ret; } EXPORT_SYMBOL(tcp_splice_read); struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp, bool force_schedule) { struct sk_buff *skb; skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp); if (likely(skb)) { bool mem_scheduled; skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); if (force_schedule) { mem_scheduled = true; sk_forced_mem_schedule(sk, skb->truesize); } else { mem_scheduled = sk_wmem_schedule(sk, skb->truesize); } if (likely(mem_scheduled)) { skb_reserve(skb, MAX_TCP_HEADER); skb->ip_summed = CHECKSUM_PARTIAL; INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); return skb; } __kfree_skb(skb); } else { sk->sk_prot->enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); } return NULL; } static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, int large_allowed) { struct tcp_sock *tp = tcp_sk(sk); u32 new_size_goal, size_goal; if (!large_allowed) return mss_now; /* Note : tcp_tso_autosize() will eventually split this later */ new_size_goal = tcp_bound_to_half_wnd(tp, sk->sk_gso_max_size); /* We try hard to avoid divides here */ size_goal = tp->gso_segs * mss_now; if (unlikely(new_size_goal < size_goal || new_size_goal >= size_goal + mss_now)) { tp->gso_segs = min_t(u16, new_size_goal / mss_now, sk->sk_gso_max_segs); size_goal = tp->gso_segs * mss_now; } return max(size_goal, mss_now); } int tcp_send_mss(struct sock *sk, int *size_goal, int flags) { int mss_now; mss_now = tcp_current_mss(sk); *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB)); return mss_now; } /* In some cases, both sendmsg() could have added an skb to the write queue, * but failed adding payload on it. We need to remove it to consume less * memory, but more importantly be able to generate EPOLLOUT for Edge Trigger * epoll() users. */ void tcp_remove_empty_skb(struct sock *sk) { struct sk_buff *skb = tcp_write_queue_tail(sk); if (skb && TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { tcp_unlink_write_queue(skb, sk); if (tcp_write_queue_empty(sk)) tcp_chrono_stop(sk, TCP_CHRONO_BUSY); tcp_wmem_free_skb(sk, skb); } } /* skb changing from pure zc to mixed, must charge zc */ static int tcp_downgrade_zcopy_pure(struct sock *sk, struct sk_buff *skb) { if (unlikely(skb_zcopy_pure(skb))) { u32 extra = skb->truesize - SKB_TRUESIZE(skb_end_offset(skb)); if (!sk_wmem_schedule(sk, extra)) return -ENOMEM; sk_mem_charge(sk, extra); skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY; } return 0; } int tcp_wmem_schedule(struct sock *sk, int copy) { int left; if (likely(sk_wmem_schedule(sk, copy))) return copy; /* We could be in trouble if we have nothing queued. * Use whatever is left in sk->sk_forward_alloc and tcp_wmem[0] * to guarantee some progress. */ left = sock_net(sk)->ipv4.sysctl_tcp_wmem[0] - sk->sk_wmem_queued; if (left > 0) sk_forced_mem_schedule(sk, min(left, copy)); return min(copy, sk->sk_forward_alloc); } void tcp_free_fastopen_req(struct tcp_sock *tp) { if (tp->fastopen_req) { kfree(tp->fastopen_req); tp->fastopen_req = NULL; } } int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, size_t size, struct ubuf_info *uarg) { struct tcp_sock *tp = tcp_sk(sk); struct inet_sock *inet = inet_sk(sk); struct sockaddr *uaddr = msg->msg_name; int err, flags; if (!(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) & TFO_CLIENT_ENABLE) || (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) && uaddr->sa_family == AF_UNSPEC)) return -EOPNOTSUPP; if (tp->fastopen_req) return -EALREADY; /* Another Fast Open is in progress */ tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request), sk->sk_allocation); if (unlikely(!tp->fastopen_req)) return -ENOBUFS; tp->fastopen_req->data = msg; tp->fastopen_req->size = size; tp->fastopen_req->uarg = uarg; if (inet_test_bit(DEFER_CONNECT, sk)) { err = tcp_connect(sk); /* Same failure procedure as in tcp_v4/6_connect */ if (err) { tcp_set_state(sk, TCP_CLOSE); inet->inet_dport = 0; sk->sk_route_caps = 0; } } flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0; err = __inet_stream_connect(sk->sk_socket, uaddr, msg->msg_namelen, flags, 1); /* fastopen_req could already be freed in __inet_stream_connect * if the connection times out or gets rst */ if (tp->fastopen_req) { *copied = tp->fastopen_req->copied; tcp_free_fastopen_req(tp); inet_clear_bit(DEFER_CONNECT, sk); } return err; } int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) { struct tcp_sock *tp = tcp_sk(sk); struct ubuf_info *uarg = NULL; struct sk_buff *skb; struct sockcm_cookie sockc; int flags, err, copied = 0; int mss_now = 0, size_goal, copied_syn = 0; int process_backlog = 0; int zc = 0; long timeo; flags = msg->msg_flags; if ((flags & MSG_ZEROCOPY) && size) { if (msg->msg_ubuf) { uarg = msg->msg_ubuf; if (sk->sk_route_caps & NETIF_F_SG) zc = MSG_ZEROCOPY; } else if (sock_flag(sk, SOCK_ZEROCOPY)) { skb = tcp_write_queue_tail(sk); uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb)); if (!uarg) { err = -ENOBUFS; goto out_err; } if (sk->sk_route_caps & NETIF_F_SG) zc = MSG_ZEROCOPY; else uarg_to_msgzc(uarg)->zerocopy = 0; } } else if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES) && size) { if (sk->sk_route_caps & NETIF_F_SG) zc = MSG_SPLICE_PAGES; } if (unlikely(flags & MSG_FASTOPEN || inet_test_bit(DEFER_CONNECT, sk)) && !tp->repair) { err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg); if (err == -EINPROGRESS && copied_syn > 0) goto out; else if (err) goto out_err; } timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); tcp_rate_check_app_limited(sk); /* is sending application-limited? */ /* Wait for a connection to finish. One exception is TCP Fast Open * (passive side) where data is allowed to be sent before a connection * is fully established. */ if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && !tcp_passive_fastopen(sk)) { err = sk_stream_wait_connect(sk, &timeo); if (err != 0) goto do_error; } if (unlikely(tp->repair)) { if (tp->repair_queue == TCP_RECV_QUEUE) { copied = tcp_send_rcvq(sk, msg, size); goto out_nopush; } err = -EINVAL; if (tp->repair_queue == TCP_NO_QUEUE) goto out_err; /* 'common' sending to sendq */ } sockcm_init(&sockc, sk); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) { err = -EINVAL; goto out_err; } } /* This should be in poll */ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); /* Ok commence sending. */ copied = 0; restart: mss_now = tcp_send_mss(sk, &size_goal, flags); err = -EPIPE; if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) goto do_error; while (msg_data_left(msg)) { ssize_t copy = 0; skb = tcp_write_queue_tail(sk); if (skb) copy = size_goal - skb->len; if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) { bool first_skb; new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_space; if (unlikely(process_backlog >= 16)) { process_backlog = 0; if (sk_flush_backlog(sk)) goto restart; } first_skb = tcp_rtx_and_write_queues_empty(sk); skb = tcp_stream_alloc_skb(sk, sk->sk_allocation, first_skb); if (!skb) goto wait_for_space; process_backlog++; tcp_skb_entail(sk, skb); copy = size_goal; /* All packets are restored as if they have * already been sent. skb_mstamp_ns isn't set to * avoid wrong rtt estimation. */ if (tp->repair) TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED; } /* Try to append data to the end of skb. */ if (copy > msg_data_left(msg)) copy = msg_data_left(msg); if (zc == 0) { bool merge = true; int i = skb_shinfo(skb)->nr_frags; struct page_frag *pfrag = sk_page_frag(sk); if (!sk_page_frag_refill(sk, pfrag)) goto wait_for_space; if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { if (i >= READ_ONCE(sysctl_max_skb_frags)) { tcp_mark_push(tp, skb); goto new_segment; } merge = false; } copy = min_t(int, copy, pfrag->size - pfrag->offset); if (unlikely(skb_zcopy_pure(skb) || skb_zcopy_managed(skb))) { if (tcp_downgrade_zcopy_pure(sk, skb)) goto wait_for_space; skb_zcopy_downgrade_managed(skb); } copy = tcp_wmem_schedule(sk, copy); if (!copy) goto wait_for_space; err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, pfrag->page, pfrag->offset, copy); if (err) goto do_error; /* Update the skb. */ if (merge) { skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); } else { skb_fill_page_desc(skb, i, pfrag->page, pfrag->offset, copy); page_ref_inc(pfrag->page); } pfrag->offset += copy; } else if (zc == MSG_ZEROCOPY) { /* First append to a fragless skb builds initial * pure zerocopy skb */ if (!skb->len) skb_shinfo(skb)->flags |= SKBFL_PURE_ZEROCOPY; if (!skb_zcopy_pure(skb)) { copy = tcp_wmem_schedule(sk, copy); if (!copy) goto wait_for_space; } err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg); if (err == -EMSGSIZE || err == -EEXIST) { tcp_mark_push(tp, skb); goto new_segment; } if (err < 0) goto do_error; copy = err; } else if (zc == MSG_SPLICE_PAGES) { /* Splice in data if we can; copy if we can't. */ if (tcp_downgrade_zcopy_pure(sk, skb)) goto wait_for_space; copy = tcp_wmem_schedule(sk, copy); if (!copy) goto wait_for_space; err = skb_splice_from_iter(skb, &msg->msg_iter, copy, sk->sk_allocation); if (err < 0) { if (err == -EMSGSIZE) { tcp_mark_push(tp, skb); goto new_segment; } goto do_error; } copy = err; if (!(flags & MSG_NO_SHARED_FRAGS)) skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; sk_wmem_queued_add(sk, copy); sk_mem_charge(sk, copy); } if (!copied) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; WRITE_ONCE(tp->write_seq, tp->write_seq + copy); TCP_SKB_CB(skb)->end_seq += copy; tcp_skb_pcount_set(skb, 0); copied += copy; if (!msg_data_left(msg)) { if (unlikely(flags & MSG_EOR)) TCP_SKB_CB(skb)->eor = 1; goto out; } if (skb->len < size_goal || (flags & MSG_OOB) || unlikely(tp->repair)) continue; if (forced_push(tp)) { tcp_mark_push(tp, skb); __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); } else if (skb == tcp_send_head(sk)) tcp_push_one(sk, mss_now); continue; wait_for_space: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); if (copied) tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH, size_goal); err = sk_stream_wait_memory(sk, &timeo); if (err != 0) goto do_error; mss_now = tcp_send_mss(sk, &size_goal, flags); } out: if (copied) { tcp_tx_timestamp(sk, sockc.tsflags); tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); } out_nopush: /* msg->msg_ubuf is pinned by the caller so we don't take extra refs */ if (uarg && !msg->msg_ubuf) net_zcopy_put(uarg); return copied + copied_syn; do_error: tcp_remove_empty_skb(sk); if (copied + copied_syn) goto out; out_err: /* msg->msg_ubuf is pinned by the caller so we don't take extra refs */ if (uarg && !msg->msg_ubuf) net_zcopy_put_abort(uarg, true); err = sk_stream_error(sk, flags, err); /* make sure we wake any epoll edge trigger waiter */ if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) { sk->sk_write_space(sk); tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); } return err; } EXPORT_SYMBOL_GPL(tcp_sendmsg_locked); int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { int ret; lock_sock(sk); ret = tcp_sendmsg_locked(sk, msg, size); release_sock(sk); return ret; } EXPORT_SYMBOL(tcp_sendmsg); void tcp_splice_eof(struct socket *sock) { struct sock *sk = sock->sk; struct tcp_sock *tp = tcp_sk(sk); int mss_now, size_goal; if (!tcp_write_queue_tail(sk)) return; lock_sock(sk); mss_now = tcp_send_mss(sk, &size_goal, 0); tcp_push(sk, 0, mss_now, tp->nonagle, size_goal); release_sock(sk); } EXPORT_SYMBOL_GPL(tcp_splice_eof); /* * Handle reading urgent data. BSD has very simple semantics for * this, no blocking and very strange errors 8) */ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags) { struct tcp_sock *tp = tcp_sk(sk); /* No URG data to read. */ if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data || tp->urg_data == TCP_URG_READ) return -EINVAL; /* Yes this is right ! */ if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE)) return -ENOTCONN; if (tp->urg_data & TCP_URG_VALID) { int err = 0; char c = tp->urg_data; if (!(flags & MSG_PEEK)) WRITE_ONCE(tp->urg_data, TCP_URG_READ); /* Read urgent data. */ msg->msg_flags |= MSG_OOB; if (len > 0) { if (!(flags & MSG_TRUNC)) err = memcpy_to_msg(msg, &c, 1); len = 1; } else msg->msg_flags |= MSG_TRUNC; return err ? -EFAULT : len; } if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN)) return 0; /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and * the available implementations agree in this case: * this call should never block, independent of the * blocking state of the socket. * Mike <pall@rz.uni-karlsruhe.de> */ return -EAGAIN; } static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) { struct sk_buff *skb; int copied = 0, err = 0; /* XXX -- need to support SO_PEEK_OFF */ skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { err = skb_copy_datagram_msg(skb, 0, msg, skb->len); if (err) return err; copied += skb->len; } skb_queue_walk(&sk->sk_write_queue, skb) { err = skb_copy_datagram_msg(skb, 0, msg, skb->len); if (err) break; copied += skb->len; } return err ?: copied; } /* Clean up the receive buffer for full frames taken by the user, * then send an ACK if necessary. COPIED is the number of bytes * tcp_recvmsg has given to the user so far, it speeds up the * calculation of whether or not we must ACK for the sake of * a window update. */ void __tcp_cleanup_rbuf(struct sock *sk, int copied) { struct tcp_sock *tp = tcp_sk(sk); bool time_to_ack = false; if (inet_csk_ack_scheduled(sk)) { const struct inet_connection_sock *icsk = inet_csk(sk); if (/* Once-per-two-segments ACK was not sent by tcp_input.c */ tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss || /* * If this read emptied read buffer, we send ACK, if * connection is not bidirectional, user drained * receive buffer and there was a small segment * in queue. */ (copied > 0 && ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) || ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && !inet_csk_in_pingpong_mode(sk))) && !atomic_read(&sk->sk_rmem_alloc))) time_to_ack = true; } /* We send an ACK if we can now advertise a non-zero window * which has been raised "significantly". * * Even if window raised up to infinity, do not send window open ACK * in states, where we will not receive more. It is useless. */ if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) { __u32 rcv_window_now = tcp_receive_window(tp); /* Optimize, __tcp_select_window() is not cheap. */ if (2*rcv_window_now <= tp->window_clamp) { __u32 new_window = __tcp_select_window(sk); /* Send ACK now, if this read freed lots of space * in our buffer. Certainly, new_window is new window. * We can advertise it now, if it is not less than current one. * "Lots" means "at least twice" here. */ if (new_window && new_window >= 2 * rcv_window_now) time_to_ack = true; } } if (time_to_ack) tcp_send_ack(sk); } void tcp_cleanup_rbuf(struct sock *sk, int copied) { struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); struct tcp_sock *tp = tcp_sk(sk); WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq), "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n", tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt); __tcp_cleanup_rbuf(sk, copied); } static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb) { __skb_unlink(skb, &sk->sk_receive_queue); if (likely(skb->destructor == sock_rfree)) { sock_rfree(skb); skb->destructor = NULL; skb->sk = NULL; return skb_attempt_defer_free(skb); } __kfree_skb(skb); } struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) { struct sk_buff *skb; u32 offset; while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { offset = seq - TCP_SKB_CB(skb)->seq; if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { pr_err_once("%s: found a SYN, please report !\n", __func__); offset--; } if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) { *off = offset; return skb; } /* This looks weird, but this can happen if TCP collapsing * splitted a fat GRO packet, while we released socket lock * in skb_splice_bits() */ tcp_eat_recv_skb(sk, skb); } return NULL; } EXPORT_SYMBOL(tcp_recv_skb); /* * This routine provides an alternative to tcp_recvmsg() for routines * that would like to handle copying from skbuffs directly in 'sendfile' * fashion. * Note: * - It is assumed that the socket was locked by the caller. * - The routine does not block. * - At present, there is no support for reading OOB data * or for 'peeking' the socket using this routine * (although both would be easy to implement). */ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor) { struct sk_buff *skb; struct tcp_sock *tp = tcp_sk(sk); u32 seq = tp->copied_seq; u32 offset; int copied = 0; if (sk->sk_state == TCP_LISTEN) return -ENOTCONN; while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { if (offset < skb->len) { int used; size_t len; len = skb->len - offset; /* Stop reading if we hit a patch of urgent data */ if (unlikely(tp->urg_data)) { u32 urg_offset = tp->urg_seq - seq; if (urg_offset < len) len = urg_offset; if (!len) break; } used = recv_actor(desc, skb, offset, len); if (used <= 0) { if (!copied) copied = used; break; } if (WARN_ON_ONCE(used > len)) used = len; seq += used; copied += used; offset += used; /* If recv_actor drops the lock (e.g. TCP splice * receive) the skb pointer might be invalid when * getting here: tcp_collapse might have deleted it * while aggregating skbs from the socket queue. */ skb = tcp_recv_skb(sk, seq - 1, &offset); if (!skb) break; /* TCP coalescing might have appended data to the skb. * Try to splice more frags */ if (offset + 1 != skb->len) continue; } if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { tcp_eat_recv_skb(sk, skb); ++seq; break; } tcp_eat_recv_skb(sk, skb); if (!desc->count) break; WRITE_ONCE(tp->copied_seq, seq); } WRITE_ONCE(tp->copied_seq, seq); tcp_rcv_space_adjust(sk); /* Clean up data we have read: This will do ACK frames. */ if (copied > 0) { tcp_recv_skb(sk, seq, &offset); tcp_cleanup_rbuf(sk, copied); } return copied; } EXPORT_SYMBOL(tcp_read_sock); int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { struct tcp_sock *tp = tcp_sk(sk); u32 seq = tp->copied_seq; struct sk_buff *skb; int copied = 0; u32 offset; if (sk->sk_state == TCP_LISTEN) return -ENOTCONN; while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { u8 tcp_flags; int used; __skb_unlink(skb, &sk->sk_receive_queue); WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk)); tcp_flags = TCP_SKB_CB(skb)->tcp_flags; used = recv_actor(sk, skb); if (used < 0) { if (!copied) copied = used; break; } seq += used; copied += used; if (tcp_flags & TCPHDR_FIN) { ++seq; break; } } return copied; } EXPORT_SYMBOL(tcp_read_skb); void tcp_read_done(struct sock *sk, size_t len) { struct tcp_sock *tp = tcp_sk(sk); u32 seq = tp->copied_seq; struct sk_buff *skb; size_t left; u32 offset; if (sk->sk_state == TCP_LISTEN) return; left = len; while (left && (skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { int used; used = min_t(size_t, skb->len - offset, left); seq += used; left -= used; if (skb->len > offset + used) break; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { tcp_eat_recv_skb(sk, skb); ++seq; break; } tcp_eat_recv_skb(sk, skb); } WRITE_ONCE(tp->copied_seq, seq); tcp_rcv_space_adjust(sk); /* Clean up data we have read: This will do ACK frames. */ if (left != len) tcp_cleanup_rbuf(sk, len - left); } EXPORT_SYMBOL(tcp_read_done); int tcp_peek_len(struct socket *sock) { return tcp_inq(sock->sk); } EXPORT_SYMBOL(tcp_peek_len); /* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */ int tcp_set_rcvlowat(struct sock *sk, int val) { int space, cap; if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) cap = sk->sk_rcvbuf >> 1; else cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1; val = min(val, cap); WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); /* Check if we need to signal EPOLLIN right now */ tcp_data_ready(sk); if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) return 0; space = tcp_space_from_win(sk, val); if (space > sk->sk_rcvbuf) { WRITE_ONCE(sk->sk_rcvbuf, space); tcp_sk(sk)->window_clamp = val; } return 0; } EXPORT_SYMBOL(tcp_set_rcvlowat); void tcp_update_recv_tstamps(struct sk_buff *skb, struct scm_timestamping_internal *tss) { if (skb->tstamp) tss->ts[0] = ktime_to_timespec64(skb->tstamp); else tss->ts[0] = (struct timespec64) {0}; if (skb_hwtstamps(skb)->hwtstamp) tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp); else tss->ts[2] = (struct timespec64) {0}; } #ifdef CONFIG_MMU static const struct vm_operations_struct tcp_vm_ops = { }; int tcp_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) { if (vma->vm_flags & (VM_WRITE | VM_EXEC)) return -EPERM; vm_flags_clear(vma, VM_MAYWRITE | VM_MAYEXEC); /* Instruct vm_insert_page() to not mmap_read_lock(mm) */ vm_flags_set(vma, VM_MIXEDMAP); vma->vm_ops = &tcp_vm_ops; return 0; } EXPORT_SYMBOL(tcp_mmap); static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb, u32 *offset_frag) { skb_frag_t *frag; if (unlikely(offset_skb >= skb->len)) return NULL; offset_skb -= skb_headlen(skb); if ((int)offset_skb < 0 || skb_has_frag_list(skb)) return NULL; frag = skb_shinfo(skb)->frags; while (offset_skb) { if (skb_frag_size(frag) > offset_skb) { *offset_frag = offset_skb; return frag; } offset_skb -= skb_frag_size(frag); ++frag; } *offset_frag = 0; return frag; } static bool can_map_frag(const skb_frag_t *frag) { return skb_frag_size(frag) == PAGE_SIZE && !skb_frag_off(frag); } static int find_next_mappable_frag(const skb_frag_t *frag, int remaining_in_skb) { int offset = 0; if (likely(can_map_frag(frag))) return 0; while (offset < remaining_in_skb && !can_map_frag(frag)) { offset += skb_frag_size(frag); ++frag; } return offset; } static void tcp_zerocopy_set_hint_for_skb(struct sock *sk, struct tcp_zerocopy_receive *zc, struct sk_buff *skb, u32 offset) { u32 frag_offset, partial_frag_remainder = 0; int mappable_offset; skb_frag_t *frag; /* worst case: skip to next skb. try to improve on this case below */ zc->recv_skip_hint = skb->len - offset; /* Find the frag containing this offset (and how far into that frag) */ frag = skb_advance_to_frag(skb, offset, &frag_offset); if (!frag) return; if (frag_offset) { struct skb_shared_info *info = skb_shinfo(skb); /* We read part of the last frag, must recvmsg() rest of skb. */ if (frag == &info->frags[info->nr_frags - 1]) return; /* Else, we must at least read the remainder in this frag. */ partial_frag_remainder = skb_frag_size(frag) - frag_offset; zc->recv_skip_hint -= partial_frag_remainder; ++frag; } /* partial_frag_remainder: If part way through a frag, must read rest. * mappable_offset: Bytes till next mappable frag, *not* counting bytes * in partial_frag_remainder. */ mappable_offset = find_next_mappable_frag(frag, zc->recv_skip_hint); zc->recv_skip_hint = mappable_offset + partial_frag_remainder; } static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, int flags, struct scm_timestamping_internal *tss, int *cmsg_flags); static int receive_fallback_to_copy(struct sock *sk, struct tcp_zerocopy_receive *zc, int inq, struct scm_timestamping_internal *tss) { unsigned long copy_address = (unsigned long)zc->copybuf_address; struct msghdr msg = {}; struct iovec iov; int err; zc->length = 0; zc->recv_skip_hint = 0; if (copy_address != zc->copybuf_address) return -EINVAL; err = import_single_range(ITER_DEST, (void __user *)copy_address, inq, &iov, &msg.msg_iter); if (err) return err; err = tcp_recvmsg_locked(sk, &msg, inq, MSG_DONTWAIT, tss, &zc->msg_flags); if (err < 0) return err; zc->copybuf_len = err; if (likely(zc->copybuf_len)) { struct sk_buff *skb; u32 offset; skb = tcp_recv_skb(sk, tcp_sk(sk)->copied_seq, &offset); if (skb) tcp_zerocopy_set_hint_for_skb(sk, zc, skb, offset); } return 0; } static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc, struct sk_buff *skb, u32 copylen, u32 *offset, u32 *seq) { unsigned long copy_address = (unsigned long)zc->copybuf_address; struct msghdr msg = {}; struct iovec iov; int err; if (copy_address != zc->copybuf_address) return -EINVAL; err = import_single_range(ITER_DEST, (void __user *)copy_address, copylen, &iov, &msg.msg_iter); if (err) return err; err = skb_copy_datagram_msg(skb, *offset, &msg, copylen); if (err) return err; zc->recv_skip_hint -= copylen; *offset += copylen; *seq += copylen; return (__s32)copylen; } static int tcp_zc_handle_leftover(struct tcp_zerocopy_receive *zc, struct sock *sk, struct sk_buff *skb, u32 *seq, s32 copybuf_len, struct scm_timestamping_internal *tss) { u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint); if (!copylen) return 0; /* skb is null if inq < PAGE_SIZE. */ if (skb) { offset = *seq - TCP_SKB_CB(skb)->seq; } else { skb = tcp_recv_skb(sk, *seq, &offset); if (TCP_SKB_CB(skb)->has_rxtstamp) { tcp_update_recv_tstamps(skb, tss); zc->msg_flags |= TCP_CMSG_TS; } } zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset, seq); return zc->copybuf_len < 0 ? 0 : copylen; } static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma, struct page **pending_pages, unsigned long pages_remaining, unsigned long *address, u32 *length, u32 *seq, struct tcp_zerocopy_receive *zc, u32 total_bytes_to_map, int err) { /* At least one page did not map. Try zapping if we skipped earlier. */ if (err == -EBUSY && zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT) { u32 maybe_zap_len; maybe_zap_len = total_bytes_to_map - /* All bytes to map */ *length + /* Mapped or pending */ (pages_remaining * PAGE_SIZE); /* Failed map. */ zap_page_range_single(vma, *address, maybe_zap_len, NULL); err = 0; } if (!err) { unsigned long leftover_pages = pages_remaining; int bytes_mapped; /* We called zap_page_range_single, try to reinsert. */ err = vm_insert_pages(vma, *address, pending_pages, &pages_remaining); bytes_mapped = PAGE_SIZE * (leftover_pages - pages_remaining); *seq += bytes_mapped; *address += bytes_mapped; } if (err) { /* Either we were unable to zap, OR we zapped, retried an * insert, and still had an issue. Either ways, pages_remaining * is the number of pages we were unable to map, and we unroll * some state we speculatively touched before. */ const int bytes_not_mapped = PAGE_SIZE * pages_remaining; *length -= bytes_not_mapped; zc->recv_skip_hint += bytes_not_mapped; } return err; } static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma, struct page **pages, unsigned int pages_to_map, unsigned long *address, u32 *length, u32 *seq, struct tcp_zerocopy_receive *zc, u32 total_bytes_to_map) { unsigned long pages_remaining = pages_to_map; unsigned int pages_mapped; unsigned int bytes_mapped; int err; err = vm_insert_pages(vma, *address, pages, &pages_remaining); pages_mapped = pages_to_map - (unsigned int)pages_remaining; bytes_mapped = PAGE_SIZE * pages_mapped; /* Even if vm_insert_pages fails, it may have partially succeeded in * mapping (some but not all of the pages). */ *seq += bytes_mapped; *address += bytes_mapped; if (likely(!err)) return 0; /* Error: maybe zap and retry + rollback state for failed inserts. */ return tcp_zerocopy_vm_insert_batch_error(vma, pages + pages_mapped, pages_remaining, address, length, seq, zc, total_bytes_to_map, err); } #define TCP_VALID_ZC_MSG_FLAGS (TCP_CMSG_TS) static void tcp_zc_finalize_rx_tstamp(struct sock *sk, struct tcp_zerocopy_receive *zc, struct scm_timestamping_internal *tss) { unsigned long msg_control_addr; struct msghdr cmsg_dummy; msg_control_addr = (unsigned long)zc->msg_control; cmsg_dummy.msg_control_user = (void __user *)msg_control_addr; cmsg_dummy.msg_controllen = (__kernel_size_t)zc->msg_controllen; cmsg_dummy.msg_flags = in_compat_syscall() ? MSG_CMSG_COMPAT : 0; cmsg_dummy.msg_control_is_user = true; zc->msg_flags = 0; if (zc->msg_control == msg_control_addr && zc->msg_controllen == cmsg_dummy.msg_controllen) { tcp_recv_timestamp(&cmsg_dummy, sk, tss); zc->msg_control = (__u64) ((uintptr_t)cmsg_dummy.msg_control_user); zc->msg_controllen = (__u64)cmsg_dummy.msg_controllen; zc->msg_flags = (__u32)cmsg_dummy.msg_flags; } } static struct vm_area_struct *find_tcp_vma(struct mm_struct *mm, unsigned long address, bool *mmap_locked) { struct vm_area_struct *vma = lock_vma_under_rcu(mm, address); if (vma) { if (vma->vm_ops != &tcp_vm_ops) { vma_end_read(vma); return NULL; } *mmap_locked = false; return vma; } mmap_read_lock(mm); vma = vma_lookup(mm, address); if (!vma || vma->vm_ops != &tcp_vm_ops) { mmap_read_unlock(mm); return NULL; } *mmap_locked = true; return vma; } #define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32 static int tcp_zerocopy_receive(struct sock *sk, struct tcp_zerocopy_receive *zc, struct scm_timestamping_internal *tss) { u32 length = 0, offset, vma_len, avail_len, copylen = 0; unsigned long address = (unsigned long)zc->address; struct page *pages[TCP_ZEROCOPY_PAGE_BATCH_SIZE]; s32 copybuf_len = zc->copybuf_len; struct tcp_sock *tp = tcp_sk(sk); const skb_frag_t *frags = NULL; unsigned int pages_to_map = 0; struct vm_area_struct *vma; struct sk_buff *skb = NULL; u32 seq = tp->copied_seq; u32 total_bytes_to_map; int inq = tcp_inq(sk); bool mmap_locked; int ret; zc->copybuf_len = 0; zc->msg_flags = 0; if (address & (PAGE_SIZE - 1) || address != zc->address) return -EINVAL; if (sk->sk_state == TCP_LISTEN) return -ENOTCONN; sock_rps_record_flow(sk); if (inq && inq <= copybuf_len) return receive_fallback_to_copy(sk, zc, inq, tss); if (inq < PAGE_SIZE) { zc->length = 0; zc->recv_skip_hint = inq; if (!inq && sock_flag(sk, SOCK_DONE)) return -EIO; return 0; } vma = find_tcp_vma(current->mm, address, &mmap_locked); if (!vma) return -EINVAL; vma_len = min_t(unsigned long, zc->length, vma->vm_end - address); avail_len = min_t(u32, vma_len, inq); total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1); if (total_bytes_to_map) { if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT)) zap_page_range_single(vma, address, total_bytes_to_map, NULL); zc->length = total_bytes_to_map; zc->recv_skip_hint = 0; } else { zc->length = avail_len; zc->recv_skip_hint = avail_len; } ret = 0; while (length + PAGE_SIZE <= zc->length) { int mappable_offset; struct page *page; if (zc->recv_skip_hint < PAGE_SIZE) { u32 offset_frag; if (skb) { if (zc->recv_skip_hint > 0) break; skb = skb->next; offset = seq - TCP_SKB_CB(skb)->seq; } else { skb = tcp_recv_skb(sk, seq, &offset); } if (TCP_SKB_CB(skb)->has_rxtstamp) { tcp_update_recv_tstamps(skb, tss); zc->msg_flags |= TCP_CMSG_TS; } zc->recv_skip_hint = skb->len - offset; frags = skb_advance_to_frag(skb, offset, &offset_frag); if (!frags || offset_frag) break; } mappable_offset = find_next_mappable_frag(frags, zc->recv_skip_hint); if (mappable_offset) { zc->recv_skip_hint = mappable_offset; break; } page = skb_frag_page(frags); prefetchw(page); pages[pages_to_map++] = page; length += PAGE_SIZE; zc->recv_skip_hint -= PAGE_SIZE; frags++; if (pages_to_map == TCP_ZEROCOPY_PAGE_BATCH_SIZE || zc->recv_skip_hint < PAGE_SIZE) { /* Either full batch, or we're about to go to next skb * (and we cannot unroll failed ops across skbs). */ ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map, &address, &length, &seq, zc, total_bytes_to_map); if (ret) goto out; pages_to_map = 0; } } if (pages_to_map) { ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map, &address, &length, &seq, zc, total_bytes_to_map); } out: if (mmap_locked) mmap_read_unlock(current->mm); else vma_end_read(vma); /* Try to copy straggler data. */ if (!ret) copylen = tcp_zc_handle_leftover(zc, sk, skb, &seq, copybuf_len, tss); if (length + copylen) { WRITE_ONCE(tp->copied_seq, seq); tcp_rcv_space_adjust(sk); /* Clean up data we have read: This will do ACK frames. */ tcp_recv_skb(sk, seq, &offset); tcp_cleanup_rbuf(sk, length + copylen); ret = 0; if (length == zc->length) zc->recv_skip_hint = 0; } else { if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE)) ret = -EIO; } zc->length = length; return ret; } #endif /* Similar to __sock_recv_timestamp, but does not require an skb */ void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, struct scm_timestamping_internal *tss) { int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW); bool has_timestamping = false; if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) { if (sock_flag(sk, SOCK_RCVTSTAMP)) { if (sock_flag(sk, SOCK_RCVTSTAMPNS)) { if (new_tstamp) { struct __kernel_timespec kts = { .tv_sec = tss->ts[0].tv_sec, .tv_nsec = tss->ts[0].tv_nsec, }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW, sizeof(kts), &kts); } else { struct __kernel_old_timespec ts_old = { .tv_sec = tss->ts[0].tv_sec, .tv_nsec = tss->ts[0].tv_nsec, }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD, sizeof(ts_old), &ts_old); } } else { if (new_tstamp) { struct __kernel_sock_timeval stv = { .tv_sec = tss->ts[0].tv_sec, .tv_usec = tss->ts[0].tv_nsec / 1000, }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW, sizeof(stv), &stv); } else { struct __kernel_old_timeval tv = { .tv_sec = tss->ts[0].tv_sec, .tv_usec = tss->ts[0].tv_nsec / 1000, }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD, sizeof(tv), &tv); } } } if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_SOFTWARE) has_timestamping = true; else tss->ts[0] = (struct timespec64) {0}; } if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) { if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_RAW_HARDWARE) has_timestamping = true; else tss->ts[2] = (struct timespec64) {0}; } if (has_timestamping) { tss->ts[1] = (struct timespec64) {0}; if (sock_flag(sk, SOCK_TSTAMP_NEW)) put_cmsg_scm_timestamping64(msg, tss); else put_cmsg_scm_timestamping(msg, tss); } } static int tcp_inq_hint(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); u32 copied_seq = READ_ONCE(tp->copied_seq); u32 rcv_nxt = READ_ONCE(tp->rcv_nxt); int inq; inq = rcv_nxt - copied_seq; if (unlikely(inq < 0 || copied_seq != READ_ONCE(tp->copied_seq))) { lock_sock(sk); inq = tp->rcv_nxt - tp->copied_seq; release_sock(sk); } /* After receiving a FIN, tell the user-space to continue reading * by returning a non-zero inq. */ if (inq == 0 && sock_flag(sk, SOCK_DONE)) inq = 1; return inq; } /* * This routine copies from a sock struct into the user buffer. * * Technical note: in 2.3 we work on _locked_ socket, so that * tricks with *seq access order and skb->users are not required. * Probably, code can be easily improved even more. */ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, int flags, struct scm_timestamping_internal *tss, int *cmsg_flags) { struct tcp_sock *tp = tcp_sk(sk); int copied = 0; u32 peek_seq; u32 *seq; unsigned long used; int err; int target; /* Read at least this many bytes */ long timeo; struct sk_buff *skb, *last; u32 urg_hole = 0; err = -ENOTCONN; if (sk->sk_state == TCP_LISTEN) goto out; if (tp->recvmsg_inq) { *cmsg_flags = TCP_CMSG_INQ; msg->msg_get_inq = 1; } timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); /* Urgent data needs to be handled specially. */ if (flags & MSG_OOB) goto recv_urg; if (unlikely(tp->repair)) { err = -EPERM; if (!(flags & MSG_PEEK)) goto out; if (tp->repair_queue == TCP_SEND_QUEUE) goto recv_sndq; err = -EINVAL; if (tp->repair_queue == TCP_NO_QUEUE) goto out; /* 'common' recv queue MSG_PEEK-ing */ } seq = &tp->copied_seq; if (flags & MSG_PEEK) { peek_seq = tp->copied_seq; seq = &peek_seq; } target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); do { u32 offset; /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */ if (unlikely(tp->urg_data) && tp->urg_seq == *seq) { if (copied) break; if (signal_pending(current)) { copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; break; } } /* Next get a buffer. */ last = skb_peek_tail(&sk->sk_receive_queue); skb_queue_walk(&sk->sk_receive_queue, skb) { last = skb; /* Now that we have two receive queues this * shouldn't happen. */ if (WARN(before(*seq, TCP_SKB_CB(skb)->seq), "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n", *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags)) break; offset = *seq - TCP_SKB_CB(skb)->seq; if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { pr_err_once("%s: found a SYN, please report !\n", __func__); offset--; } if (offset < skb->len) goto found_ok_skb; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; WARN(!(flags & MSG_PEEK), "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n", *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags); } /* Well, if we have backlog, try to process it now yet. */ if (copied >= target && !READ_ONCE(sk->sk_backlog.tail)) break; if (copied) { if (!timeo || sk->sk_err || sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN) || signal_pending(current)) break; } else { if (sock_flag(sk, SOCK_DONE)) break; if (sk->sk_err) { copied = sock_error(sk); break; } if (sk->sk_shutdown & RCV_SHUTDOWN) break; if (sk->sk_state == TCP_CLOSE) { /* This occurs when user tries to read * from never connected socket. */ copied = -ENOTCONN; break; } if (!timeo) { copied = -EAGAIN; break; } if (signal_pending(current)) { copied = sock_intr_errno(timeo); break; } } if (copied >= target) { /* Do not sleep, just process backlog. */ __sk_flush_backlog(sk); } else { tcp_cleanup_rbuf(sk, copied); sk_wait_data(sk, &timeo, last); } if ((flags & MSG_PEEK) && (peek_seq - copied - urg_hole != tp->copied_seq)) { net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n", current->comm, task_pid_nr(current)); peek_seq = tp->copied_seq; } continue; found_ok_skb: /* Ok so how much can we use? */ used = skb->len - offset; if (len < used) used = len; /* Do we have urgent data here? */ if (unlikely(tp->urg_data)) { u32 urg_offset = tp->urg_seq - *seq; if (urg_offset < used) { if (!urg_offset) { if (!sock_flag(sk, SOCK_URGINLINE)) { WRITE_ONCE(*seq, *seq + 1); urg_hole++; offset++; used--; if (!used) goto skip_copy; } } else used = urg_offset; } } if (!(flags & MSG_TRUNC)) { err = skb_copy_datagram_msg(skb, offset, msg, used); if (err) { /* Exception. Bailout! */ if (!copied) copied = -EFAULT; break; } } WRITE_ONCE(*seq, *seq + used); copied += used; len -= used; tcp_rcv_space_adjust(sk); skip_copy: if (unlikely(tp->urg_data) && after(tp->copied_seq, tp->urg_seq)) { WRITE_ONCE(tp->urg_data, 0); tcp_fast_path_check(sk); } if (TCP_SKB_CB(skb)->has_rxtstamp) { tcp_update_recv_tstamps(skb, tss); *cmsg_flags |= TCP_CMSG_TS; } if (used + offset < skb->len) continue; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; if (!(flags & MSG_PEEK)) tcp_eat_recv_skb(sk, skb); continue; found_fin_ok: /* Process the FIN. */ WRITE_ONCE(*seq, *seq + 1); if (!(flags & MSG_PEEK)) tcp_eat_recv_skb(sk, skb); break; } while (len > 0); /* According to UNIX98, msg_name/msg_namelen are ignored * on connected socket. I was just happy when found this 8) --ANK */ /* Clean up data we have read: This will do ACK frames. */ tcp_cleanup_rbuf(sk, copied); return copied; out: return err; recv_urg: err = tcp_recv_urg(sk, msg, len, flags); goto out; recv_sndq: err = tcp_peek_sndq(sk, msg, len); goto out; } int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { int cmsg_flags = 0, ret; struct scm_timestamping_internal tss; if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) && sk->sk_state == TCP_ESTABLISHED) sk_busy_loop(sk, flags & MSG_DONTWAIT); lock_sock(sk); ret = tcp_recvmsg_locked(sk, msg, len, flags, &tss, &cmsg_flags); release_sock(sk); if ((cmsg_flags || msg->msg_get_inq) && ret >= 0) { if (cmsg_flags & TCP_CMSG_TS) tcp_recv_timestamp(msg, sk, &tss); if (msg->msg_get_inq) { msg->msg_inq = tcp_inq_hint(sk); if (cmsg_flags & TCP_CMSG_INQ) put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(msg->msg_inq), &msg->msg_inq); } } return ret; } EXPORT_SYMBOL(tcp_recvmsg); void tcp_set_state(struct sock *sk, int state) { int oldstate = sk->sk_state; /* We defined a new enum for TCP states that are exported in BPF * so as not force the internal TCP states to be frozen. The * following checks will detect if an internal state value ever * differs from the BPF value. If this ever happens, then we will * need to remap the internal value to the BPF value before calling * tcp_call_bpf_2arg. */ BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED); BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT); BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV); BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1); BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2); BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT); BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE); BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT); BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK); BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN); BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING); BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV); BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES); /* bpf uapi header bpf.h defines an anonymous enum with values * BPF_TCP_* used by bpf programs. Currently gcc built vmlinux * is able to emit this enum in DWARF due to the above BUILD_BUG_ON. * But clang built vmlinux does not have this enum in DWARF * since clang removes the above code before generating IR/debuginfo. * Let us explicitly emit the type debuginfo to ensure the * above-mentioned anonymous enum in the vmlinux DWARF and hence BTF * regardless of which compiler is used. */ BTF_TYPE_EMIT_ENUM(BPF_TCP_ESTABLISHED); if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG)) tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state); switch (state) { case TCP_ESTABLISHED: if (oldstate != TCP_ESTABLISHED) TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); break; case TCP_CLOSE: if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED) TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS); sk->sk_prot->unhash(sk); if (inet_csk(sk)->icsk_bind_hash && !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) inet_put_port(sk); fallthrough; default: if (oldstate == TCP_ESTABLISHED) TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); } /* Change state AFTER socket is unhashed to avoid closed * socket sitting in hash tables. */ inet_sk_state_store(sk, state); } EXPORT_SYMBOL_GPL(tcp_set_state); /* * State processing on a close. This implements the state shift for * sending our FIN frame. Note that we only send a FIN for some * states. A shutdown() may have already sent the FIN, or we may be * closed. */ static const unsigned char new_state[16] = { /* current state: new state: action: */ [0 /* (Invalid) */] = TCP_CLOSE, [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, [TCP_SYN_SENT] = TCP_CLOSE, [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, [TCP_FIN_WAIT1] = TCP_FIN_WAIT1, [TCP_FIN_WAIT2] = TCP_FIN_WAIT2, [TCP_TIME_WAIT] = TCP_CLOSE, [TCP_CLOSE] = TCP_CLOSE, [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN, [TCP_LAST_ACK] = TCP_LAST_ACK, [TCP_LISTEN] = TCP_CLOSE, [TCP_CLOSING] = TCP_CLOSING, [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */ }; static int tcp_close_state(struct sock *sk) { int next = (int)new_state[sk->sk_state]; int ns = next & TCP_STATE_MASK; tcp_set_state(sk, ns); return next & TCP_ACTION_FIN; } /* * Shutdown the sending side of a connection. Much like close except * that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD). */ void tcp_shutdown(struct sock *sk, int how) { /* We need to grab some memory, and put together a FIN, * and then put it into the queue to be sent. * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92. */ if (!(how & SEND_SHUTDOWN)) return; /* If we've already sent a FIN, or it's a closed state, skip this. */ if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) { /* Clear out any half completed packets. FIN if needed. */ if (tcp_close_state(sk)) tcp_send_fin(sk); } } EXPORT_SYMBOL(tcp_shutdown); int tcp_orphan_count_sum(void) { int i, total = 0; for_each_possible_cpu(i) total += per_cpu(tcp_orphan_count, i); return max(total, 0); } static int tcp_orphan_cache; static struct timer_list tcp_orphan_timer; #define TCP_ORPHAN_TIMER_PERIOD msecs_to_jiffies(100) static void tcp_orphan_update(struct timer_list *unused) { WRITE_ONCE(tcp_orphan_cache, tcp_orphan_count_sum()); mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD); } static bool tcp_too_many_orphans(int shift) { return READ_ONCE(tcp_orphan_cache) << shift > READ_ONCE(sysctl_tcp_max_orphans); } bool tcp_check_oom(struct sock *sk, int shift) { bool too_many_orphans, out_of_socket_memory; too_many_orphans = tcp_too_many_orphans(shift); out_of_socket_memory = tcp_out_of_memory(sk); if (too_many_orphans) net_info_ratelimited("too many orphaned sockets\n"); if (out_of_socket_memory) net_info_ratelimited("out of memory -- consider tuning tcp_mem\n"); return too_many_orphans || out_of_socket_memory; } void __tcp_close(struct sock *sk, long timeout) { struct sk_buff *skb; int data_was_unread = 0; int state; WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); if (sk->sk_state == TCP_LISTEN) { tcp_set_state(sk, TCP_CLOSE); /* Special case. */ inet_csk_listen_stop(sk); goto adjudge_to_death; } /* We need to flush the recv. buffs. We do this only on the * descriptor close, not protocol-sourced closes, because the * reader process may not have drained the data yet! */ while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) len--; data_was_unread += len; __kfree_skb(skb); } /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */ if (sk->sk_state == TCP_CLOSE) goto adjudge_to_death; /* As outlined in RFC 2525, section 2.17, we send a RST here because * data was lost. To witness the awful effects of the old behavior of * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk * GET in an FTP client, suspend the process, wait for the client to * advertise a zero window, then kill -9 the FTP client, wheee... * Note: timeout is always zero in such a case. */ if (unlikely(tcp_sk(sk)->repair)) { sk->sk_prot->disconnect(sk, 0); } else if (data_was_unread) { /* Unread data was tossed, zap the connection. */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, sk->sk_allocation); } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { /* Check zero linger _after_ checking for unread data. */ sk->sk_prot->disconnect(sk, 0); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); } else if (tcp_close_state(sk)) { /* We FIN if the application ate all the data before * zapping the connection. */ /* RED-PEN. Formally speaking, we have broken TCP state * machine. State transitions: * * TCP_ESTABLISHED -> TCP_FIN_WAIT1 * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible) * TCP_CLOSE_WAIT -> TCP_LAST_ACK * * are legal only when FIN has been sent (i.e. in window), * rather than queued out of window. Purists blame. * * F.e. "RFC state" is ESTABLISHED, * if Linux state is FIN-WAIT-1, but FIN is still not sent. * * The visible declinations are that sometimes * we enter time-wait state, when it is not required really * (harmless), do not send active resets, when they are * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when * they look as CLOSING or LAST_ACK for Linux) * Probably, I missed some more holelets. * --ANK * XXX (TFO) - To start off we don't support SYN+ACK+FIN * in a single packet! (May consider it later but will * probably need API support or TCP_CORK SYN-ACK until * data is written and socket is closed.) */ tcp_send_fin(sk); } sk_stream_wait_close(sk, timeout); adjudge_to_death: state = sk->sk_state; sock_hold(sk); sock_orphan(sk); local_bh_disable(); bh_lock_sock(sk); /* remove backlog if any, without releasing ownership. */ __release_sock(sk); this_cpu_inc(tcp_orphan_count); /* Have we already been destroyed by a softirq or backlog? */ if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE) goto out; /* This is a (useful) BSD violating of the RFC. There is a * problem with TCP as specified in that the other end could * keep a socket open forever with no application left this end. * We use a 1 minute timeout (about the same as BSD) then kill * our end. If they send after that then tough - BUT: long enough * that we won't make the old 4*rto = almost no time - whoops * reset mistake. * * Nope, it was not mistake. It is really desired behaviour * f.e. on http servers, when such sockets are useless, but * consume significant resources. Let's do it with special * linger2 option. --ANK */ if (sk->sk_state == TCP_FIN_WAIT2) { struct tcp_sock *tp = tcp_sk(sk); if (READ_ONCE(tp->linger2) < 0) { tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONLINGER); } else { const int tmo = tcp_fin_time(sk); if (tmo > TCP_TIMEWAIT_LEN) { inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); } else { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); goto out; } } } if (sk->sk_state != TCP_CLOSE) { if (tcp_check_oom(sk, 0)) { tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); } else if (!check_net(sock_net(sk))) { /* Not possible to send reset; just close */ tcp_set_state(sk, TCP_CLOSE); } } if (sk->sk_state == TCP_CLOSE) { struct request_sock *req; req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, lockdep_sock_is_held(sk)); /* We could get here with a non-NULL req if the socket is * aborted (e.g., closed with unread data) before 3WHS * finishes. */ if (req) reqsk_fastopen_remove(sk, req, false); inet_csk_destroy_sock(sk); } /* Otherwise, socket is reprieved until protocol close. */ out: bh_unlock_sock(sk); local_bh_enable(); } void tcp_close(struct sock *sk, long timeout) { lock_sock(sk); __tcp_close(sk, timeout); release_sock(sk); sock_put(sk); } EXPORT_SYMBOL(tcp_close); /* These states need RST on ABORT according to RFC793 */ static inline bool tcp_need_reset(int state) { return (1 << state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_SYN_RECV); } static void tcp_rtx_queue_purge(struct sock *sk) { struct rb_node *p = rb_first(&sk->tcp_rtx_queue); tcp_sk(sk)->highest_sack = NULL; while (p) { struct sk_buff *skb = rb_to_skb(p); p = rb_next(p); /* Since we are deleting whole queue, no need to * list_del(&skb->tcp_tsorted_anchor) */ tcp_rtx_queue_unlink(skb, sk); tcp_wmem_free_skb(sk, skb); } } void tcp_write_queue_purge(struct sock *sk) { struct sk_buff *skb; tcp_chrono_stop(sk, TCP_CHRONO_BUSY); while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { tcp_skb_tsorted_anchor_cleanup(skb); tcp_wmem_free_skb(sk, skb); } tcp_rtx_queue_purge(sk); INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); tcp_clear_all_retrans_hints(tcp_sk(sk)); tcp_sk(sk)->packets_out = 0; inet_csk(sk)->icsk_backoff = 0; } int tcp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int old_state = sk->sk_state; u32 seq; /* Deny disconnect if other threads are blocked in sk_wait_event() * or inet_wait_for_connect(). */ if (sk->sk_wait_pending) return -EBUSY; if (old_state != TCP_CLOSE) tcp_set_state(sk, TCP_CLOSE); /* ABORT function of RFC793 */ if (old_state == TCP_LISTEN) { inet_csk_listen_stop(sk); } else if (unlikely(tp->repair)) { WRITE_ONCE(sk->sk_err, ECONNABORTED); } else if (tcp_need_reset(old_state) || (tp->snd_nxt != tp->write_seq && (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { /* The last check adjusts for discrepancy of Linux wrt. RFC * states */ tcp_send_active_reset(sk, gfp_any()); WRITE_ONCE(sk->sk_err, ECONNRESET); } else if (old_state == TCP_SYN_SENT) WRITE_ONCE(sk->sk_err, ECONNRESET); tcp_clear_xmit_timers(sk); __skb_queue_purge(&sk->sk_receive_queue); WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); WRITE_ONCE(tp->urg_data, 0); tcp_write_queue_purge(sk); tcp_fastopen_active_disable_ofo_check(sk); skb_rbtree_purge(&tp->out_of_order_queue); inet->inet_dport = 0; inet_bhash2_reset_saddr(sk); WRITE_ONCE(sk->sk_shutdown, 0); sock_reset_flag(sk, SOCK_DONE); tp->srtt_us = 0; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); tp->rcv_rtt_last_tsecr = 0; seq = tp->write_seq + tp->max_window + 2; if (!seq) seq = 1; WRITE_ONCE(tp->write_seq, seq); icsk->icsk_backoff = 0; icsk->icsk_probes_out = 0; icsk->icsk_probes_tstamp = 0; icsk->icsk_rto = TCP_TIMEOUT_INIT; icsk->icsk_rto_min = TCP_RTO_MIN; icsk->icsk_delack_max = TCP_DELACK_MAX; tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tcp_snd_cwnd_set(tp, TCP_INIT_CWND); tp->snd_cwnd_cnt = 0; tp->is_cwnd_limited = 0; tp->max_packets_out = 0; tp->window_clamp = 0; tp->delivered = 0; tp->delivered_ce = 0; if (icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); icsk->icsk_ca_initialized = 0; tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; tcp_clear_retrans(tp); tp->total_retrans = 0; inet_csk_delack_init(sk); /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0 * issue in __tcp_select_window() */ icsk->icsk_ack.rcv_mss = TCP_MIN_MSS; memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); __sk_dst_reset(sk); dst_release(xchg((__force struct dst_entry **)&sk->sk_rx_dst, NULL)); tcp_saved_syn_free(tp); tp->compressed_ack = 0; tp->segs_in = 0; tp->segs_out = 0; tp->bytes_sent = 0; tp->bytes_acked = 0; tp->bytes_received = 0; tp->bytes_retrans = 0; tp->data_segs_in = 0; tp->data_segs_out = 0; tp->duplicate_sack[0].start_seq = 0; tp->duplicate_sack[0].end_seq = 0; tp->dsack_dups = 0; tp->reord_seen = 0; tp->retrans_out = 0; tp->sacked_out = 0; tp->tlp_high_seq = 0; tp->last_oow_ack_time = 0; tp->plb_rehash = 0; /* There's a bubble in the pipe until at least the first ACK. */ tp->app_limited = ~0U; tp->rate_app_limited = 1; tp->rack.mstamp = 0; tp->rack.advanced = 0; tp->rack.reo_wnd_steps = 1; tp->rack.last_delivered = 0; tp->rack.reo_wnd_persist = 0; tp->rack.dsack_seen = 0; tp->syn_data_acked = 0; tp->rx_opt.saw_tstamp = 0; tp->rx_opt.dsack = 0; tp->rx_opt.num_sacks = 0; tp->rcv_ooopack = 0; /* Clean up fastopen related fields */ tcp_free_fastopen_req(tp); inet_clear_bit(DEFER_CONNECT, sk); tp->fastopen_client_fail = 0; WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); if (sk->sk_frag.page) { put_page(sk->sk_frag.page); sk->sk_frag.page = NULL; sk->sk_frag.offset = 0; } sk_error_report(sk); return 0; } EXPORT_SYMBOL(tcp_disconnect); static inline bool tcp_can_repair_sock(const struct sock *sk) { return sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) && (sk->sk_state != TCP_LISTEN); } static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len) { struct tcp_repair_window opt; if (!tp->repair) return -EPERM; if (len != sizeof(opt)) return -EINVAL; if (copy_from_sockptr(&opt, optbuf, sizeof(opt))) return -EFAULT; if (opt.max_window < opt.snd_wnd) return -EINVAL; if (after(opt.snd_wl1, tp->rcv_nxt + opt.rcv_wnd)) return -EINVAL; if (after(opt.rcv_wup, tp->rcv_nxt)) return -EINVAL; tp->snd_wl1 = opt.snd_wl1; tp->snd_wnd = opt.snd_wnd; tp->max_window = opt.max_window; tp->rcv_wnd = opt.rcv_wnd; tp->rcv_wup = opt.rcv_wup; return 0; } static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf, unsigned int len) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_repair_opt opt; size_t offset = 0; while (len >= sizeof(opt)) { if (copy_from_sockptr_offset(&opt, optbuf, offset, sizeof(opt))) return -EFAULT; offset += sizeof(opt); len -= sizeof(opt); switch (opt.opt_code) { case TCPOPT_MSS: tp->rx_opt.mss_clamp = opt.opt_val; tcp_mtup_init(sk); break; case TCPOPT_WINDOW: { u16 snd_wscale = opt.opt_val & 0xFFFF; u16 rcv_wscale = opt.opt_val >> 16; if (snd_wscale > TCP_MAX_WSCALE || rcv_wscale > TCP_MAX_WSCALE) return -EFBIG; tp->rx_opt.snd_wscale = snd_wscale; tp->rx_opt.rcv_wscale = rcv_wscale; tp->rx_opt.wscale_ok = 1; } break; case TCPOPT_SACK_PERM: if (opt.opt_val != 0) return -EINVAL; tp->rx_opt.sack_ok |= TCP_SACK_SEEN; break; case TCPOPT_TIMESTAMP: if (opt.opt_val != 0) return -EINVAL; tp->rx_opt.tstamp_ok = 1; break; } } return 0; } DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled); EXPORT_SYMBOL(tcp_tx_delay_enabled); static void tcp_enable_tx_delay(void) { if (!static_branch_unlikely(&tcp_tx_delay_enabled)) { static int __tcp_tx_delay_enabled = 0; if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) { static_branch_enable(&tcp_tx_delay_enabled); pr_info("TCP_TX_DELAY enabled\n"); } } } /* When set indicates to always queue non-full frames. Later the user clears * this option and we transmit any pending partial frames in the queue. This is * meant to be used alongside sendfile() to get properly filled frames when the * user (for example) must write out headers with a write() call first and then * use sendfile to send out the data parts. * * TCP_CORK can be set together with TCP_NODELAY and it is stronger than * TCP_NODELAY. */ void __tcp_sock_set_cork(struct sock *sk, bool on) { struct tcp_sock *tp = tcp_sk(sk); if (on) { tp->nonagle |= TCP_NAGLE_CORK; } else { tp->nonagle &= ~TCP_NAGLE_CORK; if (tp->nonagle & TCP_NAGLE_OFF) tp->nonagle |= TCP_NAGLE_PUSH; tcp_push_pending_frames(sk); } } void tcp_sock_set_cork(struct sock *sk, bool on) { lock_sock(sk); __tcp_sock_set_cork(sk, on); release_sock(sk); } EXPORT_SYMBOL(tcp_sock_set_cork); /* TCP_NODELAY is weaker than TCP_CORK, so that this option on corked socket is * remembered, but it is not activated until cork is cleared. * * However, when TCP_NODELAY is set we make an explicit push, which overrides * even TCP_CORK for currently queued segments. */ void __tcp_sock_set_nodelay(struct sock *sk, bool on) { if (on) { tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; tcp_push_pending_frames(sk); } else { tcp_sk(sk)->nonagle &= ~TCP_NAGLE_OFF; } } void tcp_sock_set_nodelay(struct sock *sk) { lock_sock(sk); __tcp_sock_set_nodelay(sk, true); release_sock(sk); } EXPORT_SYMBOL(tcp_sock_set_nodelay); static void __tcp_sock_set_quickack(struct sock *sk, int val) { if (!val) { inet_csk_enter_pingpong_mode(sk); return; } inet_csk_exit_pingpong_mode(sk); if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && inet_csk_ack_scheduled(sk)) { inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_PUSHED; tcp_cleanup_rbuf(sk, 1); if (!(val & 1)) inet_csk_enter_pingpong_mode(sk); } } void tcp_sock_set_quickack(struct sock *sk, int val) { lock_sock(sk); __tcp_sock_set_quickack(sk, val); release_sock(sk); } EXPORT_SYMBOL(tcp_sock_set_quickack); int tcp_sock_set_syncnt(struct sock *sk, int val) { if (val < 1 || val > MAX_TCP_SYNCNT) return -EINVAL; WRITE_ONCE(inet_csk(sk)->icsk_syn_retries, val); return 0; } EXPORT_SYMBOL(tcp_sock_set_syncnt); int tcp_sock_set_user_timeout(struct sock *sk, int val) { /* Cap the max time in ms TCP will retry or probe the window * before giving up and aborting (ETIMEDOUT) a connection. */ if (val < 0) return -EINVAL; WRITE_ONCE(inet_csk(sk)->icsk_user_timeout, val); return 0; } EXPORT_SYMBOL(tcp_sock_set_user_timeout); int tcp_sock_set_keepidle_locked(struct sock *sk, int val) { struct tcp_sock *tp = tcp_sk(sk); if (val < 1 || val > MAX_TCP_KEEPIDLE) return -EINVAL; /* Paired with WRITE_ONCE() in keepalive_time_when() */ WRITE_ONCE(tp->keepalive_time, val * HZ); if (sock_flag(sk, SOCK_KEEPOPEN) && !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { u32 elapsed = keepalive_time_elapsed(tp); if (tp->keepalive_time > elapsed) elapsed = tp->keepalive_time - elapsed; else elapsed = 0; inet_csk_reset_keepalive_timer(sk, elapsed); } return 0; } int tcp_sock_set_keepidle(struct sock *sk, int val) { int err; lock_sock(sk); err = tcp_sock_set_keepidle_locked(sk, val); release_sock(sk); return err; } EXPORT_SYMBOL(tcp_sock_set_keepidle); int tcp_sock_set_keepintvl(struct sock *sk, int val) { if (val < 1 || val > MAX_TCP_KEEPINTVL) return -EINVAL; WRITE_ONCE(tcp_sk(sk)->keepalive_intvl, val * HZ); return 0; } EXPORT_SYMBOL(tcp_sock_set_keepintvl); int tcp_sock_set_keepcnt(struct sock *sk, int val) { if (val < 1 || val > MAX_TCP_KEEPCNT) return -EINVAL; /* Paired with READ_ONCE() in keepalive_probes() */ WRITE_ONCE(tcp_sk(sk)->keepalive_probes, val); return 0; } EXPORT_SYMBOL(tcp_sock_set_keepcnt); int tcp_set_window_clamp(struct sock *sk, int val) { struct tcp_sock *tp = tcp_sk(sk); if (!val) { if (sk->sk_state != TCP_CLOSE) return -EINVAL; tp->window_clamp = 0; } else { tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ? SOCK_MIN_RCVBUF / 2 : val; tp->rcv_ssthresh = min(tp->rcv_wnd, tp->window_clamp); } return 0; } /* * Socket option code for TCP. */ int do_tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); struct net *net = sock_net(sk); int val; int err = 0; /* These are data/string values, all the others are ints */ switch (optname) { case TCP_CONGESTION: { char name[TCP_CA_NAME_MAX]; if (optlen < 1) return -EINVAL; val = strncpy_from_sockptr(name, optval, min_t(long, TCP_CA_NAME_MAX-1, optlen)); if (val < 0) return -EFAULT; name[val] = 0; sockopt_lock_sock(sk); err = tcp_set_congestion_control(sk, name, !has_current_bpf_ctx(), sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)); sockopt_release_sock(sk); return err; } case TCP_ULP: { char name[TCP_ULP_NAME_MAX]; if (optlen < 1) return -EINVAL; val = strncpy_from_sockptr(name, optval, min_t(long, TCP_ULP_NAME_MAX - 1, optlen)); if (val < 0) return -EFAULT; name[val] = 0; sockopt_lock_sock(sk); err = tcp_set_ulp(sk, name); sockopt_release_sock(sk); return err; } case TCP_FASTOPEN_KEY: { __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH]; __u8 *backup_key = NULL; /* Allow a backup key as well to facilitate key rotation * First key is the active one. */ if (optlen != TCP_FASTOPEN_KEY_LENGTH && optlen != TCP_FASTOPEN_KEY_BUF_LENGTH) return -EINVAL; if (copy_from_sockptr(key, optval, optlen)) return -EFAULT; if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH) backup_key = key + TCP_FASTOPEN_KEY_LENGTH; return tcp_fastopen_reset_cipher(net, sk, key, backup_key); } default: /* fallthru */ break; } if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; /* Handle options that can be set without locking the socket. */ switch (optname) { case TCP_SYNCNT: return tcp_sock_set_syncnt(sk, val); case TCP_USER_TIMEOUT: return tcp_sock_set_user_timeout(sk, val); case TCP_KEEPINTVL: return tcp_sock_set_keepintvl(sk, val); case TCP_KEEPCNT: return tcp_sock_set_keepcnt(sk, val); case TCP_LINGER2: if (val < 0) WRITE_ONCE(tp->linger2, -1); else if (val > TCP_FIN_TIMEOUT_MAX / HZ) WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX); else WRITE_ONCE(tp->linger2, val * HZ); return 0; case TCP_DEFER_ACCEPT: /* Translate value in seconds to number of retransmits */ WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept, secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ)); return 0; } sockopt_lock_sock(sk); switch (optname) { case TCP_MAXSEG: /* Values greater than interface MTU won't take effect. However * at the point when this call is done we typically don't yet * know which interface is going to be used */ if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) { err = -EINVAL; break; } tp->rx_opt.user_mss = val; break; case TCP_NODELAY: __tcp_sock_set_nodelay(sk, val); break; case TCP_THIN_LINEAR_TIMEOUTS: if (val < 0 || val > 1) err = -EINVAL; else tp->thin_lto = val; break; case TCP_THIN_DUPACK: if (val < 0 || val > 1) err = -EINVAL; break; case TCP_REPAIR: if (!tcp_can_repair_sock(sk)) err = -EPERM; else if (val == TCP_REPAIR_ON) { tp->repair = 1; sk->sk_reuse = SK_FORCE_REUSE; tp->repair_queue = TCP_NO_QUEUE; } else if (val == TCP_REPAIR_OFF) { tp->repair = 0; sk->sk_reuse = SK_NO_REUSE; tcp_send_window_probe(sk); } else if (val == TCP_REPAIR_OFF_NO_WP) { tp->repair = 0; sk->sk_reuse = SK_NO_REUSE; } else err = -EINVAL; break; case TCP_REPAIR_QUEUE: if (!tp->repair) err = -EPERM; else if ((unsigned int)val < TCP_QUEUES_NR) tp->repair_queue = val; else err = -EINVAL; break; case TCP_QUEUE_SEQ: if (sk->sk_state != TCP_CLOSE) { err = -EPERM; } else if (tp->repair_queue == TCP_SEND_QUEUE) { if (!tcp_rtx_queue_empty(sk)) err = -EPERM; else WRITE_ONCE(tp->write_seq, val); } else if (tp->repair_queue == TCP_RECV_QUEUE) { if (tp->rcv_nxt != tp->copied_seq) { err = -EPERM; } else { WRITE_ONCE(tp->rcv_nxt, val); WRITE_ONCE(tp->copied_seq, val); } } else { err = -EINVAL; } break; case TCP_REPAIR_OPTIONS: if (!tp->repair) err = -EINVAL; else if (sk->sk_state == TCP_ESTABLISHED && !tp->bytes_sent) err = tcp_repair_options_est(sk, optval, optlen); else err = -EPERM; break; case TCP_CORK: __tcp_sock_set_cork(sk, val); break; case TCP_KEEPIDLE: err = tcp_sock_set_keepidle_locked(sk, val); break; case TCP_SAVE_SYN: /* 0: disable, 1: enable, 2: start from ether_header */ if (val < 0 || val > 2) err = -EINVAL; else tp->save_syn = val; break; case TCP_WINDOW_CLAMP: err = tcp_set_window_clamp(sk, val); break; case TCP_QUICKACK: __tcp_sock_set_quickack(sk, val); break; #ifdef CONFIG_TCP_MD5SIG case TCP_MD5SIG: case TCP_MD5SIG_EXT: err = tp->af_specific->md5_parse(sk, optname, optval, optlen); break; #endif case TCP_FASTOPEN: if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { tcp_fastopen_init_key_once(net); fastopen_queue_tune(sk, val); } else { err = -EINVAL; } break; case TCP_FASTOPEN_CONNECT: if (val > 1 || val < 0) { err = -EINVAL; } else if (READ_ONCE(net->ipv4.sysctl_tcp_fastopen) & TFO_CLIENT_ENABLE) { if (sk->sk_state == TCP_CLOSE) tp->fastopen_connect = val; else err = -EINVAL; } else { err = -EOPNOTSUPP; } break; case TCP_FASTOPEN_NO_COOKIE: if (val > 1 || val < 0) err = -EINVAL; else if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) err = -EINVAL; else tp->fastopen_no_cookie = val; break; case TCP_TIMESTAMP: if (!tp->repair) err = -EPERM; else WRITE_ONCE(tp->tsoffset, val - tcp_time_stamp_raw()); break; case TCP_REPAIR_WINDOW: err = tcp_repair_set_window(tp, optval, optlen); break; case TCP_NOTSENT_LOWAT: WRITE_ONCE(tp->notsent_lowat, val); sk->sk_write_space(sk); break; case TCP_INQ: if (val > 1 || val < 0) err = -EINVAL; else tp->recvmsg_inq = val; break; case TCP_TX_DELAY: if (val) tcp_enable_tx_delay(); WRITE_ONCE(tp->tcp_tx_delay, val); break; default: err = -ENOPROTOOPT; break; } sockopt_release_sock(sk); return err; } int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { const struct inet_connection_sock *icsk = inet_csk(sk); if (level != SOL_TCP) /* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */ return READ_ONCE(icsk->icsk_af_ops)->setsockopt(sk, level, optname, optval, optlen); return do_tcp_setsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(tcp_setsockopt); static void tcp_get_info_chrono_stats(const struct tcp_sock *tp, struct tcp_info *info) { u64 stats[__TCP_CHRONO_MAX], total = 0; enum tcp_chrono i; for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) { stats[i] = tp->chrono_stat[i - 1]; if (i == tp->chrono_type) stats[i] += tcp_jiffies32 - tp->chrono_start; stats[i] *= USEC_PER_SEC / HZ; total += stats[i]; } info->tcpi_busy_time = total; info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED]; info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED]; } /* Return information about state of tcp endpoint in API format. */ void tcp_get_info(struct sock *sk, struct tcp_info *info) { const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); unsigned long rate; u32 now; u64 rate64; bool slow; memset(info, 0, sizeof(*info)); if (sk->sk_type != SOCK_STREAM) return; info->tcpi_state = inet_sk_state_load(sk); /* Report meaningful fields for all TCP states, including listeners */ rate = READ_ONCE(sk->sk_pacing_rate); rate64 = (rate != ~0UL) ? rate : ~0ULL; info->tcpi_pacing_rate = rate64; rate = READ_ONCE(sk->sk_max_pacing_rate); rate64 = (rate != ~0UL) ? rate : ~0ULL; info->tcpi_max_pacing_rate = rate64; info->tcpi_reordering = tp->reordering; info->tcpi_snd_cwnd = tcp_snd_cwnd(tp); if (info->tcpi_state == TCP_LISTEN) { /* listeners aliased fields : * tcpi_unacked -> Number of children ready for accept() * tcpi_sacked -> max backlog */ info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog); info->tcpi_sacked = READ_ONCE(sk->sk_max_ack_backlog); return; } slow = lock_sock_fast(sk); info->tcpi_ca_state = icsk->icsk_ca_state; info->tcpi_retransmits = icsk->icsk_retransmits; info->tcpi_probes = icsk->icsk_probes_out; info->tcpi_backoff = icsk->icsk_backoff; if (tp->rx_opt.tstamp_ok) info->tcpi_options |= TCPI_OPT_TIMESTAMPS; if (tcp_is_sack(tp)) info->tcpi_options |= TCPI_OPT_SACK; if (tp->rx_opt.wscale_ok) { info->tcpi_options |= TCPI_OPT_WSCALE; info->tcpi_snd_wscale = tp->rx_opt.snd_wscale; info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale; } if (tp->ecn_flags & TCP_ECN_OK) info->tcpi_options |= TCPI_OPT_ECN; if (tp->ecn_flags & TCP_ECN_SEEN) info->tcpi_options |= TCPI_OPT_ECN_SEEN; if (tp->syn_data_acked) info->tcpi_options |= TCPI_OPT_SYN_DATA; info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato); info->tcpi_snd_mss = tp->mss_cache; info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss; info->tcpi_unacked = tp->packets_out; info->tcpi_sacked = tp->sacked_out; info->tcpi_lost = tp->lost_out; info->tcpi_retrans = tp->retrans_out; now = tcp_jiffies32; info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); info->tcpi_pmtu = icsk->icsk_pmtu_cookie; info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; info->tcpi_rtt = tp->srtt_us >> 3; info->tcpi_rttvar = tp->mdev_us >> 2; info->tcpi_snd_ssthresh = tp->snd_ssthresh; info->tcpi_advmss = tp->advmss; info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> 3; info->tcpi_rcv_space = tp->rcvq_space.space; info->tcpi_total_retrans = tp->total_retrans; info->tcpi_bytes_acked = tp->bytes_acked; info->tcpi_bytes_received = tp->bytes_received; info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt); tcp_get_info_chrono_stats(tp, info); info->tcpi_segs_out = tp->segs_out; /* segs_in and data_segs_in can be updated from tcp_segs_in() from BH */ info->tcpi_segs_in = READ_ONCE(tp->segs_in); info->tcpi_data_segs_in = READ_ONCE(tp->data_segs_in); info->tcpi_min_rtt = tcp_min_rtt(tp); info->tcpi_data_segs_out = tp->data_segs_out; info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0; rate64 = tcp_compute_delivery_rate(tp); if (rate64) info->tcpi_delivery_rate = rate64; info->tcpi_delivered = tp->delivered; info->tcpi_delivered_ce = tp->delivered_ce; info->tcpi_bytes_sent = tp->bytes_sent; info->tcpi_bytes_retrans = tp->bytes_retrans; info->tcpi_dsack_dups = tp->dsack_dups; info->tcpi_reord_seen = tp->reord_seen; info->tcpi_rcv_ooopack = tp->rcv_ooopack; info->tcpi_snd_wnd = tp->snd_wnd; info->tcpi_rcv_wnd = tp->rcv_wnd; info->tcpi_rehash = tp->plb_rehash + tp->timeout_rehash; info->tcpi_fastopen_client_fail = tp->fastopen_client_fail; unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); static size_t tcp_opt_stats_get_size(void) { return nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BUSY */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_RWND_LIMITED */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_SNDBUF_LIMITED */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DATA_SEGS_OUT */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_TOTAL_RETRANS */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_PACING_RATE */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DELIVERY_RATE */ nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_CWND */ nla_total_size(sizeof(u32)) + /* TCP_NLA_REORDERING */ nla_total_size(sizeof(u32)) + /* TCP_NLA_MIN_RTT */ nla_total_size(sizeof(u8)) + /* TCP_NLA_RECUR_RETRANS */ nla_total_size(sizeof(u8)) + /* TCP_NLA_DELIVERY_RATE_APP_LMT */ nla_total_size(sizeof(u32)) + /* TCP_NLA_SNDQ_SIZE */ nla_total_size(sizeof(u8)) + /* TCP_NLA_CA_STATE */ nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_SSTHRESH */ nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */ nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */ nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */ nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */ nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */ nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */ nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */ nla_total_size(sizeof(u8)) + /* TCP_NLA_TTL */ nla_total_size(sizeof(u32)) + /* TCP_NLA_REHASH */ 0; } /* Returns TTL or hop limit of an incoming packet from skb. */ static u8 tcp_skb_ttl_or_hop_limit(const struct sk_buff *skb) { if (skb->protocol == htons(ETH_P_IP)) return ip_hdr(skb)->ttl; else if (skb->protocol == htons(ETH_P_IPV6)) return ipv6_hdr(skb)->hop_limit; else return 0; } struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, const struct sk_buff *orig_skb, const struct sk_buff *ack_skb) { const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *stats; struct tcp_info info; unsigned long rate; u64 rate64; stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC); if (!stats) return NULL; tcp_get_info_chrono_stats(tp, &info); nla_put_u64_64bit(stats, TCP_NLA_BUSY, info.tcpi_busy_time, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED, info.tcpi_rwnd_limited, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED, info.tcpi_sndbuf_limited, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT, tp->data_segs_out, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS, tp->total_retrans, TCP_NLA_PAD); rate = READ_ONCE(sk->sk_pacing_rate); rate64 = (rate != ~0UL) ? rate : ~0ULL; nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD); rate64 = tcp_compute_delivery_rate(tp); nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD); nla_put_u32(stats, TCP_NLA_SND_CWND, tcp_snd_cwnd(tp)); nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering); nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp)); nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits); nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited); nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh); nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered); nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce); nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una); nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state); nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans, TCP_NLA_PAD); nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups); nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen); nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3); nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash); nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT, max_t(int, 0, tp->write_seq - tp->snd_nxt)); nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns, TCP_NLA_PAD); if (ack_skb) nla_put_u8(stats, TCP_NLA_TTL, tcp_skb_ttl_or_hop_limit(ack_skb)); nla_put_u32(stats, TCP_NLA_REHASH, tp->plb_rehash + tp->timeout_rehash); return stats; } int do_tcp_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); int val, len; if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; len = min_t(unsigned int, len, sizeof(int)); if (len < 0) return -EINVAL; switch (optname) { case TCP_MAXSEG: val = tp->mss_cache; if (tp->rx_opt.user_mss && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) val = tp->rx_opt.user_mss; if (tp->repair) val = tp->rx_opt.mss_clamp; break; case TCP_NODELAY: val = !!(tp->nonagle&TCP_NAGLE_OFF); break; case TCP_CORK: val = !!(tp->nonagle&TCP_NAGLE_CORK); break; case TCP_KEEPIDLE: val = keepalive_time_when(tp) / HZ; break; case TCP_KEEPINTVL: val = keepalive_intvl_when(tp) / HZ; break; case TCP_KEEPCNT: val = keepalive_probes(tp); break; case TCP_SYNCNT: val = READ_ONCE(icsk->icsk_syn_retries) ? : READ_ONCE(net->ipv4.sysctl_tcp_syn_retries); break; case TCP_LINGER2: val = READ_ONCE(tp->linger2); if (val >= 0) val = (val ? : READ_ONCE(net->ipv4.sysctl_tcp_fin_timeout)) / HZ; break; case TCP_DEFER_ACCEPT: val = READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept); val = retrans_to_secs(val, TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ); break; case TCP_WINDOW_CLAMP: val = tp->window_clamp; break; case TCP_INFO: { struct tcp_info info; if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; tcp_get_info(sk, &info); len = min_t(unsigned int, len, sizeof(info)); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &info, len)) return -EFAULT; return 0; } case TCP_CC_INFO: { const struct tcp_congestion_ops *ca_ops; union tcp_cc_info info; size_t sz = 0; int attr; if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; ca_ops = icsk->icsk_ca_ops; if (ca_ops && ca_ops->get_info) sz = ca_ops->get_info(sk, ~0U, &attr, &info); len = min_t(unsigned int, len, sz); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &info, len)) return -EFAULT; return 0; } case TCP_QUICKACK: val = !inet_csk_in_pingpong_mode(sk); break; case TCP_CONGESTION: if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; len = min_t(unsigned int, len, TCP_CA_NAME_MAX); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, icsk->icsk_ca_ops->name, len)) return -EFAULT; return 0; case TCP_ULP: if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; len = min_t(unsigned int, len, TCP_ULP_NAME_MAX); if (!icsk->icsk_ulp_ops) { len = 0; if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; return 0; } if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, icsk->icsk_ulp_ops->name, len)) return -EFAULT; return 0; case TCP_FASTOPEN_KEY: { u64 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u64)]; unsigned int key_len; if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; key_len = tcp_fastopen_get_cipher(net, icsk, key) * TCP_FASTOPEN_KEY_LENGTH; len = min_t(unsigned int, len, key_len); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, key, len)) return -EFAULT; return 0; } case TCP_THIN_LINEAR_TIMEOUTS: val = tp->thin_lto; break; case TCP_THIN_DUPACK: val = 0; break; case TCP_REPAIR: val = tp->repair; break; case TCP_REPAIR_QUEUE: if (tp->repair) val = tp->repair_queue; else return -EINVAL; break; case TCP_REPAIR_WINDOW: { struct tcp_repair_window opt; if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len != sizeof(opt)) return -EINVAL; if (!tp->repair) return -EPERM; opt.snd_wl1 = tp->snd_wl1; opt.snd_wnd = tp->snd_wnd; opt.max_window = tp->max_window; opt.rcv_wnd = tp->rcv_wnd; opt.rcv_wup = tp->rcv_wup; if (copy_to_sockptr(optval, &opt, len)) return -EFAULT; return 0; } case TCP_QUEUE_SEQ: if (tp->repair_queue == TCP_SEND_QUEUE) val = tp->write_seq; else if (tp->repair_queue == TCP_RECV_QUEUE) val = tp->rcv_nxt; else return -EINVAL; break; case TCP_USER_TIMEOUT: val = READ_ONCE(icsk->icsk_user_timeout); break; case TCP_FASTOPEN: val = READ_ONCE(icsk->icsk_accept_queue.fastopenq.max_qlen); break; case TCP_FASTOPEN_CONNECT: val = tp->fastopen_connect; break; case TCP_FASTOPEN_NO_COOKIE: val = tp->fastopen_no_cookie; break; case TCP_TX_DELAY: val = READ_ONCE(tp->tcp_tx_delay); break; case TCP_TIMESTAMP: val = tcp_time_stamp_raw() + READ_ONCE(tp->tsoffset); break; case TCP_NOTSENT_LOWAT: val = READ_ONCE(tp->notsent_lowat); break; case TCP_INQ: val = tp->recvmsg_inq; break; case TCP_SAVE_SYN: val = tp->save_syn; break; case TCP_SAVED_SYN: { if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; sockopt_lock_sock(sk); if (tp->saved_syn) { if (len < tcp_saved_syn_len(tp->saved_syn)) { len = tcp_saved_syn_len(tp->saved_syn); if (copy_to_sockptr(optlen, &len, sizeof(int))) { sockopt_release_sock(sk); return -EFAULT; } sockopt_release_sock(sk); return -EINVAL; } len = tcp_saved_syn_len(tp->saved_syn); if (copy_to_sockptr(optlen, &len, sizeof(int))) { sockopt_release_sock(sk); return -EFAULT; } if (copy_to_sockptr(optval, tp->saved_syn->data, len)) { sockopt_release_sock(sk); return -EFAULT; } tcp_saved_syn_free(tp); sockopt_release_sock(sk); } else { sockopt_release_sock(sk); len = 0; if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; } return 0; } #ifdef CONFIG_MMU case TCP_ZEROCOPY_RECEIVE: { struct scm_timestamping_internal tss; struct tcp_zerocopy_receive zc = {}; int err; if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len < 0 || len < offsetofend(struct tcp_zerocopy_receive, length)) return -EINVAL; if (unlikely(len > sizeof(zc))) { err = check_zeroed_sockptr(optval, sizeof(zc), len - sizeof(zc)); if (err < 1) return err == 0 ? -EINVAL : err; len = sizeof(zc); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; } if (copy_from_sockptr(&zc, optval, len)) return -EFAULT; if (zc.reserved) return -EINVAL; if (zc.msg_flags & ~(TCP_VALID_ZC_MSG_FLAGS)) return -EINVAL; sockopt_lock_sock(sk); err = tcp_zerocopy_receive(sk, &zc, &tss); err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname, &zc, &len, err); sockopt_release_sock(sk); if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags)) goto zerocopy_rcv_cmsg; switch (len) { case offsetofend(struct tcp_zerocopy_receive, msg_flags): goto zerocopy_rcv_cmsg; case offsetofend(struct tcp_zerocopy_receive, msg_controllen): case offsetofend(struct tcp_zerocopy_receive, msg_control): case offsetofend(struct tcp_zerocopy_receive, flags): case offsetofend(struct tcp_zerocopy_receive, copybuf_len): case offsetofend(struct tcp_zerocopy_receive, copybuf_address): case offsetofend(struct tcp_zerocopy_receive, err): goto zerocopy_rcv_sk_err; case offsetofend(struct tcp_zerocopy_receive, inq): goto zerocopy_rcv_inq; case offsetofend(struct tcp_zerocopy_receive, length): default: goto zerocopy_rcv_out; } zerocopy_rcv_cmsg: if (zc.msg_flags & TCP_CMSG_TS) tcp_zc_finalize_rx_tstamp(sk, &zc, &tss); else zc.msg_flags = 0; zerocopy_rcv_sk_err: if (!err) zc.err = sock_error(sk); zerocopy_rcv_inq: zc.inq = tcp_inq_hint(sk); zerocopy_rcv_out: if (!err && copy_to_sockptr(optval, &zc, len)) err = -EFAULT; return err; } #endif default: return -ENOPROTOOPT; } if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &val, len)) return -EFAULT; return 0; } bool tcp_bpf_bypass_getsockopt(int level, int optname) { /* TCP do_tcp_getsockopt has optimized getsockopt implementation * to avoid extra socket lock for TCP_ZEROCOPY_RECEIVE. */ if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE) return true; return false; } EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt); int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { struct inet_connection_sock *icsk = inet_csk(sk); if (level != SOL_TCP) /* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */ return READ_ONCE(icsk->icsk_af_ops)->getsockopt(sk, level, optname, optval, optlen); return do_tcp_getsockopt(sk, level, optname, USER_SOCKPTR(optval), USER_SOCKPTR(optlen)); } EXPORT_SYMBOL(tcp_getsockopt); #ifdef CONFIG_TCP_MD5SIG static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool); static DEFINE_MUTEX(tcp_md5sig_mutex); static bool tcp_md5sig_pool_populated = false; static void __tcp_alloc_md5sig_pool(void) { struct crypto_ahash *hash; int cpu; hash = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(hash)) return; for_each_possible_cpu(cpu) { void *scratch = per_cpu(tcp_md5sig_pool, cpu).scratch; struct ahash_request *req; if (!scratch) { scratch = kmalloc_node(sizeof(union tcp_md5sum_block) + sizeof(struct tcphdr), GFP_KERNEL, cpu_to_node(cpu)); if (!scratch) return; per_cpu(tcp_md5sig_pool, cpu).scratch = scratch; } if (per_cpu(tcp_md5sig_pool, cpu).md5_req) continue; req = ahash_request_alloc(hash, GFP_KERNEL); if (!req) return; ahash_request_set_callback(req, 0, NULL, NULL); per_cpu(tcp_md5sig_pool, cpu).md5_req = req; } /* before setting tcp_md5sig_pool_populated, we must commit all writes * to memory. See smp_rmb() in tcp_get_md5sig_pool() */ smp_wmb(); /* Paired with READ_ONCE() from tcp_alloc_md5sig_pool() * and tcp_get_md5sig_pool(). */ WRITE_ONCE(tcp_md5sig_pool_populated, true); } bool tcp_alloc_md5sig_pool(void) { /* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */ if (unlikely(!READ_ONCE(tcp_md5sig_pool_populated))) { mutex_lock(&tcp_md5sig_mutex); if (!tcp_md5sig_pool_populated) __tcp_alloc_md5sig_pool(); mutex_unlock(&tcp_md5sig_mutex); } /* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */ return READ_ONCE(tcp_md5sig_pool_populated); } EXPORT_SYMBOL(tcp_alloc_md5sig_pool); /** * tcp_get_md5sig_pool - get md5sig_pool for this user * * We use percpu structure, so if we succeed, we exit with preemption * and BH disabled, to make sure another thread or softirq handling * wont try to get same context. */ struct tcp_md5sig_pool *tcp_get_md5sig_pool(void) { local_bh_disable(); /* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */ if (READ_ONCE(tcp_md5sig_pool_populated)) { /* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */ smp_rmb(); return this_cpu_ptr(&tcp_md5sig_pool); } local_bh_enable(); return NULL; } EXPORT_SYMBOL(tcp_get_md5sig_pool); int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, const struct sk_buff *skb, unsigned int header_len) { struct scatterlist sg; const struct tcphdr *tp = tcp_hdr(skb); struct ahash_request *req = hp->md5_req; unsigned int i; const unsigned int head_data_len = skb_headlen(skb) > header_len ? skb_headlen(skb) - header_len : 0; const struct skb_shared_info *shi = skb_shinfo(skb); struct sk_buff *frag_iter; sg_init_table(&sg, 1); sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len); ahash_request_set_crypt(req, &sg, NULL, head_data_len); if (crypto_ahash_update(req)) return 1; for (i = 0; i < shi->nr_frags; ++i) { const skb_frag_t *f = &shi->frags[i]; unsigned int offset = skb_frag_off(f); struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT); sg_set_page(&sg, page, skb_frag_size(f), offset_in_page(offset)); ahash_request_set_crypt(req, &sg, NULL, skb_frag_size(f)); if (crypto_ahash_update(req)) return 1; } skb_walk_frags(skb, frag_iter) if (tcp_md5_hash_skb_data(hp, frag_iter, 0)) return 1; return 0; } EXPORT_SYMBOL(tcp_md5_hash_skb_data); int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key) { u8 keylen = READ_ONCE(key->keylen); /* paired with WRITE_ONCE() in tcp_md5_do_add */ struct scatterlist sg; sg_init_one(&sg, key->key, keylen); ahash_request_set_crypt(hp->md5_req, &sg, NULL, keylen); /* We use data_race() because tcp_md5_do_add() might change key->key under us */ return data_race(crypto_ahash_update(hp->md5_req)); } EXPORT_SYMBOL(tcp_md5_hash_key); /* Called with rcu_read_lock() */ enum skb_drop_reason tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, const void *saddr, const void *daddr, int family, int dif, int sdif) { /* * This gets called for each TCP segment that arrives * so we want to be efficient. * We have 3 drop cases: * o No MD5 hash and one expected. * o MD5 hash and we're not expecting one. * o MD5 hash and its wrong. */ const __u8 *hash_location = NULL; struct tcp_md5sig_key *hash_expected; const struct tcphdr *th = tcp_hdr(skb); const struct tcp_sock *tp = tcp_sk(sk); int genhash, l3index; u8 newhash[16]; /* sdif set, means packet ingressed via a device * in an L3 domain and dif is set to the l3mdev */ l3index = sdif ? dif : 0; hash_expected = tcp_md5_do_lookup(sk, l3index, saddr, family); hash_location = tcp_parse_md5sig_option(th); /* We've parsed the options - do we have a hash? */ if (!hash_expected && !hash_location) return SKB_NOT_DROPPED_YET; if (hash_expected && !hash_location) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); return SKB_DROP_REASON_TCP_MD5NOTFOUND; } if (!hash_expected && hash_location) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); return SKB_DROP_REASON_TCP_MD5UNEXPECTED; } /* Check the signature. * To support dual stack listeners, we need to handle * IPv4-mapped case. */ if (family == AF_INET) genhash = tcp_v4_md5_hash_skb(newhash, hash_expected, NULL, skb); else genhash = tp->af_specific->calc_md5_hash(newhash, hash_expected, NULL, skb); if (genhash || memcmp(hash_location, newhash, 16) != 0) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); if (family == AF_INET) { net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n", saddr, ntohs(th->source), daddr, ntohs(th->dest), genhash ? " tcp_v4_calc_md5_hash failed" : "", l3index); } else { net_info_ratelimited("MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u L3 index %d\n", genhash ? "failed" : "mismatch", saddr, ntohs(th->source), daddr, ntohs(th->dest), l3index); } return SKB_DROP_REASON_TCP_MD5FAILURE; } return SKB_NOT_DROPPED_YET; } EXPORT_SYMBOL(tcp_inbound_md5_hash); #endif void tcp_done(struct sock *sk) { struct request_sock *req; /* We might be called with a new socket, after * inet_csk_prepare_forced_close() has been called * so we can not use lockdep_sock_is_held(sk) */ req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 1); if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS); tcp_set_state(sk, TCP_CLOSE); tcp_clear_xmit_timers(sk); if (req) reqsk_fastopen_remove(sk, req, false); WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_state_change(sk); else inet_csk_destroy_sock(sk); } EXPORT_SYMBOL_GPL(tcp_done); int tcp_abort(struct sock *sk, int err) { int state = inet_sk_state_load(sk); if (state == TCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); local_bh_disable(); inet_csk_reqsk_queue_drop(req->rsk_listener, req); local_bh_enable(); return 0; } if (state == TCP_TIME_WAIT) { struct inet_timewait_sock *tw = inet_twsk(sk); refcount_inc(&tw->tw_refcnt); local_bh_disable(); inet_twsk_deschedule_put(tw); local_bh_enable(); return 0; } /* BPF context ensures sock locking. */ if (!has_current_bpf_ctx()) /* Don't race with userspace socket closes such as tcp_close. */ lock_sock(sk); if (sk->sk_state == TCP_LISTEN) { tcp_set_state(sk, TCP_CLOSE); inet_csk_listen_stop(sk); } /* Don't race with BH socket closes such as inet_csk_listen_stop. */ local_bh_disable(); bh_lock_sock(sk); if (!sock_flag(sk, SOCK_DEAD)) { WRITE_ONCE(sk->sk_err, err); /* This barrier is coupled with smp_rmb() in tcp_poll() */ smp_wmb(); sk_error_report(sk); if (tcp_need_reset(sk->sk_state)) tcp_send_active_reset(sk, GFP_ATOMIC); tcp_done(sk); } bh_unlock_sock(sk); local_bh_enable(); tcp_write_queue_purge(sk); if (!has_current_bpf_ctx()) release_sock(sk); return 0; } EXPORT_SYMBOL_GPL(tcp_abort); extern struct tcp_congestion_ops tcp_reno; static __initdata unsigned long thash_entries; static int __init set_thash_entries(char *str) { ssize_t ret; if (!str) return 0; ret = kstrtoul(str, 0, &thash_entries); if (ret) return 0; return 1; } __setup("thash_entries=", set_thash_entries); static void __init tcp_init_mem(void) { unsigned long limit = nr_free_buffer_pages() / 16; limit = max(limit, 128UL); sysctl_tcp_mem[0] = limit / 4 * 3; /* 4.68 % */ sysctl_tcp_mem[1] = limit; /* 6.25 % */ sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; /* 9.37 % */ } void __init tcp_init(void) { int max_rshare, max_wshare, cnt; unsigned long limit; unsigned int i; BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE); BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof_field(struct sk_buff, cb)); percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE); mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD); inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash", thash_entries, 21, /* one slot per 2 MB*/ 0, 64 * 1024); tcp_hashinfo.bind_bucket_cachep = kmem_cache_create("tcp_bind_bucket", sizeof(struct inet_bind_bucket), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL); tcp_hashinfo.bind2_bucket_cachep = kmem_cache_create("tcp_bind2_bucket", sizeof(struct inet_bind2_bucket), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL); /* Size and allocate the main established and bind bucket * hash tables. * * The methodology is similar to that of the buffer cache. */ tcp_hashinfo.ehash = alloc_large_system_hash("TCP established", sizeof(struct inet_ehash_bucket), thash_entries, 17, /* one slot per 128 KB of memory */ 0, NULL, &tcp_hashinfo.ehash_mask, 0, thash_entries ? 0 : 512 * 1024); for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i); if (inet_ehash_locks_alloc(&tcp_hashinfo)) panic("TCP: failed to alloc ehash_locks"); tcp_hashinfo.bhash = alloc_large_system_hash("TCP bind", 2 * sizeof(struct inet_bind_hashbucket), tcp_hashinfo.ehash_mask + 1, 17, /* one slot per 128 KB of memory */ 0, &tcp_hashinfo.bhash_size, NULL, 0, 64 * 1024); tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size; tcp_hashinfo.bhash2 = tcp_hashinfo.bhash + tcp_hashinfo.bhash_size; for (i = 0; i < tcp_hashinfo.bhash_size; i++) { spin_lock_init(&tcp_hashinfo.bhash[i].lock); INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); spin_lock_init(&tcp_hashinfo.bhash2[i].lock); INIT_HLIST_HEAD(&tcp_hashinfo.bhash2[i].chain); } tcp_hashinfo.pernet = false; cnt = tcp_hashinfo.ehash_mask + 1; sysctl_tcp_max_orphans = cnt / 2; tcp_init_mem(); /* Set per-socket limits to no more than 1/128 the pressure threshold */ limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); max_wshare = min(4UL*1024*1024, limit); max_rshare = min(6UL*1024*1024, limit); init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE; init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare); init_net.ipv4.sysctl_tcp_rmem[0] = PAGE_SIZE; init_net.ipv4.sysctl_tcp_rmem[1] = 131072; init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare); pr_info("Hash tables configured (established %u bind %u)\n", tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); tcp_v4_init(); tcp_metrics_init(); BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0); tcp_tasklet_init(); mptcp_init(); } |
| 4 82 205 295 43 296 36 291 225 44 3 3 1 4 72 9 93 93 93 91 173 193 184 104 16 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NET_NEIGHBOUR_H #define _NET_NEIGHBOUR_H #include <linux/neighbour.h> /* * Generic neighbour manipulation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * * Changes: * * Harald Welte: <laforge@gnumonks.org> * - Add neighbour cache statistics like rtstat */ #include <linux/atomic.h> #include <linux/refcount.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/rcupdate.h> #include <linux/seq_file.h> #include <linux/bitmap.h> #include <linux/err.h> #include <linux/sysctl.h> #include <linux/workqueue.h> #include <net/rtnetlink.h> /* * NUD stands for "neighbor unreachability detection" */ #define NUD_IN_TIMER (NUD_INCOMPLETE|NUD_REACHABLE|NUD_DELAY|NUD_PROBE) #define NUD_VALID (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE|NUD_PROBE|NUD_STALE|NUD_DELAY) #define NUD_CONNECTED (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE) struct neighbour; enum { NEIGH_VAR_MCAST_PROBES, NEIGH_VAR_UCAST_PROBES, NEIGH_VAR_APP_PROBES, NEIGH_VAR_MCAST_REPROBES, NEIGH_VAR_RETRANS_TIME, NEIGH_VAR_BASE_REACHABLE_TIME, NEIGH_VAR_DELAY_PROBE_TIME, NEIGH_VAR_INTERVAL_PROBE_TIME_MS, NEIGH_VAR_GC_STALETIME, NEIGH_VAR_QUEUE_LEN_BYTES, NEIGH_VAR_PROXY_QLEN, NEIGH_VAR_ANYCAST_DELAY, NEIGH_VAR_PROXY_DELAY, NEIGH_VAR_LOCKTIME, #define NEIGH_VAR_DATA_MAX (NEIGH_VAR_LOCKTIME + 1) /* Following are used as a second way to access one of the above */ NEIGH_VAR_QUEUE_LEN, /* same data as NEIGH_VAR_QUEUE_LEN_BYTES */ NEIGH_VAR_RETRANS_TIME_MS, /* same data as NEIGH_VAR_RETRANS_TIME */ NEIGH_VAR_BASE_REACHABLE_TIME_MS, /* same data as NEIGH_VAR_BASE_REACHABLE_TIME */ /* Following are used by "default" only */ NEIGH_VAR_GC_INTERVAL, NEIGH_VAR_GC_THRESH1, NEIGH_VAR_GC_THRESH2, NEIGH_VAR_GC_THRESH3, NEIGH_VAR_MAX }; struct neigh_parms { possible_net_t net; struct net_device *dev; netdevice_tracker dev_tracker; struct list_head list; int (*neigh_setup)(struct neighbour *); struct neigh_table *tbl; void *sysctl_table; int dead; refcount_t refcnt; struct rcu_head rcu_head; int reachable_time; u32 qlen; int data[NEIGH_VAR_DATA_MAX]; DECLARE_BITMAP(data_state, NEIGH_VAR_DATA_MAX); }; static inline void neigh_var_set(struct neigh_parms *p, int index, int val) { set_bit(index, p->data_state); p->data[index] = val; } #define NEIGH_VAR(p, attr) ((p)->data[NEIGH_VAR_ ## attr]) /* In ndo_neigh_setup, NEIGH_VAR_INIT should be used. * In other cases, NEIGH_VAR_SET should be used. */ #define NEIGH_VAR_INIT(p, attr, val) (NEIGH_VAR(p, attr) = val) #define NEIGH_VAR_SET(p, attr, val) neigh_var_set(p, NEIGH_VAR_ ## attr, val) static inline void neigh_parms_data_state_setall(struct neigh_parms *p) { bitmap_fill(p->data_state, NEIGH_VAR_DATA_MAX); } static inline void neigh_parms_data_state_cleanall(struct neigh_parms *p) { bitmap_zero(p->data_state, NEIGH_VAR_DATA_MAX); } struct neigh_statistics { unsigned long allocs; /* number of allocated neighs */ unsigned long destroys; /* number of destroyed neighs */ unsigned long hash_grows; /* number of hash resizes */ unsigned long res_failed; /* number of failed resolutions */ unsigned long lookups; /* number of lookups */ unsigned long hits; /* number of hits (among lookups) */ unsigned long rcv_probes_mcast; /* number of received mcast ipv6 */ unsigned long rcv_probes_ucast; /* number of received ucast ipv6 */ unsigned long periodic_gc_runs; /* number of periodic GC runs */ unsigned long forced_gc_runs; /* number of forced GC runs */ unsigned long unres_discards; /* number of unresolved drops */ unsigned long table_fulls; /* times even gc couldn't help */ }; #define NEIGH_CACHE_STAT_INC(tbl, field) this_cpu_inc((tbl)->stats->field) struct neighbour { struct neighbour __rcu *next; struct neigh_table *tbl; struct neigh_parms *parms; unsigned long confirmed; unsigned long updated; rwlock_t lock; refcount_t refcnt; unsigned int arp_queue_len_bytes; struct sk_buff_head arp_queue; struct timer_list timer; unsigned long used; atomic_t probes; u8 nud_state; u8 type; u8 dead; u8 protocol; u32 flags; seqlock_t ha_lock; unsigned char ha[ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))] __aligned(8); struct hh_cache hh; int (*output)(struct neighbour *, struct sk_buff *); const struct neigh_ops *ops; struct list_head gc_list; struct list_head managed_list; struct rcu_head rcu; struct net_device *dev; netdevice_tracker dev_tracker; u8 primary_key[0]; } __randomize_layout; struct neigh_ops { int family; void (*solicit)(struct neighbour *, struct sk_buff *); void (*error_report)(struct neighbour *, struct sk_buff *); int (*output)(struct neighbour *, struct sk_buff *); int (*connected_output)(struct neighbour *, struct sk_buff *); }; struct pneigh_entry { struct pneigh_entry *next; possible_net_t net; struct net_device *dev; netdevice_tracker dev_tracker; u32 flags; u8 protocol; u32 key[]; }; /* * neighbour table manipulation */ #define NEIGH_NUM_HASH_RND 4 struct neigh_hash_table { struct neighbour __rcu **hash_buckets; unsigned int hash_shift; __u32 hash_rnd[NEIGH_NUM_HASH_RND]; struct rcu_head rcu; }; struct neigh_table { int family; unsigned int entry_size; unsigned int key_len; __be16 protocol; __u32 (*hash)(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); bool (*key_eq)(const struct neighbour *, const void *pkey); int (*constructor)(struct neighbour *); int (*pconstructor)(struct pneigh_entry *); void (*pdestructor)(struct pneigh_entry *); void (*proxy_redo)(struct sk_buff *skb); int (*is_multicast)(const void *pkey); bool (*allow_add)(const struct net_device *dev, struct netlink_ext_ack *extack); char *id; struct neigh_parms parms; struct list_head parms_list; int gc_interval; int gc_thresh1; int gc_thresh2; int gc_thresh3; unsigned long last_flush; struct delayed_work gc_work; struct delayed_work managed_work; struct timer_list proxy_timer; struct sk_buff_head proxy_queue; atomic_t entries; atomic_t gc_entries; struct list_head gc_list; struct list_head managed_list; rwlock_t lock; unsigned long last_rand; struct neigh_statistics __percpu *stats; struct neigh_hash_table __rcu *nht; struct pneigh_entry **phash_buckets; }; enum { NEIGH_ARP_TABLE = 0, NEIGH_ND_TABLE = 1, NEIGH_DN_TABLE = 2, NEIGH_NR_TABLES, NEIGH_LINK_TABLE = NEIGH_NR_TABLES /* Pseudo table for neigh_xmit */ }; static inline int neigh_parms_family(struct neigh_parms *p) { return p->tbl->family; } #define NEIGH_PRIV_ALIGN sizeof(long long) #define NEIGH_ENTRY_SIZE(size) ALIGN((size), NEIGH_PRIV_ALIGN) static inline void *neighbour_priv(const struct neighbour *n) { return (char *)n + n->tbl->entry_size; } /* flags for neigh_update() */ #define NEIGH_UPDATE_F_OVERRIDE BIT(0) #define NEIGH_UPDATE_F_WEAK_OVERRIDE BIT(1) #define NEIGH_UPDATE_F_OVERRIDE_ISROUTER BIT(2) #define NEIGH_UPDATE_F_USE BIT(3) #define NEIGH_UPDATE_F_MANAGED BIT(4) #define NEIGH_UPDATE_F_EXT_LEARNED BIT(5) #define NEIGH_UPDATE_F_ISROUTER BIT(6) #define NEIGH_UPDATE_F_ADMIN BIT(7) /* In-kernel representation for NDA_FLAGS_EXT flags: */ #define NTF_OLD_MASK 0xff #define NTF_EXT_SHIFT 8 #define NTF_EXT_MASK (NTF_EXT_MANAGED) #define NTF_MANAGED (NTF_EXT_MANAGED << NTF_EXT_SHIFT) extern const struct nla_policy nda_policy[]; static inline bool neigh_key_eq32(const struct neighbour *n, const void *pkey) { return *(const u32 *)n->primary_key == *(const u32 *)pkey; } static inline bool neigh_key_eq128(const struct neighbour *n, const void *pkey) { const u32 *n32 = (const u32 *)n->primary_key; const u32 *p32 = pkey; return ((n32[0] ^ p32[0]) | (n32[1] ^ p32[1]) | (n32[2] ^ p32[2]) | (n32[3] ^ p32[3])) == 0; } static inline struct neighbour *___neigh_lookup_noref( struct neigh_table *tbl, bool (*key_eq)(const struct neighbour *n, const void *pkey), __u32 (*hash)(const void *pkey, const struct net_device *dev, __u32 *hash_rnd), const void *pkey, struct net_device *dev) { struct neigh_hash_table *nht = rcu_dereference(tbl->nht); struct neighbour *n; u32 hash_val; hash_val = hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift); for (n = rcu_dereference(nht->hash_buckets[hash_val]); n != NULL; n = rcu_dereference(n->next)) { if (n->dev == dev && key_eq(n, pkey)) return n; } return NULL; } static inline struct neighbour *__neigh_lookup_noref(struct neigh_table *tbl, const void *pkey, struct net_device *dev) { return ___neigh_lookup_noref(tbl, tbl->key_eq, tbl->hash, pkey, dev); } static inline void neigh_confirm(struct neighbour *n) { if (n) { unsigned long now = jiffies; /* avoid dirtying neighbour */ if (READ_ONCE(n->confirmed) != now) WRITE_ONCE(n->confirmed, now); } } void neigh_table_init(int index, struct neigh_table *tbl); int neigh_table_clear(int index, struct neigh_table *tbl); struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev); struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, bool want_ref); static inline struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev) { return __neigh_create(tbl, pkey, dev, true); } void neigh_destroy(struct neighbour *neigh); int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, const bool immediate_ok); int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid); void __neigh_set_probe_once(struct neighbour *neigh); bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl); void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev); int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev); int neigh_carrier_down(struct neigh_table *tbl, struct net_device *dev); int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb); int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb); int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb); struct neighbour *neigh_event_ns(struct neigh_table *tbl, u8 *lladdr, void *saddr, struct net_device *dev); struct neigh_parms *neigh_parms_alloc(struct net_device *dev, struct neigh_table *tbl); void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms); static inline struct net *neigh_parms_net(const struct neigh_parms *parms) { return read_pnet(&parms->net); } unsigned long neigh_rand_reach_time(unsigned long base); void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb); struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev, int creat); struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); static inline struct net *pneigh_net(const struct pneigh_entry *pneigh) { return read_pnet(&pneigh->net); } void neigh_app_ns(struct neighbour *n); void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie); void __neigh_for_each_release(struct neigh_table *tbl, int (*cb)(struct neighbour *)); int neigh_xmit(int fam, struct net_device *, const void *, struct sk_buff *); struct neigh_seq_state { struct seq_net_private p; struct neigh_table *tbl; struct neigh_hash_table *nht; void *(*neigh_sub_iter)(struct neigh_seq_state *state, struct neighbour *n, loff_t *pos); unsigned int bucket; unsigned int flags; #define NEIGH_SEQ_NEIGH_ONLY 0x00000001 #define NEIGH_SEQ_IS_PNEIGH 0x00000002 #define NEIGH_SEQ_SKIP_NOARP 0x00000004 }; void *neigh_seq_start(struct seq_file *, loff_t *, struct neigh_table *, unsigned int); void *neigh_seq_next(struct seq_file *, void *, loff_t *); void neigh_seq_stop(struct seq_file *, void *); int neigh_proc_dointvec(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, proc_handler *proc_handler); void neigh_sysctl_unregister(struct neigh_parms *p); static inline void __neigh_parms_put(struct neigh_parms *parms) { refcount_dec(&parms->refcnt); } static inline struct neigh_parms *neigh_parms_clone(struct neigh_parms *parms) { refcount_inc(&parms->refcnt); return parms; } /* * Neighbour references */ static inline void neigh_release(struct neighbour *neigh) { if (refcount_dec_and_test(&neigh->refcnt)) neigh_destroy(neigh); } static inline struct neighbour * neigh_clone(struct neighbour *neigh) { if (neigh) refcount_inc(&neigh->refcnt); return neigh; } #define neigh_hold(n) refcount_inc(&(n)->refcnt) static __always_inline int neigh_event_send_probe(struct neighbour *neigh, struct sk_buff *skb, const bool immediate_ok) { unsigned long now = jiffies; if (READ_ONCE(neigh->used) != now) WRITE_ONCE(neigh->used, now); if (!(READ_ONCE(neigh->nud_state) & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))) return __neigh_event_send(neigh, skb, immediate_ok); return 0; } static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) { return neigh_event_send_probe(neigh, skb, true); } #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb) { unsigned int seq, hh_alen; do { seq = read_seqbegin(&hh->hh_lock); hh_alen = HH_DATA_ALIGN(ETH_HLEN); memcpy(skb->data - hh_alen, hh->hh_data, ETH_ALEN + hh_alen - ETH_HLEN); } while (read_seqretry(&hh->hh_lock, seq)); return 0; } #endif static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb) { unsigned int hh_alen = 0; unsigned int seq; unsigned int hh_len; do { seq = read_seqbegin(&hh->hh_lock); hh_len = READ_ONCE(hh->hh_len); if (likely(hh_len <= HH_DATA_MOD)) { hh_alen = HH_DATA_MOD; /* skb_push() would proceed silently if we have room for * the unaligned size but not for the aligned size: * check headroom explicitly. */ if (likely(skb_headroom(skb) >= HH_DATA_MOD)) { /* this is inlined by gcc */ memcpy(skb->data - HH_DATA_MOD, hh->hh_data, HH_DATA_MOD); } } else { hh_alen = HH_DATA_ALIGN(hh_len); if (likely(skb_headroom(skb) >= hh_alen)) { memcpy(skb->data - hh_alen, hh->hh_data, hh_alen); } } } while (read_seqretry(&hh->hh_lock, seq)); if (WARN_ON_ONCE(skb_headroom(skb) < hh_alen)) { kfree_skb(skb); return NET_XMIT_DROP; } __skb_push(skb, hh_len); return dev_queue_xmit(skb); } static inline int neigh_output(struct neighbour *n, struct sk_buff *skb, bool skip_cache) { const struct hh_cache *hh = &n->hh; /* n->nud_state and hh->hh_len could be changed under us. * neigh_hh_output() is taking care of the race later. */ if (!skip_cache && (READ_ONCE(n->nud_state) & NUD_CONNECTED) && READ_ONCE(hh->hh_len)) return neigh_hh_output(hh, skb); return n->output(n, skb); } static inline struct neighbour * __neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev, int creat) { struct neighbour *n = neigh_lookup(tbl, pkey, dev); if (n || !creat) return n; n = neigh_create(tbl, pkey, dev); return IS_ERR(n) ? NULL : n; } static inline struct neighbour * __neigh_lookup_errno(struct neigh_table *tbl, const void *pkey, struct net_device *dev) { struct neighbour *n = neigh_lookup(tbl, pkey, dev); if (n) return n; return neigh_create(tbl, pkey, dev); } struct neighbour_cb { unsigned long sched_next; unsigned int flags; }; #define LOCALLY_ENQUEUED 0x1 #define NEIGH_CB(skb) ((struct neighbour_cb *)(skb)->cb) static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n, const struct net_device *dev) { unsigned int seq; do { seq = read_seqbegin(&n->ha_lock); memcpy(dst, n->ha, dev->addr_len); } while (read_seqretry(&n->ha_lock, seq)); } static inline void neigh_update_is_router(struct neighbour *neigh, u32 flags, int *notify) { u8 ndm_flags = 0; ndm_flags |= (flags & NEIGH_UPDATE_F_ISROUTER) ? NTF_ROUTER : 0; if ((neigh->flags ^ ndm_flags) & NTF_ROUTER) { if (ndm_flags & NTF_ROUTER) neigh->flags |= NTF_ROUTER; else neigh->flags &= ~NTF_ROUTER; *notify = 1; } } #endif |
| 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Multipath TCP * * Copyright (c) 2017 - 2019, Intel Corporation. */ #ifndef __NET_MPTCP_H #define __NET_MPTCP_H #include <linux/skbuff.h> #include <linux/tcp.h> #include <linux/types.h> struct mptcp_info; struct mptcp_sock; struct seq_file; /* MPTCP sk_buff extension data */ struct mptcp_ext { union { u64 data_ack; u32 data_ack32; }; u64 data_seq; u32 subflow_seq; u16 data_len; __sum16 csum; u8 use_map:1, dsn64:1, data_fin:1, use_ack:1, ack64:1, mpc_map:1, frozen:1, reset_transient:1; u8 reset_reason:4, csum_reqd:1, infinite_map:1; }; #define MPTCPOPT_HMAC_LEN 20 #define MPTCP_RM_IDS_MAX 8 struct mptcp_rm_list { u8 ids[MPTCP_RM_IDS_MAX]; u8 nr; }; struct mptcp_addr_info { u8 id; sa_family_t family; __be16 port; union { struct in_addr addr; #if IS_ENABLED(CONFIG_MPTCP_IPV6) struct in6_addr addr6; #endif }; }; struct mptcp_out_options { #if IS_ENABLED(CONFIG_MPTCP) u16 suboptions; struct mptcp_rm_list rm_list; u8 join_id; u8 backup; u8 reset_reason:4, reset_transient:1, csum_reqd:1, allow_join_id0:1; union { struct { u64 sndr_key; u64 rcvr_key; u64 data_seq; u32 subflow_seq; u16 data_len; __sum16 csum; }; struct { struct mptcp_addr_info addr; u64 ahmac; }; struct { struct mptcp_ext ext_copy; u64 fail_seq; }; struct { u32 nonce; u32 token; u64 thmac; u8 hmac[MPTCPOPT_HMAC_LEN]; }; }; #endif }; #define MPTCP_SCHED_NAME_MAX 16 #define MPTCP_SUBFLOWS_MAX 8 struct mptcp_sched_data { bool reinject; u8 subflows; struct mptcp_subflow_context *contexts[MPTCP_SUBFLOWS_MAX]; }; struct mptcp_sched_ops { int (*get_subflow)(struct mptcp_sock *msk, struct mptcp_sched_data *data); char name[MPTCP_SCHED_NAME_MAX]; struct module *owner; struct list_head list; void (*init)(struct mptcp_sock *msk); void (*release)(struct mptcp_sock *msk); } ____cacheline_aligned_in_smp; #ifdef CONFIG_MPTCP void mptcp_init(void); static inline bool sk_is_mptcp(const struct sock *sk) { return tcp_sk(sk)->is_mptcp; } static inline bool rsk_is_mptcp(const struct request_sock *req) { return tcp_rsk(req)->is_mptcp; } static inline bool rsk_drop_req(const struct request_sock *req) { return tcp_rsk(req)->is_mptcp && tcp_rsk(req)->drop_req; } void mptcp_space(const struct sock *ssk, int *space, int *full_space); bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, unsigned int *size, struct mptcp_out_options *opts); bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, struct mptcp_out_options *opts); bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts); bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb); void mptcp_write_options(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp, struct mptcp_out_options *opts); void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info); /* move the skb extension owership, with the assumption that 'to' is * newly allocated */ static inline void mptcp_skb_ext_move(struct sk_buff *to, struct sk_buff *from) { if (!skb_ext_exist(from, SKB_EXT_MPTCP)) return; if (WARN_ON_ONCE(to->active_extensions)) skb_ext_put(to); to->active_extensions = from->active_extensions; to->extensions = from->extensions; from->active_extensions = 0; } static inline void mptcp_skb_ext_copy(struct sk_buff *to, struct sk_buff *from) { struct mptcp_ext *from_ext; from_ext = skb_ext_find(from, SKB_EXT_MPTCP); if (!from_ext) return; from_ext->frozen = 1; skb_ext_copy(to, from); } static inline bool mptcp_ext_matches(const struct mptcp_ext *to_ext, const struct mptcp_ext *from_ext) { /* MPTCP always clears the ext when adding it to the skb, so * holes do not bother us here */ return !from_ext || (to_ext && from_ext && !memcmp(from_ext, to_ext, sizeof(struct mptcp_ext))); } /* check if skbs can be collapsed. * MPTCP collapse is allowed if neither @to or @from carry an mptcp data * mapping, or if the extension of @to is the same as @from. * Collapsing is not possible if @to lacks an extension, but @from carries one. */ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, const struct sk_buff *from) { return mptcp_ext_matches(skb_ext_find(to, SKB_EXT_MPTCP), skb_ext_find(from, SKB_EXT_MPTCP)); } void mptcp_seq_show(struct seq_file *seq); int mptcp_subflow_init_cookie_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb); struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, bool attach_listener); __be32 mptcp_get_reset_option(const struct sk_buff *skb); static inline __be32 mptcp_reset_option(const struct sk_buff *skb) { if (skb_ext_exist(skb, SKB_EXT_MPTCP)) return mptcp_get_reset_option(skb); return htonl(0u); } #else static inline void mptcp_init(void) { } static inline bool sk_is_mptcp(const struct sock *sk) { return false; } static inline bool rsk_is_mptcp(const struct request_sock *req) { return false; } static inline bool rsk_drop_req(const struct request_sock *req) { return false; } static inline bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, unsigned int *size, struct mptcp_out_options *opts) { return false; } static inline bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, struct mptcp_out_options *opts) { return false; } static inline bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { return false; } static inline bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) { return true; } static inline void mptcp_skb_ext_move(struct sk_buff *to, const struct sk_buff *from) { } static inline void mptcp_skb_ext_copy(struct sk_buff *to, struct sk_buff *from) { } static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, const struct sk_buff *from) { return true; } static inline void mptcp_space(const struct sock *ssk, int *s, int *fs) { } static inline void mptcp_seq_show(struct seq_file *seq) { } static inline int mptcp_subflow_init_cookie_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb) { return 0; /* TCP fallback */ } static inline struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, bool attach_listener) { return NULL; } static inline __be32 mptcp_reset_option(const struct sk_buff *skb) { return htonl(0u); } #endif /* CONFIG_MPTCP */ #if IS_ENABLED(CONFIG_MPTCP_IPV6) int mptcpv6_init(void); void mptcpv6_handle_mapped(struct sock *sk, bool mapped); #elif IS_ENABLED(CONFIG_IPV6) static inline int mptcpv6_init(void) { return 0; } static inline void mptcpv6_handle_mapped(struct sock *sk, bool mapped) { } #endif #if defined(CONFIG_MPTCP) && defined(CONFIG_BPF_SYSCALL) struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk); #else static inline struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk) { return NULL; } #endif #if !IS_ENABLED(CONFIG_MPTCP) struct mptcp_sock { }; #endif #endif /* __NET_MPTCP_H */ |
| 1051 1052 10 10 10 10 10 10 10 1061 1057 1058 10 102 102 102 102 102 102 102 102 102 102 102 101 102 102 102 102 69 70 67 2 70 2 67 1046 1051 1047 1051 1051 1050 982 979 982 980 980 984 983 982 984 1053 1051 1053 1046 1048 1049 1049 1049 983 982 70 70 70 70 68 68 68 1045 1045 1045 1045 1046 1047 1050 1047 1049 1046 1047 1045 1048 12 1046 3 4 1 1052 1049 1052 1049 1051 8 5 5 5 5 5 11 8 11 11 11 11 11 5 1047 1049 1 1 1 1 1 1 1 1 1 1 4 4 4 4 20 10 10 235 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 | // SPDX-License-Identifier: GPL-2.0-or-later /* * TUN - Universal TUN/TAP device driver. * Copyright (C) 1999-2002 Maxim Krasnyansky <maxk@qualcomm.com> * * $Id: tun.c,v 1.15 2002/03/01 02:44:24 maxk Exp $ */ /* * Changes: * * Mike Kershaw <dragorn@kismetwireless.net> 2005/08/14 * Add TUNSETLINK ioctl to set the link encapsulation * * Mark Smith <markzzzsmith@yahoo.com.au> * Use eth_random_addr() for tap MAC address. * * Harald Roelle <harald.roelle@ifi.lmu.de> 2004/04/20 * Fixes in packet dropping, queue length setting and queue wakeup. * Increased default tx queue length. * Added ethtool API. * Minor cleanups * * Daniel Podlejski <underley@underley.eu.org> * Modifications for 2.3.99-pre5 kernel. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define DRV_NAME "tun" #define DRV_VERSION "1.6" #define DRV_DESCRIPTION "Universal TUN/TAP device driver" #define DRV_COPYRIGHT "(C) 1999-2004 Max Krasnyansky <maxk@qualcomm.com>" #include <linux/module.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/major.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/fcntl.h> #include <linux/init.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/miscdevice.h> #include <linux/ethtool.h> #include <linux/rtnetlink.h> #include <linux/compat.h> #include <linux/if.h> #include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/if_tun.h> #include <linux/if_vlan.h> #include <linux/crc32.h> #include <linux/nsproxy.h> #include <linux/virtio_net.h> #include <linux/rcupdate.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/sock.h> #include <net/xdp.h> #include <net/ip_tunnels.h> #include <linux/seq_file.h> #include <linux/uio.h> #include <linux/skb_array.h> #include <linux/bpf.h> #include <linux/bpf_trace.h> #include <linux/mutex.h> #include <linux/ieee802154.h> #include <linux/if_ltalk.h> #include <uapi/linux/if_fddi.h> #include <uapi/linux/if_hippi.h> #include <uapi/linux/if_fc.h> #include <net/ax25.h> #include <net/rose.h> #include <net/6lowpan.h> #include <linux/uaccess.h> #include <linux/proc_fs.h> static void tun_default_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd); #define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD) /* TUN device flags */ /* IFF_ATTACH_QUEUE is never stored in device flags, * overload it to mean fasync when stored there. */ #define TUN_FASYNC IFF_ATTACH_QUEUE /* High bits in flags field are unused. */ #define TUN_VNET_LE 0x80000000 #define TUN_VNET_BE 0x40000000 #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \ IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS) #define GOODCOPY_LEN 128 #define FLT_EXACT_COUNT 8 struct tap_filter { unsigned int count; /* Number of addrs. Zero means disabled */ u32 mask[2]; /* Mask of the hashed addrs */ unsigned char addr[FLT_EXACT_COUNT][ETH_ALEN]; }; /* MAX_TAP_QUEUES 256 is chosen to allow rx/tx queues to be equal * to max number of VCPUs in guest. */ #define MAX_TAP_QUEUES 256 #define MAX_TAP_FLOWS 4096 #define TUN_FLOW_EXPIRE (3 * HZ) /* A tun_file connects an open character device to a tuntap netdevice. It * also contains all socket related structures (except sock_fprog and tap_filter) * to serve as one transmit queue for tuntap device. The sock_fprog and * tap_filter were kept in tun_struct since they were used for filtering for the * netdevice not for a specific queue (at least I didn't see the requirement for * this). * * RCU usage: * The tun_file and tun_struct are loosely coupled, the pointer from one to the * other can only be read while rcu_read_lock or rtnl_lock is held. */ struct tun_file { struct sock sk; struct socket socket; struct tun_struct __rcu *tun; struct fasync_struct *fasync; /* only used for fasnyc */ unsigned int flags; union { u16 queue_index; unsigned int ifindex; }; struct napi_struct napi; bool napi_enabled; bool napi_frags_enabled; struct mutex napi_mutex; /* Protects access to the above napi */ struct list_head next; struct tun_struct *detached; struct ptr_ring tx_ring; struct xdp_rxq_info xdp_rxq; }; struct tun_page { struct page *page; int count; }; struct tun_flow_entry { struct hlist_node hash_link; struct rcu_head rcu; struct tun_struct *tun; u32 rxhash; u32 rps_rxhash; int queue_index; unsigned long updated ____cacheline_aligned_in_smp; }; #define TUN_NUM_FLOW_ENTRIES 1024 #define TUN_MASK_FLOW_ENTRIES (TUN_NUM_FLOW_ENTRIES - 1) struct tun_prog { struct rcu_head rcu; struct bpf_prog *prog; }; /* Since the socket were moved to tun_file, to preserve the behavior of persist * device, socket filter, sndbuf and vnet header size were restore when the * file were attached to a persist device. */ struct tun_struct { struct tun_file __rcu *tfiles[MAX_TAP_QUEUES]; unsigned int numqueues; unsigned int flags; kuid_t owner; kgid_t group; struct net_device *dev; netdev_features_t set_features; #define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \ NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4) int align; int vnet_hdr_sz; int sndbuf; struct tap_filter txflt; struct sock_fprog fprog; /* protected by rtnl lock */ bool filter_attached; u32 msg_enable; spinlock_t lock; struct hlist_head flows[TUN_NUM_FLOW_ENTRIES]; struct timer_list flow_gc_timer; unsigned long ageing_time; unsigned int numdisabled; struct list_head disabled; void *security; u32 flow_count; u32 rx_batched; atomic_long_t rx_frame_errors; struct bpf_prog __rcu *xdp_prog; struct tun_prog __rcu *steering_prog; struct tun_prog __rcu *filter_prog; struct ethtool_link_ksettings link_ksettings; /* init args */ struct file *file; struct ifreq *ifr; }; struct veth { __be16 h_vlan_proto; __be16 h_vlan_TCI; }; static void tun_flow_init(struct tun_struct *tun); static void tun_flow_uninit(struct tun_struct *tun); static int tun_napi_receive(struct napi_struct *napi, int budget) { struct tun_file *tfile = container_of(napi, struct tun_file, napi); struct sk_buff_head *queue = &tfile->sk.sk_write_queue; struct sk_buff_head process_queue; struct sk_buff *skb; int received = 0; __skb_queue_head_init(&process_queue); spin_lock(&queue->lock); skb_queue_splice_tail_init(queue, &process_queue); spin_unlock(&queue->lock); while (received < budget && (skb = __skb_dequeue(&process_queue))) { napi_gro_receive(napi, skb); ++received; } if (!skb_queue_empty(&process_queue)) { spin_lock(&queue->lock); skb_queue_splice(&process_queue, queue); spin_unlock(&queue->lock); } return received; } static int tun_napi_poll(struct napi_struct *napi, int budget) { unsigned int received; received = tun_napi_receive(napi, budget); if (received < budget) napi_complete_done(napi, received); return received; } static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile, bool napi_en, bool napi_frags) { tfile->napi_enabled = napi_en; tfile->napi_frags_enabled = napi_en && napi_frags; if (napi_en) { netif_napi_add_tx(tun->dev, &tfile->napi, tun_napi_poll); napi_enable(&tfile->napi); } } static void tun_napi_enable(struct tun_file *tfile) { if (tfile->napi_enabled) napi_enable(&tfile->napi); } static void tun_napi_disable(struct tun_file *tfile) { if (tfile->napi_enabled) napi_disable(&tfile->napi); } static void tun_napi_del(struct tun_file *tfile) { if (tfile->napi_enabled) netif_napi_del(&tfile->napi); } static bool tun_napi_frags_enabled(const struct tun_file *tfile) { return tfile->napi_frags_enabled; } #ifdef CONFIG_TUN_VNET_CROSS_LE static inline bool tun_legacy_is_little_endian(struct tun_struct *tun) { return tun->flags & TUN_VNET_BE ? false : virtio_legacy_is_little_endian(); } static long tun_get_vnet_be(struct tun_struct *tun, int __user *argp) { int be = !!(tun->flags & TUN_VNET_BE); if (put_user(be, argp)) return -EFAULT; return 0; } static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp) { int be; if (get_user(be, argp)) return -EFAULT; if (be) tun->flags |= TUN_VNET_BE; else tun->flags &= ~TUN_VNET_BE; return 0; } #else static inline bool tun_legacy_is_little_endian(struct tun_struct *tun) { return virtio_legacy_is_little_endian(); } static long tun_get_vnet_be(struct tun_struct *tun, int __user *argp) { return -EINVAL; } static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp) { return -EINVAL; } #endif /* CONFIG_TUN_VNET_CROSS_LE */ static inline bool tun_is_little_endian(struct tun_struct *tun) { return tun->flags & TUN_VNET_LE || tun_legacy_is_little_endian(tun); } static inline u16 tun16_to_cpu(struct tun_struct *tun, __virtio16 val) { return __virtio16_to_cpu(tun_is_little_endian(tun), val); } static inline __virtio16 cpu_to_tun16(struct tun_struct *tun, u16 val) { return __cpu_to_virtio16(tun_is_little_endian(tun), val); } static inline u32 tun_hashfn(u32 rxhash) { return rxhash & TUN_MASK_FLOW_ENTRIES; } static struct tun_flow_entry *tun_flow_find(struct hlist_head *head, u32 rxhash) { struct tun_flow_entry *e; hlist_for_each_entry_rcu(e, head, hash_link) { if (e->rxhash == rxhash) return e; } return NULL; } static struct tun_flow_entry *tun_flow_create(struct tun_struct *tun, struct hlist_head *head, u32 rxhash, u16 queue_index) { struct tun_flow_entry *e = kmalloc(sizeof(*e), GFP_ATOMIC); if (e) { netif_info(tun, tx_queued, tun->dev, "create flow: hash %u index %u\n", rxhash, queue_index); e->updated = jiffies; e->rxhash = rxhash; e->rps_rxhash = 0; e->queue_index = queue_index; e->tun = tun; hlist_add_head_rcu(&e->hash_link, head); ++tun->flow_count; } return e; } static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e) { netif_info(tun, tx_queued, tun->dev, "delete flow: hash %u index %u\n", e->rxhash, e->queue_index); hlist_del_rcu(&e->hash_link); kfree_rcu(e, rcu); --tun->flow_count; } static void tun_flow_flush(struct tun_struct *tun) { int i; spin_lock_bh(&tun->lock); for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { struct tun_flow_entry *e; struct hlist_node *n; hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) tun_flow_delete(tun, e); } spin_unlock_bh(&tun->lock); } static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index) { int i; spin_lock_bh(&tun->lock); for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { struct tun_flow_entry *e; struct hlist_node *n; hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) { if (e->queue_index == queue_index) tun_flow_delete(tun, e); } } spin_unlock_bh(&tun->lock); } static void tun_flow_cleanup(struct timer_list *t) { struct tun_struct *tun = from_timer(tun, t, flow_gc_timer); unsigned long delay = tun->ageing_time; unsigned long next_timer = jiffies + delay; unsigned long count = 0; int i; spin_lock(&tun->lock); for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { struct tun_flow_entry *e; struct hlist_node *n; hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) { unsigned long this_timer; this_timer = e->updated + delay; if (time_before_eq(this_timer, jiffies)) { tun_flow_delete(tun, e); continue; } count++; if (time_before(this_timer, next_timer)) next_timer = this_timer; } } if (count) mod_timer(&tun->flow_gc_timer, round_jiffies_up(next_timer)); spin_unlock(&tun->lock); } static void tun_flow_update(struct tun_struct *tun, u32 rxhash, struct tun_file *tfile) { struct hlist_head *head; struct tun_flow_entry *e; unsigned long delay = tun->ageing_time; u16 queue_index = tfile->queue_index; head = &tun->flows[tun_hashfn(rxhash)]; rcu_read_lock(); e = tun_flow_find(head, rxhash); if (likely(e)) { /* TODO: keep queueing to old queue until it's empty? */ if (READ_ONCE(e->queue_index) != queue_index) WRITE_ONCE(e->queue_index, queue_index); if (e->updated != jiffies) e->updated = jiffies; sock_rps_record_flow_hash(e->rps_rxhash); } else { spin_lock_bh(&tun->lock); if (!tun_flow_find(head, rxhash) && tun->flow_count < MAX_TAP_FLOWS) tun_flow_create(tun, head, rxhash, queue_index); if (!timer_pending(&tun->flow_gc_timer)) mod_timer(&tun->flow_gc_timer, round_jiffies_up(jiffies + delay)); spin_unlock_bh(&tun->lock); } rcu_read_unlock(); } /* Save the hash received in the stack receive path and update the * flow_hash table accordingly. */ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash) { if (unlikely(e->rps_rxhash != hash)) e->rps_rxhash = hash; } /* We try to identify a flow through its rxhash. The reason that * we do not check rxq no. is because some cards(e.g 82599), chooses * the rxq based on the txq where the last packet of the flow comes. As * the userspace application move between processors, we may get a * different rxq no. here. */ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) { struct tun_flow_entry *e; u32 txq = 0; u32 numqueues = 0; numqueues = READ_ONCE(tun->numqueues); txq = __skb_get_hash_symmetric(skb); e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq); if (e) { tun_flow_save_rps_rxhash(e, txq); txq = e->queue_index; } else { /* use multiply and shift instead of expensive divide */ txq = ((u64)txq * numqueues) >> 32; } return txq; } static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb) { struct tun_prog *prog; u32 numqueues; u16 ret = 0; numqueues = READ_ONCE(tun->numqueues); if (!numqueues) return 0; prog = rcu_dereference(tun->steering_prog); if (prog) ret = bpf_prog_run_clear_cb(prog->prog, skb); return ret % numqueues; } static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { struct tun_struct *tun = netdev_priv(dev); u16 ret; rcu_read_lock(); if (rcu_dereference(tun->steering_prog)) ret = tun_ebpf_select_queue(tun, skb); else ret = tun_automq_select_queue(tun, skb); rcu_read_unlock(); return ret; } static inline bool tun_not_capable(struct tun_struct *tun) { const struct cred *cred = current_cred(); struct net *net = dev_net(tun->dev); return ((uid_valid(tun->owner) && !uid_eq(cred->euid, tun->owner)) || (gid_valid(tun->group) && !in_egroup_p(tun->group))) && !ns_capable(net->user_ns, CAP_NET_ADMIN); } static void tun_set_real_num_queues(struct tun_struct *tun) { netif_set_real_num_tx_queues(tun->dev, tun->numqueues); netif_set_real_num_rx_queues(tun->dev, tun->numqueues); } static void tun_disable_queue(struct tun_struct *tun, struct tun_file *tfile) { tfile->detached = tun; list_add_tail(&tfile->next, &tun->disabled); ++tun->numdisabled; } static struct tun_struct *tun_enable_queue(struct tun_file *tfile) { struct tun_struct *tun = tfile->detached; tfile->detached = NULL; list_del_init(&tfile->next); --tun->numdisabled; return tun; } void tun_ptr_free(void *ptr) { if (!ptr) return; if (tun_is_xdp_frame(ptr)) { struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); xdp_return_frame(xdpf); } else { __skb_array_destroy_skb(ptr); } } EXPORT_SYMBOL_GPL(tun_ptr_free); static void tun_queue_purge(struct tun_file *tfile) { void *ptr; while ((ptr = ptr_ring_consume(&tfile->tx_ring)) != NULL) tun_ptr_free(ptr); skb_queue_purge(&tfile->sk.sk_write_queue); skb_queue_purge(&tfile->sk.sk_error_queue); } static void __tun_detach(struct tun_file *tfile, bool clean) { struct tun_file *ntfile; struct tun_struct *tun; tun = rtnl_dereference(tfile->tun); if (tun && clean) { if (!tfile->detached) tun_napi_disable(tfile); tun_napi_del(tfile); } if (tun && !tfile->detached) { u16 index = tfile->queue_index; BUG_ON(index >= tun->numqueues); rcu_assign_pointer(tun->tfiles[index], tun->tfiles[tun->numqueues - 1]); ntfile = rtnl_dereference(tun->tfiles[index]); ntfile->queue_index = index; rcu_assign_pointer(tun->tfiles[tun->numqueues - 1], NULL); --tun->numqueues; if (clean) { RCU_INIT_POINTER(tfile->tun, NULL); sock_put(&tfile->sk); } else { tun_disable_queue(tun, tfile); tun_napi_disable(tfile); } synchronize_net(); tun_flow_delete_by_queue(tun, tun->numqueues + 1); /* Drop read queue */ tun_queue_purge(tfile); tun_set_real_num_queues(tun); } else if (tfile->detached && clean) { tun = tun_enable_queue(tfile); sock_put(&tfile->sk); } if (clean) { if (tun && tun->numqueues == 0 && tun->numdisabled == 0) { netif_carrier_off(tun->dev); if (!(tun->flags & IFF_PERSIST) && tun->dev->reg_state == NETREG_REGISTERED) unregister_netdevice(tun->dev); } if (tun) xdp_rxq_info_unreg(&tfile->xdp_rxq); ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free); } } static void tun_detach(struct tun_file *tfile, bool clean) { struct tun_struct *tun; struct net_device *dev; rtnl_lock(); tun = rtnl_dereference(tfile->tun); dev = tun ? tun->dev : NULL; __tun_detach(tfile, clean); if (dev) netdev_state_change(dev); rtnl_unlock(); if (clean) sock_put(&tfile->sk); } static void tun_detach_all(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); struct tun_file *tfile, *tmp; int i, n = tun->numqueues; for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); BUG_ON(!tfile); tun_napi_disable(tfile); tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN; tfile->socket.sk->sk_data_ready(tfile->socket.sk); RCU_INIT_POINTER(tfile->tun, NULL); --tun->numqueues; } list_for_each_entry(tfile, &tun->disabled, next) { tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN; tfile->socket.sk->sk_data_ready(tfile->socket.sk); RCU_INIT_POINTER(tfile->tun, NULL); } BUG_ON(tun->numqueues != 0); synchronize_net(); for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); tun_napi_del(tfile); /* Drop read queue */ tun_queue_purge(tfile); xdp_rxq_info_unreg(&tfile->xdp_rxq); sock_put(&tfile->sk); } list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) { tun_napi_del(tfile); tun_enable_queue(tfile); tun_queue_purge(tfile); xdp_rxq_info_unreg(&tfile->xdp_rxq); sock_put(&tfile->sk); } BUG_ON(tun->numdisabled != 0); if (tun->flags & IFF_PERSIST) module_put(THIS_MODULE); } static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter, bool napi, bool napi_frags, bool publish_tun) { struct tun_file *tfile = file->private_data; struct net_device *dev = tun->dev; int err; err = security_tun_dev_attach(tfile->socket.sk, tun->security); if (err < 0) goto out; err = -EINVAL; if (rtnl_dereference(tfile->tun) && !tfile->detached) goto out; err = -EBUSY; if (!(tun->flags & IFF_MULTI_QUEUE) && tun->numqueues == 1) goto out; err = -E2BIG; if (!tfile->detached && tun->numqueues + tun->numdisabled == MAX_TAP_QUEUES) goto out; err = 0; /* Re-attach the filter to persist device */ if (!skip_filter && (tun->filter_attached == true)) { lock_sock(tfile->socket.sk); err = sk_attach_filter(&tun->fprog, tfile->socket.sk); release_sock(tfile->socket.sk); if (!err) goto out; } if (!tfile->detached && ptr_ring_resize(&tfile->tx_ring, dev->tx_queue_len, GFP_KERNEL, tun_ptr_free)) { err = -ENOMEM; goto out; } tfile->queue_index = tun->numqueues; tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN; if (tfile->detached) { /* Re-attach detached tfile, updating XDP queue_index */ WARN_ON(!xdp_rxq_info_is_reg(&tfile->xdp_rxq)); if (tfile->xdp_rxq.queue_index != tfile->queue_index) tfile->xdp_rxq.queue_index = tfile->queue_index; } else { /* Setup XDP RX-queue info, for new tfile getting attached */ err = xdp_rxq_info_reg(&tfile->xdp_rxq, tun->dev, tfile->queue_index, 0); if (err < 0) goto out; err = xdp_rxq_info_reg_mem_model(&tfile->xdp_rxq, MEM_TYPE_PAGE_SHARED, NULL); if (err < 0) { xdp_rxq_info_unreg(&tfile->xdp_rxq); goto out; } err = 0; } if (tfile->detached) { tun_enable_queue(tfile); tun_napi_enable(tfile); } else { sock_hold(&tfile->sk); tun_napi_init(tun, tfile, napi, napi_frags); } if (rtnl_dereference(tun->xdp_prog)) sock_set_flag(&tfile->sk, SOCK_XDP); /* device is allowed to go away first, so no need to hold extra * refcnt. */ /* Publish tfile->tun and tun->tfiles only after we've fully * initialized tfile; otherwise we risk using half-initialized * object. */ if (publish_tun) rcu_assign_pointer(tfile->tun, tun); rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile); tun->numqueues++; tun_set_real_num_queues(tun); out: return err; } static struct tun_struct *tun_get(struct tun_file *tfile) { struct tun_struct *tun; rcu_read_lock(); tun = rcu_dereference(tfile->tun); if (tun) dev_hold(tun->dev); rcu_read_unlock(); return tun; } static void tun_put(struct tun_struct *tun) { dev_put(tun->dev); } /* TAP filtering */ static void addr_hash_set(u32 *mask, const u8 *addr) { int n = ether_crc(ETH_ALEN, addr) >> 26; mask[n >> 5] |= (1 << (n & 31)); } static unsigned int addr_hash_test(const u32 *mask, const u8 *addr) { int n = ether_crc(ETH_ALEN, addr) >> 26; return mask[n >> 5] & (1 << (n & 31)); } static int update_filter(struct tap_filter *filter, void __user *arg) { struct { u8 u[ETH_ALEN]; } *addr; struct tun_filter uf; int err, alen, n, nexact; if (copy_from_user(&uf, arg, sizeof(uf))) return -EFAULT; if (!uf.count) { /* Disabled */ filter->count = 0; return 0; } alen = ETH_ALEN * uf.count; addr = memdup_user(arg + sizeof(uf), alen); if (IS_ERR(addr)) return PTR_ERR(addr); /* The filter is updated without holding any locks. Which is * perfectly safe. We disable it first and in the worst * case we'll accept a few undesired packets. */ filter->count = 0; wmb(); /* Use first set of addresses as an exact filter */ for (n = 0; n < uf.count && n < FLT_EXACT_COUNT; n++) memcpy(filter->addr[n], addr[n].u, ETH_ALEN); nexact = n; /* Remaining multicast addresses are hashed, * unicast will leave the filter disabled. */ memset(filter->mask, 0, sizeof(filter->mask)); for (; n < uf.count; n++) { if (!is_multicast_ether_addr(addr[n].u)) { err = 0; /* no filter */ goto free_addr; } addr_hash_set(filter->mask, addr[n].u); } /* For ALLMULTI just set the mask to all ones. * This overrides the mask populated above. */ if ((uf.flags & TUN_FLT_ALLMULTI)) memset(filter->mask, ~0, sizeof(filter->mask)); /* Now enable the filter */ wmb(); filter->count = nexact; /* Return the number of exact filters */ err = nexact; free_addr: kfree(addr); return err; } /* Returns: 0 - drop, !=0 - accept */ static int run_filter(struct tap_filter *filter, const struct sk_buff *skb) { /* Cannot use eth_hdr(skb) here because skb_mac_hdr() is incorrect * at this point. */ struct ethhdr *eh = (struct ethhdr *) skb->data; int i; /* Exact match */ for (i = 0; i < filter->count; i++) if (ether_addr_equal(eh->h_dest, filter->addr[i])) return 1; /* Inexact match (multicast only) */ if (is_multicast_ether_addr(eh->h_dest)) return addr_hash_test(filter->mask, eh->h_dest); return 0; } /* * Checks whether the packet is accepted or not. * Returns: 0 - drop, !=0 - accept */ static int check_filter(struct tap_filter *filter, const struct sk_buff *skb) { if (!filter->count) return 1; return run_filter(filter, skb); } /* Network device part of the driver */ static const struct ethtool_ops tun_ethtool_ops; static int tun_net_init(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); struct ifreq *ifr = tun->ifr; int err; dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; spin_lock_init(&tun->lock); err = security_tun_dev_alloc_security(&tun->security); if (err < 0) { free_percpu(dev->tstats); return err; } tun_flow_init(tun); dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; dev->features = dev->hw_features | NETIF_F_LLTX; dev->vlan_features = dev->features & ~(NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); tun->flags = (tun->flags & ~TUN_FEATURES) | (ifr->ifr_flags & TUN_FEATURES); INIT_LIST_HEAD(&tun->disabled); err = tun_attach(tun, tun->file, false, ifr->ifr_flags & IFF_NAPI, ifr->ifr_flags & IFF_NAPI_FRAGS, false); if (err < 0) { tun_flow_uninit(tun); security_tun_dev_free_security(tun->security); free_percpu(dev->tstats); return err; } return 0; } /* Net device detach from fd. */ static void tun_net_uninit(struct net_device *dev) { tun_detach_all(dev); } /* Net device open. */ static int tun_net_open(struct net_device *dev) { netif_tx_start_all_queues(dev); return 0; } /* Net device close. */ static int tun_net_close(struct net_device *dev) { netif_tx_stop_all_queues(dev); return 0; } /* Net device start xmit */ static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb) { #ifdef CONFIG_RPS if (tun->numqueues == 1 && static_branch_unlikely(&rps_needed)) { /* Select queue was not called for the skbuff, so we extract the * RPS hash and save it into the flow_table here. */ struct tun_flow_entry *e; __u32 rxhash; rxhash = __skb_get_hash_symmetric(skb); e = tun_flow_find(&tun->flows[tun_hashfn(rxhash)], rxhash); if (e) tun_flow_save_rps_rxhash(e, rxhash); } #endif } static unsigned int run_ebpf_filter(struct tun_struct *tun, struct sk_buff *skb, int len) { struct tun_prog *prog = rcu_dereference(tun->filter_prog); if (prog) len = bpf_prog_run_clear_cb(prog->prog, skb); return len; } /* Net device start xmit */ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); enum skb_drop_reason drop_reason; int txq = skb->queue_mapping; struct netdev_queue *queue; struct tun_file *tfile; int len = skb->len; rcu_read_lock(); tfile = rcu_dereference(tun->tfiles[txq]); /* Drop packet if interface is not attached */ if (!tfile) { drop_reason = SKB_DROP_REASON_DEV_READY; goto drop; } if (!rcu_dereference(tun->steering_prog)) tun_automq_xmit(tun, skb); netif_info(tun, tx_queued, tun->dev, "%s %d\n", __func__, skb->len); /* Drop if the filter does not like it. * This is a noop if the filter is disabled. * Filter can be enabled only for the TAP devices. */ if (!check_filter(&tun->txflt, skb)) { drop_reason = SKB_DROP_REASON_TAP_TXFILTER; goto drop; } if (tfile->socket.sk->sk_filter && sk_filter(tfile->socket.sk, skb)) { drop_reason = SKB_DROP_REASON_SOCKET_FILTER; goto drop; } len = run_ebpf_filter(tun, skb, len); if (len == 0) { drop_reason = SKB_DROP_REASON_TAP_FILTER; goto drop; } if (pskb_trim(skb, len)) { drop_reason = SKB_DROP_REASON_NOMEM; goto drop; } if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) { drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT; goto drop; } skb_tx_timestamp(skb); /* Orphan the skb - required as we might hang on to it * for indefinite time. */ skb_orphan(skb); nf_reset_ct(skb); if (ptr_ring_produce(&tfile->tx_ring, skb)) { drop_reason = SKB_DROP_REASON_FULL_RING; goto drop; } /* NETIF_F_LLTX requires to do our own update of trans_start */ queue = netdev_get_tx_queue(dev, txq); txq_trans_cond_update(queue); /* Notify and wake up reader process */ if (tfile->flags & TUN_FASYNC) kill_fasync(&tfile->fasync, SIGIO, POLL_IN); tfile->socket.sk->sk_data_ready(tfile->socket.sk); rcu_read_unlock(); return NETDEV_TX_OK; drop: dev_core_stats_tx_dropped_inc(dev); skb_tx_error(skb); kfree_skb_reason(skb, drop_reason); rcu_read_unlock(); return NET_XMIT_DROP; } static void tun_net_mclist(struct net_device *dev) { /* * This callback is supposed to deal with mc filter in * _rx_ path and has nothing to do with the _tx_ path. * In rx path we always accept everything userspace gives us. */ } static netdev_features_t tun_net_fix_features(struct net_device *dev, netdev_features_t features) { struct tun_struct *tun = netdev_priv(dev); return (features & tun->set_features) | (features & ~TUN_USER_FEATURES); } static void tun_set_headroom(struct net_device *dev, int new_hr) { struct tun_struct *tun = netdev_priv(dev); if (new_hr < NET_SKB_PAD) new_hr = NET_SKB_PAD; tun->align = new_hr; } static void tun_net_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct tun_struct *tun = netdev_priv(dev); dev_get_tstats64(dev, stats); stats->rx_frame_errors += (unsigned long)atomic_long_read(&tun->rx_frame_errors); } static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog, struct netlink_ext_ack *extack) { struct tun_struct *tun = netdev_priv(dev); struct tun_file *tfile; struct bpf_prog *old_prog; int i; old_prog = rtnl_dereference(tun->xdp_prog); rcu_assign_pointer(tun->xdp_prog, prog); if (old_prog) bpf_prog_put(old_prog); for (i = 0; i < tun->numqueues; i++) { tfile = rtnl_dereference(tun->tfiles[i]); if (prog) sock_set_flag(&tfile->sk, SOCK_XDP); else sock_reset_flag(&tfile->sk, SOCK_XDP); } list_for_each_entry(tfile, &tun->disabled, next) { if (prog) sock_set_flag(&tfile->sk, SOCK_XDP); else sock_reset_flag(&tfile->sk, SOCK_XDP); } return 0; } static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: return tun_xdp_set(dev, xdp->prog, xdp->extack); default: return -EINVAL; } } static int tun_net_change_carrier(struct net_device *dev, bool new_carrier) { if (new_carrier) { struct tun_struct *tun = netdev_priv(dev); if (!tun->numqueues) return -EPERM; netif_carrier_on(dev); } else { netif_carrier_off(dev); } return 0; } static const struct net_device_ops tun_netdev_ops = { .ndo_init = tun_net_init, .ndo_uninit = tun_net_uninit, .ndo_open = tun_net_open, .ndo_stop = tun_net_close, .ndo_start_xmit = tun_net_xmit, .ndo_fix_features = tun_net_fix_features, .ndo_select_queue = tun_select_queue, .ndo_set_rx_headroom = tun_set_headroom, .ndo_get_stats64 = tun_net_get_stats64, .ndo_change_carrier = tun_net_change_carrier, }; static void __tun_xdp_flush_tfile(struct tun_file *tfile) { /* Notify and wake up reader process */ if (tfile->flags & TUN_FASYNC) kill_fasync(&tfile->fasync, SIGIO, POLL_IN); tfile->socket.sk->sk_data_ready(tfile->socket.sk); } static int tun_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags) { struct tun_struct *tun = netdev_priv(dev); struct tun_file *tfile; u32 numqueues; int nxmit = 0; int i; if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) return -EINVAL; rcu_read_lock(); resample: numqueues = READ_ONCE(tun->numqueues); if (!numqueues) { rcu_read_unlock(); return -ENXIO; /* Caller will free/return all frames */ } tfile = rcu_dereference(tun->tfiles[smp_processor_id() % numqueues]); if (unlikely(!tfile)) goto resample; spin_lock(&tfile->tx_ring.producer_lock); for (i = 0; i < n; i++) { struct xdp_frame *xdp = frames[i]; /* Encode the XDP flag into lowest bit for consumer to differ * XDP buffer from sk_buff. */ void *frame = tun_xdp_to_ptr(xdp); if (__ptr_ring_produce(&tfile->tx_ring, frame)) { dev_core_stats_tx_dropped_inc(dev); break; } nxmit++; } spin_unlock(&tfile->tx_ring.producer_lock); if (flags & XDP_XMIT_FLUSH) __tun_xdp_flush_tfile(tfile); rcu_read_unlock(); return nxmit; } static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp) { struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp); int nxmit; if (unlikely(!frame)) return -EOVERFLOW; nxmit = tun_xdp_xmit(dev, 1, &frame, XDP_XMIT_FLUSH); if (!nxmit) xdp_return_frame_rx_napi(frame); return nxmit; } static const struct net_device_ops tap_netdev_ops = { .ndo_init = tun_net_init, .ndo_uninit = tun_net_uninit, .ndo_open = tun_net_open, .ndo_stop = tun_net_close, .ndo_start_xmit = tun_net_xmit, .ndo_fix_features = tun_net_fix_features, .ndo_set_rx_mode = tun_net_mclist, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_select_queue = tun_select_queue, .ndo_features_check = passthru_features_check, .ndo_set_rx_headroom = tun_set_headroom, .ndo_get_stats64 = dev_get_tstats64, .ndo_bpf = tun_xdp, .ndo_xdp_xmit = tun_xdp_xmit, .ndo_change_carrier = tun_net_change_carrier, }; static void tun_flow_init(struct tun_struct *tun) { int i; for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) INIT_HLIST_HEAD(&tun->flows[i]); tun->ageing_time = TUN_FLOW_EXPIRE; timer_setup(&tun->flow_gc_timer, tun_flow_cleanup, 0); mod_timer(&tun->flow_gc_timer, round_jiffies_up(jiffies + tun->ageing_time)); } static void tun_flow_uninit(struct tun_struct *tun) { del_timer_sync(&tun->flow_gc_timer); tun_flow_flush(tun); } #define MIN_MTU 68 #define MAX_MTU 65535 /* Initialize net device. */ static void tun_net_initialize(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); switch (tun->flags & TUN_TYPE_MASK) { case IFF_TUN: dev->netdev_ops = &tun_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; /* Point-to-Point TUN Device */ dev->hard_header_len = 0; dev->addr_len = 0; dev->mtu = 1500; /* Zero header length */ dev->type = ARPHRD_NONE; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; break; case IFF_TAP: dev->netdev_ops = &tap_netdev_ops; /* Ethernet TAP Device */ ether_setup(dev); dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; eth_hw_addr_random(dev); /* Currently tun does not support XDP, only tap does. */ dev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | NETDEV_XDP_ACT_NDO_XMIT; break; } dev->min_mtu = MIN_MTU; dev->max_mtu = MAX_MTU - dev->hard_header_len; } static bool tun_sock_writeable(struct tun_struct *tun, struct tun_file *tfile) { struct sock *sk = tfile->socket.sk; return (tun->dev->flags & IFF_UP) && sock_writeable(sk); } /* Character device part */ /* Poll */ static __poll_t tun_chr_poll(struct file *file, poll_table *wait) { struct tun_file *tfile = file->private_data; struct tun_struct *tun = tun_get(tfile); struct sock *sk; __poll_t mask = 0; if (!tun) return EPOLLERR; sk = tfile->socket.sk; poll_wait(file, sk_sleep(sk), wait); if (!ptr_ring_empty(&tfile->tx_ring)) mask |= EPOLLIN | EPOLLRDNORM; /* Make sure SOCKWQ_ASYNC_NOSPACE is set if not writable to * guarantee EPOLLOUT to be raised by either here or * tun_sock_write_space(). Then process could get notification * after it writes to a down device and meets -EIO. */ if (tun_sock_writeable(tun, tfile) || (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) && tun_sock_writeable(tun, tfile))) mask |= EPOLLOUT | EPOLLWRNORM; if (tun->dev->reg_state != NETREG_REGISTERED) mask = EPOLLERR; tun_put(tun); return mask; } static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile, size_t len, const struct iov_iter *it) { struct sk_buff *skb; size_t linear; int err; int i; if (it->nr_segs > MAX_SKB_FRAGS + 1 || len > (ETH_MAX_MTU - NET_SKB_PAD - NET_IP_ALIGN)) return ERR_PTR(-EMSGSIZE); local_bh_disable(); skb = napi_get_frags(&tfile->napi); local_bh_enable(); if (!skb) return ERR_PTR(-ENOMEM); linear = iov_iter_single_seg_count(it); err = __skb_grow(skb, linear); if (err) goto free; skb->len = len; skb->data_len = len - linear; skb->truesize += skb->data_len; for (i = 1; i < it->nr_segs; i++) { const struct iovec *iov = iter_iov(it); size_t fragsz = iov->iov_len; struct page *page; void *frag; if (fragsz == 0 || fragsz > PAGE_SIZE) { err = -EINVAL; goto free; } frag = netdev_alloc_frag(fragsz); if (!frag) { err = -ENOMEM; goto free; } page = virt_to_head_page(frag); skb_fill_page_desc(skb, i - 1, page, frag - page_address(page), fragsz); } return skb; free: /* frees skb and all frags allocated with napi_alloc_frag() */ napi_free_frags(&tfile->napi); return ERR_PTR(err); } /* prepad is the amount to reserve at front. len is length after that. * linear is a hint as to how much to copy (usually headers). */ static struct sk_buff *tun_alloc_skb(struct tun_file *tfile, size_t prepad, size_t len, size_t linear, int noblock) { struct sock *sk = tfile->socket.sk; struct sk_buff *skb; int err; /* Under a page? Don't bother with paged skb. */ if (prepad + len < PAGE_SIZE) linear = len; if (len - linear > MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) linear = len - MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER); skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, &err, PAGE_ALLOC_COSTLY_ORDER); if (!skb) return ERR_PTR(err); skb_reserve(skb, prepad); skb_put(skb, linear); skb->data_len = len - linear; skb->len += len - linear; return skb; } static void tun_rx_batched(struct tun_struct *tun, struct tun_file *tfile, struct sk_buff *skb, int more) { struct sk_buff_head *queue = &tfile->sk.sk_write_queue; struct sk_buff_head process_queue; u32 rx_batched = tun->rx_batched; bool rcv = false; if (!rx_batched || (!more && skb_queue_empty(queue))) { local_bh_disable(); skb_record_rx_queue(skb, tfile->queue_index); netif_receive_skb(skb); local_bh_enable(); return; } spin_lock(&queue->lock); if (!more || skb_queue_len(queue) == rx_batched) { __skb_queue_head_init(&process_queue); skb_queue_splice_tail_init(queue, &process_queue); rcv = true; } else { __skb_queue_tail(queue, skb); } spin_unlock(&queue->lock); if (rcv) { struct sk_buff *nskb; local_bh_disable(); while ((nskb = __skb_dequeue(&process_queue))) { skb_record_rx_queue(nskb, tfile->queue_index); netif_receive_skb(nskb); } skb_record_rx_queue(skb, tfile->queue_index); netif_receive_skb(skb); local_bh_enable(); } } static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile, int len, int noblock, bool zerocopy) { if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) return false; if (tfile->socket.sk->sk_sndbuf != INT_MAX) return false; if (!noblock) return false; if (zerocopy) return false; if (SKB_DATA_ALIGN(len + TUN_RX_PAD + XDP_PACKET_HEADROOM) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE) return false; return true; } static struct sk_buff *__tun_build_skb(struct tun_file *tfile, struct page_frag *alloc_frag, char *buf, int buflen, int len, int pad) { struct sk_buff *skb = build_skb(buf, buflen); if (!skb) return ERR_PTR(-ENOMEM); skb_reserve(skb, pad); skb_put(skb, len); skb_set_owner_w(skb, tfile->socket.sk); get_page(alloc_frag->page); alloc_frag->offset += buflen; return skb; } static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog, struct xdp_buff *xdp, u32 act) { int err; switch (act) { case XDP_REDIRECT: err = xdp_do_redirect(tun->dev, xdp, xdp_prog); if (err) return err; break; case XDP_TX: err = tun_xdp_tx(tun->dev, xdp); if (err < 0) return err; break; case XDP_PASS: break; default: bpf_warn_invalid_xdp_action(tun->dev, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(tun->dev, xdp_prog, act); fallthrough; case XDP_DROP: dev_core_stats_rx_dropped_inc(tun->dev); break; } return act; } static struct sk_buff *tun_build_skb(struct tun_struct *tun, struct tun_file *tfile, struct iov_iter *from, struct virtio_net_hdr *hdr, int len, int *skb_xdp) { struct page_frag *alloc_frag = ¤t->task_frag; struct bpf_prog *xdp_prog; int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); char *buf; size_t copied; int pad = TUN_RX_PAD; int err = 0; rcu_read_lock(); xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) pad += XDP_PACKET_HEADROOM; buflen += SKB_DATA_ALIGN(len + pad); rcu_read_unlock(); alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES); if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL))) return ERR_PTR(-ENOMEM); buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; copied = copy_page_from_iter(alloc_frag->page, alloc_frag->offset + pad, len, from); if (copied != len) return ERR_PTR(-EFAULT); /* There's a small window that XDP may be set after the check * of xdp_prog above, this should be rare and for simplicity * we do XDP on skb in case the headroom is not enough. */ if (hdr->gso_type || !xdp_prog) { *skb_xdp = 1; return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad); } *skb_xdp = 0; local_bh_disable(); rcu_read_lock(); xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) { struct xdp_buff xdp; u32 act; xdp_init_buff(&xdp, buflen, &tfile->xdp_rxq); xdp_prepare_buff(&xdp, buf, pad, len, false); act = bpf_prog_run_xdp(xdp_prog, &xdp); if (act == XDP_REDIRECT || act == XDP_TX) { get_page(alloc_frag->page); alloc_frag->offset += buflen; } err = tun_xdp_act(tun, xdp_prog, &xdp, act); if (err < 0) { if (act == XDP_REDIRECT || act == XDP_TX) put_page(alloc_frag->page); goto out; } if (err == XDP_REDIRECT) xdp_do_flush(); if (err != XDP_PASS) goto out; pad = xdp.data - xdp.data_hard_start; len = xdp.data_end - xdp.data; } rcu_read_unlock(); local_bh_enable(); return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad); out: rcu_read_unlock(); local_bh_enable(); return NULL; } /* Get packet from user space buffer */ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, void *msg_control, struct iov_iter *from, int noblock, bool more) { struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) }; struct sk_buff *skb; size_t total_len = iov_iter_count(from); size_t len = total_len, align = tun->align, linear; struct virtio_net_hdr gso = { 0 }; int good_linear; int copylen; bool zerocopy = false; int err; u32 rxhash = 0; int skb_xdp = 1; bool frags = tun_napi_frags_enabled(tfile); enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (!(tun->flags & IFF_NO_PI)) { if (len < sizeof(pi)) return -EINVAL; len -= sizeof(pi); if (!copy_from_iter_full(&pi, sizeof(pi), from)) return -EFAULT; } if (tun->flags & IFF_VNET_HDR) { int vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); if (len < vnet_hdr_sz) return -EINVAL; len -= vnet_hdr_sz; if (!copy_from_iter_full(&gso, sizeof(gso), from)) return -EFAULT; if ((gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && tun16_to_cpu(tun, gso.csum_start) + tun16_to_cpu(tun, gso.csum_offset) + 2 > tun16_to_cpu(tun, gso.hdr_len)) gso.hdr_len = cpu_to_tun16(tun, tun16_to_cpu(tun, gso.csum_start) + tun16_to_cpu(tun, gso.csum_offset) + 2); if (tun16_to_cpu(tun, gso.hdr_len) > len) return -EINVAL; iov_iter_advance(from, vnet_hdr_sz - sizeof(gso)); } if ((tun->flags & TUN_TYPE_MASK) == IFF_TAP) { align += NET_IP_ALIGN; if (unlikely(len < ETH_HLEN || (gso.hdr_len && tun16_to_cpu(tun, gso.hdr_len) < ETH_HLEN))) return -EINVAL; } good_linear = SKB_MAX_HEAD(align); if (msg_control) { struct iov_iter i = *from; /* There are 256 bytes to be copied in skb, so there is * enough room for skb expand head in case it is used. * The rest of the buffer is mapped from userspace. */ copylen = gso.hdr_len ? tun16_to_cpu(tun, gso.hdr_len) : GOODCOPY_LEN; if (copylen > good_linear) copylen = good_linear; linear = copylen; iov_iter_advance(&i, copylen); if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS) zerocopy = true; } if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) { /* For the packet that is not easy to be processed * (e.g gso or jumbo packet), we will do it at after * skb was created with generic XDP routine. */ skb = tun_build_skb(tun, tfile, from, &gso, len, &skb_xdp); err = PTR_ERR_OR_ZERO(skb); if (err) goto drop; if (!skb) return total_len; } else { if (!zerocopy) { copylen = len; if (tun16_to_cpu(tun, gso.hdr_len) > good_linear) linear = good_linear; else linear = tun16_to_cpu(tun, gso.hdr_len); } if (frags) { mutex_lock(&tfile->napi_mutex); skb = tun_napi_alloc_frags(tfile, copylen, from); /* tun_napi_alloc_frags() enforces a layout for the skb. * If zerocopy is enabled, then this layout will be * overwritten by zerocopy_sg_from_iter(). */ zerocopy = false; } else { if (!linear) linear = min_t(size_t, good_linear, copylen); skb = tun_alloc_skb(tfile, align, copylen, linear, noblock); } err = PTR_ERR_OR_ZERO(skb); if (err) goto drop; if (zerocopy) err = zerocopy_sg_from_iter(skb, from); else err = skb_copy_datagram_from_iter(skb, 0, from, len); if (err) { err = -EFAULT; drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT; goto drop; } } if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) { atomic_long_inc(&tun->rx_frame_errors); err = -EINVAL; goto free_skb; } switch (tun->flags & TUN_TYPE_MASK) { case IFF_TUN: if (tun->flags & IFF_NO_PI) { u8 ip_version = skb->len ? (skb->data[0] >> 4) : 0; switch (ip_version) { case 4: pi.proto = htons(ETH_P_IP); break; case 6: pi.proto = htons(ETH_P_IPV6); break; default: err = -EINVAL; goto drop; } } skb_reset_mac_header(skb); skb->protocol = pi.proto; skb->dev = tun->dev; break; case IFF_TAP: if (frags && !pskb_may_pull(skb, ETH_HLEN)) { err = -ENOMEM; drop_reason = SKB_DROP_REASON_HDR_TRUNC; goto drop; } skb->protocol = eth_type_trans(skb, tun->dev); break; } /* copy skb_ubuf_info for callback when skb has no error */ if (zerocopy) { skb_zcopy_init(skb, msg_control); } else if (msg_control) { struct ubuf_info *uarg = msg_control; uarg->callback(NULL, uarg, false); } skb_reset_network_header(skb); skb_probe_transport_header(skb); skb_record_rx_queue(skb, tfile->queue_index); if (skb_xdp) { struct bpf_prog *xdp_prog; int ret; local_bh_disable(); rcu_read_lock(); xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) { ret = do_xdp_generic(xdp_prog, skb); if (ret != XDP_PASS) { rcu_read_unlock(); local_bh_enable(); goto unlock_frags; } } rcu_read_unlock(); local_bh_enable(); } /* Compute the costly rx hash only if needed for flow updates. * We may get a very small possibility of OOO during switching, not * worth to optimize. */ if (!rcu_access_pointer(tun->steering_prog) && tun->numqueues > 1 && !tfile->detached) rxhash = __skb_get_hash_symmetric(skb); rcu_read_lock(); if (unlikely(!(tun->dev->flags & IFF_UP))) { err = -EIO; rcu_read_unlock(); drop_reason = SKB_DROP_REASON_DEV_READY; goto drop; } if (frags) { u32 headlen; /* Exercise flow dissector code path. */ skb_push(skb, ETH_HLEN); headlen = eth_get_headlen(tun->dev, skb->data, skb_headlen(skb)); if (unlikely(headlen > skb_headlen(skb))) { WARN_ON_ONCE(1); err = -ENOMEM; dev_core_stats_rx_dropped_inc(tun->dev); napi_busy: napi_free_frags(&tfile->napi); rcu_read_unlock(); mutex_unlock(&tfile->napi_mutex); return err; } if (likely(napi_schedule_prep(&tfile->napi))) { local_bh_disable(); napi_gro_frags(&tfile->napi); napi_complete(&tfile->napi); local_bh_enable(); } else { err = -EBUSY; goto napi_busy; } mutex_unlock(&tfile->napi_mutex); } else if (tfile->napi_enabled) { struct sk_buff_head *queue = &tfile->sk.sk_write_queue; int queue_len; spin_lock_bh(&queue->lock); if (unlikely(tfile->detached)) { spin_unlock_bh(&queue->lock); rcu_read_unlock(); err = -EBUSY; goto free_skb; } __skb_queue_tail(queue, skb); queue_len = skb_queue_len(queue); spin_unlock(&queue->lock); if (!more || queue_len > NAPI_POLL_WEIGHT) napi_schedule(&tfile->napi); local_bh_enable(); } else if (!IS_ENABLED(CONFIG_4KSTACKS)) { tun_rx_batched(tun, tfile, skb, more); } else { netif_rx(skb); } rcu_read_unlock(); preempt_disable(); dev_sw_netstats_rx_add(tun->dev, len); preempt_enable(); if (rxhash) tun_flow_update(tun, rxhash, tfile); return total_len; drop: if (err != -EAGAIN) dev_core_stats_rx_dropped_inc(tun->dev); free_skb: if (!IS_ERR_OR_NULL(skb)) kfree_skb_reason(skb, drop_reason); unlock_frags: if (frags) { tfile->napi.skb = NULL; mutex_unlock(&tfile->napi_mutex); } return err ?: total_len; } static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct tun_file *tfile = file->private_data; struct tun_struct *tun = tun_get(tfile); ssize_t result; int noblock = 0; if (!tun) return -EBADFD; if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT)) noblock = 1; result = tun_get_user(tun, tfile, NULL, from, noblock, false); tun_put(tun); return result; } static ssize_t tun_put_user_xdp(struct tun_struct *tun, struct tun_file *tfile, struct xdp_frame *xdp_frame, struct iov_iter *iter) { int vnet_hdr_sz = 0; size_t size = xdp_frame->len; size_t ret; if (tun->flags & IFF_VNET_HDR) { struct virtio_net_hdr gso = { 0 }; vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); if (unlikely(iov_iter_count(iter) < vnet_hdr_sz)) return -EINVAL; if (unlikely(copy_to_iter(&gso, sizeof(gso), iter) != sizeof(gso))) return -EFAULT; iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso)); } ret = copy_to_iter(xdp_frame->data, size, iter) + vnet_hdr_sz; preempt_disable(); dev_sw_netstats_tx_add(tun->dev, 1, ret); preempt_enable(); return ret; } /* Put packet to the user space buffer */ static ssize_t tun_put_user(struct tun_struct *tun, struct tun_file *tfile, struct sk_buff *skb, struct iov_iter *iter) { struct tun_pi pi = { 0, skb->protocol }; ssize_t total; int vlan_offset = 0; int vlan_hlen = 0; int vnet_hdr_sz = 0; if (skb_vlan_tag_present(skb)) vlan_hlen = VLAN_HLEN; if (tun->flags & IFF_VNET_HDR) vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); total = skb->len + vlan_hlen + vnet_hdr_sz; if (!(tun->flags & IFF_NO_PI)) { if (iov_iter_count(iter) < sizeof(pi)) return -EINVAL; total += sizeof(pi); if (iov_iter_count(iter) < total) { /* Packet will be striped */ pi.flags |= TUN_PKT_STRIP; } if (copy_to_iter(&pi, sizeof(pi), iter) != sizeof(pi)) return -EFAULT; } if (vnet_hdr_sz) { struct virtio_net_hdr gso; if (iov_iter_count(iter) < vnet_hdr_sz) return -EINVAL; if (virtio_net_hdr_from_skb(skb, &gso, tun_is_little_endian(tun), true, vlan_hlen)) { struct skb_shared_info *sinfo = skb_shinfo(skb); pr_err("unexpected GSO type: " "0x%x, gso_size %d, hdr_len %d\n", sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size), tun16_to_cpu(tun, gso.hdr_len)); print_hex_dump(KERN_ERR, "tun: ", DUMP_PREFIX_NONE, 16, 1, skb->head, min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true); WARN_ON_ONCE(1); return -EINVAL; } if (copy_to_iter(&gso, sizeof(gso), iter) != sizeof(gso)) return -EFAULT; iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso)); } if (vlan_hlen) { int ret; struct veth veth; veth.h_vlan_proto = skb->vlan_proto; veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb)); vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto); ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset); if (ret || !iov_iter_count(iter)) goto done; ret = copy_to_iter(&veth, sizeof(veth), iter); if (ret != sizeof(veth) || !iov_iter_count(iter)) goto done; } skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset); done: /* caller is in process context, */ preempt_disable(); dev_sw_netstats_tx_add(tun->dev, 1, skb->len + vlan_hlen); preempt_enable(); return total; } static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err) { DECLARE_WAITQUEUE(wait, current); void *ptr = NULL; int error = 0; ptr = ptr_ring_consume(&tfile->tx_ring); if (ptr) goto out; if (noblock) { error = -EAGAIN; goto out; } add_wait_queue(&tfile->socket.wq.wait, &wait); while (1) { set_current_state(TASK_INTERRUPTIBLE); ptr = ptr_ring_consume(&tfile->tx_ring); if (ptr) break; if (signal_pending(current)) { error = -ERESTARTSYS; break; } if (tfile->socket.sk->sk_shutdown & RCV_SHUTDOWN) { error = -EFAULT; break; } schedule(); } __set_current_state(TASK_RUNNING); remove_wait_queue(&tfile->socket.wq.wait, &wait); out: *err = error; return ptr; } static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, struct iov_iter *to, int noblock, void *ptr) { ssize_t ret; int err; if (!iov_iter_count(to)) { tun_ptr_free(ptr); return 0; } if (!ptr) { /* Read frames from ring */ ptr = tun_ring_recv(tfile, noblock, &err); if (!ptr) return err; } if (tun_is_xdp_frame(ptr)) { struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); ret = tun_put_user_xdp(tun, tfile, xdpf, to); xdp_return_frame(xdpf); } else { struct sk_buff *skb = ptr; ret = tun_put_user(tun, tfile, skb, to); if (unlikely(ret < 0)) kfree_skb(skb); else consume_skb(skb); } return ret; } static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct tun_file *tfile = file->private_data; struct tun_struct *tun = tun_get(tfile); ssize_t len = iov_iter_count(to), ret; int noblock = 0; if (!tun) return -EBADFD; if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT)) noblock = 1; ret = tun_do_read(tun, tfile, to, noblock, NULL); ret = min_t(ssize_t, ret, len); if (ret > 0) iocb->ki_pos = ret; tun_put(tun); return ret; } static void tun_prog_free(struct rcu_head *rcu) { struct tun_prog *prog = container_of(rcu, struct tun_prog, rcu); bpf_prog_destroy(prog->prog); kfree(prog); } static int __tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p, struct bpf_prog *prog) { struct tun_prog *old, *new = NULL; if (prog) { new = kmalloc(sizeof(*new), GFP_KERNEL); if (!new) return -ENOMEM; new->prog = prog; } spin_lock_bh(&tun->lock); old = rcu_dereference_protected(*prog_p, lockdep_is_held(&tun->lock)); rcu_assign_pointer(*prog_p, new); spin_unlock_bh(&tun->lock); if (old) call_rcu(&old->rcu, tun_prog_free); return 0; } static void tun_free_netdev(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); BUG_ON(!(list_empty(&tun->disabled))); free_percpu(dev->tstats); tun_flow_uninit(tun); security_tun_dev_free_security(tun->security); __tun_set_ebpf(tun, &tun->steering_prog, NULL); __tun_set_ebpf(tun, &tun->filter_prog, NULL); } static void tun_setup(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); tun->owner = INVALID_UID; tun->group = INVALID_GID; tun_default_link_ksettings(dev, &tun->link_ksettings); dev->ethtool_ops = &tun_ethtool_ops; dev->needs_free_netdev = true; dev->priv_destructor = tun_free_netdev; /* We prefer our own queue length */ dev->tx_queue_len = TUN_READQ_SIZE; } /* Trivial set of netlink ops to allow deleting tun or tap * device with netlink. */ static int tun_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { NL_SET_ERR_MSG(extack, "tun/tap creation via rtnetlink is not supported."); return -EOPNOTSUPP; } static size_t tun_get_size(const struct net_device *dev) { BUILD_BUG_ON(sizeof(u32) != sizeof(uid_t)); BUILD_BUG_ON(sizeof(u32) != sizeof(gid_t)); return nla_total_size(sizeof(uid_t)) + /* OWNER */ nla_total_size(sizeof(gid_t)) + /* GROUP */ nla_total_size(sizeof(u8)) + /* TYPE */ nla_total_size(sizeof(u8)) + /* PI */ nla_total_size(sizeof(u8)) + /* VNET_HDR */ nla_total_size(sizeof(u8)) + /* PERSIST */ nla_total_size(sizeof(u8)) + /* MULTI_QUEUE */ nla_total_size(sizeof(u32)) + /* NUM_QUEUES */ nla_total_size(sizeof(u32)) + /* NUM_DISABLED_QUEUES */ 0; } static int tun_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); if (nla_put_u8(skb, IFLA_TUN_TYPE, tun->flags & TUN_TYPE_MASK)) goto nla_put_failure; if (uid_valid(tun->owner) && nla_put_u32(skb, IFLA_TUN_OWNER, from_kuid_munged(current_user_ns(), tun->owner))) goto nla_put_failure; if (gid_valid(tun->group) && nla_put_u32(skb, IFLA_TUN_GROUP, from_kgid_munged(current_user_ns(), tun->group))) goto nla_put_failure; if (nla_put_u8(skb, IFLA_TUN_PI, !(tun->flags & IFF_NO_PI))) goto nla_put_failure; if (nla_put_u8(skb, IFLA_TUN_VNET_HDR, !!(tun->flags & IFF_VNET_HDR))) goto nla_put_failure; if (nla_put_u8(skb, IFLA_TUN_PERSIST, !!(tun->flags & IFF_PERSIST))) goto nla_put_failure; if (nla_put_u8(skb, IFLA_TUN_MULTI_QUEUE, !!(tun->flags & IFF_MULTI_QUEUE))) goto nla_put_failure; if (tun->flags & IFF_MULTI_QUEUE) { if (nla_put_u32(skb, IFLA_TUN_NUM_QUEUES, tun->numqueues)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_TUN_NUM_DISABLED_QUEUES, tun->numdisabled)) goto nla_put_failure; } return 0; nla_put_failure: return -EMSGSIZE; } static struct rtnl_link_ops tun_link_ops __read_mostly = { .kind = DRV_NAME, .priv_size = sizeof(struct tun_struct), .setup = tun_setup, .validate = tun_validate, .get_size = tun_get_size, .fill_info = tun_fill_info, }; static void tun_sock_write_space(struct sock *sk) { struct tun_file *tfile; wait_queue_head_t *wqueue; if (!sock_writeable(sk)) return; if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags)) return; wqueue = sk_sleep(sk); if (wqueue && waitqueue_active(wqueue)) wake_up_interruptible_sync_poll(wqueue, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); tfile = container_of(sk, struct tun_file, sk); kill_fasync(&tfile->fasync, SIGIO, POLL_OUT); } static void tun_put_page(struct tun_page *tpage) { if (tpage->page) __page_frag_cache_drain(tpage->page, tpage->count); } static int tun_xdp_one(struct tun_struct *tun, struct tun_file *tfile, struct xdp_buff *xdp, int *flush, struct tun_page *tpage) { unsigned int datasize = xdp->data_end - xdp->data; struct tun_xdp_hdr *hdr = xdp->data_hard_start; struct virtio_net_hdr *gso = &hdr->gso; struct bpf_prog *xdp_prog; struct sk_buff *skb = NULL; struct sk_buff_head *queue; u32 rxhash = 0, act; int buflen = hdr->buflen; int ret = 0; bool skb_xdp = false; struct page *page; xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) { if (gso->gso_type) { skb_xdp = true; goto build; } xdp_init_buff(xdp, buflen, &tfile->xdp_rxq); xdp_set_data_meta_invalid(xdp); act = bpf_prog_run_xdp(xdp_prog, xdp); ret = tun_xdp_act(tun, xdp_prog, xdp, act); if (ret < 0) { put_page(virt_to_head_page(xdp->data)); return ret; } switch (ret) { case XDP_REDIRECT: *flush = true; fallthrough; case XDP_TX: return 0; case XDP_PASS: break; default: page = virt_to_head_page(xdp->data); if (tpage->page == page) { ++tpage->count; } else { tun_put_page(tpage); tpage->page = page; tpage->count = 1; } return 0; } } build: skb = build_skb(xdp->data_hard_start, buflen); if (!skb) { ret = -ENOMEM; goto out; } skb_reserve(skb, xdp->data - xdp->data_hard_start); skb_put(skb, xdp->data_end - xdp->data); if (virtio_net_hdr_to_skb(skb, gso, tun_is_little_endian(tun))) { atomic_long_inc(&tun->rx_frame_errors); kfree_skb(skb); ret = -EINVAL; goto out; } skb->protocol = eth_type_trans(skb, tun->dev); skb_reset_network_header(skb); skb_probe_transport_header(skb); skb_record_rx_queue(skb, tfile->queue_index); if (skb_xdp) { ret = do_xdp_generic(xdp_prog, skb); if (ret != XDP_PASS) { ret = 0; goto out; } } if (!rcu_dereference(tun->steering_prog) && tun->numqueues > 1 && !tfile->detached) rxhash = __skb_get_hash_symmetric(skb); if (tfile->napi_enabled) { queue = &tfile->sk.sk_write_queue; spin_lock(&queue->lock); if (unlikely(tfile->detached)) { spin_unlock(&queue->lock); kfree_skb(skb); return -EBUSY; } __skb_queue_tail(queue, skb); spin_unlock(&queue->lock); ret = 1; } else { netif_receive_skb(skb); ret = 0; } /* No need to disable preemption here since this function is * always called with bh disabled */ dev_sw_netstats_rx_add(tun->dev, datasize); if (rxhash) tun_flow_update(tun, rxhash, tfile); out: return ret; } static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { int ret, i; struct tun_file *tfile = container_of(sock, struct tun_file, socket); struct tun_struct *tun = tun_get(tfile); struct tun_msg_ctl *ctl = m->msg_control; struct xdp_buff *xdp; if (!tun) return -EBADFD; if (m->msg_controllen == sizeof(struct tun_msg_ctl) && ctl && ctl->type == TUN_MSG_PTR) { struct tun_page tpage; int n = ctl->num; int flush = 0, queued = 0; memset(&tpage, 0, sizeof(tpage)); local_bh_disable(); rcu_read_lock(); for (i = 0; i < n; i++) { xdp = &((struct xdp_buff *)ctl->ptr)[i]; ret = tun_xdp_one(tun, tfile, xdp, &flush, &tpage); if (ret > 0) queued += ret; } if (flush) xdp_do_flush(); if (tfile->napi_enabled && queued > 0) napi_schedule(&tfile->napi); rcu_read_unlock(); local_bh_enable(); tun_put_page(&tpage); ret = total_len; goto out; } ret = tun_get_user(tun, tfile, ctl ? ctl->ptr : NULL, &m->msg_iter, m->msg_flags & MSG_DONTWAIT, m->msg_flags & MSG_MORE); out: tun_put(tun); return ret; } static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len, int flags) { struct tun_file *tfile = container_of(sock, struct tun_file, socket); struct tun_struct *tun = tun_get(tfile); void *ptr = m->msg_control; int ret; if (!tun) { ret = -EBADFD; goto out_free; } if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) { ret = -EINVAL; goto out_put_tun; } if (flags & MSG_ERRQUEUE) { ret = sock_recv_errqueue(sock->sk, m, total_len, SOL_PACKET, TUN_TX_TIMESTAMP); goto out; } ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT, ptr); if (ret > (ssize_t)total_len) { m->msg_flags |= MSG_TRUNC; ret = flags & MSG_TRUNC ? ret : total_len; } out: tun_put(tun); return ret; out_put_tun: tun_put(tun); out_free: tun_ptr_free(ptr); return ret; } static int tun_ptr_peek_len(void *ptr) { if (likely(ptr)) { if (tun_is_xdp_frame(ptr)) { struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); return xdpf->len; } return __skb_array_len_with_tag(ptr); } else { return 0; } } static int tun_peek_len(struct socket *sock) { struct tun_file *tfile = container_of(sock, struct tun_file, socket); struct tun_struct *tun; int ret = 0; tun = tun_get(tfile); if (!tun) return 0; ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len); tun_put(tun); return ret; } /* Ops structure to mimic raw sockets with tun */ static const struct proto_ops tun_socket_ops = { .peek_len = tun_peek_len, .sendmsg = tun_sendmsg, .recvmsg = tun_recvmsg, }; static struct proto tun_proto = { .name = "tun", .owner = THIS_MODULE, .obj_size = sizeof(struct tun_file), }; static int tun_flags(struct tun_struct *tun) { return tun->flags & (TUN_FEATURES | IFF_PERSIST | IFF_TUN | IFF_TAP); } static ssize_t tun_flags_show(struct device *dev, struct device_attribute *attr, char *buf) { struct tun_struct *tun = netdev_priv(to_net_dev(dev)); return sysfs_emit(buf, "0x%x\n", tun_flags(tun)); } static ssize_t owner_show(struct device *dev, struct device_attribute *attr, char *buf) { struct tun_struct *tun = netdev_priv(to_net_dev(dev)); return uid_valid(tun->owner)? sysfs_emit(buf, "%u\n", from_kuid_munged(current_user_ns(), tun->owner)) : sysfs_emit(buf, "-1\n"); } static ssize_t group_show(struct device *dev, struct device_attribute *attr, char *buf) { struct tun_struct *tun = netdev_priv(to_net_dev(dev)); return gid_valid(tun->group) ? sysfs_emit(buf, "%u\n", from_kgid_munged(current_user_ns(), tun->group)) : sysfs_emit(buf, "-1\n"); } static DEVICE_ATTR_RO(tun_flags); static DEVICE_ATTR_RO(owner); static DEVICE_ATTR_RO(group); static struct attribute *tun_dev_attrs[] = { &dev_attr_tun_flags.attr, &dev_attr_owner.attr, &dev_attr_group.attr, NULL }; static const struct attribute_group tun_attr_group = { .attrs = tun_dev_attrs }; static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) { struct tun_struct *tun; struct tun_file *tfile = file->private_data; struct net_device *dev; int err; if (tfile->detached) return -EINVAL; if ((ifr->ifr_flags & IFF_NAPI_FRAGS)) { if (!capable(CAP_NET_ADMIN)) return -EPERM; if (!(ifr->ifr_flags & IFF_NAPI) || (ifr->ifr_flags & TUN_TYPE_MASK) != IFF_TAP) return -EINVAL; } dev = __dev_get_by_name(net, ifr->ifr_name); if (dev) { if (ifr->ifr_flags & IFF_TUN_EXCL) return -EBUSY; if ((ifr->ifr_flags & IFF_TUN) && dev->netdev_ops == &tun_netdev_ops) tun = netdev_priv(dev); else if ((ifr->ifr_flags & IFF_TAP) && dev->netdev_ops == &tap_netdev_ops) tun = netdev_priv(dev); else return -EINVAL; if (!!(ifr->ifr_flags & IFF_MULTI_QUEUE) != !!(tun->flags & IFF_MULTI_QUEUE)) return -EINVAL; if (tun_not_capable(tun)) return -EPERM; err = security_tun_dev_open(tun->security); if (err < 0) return err; err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER, ifr->ifr_flags & IFF_NAPI, ifr->ifr_flags & IFF_NAPI_FRAGS, true); if (err < 0) return err; if (tun->flags & IFF_MULTI_QUEUE && (tun->numqueues + tun->numdisabled > 1)) { /* One or more queue has already been attached, no need * to initialize the device again. */ netdev_state_change(dev); return 0; } tun->flags = (tun->flags & ~TUN_FEATURES) | (ifr->ifr_flags & TUN_FEATURES); netdev_state_change(dev); } else { char *name; unsigned long flags = 0; int queues = ifr->ifr_flags & IFF_MULTI_QUEUE ? MAX_TAP_QUEUES : 1; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; err = security_tun_dev_create(); if (err < 0) return err; /* Set dev type */ if (ifr->ifr_flags & IFF_TUN) { /* TUN device */ flags |= IFF_TUN; name = "tun%d"; } else if (ifr->ifr_flags & IFF_TAP) { /* TAP device */ flags |= IFF_TAP; name = "tap%d"; } else return -EINVAL; if (*ifr->ifr_name) name = ifr->ifr_name; dev = alloc_netdev_mqs(sizeof(struct tun_struct), name, NET_NAME_UNKNOWN, tun_setup, queues, queues); if (!dev) return -ENOMEM; dev_net_set(dev, net); dev->rtnl_link_ops = &tun_link_ops; dev->ifindex = tfile->ifindex; dev->sysfs_groups[0] = &tun_attr_group; tun = netdev_priv(dev); tun->dev = dev; tun->flags = flags; tun->txflt.count = 0; tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr); tun->align = NET_SKB_PAD; tun->filter_attached = false; tun->sndbuf = tfile->socket.sk->sk_sndbuf; tun->rx_batched = 0; RCU_INIT_POINTER(tun->steering_prog, NULL); tun->ifr = ifr; tun->file = file; tun_net_initialize(dev); err = register_netdevice(tun->dev); if (err < 0) { free_netdev(dev); return err; } /* free_netdev() won't check refcnt, to avoid race * with dev_put() we need publish tun after registration. */ rcu_assign_pointer(tfile->tun, tun); } if (ifr->ifr_flags & IFF_NO_CARRIER) netif_carrier_off(tun->dev); else netif_carrier_on(tun->dev); /* Make sure persistent devices do not get stuck in * xoff state. */ if (netif_running(tun->dev)) netif_tx_wake_all_queues(tun->dev); strcpy(ifr->ifr_name, tun->dev->name); return 0; } static void tun_get_iff(struct tun_struct *tun, struct ifreq *ifr) { strcpy(ifr->ifr_name, tun->dev->name); ifr->ifr_flags = tun_flags(tun); } /* This is like a cut-down ethtool ops, except done via tun fd so no * privs required. */ static int set_offload(struct tun_struct *tun, unsigned long arg) { netdev_features_t features = 0; if (arg & TUN_F_CSUM) { features |= NETIF_F_HW_CSUM; arg &= ~TUN_F_CSUM; if (arg & (TUN_F_TSO4|TUN_F_TSO6)) { if (arg & TUN_F_TSO_ECN) { features |= NETIF_F_TSO_ECN; arg &= ~TUN_F_TSO_ECN; } if (arg & TUN_F_TSO4) features |= NETIF_F_TSO; if (arg & TUN_F_TSO6) features |= NETIF_F_TSO6; arg &= ~(TUN_F_TSO4|TUN_F_TSO6); } arg &= ~TUN_F_UFO; /* TODO: for now USO4 and USO6 should work simultaneously */ if (arg & TUN_F_USO4 && arg & TUN_F_USO6) { features |= NETIF_F_GSO_UDP_L4; arg &= ~(TUN_F_USO4 | TUN_F_USO6); } } /* This gives the user a way to test for new features in future by * trying to set them. */ if (arg) return -EINVAL; tun->set_features = features; tun->dev->wanted_features &= ~TUN_USER_FEATURES; tun->dev->wanted_features |= features; netdev_update_features(tun->dev); return 0; } static void tun_detach_filter(struct tun_struct *tun, int n) { int i; struct tun_file *tfile; for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); lock_sock(tfile->socket.sk); sk_detach_filter(tfile->socket.sk); release_sock(tfile->socket.sk); } tun->filter_attached = false; } static int tun_attach_filter(struct tun_struct *tun) { int i, ret = 0; struct tun_file *tfile; for (i = 0; i < tun->numqueues; i++) { tfile = rtnl_dereference(tun->tfiles[i]); lock_sock(tfile->socket.sk); ret = sk_attach_filter(&tun->fprog, tfile->socket.sk); release_sock(tfile->socket.sk); if (ret) { tun_detach_filter(tun, i); return ret; } } tun->filter_attached = true; return ret; } static void tun_set_sndbuf(struct tun_struct *tun) { struct tun_file *tfile; int i; for (i = 0; i < tun->numqueues; i++) { tfile = rtnl_dereference(tun->tfiles[i]); tfile->socket.sk->sk_sndbuf = tun->sndbuf; } } static int tun_set_queue(struct file *file, struct ifreq *ifr) { struct tun_file *tfile = file->private_data; struct tun_struct *tun; int ret = 0; rtnl_lock(); if (ifr->ifr_flags & IFF_ATTACH_QUEUE) { tun = tfile->detached; if (!tun) { ret = -EINVAL; goto unlock; } ret = security_tun_dev_attach_queue(tun->security); if (ret < 0) goto unlock; ret = tun_attach(tun, file, false, tun->flags & IFF_NAPI, tun->flags & IFF_NAPI_FRAGS, true); } else if (ifr->ifr_flags & IFF_DETACH_QUEUE) { tun = rtnl_dereference(tfile->tun); if (!tun || !(tun->flags & IFF_MULTI_QUEUE) || tfile->detached) ret = -EINVAL; else __tun_detach(tfile, false); } else ret = -EINVAL; if (ret >= 0) netdev_state_change(tun->dev); unlock: rtnl_unlock(); return ret; } static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p, void __user *data) { struct bpf_prog *prog; int fd; if (copy_from_user(&fd, data, sizeof(fd))) return -EFAULT; if (fd == -1) { prog = NULL; } else { prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER); if (IS_ERR(prog)) return PTR_ERR(prog); } return __tun_set_ebpf(tun, prog_p, prog); } /* Return correct value for tun->dev->addr_len based on tun->dev->type. */ static unsigned char tun_get_addr_len(unsigned short type) { switch (type) { case ARPHRD_IP6GRE: case ARPHRD_TUNNEL6: return sizeof(struct in6_addr); case ARPHRD_IPGRE: case ARPHRD_TUNNEL: case ARPHRD_SIT: return 4; case ARPHRD_ETHER: return ETH_ALEN; case ARPHRD_IEEE802154: case ARPHRD_IEEE802154_MONITOR: return IEEE802154_EXTENDED_ADDR_LEN; case ARPHRD_PHONET_PIPE: case ARPHRD_PPP: case ARPHRD_NONE: return 0; case ARPHRD_6LOWPAN: return EUI64_ADDR_LEN; case ARPHRD_FDDI: return FDDI_K_ALEN; case ARPHRD_HIPPI: return HIPPI_ALEN; case ARPHRD_IEEE802: return FC_ALEN; case ARPHRD_ROSE: return ROSE_ADDR_LEN; case ARPHRD_NETROM: return AX25_ADDR_LEN; case ARPHRD_LOCALTLK: return LTALK_ALEN; default: return 0; } } static long __tun_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg, int ifreq_len) { struct tun_file *tfile = file->private_data; struct net *net = sock_net(&tfile->sk); struct tun_struct *tun; void __user* argp = (void __user*)arg; unsigned int ifindex, carrier; struct ifreq ifr; kuid_t owner; kgid_t group; int sndbuf; int vnet_hdr_sz; int le; int ret; bool do_notify = false; if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) { if (copy_from_user(&ifr, argp, ifreq_len)) return -EFAULT; } else { memset(&ifr, 0, sizeof(ifr)); } if (cmd == TUNGETFEATURES) { /* Currently this just means: "what IFF flags are valid?". * This is needed because we never checked for invalid flags on * TUNSETIFF. */ return put_user(IFF_TUN | IFF_TAP | IFF_NO_CARRIER | TUN_FEATURES, (unsigned int __user*)argp); } else if (cmd == TUNSETQUEUE) { return tun_set_queue(file, &ifr); } else if (cmd == SIOCGSKNS) { if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; return open_related_ns(&net->ns, get_net_ns); } rtnl_lock(); tun = tun_get(tfile); if (cmd == TUNSETIFF) { ret = -EEXIST; if (tun) goto unlock; ifr.ifr_name[IFNAMSIZ-1] = '\0'; ret = tun_set_iff(net, file, &ifr); if (ret) goto unlock; if (copy_to_user(argp, &ifr, ifreq_len)) ret = -EFAULT; goto unlock; } if (cmd == TUNSETIFINDEX) { ret = -EPERM; if (tun) goto unlock; ret = -EFAULT; if (copy_from_user(&ifindex, argp, sizeof(ifindex))) goto unlock; ret = 0; tfile->ifindex = ifindex; goto unlock; } ret = -EBADFD; if (!tun) goto unlock; netif_info(tun, drv, tun->dev, "tun_chr_ioctl cmd %u\n", cmd); net = dev_net(tun->dev); ret = 0; switch (cmd) { case TUNGETIFF: tun_get_iff(tun, &ifr); if (tfile->detached) ifr.ifr_flags |= IFF_DETACH_QUEUE; if (!tfile->socket.sk->sk_filter) ifr.ifr_flags |= IFF_NOFILTER; if (copy_to_user(argp, &ifr, ifreq_len)) ret = -EFAULT; break; case TUNSETNOCSUM: /* Disable/Enable checksum */ /* [unimplemented] */ netif_info(tun, drv, tun->dev, "ignored: set checksum %s\n", arg ? "disabled" : "enabled"); break; case TUNSETPERSIST: /* Disable/Enable persist mode. Keep an extra reference to the * module to prevent the module being unprobed. */ if (arg && !(tun->flags & IFF_PERSIST)) { tun->flags |= IFF_PERSIST; __module_get(THIS_MODULE); do_notify = true; } if (!arg && (tun->flags & IFF_PERSIST)) { tun->flags &= ~IFF_PERSIST; module_put(THIS_MODULE); do_notify = true; } netif_info(tun, drv, tun->dev, "persist %s\n", arg ? "enabled" : "disabled"); break; case TUNSETOWNER: /* Set owner of the device */ owner = make_kuid(current_user_ns(), arg); if (!uid_valid(owner)) { ret = -EINVAL; break; } tun->owner = owner; do_notify = true; netif_info(tun, drv, tun->dev, "owner set to %u\n", from_kuid(&init_user_ns, tun->owner)); break; case TUNSETGROUP: /* Set group of the device */ group = make_kgid(current_user_ns(), arg); if (!gid_valid(group)) { ret = -EINVAL; break; } tun->group = group; do_notify = true; netif_info(tun, drv, tun->dev, "group set to %u\n", from_kgid(&init_user_ns, tun->group)); break; case TUNSETLINK: /* Only allow setting the type when the interface is down */ if (tun->dev->flags & IFF_UP) { netif_info(tun, drv, tun->dev, "Linktype set failed because interface is up\n"); ret = -EBUSY; } else { ret = call_netdevice_notifiers(NETDEV_PRE_TYPE_CHANGE, tun->dev); ret = notifier_to_errno(ret); if (ret) { netif_info(tun, drv, tun->dev, "Refused to change device type\n"); break; } tun->dev->type = (int) arg; tun->dev->addr_len = tun_get_addr_len(tun->dev->type); netif_info(tun, drv, tun->dev, "linktype set to %d\n", tun->dev->type); call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE, tun->dev); } break; case TUNSETDEBUG: tun->msg_enable = (u32)arg; break; case TUNSETOFFLOAD: ret = set_offload(tun, arg); break; case TUNSETTXFILTER: /* Can be set only for TAPs */ ret = -EINVAL; if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) break; ret = update_filter(&tun->txflt, (void __user *)arg); break; case SIOCGIFHWADDR: /* Get hw address */ dev_get_mac_address(&ifr.ifr_hwaddr, net, tun->dev->name); if (copy_to_user(argp, &ifr, ifreq_len)) ret = -EFAULT; break; case SIOCSIFHWADDR: /* Set hw address */ ret = dev_set_mac_address_user(tun->dev, &ifr.ifr_hwaddr, NULL); break; case TUNGETSNDBUF: sndbuf = tfile->socket.sk->sk_sndbuf; if (copy_to_user(argp, &sndbuf, sizeof(sndbuf))) ret = -EFAULT; break; case TUNSETSNDBUF: if (copy_from_user(&sndbuf, argp, sizeof(sndbuf))) { ret = -EFAULT; break; } if (sndbuf <= 0) { ret = -EINVAL; break; } tun->sndbuf = sndbuf; tun_set_sndbuf(tun); break; case TUNGETVNETHDRSZ: vnet_hdr_sz = tun->vnet_hdr_sz; if (copy_to_user(argp, &vnet_hdr_sz, sizeof(vnet_hdr_sz))) ret = -EFAULT; break; case TUNSETVNETHDRSZ: if (copy_from_user(&vnet_hdr_sz, argp, sizeof(vnet_hdr_sz))) { ret = -EFAULT; break; } if (vnet_hdr_sz < (int)sizeof(struct virtio_net_hdr)) { ret = -EINVAL; break; } tun->vnet_hdr_sz = vnet_hdr_sz; break; case TUNGETVNETLE: le = !!(tun->flags & TUN_VNET_LE); if (put_user(le, (int __user *)argp)) ret = -EFAULT; break; case TUNSETVNETLE: if (get_user(le, (int __user *)argp)) { ret = -EFAULT; break; } if (le) tun->flags |= TUN_VNET_LE; else tun->flags &= ~TUN_VNET_LE; break; case TUNGETVNETBE: ret = tun_get_vnet_be(tun, argp); break; case TUNSETVNETBE: ret = tun_set_vnet_be(tun, argp); break; case TUNATTACHFILTER: /* Can be set only for TAPs */ ret = -EINVAL; if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) break; ret = -EFAULT; if (copy_from_user(&tun->fprog, argp, sizeof(tun->fprog))) break; ret = tun_attach_filter(tun); break; case TUNDETACHFILTER: /* Can be set only for TAPs */ ret = -EINVAL; if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) break; ret = 0; tun_detach_filter(tun, tun->numqueues); break; case TUNGETFILTER: ret = -EINVAL; if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) break; ret = -EFAULT; if (copy_to_user(argp, &tun->fprog, sizeof(tun->fprog))) break; ret = 0; break; case TUNSETSTEERINGEBPF: ret = tun_set_ebpf(tun, &tun->steering_prog, argp); break; case TUNSETFILTEREBPF: ret = tun_set_ebpf(tun, &tun->filter_prog, argp); break; case TUNSETCARRIER: ret = -EFAULT; if (copy_from_user(&carrier, argp, sizeof(carrier))) goto unlock; ret = tun_net_change_carrier(tun->dev, (bool)carrier); break; case TUNGETDEVNETNS: ret = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto unlock; ret = open_related_ns(&net->ns, get_net_ns); break; default: ret = -EINVAL; break; } if (do_notify) netdev_state_change(tun->dev); unlock: rtnl_unlock(); if (tun) tun_put(tun); return ret; } static long tun_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return __tun_chr_ioctl(file, cmd, arg, sizeof (struct ifreq)); } #ifdef CONFIG_COMPAT static long tun_chr_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { switch (cmd) { case TUNSETIFF: case TUNGETIFF: case TUNSETTXFILTER: case TUNGETSNDBUF: case TUNSETSNDBUF: case SIOCGIFHWADDR: case SIOCSIFHWADDR: arg = (unsigned long)compat_ptr(arg); break; default: arg = (compat_ulong_t)arg; break; } /* * compat_ifreq is shorter than ifreq, so we must not access beyond * the end of that structure. All fields that are used in this * driver are compatible though, we don't need to convert the * contents. */ return __tun_chr_ioctl(file, cmd, arg, sizeof(struct compat_ifreq)); } #endif /* CONFIG_COMPAT */ static int tun_chr_fasync(int fd, struct file *file, int on) { struct tun_file *tfile = file->private_data; int ret; if ((ret = fasync_helper(fd, file, on, &tfile->fasync)) < 0) goto out; if (on) { __f_setown(file, task_pid(current), PIDTYPE_TGID, 0); tfile->flags |= TUN_FASYNC; } else tfile->flags &= ~TUN_FASYNC; ret = 0; out: return ret; } static int tun_chr_open(struct inode *inode, struct file * file) { struct net *net = current->nsproxy->net_ns; struct tun_file *tfile; tfile = (struct tun_file *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL, &tun_proto, 0); if (!tfile) return -ENOMEM; if (ptr_ring_init(&tfile->tx_ring, 0, GFP_KERNEL)) { sk_free(&tfile->sk); return -ENOMEM; } mutex_init(&tfile->napi_mutex); RCU_INIT_POINTER(tfile->tun, NULL); tfile->flags = 0; tfile->ifindex = 0; init_waitqueue_head(&tfile->socket.wq.wait); tfile->socket.file = file; tfile->socket.ops = &tun_socket_ops; sock_init_data_uid(&tfile->socket, &tfile->sk, current_fsuid()); tfile->sk.sk_write_space = tun_sock_write_space; tfile->sk.sk_sndbuf = INT_MAX; file->private_data = tfile; INIT_LIST_HEAD(&tfile->next); sock_set_flag(&tfile->sk, SOCK_ZEROCOPY); /* tun groks IOCB_NOWAIT just fine, mark it as such */ file->f_mode |= FMODE_NOWAIT; return 0; } static int tun_chr_close(struct inode *inode, struct file *file) { struct tun_file *tfile = file->private_data; tun_detach(tfile, true); return 0; } #ifdef CONFIG_PROC_FS static void tun_chr_show_fdinfo(struct seq_file *m, struct file *file) { struct tun_file *tfile = file->private_data; struct tun_struct *tun; struct ifreq ifr; memset(&ifr, 0, sizeof(ifr)); rtnl_lock(); tun = tun_get(tfile); if (tun) tun_get_iff(tun, &ifr); rtnl_unlock(); if (tun) tun_put(tun); seq_printf(m, "iff:\t%s\n", ifr.ifr_name); } #endif static const struct file_operations tun_fops = { .owner = THIS_MODULE, .llseek = no_llseek, .read_iter = tun_chr_read_iter, .write_iter = tun_chr_write_iter, .poll = tun_chr_poll, .unlocked_ioctl = tun_chr_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = tun_chr_compat_ioctl, #endif .open = tun_chr_open, .release = tun_chr_close, .fasync = tun_chr_fasync, #ifdef CONFIG_PROC_FS .show_fdinfo = tun_chr_show_fdinfo, #endif }; static struct miscdevice tun_miscdev = { .minor = TUN_MINOR, .name = "tun", .nodename = "net/tun", .fops = &tun_fops, }; /* ethtool interface */ static void tun_default_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { ethtool_link_ksettings_zero_link_mode(cmd, supported); ethtool_link_ksettings_zero_link_mode(cmd, advertising); cmd->base.speed = SPEED_10000; cmd->base.duplex = DUPLEX_FULL; cmd->base.port = PORT_TP; cmd->base.phy_address = 0; cmd->base.autoneg = AUTONEG_DISABLE; } static int tun_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { struct tun_struct *tun = netdev_priv(dev); memcpy(cmd, &tun->link_ksettings, sizeof(*cmd)); return 0; } static int tun_set_link_ksettings(struct net_device *dev, const struct ethtool_link_ksettings *cmd) { struct tun_struct *tun = netdev_priv(dev); memcpy(&tun->link_ksettings, cmd, sizeof(*cmd)); return 0; } static void tun_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { struct tun_struct *tun = netdev_priv(dev); strscpy(info->driver, DRV_NAME, sizeof(info->driver)); strscpy(info->version, DRV_VERSION, sizeof(info->version)); switch (tun->flags & TUN_TYPE_MASK) { case IFF_TUN: strscpy(info->bus_info, "tun", sizeof(info->bus_info)); break; case IFF_TAP: strscpy(info->bus_info, "tap", sizeof(info->bus_info)); break; } } static u32 tun_get_msglevel(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); return tun->msg_enable; } static void tun_set_msglevel(struct net_device *dev, u32 value) { struct tun_struct *tun = netdev_priv(dev); tun->msg_enable = value; } static int tun_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec, struct kernel_ethtool_coalesce *kernel_coal, struct netlink_ext_ack *extack) { struct tun_struct *tun = netdev_priv(dev); ec->rx_max_coalesced_frames = tun->rx_batched; return 0; } static int tun_set_coalesce(struct net_device *dev, struct ethtool_coalesce *ec, struct kernel_ethtool_coalesce *kernel_coal, struct netlink_ext_ack *extack) { struct tun_struct *tun = netdev_priv(dev); if (ec->rx_max_coalesced_frames > NAPI_POLL_WEIGHT) tun->rx_batched = NAPI_POLL_WEIGHT; else tun->rx_batched = ec->rx_max_coalesced_frames; return 0; } static const struct ethtool_ops tun_ethtool_ops = { .supported_coalesce_params = ETHTOOL_COALESCE_RX_MAX_FRAMES, .get_drvinfo = tun_get_drvinfo, .get_msglevel = tun_get_msglevel, .set_msglevel = tun_set_msglevel, .get_link = ethtool_op_get_link, .get_ts_info = ethtool_op_get_ts_info, .get_coalesce = tun_get_coalesce, .set_coalesce = tun_set_coalesce, .get_link_ksettings = tun_get_link_ksettings, .set_link_ksettings = tun_set_link_ksettings, }; static int tun_queue_resize(struct tun_struct *tun) { struct net_device *dev = tun->dev; struct tun_file *tfile; struct ptr_ring **rings; int n = tun->numqueues + tun->numdisabled; int ret, i; rings = kmalloc_array(n, sizeof(*rings), GFP_KERNEL); if (!rings) return -ENOMEM; for (i = 0; i < tun->numqueues; i++) { tfile = rtnl_dereference(tun->tfiles[i]); rings[i] = &tfile->tx_ring; } list_for_each_entry(tfile, &tun->disabled, next) rings[i++] = &tfile->tx_ring; ret = ptr_ring_resize_multiple(rings, n, dev->tx_queue_len, GFP_KERNEL, tun_ptr_free); kfree(rings); return ret; } static int tun_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct tun_struct *tun = netdev_priv(dev); int i; if (dev->rtnl_link_ops != &tun_link_ops) return NOTIFY_DONE; switch (event) { case NETDEV_CHANGE_TX_QUEUE_LEN: if (tun_queue_resize(tun)) return NOTIFY_BAD; break; case NETDEV_UP: for (i = 0; i < tun->numqueues; i++) { struct tun_file *tfile; tfile = rtnl_dereference(tun->tfiles[i]); tfile->socket.sk->sk_write_space(tfile->socket.sk); } break; default: break; } return NOTIFY_DONE; } static struct notifier_block tun_notifier_block __read_mostly = { .notifier_call = tun_device_event, }; static int __init tun_init(void) { int ret = 0; pr_info("%s, %s\n", DRV_DESCRIPTION, DRV_VERSION); ret = rtnl_link_register(&tun_link_ops); if (ret) { pr_err("Can't register link_ops\n"); goto err_linkops; } ret = misc_register(&tun_miscdev); if (ret) { pr_err("Can't register misc device %d\n", TUN_MINOR); goto err_misc; } ret = register_netdevice_notifier(&tun_notifier_block); if (ret) { pr_err("Can't register netdevice notifier\n"); goto err_notifier; } return 0; err_notifier: misc_deregister(&tun_miscdev); err_misc: rtnl_link_unregister(&tun_link_ops); err_linkops: return ret; } static void __exit tun_cleanup(void) { misc_deregister(&tun_miscdev); rtnl_link_unregister(&tun_link_ops); unregister_netdevice_notifier(&tun_notifier_block); } /* Get an underlying socket object from tun file. Returns error unless file is * attached to a device. The returned object works like a packet socket, it * can be used for sock_sendmsg/sock_recvmsg. The caller is responsible for * holding a reference to the file for as long as the socket is in use. */ struct socket *tun_get_socket(struct file *file) { struct tun_file *tfile; if (file->f_op != &tun_fops) return ERR_PTR(-EINVAL); tfile = file->private_data; if (!tfile) return ERR_PTR(-EBADFD); return &tfile->socket; } EXPORT_SYMBOL_GPL(tun_get_socket); struct ptr_ring *tun_get_tx_ring(struct file *file) { struct tun_file *tfile; if (file->f_op != &tun_fops) return ERR_PTR(-EINVAL); tfile = file->private_data; if (!tfile) return ERR_PTR(-EBADFD); return &tfile->tx_ring; } EXPORT_SYMBOL_GPL(tun_get_tx_ring); module_init(tun_init); module_exit(tun_cleanup); MODULE_DESCRIPTION(DRV_DESCRIPTION); MODULE_AUTHOR(DRV_COPYRIGHT); MODULE_LICENSE("GPL"); MODULE_ALIAS_MISCDEV(TUN_MINOR); MODULE_ALIAS("devname:net/tun"); |
| 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ #include "peerlookup.h" #include "peer.h" #include "noise.h" static struct hlist_head *pubkey_bucket(struct pubkey_hashtable *table, const u8 pubkey[NOISE_PUBLIC_KEY_LEN]) { /* siphash gives us a secure 64bit number based on a random key. Since * the bits are uniformly distributed, we can then mask off to get the * bits we need. */ const u64 hash = siphash(pubkey, NOISE_PUBLIC_KEY_LEN, &table->key); return &table->hashtable[hash & (HASH_SIZE(table->hashtable) - 1)]; } struct pubkey_hashtable *wg_pubkey_hashtable_alloc(void) { struct pubkey_hashtable *table = kvmalloc(sizeof(*table), GFP_KERNEL); if (!table) return NULL; get_random_bytes(&table->key, sizeof(table->key)); hash_init(table->hashtable); mutex_init(&table->lock); return table; } void wg_pubkey_hashtable_add(struct pubkey_hashtable *table, struct wg_peer *peer) { mutex_lock(&table->lock); hlist_add_head_rcu(&peer->pubkey_hash, pubkey_bucket(table, peer->handshake.remote_static)); mutex_unlock(&table->lock); } void wg_pubkey_hashtable_remove(struct pubkey_hashtable *table, struct wg_peer *peer) { mutex_lock(&table->lock); hlist_del_init_rcu(&peer->pubkey_hash); mutex_unlock(&table->lock); } /* Returns a strong reference to a peer */ struct wg_peer * wg_pubkey_hashtable_lookup(struct pubkey_hashtable *table, const u8 pubkey[NOISE_PUBLIC_KEY_LEN]) { struct wg_peer *iter_peer, *peer = NULL; rcu_read_lock_bh(); hlist_for_each_entry_rcu_bh(iter_peer, pubkey_bucket(table, pubkey), pubkey_hash) { if (!memcmp(pubkey, iter_peer->handshake.remote_static, NOISE_PUBLIC_KEY_LEN)) { peer = iter_peer; break; } } peer = wg_peer_get_maybe_zero(peer); rcu_read_unlock_bh(); return peer; } static struct hlist_head *index_bucket(struct index_hashtable *table, const __le32 index) { /* Since the indices are random and thus all bits are uniformly * distributed, we can find its bucket simply by masking. */ return &table->hashtable[(__force u32)index & (HASH_SIZE(table->hashtable) - 1)]; } struct index_hashtable *wg_index_hashtable_alloc(void) { struct index_hashtable *table = kvmalloc(sizeof(*table), GFP_KERNEL); if (!table) return NULL; hash_init(table->hashtable); spin_lock_init(&table->lock); return table; } /* At the moment, we limit ourselves to 2^20 total peers, which generally might * amount to 2^20*3 items in this hashtable. The algorithm below works by * picking a random number and testing it. We can see that these limits mean we * usually succeed pretty quickly: * * >>> def calculation(tries, size): * ... return (size / 2**32)**(tries - 1) * (1 - (size / 2**32)) * ... * >>> calculation(1, 2**20 * 3) * 0.999267578125 * >>> calculation(2, 2**20 * 3) * 0.0007318854331970215 * >>> calculation(3, 2**20 * 3) * 5.360489012673497e-07 * >>> calculation(4, 2**20 * 3) * 3.9261394135792216e-10 * * At the moment, we don't do any masking, so this algorithm isn't exactly * constant time in either the random guessing or in the hash list lookup. We * could require a minimum of 3 tries, which would successfully mask the * guessing. this would not, however, help with the growing hash lengths, which * is another thing to consider moving forward. */ __le32 wg_index_hashtable_insert(struct index_hashtable *table, struct index_hashtable_entry *entry) { struct index_hashtable_entry *existing_entry; spin_lock_bh(&table->lock); hlist_del_init_rcu(&entry->index_hash); spin_unlock_bh(&table->lock); rcu_read_lock_bh(); search_unused_slot: /* First we try to find an unused slot, randomly, while unlocked. */ entry->index = (__force __le32)get_random_u32(); hlist_for_each_entry_rcu_bh(existing_entry, index_bucket(table, entry->index), index_hash) { if (existing_entry->index == entry->index) /* If it's already in use, we continue searching. */ goto search_unused_slot; } /* Once we've found an unused slot, we lock it, and then double-check * that nobody else stole it from us. */ spin_lock_bh(&table->lock); hlist_for_each_entry_rcu_bh(existing_entry, index_bucket(table, entry->index), index_hash) { if (existing_entry->index == entry->index) { spin_unlock_bh(&table->lock); /* If it was stolen, we start over. */ goto search_unused_slot; } } /* Otherwise, we know we have it exclusively (since we're locked), * so we insert. */ hlist_add_head_rcu(&entry->index_hash, index_bucket(table, entry->index)); spin_unlock_bh(&table->lock); rcu_read_unlock_bh(); return entry->index; } bool wg_index_hashtable_replace(struct index_hashtable *table, struct index_hashtable_entry *old, struct index_hashtable_entry *new) { bool ret; spin_lock_bh(&table->lock); ret = !hlist_unhashed(&old->index_hash); if (unlikely(!ret)) goto out; new->index = old->index; hlist_replace_rcu(&old->index_hash, &new->index_hash); /* Calling init here NULLs out index_hash, and in fact after this * function returns, it's theoretically possible for this to get * reinserted elsewhere. That means the RCU lookup below might either * terminate early or jump between buckets, in which case the packet * simply gets dropped, which isn't terrible. */ INIT_HLIST_NODE(&old->index_hash); out: spin_unlock_bh(&table->lock); return ret; } void wg_index_hashtable_remove(struct index_hashtable *table, struct index_hashtable_entry *entry) { spin_lock_bh(&table->lock); hlist_del_init_rcu(&entry->index_hash); spin_unlock_bh(&table->lock); } /* Returns a strong reference to a entry->peer */ struct index_hashtable_entry * wg_index_hashtable_lookup(struct index_hashtable *table, const enum index_hashtable_type type_mask, const __le32 index, struct wg_peer **peer) { struct index_hashtable_entry *iter_entry, *entry = NULL; rcu_read_lock_bh(); hlist_for_each_entry_rcu_bh(iter_entry, index_bucket(table, index), index_hash) { if (iter_entry->index == index) { if (likely(iter_entry->type & type_mask)) entry = iter_entry; break; } } if (likely(entry)) { entry->peer = wg_peer_get_maybe_zero(entry->peer); if (likely(entry->peer)) *peer = entry->peer; else entry = NULL; } rcu_read_unlock_bh(); return entry; } |
| 21 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PATH_H #define _LINUX_PATH_H struct dentry; struct vfsmount; struct path { struct vfsmount *mnt; struct dentry *dentry; } __randomize_layout; extern void path_get(const struct path *); extern void path_put(const struct path *); static inline int path_equal(const struct path *path1, const struct path *path2) { return path1->mnt == path2->mnt && path1->dentry == path2->dentry; } static inline void path_put_init(struct path *path) { path_put(path); *path = (struct path) { }; } #endif /* _LINUX_PATH_H */ |
| 1945 1821 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_BH_H #define _LINUX_BH_H #include <linux/instruction_pointer.h> #include <linux/preempt.h> #if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_TRACE_IRQFLAGS) extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt); #else static __always_inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) { preempt_count_add(cnt); barrier(); } #endif static inline void local_bh_disable(void) { __local_bh_disable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET); } extern void _local_bh_enable(void); extern void __local_bh_enable_ip(unsigned long ip, unsigned int cnt); static inline void local_bh_enable_ip(unsigned long ip) { __local_bh_enable_ip(ip, SOFTIRQ_DISABLE_OFFSET); } static inline void local_bh_enable(void) { __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET); } #ifdef CONFIG_PREEMPT_RT extern bool local_bh_blocked(void); #else static inline bool local_bh_blocked(void) { return false; } #endif #endif /* _LINUX_BH_H */ |
| 6088 2842 5785 6083 5783 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2012 Regents of the University of California */ #ifndef _ASM_RISCV_IRQFLAGS_H #define _ASM_RISCV_IRQFLAGS_H #include <asm/processor.h> #include <asm/csr.h> /* read interrupt enabled status */ static inline unsigned long arch_local_save_flags(void) { return csr_read(CSR_STATUS); } /* unconditionally enable interrupts */ static inline void arch_local_irq_enable(void) { csr_set(CSR_STATUS, SR_IE); } /* unconditionally disable interrupts */ static inline void arch_local_irq_disable(void) { csr_clear(CSR_STATUS, SR_IE); } /* get status and disable interrupts */ static inline unsigned long arch_local_irq_save(void) { return csr_read_clear(CSR_STATUS, SR_IE); } /* test flags */ static inline int arch_irqs_disabled_flags(unsigned long flags) { return !(flags & SR_IE); } /* test hardware interrupt enable bit */ static inline int arch_irqs_disabled(void) { return arch_irqs_disabled_flags(arch_local_save_flags()); } /* set interrupt enabled status */ static inline void arch_local_irq_restore(unsigned long flags) { csr_set(CSR_STATUS, flags & SR_IE); } #endif /* _ASM_RISCV_IRQFLAGS_H */ |
| 3 4 1 3 6 6 6 5 3 2 5 2 2 2 2 2 1 2 1 2 2165 2172 7 7 7 4 1 1 1 2 2 3 3 2 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 | // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/scatterlist.h> #include <linux/mutex.h> #include <linux/timer.h> #include <linux/usb.h> #define SIMPLE_IO_TIMEOUT 10000 /* in milliseconds */ /*-------------------------------------------------------------------------*/ static int override_alt = -1; module_param_named(alt, override_alt, int, 0644); MODULE_PARM_DESC(alt, ">= 0 to override altsetting selection"); static void complicated_callback(struct urb *urb); /*-------------------------------------------------------------------------*/ /* FIXME make these public somewhere; usbdevfs.h? */ /* Parameter for usbtest driver. */ struct usbtest_param_32 { /* inputs */ __u32 test_num; /* 0..(TEST_CASES-1) */ __u32 iterations; __u32 length; __u32 vary; __u32 sglen; /* outputs */ __s32 duration_sec; __s32 duration_usec; }; /* * Compat parameter to the usbtest driver. * This supports older user space binaries compiled with 64 bit compiler. */ struct usbtest_param_64 { /* inputs */ __u32 test_num; /* 0..(TEST_CASES-1) */ __u32 iterations; __u32 length; __u32 vary; __u32 sglen; /* outputs */ __s64 duration_sec; __s64 duration_usec; }; /* IOCTL interface to the driver. */ #define USBTEST_REQUEST_32 _IOWR('U', 100, struct usbtest_param_32) /* COMPAT IOCTL interface to the driver. */ #define USBTEST_REQUEST_64 _IOWR('U', 100, struct usbtest_param_64) /*-------------------------------------------------------------------------*/ #define GENERIC /* let probe() bind using module params */ /* Some devices that can be used for testing will have "real" drivers. * Entries for those need to be enabled here by hand, after disabling * that "real" driver. */ //#define IBOT2 /* grab iBOT2 webcams */ //#define KEYSPAN_19Qi /* grab un-renumerated serial adapter */ /*-------------------------------------------------------------------------*/ struct usbtest_info { const char *name; u8 ep_in; /* bulk/intr source */ u8 ep_out; /* bulk/intr sink */ unsigned autoconf:1; unsigned ctrl_out:1; unsigned iso:1; /* try iso in/out */ unsigned intr:1; /* try interrupt in/out */ int alt; }; /* this is accessed only through usbfs ioctl calls. * one ioctl to issue a test ... one lock per device. * tests create other threads if they need them. * urbs and buffers are allocated dynamically, * and data generated deterministically. */ struct usbtest_dev { struct usb_interface *intf; struct usbtest_info *info; int in_pipe; int out_pipe; int in_iso_pipe; int out_iso_pipe; int in_int_pipe; int out_int_pipe; struct usb_endpoint_descriptor *iso_in, *iso_out; struct usb_endpoint_descriptor *int_in, *int_out; struct mutex lock; #define TBUF_SIZE 256 u8 *buf; }; static struct usb_device *testdev_to_usbdev(struct usbtest_dev *test) { return interface_to_usbdev(test->intf); } /* set up all urbs so they can be used with either bulk or interrupt */ #define INTERRUPT_RATE 1 /* msec/transfer */ #define ERROR(tdev, fmt, args...) \ dev_err(&(tdev)->intf->dev , fmt , ## args) #define WARNING(tdev, fmt, args...) \ dev_warn(&(tdev)->intf->dev , fmt , ## args) #define GUARD_BYTE 0xA5 #define MAX_SGLEN 128 /*-------------------------------------------------------------------------*/ static inline void endpoint_update(int edi, struct usb_host_endpoint **in, struct usb_host_endpoint **out, struct usb_host_endpoint *e) { if (edi) { if (!*in) *in = e; } else { if (!*out) *out = e; } } static int get_endpoints(struct usbtest_dev *dev, struct usb_interface *intf) { int tmp; struct usb_host_interface *alt; struct usb_host_endpoint *in, *out; struct usb_host_endpoint *iso_in, *iso_out; struct usb_host_endpoint *int_in, *int_out; struct usb_device *udev; for (tmp = 0; tmp < intf->num_altsetting; tmp++) { unsigned ep; in = out = NULL; iso_in = iso_out = NULL; int_in = int_out = NULL; alt = intf->altsetting + tmp; if (override_alt >= 0 && override_alt != alt->desc.bAlternateSetting) continue; /* take the first altsetting with in-bulk + out-bulk; * ignore other endpoints and altsettings. */ for (ep = 0; ep < alt->desc.bNumEndpoints; ep++) { struct usb_host_endpoint *e; int edi; e = alt->endpoint + ep; edi = usb_endpoint_dir_in(&e->desc); switch (usb_endpoint_type(&e->desc)) { case USB_ENDPOINT_XFER_BULK: endpoint_update(edi, &in, &out, e); continue; case USB_ENDPOINT_XFER_INT: if (dev->info->intr) endpoint_update(edi, &int_in, &int_out, e); continue; case USB_ENDPOINT_XFER_ISOC: if (dev->info->iso) endpoint_update(edi, &iso_in, &iso_out, e); fallthrough; default: continue; } } if ((in && out) || iso_in || iso_out || int_in || int_out) goto found; } return -EINVAL; found: udev = testdev_to_usbdev(dev); dev->info->alt = alt->desc.bAlternateSetting; if (alt->desc.bAlternateSetting != 0) { tmp = usb_set_interface(udev, alt->desc.bInterfaceNumber, alt->desc.bAlternateSetting); if (tmp < 0) return tmp; } if (in) dev->in_pipe = usb_rcvbulkpipe(udev, in->desc.bEndpointAddress & USB_ENDPOINT_NUMBER_MASK); if (out) dev->out_pipe = usb_sndbulkpipe(udev, out->desc.bEndpointAddress & USB_ENDPOINT_NUMBER_MASK); if (iso_in) { dev->iso_in = &iso_in->desc; dev->in_iso_pipe = usb_rcvisocpipe(udev, iso_in->desc.bEndpointAddress & USB_ENDPOINT_NUMBER_MASK); } if (iso_out) { dev->iso_out = &iso_out->desc; dev->out_iso_pipe = usb_sndisocpipe(udev, iso_out->desc.bEndpointAddress & USB_ENDPOINT_NUMBER_MASK); } if (int_in) { dev->int_in = &int_in->desc; dev->in_int_pipe = usb_rcvintpipe(udev, int_in->desc.bEndpointAddress & USB_ENDPOINT_NUMBER_MASK); } if (int_out) { dev->int_out = &int_out->desc; dev->out_int_pipe = usb_sndintpipe(udev, int_out->desc.bEndpointAddress & USB_ENDPOINT_NUMBER_MASK); } return 0; } /*-------------------------------------------------------------------------*/ /* Support for testing basic non-queued I/O streams. * * These just package urbs as requests that can be easily canceled. * Each urb's data buffer is dynamically allocated; callers can fill * them with non-zero test data (or test for it) when appropriate. */ static void simple_callback(struct urb *urb) { complete(urb->context); } static struct urb *usbtest_alloc_urb( struct usb_device *udev, int pipe, unsigned long bytes, unsigned transfer_flags, unsigned offset, u8 bInterval, usb_complete_t complete_fn) { struct urb *urb; urb = usb_alloc_urb(0, GFP_KERNEL); if (!urb) return urb; if (bInterval) usb_fill_int_urb(urb, udev, pipe, NULL, bytes, complete_fn, NULL, bInterval); else usb_fill_bulk_urb(urb, udev, pipe, NULL, bytes, complete_fn, NULL); urb->interval = (udev->speed == USB_SPEED_HIGH) ? (INTERRUPT_RATE << 3) : INTERRUPT_RATE; urb->transfer_flags = transfer_flags; if (usb_pipein(pipe)) urb->transfer_flags |= URB_SHORT_NOT_OK; if ((bytes + offset) == 0) return urb; if (urb->transfer_flags & URB_NO_TRANSFER_DMA_MAP) urb->transfer_buffer = usb_alloc_coherent(udev, bytes + offset, GFP_KERNEL, &urb->transfer_dma); else urb->transfer_buffer = kmalloc(bytes + offset, GFP_KERNEL); if (!urb->transfer_buffer) { usb_free_urb(urb); return NULL; } /* To test unaligned transfers add an offset and fill the unused memory with a guard value */ if (offset) { memset(urb->transfer_buffer, GUARD_BYTE, offset); urb->transfer_buffer += offset; if (urb->transfer_flags & URB_NO_TRANSFER_DMA_MAP) urb->transfer_dma += offset; } /* For inbound transfers use guard byte so that test fails if data not correctly copied */ memset(urb->transfer_buffer, usb_pipein(urb->pipe) ? GUARD_BYTE : 0, bytes); return urb; } static struct urb *simple_alloc_urb( struct usb_device *udev, int pipe, unsigned long bytes, u8 bInterval) { return usbtest_alloc_urb(udev, pipe, bytes, URB_NO_TRANSFER_DMA_MAP, 0, bInterval, simple_callback); } static struct urb *complicated_alloc_urb( struct usb_device *udev, int pipe, unsigned long bytes, u8 bInterval) { return usbtest_alloc_urb(udev, pipe, bytes, URB_NO_TRANSFER_DMA_MAP, 0, bInterval, complicated_callback); } static unsigned pattern; static unsigned mod_pattern; module_param_named(pattern, mod_pattern, uint, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(mod_pattern, "i/o pattern (0 == zeroes)"); static unsigned get_maxpacket(struct usb_device *udev, int pipe) { struct usb_host_endpoint *ep; ep = usb_pipe_endpoint(udev, pipe); return le16_to_cpup(&ep->desc.wMaxPacketSize); } static int ss_isoc_get_packet_num(struct usb_device *udev, int pipe) { struct usb_host_endpoint *ep = usb_pipe_endpoint(udev, pipe); return USB_SS_MULT(ep->ss_ep_comp.bmAttributes) * (1 + ep->ss_ep_comp.bMaxBurst); } static void simple_fill_buf(struct urb *urb) { unsigned i; u8 *buf = urb->transfer_buffer; unsigned len = urb->transfer_buffer_length; unsigned maxpacket; switch (pattern) { default: fallthrough; case 0: memset(buf, 0, len); break; case 1: /* mod63 */ maxpacket = get_maxpacket(urb->dev, urb->pipe); for (i = 0; i < len; i++) *buf++ = (u8) ((i % maxpacket) % 63); break; } } static inline unsigned long buffer_offset(void *buf) { return (unsigned long)buf & (ARCH_KMALLOC_MINALIGN - 1); } static int check_guard_bytes(struct usbtest_dev *tdev, struct urb *urb) { u8 *buf = urb->transfer_buffer; u8 *guard = buf - buffer_offset(buf); unsigned i; for (i = 0; guard < buf; i++, guard++) { if (*guard != GUARD_BYTE) { ERROR(tdev, "guard byte[%d] %d (not %d)\n", i, *guard, GUARD_BYTE); return -EINVAL; } } return 0; } static int simple_check_buf(struct usbtest_dev *tdev, struct urb *urb) { unsigned i; u8 expected; u8 *buf = urb->transfer_buffer; unsigned len = urb->actual_length; unsigned maxpacket = get_maxpacket(urb->dev, urb->pipe); int ret = check_guard_bytes(tdev, urb); if (ret) return ret; for (i = 0; i < len; i++, buf++) { switch (pattern) { /* all-zeroes has no synchronization issues */ case 0: expected = 0; break; /* mod63 stays in sync with short-terminated transfers, * or otherwise when host and gadget agree on how large * each usb transfer request should be. resync is done * with set_interface or set_config. */ case 1: /* mod63 */ expected = (i % maxpacket) % 63; break; /* always fail unsupported patterns */ default: expected = !*buf; break; } if (*buf == expected) continue; ERROR(tdev, "buf[%d] = %d (not %d)\n", i, *buf, expected); return -EINVAL; } return 0; } static void simple_free_urb(struct urb *urb) { unsigned long offset = buffer_offset(urb->transfer_buffer); if (urb->transfer_flags & URB_NO_TRANSFER_DMA_MAP) usb_free_coherent( urb->dev, urb->transfer_buffer_length + offset, urb->transfer_buffer - offset, urb->transfer_dma - offset); else kfree(urb->transfer_buffer - offset); usb_free_urb(urb); } static int simple_io( struct usbtest_dev *tdev, struct urb *urb, int iterations, int vary, int expected, const char *label ) { struct usb_device *udev = urb->dev; int max = urb->transfer_buffer_length; struct completion completion; int retval = 0; unsigned long expire; urb->context = &completion; while (retval == 0 && iterations-- > 0) { init_completion(&completion); if (usb_pipeout(urb->pipe)) { simple_fill_buf(urb); urb->transfer_flags |= URB_ZERO_PACKET; } retval = usb_submit_urb(urb, GFP_KERNEL); if (retval != 0) break; expire = msecs_to_jiffies(SIMPLE_IO_TIMEOUT); if (!wait_for_completion_timeout(&completion, expire)) { usb_kill_urb(urb); retval = (urb->status == -ENOENT ? -ETIMEDOUT : urb->status); } else { retval = urb->status; } urb->dev = udev; if (retval == 0 && usb_pipein(urb->pipe)) retval = simple_check_buf(tdev, urb); if (vary) { int len = urb->transfer_buffer_length; len += vary; len %= max; if (len == 0) len = (vary < max) ? vary : max; urb->transfer_buffer_length = len; } /* FIXME if endpoint halted, clear halt (and log) */ } urb->transfer_buffer_length = max; if (expected != retval) dev_err(&udev->dev, "%s failed, iterations left %d, status %d (not %d)\n", label, iterations, retval, expected); return retval; } /*-------------------------------------------------------------------------*/ /* We use scatterlist primitives to test queued I/O. * Yes, this also tests the scatterlist primitives. */ static void free_sglist(struct scatterlist *sg, int nents) { unsigned i; if (!sg) return; for (i = 0; i < nents; i++) { if (!sg_page(&sg[i])) continue; kfree(sg_virt(&sg[i])); } kfree(sg); } static struct scatterlist * alloc_sglist(int nents, int max, int vary, struct usbtest_dev *dev, int pipe) { struct scatterlist *sg; unsigned int n_size = 0; unsigned i; unsigned size = max; unsigned maxpacket = get_maxpacket(interface_to_usbdev(dev->intf), pipe); if (max == 0) return NULL; sg = kmalloc_array(nents, sizeof(*sg), GFP_KERNEL); if (!sg) return NULL; sg_init_table(sg, nents); for (i = 0; i < nents; i++) { char *buf; unsigned j; buf = kzalloc(size, GFP_KERNEL); if (!buf) { free_sglist(sg, i); return NULL; } /* kmalloc pages are always physically contiguous! */ sg_set_buf(&sg[i], buf, size); switch (pattern) { case 0: /* already zeroed */ break; case 1: for (j = 0; j < size; j++) *buf++ = (u8) (((j + n_size) % maxpacket) % 63); n_size += size; break; } if (vary) { size += vary; size %= max; if (size == 0) size = (vary < max) ? vary : max; } } return sg; } struct sg_timeout { struct timer_list timer; struct usb_sg_request *req; }; static void sg_timeout(struct timer_list *t) { struct sg_timeout *timeout = from_timer(timeout, t, timer); usb_sg_cancel(timeout->req); } static int perform_sglist( struct usbtest_dev *tdev, unsigned iterations, int pipe, struct usb_sg_request *req, struct scatterlist *sg, int nents ) { struct usb_device *udev = testdev_to_usbdev(tdev); int retval = 0; struct sg_timeout timeout = { .req = req, }; timer_setup_on_stack(&timeout.timer, sg_timeout, 0); while (retval == 0 && iterations-- > 0) { retval = usb_sg_init(req, udev, pipe, (udev->speed == USB_SPEED_HIGH) ? (INTERRUPT_RATE << 3) : INTERRUPT_RATE, sg, nents, 0, GFP_KERNEL); if (retval) break; mod_timer(&timeout.timer, jiffies + msecs_to_jiffies(SIMPLE_IO_TIMEOUT)); usb_sg_wait(req); if (!del_timer_sync(&timeout.timer)) retval = -ETIMEDOUT; else retval = req->status; destroy_timer_on_stack(&timeout.timer); /* FIXME check resulting data pattern */ /* FIXME if endpoint halted, clear halt (and log) */ } /* FIXME for unlink or fault handling tests, don't report * failure if retval is as we expected ... */ if (retval) ERROR(tdev, "perform_sglist failed, " "iterations left %d, status %d\n", iterations, retval); return retval; } /*-------------------------------------------------------------------------*/ /* unqueued control message testing * * there's a nice set of device functional requirements in chapter 9 of the * usb 2.0 spec, which we can apply to ANY device, even ones that don't use * special test firmware. * * we know the device is configured (or suspended) by the time it's visible * through usbfs. we can't change that, so we won't test enumeration (which * worked 'well enough' to get here, this time), power management (ditto), * or remote wakeup (which needs human interaction). */ static unsigned realworld = 1; module_param(realworld, uint, 0); MODULE_PARM_DESC(realworld, "clear to demand stricter spec compliance"); static int get_altsetting(struct usbtest_dev *dev) { struct usb_interface *iface = dev->intf; struct usb_device *udev = interface_to_usbdev(iface); int retval; retval = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), USB_REQ_GET_INTERFACE, USB_DIR_IN|USB_RECIP_INTERFACE, 0, iface->altsetting[0].desc.bInterfaceNumber, dev->buf, 1, USB_CTRL_GET_TIMEOUT); switch (retval) { case 1: return dev->buf[0]; case 0: retval = -ERANGE; fallthrough; default: return retval; } } static int set_altsetting(struct usbtest_dev *dev, int alternate) { struct usb_interface *iface = dev->intf; struct usb_device *udev; if (alternate < 0 || alternate >= 256) return -EINVAL; udev = interface_to_usbdev(iface); return usb_set_interface(udev, iface->altsetting[0].desc.bInterfaceNumber, alternate); } static int is_good_config(struct usbtest_dev *tdev, int len) { struct usb_config_descriptor *config; if (len < sizeof(*config)) return 0; config = (struct usb_config_descriptor *) tdev->buf; switch (config->bDescriptorType) { case USB_DT_CONFIG: case USB_DT_OTHER_SPEED_CONFIG: if (config->bLength != 9) { ERROR(tdev, "bogus config descriptor length\n"); return 0; } /* this bit 'must be 1' but often isn't */ if (!realworld && !(config->bmAttributes & 0x80)) { ERROR(tdev, "high bit of config attributes not set\n"); return 0; } if (config->bmAttributes & 0x1f) { /* reserved == 0 */ ERROR(tdev, "reserved config bits set\n"); return 0; } break; default: return 0; } if (le16_to_cpu(config->wTotalLength) == len) /* read it all */ return 1; if (le16_to_cpu(config->wTotalLength) >= TBUF_SIZE) /* max partial read */ return 1; ERROR(tdev, "bogus config descriptor read size\n"); return 0; } static int is_good_ext(struct usbtest_dev *tdev, u8 *buf) { struct usb_ext_cap_descriptor *ext; u32 attr; ext = (struct usb_ext_cap_descriptor *) buf; if (ext->bLength != USB_DT_USB_EXT_CAP_SIZE) { ERROR(tdev, "bogus usb 2.0 extension descriptor length\n"); return 0; } attr = le32_to_cpu(ext->bmAttributes); /* bits[1:15] is used and others are reserved */ if (attr & ~0xfffe) { /* reserved == 0 */ ERROR(tdev, "reserved bits set\n"); return 0; } return 1; } static int is_good_ss_cap(struct usbtest_dev *tdev, u8 *buf) { struct usb_ss_cap_descriptor *ss; ss = (struct usb_ss_cap_descriptor *) buf; if (ss->bLength != USB_DT_USB_SS_CAP_SIZE) { ERROR(tdev, "bogus superspeed device capability descriptor length\n"); return 0; } /* * only bit[1] of bmAttributes is used for LTM and others are * reserved */ if (ss->bmAttributes & ~0x02) { /* reserved == 0 */ ERROR(tdev, "reserved bits set in bmAttributes\n"); return 0; } /* bits[0:3] of wSpeedSupported is used and others are reserved */ if (le16_to_cpu(ss->wSpeedSupported) & ~0x0f) { /* reserved == 0 */ ERROR(tdev, "reserved bits set in wSpeedSupported\n"); return 0; } return 1; } static int is_good_con_id(struct usbtest_dev *tdev, u8 *buf) { struct usb_ss_container_id_descriptor *con_id; con_id = (struct usb_ss_container_id_descriptor *) buf; if (con_id->bLength != USB_DT_USB_SS_CONTN_ID_SIZE) { ERROR(tdev, "bogus container id descriptor length\n"); return 0; } if (con_id->bReserved) { /* reserved == 0 */ ERROR(tdev, "reserved bits set\n"); return 0; } return 1; } /* sanity test for standard requests working with usb_control_mesg() and some * of the utility functions which use it. * * this doesn't test how endpoint halts behave or data toggles get set, since * we won't do I/O to bulk/interrupt endpoints here (which is how to change * halt or toggle). toggle testing is impractical without support from hcds. * * this avoids failing devices linux would normally work with, by not testing * config/altsetting operations for devices that only support their defaults. * such devices rarely support those needless operations. * * NOTE that since this is a sanity test, it's not examining boundary cases * to see if usbcore, hcd, and device all behave right. such testing would * involve varied read sizes and other operation sequences. */ static int ch9_postconfig(struct usbtest_dev *dev) { struct usb_interface *iface = dev->intf; struct usb_device *udev = interface_to_usbdev(iface); int i, alt, retval; /* [9.2.3] if there's more than one altsetting, we need to be able to * set and get each one. mostly trusts the descriptors from usbcore. */ for (i = 0; i < iface->num_altsetting; i++) { /* 9.2.3 constrains the range here */ alt = iface->altsetting[i].desc.bAlternateSetting; if (alt < 0 || alt >= iface->num_altsetting) { dev_err(&iface->dev, "invalid alt [%d].bAltSetting = %d\n", i, alt); } /* [real world] get/set unimplemented if there's only one */ if (realworld && iface->num_altsetting == 1) continue; /* [9.4.10] set_interface */ retval = set_altsetting(dev, alt); if (retval) { dev_err(&iface->dev, "can't set_interface = %d, %d\n", alt, retval); return retval; } /* [9.4.4] get_interface always works */ retval = get_altsetting(dev); if (retval != alt) { dev_err(&iface->dev, "get alt should be %d, was %d\n", alt, retval); return (retval < 0) ? retval : -EDOM; } } /* [real world] get_config unimplemented if there's only one */ if (!realworld || udev->descriptor.bNumConfigurations != 1) { int expected = udev->actconfig->desc.bConfigurationValue; /* [9.4.2] get_configuration always works * ... although some cheap devices (like one TI Hub I've got) * won't return config descriptors except before set_config. */ retval = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), USB_REQ_GET_CONFIGURATION, USB_DIR_IN | USB_RECIP_DEVICE, 0, 0, dev->buf, 1, USB_CTRL_GET_TIMEOUT); if (retval != 1 || dev->buf[0] != expected) { dev_err(&iface->dev, "get config --> %d %d (1 %d)\n", retval, dev->buf[0], expected); return (retval < 0) ? retval : -EDOM; } } /* there's always [9.4.3] a device descriptor [9.6.1] */ retval = usb_get_descriptor(udev, USB_DT_DEVICE, 0, dev->buf, sizeof(udev->descriptor)); if (retval != sizeof(udev->descriptor)) { dev_err(&iface->dev, "dev descriptor --> %d\n", retval); return (retval < 0) ? retval : -EDOM; } /* * there's always [9.4.3] a bos device descriptor [9.6.2] in USB * 3.0 spec */ if (le16_to_cpu(udev->descriptor.bcdUSB) >= 0x0210) { struct usb_bos_descriptor *bos = NULL; struct usb_dev_cap_header *header = NULL; unsigned total, num, length; u8 *buf; retval = usb_get_descriptor(udev, USB_DT_BOS, 0, dev->buf, sizeof(*udev->bos->desc)); if (retval != sizeof(*udev->bos->desc)) { dev_err(&iface->dev, "bos descriptor --> %d\n", retval); return (retval < 0) ? retval : -EDOM; } bos = (struct usb_bos_descriptor *)dev->buf; total = le16_to_cpu(bos->wTotalLength); num = bos->bNumDeviceCaps; if (total > TBUF_SIZE) total = TBUF_SIZE; /* * get generic device-level capability descriptors [9.6.2] * in USB 3.0 spec */ retval = usb_get_descriptor(udev, USB_DT_BOS, 0, dev->buf, total); if (retval != total) { dev_err(&iface->dev, "bos descriptor set --> %d\n", retval); return (retval < 0) ? retval : -EDOM; } length = sizeof(*udev->bos->desc); buf = dev->buf; for (i = 0; i < num; i++) { buf += length; if (buf + sizeof(struct usb_dev_cap_header) > dev->buf + total) break; header = (struct usb_dev_cap_header *)buf; length = header->bLength; if (header->bDescriptorType != USB_DT_DEVICE_CAPABILITY) { dev_warn(&udev->dev, "not device capability descriptor, skip\n"); continue; } switch (header->bDevCapabilityType) { case USB_CAP_TYPE_EXT: if (buf + USB_DT_USB_EXT_CAP_SIZE > dev->buf + total || !is_good_ext(dev, buf)) { dev_err(&iface->dev, "bogus usb 2.0 extension descriptor\n"); return -EDOM; } break; case USB_SS_CAP_TYPE: if (buf + USB_DT_USB_SS_CAP_SIZE > dev->buf + total || !is_good_ss_cap(dev, buf)) { dev_err(&iface->dev, "bogus superspeed device capability descriptor\n"); return -EDOM; } break; case CONTAINER_ID_TYPE: if (buf + USB_DT_USB_SS_CONTN_ID_SIZE > dev->buf + total || !is_good_con_id(dev, buf)) { dev_err(&iface->dev, "bogus container id descriptor\n"); return -EDOM; } break; default: break; } } } /* there's always [9.4.3] at least one config descriptor [9.6.3] */ for (i = 0; i < udev->descriptor.bNumConfigurations; i++) { retval = usb_get_descriptor(udev, USB_DT_CONFIG, i, dev->buf, TBUF_SIZE); if (!is_good_config(dev, retval)) { dev_err(&iface->dev, "config [%d] descriptor --> %d\n", i, retval); return (retval < 0) ? retval : -EDOM; } /* FIXME cross-checking udev->config[i] to make sure usbcore * parsed it right (etc) would be good testing paranoia */ } /* and sometimes [9.2.6.6] speed dependent descriptors */ if (le16_to_cpu(udev->descriptor.bcdUSB) == 0x0200) { struct usb_qualifier_descriptor *d = NULL; /* device qualifier [9.6.2] */ retval = usb_get_descriptor(udev, USB_DT_DEVICE_QUALIFIER, 0, dev->buf, sizeof(struct usb_qualifier_descriptor)); if (retval == -EPIPE) { if (udev->speed == USB_SPEED_HIGH) { dev_err(&iface->dev, "hs dev qualifier --> %d\n", retval); return retval; } /* usb2.0 but not high-speed capable; fine */ } else if (retval != sizeof(struct usb_qualifier_descriptor)) { dev_err(&iface->dev, "dev qualifier --> %d\n", retval); return (retval < 0) ? retval : -EDOM; } else d = (struct usb_qualifier_descriptor *) dev->buf; /* might not have [9.6.2] any other-speed configs [9.6.4] */ if (d) { unsigned max = d->bNumConfigurations; for (i = 0; i < max; i++) { retval = usb_get_descriptor(udev, USB_DT_OTHER_SPEED_CONFIG, i, dev->buf, TBUF_SIZE); if (!is_good_config(dev, retval)) { dev_err(&iface->dev, "other speed config --> %d\n", retval); return (retval < 0) ? retval : -EDOM; } } } } /* FIXME fetch strings from at least the device descriptor */ /* [9.4.5] get_status always works */ retval = usb_get_std_status(udev, USB_RECIP_DEVICE, 0, dev->buf); if (retval) { dev_err(&iface->dev, "get dev status --> %d\n", retval); return retval; } /* FIXME configuration.bmAttributes says if we could try to set/clear * the device's remote wakeup feature ... if we can, test that here */ retval = usb_get_std_status(udev, USB_RECIP_INTERFACE, iface->altsetting[0].desc.bInterfaceNumber, dev->buf); if (retval) { dev_err(&iface->dev, "get interface status --> %d\n", retval); return retval; } /* FIXME get status for each endpoint in the interface */ return 0; } /*-------------------------------------------------------------------------*/ /* use ch9 requests to test whether: * (a) queues work for control, keeping N subtests queued and * active (auto-resubmit) for M loops through the queue. * (b) protocol stalls (control-only) will autorecover. * it's not like bulk/intr; no halt clearing. * (c) short control reads are reported and handled. * (d) queues are always processed in-order */ struct ctrl_ctx { spinlock_t lock; struct usbtest_dev *dev; struct completion complete; unsigned count; unsigned pending; int status; struct urb **urb; struct usbtest_param_32 *param; int last; }; #define NUM_SUBCASES 16 /* how many test subcases here? */ struct subcase { struct usb_ctrlrequest setup; int number; int expected; }; static void ctrl_complete(struct urb *urb) { struct ctrl_ctx *ctx = urb->context; struct usb_ctrlrequest *reqp; struct subcase *subcase; int status = urb->status; unsigned long flags; reqp = (struct usb_ctrlrequest *)urb->setup_packet; subcase = container_of(reqp, struct subcase, setup); spin_lock_irqsave(&ctx->lock, flags); ctx->count--; ctx->pending--; /* queue must transfer and complete in fifo order, unless * usb_unlink_urb() is used to unlink something not at the * physical queue head (not tested). */ if (subcase->number > 0) { if ((subcase->number - ctx->last) != 1) { ERROR(ctx->dev, "subcase %d completed out of order, last %d\n", subcase->number, ctx->last); status = -EDOM; ctx->last = subcase->number; goto error; } } ctx->last = subcase->number; /* succeed or fault in only one way? */ if (status == subcase->expected) status = 0; /* async unlink for cleanup? */ else if (status != -ECONNRESET) { /* some faults are allowed, not required */ if (subcase->expected > 0 && ( ((status == -subcase->expected /* happened */ || status == 0)))) /* didn't */ status = 0; /* sometimes more than one fault is allowed */ else if (subcase->number == 12 && status == -EPIPE) status = 0; else ERROR(ctx->dev, "subtest %d error, status %d\n", subcase->number, status); } /* unexpected status codes mean errors; ideally, in hardware */ if (status) { error: if (ctx->status == 0) { int i; ctx->status = status; ERROR(ctx->dev, "control queue %02x.%02x, err %d, " "%d left, subcase %d, len %d/%d\n", reqp->bRequestType, reqp->bRequest, status, ctx->count, subcase->number, urb->actual_length, urb->transfer_buffer_length); /* FIXME this "unlink everything" exit route should * be a separate test case. */ /* unlink whatever's still pending */ for (i = 1; i < ctx->param->sglen; i++) { struct urb *u = ctx->urb[ (i + subcase->number) % ctx->param->sglen]; if (u == urb || !u->dev) continue; spin_unlock(&ctx->lock); status = usb_unlink_urb(u); spin_lock(&ctx->lock); switch (status) { case -EINPROGRESS: case -EBUSY: case -EIDRM: continue; default: ERROR(ctx->dev, "urb unlink --> %d\n", status); } } status = ctx->status; } } /* resubmit if we need to, else mark this as done */ if ((status == 0) && (ctx->pending < ctx->count)) { status = usb_submit_urb(urb, GFP_ATOMIC); if (status != 0) { ERROR(ctx->dev, "can't resubmit ctrl %02x.%02x, err %d\n", reqp->bRequestType, reqp->bRequest, status); urb->dev = NULL; } else ctx->pending++; } else urb->dev = NULL; /* signal completion when nothing's queued */ if (ctx->pending == 0) complete(&ctx->complete); spin_unlock_irqrestore(&ctx->lock, flags); } static int test_ctrl_queue(struct usbtest_dev *dev, struct usbtest_param_32 *param) { struct usb_device *udev = testdev_to_usbdev(dev); struct urb **urb; struct ctrl_ctx context; int i; if (param->sglen == 0 || param->iterations > UINT_MAX / param->sglen) return -EOPNOTSUPP; spin_lock_init(&context.lock); context.dev = dev; init_completion(&context.complete); context.count = param->sglen * param->iterations; context.pending = 0; context.status = -ENOMEM; context.param = param; context.last = -1; /* allocate and init the urbs we'll queue. * as with bulk/intr sglists, sglen is the queue depth; it also * controls which subtests run (more tests than sglen) or rerun. */ urb = kcalloc(param->sglen, sizeof(struct urb *), GFP_KERNEL); if (!urb) return -ENOMEM; for (i = 0; i < param->sglen; i++) { int pipe = usb_rcvctrlpipe(udev, 0); unsigned len; struct urb *u; struct usb_ctrlrequest req; struct subcase *reqp; /* sign of this variable means: * -: tested code must return this (negative) error code * +: tested code may return this (negative too) error code */ int expected = 0; /* requests here are mostly expected to succeed on any * device, but some are chosen to trigger protocol stalls * or short reads. */ memset(&req, 0, sizeof(req)); req.bRequest = USB_REQ_GET_DESCRIPTOR; req.bRequestType = USB_DIR_IN|USB_RECIP_DEVICE; switch (i % NUM_SUBCASES) { case 0: /* get device descriptor */ req.wValue = cpu_to_le16(USB_DT_DEVICE << 8); len = sizeof(struct usb_device_descriptor); break; case 1: /* get first config descriptor (only) */ req.wValue = cpu_to_le16((USB_DT_CONFIG << 8) | 0); len = sizeof(struct usb_config_descriptor); break; case 2: /* get altsetting (OFTEN STALLS) */ req.bRequest = USB_REQ_GET_INTERFACE; req.bRequestType = USB_DIR_IN|USB_RECIP_INTERFACE; /* index = 0 means first interface */ len = 1; expected = EPIPE; break; case 3: /* get interface status */ req.bRequest = USB_REQ_GET_STATUS; req.bRequestType = USB_DIR_IN|USB_RECIP_INTERFACE; /* interface 0 */ len = 2; break; case 4: /* get device status */ req.bRequest = USB_REQ_GET_STATUS; req.bRequestType = USB_DIR_IN|USB_RECIP_DEVICE; len = 2; break; case 5: /* get device qualifier (MAY STALL) */ req.wValue = cpu_to_le16 (USB_DT_DEVICE_QUALIFIER << 8); len = sizeof(struct usb_qualifier_descriptor); if (udev->speed != USB_SPEED_HIGH) expected = EPIPE; break; case 6: /* get first config descriptor, plus interface */ req.wValue = cpu_to_le16((USB_DT_CONFIG << 8) | 0); len = sizeof(struct usb_config_descriptor); len += sizeof(struct usb_interface_descriptor); break; case 7: /* get interface descriptor (ALWAYS STALLS) */ req.wValue = cpu_to_le16 (USB_DT_INTERFACE << 8); /* interface == 0 */ len = sizeof(struct usb_interface_descriptor); expected = -EPIPE; break; /* NOTE: two consecutive stalls in the queue here. * that tests fault recovery a bit more aggressively. */ case 8: /* clear endpoint halt (MAY STALL) */ req.bRequest = USB_REQ_CLEAR_FEATURE; req.bRequestType = USB_RECIP_ENDPOINT; /* wValue 0 == ep halt */ /* wIndex 0 == ep0 (shouldn't halt!) */ len = 0; pipe = usb_sndctrlpipe(udev, 0); expected = EPIPE; break; case 9: /* get endpoint status */ req.bRequest = USB_REQ_GET_STATUS; req.bRequestType = USB_DIR_IN|USB_RECIP_ENDPOINT; /* endpoint 0 */ len = 2; break; case 10: /* trigger short read (EREMOTEIO) */ req.wValue = cpu_to_le16((USB_DT_CONFIG << 8) | 0); len = 1024; expected = -EREMOTEIO; break; /* NOTE: two consecutive _different_ faults in the queue. */ case 11: /* get endpoint descriptor (ALWAYS STALLS) */ req.wValue = cpu_to_le16(USB_DT_ENDPOINT << 8); /* endpoint == 0 */ len = sizeof(struct usb_interface_descriptor); expected = EPIPE; break; /* NOTE: sometimes even a third fault in the queue! */ case 12: /* get string 0 descriptor (MAY STALL) */ req.wValue = cpu_to_le16(USB_DT_STRING << 8); /* string == 0, for language IDs */ len = sizeof(struct usb_interface_descriptor); /* may succeed when > 4 languages */ expected = EREMOTEIO; /* or EPIPE, if no strings */ break; case 13: /* short read, resembling case 10 */ req.wValue = cpu_to_le16((USB_DT_CONFIG << 8) | 0); /* last data packet "should" be DATA1, not DATA0 */ if (udev->speed == USB_SPEED_SUPER) len = 1024 - 512; else len = 1024 - udev->descriptor.bMaxPacketSize0; expected = -EREMOTEIO; break; case 14: /* short read; try to fill the last packet */ req.wValue = cpu_to_le16((USB_DT_DEVICE << 8) | 0); /* device descriptor size == 18 bytes */ len = udev->descriptor.bMaxPacketSize0; if (udev->speed == USB_SPEED_SUPER) len = 512; switch (len) { case 8: len = 24; break; case 16: len = 32; break; } expected = -EREMOTEIO; break; case 15: req.wValue = cpu_to_le16(USB_DT_BOS << 8); if (udev->bos) len = le16_to_cpu(udev->bos->desc->wTotalLength); else len = sizeof(struct usb_bos_descriptor); if (le16_to_cpu(udev->descriptor.bcdUSB) < 0x0201) expected = -EPIPE; break; default: ERROR(dev, "bogus number of ctrl queue testcases!\n"); context.status = -EINVAL; goto cleanup; } req.wLength = cpu_to_le16(len); urb[i] = u = simple_alloc_urb(udev, pipe, len, 0); if (!u) goto cleanup; reqp = kmalloc(sizeof(*reqp), GFP_KERNEL); if (!reqp) goto cleanup; reqp->setup = req; reqp->number = i % NUM_SUBCASES; reqp->expected = expected; u->setup_packet = (char *) &reqp->setup; u->context = &context; u->complete = ctrl_complete; } /* queue the urbs */ context.urb = urb; spin_lock_irq(&context.lock); for (i = 0; i < param->sglen; i++) { context.status = usb_submit_urb(urb[i], GFP_ATOMIC); if (context.status != 0) { ERROR(dev, "can't submit urb[%d], status %d\n", i, context.status); context.count = context.pending; break; } context.pending++; } spin_unlock_irq(&context.lock); /* FIXME set timer and time out; provide a disconnect hook */ /* wait for the last one to complete */ if (context.pending > 0) wait_for_completion(&context.complete); cleanup: for (i = 0; i < param->sglen; i++) { if (!urb[i]) continue; urb[i]->dev = udev; kfree(urb[i]->setup_packet); simple_free_urb(urb[i]); } kfree(urb); return context.status; } #undef NUM_SUBCASES /*-------------------------------------------------------------------------*/ static void unlink1_callback(struct urb *urb) { int status = urb->status; /* we "know" -EPIPE (stall) never happens */ if (!status) status = usb_submit_urb(urb, GFP_ATOMIC); if (status) { urb->status = status; complete(urb->context); } } static int unlink1(struct usbtest_dev *dev, int pipe, int size, int async) { struct urb *urb; struct completion completion; int retval = 0; init_completion(&completion); urb = simple_alloc_urb(testdev_to_usbdev(dev), pipe, size, 0); if (!urb) return -ENOMEM; urb->context = &completion; urb->complete = unlink1_callback; if (usb_pipeout(urb->pipe)) { simple_fill_buf(urb); urb->transfer_flags |= URB_ZERO_PACKET; } /* keep the endpoint busy. there are lots of hc/hcd-internal * states, and testing should get to all of them over time. * * FIXME want additional tests for when endpoint is STALLing * due to errors, or is just NAKing requests. */ retval = usb_submit_urb(urb, GFP_KERNEL); if (retval != 0) { dev_err(&dev->intf->dev, "submit fail %d\n", retval); return retval; } /* unlinking that should always work. variable delay tests more * hcd states and code paths, even with little other system load. */ msleep(jiffies % (2 * INTERRUPT_RATE)); if (async) { while (!completion_done(&completion)) { retval = usb_unlink_urb(urb); if (retval == 0 && usb_pipein(urb->pipe)) retval = simple_check_buf(dev, urb); switch (retval) { case -EBUSY: case -EIDRM: /* we can't unlink urbs while they're completing * or if they've completed, and we haven't * resubmitted. "normal" drivers would prevent * resubmission, but since we're testing unlink * paths, we can't. */ ERROR(dev, "unlink retry\n"); continue; case 0: case -EINPROGRESS: break; default: dev_err(&dev->intf->dev, "unlink fail %d\n", retval); return retval; } break; } } else usb_kill_urb(urb); wait_for_completion(&completion); retval = urb->status; simple_free_urb(urb); if (async) return (retval == -ECONNRESET) ? 0 : retval - 1000; else return (retval == -ENOENT || retval == -EPERM) ? 0 : retval - 2000; } static int unlink_simple(struct usbtest_dev *dev, int pipe, int len) { int retval = 0; /* test sync and async paths */ retval = unlink1(dev, pipe, len, 1); if (!retval) retval = unlink1(dev, pipe, len, 0); return retval; } /*-------------------------------------------------------------------------*/ struct queued_ctx { struct completion complete; atomic_t pending; unsigned num; int status; struct urb **urbs; }; static void unlink_queued_callback(struct urb *urb) { int status = urb->status; struct queued_ctx *ctx = urb->context; if (ctx->status) goto done; if (urb == ctx->urbs[ctx->num - 4] || urb == ctx->urbs[ctx->num - 2]) { if (status == -ECONNRESET) goto done; /* What error should we report if the URB completed normally? */ } if (status != 0) ctx->status = status; done: if (atomic_dec_and_test(&ctx->pending)) complete(&ctx->complete); } static int unlink_queued(struct usbtest_dev *dev, int pipe, unsigned num, unsigned size) { struct queued_ctx ctx; struct usb_device *udev = testdev_to_usbdev(dev); void *buf; dma_addr_t buf_dma; int i; int retval = -ENOMEM; init_completion(&ctx.complete); atomic_set(&ctx.pending, 1); /* One more than the actual value */ ctx.num = num; ctx.status = 0; buf = usb_alloc_coherent(udev, size, GFP_KERNEL, &buf_dma); if (!buf) return retval; memset(buf, 0, size); /* Allocate and init the urbs we'll queue */ ctx.urbs = kcalloc(num, sizeof(struct urb *), GFP_KERNEL); if (!ctx.urbs) goto free_buf; for (i = 0; i < num; i++) { ctx.urbs[i] = usb_alloc_urb(0, GFP_KERNEL); if (!ctx.urbs[i]) goto free_urbs; usb_fill_bulk_urb(ctx.urbs[i], udev, pipe, buf, size, unlink_queued_callback, &ctx); ctx.urbs[i]->transfer_dma = buf_dma; ctx.urbs[i]->transfer_flags = URB_NO_TRANSFER_DMA_MAP; if (usb_pipeout(ctx.urbs[i]->pipe)) { simple_fill_buf(ctx.urbs[i]); ctx.urbs[i]->transfer_flags |= URB_ZERO_PACKET; } } /* Submit all the URBs and then unlink URBs num - 4 and num - 2. */ for (i = 0; i < num; i++) { atomic_inc(&ctx.pending); retval = usb_submit_urb(ctx.urbs[i], GFP_KERNEL); if (retval != 0) { dev_err(&dev->intf->dev, "submit urbs[%d] fail %d\n", i, retval); atomic_dec(&ctx.pending); ctx.status = retval; break; } } if (i == num) { usb_unlink_urb(ctx.urbs[num - 4]); usb_unlink_urb(ctx.urbs[num - 2]); } else { while (--i >= 0) usb_unlink_urb(ctx.urbs[i]); } if (atomic_dec_and_test(&ctx.pending)) /* The extra count */ complete(&ctx.complete); wait_for_completion(&ctx.complete); retval = ctx.status; free_urbs: for (i = 0; i < num; i++) usb_free_urb(ctx.urbs[i]); kfree(ctx.urbs); free_buf: usb_free_coherent(udev, size, buf, buf_dma); return retval; } /*-------------------------------------------------------------------------*/ static int verify_not_halted(struct usbtest_dev *tdev, int ep, struct urb *urb) { int retval; u16 status; /* shouldn't look or act halted */ retval = usb_get_std_status(urb->dev, USB_RECIP_ENDPOINT, ep, &status); if (retval < 0) { ERROR(tdev, "ep %02x couldn't get no-halt status, %d\n", ep, retval); return retval; } if (status != 0) { ERROR(tdev, "ep %02x bogus status: %04x != 0\n", ep, status); return -EINVAL; } retval = simple_io(tdev, urb, 1, 0, 0, __func__); if (retval != 0) return -EINVAL; return 0; } static int verify_halted(struct usbtest_dev *tdev, int ep, struct urb *urb) { int retval; u16 status; /* should look and act halted */ retval = usb_get_std_status(urb->dev, USB_RECIP_ENDPOINT, ep, &status); if (retval < 0) { ERROR(tdev, "ep %02x couldn't get halt status, %d\n", ep, retval); return retval; } if (status != 1) { ERROR(tdev, "ep %02x bogus status: %04x != 1\n", ep, status); return -EINVAL; } retval = simple_io(tdev, urb, 1, 0, -EPIPE, __func__); if (retval != -EPIPE) return -EINVAL; retval = simple_io(tdev, urb, 1, 0, -EPIPE, "verify_still_halted"); if (retval != -EPIPE) return -EINVAL; return 0; } static int test_halt(struct usbtest_dev *tdev, int ep, struct urb *urb) { int retval; /* shouldn't look or act halted now */ retval = verify_not_halted(tdev, ep, urb); if (retval < 0) return retval; /* set halt (protocol test only), verify it worked */ retval = usb_control_msg(urb->dev, usb_sndctrlpipe(urb->dev, 0), USB_REQ_SET_FEATURE, USB_RECIP_ENDPOINT, USB_ENDPOINT_HALT, ep, NULL, 0, USB_CTRL_SET_TIMEOUT); if (retval < 0) { ERROR(tdev, "ep %02x couldn't set halt, %d\n", ep, retval); return retval; } retval = verify_halted(tdev, ep, urb); if (retval < 0) { int ret; /* clear halt anyways, else further tests will fail */ ret = usb_clear_halt(urb->dev, urb->pipe); if (ret) ERROR(tdev, "ep %02x couldn't clear halt, %d\n", ep, ret); return retval; } /* clear halt (tests API + protocol), verify it worked */ retval = usb_clear_halt(urb->dev, urb->pipe); if (retval < 0) { ERROR(tdev, "ep %02x couldn't clear halt, %d\n", ep, retval); return retval; } retval = verify_not_halted(tdev, ep, urb); if (retval < 0) return retval; /* NOTE: could also verify SET_INTERFACE clear halts ... */ return 0; } static int test_toggle_sync(struct usbtest_dev *tdev, int ep, struct urb *urb) { int retval; /* clear initial data toggle to DATA0 */ retval = usb_clear_halt(urb->dev, urb->pipe); if (retval < 0) { ERROR(tdev, "ep %02x couldn't clear halt, %d\n", ep, retval); return retval; } /* transfer 3 data packets, should be DATA0, DATA1, DATA0 */ retval = simple_io(tdev, urb, 1, 0, 0, __func__); if (retval != 0) return -EINVAL; /* clear halt resets device side data toggle, host should react to it */ retval = usb_clear_halt(urb->dev, urb->pipe); if (retval < 0) { ERROR(tdev, "ep %02x couldn't clear halt, %d\n", ep, retval); return retval; } /* host should use DATA0 again after clear halt */ retval = simple_io(tdev, urb, 1, 0, 0, __func__); return retval; } static int halt_simple(struct usbtest_dev *dev) { int ep; int retval = 0; struct urb *urb; struct usb_device *udev = testdev_to_usbdev(dev); if (udev->speed == USB_SPEED_SUPER) urb = simple_alloc_urb(udev, 0, 1024, 0); else urb = simple_alloc_urb(udev, 0, 512, 0); if (urb == NULL) return -ENOMEM; if (dev->in_pipe) { ep = usb_pipeendpoint(dev->in_pipe) | USB_DIR_IN; urb->pipe = dev->in_pipe; retval = test_halt(dev, ep, urb); if (retval < 0) goto done; } if (dev->out_pipe) { ep = usb_pipeendpoint(dev->out_pipe); urb->pipe = dev->out_pipe; retval = test_halt(dev, ep, urb); } done: simple_free_urb(urb); return retval; } static int toggle_sync_simple(struct usbtest_dev *dev) { int ep; int retval = 0; struct urb *urb; struct usb_device *udev = testdev_to_usbdev(dev); unsigned maxp = get_maxpacket(udev, dev->out_pipe); /* * Create a URB that causes a transfer of uneven amount of data packets * This way the clear toggle has an impact on the data toggle sequence. * Use 2 maxpacket length packets and one zero packet. */ urb = simple_alloc_urb(udev, 0, 2 * maxp, 0); if (urb == NULL) return -ENOMEM; urb->transfer_flags |= URB_ZERO_PACKET; ep = usb_pipeendpoint(dev->out_pipe); urb->pipe = dev->out_pipe; retval = test_toggle_sync(dev, ep, urb); simple_free_urb(urb); return retval; } /*-------------------------------------------------------------------------*/ /* Control OUT tests use the vendor control requests from Intel's * USB 2.0 compliance test device: write a buffer, read it back. * * Intel's spec only _requires_ that it work for one packet, which * is pretty weak. Some HCDs place limits here; most devices will * need to be able to handle more than one OUT data packet. We'll * try whatever we're told to try. */ static int ctrl_out(struct usbtest_dev *dev, unsigned count, unsigned length, unsigned vary, unsigned offset) { unsigned i, j, len; int retval; u8 *buf; char *what = "?"; struct usb_device *udev; if (length < 1 || length > 0xffff || vary >= length) return -EINVAL; buf = kmalloc(length + offset, GFP_KERNEL); if (!buf) return -ENOMEM; buf += offset; udev = testdev_to_usbdev(dev); len = length; retval = 0; /* NOTE: hardware might well act differently if we pushed it * with lots back-to-back queued requests. */ for (i = 0; i < count; i++) { /* write patterned data */ for (j = 0; j < len; j++) buf[j] = (u8)(i + j); retval = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), 0x5b, USB_DIR_OUT|USB_TYPE_VENDOR, 0, 0, buf, len, USB_CTRL_SET_TIMEOUT); if (retval != len) { what = "write"; if (retval >= 0) { ERROR(dev, "ctrl_out, wlen %d (expected %d)\n", retval, len); retval = -EBADMSG; } break; } /* read it back -- assuming nothing intervened!! */ retval = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), 0x5c, USB_DIR_IN|USB_TYPE_VENDOR, 0, 0, buf, len, USB_CTRL_GET_TIMEOUT); if (retval != len) { what = "read"; if (retval >= 0) { ERROR(dev, "ctrl_out, rlen %d (expected %d)\n", retval, len); retval = -EBADMSG; } break; } /* fail if we can't verify */ for (j = 0; j < len; j++) { if (buf[j] != (u8)(i + j)) { ERROR(dev, "ctrl_out, byte %d is %d not %d\n", j, buf[j], (u8)(i + j)); retval = -EBADMSG; break; } } if (retval < 0) { what = "verify"; break; } len += vary; /* [real world] the "zero bytes IN" case isn't really used. * hardware can easily trip up in this weird case, since its * status stage is IN, not OUT like other ep0in transfers. */ if (len > length) len = realworld ? 1 : 0; } if (retval < 0) ERROR(dev, "ctrl_out %s failed, code %d, count %d\n", what, retval, i); kfree(buf - offset); return retval; } /*-------------------------------------------------------------------------*/ /* ISO/BULK tests ... mimics common usage * - buffer length is split into N packets (mostly maxpacket sized) * - multi-buffers according to sglen */ struct transfer_context { unsigned count; unsigned pending; spinlock_t lock; struct completion done; int submit_error; unsigned long errors; unsigned long packet_count; struct usbtest_dev *dev; bool is_iso; }; static void complicated_callback(struct urb *urb) { struct transfer_context *ctx = urb->context; unsigned long flags; spin_lock_irqsave(&ctx->lock, flags); ctx->count--; ctx->packet_count += urb->number_of_packets; if (urb->error_count > 0) ctx->errors += urb->error_count; else if (urb->status != 0) ctx->errors += (ctx->is_iso ? urb->number_of_packets : 1); else if (urb->actual_length != urb->transfer_buffer_length) ctx->errors++; else if (check_guard_bytes(ctx->dev, urb) != 0) ctx->errors++; if (urb->status == 0 && ctx->count > (ctx->pending - 1) && !ctx->submit_error) { int status = usb_submit_urb(urb, GFP_ATOMIC); switch (status) { case 0: goto done; default: dev_err(&ctx->dev->intf->dev, "resubmit err %d\n", status); fallthrough; case -ENODEV: /* disconnected */ case -ESHUTDOWN: /* endpoint disabled */ ctx->submit_error = 1; break; } } ctx->pending--; if (ctx->pending == 0) { if (ctx->errors) dev_err(&ctx->dev->intf->dev, "during the test, %lu errors out of %lu\n", ctx->errors, ctx->packet_count); complete(&ctx->done); } done: spin_unlock_irqrestore(&ctx->lock, flags); } static struct urb *iso_alloc_urb( struct usb_device *udev, int pipe, struct usb_endpoint_descriptor *desc, long bytes, unsigned offset ) { struct urb *urb; unsigned i, maxp, packets; if (bytes < 0 || !desc) return NULL; maxp = usb_endpoint_maxp(desc); if (udev->speed >= USB_SPEED_SUPER) maxp *= ss_isoc_get_packet_num(udev, pipe); else maxp *= usb_endpoint_maxp_mult(desc); packets = DIV_ROUND_UP(bytes, maxp); urb = usb_alloc_urb(packets, GFP_KERNEL); if (!urb) return urb; urb->dev = udev; urb->pipe = pipe; urb->number_of_packets = packets; urb->transfer_buffer_length = bytes; urb->transfer_buffer = usb_alloc_coherent(udev, bytes + offset, GFP_KERNEL, &urb->transfer_dma); if (!urb->transfer_buffer) { usb_free_urb(urb); return NULL; } if (offset) { memset(urb->transfer_buffer, GUARD_BYTE, offset); urb->transfer_buffer += offset; urb->transfer_dma += offset; } /* For inbound transfers use guard byte so that test fails if data not correctly copied */ memset(urb->transfer_buffer, usb_pipein(urb->pipe) ? GUARD_BYTE : 0, bytes); for (i = 0; i < packets; i++) { /* here, only the last packet will be short */ urb->iso_frame_desc[i].length = min((unsigned) bytes, maxp); bytes -= urb->iso_frame_desc[i].length; urb->iso_frame_desc[i].offset = maxp * i; } urb->complete = complicated_callback; /* urb->context = SET BY CALLER */ urb->interval = 1 << (desc->bInterval - 1); urb->transfer_flags = URB_ISO_ASAP | URB_NO_TRANSFER_DMA_MAP; return urb; } static int test_queue(struct usbtest_dev *dev, struct usbtest_param_32 *param, int pipe, struct usb_endpoint_descriptor *desc, unsigned offset) { struct transfer_context context; struct usb_device *udev; unsigned i; unsigned long packets = 0; int status = 0; struct urb **urbs; if (!param->sglen || param->iterations > UINT_MAX / param->sglen) return -EINVAL; if (param->sglen > MAX_SGLEN) return -EINVAL; urbs = kcalloc(param->sglen, sizeof(*urbs), GFP_KERNEL); if (!urbs) return -ENOMEM; memset(&context, 0, sizeof(context)); context.count = param->iterations * param->sglen; context.dev = dev; context.is_iso = !!desc; init_completion(&context.done); spin_lock_init(&context.lock); udev = testdev_to_usbdev(dev); for (i = 0; i < param->sglen; i++) { if (context.is_iso) urbs[i] = iso_alloc_urb(udev, pipe, desc, param->length, offset); else urbs[i] = complicated_alloc_urb(udev, pipe, param->length, 0); if (!urbs[i]) { status = -ENOMEM; goto fail; } packets += urbs[i]->number_of_packets; urbs[i]->context = &context; } packets *= param->iterations; if (context.is_iso) { int transaction_num; if (udev->speed >= USB_SPEED_SUPER) transaction_num = ss_isoc_get_packet_num(udev, pipe); else transaction_num = usb_endpoint_maxp_mult(desc); dev_info(&dev->intf->dev, "iso period %d %sframes, wMaxPacket %d, transactions: %d\n", 1 << (desc->bInterval - 1), (udev->speed >= USB_SPEED_HIGH) ? "micro" : "", usb_endpoint_maxp(desc), transaction_num); dev_info(&dev->intf->dev, "total %lu msec (%lu packets)\n", (packets * (1 << (desc->bInterval - 1))) / ((udev->speed >= USB_SPEED_HIGH) ? 8 : 1), packets); } spin_lock_irq(&context.lock); for (i = 0; i < param->sglen; i++) { ++context.pending; status = usb_submit_urb(urbs[i], GFP_ATOMIC); if (status < 0) { ERROR(dev, "submit iso[%d], error %d\n", i, status); if (i == 0) { spin_unlock_irq(&context.lock); goto fail; } simple_free_urb(urbs[i]); urbs[i] = NULL; context.pending--; context.submit_error = 1; break; } } spin_unlock_irq(&context.lock); wait_for_completion(&context.done); for (i = 0; i < param->sglen; i++) { if (urbs[i]) simple_free_urb(urbs[i]); } /* * Isochronous transfers are expected to fail sometimes. As an * arbitrary limit, we will report an error if any submissions * fail or if the transfer failure rate is > 10%. */ if (status != 0) ; else if (context.submit_error) status = -EACCES; else if (context.errors > (context.is_iso ? context.packet_count / 10 : 0)) status = -EIO; kfree(urbs); return status; fail: for (i = 0; i < param->sglen; i++) { if (urbs[i]) simple_free_urb(urbs[i]); } kfree(urbs); return status; } static int test_unaligned_bulk( struct usbtest_dev *tdev, int pipe, unsigned length, int iterations, unsigned transfer_flags, const char *label) { int retval; struct urb *urb = usbtest_alloc_urb(testdev_to_usbdev(tdev), pipe, length, transfer_flags, 1, 0, simple_callback); if (!urb) return -ENOMEM; retval = simple_io(tdev, urb, iterations, 0, 0, label); simple_free_urb(urb); return retval; } /* Run tests. */ static int usbtest_do_ioctl(struct usb_interface *intf, struct usbtest_param_32 *param) { struct usbtest_dev *dev = usb_get_intfdata(intf); struct usb_device *udev = testdev_to_usbdev(dev); struct urb *urb; struct scatterlist *sg; struct usb_sg_request req; unsigned i; int retval = -EOPNOTSUPP; if (param->iterations <= 0) return -EINVAL; if (param->sglen > MAX_SGLEN) return -EINVAL; /* * Just a bunch of test cases that every HCD is expected to handle. * * Some may need specific firmware, though it'd be good to have * one firmware image to handle all the test cases. * * FIXME add more tests! cancel requests, verify the data, control * queueing, concurrent read+write threads, and so on. */ switch (param->test_num) { case 0: dev_info(&intf->dev, "TEST 0: NOP\n"); retval = 0; break; /* Simple non-queued bulk I/O tests */ case 1: if (dev->out_pipe == 0) break; dev_info(&intf->dev, "TEST 1: write %d bytes %u times\n", param->length, param->iterations); urb = simple_alloc_urb(udev, dev->out_pipe, param->length, 0); if (!urb) { retval = -ENOMEM; break; } /* FIRMWARE: bulk sink (maybe accepts short writes) */ retval = simple_io(dev, urb, param->iterations, 0, 0, "test1"); simple_free_urb(urb); break; case 2: if (dev->in_pipe == 0) break; dev_info(&intf->dev, "TEST 2: read %d bytes %u times\n", param->length, param->iterations); urb = simple_alloc_urb(udev, dev->in_pipe, param->length, 0); if (!urb) { retval = -ENOMEM; break; } /* FIRMWARE: bulk source (maybe generates short writes) */ retval = simple_io(dev, urb, param->iterations, 0, 0, "test2"); simple_free_urb(urb); break; case 3: if (dev->out_pipe == 0 || param->vary == 0) break; dev_info(&intf->dev, "TEST 3: write/%d 0..%d bytes %u times\n", param->vary, param->length, param->iterations); urb = simple_alloc_urb(udev, dev->out_pipe, param->length, 0); if (!urb) { retval = -ENOMEM; break; } /* FIRMWARE: bulk sink (maybe accepts short writes) */ retval = simple_io(dev, urb, param->iterations, param->vary, 0, "test3"); simple_free_urb(urb); break; case 4: if (dev->in_pipe == 0 || param->vary == 0) break; dev_info(&intf->dev, "TEST 4: read/%d 0..%d bytes %u times\n", param->vary, param->length, param->iterations); urb = simple_alloc_urb(udev, dev->in_pipe, param->length, 0); if (!urb) { retval = -ENOMEM; break; } /* FIRMWARE: bulk source (maybe generates short writes) */ retval = simple_io(dev, urb, param->iterations, param->vary, 0, "test4"); simple_free_urb(urb); break; /* Queued bulk I/O tests */ case 5: if (dev->out_pipe == 0 || param->sglen == 0) break; dev_info(&intf->dev, "TEST 5: write %d sglists %d entries of %d bytes\n", param->iterations, param->sglen, param->length); sg = alloc_sglist(param->sglen, param->length, 0, dev, dev->out_pipe); if (!sg) { retval = -ENOMEM; break; } /* FIRMWARE: bulk sink (maybe accepts short writes) */ retval = perform_sglist(dev, param->iterations, dev->out_pipe, &req, sg, param->sglen); free_sglist(sg, param->sglen); break; case 6: if (dev->in_pipe == 0 || param->sglen == 0) break; dev_info(&intf->dev, "TEST 6: read %d sglists %d entries of %d bytes\n", param->iterations, param->sglen, param->length); sg = alloc_sglist(param->sglen, param->length, 0, dev, dev->in_pipe); if (!sg) { retval = -ENOMEM; break; } /* FIRMWARE: bulk source (maybe generates short writes) */ retval = perform_sglist(dev, param->iterations, dev->in_pipe, &req, sg, param->sglen); free_sglist(sg, param->sglen); break; case 7: if (dev->out_pipe == 0 || param->sglen == 0 || param->vary == 0) break; dev_info(&intf->dev, "TEST 7: write/%d %d sglists %d entries 0..%d bytes\n", param->vary, param->iterations, param->sglen, param->length); sg = alloc_sglist(param->sglen, param->length, param->vary, dev, dev->out_pipe); if (!sg) { retval = -ENOMEM; break; } /* FIRMWARE: bulk sink (maybe accepts short writes) */ retval = perform_sglist(dev, param->iterations, dev->out_pipe, &req, sg, param->sglen); free_sglist(sg, param->sglen); break; case 8: if (dev->in_pipe == 0 || param->sglen == 0 || param->vary == 0) break; dev_info(&intf->dev, "TEST 8: read/%d %d sglists %d entries 0..%d bytes\n", param->vary, param->iterations, param->sglen, param->length); sg = alloc_sglist(param->sglen, param->length, param->vary, dev, dev->in_pipe); if (!sg) { retval = -ENOMEM; break; } /* FIRMWARE: bulk source (maybe generates short writes) */ retval = perform_sglist(dev, param->iterations, dev->in_pipe, &req, sg, param->sglen); free_sglist(sg, param->sglen); break; /* non-queued sanity tests for control (chapter 9 subset) */ case 9: retval = 0; dev_info(&intf->dev, "TEST 9: ch9 (subset) control tests, %d times\n", param->iterations); for (i = param->iterations; retval == 0 && i--; /* NOP */) retval = ch9_postconfig(dev); if (retval) dev_err(&intf->dev, "ch9 subset failed, " "iterations left %d\n", i); break; /* queued control messaging */ case 10: retval = 0; dev_info(&intf->dev, "TEST 10: queue %d control calls, %d times\n", param->sglen, param->iterations); retval = test_ctrl_queue(dev, param); break; /* simple non-queued unlinks (ring with one urb) */ case 11: if (dev->in_pipe == 0 || !param->length) break; retval = 0; dev_info(&intf->dev, "TEST 11: unlink %d reads of %d\n", param->iterations, param->length); for (i = param->iterations; retval == 0 && i--; /* NOP */) retval = unlink_simple(dev, dev->in_pipe, param->length); if (retval) dev_err(&intf->dev, "unlink reads failed %d, " "iterations left %d\n", retval, i); break; case 12: if (dev->out_pipe == 0 || !param->length) break; retval = 0; dev_info(&intf->dev, "TEST 12: unlink %d writes of %d\n", param->iterations, param->length); for (i = param->iterations; retval == 0 && i--; /* NOP */) retval = unlink_simple(dev, dev->out_pipe, param->length); if (retval) dev_err(&intf->dev, "unlink writes failed %d, " "iterations left %d\n", retval, i); break; /* ep halt tests */ case 13: if (dev->out_pipe == 0 && dev->in_pipe == 0) break; retval = 0; dev_info(&intf->dev, "TEST 13: set/clear %d halts\n", param->iterations); for (i = param->iterations; retval == 0 && i--; /* NOP */) retval = halt_simple(dev); if (retval) ERROR(dev, "halts failed, iterations left %d\n", i); break; /* control write tests */ case 14: if (!dev->info->ctrl_out) break; dev_info(&intf->dev, "TEST 14: %d ep0out, %d..%d vary %d\n", param->iterations, realworld ? 1 : 0, param->length, param->vary); retval = ctrl_out(dev, param->iterations, param->length, param->vary, 0); break; /* iso write tests */ case 15: if (dev->out_iso_pipe == 0 || param->sglen == 0) break; dev_info(&intf->dev, "TEST 15: write %d iso, %d entries of %d bytes\n", param->iterations, param->sglen, param->length); /* FIRMWARE: iso sink */ retval = test_queue(dev, param, dev->out_iso_pipe, dev->iso_out, 0); break; /* iso read tests */ case 16: if (dev->in_iso_pipe == 0 || param->sglen == 0) break; dev_info(&intf->dev, "TEST 16: read %d iso, %d entries of %d bytes\n", param->iterations, param->sglen, param->length); /* FIRMWARE: iso source */ retval = test_queue(dev, param, dev->in_iso_pipe, dev->iso_in, 0); break; /* FIXME scatterlist cancel (needs helper thread) */ /* Tests for bulk I/O using DMA mapping by core and odd address */ case 17: if (dev->out_pipe == 0) break; dev_info(&intf->dev, "TEST 17: write odd addr %d bytes %u times core map\n", param->length, param->iterations); retval = test_unaligned_bulk( dev, dev->out_pipe, param->length, param->iterations, 0, "test17"); break; case 18: if (dev->in_pipe == 0) break; dev_info(&intf->dev, "TEST 18: read odd addr %d bytes %u times core map\n", param->length, param->iterations); retval = test_unaligned_bulk( dev, dev->in_pipe, param->length, param->iterations, 0, "test18"); break; /* Tests for bulk I/O using premapped coherent buffer and odd address */ case 19: if (dev->out_pipe == 0) break; dev_info(&intf->dev, "TEST 19: write odd addr %d bytes %u times premapped\n", param->length, param->iterations); retval = test_unaligned_bulk( dev, dev->out_pipe, param->length, param->iterations, URB_NO_TRANSFER_DMA_MAP, "test19"); break; case 20: if (dev->in_pipe == 0) break; dev_info(&intf->dev, "TEST 20: read odd addr %d bytes %u times premapped\n", param->length, param->iterations); retval = test_unaligned_bulk( dev, dev->in_pipe, param->length, param->iterations, URB_NO_TRANSFER_DMA_MAP, "test20"); break; /* control write tests with unaligned buffer */ case 21: if (!dev->info->ctrl_out) break; dev_info(&intf->dev, "TEST 21: %d ep0out odd addr, %d..%d vary %d\n", param->iterations, realworld ? 1 : 0, param->length, param->vary); retval = ctrl_out(dev, param->iterations, param->length, param->vary, 1); break; /* unaligned iso tests */ case 22: if (dev->out_iso_pipe == 0 || param->sglen == 0) break; dev_info(&intf->dev, "TEST 22: write %d iso odd, %d entries of %d bytes\n", param->iterations, param->sglen, param->length); retval = test_queue(dev, param, dev->out_iso_pipe, dev->iso_out, 1); break; case 23: if (dev->in_iso_pipe == 0 || param->sglen == 0) break; dev_info(&intf->dev, "TEST 23: read %d iso odd, %d entries of %d bytes\n", param->iterations, param->sglen, param->length); retval = test_queue(dev, param, dev->in_iso_pipe, dev->iso_in, 1); break; /* unlink URBs from a bulk-OUT queue */ case 24: if (dev->out_pipe == 0 || !param->length || param->sglen < 4) break; retval = 0; dev_info(&intf->dev, "TEST 24: unlink from %d queues of " "%d %d-byte writes\n", param->iterations, param->sglen, param->length); for (i = param->iterations; retval == 0 && i > 0; --i) { retval = unlink_queued(dev, dev->out_pipe, param->sglen, param->length); if (retval) { dev_err(&intf->dev, "unlink queued writes failed %d, " "iterations left %d\n", retval, i); break; } } break; /* Simple non-queued interrupt I/O tests */ case 25: if (dev->out_int_pipe == 0) break; dev_info(&intf->dev, "TEST 25: write %d bytes %u times\n", param->length, param->iterations); urb = simple_alloc_urb(udev, dev->out_int_pipe, param->length, dev->int_out->bInterval); if (!urb) { retval = -ENOMEM; break; } /* FIRMWARE: interrupt sink (maybe accepts short writes) */ retval = simple_io(dev, urb, param->iterations, 0, 0, "test25"); simple_free_urb(urb); break; case 26: if (dev->in_int_pipe == 0) break; dev_info(&intf->dev, "TEST 26: read %d bytes %u times\n", param->length, param->iterations); urb = simple_alloc_urb(udev, dev->in_int_pipe, param->length, dev->int_in->bInterval); if (!urb) { retval = -ENOMEM; break; } /* FIRMWARE: interrupt source (maybe generates short writes) */ retval = simple_io(dev, urb, param->iterations, 0, 0, "test26"); simple_free_urb(urb); break; case 27: /* We do performance test, so ignore data compare */ if (dev->out_pipe == 0 || param->sglen == 0 || pattern != 0) break; dev_info(&intf->dev, "TEST 27: bulk write %dMbytes\n", (param->iterations * param->sglen * param->length) / (1024 * 1024)); retval = test_queue(dev, param, dev->out_pipe, NULL, 0); break; case 28: if (dev->in_pipe == 0 || param->sglen == 0 || pattern != 0) break; dev_info(&intf->dev, "TEST 28: bulk read %dMbytes\n", (param->iterations * param->sglen * param->length) / (1024 * 1024)); retval = test_queue(dev, param, dev->in_pipe, NULL, 0); break; /* Test data Toggle/seq_nr clear between bulk out transfers */ case 29: if (dev->out_pipe == 0) break; retval = 0; dev_info(&intf->dev, "TEST 29: Clear toggle between bulk writes %d times\n", param->iterations); for (i = param->iterations; retval == 0 && i > 0; --i) retval = toggle_sync_simple(dev); if (retval) ERROR(dev, "toggle sync failed, iterations left %d\n", i); break; } return retval; } /*-------------------------------------------------------------------------*/ /* We only have this one interface to user space, through usbfs. * User mode code can scan usbfs to find N different devices (maybe on * different busses) to use when testing, and allocate one thread per * test. So discovery is simplified, and we have no device naming issues. * * Don't use these only as stress/load tests. Use them along with * other USB bus activity: plugging, unplugging, mousing, mp3 playback, * video capture, and so on. Run different tests at different times, in * different sequences. Nothing here should interact with other devices, * except indirectly by consuming USB bandwidth and CPU resources for test * threads and request completion. But the only way to know that for sure * is to test when HC queues are in use by many devices. * * WARNING: Because usbfs grabs udev->dev.sem before calling this ioctl(), * it locks out usbcore in certain code paths. Notably, if you disconnect * the device-under-test, hub_wq will wait block forever waiting for the * ioctl to complete ... so that usb_disconnect() can abort the pending * urbs and then call usbtest_disconnect(). To abort a test, you're best * off just killing the userspace task and waiting for it to exit. */ static int usbtest_ioctl(struct usb_interface *intf, unsigned int code, void *buf) { struct usbtest_dev *dev = usb_get_intfdata(intf); struct usbtest_param_64 *param_64 = buf; struct usbtest_param_32 temp; struct usbtest_param_32 *param_32 = buf; struct timespec64 start; struct timespec64 end; struct timespec64 duration; int retval = -EOPNOTSUPP; /* FIXME USBDEVFS_CONNECTINFO doesn't say how fast the device is. */ pattern = mod_pattern; if (mutex_lock_interruptible(&dev->lock)) return -ERESTARTSYS; /* FIXME: What if a system sleep starts while a test is running? */ /* some devices, like ez-usb default devices, need a non-default * altsetting to have any active endpoints. some tests change * altsettings; force a default so most tests don't need to check. */ if (dev->info->alt >= 0) { if (intf->altsetting->desc.bInterfaceNumber) { retval = -ENODEV; goto free_mutex; } retval = set_altsetting(dev, dev->info->alt); if (retval) { dev_err(&intf->dev, "set altsetting to %d failed, %d\n", dev->info->alt, retval); goto free_mutex; } } switch (code) { case USBTEST_REQUEST_64: temp.test_num = param_64->test_num; temp.iterations = param_64->iterations; temp.length = param_64->length; temp.sglen = param_64->sglen; temp.vary = param_64->vary; param_32 = &temp; break; case USBTEST_REQUEST_32: break; default: retval = -EOPNOTSUPP; goto free_mutex; } ktime_get_ts64(&start); retval = usbtest_do_ioctl(intf, param_32); if (retval < 0) goto free_mutex; ktime_get_ts64(&end); duration = timespec64_sub(end, start); temp.duration_sec = duration.tv_sec; temp.duration_usec = duration.tv_nsec/NSEC_PER_USEC; switch (code) { case USBTEST_REQUEST_32: param_32->duration_sec = temp.duration_sec; param_32->duration_usec = temp.duration_usec; break; case USBTEST_REQUEST_64: param_64->duration_sec = temp.duration_sec; param_64->duration_usec = temp.duration_usec; break; } free_mutex: mutex_unlock(&dev->lock); return retval; } /*-------------------------------------------------------------------------*/ static unsigned force_interrupt; module_param(force_interrupt, uint, 0); MODULE_PARM_DESC(force_interrupt, "0 = test default; else interrupt"); #ifdef GENERIC static unsigned short vendor; module_param(vendor, ushort, 0); MODULE_PARM_DESC(vendor, "vendor code (from usb-if)"); static unsigned short product; module_param(product, ushort, 0); MODULE_PARM_DESC(product, "product code (from vendor)"); #endif static int usbtest_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usb_device *udev; struct usbtest_dev *dev; struct usbtest_info *info; char *rtest, *wtest; char *irtest, *iwtest; char *intrtest, *intwtest; udev = interface_to_usbdev(intf); #ifdef GENERIC /* specify devices by module parameters? */ if (id->match_flags == 0) { /* vendor match required, product match optional */ if (!vendor || le16_to_cpu(udev->descriptor.idVendor) != (u16)vendor) return -ENODEV; if (product && le16_to_cpu(udev->descriptor.idProduct) != (u16)product) return -ENODEV; dev_info(&intf->dev, "matched module params, " "vend=0x%04x prod=0x%04x\n", le16_to_cpu(udev->descriptor.idVendor), le16_to_cpu(udev->descriptor.idProduct)); } #endif dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return -ENOMEM; info = (struct usbtest_info *) id->driver_info; dev->info = info; mutex_init(&dev->lock); dev->intf = intf; /* cacheline-aligned scratch for i/o */ dev->buf = kmalloc(TBUF_SIZE, GFP_KERNEL); if (dev->buf == NULL) { kfree(dev); return -ENOMEM; } /* NOTE this doesn't yet test the handful of difference that are * visible with high speed interrupts: bigger maxpacket (1K) and * "high bandwidth" modes (up to 3 packets/uframe). */ rtest = wtest = ""; irtest = iwtest = ""; intrtest = intwtest = ""; if (force_interrupt || udev->speed == USB_SPEED_LOW) { if (info->ep_in) { dev->in_pipe = usb_rcvintpipe(udev, info->ep_in); rtest = " intr-in"; } if (info->ep_out) { dev->out_pipe = usb_sndintpipe(udev, info->ep_out); wtest = " intr-out"; } } else { if (override_alt >= 0 || info->autoconf) { int status; status = get_endpoints(dev, intf); if (status < 0) { WARNING(dev, "couldn't get endpoints, %d\n", status); kfree(dev->buf); kfree(dev); return status; } /* may find bulk or ISO pipes */ } else { if (info->ep_in) dev->in_pipe = usb_rcvbulkpipe(udev, info->ep_in); if (info->ep_out) dev->out_pipe = usb_sndbulkpipe(udev, info->ep_out); } if (dev->in_pipe) rtest = " bulk-in"; if (dev->out_pipe) wtest = " bulk-out"; if (dev->in_iso_pipe) irtest = " iso-in"; if (dev->out_iso_pipe) iwtest = " iso-out"; if (dev->in_int_pipe) intrtest = " int-in"; if (dev->out_int_pipe) intwtest = " int-out"; } usb_set_intfdata(intf, dev); dev_info(&intf->dev, "%s\n", info->name); dev_info(&intf->dev, "%s {control%s%s%s%s%s%s%s} tests%s\n", usb_speed_string(udev->speed), info->ctrl_out ? " in/out" : "", rtest, wtest, irtest, iwtest, intrtest, intwtest, info->alt >= 0 ? " (+alt)" : ""); return 0; } static int usbtest_suspend(struct usb_interface *intf, pm_message_t message) { return 0; } static int usbtest_resume(struct usb_interface *intf) { return 0; } static void usbtest_disconnect(struct usb_interface *intf) { struct usbtest_dev *dev = usb_get_intfdata(intf); usb_set_intfdata(intf, NULL); dev_dbg(&intf->dev, "disconnect\n"); kfree(dev->buf); kfree(dev); } /* Basic testing only needs a device that can source or sink bulk traffic. * Any device can test control transfers (default with GENERIC binding). * * Several entries work with the default EP0 implementation that's built * into EZ-USB chips. There's a default vendor ID which can be overridden * by (very) small config EEPROMS, but otherwise all these devices act * identically until firmware is loaded: only EP0 works. It turns out * to be easy to make other endpoints work, without modifying that EP0 * behavior. For now, we expect that kind of firmware. */ /* an21xx or fx versions of ez-usb */ static struct usbtest_info ez1_info = { .name = "EZ-USB device", .ep_in = 2, .ep_out = 2, .alt = 1, }; /* fx2 version of ez-usb */ static struct usbtest_info ez2_info = { .name = "FX2 device", .ep_in = 6, .ep_out = 2, .alt = 1, }; /* ezusb family device with dedicated usb test firmware, */ static struct usbtest_info fw_info = { .name = "usb test device", .ep_in = 2, .ep_out = 2, .alt = 1, .autoconf = 1, /* iso and ctrl_out need autoconf */ .ctrl_out = 1, .iso = 1, /* iso_ep's are #8 in/out */ }; /* peripheral running Linux and 'zero.c' test firmware, or * its user-mode cousin. different versions of this use * different hardware with the same vendor/product codes. * host side MUST rely on the endpoint descriptors. */ static struct usbtest_info gz_info = { .name = "Linux gadget zero", .autoconf = 1, .ctrl_out = 1, .iso = 1, .intr = 1, .alt = 0, }; static struct usbtest_info um_info = { .name = "Linux user mode test driver", .autoconf = 1, .alt = -1, }; static struct usbtest_info um2_info = { .name = "Linux user mode ISO test driver", .autoconf = 1, .iso = 1, .alt = -1, }; #ifdef IBOT2 /* this is a nice source of high speed bulk data; * uses an FX2, with firmware provided in the device */ static struct usbtest_info ibot2_info = { .name = "iBOT2 webcam", .ep_in = 2, .alt = -1, }; #endif #ifdef GENERIC /* we can use any device to test control traffic */ static struct usbtest_info generic_info = { .name = "Generic USB device", .alt = -1, }; #endif static const struct usb_device_id id_table[] = { /*-------------------------------------------------------------*/ /* EZ-USB devices which download firmware to replace (or in our * case augment) the default device implementation. */ /* generic EZ-USB FX controller */ { USB_DEVICE(0x0547, 0x2235), .driver_info = (unsigned long) &ez1_info, }, /* CY3671 development board with EZ-USB FX */ { USB_DEVICE(0x0547, 0x0080), .driver_info = (unsigned long) &ez1_info, }, /* generic EZ-USB FX2 controller (or development board) */ { USB_DEVICE(0x04b4, 0x8613), .driver_info = (unsigned long) &ez2_info, }, /* re-enumerated usb test device firmware */ { USB_DEVICE(0xfff0, 0xfff0), .driver_info = (unsigned long) &fw_info, }, /* "Gadget Zero" firmware runs under Linux */ { USB_DEVICE(0x0525, 0xa4a0), .driver_info = (unsigned long) &gz_info, }, /* so does a user-mode variant */ { USB_DEVICE(0x0525, 0xa4a4), .driver_info = (unsigned long) &um_info, }, /* ... and a user-mode variant that talks iso */ { USB_DEVICE(0x0525, 0xa4a3), .driver_info = (unsigned long) &um2_info, }, #ifdef KEYSPAN_19Qi /* Keyspan 19qi uses an21xx (original EZ-USB) */ /* this does not coexist with the real Keyspan 19qi driver! */ { USB_DEVICE(0x06cd, 0x010b), .driver_info = (unsigned long) &ez1_info, }, #endif /*-------------------------------------------------------------*/ #ifdef IBOT2 /* iBOT2 makes a nice source of high speed bulk-in data */ /* this does not coexist with a real iBOT2 driver! */ { USB_DEVICE(0x0b62, 0x0059), .driver_info = (unsigned long) &ibot2_info, }, #endif /*-------------------------------------------------------------*/ #ifdef GENERIC /* module params can specify devices to use for control tests */ { .driver_info = (unsigned long) &generic_info, }, #endif /*-------------------------------------------------------------*/ { } }; MODULE_DEVICE_TABLE(usb, id_table); static struct usb_driver usbtest_driver = { .name = "usbtest", .id_table = id_table, .probe = usbtest_probe, .unlocked_ioctl = usbtest_ioctl, .disconnect = usbtest_disconnect, .suspend = usbtest_suspend, .resume = usbtest_resume, }; /*-------------------------------------------------------------------------*/ static int __init usbtest_init(void) { #ifdef GENERIC if (vendor) pr_debug("params: vend=0x%04x prod=0x%04x\n", vendor, product); #endif return usb_register(&usbtest_driver); } module_init(usbtest_init); static void __exit usbtest_exit(void) { usb_deregister(&usbtest_driver); } module_exit(usbtest_exit); MODULE_DESCRIPTION("USB Core/HCD Testing Driver"); MODULE_LICENSE("GPL"); |
| 68 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Definitions for the 'struct skb_array' datastructure. * * Author: * Michael S. Tsirkin <mst@redhat.com> * * Copyright (C) 2016 Red Hat, Inc. * * Limited-size FIFO of skbs. Can be used more or less whenever * sk_buff_head can be used, except you need to know the queue size in * advance. * Implemented as a type-safe wrapper around ptr_ring. */ #ifndef _LINUX_SKB_ARRAY_H #define _LINUX_SKB_ARRAY_H 1 #ifdef __KERNEL__ #include <linux/ptr_ring.h> #include <linux/skbuff.h> #include <linux/if_vlan.h> #endif struct skb_array { struct ptr_ring ring; }; /* Might be slightly faster than skb_array_full below, but callers invoking * this in a loop must use a compiler barrier, for example cpu_relax(). */ static inline bool __skb_array_full(struct skb_array *a) { return __ptr_ring_full(&a->ring); } static inline bool skb_array_full(struct skb_array *a) { return ptr_ring_full(&a->ring); } static inline int skb_array_produce(struct skb_array *a, struct sk_buff *skb) { return ptr_ring_produce(&a->ring, skb); } static inline int skb_array_produce_irq(struct skb_array *a, struct sk_buff *skb) { return ptr_ring_produce_irq(&a->ring, skb); } static inline int skb_array_produce_bh(struct skb_array *a, struct sk_buff *skb) { return ptr_ring_produce_bh(&a->ring, skb); } static inline int skb_array_produce_any(struct skb_array *a, struct sk_buff *skb) { return ptr_ring_produce_any(&a->ring, skb); } /* Might be slightly faster than skb_array_empty below, but only safe if the * array is never resized. Also, callers invoking this in a loop must take care * to use a compiler barrier, for example cpu_relax(). */ static inline bool __skb_array_empty(struct skb_array *a) { return __ptr_ring_empty(&a->ring); } static inline struct sk_buff *__skb_array_peek(struct skb_array *a) { return __ptr_ring_peek(&a->ring); } static inline bool skb_array_empty(struct skb_array *a) { return ptr_ring_empty(&a->ring); } static inline bool skb_array_empty_bh(struct skb_array *a) { return ptr_ring_empty_bh(&a->ring); } static inline bool skb_array_empty_irq(struct skb_array *a) { return ptr_ring_empty_irq(&a->ring); } static inline bool skb_array_empty_any(struct skb_array *a) { return ptr_ring_empty_any(&a->ring); } static inline struct sk_buff *__skb_array_consume(struct skb_array *a) { return __ptr_ring_consume(&a->ring); } static inline struct sk_buff *skb_array_consume(struct skb_array *a) { return ptr_ring_consume(&a->ring); } static inline int skb_array_consume_batched(struct skb_array *a, struct sk_buff **array, int n) { return ptr_ring_consume_batched(&a->ring, (void **)array, n); } static inline struct sk_buff *skb_array_consume_irq(struct skb_array *a) { return ptr_ring_consume_irq(&a->ring); } static inline int skb_array_consume_batched_irq(struct skb_array *a, struct sk_buff **array, int n) { return ptr_ring_consume_batched_irq(&a->ring, (void **)array, n); } static inline struct sk_buff *skb_array_consume_any(struct skb_array *a) { return ptr_ring_consume_any(&a->ring); } static inline int skb_array_consume_batched_any(struct skb_array *a, struct sk_buff **array, int n) { return ptr_ring_consume_batched_any(&a->ring, (void **)array, n); } static inline struct sk_buff *skb_array_consume_bh(struct skb_array *a) { return ptr_ring_consume_bh(&a->ring); } static inline int skb_array_consume_batched_bh(struct skb_array *a, struct sk_buff **array, int n) { return ptr_ring_consume_batched_bh(&a->ring, (void **)array, n); } static inline int __skb_array_len_with_tag(struct sk_buff *skb) { if (likely(skb)) { int len = skb->len; if (skb_vlan_tag_present(skb)) len += VLAN_HLEN; return len; } else { return 0; } } static inline int skb_array_peek_len(struct skb_array *a) { return PTR_RING_PEEK_CALL(&a->ring, __skb_array_len_with_tag); } static inline int skb_array_peek_len_irq(struct skb_array *a) { return PTR_RING_PEEK_CALL_IRQ(&a->ring, __skb_array_len_with_tag); } static inline int skb_array_peek_len_bh(struct skb_array *a) { return PTR_RING_PEEK_CALL_BH(&a->ring, __skb_array_len_with_tag); } static inline int skb_array_peek_len_any(struct skb_array *a) { return PTR_RING_PEEK_CALL_ANY(&a->ring, __skb_array_len_with_tag); } static inline int skb_array_init(struct skb_array *a, int size, gfp_t gfp) { return ptr_ring_init(&a->ring, size, gfp); } static void __skb_array_destroy_skb(void *ptr) { kfree_skb(ptr); } static inline void skb_array_unconsume(struct skb_array *a, struct sk_buff **skbs, int n) { ptr_ring_unconsume(&a->ring, (void **)skbs, n, __skb_array_destroy_skb); } static inline int skb_array_resize(struct skb_array *a, int size, gfp_t gfp) { return ptr_ring_resize(&a->ring, size, gfp, __skb_array_destroy_skb); } static inline int skb_array_resize_multiple(struct skb_array **rings, int nrings, unsigned int size, gfp_t gfp) { BUILD_BUG_ON(offsetof(struct skb_array, ring)); return ptr_ring_resize_multiple((struct ptr_ring **)rings, nrings, size, gfp, __skb_array_destroy_skb); } static inline void skb_array_cleanup(struct skb_array *a) { ptr_ring_cleanup(&a->ring, __skb_array_destroy_skb); } #endif /* _LINUX_SKB_ARRAY_H */ |
| 32 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 | /* * Copyright (C) 2014 Intel Corporation * * DRM universal plane helper functions * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include <linux/list.h> #include <drm/drm_atomic.h> #include <drm/drm_atomic_helper.h> #include <drm/drm_atomic_uapi.h> #include <drm/drm_device.h> #include <drm/drm_drv.h> #include <drm/drm_encoder.h> #include <drm/drm_plane_helper.h> #include <drm/drm_print.h> #include <drm/drm_rect.h> #define SUBPIXEL_MASK 0xffff /** * DOC: overview * * This helper library contains helpers to implement primary plane support on * top of the normal CRTC configuration interface. * Since the legacy &drm_mode_config_funcs.set_config interface ties the primary * plane together with the CRTC state this does not allow userspace to disable * the primary plane itself. The default primary plane only expose XRBG8888 and * ARGB8888 as valid pixel formats for the attached framebuffer. * * Drivers are highly recommended to implement proper support for primary * planes, and newly merged drivers must not rely upon these transitional * helpers. * * The plane helpers share the function table structures with other helpers, * specifically also the atomic helpers. See &struct drm_plane_helper_funcs for * the details. */ /* * Returns the connectors currently associated with a CRTC. This function * should be called twice: once with a NULL connector list to retrieve * the list size, and once with the properly allocated list to be filled in. */ static int get_connectors_for_crtc(struct drm_crtc *crtc, struct drm_connector **connector_list, int num_connectors) { struct drm_device *dev = crtc->dev; struct drm_connector *connector; struct drm_connector_list_iter conn_iter; int count = 0; /* * Note: Once we change the plane hooks to more fine-grained locking we * need to grab the connection_mutex here to be able to make these * checks. */ WARN_ON(!drm_modeset_is_locked(&dev->mode_config.connection_mutex)); drm_connector_list_iter_begin(dev, &conn_iter); drm_for_each_connector_iter(connector, &conn_iter) { if (connector->encoder && connector->encoder->crtc == crtc) { if (connector_list != NULL && count < num_connectors) *(connector_list++) = connector; count++; } } drm_connector_list_iter_end(&conn_iter); return count; } static int drm_plane_helper_check_update(struct drm_plane *plane, struct drm_crtc *crtc, struct drm_framebuffer *fb, struct drm_rect *src, struct drm_rect *dst, unsigned int rotation, int min_scale, int max_scale, bool can_position, bool can_update_disabled, bool *visible) { struct drm_plane_state plane_state = { .plane = plane, .crtc = crtc, .fb = fb, .src_x = src->x1, .src_y = src->y1, .src_w = drm_rect_width(src), .src_h = drm_rect_height(src), .crtc_x = dst->x1, .crtc_y = dst->y1, .crtc_w = drm_rect_width(dst), .crtc_h = drm_rect_height(dst), .rotation = rotation, }; struct drm_crtc_state crtc_state = { .crtc = crtc, .enable = crtc->enabled, .mode = crtc->mode, }; int ret; ret = drm_atomic_helper_check_plane_state(&plane_state, &crtc_state, min_scale, max_scale, can_position, can_update_disabled); if (ret) return ret; *src = plane_state.src; *dst = plane_state.dst; *visible = plane_state.visible; return 0; } /** * drm_plane_helper_update_primary - Helper for updating primary planes * @plane: plane to update * @crtc: the plane's new CRTC * @fb: the plane's new framebuffer * @crtc_x: x coordinate within CRTC * @crtc_y: y coordinate within CRTC * @crtc_w: width coordinate within CRTC * @crtc_h: height coordinate within CRTC * @src_x: x coordinate within source * @src_y: y coordinate within source * @src_w: width coordinate within source * @src_h: height coordinate within source * @ctx: modeset locking context * * This helper validates the given parameters and updates the primary plane. * * This function is only useful for non-atomic modesetting. Don't use * it in new drivers. * * Returns: * Zero on success, or an errno code otherwise. */ int drm_plane_helper_update_primary(struct drm_plane *plane, struct drm_crtc *crtc, struct drm_framebuffer *fb, int crtc_x, int crtc_y, unsigned int crtc_w, unsigned int crtc_h, uint32_t src_x, uint32_t src_y, uint32_t src_w, uint32_t src_h, struct drm_modeset_acquire_ctx *ctx) { struct drm_mode_set set = { .crtc = crtc, .fb = fb, .mode = &crtc->mode, .x = src_x >> 16, .y = src_y >> 16, }; struct drm_rect src = { .x1 = src_x, .y1 = src_y, .x2 = src_x + src_w, .y2 = src_y + src_h, }; struct drm_rect dest = { .x1 = crtc_x, .y1 = crtc_y, .x2 = crtc_x + crtc_w, .y2 = crtc_y + crtc_h, }; struct drm_device *dev = plane->dev; struct drm_connector **connector_list; int num_connectors, ret; bool visible; if (drm_WARN_ON_ONCE(dev, drm_drv_uses_atomic_modeset(dev))) return -EINVAL; ret = drm_plane_helper_check_update(plane, crtc, fb, &src, &dest, DRM_MODE_ROTATE_0, DRM_PLANE_NO_SCALING, DRM_PLANE_NO_SCALING, false, false, &visible); if (ret) return ret; if (!visible) /* * Primary plane isn't visible. Note that unless a driver * provides their own disable function, this will just * wind up returning -EINVAL to userspace. */ return plane->funcs->disable_plane(plane, ctx); /* Find current connectors for CRTC */ num_connectors = get_connectors_for_crtc(crtc, NULL, 0); BUG_ON(num_connectors == 0); connector_list = kcalloc(num_connectors, sizeof(*connector_list), GFP_KERNEL); if (!connector_list) return -ENOMEM; get_connectors_for_crtc(crtc, connector_list, num_connectors); set.connectors = connector_list; set.num_connectors = num_connectors; /* * We call set_config() directly here rather than using * drm_mode_set_config_internal. We're reprogramming the same * connectors that were already in use, so we shouldn't need the extra * cross-CRTC fb refcounting to accommodate stealing connectors. * drm_mode_setplane() already handles the basic refcounting for the * framebuffers involved in this operation. */ ret = crtc->funcs->set_config(&set, ctx); kfree(connector_list); return ret; } EXPORT_SYMBOL(drm_plane_helper_update_primary); /** * drm_plane_helper_disable_primary - Helper for disabling primary planes * @plane: plane to disable * @ctx: modeset locking context * * This helper returns an error when trying to disable the primary * plane. * * This function is only useful for non-atomic modesetting. Don't use * it in new drivers. * * Returns: * An errno code. */ int drm_plane_helper_disable_primary(struct drm_plane *plane, struct drm_modeset_acquire_ctx *ctx) { struct drm_device *dev = plane->dev; drm_WARN_ON_ONCE(dev, drm_drv_uses_atomic_modeset(dev)); return -EINVAL; } EXPORT_SYMBOL(drm_plane_helper_disable_primary); /** * drm_plane_helper_destroy() - Helper for primary plane destruction * @plane: plane to destroy * * Provides a default plane destroy handler for primary planes. This handler * is called during CRTC destruction. We disable the primary plane, remove * it from the DRM plane list, and deallocate the plane structure. */ void drm_plane_helper_destroy(struct drm_plane *plane) { drm_plane_cleanup(plane); kfree(plane); } EXPORT_SYMBOL(drm_plane_helper_destroy); /** * drm_plane_helper_atomic_check() - Helper to check plane atomic-state * @plane: plane to check * @state: atomic state object * * Provides a default plane-state check handler for planes whose atomic-state * scale and positioning are not expected to change since the plane is always * a fullscreen scanout buffer. * * This is often the case for the primary plane of simple framebuffers. See * also drm_crtc_helper_atomic_check() for the respective CRTC-state check * helper function. * * RETURNS: * Zero on success, or an errno code otherwise. */ int drm_plane_helper_atomic_check(struct drm_plane *plane, struct drm_atomic_state *state) { struct drm_plane_state *new_plane_state = drm_atomic_get_new_plane_state(state, plane); struct drm_crtc *new_crtc = new_plane_state->crtc; struct drm_crtc_state *new_crtc_state = NULL; if (new_crtc) new_crtc_state = drm_atomic_get_new_crtc_state(state, new_crtc); return drm_atomic_helper_check_plane_state(new_plane_state, new_crtc_state, DRM_PLANE_NO_SCALING, DRM_PLANE_NO_SCALING, false, false); } EXPORT_SYMBOL(drm_plane_helper_atomic_check); |
| 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 | // SPDX-License-Identifier: GPL-2.0+ #include <linux/iosys-map.h> #include <drm/drm_atomic.h> #include <drm/drm_atomic_helper.h> #include <drm/drm_blend.h> #include <drm/drm_fourcc.h> #include <drm/drm_gem_atomic_helper.h> #include <drm/drm_gem_framebuffer_helper.h> #include "vkms_drv.h" #include "vkms_formats.h" static const u32 vkms_formats[] = { DRM_FORMAT_ARGB8888, DRM_FORMAT_XRGB8888, DRM_FORMAT_XRGB16161616, DRM_FORMAT_ARGB16161616, DRM_FORMAT_RGB565 }; static struct drm_plane_state * vkms_plane_duplicate_state(struct drm_plane *plane) { struct vkms_plane_state *vkms_state; struct vkms_frame_info *frame_info; vkms_state = kzalloc(sizeof(*vkms_state), GFP_KERNEL); if (!vkms_state) return NULL; frame_info = kzalloc(sizeof(*frame_info), GFP_KERNEL); if (!frame_info) { DRM_DEBUG_KMS("Couldn't allocate frame_info\n"); kfree(vkms_state); return NULL; } vkms_state->frame_info = frame_info; __drm_gem_duplicate_shadow_plane_state(plane, &vkms_state->base); return &vkms_state->base.base; } static void vkms_plane_destroy_state(struct drm_plane *plane, struct drm_plane_state *old_state) { struct vkms_plane_state *vkms_state = to_vkms_plane_state(old_state); struct drm_crtc *crtc = vkms_state->base.base.crtc; if (crtc && vkms_state->frame_info->fb) { /* dropping the reference we acquired in * vkms_primary_plane_update() */ if (drm_framebuffer_read_refcount(vkms_state->frame_info->fb)) drm_framebuffer_put(vkms_state->frame_info->fb); } kfree(vkms_state->frame_info); vkms_state->frame_info = NULL; __drm_gem_destroy_shadow_plane_state(&vkms_state->base); kfree(vkms_state); } static void vkms_plane_reset(struct drm_plane *plane) { struct vkms_plane_state *vkms_state; if (plane->state) { vkms_plane_destroy_state(plane, plane->state); plane->state = NULL; /* must be set to NULL here */ } vkms_state = kzalloc(sizeof(*vkms_state), GFP_KERNEL); if (!vkms_state) { DRM_ERROR("Cannot allocate vkms_plane_state\n"); return; } __drm_gem_reset_shadow_plane(plane, &vkms_state->base); } static const struct drm_plane_funcs vkms_plane_funcs = { .update_plane = drm_atomic_helper_update_plane, .disable_plane = drm_atomic_helper_disable_plane, .reset = vkms_plane_reset, .atomic_duplicate_state = vkms_plane_duplicate_state, .atomic_destroy_state = vkms_plane_destroy_state, }; static void vkms_plane_atomic_update(struct drm_plane *plane, struct drm_atomic_state *state) { struct drm_plane_state *new_state = drm_atomic_get_new_plane_state(state, plane); struct vkms_plane_state *vkms_plane_state; struct drm_shadow_plane_state *shadow_plane_state; struct drm_framebuffer *fb = new_state->fb; struct vkms_frame_info *frame_info; u32 fmt; if (!new_state->crtc || !fb) return; fmt = fb->format->format; vkms_plane_state = to_vkms_plane_state(new_state); shadow_plane_state = &vkms_plane_state->base; frame_info = vkms_plane_state->frame_info; memcpy(&frame_info->src, &new_state->src, sizeof(struct drm_rect)); memcpy(&frame_info->dst, &new_state->dst, sizeof(struct drm_rect)); memcpy(&frame_info->rotated, &new_state->dst, sizeof(struct drm_rect)); frame_info->fb = fb; memcpy(&frame_info->map, &shadow_plane_state->data, sizeof(frame_info->map)); drm_framebuffer_get(frame_info->fb); frame_info->rotation = drm_rotation_simplify(new_state->rotation, DRM_MODE_ROTATE_0 | DRM_MODE_ROTATE_90 | DRM_MODE_ROTATE_270 | DRM_MODE_REFLECT_X | DRM_MODE_REFLECT_Y); drm_rect_rotate(&frame_info->rotated, drm_rect_width(&frame_info->rotated), drm_rect_height(&frame_info->rotated), frame_info->rotation); frame_info->offset = fb->offsets[0]; frame_info->pitch = fb->pitches[0]; frame_info->cpp = fb->format->cpp[0]; vkms_plane_state->pixel_read = get_pixel_conversion_function(fmt); } static int vkms_plane_atomic_check(struct drm_plane *plane, struct drm_atomic_state *state) { struct drm_plane_state *new_plane_state = drm_atomic_get_new_plane_state(state, plane); struct drm_crtc_state *crtc_state; int ret; if (!new_plane_state->fb || WARN_ON(!new_plane_state->crtc)) return 0; crtc_state = drm_atomic_get_crtc_state(state, new_plane_state->crtc); if (IS_ERR(crtc_state)) return PTR_ERR(crtc_state); ret = drm_atomic_helper_check_plane_state(new_plane_state, crtc_state, DRM_PLANE_NO_SCALING, DRM_PLANE_NO_SCALING, true, true); if (ret != 0) return ret; return 0; } static int vkms_prepare_fb(struct drm_plane *plane, struct drm_plane_state *state) { struct drm_shadow_plane_state *shadow_plane_state; struct drm_framebuffer *fb = state->fb; int ret; if (!fb) return 0; shadow_plane_state = to_drm_shadow_plane_state(state); ret = drm_gem_plane_helper_prepare_fb(plane, state); if (ret) return ret; return drm_gem_fb_vmap(fb, shadow_plane_state->map, shadow_plane_state->data); } static void vkms_cleanup_fb(struct drm_plane *plane, struct drm_plane_state *state) { struct drm_shadow_plane_state *shadow_plane_state; struct drm_framebuffer *fb = state->fb; if (!fb) return; shadow_plane_state = to_drm_shadow_plane_state(state); drm_gem_fb_vunmap(fb, shadow_plane_state->map); } static const struct drm_plane_helper_funcs vkms_plane_helper_funcs = { .atomic_update = vkms_plane_atomic_update, .atomic_check = vkms_plane_atomic_check, .prepare_fb = vkms_prepare_fb, .cleanup_fb = vkms_cleanup_fb, }; struct vkms_plane *vkms_plane_init(struct vkms_device *vkmsdev, enum drm_plane_type type, int index) { struct drm_device *dev = &vkmsdev->drm; struct vkms_plane *plane; plane = drmm_universal_plane_alloc(dev, struct vkms_plane, base, 1 << index, &vkms_plane_funcs, vkms_formats, ARRAY_SIZE(vkms_formats), NULL, type, NULL); if (IS_ERR(plane)) return plane; drm_plane_helper_add(&plane->base, &vkms_plane_helper_funcs); drm_plane_create_rotation_property(&plane->base, DRM_MODE_ROTATE_0, DRM_MODE_ROTATE_MASK | DRM_MODE_REFLECT_MASK); return plane; } |
| 927 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Directory notifications for Linux. * * Copyright (C) 2000,2001,2002 Stephen Rothwell * * Copyright (C) 2009 Eric Paris <Red Hat Inc> * dnotify was largly rewritten to use the new fsnotify infrastructure */ #include <linux/fs.h> #include <linux/module.h> #include <linux/sched.h> #include <linux/sched/signal.h> #include <linux/dnotify.h> #include <linux/init.h> #include <linux/security.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/fdtable.h> #include <linux/fsnotify_backend.h> static int dir_notify_enable __read_mostly = 1; #ifdef CONFIG_SYSCTL static struct ctl_table dnotify_sysctls[] = { { .procname = "dir-notify-enable", .data = &dir_notify_enable, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, {} }; static void __init dnotify_sysctl_init(void) { register_sysctl_init("fs", dnotify_sysctls); } #else #define dnotify_sysctl_init() do { } while (0) #endif static struct kmem_cache *dnotify_struct_cache __read_mostly; static struct kmem_cache *dnotify_mark_cache __read_mostly; static struct fsnotify_group *dnotify_group __read_mostly; /* * dnotify will attach one of these to each inode (i_fsnotify_marks) which * is being watched by dnotify. If multiple userspace applications are watching * the same directory with dnotify their information is chained in dn */ struct dnotify_mark { struct fsnotify_mark fsn_mark; struct dnotify_struct *dn; }; /* * When a process starts or stops watching an inode the set of events which * dnotify cares about for that inode may change. This function runs the * list of everything receiving dnotify events about this directory and calculates * the set of all those events. After it updates what dnotify is interested in * it calls the fsnotify function so it can update the set of all events relevant * to this inode. */ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark) { __u32 new_mask = 0; struct dnotify_struct *dn; struct dnotify_mark *dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); assert_spin_locked(&fsn_mark->lock); for (dn = dn_mark->dn; dn != NULL; dn = dn->dn_next) new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT); if (fsn_mark->mask == new_mask) return; fsn_mark->mask = new_mask; fsnotify_recalc_mask(fsn_mark->connector); } /* * Mains fsnotify call where events are delivered to dnotify. * Find the dnotify mark on the relevant inode, run the list of dnotify structs * on that mark and determine which of them has expressed interest in receiving * events of this type. When found send the correct process and signal and * destroy the dnotify struct if it was not registered to receive multiple * events. */ static int dnotify_handle_event(struct fsnotify_mark *inode_mark, u32 mask, struct inode *inode, struct inode *dir, const struct qstr *name, u32 cookie) { struct dnotify_mark *dn_mark; struct dnotify_struct *dn; struct dnotify_struct **prev; struct fown_struct *fown; __u32 test_mask = mask & ~FS_EVENT_ON_CHILD; /* not a dir, dnotify doesn't care */ if (!dir && !(mask & FS_ISDIR)) return 0; dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark); spin_lock(&inode_mark->lock); prev = &dn_mark->dn; while ((dn = *prev) != NULL) { if ((dn->dn_mask & test_mask) == 0) { prev = &dn->dn_next; continue; } fown = &dn->dn_filp->f_owner; send_sigio(fown, dn->dn_fd, POLL_MSG); if (dn->dn_mask & FS_DN_MULTISHOT) prev = &dn->dn_next; else { *prev = dn->dn_next; kmem_cache_free(dnotify_struct_cache, dn); dnotify_recalc_inode_mask(inode_mark); } } spin_unlock(&inode_mark->lock); return 0; } static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) { struct dnotify_mark *dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); BUG_ON(dn_mark->dn); kmem_cache_free(dnotify_mark_cache, dn_mark); } static const struct fsnotify_ops dnotify_fsnotify_ops = { .handle_inode_event = dnotify_handle_event, .free_mark = dnotify_free_mark, }; /* * Called every time a file is closed. Looks first for a dnotify mark on the * inode. If one is found run all of the ->dn structures attached to that * mark for one relevant to this process closing the file and remove that * dnotify_struct. If that was the last dnotify_struct also remove the * fsnotify_mark. */ void dnotify_flush(struct file *filp, fl_owner_t id) { struct fsnotify_mark *fsn_mark; struct dnotify_mark *dn_mark; struct dnotify_struct *dn; struct dnotify_struct **prev; struct inode *inode; bool free = false; inode = file_inode(filp); if (!S_ISDIR(inode->i_mode)) return; fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group); if (!fsn_mark) return; dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); fsnotify_group_lock(dnotify_group); spin_lock(&fsn_mark->lock); prev = &dn_mark->dn; while ((dn = *prev) != NULL) { if ((dn->dn_owner == id) && (dn->dn_filp == filp)) { *prev = dn->dn_next; kmem_cache_free(dnotify_struct_cache, dn); dnotify_recalc_inode_mask(fsn_mark); break; } prev = &dn->dn_next; } spin_unlock(&fsn_mark->lock); /* nothing else could have found us thanks to the dnotify_groups mark_mutex */ if (dn_mark->dn == NULL) { fsnotify_detach_mark(fsn_mark); free = true; } fsnotify_group_unlock(dnotify_group); if (free) fsnotify_free_mark(fsn_mark); fsnotify_put_mark(fsn_mark); } /* this conversion is done only at watch creation */ static __u32 convert_arg(unsigned int arg) { __u32 new_mask = FS_EVENT_ON_CHILD; if (arg & DN_MULTISHOT) new_mask |= FS_DN_MULTISHOT; if (arg & DN_DELETE) new_mask |= (FS_DELETE | FS_MOVED_FROM); if (arg & DN_MODIFY) new_mask |= FS_MODIFY; if (arg & DN_ACCESS) new_mask |= FS_ACCESS; if (arg & DN_ATTRIB) new_mask |= FS_ATTRIB; if (arg & DN_RENAME) new_mask |= FS_RENAME; if (arg & DN_CREATE) new_mask |= (FS_CREATE | FS_MOVED_TO); return new_mask; } /* * If multiple processes watch the same inode with dnotify there is only one * dnotify mark in inode->i_fsnotify_marks but we chain a dnotify_struct * onto that mark. This function either attaches the new dnotify_struct onto * that list, or it |= the mask onto an existing dnofiy_struct. */ static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark *dn_mark, fl_owner_t id, int fd, struct file *filp, __u32 mask) { struct dnotify_struct *odn; odn = dn_mark->dn; while (odn != NULL) { /* adding more events to existing dnofiy_struct? */ if ((odn->dn_owner == id) && (odn->dn_filp == filp)) { odn->dn_fd = fd; odn->dn_mask |= mask; return -EEXIST; } odn = odn->dn_next; } dn->dn_mask = mask; dn->dn_fd = fd; dn->dn_filp = filp; dn->dn_owner = id; dn->dn_next = dn_mark->dn; dn_mark->dn = dn; return 0; } /* * When a process calls fcntl to attach a dnotify watch to a directory it ends * up here. Allocate both a mark for fsnotify to add and a dnotify_struct to be * attached to the fsnotify_mark. */ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg) { struct dnotify_mark *new_dn_mark, *dn_mark; struct fsnotify_mark *new_fsn_mark, *fsn_mark; struct dnotify_struct *dn; struct inode *inode; fl_owner_t id = current->files; struct file *f; int destroy = 0, error = 0; __u32 mask; /* we use these to tell if we need to kfree */ new_fsn_mark = NULL; dn = NULL; if (!dir_notify_enable) { error = -EINVAL; goto out_err; } /* a 0 mask means we are explicitly removing the watch */ if ((arg & ~DN_MULTISHOT) == 0) { dnotify_flush(filp, id); error = 0; goto out_err; } /* dnotify only works on directories */ inode = file_inode(filp); if (!S_ISDIR(inode->i_mode)) { error = -ENOTDIR; goto out_err; } /* * convert the userspace DN_* "arg" to the internal FS_* * defined in fsnotify */ mask = convert_arg(arg); error = security_path_notify(&filp->f_path, mask, FSNOTIFY_OBJ_TYPE_INODE); if (error) goto out_err; /* expect most fcntl to add new rather than augment old */ dn = kmem_cache_alloc(dnotify_struct_cache, GFP_KERNEL); if (!dn) { error = -ENOMEM; goto out_err; } /* new fsnotify mark, we expect most fcntl calls to add a new mark */ new_dn_mark = kmem_cache_alloc(dnotify_mark_cache, GFP_KERNEL); if (!new_dn_mark) { error = -ENOMEM; goto out_err; } /* set up the new_fsn_mark and new_dn_mark */ new_fsn_mark = &new_dn_mark->fsn_mark; fsnotify_init_mark(new_fsn_mark, dnotify_group); new_fsn_mark->mask = mask; new_dn_mark->dn = NULL; /* this is needed to prevent the fcntl/close race described below */ fsnotify_group_lock(dnotify_group); /* add the new_fsn_mark or find an old one. */ fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group); if (fsn_mark) { dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); spin_lock(&fsn_mark->lock); } else { error = fsnotify_add_inode_mark_locked(new_fsn_mark, inode, 0); if (error) { fsnotify_group_unlock(dnotify_group); goto out_err; } spin_lock(&new_fsn_mark->lock); fsn_mark = new_fsn_mark; dn_mark = new_dn_mark; /* we used new_fsn_mark, so don't free it */ new_fsn_mark = NULL; } rcu_read_lock(); f = lookup_fd_rcu(fd); rcu_read_unlock(); /* if (f != filp) means that we lost a race and another task/thread * actually closed the fd we are still playing with before we grabbed * the dnotify_groups mark_mutex and fsn_mark->lock. Since closing the * fd is the only time we clean up the marks we need to get our mark * off the list. */ if (f != filp) { /* if we added ourselves, shoot ourselves, it's possible that * the flush actually did shoot this fsn_mark. That's fine too * since multiple calls to destroy_mark is perfectly safe, if * we found a dn_mark already attached to the inode, just sod * off silently as the flush at close time dealt with it. */ if (dn_mark == new_dn_mark) destroy = 1; error = 0; goto out; } __f_setown(filp, task_pid(current), PIDTYPE_TGID, 0); error = attach_dn(dn, dn_mark, id, fd, filp, mask); /* !error means that we attached the dn to the dn_mark, so don't free it */ if (!error) dn = NULL; /* -EEXIST means that we didn't add this new dn and used an old one. * that isn't an error (and the unused dn should be freed) */ else if (error == -EEXIST) error = 0; dnotify_recalc_inode_mask(fsn_mark); out: spin_unlock(&fsn_mark->lock); if (destroy) fsnotify_detach_mark(fsn_mark); fsnotify_group_unlock(dnotify_group); if (destroy) fsnotify_free_mark(fsn_mark); fsnotify_put_mark(fsn_mark); out_err: if (new_fsn_mark) fsnotify_put_mark(new_fsn_mark); if (dn) kmem_cache_free(dnotify_struct_cache, dn); return error; } static int __init dnotify_init(void) { dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC|SLAB_ACCOUNT); dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC|SLAB_ACCOUNT); dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops, FSNOTIFY_GROUP_NOFS); if (IS_ERR(dnotify_group)) panic("unable to allocate fsnotify group for dnotify\n"); dnotify_sysctl_init(); return 0; } module_init(dnotify_init) |
| 292 292 292 276 276 276 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Synchronous Cryptographic Hash operations. * * Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/scatterwalk.h> #include <linux/cryptouser.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/seq_file.h> #include <linux/string.h> #include <net/netlink.h> #include "hash.h" #define MAX_SHASH_ALIGNMASK 63 static const struct crypto_type crypto_shash_type; static inline struct crypto_istat_hash *shash_get_stat(struct shash_alg *alg) { return hash_get_stat(&alg->halg); } static inline int crypto_shash_errstat(struct shash_alg *alg, int err) { return crypto_hash_errstat(&alg->halg, err); } int shash_no_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) { return -ENOSYS; } EXPORT_SYMBOL_GPL(shash_no_setkey); static int shash_setkey_unaligned(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) { struct shash_alg *shash = crypto_shash_alg(tfm); unsigned long alignmask = crypto_shash_alignmask(tfm); unsigned long absize; u8 *buffer, *alignbuffer; int err; absize = keylen + (alignmask & ~(crypto_tfm_ctx_alignment() - 1)); buffer = kmalloc(absize, GFP_ATOMIC); if (!buffer) return -ENOMEM; alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); memcpy(alignbuffer, key, keylen); err = shash->setkey(tfm, alignbuffer, keylen); kfree_sensitive(buffer); return err; } static void shash_set_needkey(struct crypto_shash *tfm, struct shash_alg *alg) { if (crypto_shash_alg_needs_key(alg)) crypto_shash_set_flags(tfm, CRYPTO_TFM_NEED_KEY); } int crypto_shash_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) { struct shash_alg *shash = crypto_shash_alg(tfm); unsigned long alignmask = crypto_shash_alignmask(tfm); int err; if ((unsigned long)key & alignmask) err = shash_setkey_unaligned(tfm, key, keylen); else err = shash->setkey(tfm, key, keylen); if (unlikely(err)) { shash_set_needkey(tfm, shash); return err; } crypto_shash_clear_flags(tfm, CRYPTO_TFM_NEED_KEY); return 0; } EXPORT_SYMBOL_GPL(crypto_shash_setkey); static int shash_update_unaligned(struct shash_desc *desc, const u8 *data, unsigned int len) { struct crypto_shash *tfm = desc->tfm; struct shash_alg *shash = crypto_shash_alg(tfm); unsigned long alignmask = crypto_shash_alignmask(tfm); unsigned int unaligned_len = alignmask + 1 - ((unsigned long)data & alignmask); /* * We cannot count on __aligned() working for large values: * https://patchwork.kernel.org/patch/9507697/ */ u8 ubuf[MAX_SHASH_ALIGNMASK * 2]; u8 *buf = PTR_ALIGN(&ubuf[0], alignmask + 1); int err; if (WARN_ON(buf + unaligned_len > ubuf + sizeof(ubuf))) return -EINVAL; if (unaligned_len > len) unaligned_len = len; memcpy(buf, data, unaligned_len); err = shash->update(desc, buf, unaligned_len); memset(buf, 0, unaligned_len); return err ?: shash->update(desc, data + unaligned_len, len - unaligned_len); } int crypto_shash_update(struct shash_desc *desc, const u8 *data, unsigned int len) { struct crypto_shash *tfm = desc->tfm; struct shash_alg *shash = crypto_shash_alg(tfm); unsigned long alignmask = crypto_shash_alignmask(tfm); int err; if (IS_ENABLED(CONFIG_CRYPTO_STATS)) atomic64_add(len, &shash_get_stat(shash)->hash_tlen); if ((unsigned long)data & alignmask) err = shash_update_unaligned(desc, data, len); else err = shash->update(desc, data, len); return crypto_shash_errstat(shash, err); } EXPORT_SYMBOL_GPL(crypto_shash_update); static int shash_final_unaligned(struct shash_desc *desc, u8 *out) { struct crypto_shash *tfm = desc->tfm; unsigned long alignmask = crypto_shash_alignmask(tfm); struct shash_alg *shash = crypto_shash_alg(tfm); unsigned int ds = crypto_shash_digestsize(tfm); /* * We cannot count on __aligned() working for large values: * https://patchwork.kernel.org/patch/9507697/ */ u8 ubuf[MAX_SHASH_ALIGNMASK + HASH_MAX_DIGESTSIZE]; u8 *buf = PTR_ALIGN(&ubuf[0], alignmask + 1); int err; if (WARN_ON(buf + ds > ubuf + sizeof(ubuf))) return -EINVAL; err = shash->final(desc, buf); if (err) goto out; memcpy(out, buf, ds); out: memset(buf, 0, ds); return err; } int crypto_shash_final(struct shash_desc *desc, u8 *out) { struct crypto_shash *tfm = desc->tfm; struct shash_alg *shash = crypto_shash_alg(tfm); unsigned long alignmask = crypto_shash_alignmask(tfm); int err; if (IS_ENABLED(CONFIG_CRYPTO_STATS)) atomic64_inc(&shash_get_stat(shash)->hash_cnt); if ((unsigned long)out & alignmask) err = shash_final_unaligned(desc, out); else err = shash->final(desc, out); return crypto_shash_errstat(shash, err); } EXPORT_SYMBOL_GPL(crypto_shash_final); static int shash_finup_unaligned(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { return shash_update_unaligned(desc, data, len) ?: shash_final_unaligned(desc, out); } int crypto_shash_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { struct crypto_shash *tfm = desc->tfm; struct shash_alg *shash = crypto_shash_alg(tfm); unsigned long alignmask = crypto_shash_alignmask(tfm); int err; if (IS_ENABLED(CONFIG_CRYPTO_STATS)) { struct crypto_istat_hash *istat = shash_get_stat(shash); atomic64_inc(&istat->hash_cnt); atomic64_add(len, &istat->hash_tlen); } if (((unsigned long)data | (unsigned long)out) & alignmask) err = shash_finup_unaligned(desc, data, len, out); else err = shash->finup(desc, data, len, out); return crypto_shash_errstat(shash, err); } EXPORT_SYMBOL_GPL(crypto_shash_finup); static int shash_digest_unaligned(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { return crypto_shash_init(desc) ?: shash_update_unaligned(desc, data, len) ?: shash_final_unaligned(desc, out); } int crypto_shash_digest(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { struct crypto_shash *tfm = desc->tfm; struct shash_alg *shash = crypto_shash_alg(tfm); unsigned long alignmask = crypto_shash_alignmask(tfm); int err; if (IS_ENABLED(CONFIG_CRYPTO_STATS)) { struct crypto_istat_hash *istat = shash_get_stat(shash); atomic64_inc(&istat->hash_cnt); atomic64_add(len, &istat->hash_tlen); } if (crypto_shash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) err = -ENOKEY; else if (((unsigned long)data | (unsigned long)out) & alignmask) err = shash_digest_unaligned(desc, data, len, out); else err = shash->digest(desc, data, len, out); return crypto_shash_errstat(shash, err); } EXPORT_SYMBOL_GPL(crypto_shash_digest); int crypto_shash_tfm_digest(struct crypto_shash *tfm, const u8 *data, unsigned int len, u8 *out) { SHASH_DESC_ON_STACK(desc, tfm); int err; desc->tfm = tfm; err = crypto_shash_digest(desc, data, len, out); shash_desc_zero(desc); return err; } EXPORT_SYMBOL_GPL(crypto_shash_tfm_digest); static int shash_default_export(struct shash_desc *desc, void *out) { memcpy(out, shash_desc_ctx(desc), crypto_shash_descsize(desc->tfm)); return 0; } static int shash_default_import(struct shash_desc *desc, const void *in) { memcpy(shash_desc_ctx(desc), in, crypto_shash_descsize(desc->tfm)); return 0; } static int shash_async_setkey(struct crypto_ahash *tfm, const u8 *key, unsigned int keylen) { struct crypto_shash **ctx = crypto_ahash_ctx(tfm); return crypto_shash_setkey(*ctx, key, keylen); } static int shash_async_init(struct ahash_request *req) { struct crypto_shash **ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req)); struct shash_desc *desc = ahash_request_ctx(req); desc->tfm = *ctx; return crypto_shash_init(desc); } int shash_ahash_update(struct ahash_request *req, struct shash_desc *desc) { struct crypto_hash_walk walk; int nbytes; for (nbytes = crypto_hash_walk_first(req, &walk); nbytes > 0; nbytes = crypto_hash_walk_done(&walk, nbytes)) nbytes = crypto_shash_update(desc, walk.data, nbytes); return nbytes; } EXPORT_SYMBOL_GPL(shash_ahash_update); static int shash_async_update(struct ahash_request *req) { return shash_ahash_update(req, ahash_request_ctx(req)); } static int shash_async_final(struct ahash_request *req) { return crypto_shash_final(ahash_request_ctx(req), req->result); } int shash_ahash_finup(struct ahash_request *req, struct shash_desc *desc) { struct crypto_hash_walk walk; int nbytes; nbytes = crypto_hash_walk_first(req, &walk); if (!nbytes) return crypto_shash_final(desc, req->result); do { nbytes = crypto_hash_walk_last(&walk) ? crypto_shash_finup(desc, walk.data, nbytes, req->result) : crypto_shash_update(desc, walk.data, nbytes); nbytes = crypto_hash_walk_done(&walk, nbytes); } while (nbytes > 0); return nbytes; } EXPORT_SYMBOL_GPL(shash_ahash_finup); static int shash_async_finup(struct ahash_request *req) { struct crypto_shash **ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req)); struct shash_desc *desc = ahash_request_ctx(req); desc->tfm = *ctx; return shash_ahash_finup(req, desc); } int shash_ahash_digest(struct ahash_request *req, struct shash_desc *desc) { unsigned int nbytes = req->nbytes; struct scatterlist *sg; unsigned int offset; int err; if (nbytes && (sg = req->src, offset = sg->offset, nbytes <= min(sg->length, ((unsigned int)(PAGE_SIZE)) - offset))) { void *data; data = kmap_local_page(sg_page(sg)); err = crypto_shash_digest(desc, data + offset, nbytes, req->result); kunmap_local(data); } else err = crypto_shash_init(desc) ?: shash_ahash_finup(req, desc); return err; } EXPORT_SYMBOL_GPL(shash_ahash_digest); static int shash_async_digest(struct ahash_request *req) { struct crypto_shash **ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req)); struct shash_desc *desc = ahash_request_ctx(req); desc->tfm = *ctx; return shash_ahash_digest(req, desc); } static int shash_async_export(struct ahash_request *req, void *out) { return crypto_shash_export(ahash_request_ctx(req), out); } static int shash_async_import(struct ahash_request *req, const void *in) { struct crypto_shash **ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req)); struct shash_desc *desc = ahash_request_ctx(req); desc->tfm = *ctx; return crypto_shash_import(desc, in); } static void crypto_exit_shash_ops_async(struct crypto_tfm *tfm) { struct crypto_shash **ctx = crypto_tfm_ctx(tfm); crypto_free_shash(*ctx); } int crypto_init_shash_ops_async(struct crypto_tfm *tfm) { struct crypto_alg *calg = tfm->__crt_alg; struct shash_alg *alg = __crypto_shash_alg(calg); struct crypto_ahash *crt = __crypto_ahash_cast(tfm); struct crypto_shash **ctx = crypto_tfm_ctx(tfm); struct crypto_shash *shash; if (!crypto_mod_get(calg)) return -EAGAIN; shash = crypto_create_tfm(calg, &crypto_shash_type); if (IS_ERR(shash)) { crypto_mod_put(calg); return PTR_ERR(shash); } *ctx = shash; tfm->exit = crypto_exit_shash_ops_async; crt->init = shash_async_init; crt->update = shash_async_update; crt->final = shash_async_final; crt->finup = shash_async_finup; crt->digest = shash_async_digest; if (crypto_shash_alg_has_setkey(alg)) crt->setkey = shash_async_setkey; crypto_ahash_set_flags(crt, crypto_shash_get_flags(shash) & CRYPTO_TFM_NEED_KEY); crt->export = shash_async_export; crt->import = shash_async_import; crt->reqsize = sizeof(struct shash_desc) + crypto_shash_descsize(shash); return 0; } struct crypto_ahash *crypto_clone_shash_ops_async(struct crypto_ahash *nhash, struct crypto_ahash *hash) { struct crypto_shash **nctx = crypto_ahash_ctx(nhash); struct crypto_shash **ctx = crypto_ahash_ctx(hash); struct crypto_shash *shash; shash = crypto_clone_shash(*ctx); if (IS_ERR(shash)) { crypto_free_ahash(nhash); return ERR_CAST(shash); } *nctx = shash; return nhash; } static void crypto_shash_exit_tfm(struct crypto_tfm *tfm) { struct crypto_shash *hash = __crypto_shash_cast(tfm); struct shash_alg *alg = crypto_shash_alg(hash); alg->exit_tfm(hash); } static int crypto_shash_init_tfm(struct crypto_tfm *tfm) { struct crypto_shash *hash = __crypto_shash_cast(tfm); struct shash_alg *alg = crypto_shash_alg(hash); int err; hash->descsize = alg->descsize; shash_set_needkey(hash, alg); if (alg->exit_tfm) tfm->exit = crypto_shash_exit_tfm; if (!alg->init_tfm) return 0; err = alg->init_tfm(hash); if (err) return err; /* ->init_tfm() may have increased the descsize. */ if (WARN_ON_ONCE(hash->descsize > HASH_MAX_DESCSIZE)) { if (alg->exit_tfm) alg->exit_tfm(hash); return -EINVAL; } return 0; } static void crypto_shash_free_instance(struct crypto_instance *inst) { struct shash_instance *shash = shash_instance(inst); shash->free(shash); } static int __maybe_unused crypto_shash_report( struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_report_hash rhash; struct shash_alg *salg = __crypto_shash_alg(alg); memset(&rhash, 0, sizeof(rhash)); strscpy(rhash.type, "shash", sizeof(rhash.type)); rhash.blocksize = alg->cra_blocksize; rhash.digestsize = salg->digestsize; return nla_put(skb, CRYPTOCFGA_REPORT_HASH, sizeof(rhash), &rhash); } static void crypto_shash_show(struct seq_file *m, struct crypto_alg *alg) __maybe_unused; static void crypto_shash_show(struct seq_file *m, struct crypto_alg *alg) { struct shash_alg *salg = __crypto_shash_alg(alg); seq_printf(m, "type : shash\n"); seq_printf(m, "blocksize : %u\n", alg->cra_blocksize); seq_printf(m, "digestsize : %u\n", salg->digestsize); } static int __maybe_unused crypto_shash_report_stat( struct sk_buff *skb, struct crypto_alg *alg) { return crypto_hash_report_stat(skb, alg, "shash"); } static const struct crypto_type crypto_shash_type = { .extsize = crypto_alg_extsize, .init_tfm = crypto_shash_init_tfm, .free = crypto_shash_free_instance, #ifdef CONFIG_PROC_FS .show = crypto_shash_show, #endif #if IS_ENABLED(CONFIG_CRYPTO_USER) .report = crypto_shash_report, #endif #ifdef CONFIG_CRYPTO_STATS .report_stat = crypto_shash_report_stat, #endif .maskclear = ~CRYPTO_ALG_TYPE_MASK, .maskset = CRYPTO_ALG_TYPE_MASK, .type = CRYPTO_ALG_TYPE_SHASH, .tfmsize = offsetof(struct crypto_shash, base), }; int crypto_grab_shash(struct crypto_shash_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask) { spawn->base.frontend = &crypto_shash_type; return crypto_grab_spawn(&spawn->base, inst, name, type, mask); } EXPORT_SYMBOL_GPL(crypto_grab_shash); struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type, u32 mask) { return crypto_alloc_tfm(alg_name, &crypto_shash_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_alloc_shash); int crypto_has_shash(const char *alg_name, u32 type, u32 mask) { return crypto_type_has_alg(alg_name, &crypto_shash_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_has_shash); struct crypto_shash *crypto_clone_shash(struct crypto_shash *hash) { struct crypto_tfm *tfm = crypto_shash_tfm(hash); struct shash_alg *alg = crypto_shash_alg(hash); struct crypto_shash *nhash; int err; if (!crypto_shash_alg_has_setkey(alg)) { tfm = crypto_tfm_get(tfm); if (IS_ERR(tfm)) return ERR_CAST(tfm); return hash; } if (!alg->clone_tfm && (alg->init_tfm || alg->base.cra_init)) return ERR_PTR(-ENOSYS); nhash = crypto_clone_tfm(&crypto_shash_type, tfm); if (IS_ERR(nhash)) return nhash; nhash->descsize = hash->descsize; if (alg->clone_tfm) { err = alg->clone_tfm(nhash, hash); if (err) { crypto_free_shash(nhash); return ERR_PTR(err); } } return nhash; } EXPORT_SYMBOL_GPL(crypto_clone_shash); int hash_prepare_alg(struct hash_alg_common *alg) { struct crypto_istat_hash *istat = hash_get_stat(alg); struct crypto_alg *base = &alg->base; if (alg->digestsize > HASH_MAX_DIGESTSIZE) return -EINVAL; base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK; if (IS_ENABLED(CONFIG_CRYPTO_STATS)) memset(istat, 0, sizeof(*istat)); return 0; } static int shash_prepare_alg(struct shash_alg *alg) { struct crypto_alg *base = &alg->halg.base; int err; if (alg->descsize > HASH_MAX_DESCSIZE) return -EINVAL; if (base->cra_alignmask > MAX_SHASH_ALIGNMASK) return -EINVAL; if ((alg->export && !alg->import) || (alg->import && !alg->export)) return -EINVAL; err = hash_prepare_alg(&alg->halg); if (err) return err; base->cra_type = &crypto_shash_type; base->cra_flags |= CRYPTO_ALG_TYPE_SHASH; if (!alg->finup) alg->finup = shash_finup_unaligned; if (!alg->digest) alg->digest = shash_digest_unaligned; if (!alg->export) { alg->export = shash_default_export; alg->import = shash_default_import; alg->halg.statesize = alg->descsize; } if (!alg->setkey) alg->setkey = shash_no_setkey; return 0; } int crypto_register_shash(struct shash_alg *alg) { struct crypto_alg *base = &alg->base; int err; err = shash_prepare_alg(alg); if (err) return err; return crypto_register_alg(base); } EXPORT_SYMBOL_GPL(crypto_register_shash); void crypto_unregister_shash(struct shash_alg *alg) { crypto_unregister_alg(&alg->base); } EXPORT_SYMBOL_GPL(crypto_unregister_shash); int crypto_register_shashes(struct shash_alg *algs, int count) { int i, ret; for (i = 0; i < count; i++) { ret = crypto_register_shash(&algs[i]); if (ret) goto err; } return 0; err: for (--i; i >= 0; --i) crypto_unregister_shash(&algs[i]); return ret; } EXPORT_SYMBOL_GPL(crypto_register_shashes); void crypto_unregister_shashes(struct shash_alg *algs, int count) { int i; for (i = count - 1; i >= 0; --i) crypto_unregister_shash(&algs[i]); } EXPORT_SYMBOL_GPL(crypto_unregister_shashes); int shash_register_instance(struct crypto_template *tmpl, struct shash_instance *inst) { int err; if (WARN_ON(!inst->free)) return -EINVAL; err = shash_prepare_alg(&inst->alg); if (err) return err; return crypto_register_instance(tmpl, shash_crypto_instance(inst)); } EXPORT_SYMBOL_GPL(shash_register_instance); void shash_free_singlespawn_instance(struct shash_instance *inst) { crypto_drop_spawn(shash_instance_ctx(inst)); kfree(inst); } EXPORT_SYMBOL_GPL(shash_free_singlespawn_instance); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Synchronous cryptographic hash type"); |
| 60 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 | #undef TRACE_SYSTEM #define TRACE_SYSTEM netlink #if !defined(_TRACE_NETLINK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_NETLINK_H #include <linux/tracepoint.h> TRACE_EVENT(netlink_extack, TP_PROTO(const char *msg), TP_ARGS(msg), TP_STRUCT__entry( __string( msg, msg ) ), TP_fast_assign( __assign_str(msg, msg); ), TP_printk("msg=%s", __get_str(msg)) ); #endif /* _TRACE_NETLINK_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
| 138 138 138 138 138 138 138 138 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 | // SPDX-License-Identifier: GPL-2.0-only /* * AppArmor security module * * This file contains AppArmor security identifier (secid) manipulation fns * * Copyright 2009-2017 Canonical Ltd. * * AppArmor allocates a unique secid for every label used. If a label * is replaced it receives the secid of the label it is replacing. */ #include <linux/errno.h> #include <linux/err.h> #include <linux/gfp.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/xarray.h> #include "include/cred.h" #include "include/lib.h" #include "include/secid.h" #include "include/label.h" #include "include/policy_ns.h" /* * secids - do not pin labels with a refcount. They rely on the label * properly updating/freeing them */ #define AA_FIRST_SECID 2 static DEFINE_XARRAY_FLAGS(aa_secids, XA_FLAGS_LOCK_IRQ | XA_FLAGS_TRACK_FREE); int apparmor_display_secid_mode; /* * TODO: allow policy to reserve a secid range? * TODO: add secid pinning * TODO: use secid_update in label replace */ /** * aa_secid_update - update a secid mapping to a new label * @secid: secid to update * @label: label the secid will now map to */ void aa_secid_update(u32 secid, struct aa_label *label) { unsigned long flags; xa_lock_irqsave(&aa_secids, flags); __xa_store(&aa_secids, secid, label, 0); xa_unlock_irqrestore(&aa_secids, flags); } /* * see label for inverse aa_label_to_secid */ struct aa_label *aa_secid_to_label(u32 secid) { return xa_load(&aa_secids, secid); } int apparmor_secid_to_secctx(u32 secid, char **secdata, u32 *seclen) { /* TODO: cache secctx and ref count so we don't have to recreate */ struct aa_label *label = aa_secid_to_label(secid); int flags = FLAG_VIEW_SUBNS | FLAG_HIDDEN_UNCONFINED | FLAG_ABS_ROOT; int len; AA_BUG(!seclen); if (!label) return -EINVAL; if (apparmor_display_secid_mode) flags |= FLAG_SHOW_MODE; if (secdata) len = aa_label_asxprint(secdata, root_ns, label, flags, GFP_ATOMIC); else len = aa_label_snxprint(NULL, 0, root_ns, label, flags); if (len < 0) return -ENOMEM; *seclen = len; return 0; } int apparmor_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid) { struct aa_label *label; label = aa_label_strn_parse(&root_ns->unconfined->label, secdata, seclen, GFP_KERNEL, false, false); if (IS_ERR(label)) return PTR_ERR(label); *secid = label->secid; return 0; } void apparmor_release_secctx(char *secdata, u32 seclen) { kfree(secdata); } /** * aa_alloc_secid - allocate a new secid for a profile * @label: the label to allocate a secid for * @gfp: memory allocation flags * * Returns: 0 with @label->secid initialized * <0 returns error with @label->secid set to AA_SECID_INVALID */ int aa_alloc_secid(struct aa_label *label, gfp_t gfp) { unsigned long flags; int ret; xa_lock_irqsave(&aa_secids, flags); ret = __xa_alloc(&aa_secids, &label->secid, label, XA_LIMIT(AA_FIRST_SECID, INT_MAX), gfp); xa_unlock_irqrestore(&aa_secids, flags); if (ret < 0) { label->secid = AA_SECID_INVALID; return ret; } return 0; } /** * aa_free_secid - free a secid * @secid: secid to free */ void aa_free_secid(u32 secid) { unsigned long flags; xa_lock_irqsave(&aa_secids, flags); __xa_erase(&aa_secids, secid); xa_unlock_irqrestore(&aa_secids, flags); } |
| 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 | // SPDX-License-Identifier: GPL-2.0-only /* * LCD Lowlevel Control Abstraction * * Copyright (C) 2003,2004 Hewlett-Packard Company * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/init.h> #include <linux/device.h> #include <linux/lcd.h> #include <linux/notifier.h> #include <linux/ctype.h> #include <linux/err.h> #include <linux/fb.h> #include <linux/slab.h> #if defined(CONFIG_FB) || (defined(CONFIG_FB_MODULE) && \ defined(CONFIG_LCD_CLASS_DEVICE_MODULE)) /* This callback gets called when something important happens inside a * framebuffer driver. We're looking if that important event is blanking, * and if it is, we're switching lcd power as well ... */ static int fb_notifier_callback(struct notifier_block *self, unsigned long event, void *data) { struct lcd_device *ld; struct fb_event *evdata = data; ld = container_of(self, struct lcd_device, fb_notif); if (!ld->ops) return 0; mutex_lock(&ld->ops_lock); if (!ld->ops->check_fb || ld->ops->check_fb(ld, evdata->info)) { if (event == FB_EVENT_BLANK) { if (ld->ops->set_power) ld->ops->set_power(ld, *(int *)evdata->data); } else { if (ld->ops->set_mode) ld->ops->set_mode(ld, evdata->data); } } mutex_unlock(&ld->ops_lock); return 0; } static int lcd_register_fb(struct lcd_device *ld) { memset(&ld->fb_notif, 0, sizeof(ld->fb_notif)); ld->fb_notif.notifier_call = fb_notifier_callback; return fb_register_client(&ld->fb_notif); } static void lcd_unregister_fb(struct lcd_device *ld) { fb_unregister_client(&ld->fb_notif); } #else static int lcd_register_fb(struct lcd_device *ld) { return 0; } static inline void lcd_unregister_fb(struct lcd_device *ld) { } #endif /* CONFIG_FB */ static ssize_t lcd_power_show(struct device *dev, struct device_attribute *attr, char *buf) { int rc; struct lcd_device *ld = to_lcd_device(dev); mutex_lock(&ld->ops_lock); if (ld->ops && ld->ops->get_power) rc = sprintf(buf, "%d\n", ld->ops->get_power(ld)); else rc = -ENXIO; mutex_unlock(&ld->ops_lock); return rc; } static ssize_t lcd_power_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int rc; struct lcd_device *ld = to_lcd_device(dev); unsigned long power; rc = kstrtoul(buf, 0, &power); if (rc) return rc; rc = -ENXIO; mutex_lock(&ld->ops_lock); if (ld->ops && ld->ops->set_power) { pr_debug("set power to %lu\n", power); ld->ops->set_power(ld, power); rc = count; } mutex_unlock(&ld->ops_lock); return rc; } static DEVICE_ATTR_RW(lcd_power); static ssize_t contrast_show(struct device *dev, struct device_attribute *attr, char *buf) { int rc = -ENXIO; struct lcd_device *ld = to_lcd_device(dev); mutex_lock(&ld->ops_lock); if (ld->ops && ld->ops->get_contrast) rc = sprintf(buf, "%d\n", ld->ops->get_contrast(ld)); mutex_unlock(&ld->ops_lock); return rc; } static ssize_t contrast_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int rc; struct lcd_device *ld = to_lcd_device(dev); unsigned long contrast; rc = kstrtoul(buf, 0, &contrast); if (rc) return rc; rc = -ENXIO; mutex_lock(&ld->ops_lock); if (ld->ops && ld->ops->set_contrast) { pr_debug("set contrast to %lu\n", contrast); ld->ops->set_contrast(ld, contrast); rc = count; } mutex_unlock(&ld->ops_lock); return rc; } static DEVICE_ATTR_RW(contrast); static ssize_t max_contrast_show(struct device *dev, struct device_attribute *attr, char *buf) { struct lcd_device *ld = to_lcd_device(dev); return sprintf(buf, "%d\n", ld->props.max_contrast); } static DEVICE_ATTR_RO(max_contrast); static struct class *lcd_class; static void lcd_device_release(struct device *dev) { struct lcd_device *ld = to_lcd_device(dev); kfree(ld); } static struct attribute *lcd_device_attrs[] = { &dev_attr_lcd_power.attr, &dev_attr_contrast.attr, &dev_attr_max_contrast.attr, NULL, }; ATTRIBUTE_GROUPS(lcd_device); /** * lcd_device_register - register a new object of lcd_device class. * @name: the name of the new object(must be the same as the name of the * respective framebuffer device). * @parent: pointer to the parent's struct device . * @devdata: an optional pointer to be stored in the device. The * methods may retrieve it by using lcd_get_data(ld). * @ops: the lcd operations structure. * * Creates and registers a new lcd device. Returns either an ERR_PTR() * or a pointer to the newly allocated device. */ struct lcd_device *lcd_device_register(const char *name, struct device *parent, void *devdata, struct lcd_ops *ops) { struct lcd_device *new_ld; int rc; pr_debug("lcd_device_register: name=%s\n", name); new_ld = kzalloc(sizeof(struct lcd_device), GFP_KERNEL); if (!new_ld) return ERR_PTR(-ENOMEM); mutex_init(&new_ld->ops_lock); mutex_init(&new_ld->update_lock); new_ld->dev.class = lcd_class; new_ld->dev.parent = parent; new_ld->dev.release = lcd_device_release; dev_set_name(&new_ld->dev, "%s", name); dev_set_drvdata(&new_ld->dev, devdata); new_ld->ops = ops; rc = device_register(&new_ld->dev); if (rc) { put_device(&new_ld->dev); return ERR_PTR(rc); } rc = lcd_register_fb(new_ld); if (rc) { device_unregister(&new_ld->dev); return ERR_PTR(rc); } return new_ld; } EXPORT_SYMBOL(lcd_device_register); /** * lcd_device_unregister - unregisters a object of lcd_device class. * @ld: the lcd device object to be unregistered and freed. * * Unregisters a previously registered via lcd_device_register object. */ void lcd_device_unregister(struct lcd_device *ld) { if (!ld) return; mutex_lock(&ld->ops_lock); ld->ops = NULL; mutex_unlock(&ld->ops_lock); lcd_unregister_fb(ld); device_unregister(&ld->dev); } EXPORT_SYMBOL(lcd_device_unregister); static void devm_lcd_device_release(struct device *dev, void *res) { struct lcd_device *lcd = *(struct lcd_device **)res; lcd_device_unregister(lcd); } static int devm_lcd_device_match(struct device *dev, void *res, void *data) { struct lcd_device **r = res; return *r == data; } /** * devm_lcd_device_register - resource managed lcd_device_register() * @dev: the device to register * @name: the name of the device * @parent: a pointer to the parent device * @devdata: an optional pointer to be stored for private driver use * @ops: the lcd operations structure * * @return a struct lcd on success, or an ERR_PTR on error * * Managed lcd_device_register(). The lcd_device returned from this function * are automatically freed on driver detach. See lcd_device_register() * for more information. */ struct lcd_device *devm_lcd_device_register(struct device *dev, const char *name, struct device *parent, void *devdata, struct lcd_ops *ops) { struct lcd_device **ptr, *lcd; ptr = devres_alloc(devm_lcd_device_release, sizeof(*ptr), GFP_KERNEL); if (!ptr) return ERR_PTR(-ENOMEM); lcd = lcd_device_register(name, parent, devdata, ops); if (!IS_ERR(lcd)) { *ptr = lcd; devres_add(dev, ptr); } else { devres_free(ptr); } return lcd; } EXPORT_SYMBOL(devm_lcd_device_register); /** * devm_lcd_device_unregister - resource managed lcd_device_unregister() * @dev: the device to unregister * @ld: the lcd device to unregister * * Deallocated a lcd allocated with devm_lcd_device_register(). Normally * this function will not need to be called and the resource management * code will ensure that the resource is freed. */ void devm_lcd_device_unregister(struct device *dev, struct lcd_device *ld) { int rc; rc = devres_release(dev, devm_lcd_device_release, devm_lcd_device_match, ld); WARN_ON(rc); } EXPORT_SYMBOL(devm_lcd_device_unregister); static void __exit lcd_class_exit(void) { class_destroy(lcd_class); } static int __init lcd_class_init(void) { lcd_class = class_create("lcd"); if (IS_ERR(lcd_class)) { pr_warn("Unable to create backlight class; errno = %ld\n", PTR_ERR(lcd_class)); return PTR_ERR(lcd_class); } lcd_class->dev_groups = lcd_device_groups; return 0; } /* * if this is compiled into the kernel, we need to ensure that the * class is registered before users of the class try to register lcd's */ postcore_initcall(lcd_class_init); module_exit(lcd_class_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jamey Hicks <jamey.hicks@hp.com>, Andrew Zabolotny <zap@homelink.ru>"); MODULE_DESCRIPTION("LCD Lowlevel Control Abstraction"); |
| 2 1 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 | // SPDX-License-Identifier: GPL-2.0 /* * Prolific PL2303 USB to serial adaptor driver * * Copyright (C) 2001-2007 Greg Kroah-Hartman (greg@kroah.com) * Copyright (C) 2003 IBM Corp. * * Original driver for 2.2.x by anonymous * * See Documentation/usb/usb-serial.rst for more information on using this * driver */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/serial.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/spinlock.h> #include <linux/uaccess.h> #include <linux/usb.h> #include <linux/usb/serial.h> #include <asm/unaligned.h> #include "pl2303.h" #define PL2303_QUIRK_UART_STATE_IDX0 BIT(0) #define PL2303_QUIRK_LEGACY BIT(1) #define PL2303_QUIRK_ENDPOINT_HACK BIT(2) static const struct usb_device_id id_table[] = { { USB_DEVICE(PL2303_VENDOR_ID, PL2303_PRODUCT_ID), .driver_info = PL2303_QUIRK_ENDPOINT_HACK }, { USB_DEVICE(PL2303_VENDOR_ID, PL2303_PRODUCT_ID_RSAQ2) }, { USB_DEVICE(PL2303_VENDOR_ID, PL2303_PRODUCT_ID_DCU11) }, { USB_DEVICE(PL2303_VENDOR_ID, PL2303_PRODUCT_ID_RSAQ3) }, { USB_DEVICE(PL2303_VENDOR_ID, PL2303_PRODUCT_ID_CHILITAG) }, { USB_DEVICE(PL2303_VENDOR_ID, PL2303_PRODUCT_ID_PHAROS) }, { USB_DEVICE(PL2303_VENDOR_ID, PL2303_PRODUCT_ID_ALDIGA) }, { USB_DEVICE(PL2303_VENDOR_ID, PL2303_PRODUCT_ID_MMX) }, { USB_DEVICE(PL2303_VENDOR_ID, PL2303_PRODUCT_ID_GPRS) }, { USB_DEVICE(PL2303_VENDOR_ID, PL2303_PRODUCT_ID_HCR331) }, { USB_DEVICE(PL2303_VENDOR_ID, PL2303_PRODUCT_ID_MOTOROLA) }, { USB_DEVICE(PL2303_VENDOR_ID, PL2303_PRODUCT_ID_ZTEK) }, { USB_DEVICE(PL2303_VENDOR_ID, PL2303_PRODUCT_ID_TB) }, { USB_DEVICE(PL2303_VENDOR_ID, PL2303_PRODUCT_ID_GC) }, { USB_DEVICE(PL2303_VENDOR_ID, PL2303_PRODUCT_ID_GB) }, { USB_DEVICE(PL2303_VENDOR_ID, PL2303_PRODUCT_ID_GT) }, { USB_DEVICE(PL2303_VENDOR_ID, PL2303_PRODUCT_ID_GL) }, { USB_DEVICE(PL2303_VENDOR_ID, PL2303_PRODUCT_ID_GE) }, { USB_DEVICE(PL2303_VENDOR_ID, PL2303_PRODUCT_ID_GS) }, { USB_DEVICE(IODATA_VENDOR_ID, IODATA_PRODUCT_ID) }, { USB_DEVICE(IODATA_VENDOR_ID, IODATA_PRODUCT_ID_RSAQ5) }, { USB_DEVICE(ATEN_VENDOR_ID, ATEN_PRODUCT_ID), .driver_info = PL2303_QUIRK_ENDPOINT_HACK }, { USB_DEVICE(ATEN_VENDOR_ID, ATEN_PRODUCT_UC485), .driver_info = PL2303_QUIRK_ENDPOINT_HACK }, { USB_DEVICE(ATEN_VENDOR_ID, ATEN_PRODUCT_UC232B), .driver_info = PL2303_QUIRK_ENDPOINT_HACK }, { USB_DEVICE(ATEN_VENDOR_ID, ATEN_PRODUCT_ID2) }, { USB_DEVICE(ATEN_VENDOR_ID2, ATEN_PRODUCT_ID) }, { USB_DEVICE(ELCOM_VENDOR_ID, ELCOM_PRODUCT_ID) }, { USB_DEVICE(ELCOM_VENDOR_ID, ELCOM_PRODUCT_ID_UCSGT) }, { USB_DEVICE(ITEGNO_VENDOR_ID, ITEGNO_PRODUCT_ID) }, { USB_DEVICE(ITEGNO_VENDOR_ID, ITEGNO_PRODUCT_ID_2080) }, { USB_DEVICE(MA620_VENDOR_ID, MA620_PRODUCT_ID) }, { USB_DEVICE(RATOC_VENDOR_ID, RATOC_PRODUCT_ID) }, { USB_DEVICE(TRIPP_VENDOR_ID, TRIPP_PRODUCT_ID) }, { USB_DEVICE(RADIOSHACK_VENDOR_ID, RADIOSHACK_PRODUCT_ID) }, { USB_DEVICE(DCU10_VENDOR_ID, DCU10_PRODUCT_ID) }, { USB_DEVICE(SITECOM_VENDOR_ID, SITECOM_PRODUCT_ID) }, { USB_DEVICE(ALCATEL_VENDOR_ID, ALCATEL_PRODUCT_ID) }, { USB_DEVICE(SIEMENS_VENDOR_ID, SIEMENS_PRODUCT_ID_SX1), .driver_info = PL2303_QUIRK_UART_STATE_IDX0 }, { USB_DEVICE(SIEMENS_VENDOR_ID, SIEMENS_PRODUCT_ID_X65), .driver_info = PL2303_QUIRK_UART_STATE_IDX0 }, { USB_DEVICE(SIEMENS_VENDOR_ID, SIEMENS_PRODUCT_ID_X75), .driver_info = PL2303_QUIRK_UART_STATE_IDX0 }, { USB_DEVICE(SIEMENS_VENDOR_ID, SIEMENS_PRODUCT_ID_EF81), .driver_info = PL2303_QUIRK_ENDPOINT_HACK }, { USB_DEVICE(BENQ_VENDOR_ID, BENQ_PRODUCT_ID_S81) }, /* Benq/Siemens S81 */ { USB_DEVICE(SYNTECH_VENDOR_ID, SYNTECH_PRODUCT_ID) }, { USB_DEVICE(NOKIA_CA42_VENDOR_ID, NOKIA_CA42_PRODUCT_ID) }, { USB_DEVICE(CA_42_CA42_VENDOR_ID, CA_42_CA42_PRODUCT_ID) }, { USB_DEVICE(SAGEM_VENDOR_ID, SAGEM_PRODUCT_ID) }, { USB_DEVICE(LEADTEK_VENDOR_ID, LEADTEK_9531_PRODUCT_ID) }, { USB_DEVICE(SPEEDDRAGON_VENDOR_ID, SPEEDDRAGON_PRODUCT_ID) }, { USB_DEVICE(DATAPILOT_U2_VENDOR_ID, DATAPILOT_U2_PRODUCT_ID) }, { USB_DEVICE(BELKIN_VENDOR_ID, BELKIN_PRODUCT_ID) }, { USB_DEVICE(ALCOR_VENDOR_ID, ALCOR_PRODUCT_ID), .driver_info = PL2303_QUIRK_ENDPOINT_HACK }, { USB_DEVICE(WS002IN_VENDOR_ID, WS002IN_PRODUCT_ID) }, { USB_DEVICE(COREGA_VENDOR_ID, COREGA_PRODUCT_ID) }, { USB_DEVICE(YCCABLE_VENDOR_ID, YCCABLE_PRODUCT_ID) }, { USB_DEVICE(SUPERIAL_VENDOR_ID, SUPERIAL_PRODUCT_ID) }, { USB_DEVICE(HP_VENDOR_ID, HP_LD220_PRODUCT_ID) }, { USB_DEVICE(HP_VENDOR_ID, HP_LD220TA_PRODUCT_ID) }, { USB_DEVICE(HP_VENDOR_ID, HP_LD381_PRODUCT_ID) }, { USB_DEVICE(HP_VENDOR_ID, HP_LD381GC_PRODUCT_ID) }, { USB_DEVICE(HP_VENDOR_ID, HP_LD960_PRODUCT_ID) }, { USB_DEVICE(HP_VENDOR_ID, HP_LD960TA_PRODUCT_ID) }, { USB_DEVICE(HP_VENDOR_ID, HP_LCM220_PRODUCT_ID) }, { USB_DEVICE(HP_VENDOR_ID, HP_LCM960_PRODUCT_ID) }, { USB_DEVICE(HP_VENDOR_ID, HP_LM920_PRODUCT_ID) }, { USB_DEVICE(HP_VENDOR_ID, HP_LM930_PRODUCT_ID) }, { USB_DEVICE(HP_VENDOR_ID, HP_LM940_PRODUCT_ID) }, { USB_DEVICE(HP_VENDOR_ID, HP_TD620_PRODUCT_ID) }, { USB_DEVICE(CRESSI_VENDOR_ID, CRESSI_EDY_PRODUCT_ID) }, { USB_DEVICE(ZEAGLE_VENDOR_ID, ZEAGLE_N2ITION3_PRODUCT_ID) }, { USB_DEVICE(SONY_VENDOR_ID, SONY_QN3USB_PRODUCT_ID) }, { USB_DEVICE(SANWA_VENDOR_ID, SANWA_PRODUCT_ID) }, { USB_DEVICE(ADLINK_VENDOR_ID, ADLINK_ND6530_PRODUCT_ID) }, { USB_DEVICE(ADLINK_VENDOR_ID, ADLINK_ND6530GC_PRODUCT_ID) }, { USB_DEVICE(SMART_VENDOR_ID, SMART_PRODUCT_ID) }, { USB_DEVICE(AT_VENDOR_ID, AT_VTKIT3_PRODUCT_ID) }, { USB_DEVICE(IBM_VENDOR_ID, IBM_PRODUCT_ID) }, { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, id_table); #define SET_LINE_REQUEST_TYPE 0x21 #define SET_LINE_REQUEST 0x20 #define SET_CONTROL_REQUEST_TYPE 0x21 #define SET_CONTROL_REQUEST 0x22 #define CONTROL_DTR 0x01 #define CONTROL_RTS 0x02 #define BREAK_REQUEST_TYPE 0x21 #define BREAK_REQUEST 0x23 #define BREAK_ON 0xffff #define BREAK_OFF 0x0000 #define GET_LINE_REQUEST_TYPE 0xa1 #define GET_LINE_REQUEST 0x21 #define VENDOR_WRITE_REQUEST_TYPE 0x40 #define VENDOR_WRITE_REQUEST 0x01 #define VENDOR_WRITE_NREQUEST 0x80 #define VENDOR_READ_REQUEST_TYPE 0xc0 #define VENDOR_READ_REQUEST 0x01 #define VENDOR_READ_NREQUEST 0x81 #define UART_STATE_INDEX 8 #define UART_STATE_MSR_MASK 0x8b #define UART_STATE_TRANSIENT_MASK 0x74 #define UART_DCD 0x01 #define UART_DSR 0x02 #define UART_BREAK_ERROR 0x04 #define UART_RING 0x08 #define UART_FRAME_ERROR 0x10 #define UART_PARITY_ERROR 0x20 #define UART_OVERRUN_ERROR 0x40 #define UART_CTS 0x80 #define PL2303_FLOWCTRL_MASK 0xf0 #define PL2303_READ_TYPE_HX_STATUS 0x8080 #define PL2303_HXN_RESET_REG 0x07 #define PL2303_HXN_RESET_UPSTREAM_PIPE 0x02 #define PL2303_HXN_RESET_DOWNSTREAM_PIPE 0x01 #define PL2303_HXN_FLOWCTRL_REG 0x0a #define PL2303_HXN_FLOWCTRL_MASK 0x1c #define PL2303_HXN_FLOWCTRL_NONE 0x1c #define PL2303_HXN_FLOWCTRL_RTS_CTS 0x18 #define PL2303_HXN_FLOWCTRL_XON_XOFF 0x0c static int pl2303_set_break(struct usb_serial_port *port, bool enable); enum pl2303_type { TYPE_H, TYPE_HX, TYPE_TA, TYPE_TB, TYPE_HXD, TYPE_HXN, TYPE_COUNT }; struct pl2303_type_data { const char *name; speed_t max_baud_rate; unsigned long quirks; unsigned int no_autoxonxoff:1; unsigned int no_divisors:1; unsigned int alt_divisors:1; }; struct pl2303_serial_private { const struct pl2303_type_data *type; unsigned long quirks; }; struct pl2303_private { spinlock_t lock; u8 line_control; u8 line_status; u8 line_settings[7]; }; static const struct pl2303_type_data pl2303_type_data[TYPE_COUNT] = { [TYPE_H] = { .name = "H", .max_baud_rate = 1228800, .quirks = PL2303_QUIRK_LEGACY, .no_autoxonxoff = true, }, [TYPE_HX] = { .name = "HX", .max_baud_rate = 6000000, }, [TYPE_TA] = { .name = "TA", .max_baud_rate = 6000000, .alt_divisors = true, }, [TYPE_TB] = { .name = "TB", .max_baud_rate = 12000000, .alt_divisors = true, }, [TYPE_HXD] = { .name = "HXD", .max_baud_rate = 12000000, }, [TYPE_HXN] = { .name = "G", .max_baud_rate = 12000000, .no_divisors = true, }, }; static int pl2303_vendor_read(struct usb_serial *serial, u16 value, unsigned char buf[1]) { struct pl2303_serial_private *spriv = usb_get_serial_data(serial); struct device *dev = &serial->interface->dev; u8 request; int res; if (spriv->type == &pl2303_type_data[TYPE_HXN]) request = VENDOR_READ_NREQUEST; else request = VENDOR_READ_REQUEST; res = usb_control_msg(serial->dev, usb_rcvctrlpipe(serial->dev, 0), request, VENDOR_READ_REQUEST_TYPE, value, 0, buf, 1, 100); if (res != 1) { dev_err(dev, "%s - failed to read [%04x]: %d\n", __func__, value, res); if (res >= 0) res = -EIO; return res; } dev_dbg(dev, "%s - [%04x] = %02x\n", __func__, value, buf[0]); return 0; } static int pl2303_vendor_write(struct usb_serial *serial, u16 value, u16 index) { struct pl2303_serial_private *spriv = usb_get_serial_data(serial); struct device *dev = &serial->interface->dev; u8 request; int res; dev_dbg(dev, "%s - [%04x] = %02x\n", __func__, value, index); if (spriv->type == &pl2303_type_data[TYPE_HXN]) request = VENDOR_WRITE_NREQUEST; else request = VENDOR_WRITE_REQUEST; res = usb_control_msg(serial->dev, usb_sndctrlpipe(serial->dev, 0), request, VENDOR_WRITE_REQUEST_TYPE, value, index, NULL, 0, 100); if (res) { dev_err(dev, "%s - failed to write [%04x]: %d\n", __func__, value, res); return res; } return 0; } static int pl2303_update_reg(struct usb_serial *serial, u8 reg, u8 mask, u8 val) { struct pl2303_serial_private *spriv = usb_get_serial_data(serial); int ret = 0; u8 *buf; buf = kmalloc(1, GFP_KERNEL); if (!buf) return -ENOMEM; if (spriv->type == &pl2303_type_data[TYPE_HXN]) ret = pl2303_vendor_read(serial, reg, buf); else ret = pl2303_vendor_read(serial, reg | 0x80, buf); if (ret) goto out_free; *buf &= ~mask; *buf |= val & mask; ret = pl2303_vendor_write(serial, reg, *buf); out_free: kfree(buf); return ret; } static int pl2303_probe(struct usb_serial *serial, const struct usb_device_id *id) { usb_set_serial_data(serial, (void *)id->driver_info); return 0; } /* * Use interrupt endpoint from first interface if available. * * This is needed due to the looney way its endpoints are set up. */ static int pl2303_endpoint_hack(struct usb_serial *serial, struct usb_serial_endpoints *epds) { struct usb_interface *interface = serial->interface; struct usb_device *dev = serial->dev; struct device *ddev = &interface->dev; struct usb_host_interface *iface_desc; struct usb_endpoint_descriptor *endpoint; unsigned int i; if (interface == dev->actconfig->interface[0]) return 0; /* check out the endpoints of the other interface */ iface_desc = dev->actconfig->interface[0]->cur_altsetting; for (i = 0; i < iface_desc->desc.bNumEndpoints; ++i) { endpoint = &iface_desc->endpoint[i].desc; if (!usb_endpoint_is_int_in(endpoint)) continue; dev_dbg(ddev, "found interrupt in on separate interface\n"); if (epds->num_interrupt_in < ARRAY_SIZE(epds->interrupt_in)) epds->interrupt_in[epds->num_interrupt_in++] = endpoint; } return 0; } static int pl2303_calc_num_ports(struct usb_serial *serial, struct usb_serial_endpoints *epds) { unsigned long quirks = (unsigned long)usb_get_serial_data(serial); struct device *dev = &serial->interface->dev; int ret; if (quirks & PL2303_QUIRK_ENDPOINT_HACK) { ret = pl2303_endpoint_hack(serial, epds); if (ret) return ret; } if (epds->num_interrupt_in < 1) { dev_err(dev, "required interrupt-in endpoint missing\n"); return -ENODEV; } return 1; } static bool pl2303_supports_hx_status(struct usb_serial *serial) { int ret; u8 buf; ret = usb_control_msg_recv(serial->dev, 0, VENDOR_READ_REQUEST, VENDOR_READ_REQUEST_TYPE, PL2303_READ_TYPE_HX_STATUS, 0, &buf, 1, 100, GFP_KERNEL); return ret == 0; } static int pl2303_detect_type(struct usb_serial *serial) { struct usb_device_descriptor *desc = &serial->dev->descriptor; u16 bcdDevice, bcdUSB; /* * Legacy PL2303H, variants 0 and 1 (difference unknown). */ if (desc->bDeviceClass == 0x02) return TYPE_H; /* variant 0 */ if (desc->bMaxPacketSize0 != 0x40) { if (desc->bDeviceClass == 0x00 || desc->bDeviceClass == 0xff) return TYPE_H; /* variant 1 */ return TYPE_H; /* variant 0 */ } bcdDevice = le16_to_cpu(desc->bcdDevice); bcdUSB = le16_to_cpu(desc->bcdUSB); switch (bcdUSB) { case 0x101: /* USB 1.0.1? Let's assume they meant 1.1... */ fallthrough; case 0x110: switch (bcdDevice) { case 0x300: return TYPE_HX; case 0x400: return TYPE_HXD; default: return TYPE_HX; } break; case 0x200: switch (bcdDevice) { case 0x100: /* GC */ case 0x105: return TYPE_HXN; case 0x300: /* GT / TA */ if (pl2303_supports_hx_status(serial)) return TYPE_TA; fallthrough; case 0x305: case 0x400: /* GL */ case 0x405: return TYPE_HXN; case 0x500: /* GE / TB */ if (pl2303_supports_hx_status(serial)) return TYPE_TB; fallthrough; case 0x505: case 0x600: /* GS */ case 0x605: case 0x700: /* GR */ case 0x705: return TYPE_HXN; } break; } dev_err(&serial->interface->dev, "unknown device type, please report to linux-usb@vger.kernel.org\n"); return -ENODEV; } static int pl2303_startup(struct usb_serial *serial) { struct pl2303_serial_private *spriv; enum pl2303_type type; unsigned char *buf; int ret; ret = pl2303_detect_type(serial); if (ret < 0) return ret; type = ret; dev_dbg(&serial->interface->dev, "device type: %s\n", pl2303_type_data[type].name); spriv = kzalloc(sizeof(*spriv), GFP_KERNEL); if (!spriv) return -ENOMEM; spriv->type = &pl2303_type_data[type]; spriv->quirks = (unsigned long)usb_get_serial_data(serial); spriv->quirks |= spriv->type->quirks; usb_set_serial_data(serial, spriv); if (type != TYPE_HXN) { buf = kmalloc(1, GFP_KERNEL); if (!buf) { kfree(spriv); return -ENOMEM; } pl2303_vendor_read(serial, 0x8484, buf); pl2303_vendor_write(serial, 0x0404, 0); pl2303_vendor_read(serial, 0x8484, buf); pl2303_vendor_read(serial, 0x8383, buf); pl2303_vendor_read(serial, 0x8484, buf); pl2303_vendor_write(serial, 0x0404, 1); pl2303_vendor_read(serial, 0x8484, buf); pl2303_vendor_read(serial, 0x8383, buf); pl2303_vendor_write(serial, 0, 1); pl2303_vendor_write(serial, 1, 0); if (spriv->quirks & PL2303_QUIRK_LEGACY) pl2303_vendor_write(serial, 2, 0x24); else pl2303_vendor_write(serial, 2, 0x44); kfree(buf); } return 0; } static void pl2303_release(struct usb_serial *serial) { struct pl2303_serial_private *spriv = usb_get_serial_data(serial); kfree(spriv); } static int pl2303_port_probe(struct usb_serial_port *port) { struct pl2303_private *priv; priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; spin_lock_init(&priv->lock); usb_set_serial_port_data(port, priv); port->port.drain_delay = 256; return 0; } static void pl2303_port_remove(struct usb_serial_port *port) { struct pl2303_private *priv = usb_get_serial_port_data(port); kfree(priv); } static int pl2303_set_control_lines(struct usb_serial_port *port, u8 value) { struct usb_device *dev = port->serial->dev; int retval; dev_dbg(&port->dev, "%s - %02x\n", __func__, value); retval = usb_control_msg(dev, usb_sndctrlpipe(dev, 0), SET_CONTROL_REQUEST, SET_CONTROL_REQUEST_TYPE, value, 0, NULL, 0, 100); if (retval) dev_err(&port->dev, "%s - failed: %d\n", __func__, retval); return retval; } /* * Returns the nearest supported baud rate that can be set directly without * using divisors. */ static speed_t pl2303_get_supported_baud_rate(speed_t baud) { static const speed_t baud_sup[] = { 75, 150, 300, 600, 1200, 1800, 2400, 3600, 4800, 7200, 9600, 14400, 19200, 28800, 38400, 57600, 115200, 230400, 460800, 614400, 921600, 1228800, 2457600, 3000000, 6000000 }; unsigned i; for (i = 0; i < ARRAY_SIZE(baud_sup); ++i) { if (baud_sup[i] > baud) break; } if (i == ARRAY_SIZE(baud_sup)) baud = baud_sup[i - 1]; else if (i > 0 && (baud_sup[i] - baud) > (baud - baud_sup[i - 1])) baud = baud_sup[i - 1]; else baud = baud_sup[i]; return baud; } /* * NOTE: If unsupported baud rates are set directly, the PL2303 seems to * use 9600 baud. */ static speed_t pl2303_encode_baud_rate_direct(unsigned char buf[4], speed_t baud) { put_unaligned_le32(baud, buf); return baud; } static speed_t pl2303_encode_baud_rate_divisor(unsigned char buf[4], speed_t baud) { unsigned int baseline, mantissa, exponent; /* * Apparently the formula is: * baudrate = 12M * 32 / (mantissa * 4^exponent) * where * mantissa = buf[8:0] * exponent = buf[11:9] */ baseline = 12000000 * 32; mantissa = baseline / baud; if (mantissa == 0) mantissa = 1; /* Avoid dividing by zero if baud > 32*12M. */ exponent = 0; while (mantissa >= 512) { if (exponent < 7) { mantissa >>= 2; /* divide by 4 */ exponent++; } else { /* Exponent is maxed. Trim mantissa and leave. */ mantissa = 511; break; } } buf[3] = 0x80; buf[2] = 0; buf[1] = exponent << 1 | mantissa >> 8; buf[0] = mantissa & 0xff; /* Calculate and return the exact baud rate. */ baud = (baseline / mantissa) >> (exponent << 1); return baud; } static speed_t pl2303_encode_baud_rate_divisor_alt(unsigned char buf[4], speed_t baud) { unsigned int baseline, mantissa, exponent; /* * Apparently, for the TA version the formula is: * baudrate = 12M * 32 / (mantissa * 2^exponent) * where * mantissa = buf[10:0] * exponent = buf[15:13 16] */ baseline = 12000000 * 32; mantissa = baseline / baud; if (mantissa == 0) mantissa = 1; /* Avoid dividing by zero if baud > 32*12M. */ exponent = 0; while (mantissa >= 2048) { if (exponent < 15) { mantissa >>= 1; /* divide by 2 */ exponent++; } else { /* Exponent is maxed. Trim mantissa and leave. */ mantissa = 2047; break; } } buf[3] = 0x80; buf[2] = exponent & 0x01; buf[1] = (exponent & ~0x01) << 4 | mantissa >> 8; buf[0] = mantissa & 0xff; /* Calculate and return the exact baud rate. */ baud = (baseline / mantissa) >> exponent; return baud; } static void pl2303_encode_baud_rate(struct tty_struct *tty, struct usb_serial_port *port, u8 buf[4]) { struct usb_serial *serial = port->serial; struct pl2303_serial_private *spriv = usb_get_serial_data(serial); speed_t baud_sup; speed_t baud; baud = tty_get_baud_rate(tty); dev_dbg(&port->dev, "baud requested = %u\n", baud); if (!baud) return; if (spriv->type->max_baud_rate) baud = min_t(speed_t, baud, spriv->type->max_baud_rate); /* * Use direct method for supported baud rates, otherwise use divisors. * Newer chip types do not support divisor encoding. */ if (spriv->type->no_divisors) baud_sup = baud; else baud_sup = pl2303_get_supported_baud_rate(baud); if (baud == baud_sup) baud = pl2303_encode_baud_rate_direct(buf, baud); else if (spriv->type->alt_divisors) baud = pl2303_encode_baud_rate_divisor_alt(buf, baud); else baud = pl2303_encode_baud_rate_divisor(buf, baud); /* Save resulting baud rate */ tty_encode_baud_rate(tty, baud, baud); dev_dbg(&port->dev, "baud set = %u\n", baud); } static int pl2303_get_line_request(struct usb_serial_port *port, unsigned char buf[7]) { struct usb_device *udev = port->serial->dev; int ret; ret = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), GET_LINE_REQUEST, GET_LINE_REQUEST_TYPE, 0, 0, buf, 7, 100); if (ret != 7) { dev_err(&port->dev, "%s - failed: %d\n", __func__, ret); if (ret >= 0) ret = -EIO; return ret; } dev_dbg(&port->dev, "%s - %7ph\n", __func__, buf); return 0; } static int pl2303_set_line_request(struct usb_serial_port *port, unsigned char buf[7]) { struct usb_device *udev = port->serial->dev; int ret; ret = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), SET_LINE_REQUEST, SET_LINE_REQUEST_TYPE, 0, 0, buf, 7, 100); if (ret < 0) { dev_err(&port->dev, "%s - failed: %d\n", __func__, ret); return ret; } dev_dbg(&port->dev, "%s - %7ph\n", __func__, buf); return 0; } static bool pl2303_termios_change(const struct ktermios *a, const struct ktermios *b) { bool ixon_change; ixon_change = ((a->c_iflag ^ b->c_iflag) & (IXON | IXANY)) || a->c_cc[VSTART] != b->c_cc[VSTART] || a->c_cc[VSTOP] != b->c_cc[VSTOP]; return tty_termios_hw_change(a, b) || ixon_change; } static bool pl2303_enable_xonxoff(struct tty_struct *tty, const struct pl2303_type_data *type) { if (!I_IXON(tty) || I_IXANY(tty)) return false; if (START_CHAR(tty) != 0x11 || STOP_CHAR(tty) != 0x13) return false; if (type->no_autoxonxoff) return false; return true; } static void pl2303_set_termios(struct tty_struct *tty, struct usb_serial_port *port, const struct ktermios *old_termios) { struct usb_serial *serial = port->serial; struct pl2303_serial_private *spriv = usb_get_serial_data(serial); struct pl2303_private *priv = usb_get_serial_port_data(port); unsigned long flags; unsigned char *buf; int ret; u8 control; if (old_termios && !pl2303_termios_change(&tty->termios, old_termios)) return; buf = kzalloc(7, GFP_KERNEL); if (!buf) { /* Report back no change occurred */ if (old_termios) tty->termios = *old_termios; return; } pl2303_get_line_request(port, buf); buf[6] = tty_get_char_size(tty->termios.c_cflag); dev_dbg(&port->dev, "data bits = %d\n", buf[6]); /* For reference buf[0]:buf[3] baud rate value */ pl2303_encode_baud_rate(tty, port, &buf[0]); /* For reference buf[4]=0 is 1 stop bits */ /* For reference buf[4]=1 is 1.5 stop bits */ /* For reference buf[4]=2 is 2 stop bits */ if (C_CSTOPB(tty)) { /* * NOTE: Comply with "real" UARTs / RS232: * use 1.5 instead of 2 stop bits with 5 data bits */ if (C_CSIZE(tty) == CS5) { buf[4] = 1; dev_dbg(&port->dev, "stop bits = 1.5\n"); } else { buf[4] = 2; dev_dbg(&port->dev, "stop bits = 2\n"); } } else { buf[4] = 0; dev_dbg(&port->dev, "stop bits = 1\n"); } if (C_PARENB(tty)) { /* For reference buf[5]=0 is none parity */ /* For reference buf[5]=1 is odd parity */ /* For reference buf[5]=2 is even parity */ /* For reference buf[5]=3 is mark parity */ /* For reference buf[5]=4 is space parity */ if (C_PARODD(tty)) { if (C_CMSPAR(tty)) { buf[5] = 3; dev_dbg(&port->dev, "parity = mark\n"); } else { buf[5] = 1; dev_dbg(&port->dev, "parity = odd\n"); } } else { if (C_CMSPAR(tty)) { buf[5] = 4; dev_dbg(&port->dev, "parity = space\n"); } else { buf[5] = 2; dev_dbg(&port->dev, "parity = even\n"); } } } else { buf[5] = 0; dev_dbg(&port->dev, "parity = none\n"); } /* * Some PL2303 are known to lose bytes if you change serial settings * even to the same values as before. Thus we actually need to filter * in this specific case. * * Note that the tty_termios_hw_change check above is not sufficient * as a previously requested baud rate may differ from the one * actually used (and stored in old_termios). * * NOTE: No additional locking needed for line_settings as it is * only used in set_termios, which is serialised against itself. */ if (!old_termios || memcmp(buf, priv->line_settings, 7)) { ret = pl2303_set_line_request(port, buf); if (!ret) memcpy(priv->line_settings, buf, 7); } /* change control lines if we are switching to or from B0 */ spin_lock_irqsave(&priv->lock, flags); control = priv->line_control; if (C_BAUD(tty) == B0) priv->line_control &= ~(CONTROL_DTR | CONTROL_RTS); else if (old_termios && (old_termios->c_cflag & CBAUD) == B0) priv->line_control |= (CONTROL_DTR | CONTROL_RTS); if (control != priv->line_control) { control = priv->line_control; spin_unlock_irqrestore(&priv->lock, flags); pl2303_set_control_lines(port, control); } else { spin_unlock_irqrestore(&priv->lock, flags); } if (C_CRTSCTS(tty)) { if (spriv->quirks & PL2303_QUIRK_LEGACY) { pl2303_update_reg(serial, 0, PL2303_FLOWCTRL_MASK, 0x40); } else if (spriv->type == &pl2303_type_data[TYPE_HXN]) { pl2303_update_reg(serial, PL2303_HXN_FLOWCTRL_REG, PL2303_HXN_FLOWCTRL_MASK, PL2303_HXN_FLOWCTRL_RTS_CTS); } else { pl2303_update_reg(serial, 0, PL2303_FLOWCTRL_MASK, 0x60); } } else if (pl2303_enable_xonxoff(tty, spriv->type)) { if (spriv->type == &pl2303_type_data[TYPE_HXN]) { pl2303_update_reg(serial, PL2303_HXN_FLOWCTRL_REG, PL2303_HXN_FLOWCTRL_MASK, PL2303_HXN_FLOWCTRL_XON_XOFF); } else { pl2303_update_reg(serial, 0, PL2303_FLOWCTRL_MASK, 0xc0); } } else { if (spriv->type == &pl2303_type_data[TYPE_HXN]) { pl2303_update_reg(serial, PL2303_HXN_FLOWCTRL_REG, PL2303_HXN_FLOWCTRL_MASK, PL2303_HXN_FLOWCTRL_NONE); } else { pl2303_update_reg(serial, 0, PL2303_FLOWCTRL_MASK, 0); } } kfree(buf); } static void pl2303_dtr_rts(struct usb_serial_port *port, int on) { struct pl2303_private *priv = usb_get_serial_port_data(port); unsigned long flags; u8 control; spin_lock_irqsave(&priv->lock, flags); if (on) priv->line_control |= (CONTROL_DTR | CONTROL_RTS); else priv->line_control &= ~(CONTROL_DTR | CONTROL_RTS); control = priv->line_control; spin_unlock_irqrestore(&priv->lock, flags); pl2303_set_control_lines(port, control); } static void pl2303_close(struct usb_serial_port *port) { usb_serial_generic_close(port); usb_kill_urb(port->interrupt_in_urb); pl2303_set_break(port, false); } static int pl2303_open(struct tty_struct *tty, struct usb_serial_port *port) { struct usb_serial *serial = port->serial; struct pl2303_serial_private *spriv = usb_get_serial_data(serial); int result; if (spriv->quirks & PL2303_QUIRK_LEGACY) { usb_clear_halt(serial->dev, port->write_urb->pipe); usb_clear_halt(serial->dev, port->read_urb->pipe); } else { /* reset upstream data pipes */ if (spriv->type == &pl2303_type_data[TYPE_HXN]) { pl2303_vendor_write(serial, PL2303_HXN_RESET_REG, PL2303_HXN_RESET_UPSTREAM_PIPE | PL2303_HXN_RESET_DOWNSTREAM_PIPE); } else { pl2303_vendor_write(serial, 8, 0); pl2303_vendor_write(serial, 9, 0); } } /* Setup termios */ if (tty) pl2303_set_termios(tty, port, NULL); result = usb_submit_urb(port->interrupt_in_urb, GFP_KERNEL); if (result) { dev_err(&port->dev, "failed to submit interrupt urb: %d\n", result); return result; } result = usb_serial_generic_open(tty, port); if (result) { usb_kill_urb(port->interrupt_in_urb); return result; } return 0; } static int pl2303_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear) { struct usb_serial_port *port = tty->driver_data; struct pl2303_private *priv = usb_get_serial_port_data(port); unsigned long flags; u8 control; int ret; spin_lock_irqsave(&priv->lock, flags); if (set & TIOCM_RTS) priv->line_control |= CONTROL_RTS; if (set & TIOCM_DTR) priv->line_control |= CONTROL_DTR; if (clear & TIOCM_RTS) priv->line_control &= ~CONTROL_RTS; if (clear & TIOCM_DTR) priv->line_control &= ~CONTROL_DTR; control = priv->line_control; spin_unlock_irqrestore(&priv->lock, flags); ret = pl2303_set_control_lines(port, control); if (ret) return usb_translate_errors(ret); return 0; } static int pl2303_tiocmget(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct pl2303_private *priv = usb_get_serial_port_data(port); unsigned long flags; unsigned int mcr; unsigned int status; unsigned int result; spin_lock_irqsave(&priv->lock, flags); mcr = priv->line_control; status = priv->line_status; spin_unlock_irqrestore(&priv->lock, flags); result = ((mcr & CONTROL_DTR) ? TIOCM_DTR : 0) | ((mcr & CONTROL_RTS) ? TIOCM_RTS : 0) | ((status & UART_CTS) ? TIOCM_CTS : 0) | ((status & UART_DSR) ? TIOCM_DSR : 0) | ((status & UART_RING) ? TIOCM_RI : 0) | ((status & UART_DCD) ? TIOCM_CD : 0); dev_dbg(&port->dev, "%s - result = %x\n", __func__, result); return result; } static int pl2303_carrier_raised(struct usb_serial_port *port) { struct pl2303_private *priv = usb_get_serial_port_data(port); if (priv->line_status & UART_DCD) return 1; return 0; } static int pl2303_set_break(struct usb_serial_port *port, bool enable) { struct usb_serial *serial = port->serial; u16 state; int result; if (enable) state = BREAK_ON; else state = BREAK_OFF; dev_dbg(&port->dev, "%s - turning break %s\n", __func__, state == BREAK_OFF ? "off" : "on"); result = usb_control_msg(serial->dev, usb_sndctrlpipe(serial->dev, 0), BREAK_REQUEST, BREAK_REQUEST_TYPE, state, 0, NULL, 0, 100); if (result) { dev_err(&port->dev, "error sending break = %d\n", result); return result; } return 0; } static int pl2303_break_ctl(struct tty_struct *tty, int state) { struct usb_serial_port *port = tty->driver_data; return pl2303_set_break(port, state); } static void pl2303_update_line_status(struct usb_serial_port *port, unsigned char *data, unsigned int actual_length) { struct usb_serial *serial = port->serial; struct pl2303_serial_private *spriv = usb_get_serial_data(serial); struct pl2303_private *priv = usb_get_serial_port_data(port); struct tty_struct *tty; unsigned long flags; unsigned int status_idx = UART_STATE_INDEX; u8 status; u8 delta; if (spriv->quirks & PL2303_QUIRK_UART_STATE_IDX0) status_idx = 0; if (actual_length < status_idx + 1) return; status = data[status_idx]; /* Save off the uart status for others to look at */ spin_lock_irqsave(&priv->lock, flags); delta = priv->line_status ^ status; priv->line_status = status; spin_unlock_irqrestore(&priv->lock, flags); if (status & UART_BREAK_ERROR) usb_serial_handle_break(port); if (delta & UART_STATE_MSR_MASK) { if (delta & UART_CTS) port->icount.cts++; if (delta & UART_DSR) port->icount.dsr++; if (delta & UART_RING) port->icount.rng++; if (delta & UART_DCD) { port->icount.dcd++; tty = tty_port_tty_get(&port->port); if (tty) { usb_serial_handle_dcd_change(port, tty, status & UART_DCD); tty_kref_put(tty); } } wake_up_interruptible(&port->port.delta_msr_wait); } } static void pl2303_read_int_callback(struct urb *urb) { struct usb_serial_port *port = urb->context; unsigned char *data = urb->transfer_buffer; unsigned int actual_length = urb->actual_length; int status = urb->status; int retval; switch (status) { case 0: /* success */ break; case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: /* this urb is terminated, clean up */ dev_dbg(&port->dev, "%s - urb shutting down with status: %d\n", __func__, status); return; default: dev_dbg(&port->dev, "%s - nonzero urb status received: %d\n", __func__, status); goto exit; } usb_serial_debug_data(&port->dev, __func__, urb->actual_length, urb->transfer_buffer); pl2303_update_line_status(port, data, actual_length); exit: retval = usb_submit_urb(urb, GFP_ATOMIC); if (retval) { dev_err(&port->dev, "%s - usb_submit_urb failed with result %d\n", __func__, retval); } } static void pl2303_process_read_urb(struct urb *urb) { struct usb_serial_port *port = urb->context; struct pl2303_private *priv = usb_get_serial_port_data(port); unsigned char *data = urb->transfer_buffer; char tty_flag = TTY_NORMAL; unsigned long flags; u8 line_status; int i; /* update line status */ spin_lock_irqsave(&priv->lock, flags); line_status = priv->line_status; priv->line_status &= ~UART_STATE_TRANSIENT_MASK; spin_unlock_irqrestore(&priv->lock, flags); if (!urb->actual_length) return; /* * Break takes precedence over parity, which takes precedence over * framing errors. */ if (line_status & UART_BREAK_ERROR) tty_flag = TTY_BREAK; else if (line_status & UART_PARITY_ERROR) tty_flag = TTY_PARITY; else if (line_status & UART_FRAME_ERROR) tty_flag = TTY_FRAME; if (tty_flag != TTY_NORMAL) dev_dbg(&port->dev, "%s - tty_flag = %d\n", __func__, tty_flag); /* overrun is special, not associated with a char */ if (line_status & UART_OVERRUN_ERROR) tty_insert_flip_char(&port->port, 0, TTY_OVERRUN); if (port->sysrq) { for (i = 0; i < urb->actual_length; ++i) if (!usb_serial_handle_sysrq_char(port, data[i])) tty_insert_flip_char(&port->port, data[i], tty_flag); } else { tty_insert_flip_string_fixed_flag(&port->port, data, tty_flag, urb->actual_length); } tty_flip_buffer_push(&port->port); } static struct usb_serial_driver pl2303_device = { .driver = { .owner = THIS_MODULE, .name = "pl2303", }, .id_table = id_table, .num_bulk_in = 1, .num_bulk_out = 1, .num_interrupt_in = 0, /* see pl2303_calc_num_ports */ .bulk_in_size = 256, .bulk_out_size = 256, .open = pl2303_open, .close = pl2303_close, .dtr_rts = pl2303_dtr_rts, .carrier_raised = pl2303_carrier_raised, .break_ctl = pl2303_break_ctl, .set_termios = pl2303_set_termios, .tiocmget = pl2303_tiocmget, .tiocmset = pl2303_tiocmset, .tiocmiwait = usb_serial_generic_tiocmiwait, .process_read_urb = pl2303_process_read_urb, .read_int_callback = pl2303_read_int_callback, .probe = pl2303_probe, .calc_num_ports = pl2303_calc_num_ports, .attach = pl2303_startup, .release = pl2303_release, .port_probe = pl2303_port_probe, .port_remove = pl2303_port_remove, }; static struct usb_serial_driver * const serial_drivers[] = { &pl2303_device, NULL }; module_usb_serial_driver(serial_drivers, id_table); MODULE_DESCRIPTION("Prolific PL2303 USB to serial adaptor driver"); MODULE_LICENSE("GPL v2"); |
| 3 2 2 2 2 2 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 | // SPDX-License-Identifier: GPL-2.0-or-later /* * USB RedRat3 IR Transceiver rc-core driver * * Copyright (c) 2011 by Jarod Wilson <jarod@redhat.com> * based heavily on the work of Stephen Cox, with additional * help from RedRat Ltd. * * This driver began life based on an old version of the first-generation * lirc_mceusb driver from the lirc 0.7.2 distribution. It was then * significantly rewritten by Stephen Cox with the aid of RedRat Ltd's * Chris Dodge. * * The driver was then ported to rc-core and significantly rewritten again, * by Jarod, using the in-kernel mceusb driver as a guide, after an initial * port effort was started by Stephen. * * TODO LIST: * - fix lirc not showing repeats properly * -- * * The RedRat3 is a USB transceiver with both send & receive, * with 2 separate sensors available for receive to enable * both good long range reception for general use, and good * short range reception when required for learning a signal. * * http://www.redrat.co.uk/ * * It uses its own little protocol to communicate, the required * parts of which are embedded within this driver. * -- */ #include <asm/unaligned.h> #include <linux/device.h> #include <linux/leds.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/usb.h> #include <linux/usb/input.h> #include <media/rc-core.h> /* Driver Information */ #define DRIVER_AUTHOR "Jarod Wilson <jarod@redhat.com>" #define DRIVER_AUTHOR2 "The Dweller, Stephen Cox" #define DRIVER_DESC "RedRat3 USB IR Transceiver Driver" #define DRIVER_NAME "redrat3" /* bulk data transfer types */ #define RR3_ERROR 0x01 #define RR3_MOD_SIGNAL_IN 0x20 #define RR3_MOD_SIGNAL_OUT 0x21 /* Get the RR firmware version */ #define RR3_FW_VERSION 0xb1 #define RR3_FW_VERSION_LEN 64 /* Send encoded signal bulk-sent earlier*/ #define RR3_TX_SEND_SIGNAL 0xb3 #define RR3_SET_IR_PARAM 0xb7 #define RR3_GET_IR_PARAM 0xb8 /* Blink the red LED on the device */ #define RR3_BLINK_LED 0xb9 /* Read serial number of device */ #define RR3_READ_SER_NO 0xba #define RR3_SER_NO_LEN 4 /* Start capture with the RC receiver */ #define RR3_RC_DET_ENABLE 0xbb /* Stop capture with the RC receiver */ #define RR3_RC_DET_DISABLE 0xbc /* Start capture with the wideband receiver */ #define RR3_MODSIG_CAPTURE 0xb2 /* Return the status of RC detector capture */ #define RR3_RC_DET_STATUS 0xbd /* Reset redrat */ #define RR3_RESET 0xa0 /* Max number of lengths in the signal. */ #define RR3_IR_IO_MAX_LENGTHS 0x01 /* Periods to measure mod. freq. */ #define RR3_IR_IO_PERIODS_MF 0x02 /* Size of memory for main signal data */ #define RR3_IR_IO_SIG_MEM_SIZE 0x03 /* Delta value when measuring lengths */ #define RR3_IR_IO_LENGTH_FUZZ 0x04 /* Timeout for end of signal detection */ #define RR3_IR_IO_SIG_TIMEOUT 0x05 /* Minimum value for pause recognition. */ #define RR3_IR_IO_MIN_PAUSE 0x06 /* Clock freq. of EZ-USB chip */ #define RR3_CLK 24000000 /* Clock periods per timer count */ #define RR3_CLK_PER_COUNT 12 /* (RR3_CLK / RR3_CLK_PER_COUNT) */ #define RR3_CLK_CONV_FACTOR 2000000 /* USB bulk-in wideband IR data endpoint address */ #define RR3_WIDE_IN_EP_ADDR 0x81 /* USB bulk-in narrowband IR data endpoint address */ #define RR3_NARROW_IN_EP_ADDR 0x82 /* Size of the fixed-length portion of the signal */ #define RR3_DRIVER_MAXLENS 255 #define RR3_MAX_SIG_SIZE 512 #define RR3_TIME_UNIT 50 #define RR3_END_OF_SIGNAL 0x7f #define RR3_TX_TRAILER_LEN 2 #define RR3_RX_MIN_TIMEOUT 5 #define RR3_RX_MAX_TIMEOUT 2000 /* The 8051's CPUCS Register address */ #define RR3_CPUCS_REG_ADDR 0x7f92 #define USB_RR3USB_VENDOR_ID 0x112a #define USB_RR3USB_PRODUCT_ID 0x0001 #define USB_RR3IIUSB_PRODUCT_ID 0x0005 /* * The redrat3 encodes an IR signal as set of different lengths and a set * of indices into those lengths. This sets how much two lengths must * differ before they are considered distinct, the value is specified * in microseconds. * Default 5, value 0 to 127. */ static int length_fuzz = 5; module_param(length_fuzz, uint, 0644); MODULE_PARM_DESC(length_fuzz, "Length Fuzz (0-127)"); /* * When receiving a continuous ir stream (for example when a user is * holding a button down on a remote), this specifies the minimum size * of a space when the redrat3 sends a irdata packet to the host. Specified * in milliseconds. Default value 18ms. * The value can be between 2 and 30 inclusive. */ static int minimum_pause = 18; module_param(minimum_pause, uint, 0644); MODULE_PARM_DESC(minimum_pause, "Minimum Pause in ms (2-30)"); /* * The carrier frequency is measured during the first pulse of the IR * signal. The larger the number of periods used To measure, the more * accurate the result is likely to be, however some signals have short * initial pulses, so in some case it may be necessary to reduce this value. * Default 8, value 1 to 255. */ static int periods_measure_carrier = 8; module_param(periods_measure_carrier, uint, 0644); MODULE_PARM_DESC(periods_measure_carrier, "Number of Periods to Measure Carrier (1-255)"); struct redrat3_header { __be16 length; __be16 transfer_type; } __packed; /* sending and receiving irdata */ struct redrat3_irdata { struct redrat3_header header; __be32 pause; __be16 mod_freq_count; __be16 num_periods; __u8 max_lengths; __u8 no_lengths; __be16 max_sig_size; __be16 sig_size; __u8 no_repeats; __be16 lens[RR3_DRIVER_MAXLENS]; /* not aligned */ __u8 sigdata[RR3_MAX_SIG_SIZE]; } __packed; /* firmware errors */ struct redrat3_error { struct redrat3_header header; __be16 fw_error; } __packed; /* table of devices that work with this driver */ static const struct usb_device_id redrat3_dev_table[] = { /* Original version of the RedRat3 */ {USB_DEVICE(USB_RR3USB_VENDOR_ID, USB_RR3USB_PRODUCT_ID)}, /* Second Version/release of the RedRat3 - RetRat3-II */ {USB_DEVICE(USB_RR3USB_VENDOR_ID, USB_RR3IIUSB_PRODUCT_ID)}, {} /* Terminating entry */ }; /* Structure to hold all of our device specific stuff */ struct redrat3_dev { /* core device bits */ struct rc_dev *rc; struct device *dev; /* led control */ struct led_classdev led; atomic_t flash; struct usb_ctrlrequest flash_control; struct urb *flash_urb; u8 flash_in_buf; /* learning */ bool wideband; struct usb_ctrlrequest learn_control; struct urb *learn_urb; u8 learn_buf; /* save off the usb device pointer */ struct usb_device *udev; /* the receive endpoint */ struct usb_endpoint_descriptor *ep_narrow; /* the buffer to receive data */ void *bulk_in_buf; /* urb used to read ir data */ struct urb *narrow_urb; struct urb *wide_urb; /* the send endpoint */ struct usb_endpoint_descriptor *ep_out; /* usb dma */ dma_addr_t dma_in; /* Is the device currently transmitting?*/ bool transmitting; /* store for current packet */ struct redrat3_irdata irdata; u16 bytes_read; u32 carrier; char name[64]; char phys[64]; }; static void redrat3_dump_fw_error(struct redrat3_dev *rr3, int code) { if (!rr3->transmitting && (code != 0x40)) dev_info(rr3->dev, "fw error code 0x%02x: ", code); switch (code) { case 0x00: pr_cont("No Error\n"); break; /* Codes 0x20 through 0x2f are IR Firmware Errors */ case 0x20: pr_cont("Initial signal pulse not long enough to measure carrier frequency\n"); break; case 0x21: pr_cont("Not enough length values allocated for signal\n"); break; case 0x22: pr_cont("Not enough memory allocated for signal data\n"); break; case 0x23: pr_cont("Too many signal repeats\n"); break; case 0x28: pr_cont("Insufficient memory available for IR signal data memory allocation\n"); break; case 0x29: pr_cont("Insufficient memory available for IrDa signal data memory allocation\n"); break; /* Codes 0x30 through 0x3f are USB Firmware Errors */ case 0x30: pr_cont("Insufficient memory available for bulk transfer structure\n"); break; /* * Other error codes... These are primarily errors that can occur in * the control messages sent to the redrat */ case 0x40: if (!rr3->transmitting) pr_cont("Signal capture has been terminated\n"); break; case 0x41: pr_cont("Attempt to set/get and unknown signal I/O algorithm parameter\n"); break; case 0x42: pr_cont("Signal capture already started\n"); break; default: pr_cont("Unknown Error\n"); break; } } static u32 redrat3_val_to_mod_freq(struct redrat3_irdata *irdata) { u32 mod_freq = 0; u16 mod_freq_count = be16_to_cpu(irdata->mod_freq_count); if (mod_freq_count != 0) mod_freq = (RR3_CLK * be16_to_cpu(irdata->num_periods)) / (mod_freq_count * RR3_CLK_PER_COUNT); return mod_freq; } /* this function scales down the figures for the same result... */ static u32 redrat3_len_to_us(u32 length) { u32 biglen = length * 1000; u32 divisor = (RR3_CLK_CONV_FACTOR) / 1000; u32 result = (u32) (biglen / divisor); /* don't allow zero lengths to go back, breaks lirc */ return result ? result : 1; } /* * convert us back into redrat3 lengths * * length * 1000 length * 1000000 * ------------- = ---------------- = micro * rr3clk / 1000 rr3clk * 6 * 2 4 * 3 micro * rr3clk micro * rr3clk / 1000 * ----- = 4 ----- = 6 -------------- = len --------------------- * 3 2 1000000 1000 */ static u32 redrat3_us_to_len(u32 microsec) { u32 result; u32 divisor; microsec = (microsec > IR_MAX_DURATION) ? IR_MAX_DURATION : microsec; divisor = (RR3_CLK_CONV_FACTOR / 1000); result = (u32)(microsec * divisor) / 1000; /* don't allow zero lengths to go back, breaks lirc */ return result ? result : 1; } static void redrat3_process_ir_data(struct redrat3_dev *rr3) { struct ir_raw_event rawir = {}; struct device *dev; unsigned int i, sig_size, offset, val; u32 mod_freq; dev = rr3->dev; mod_freq = redrat3_val_to_mod_freq(&rr3->irdata); dev_dbg(dev, "Got mod_freq of %u\n", mod_freq); if (mod_freq && rr3->wideband) { struct ir_raw_event ev = { .carrier_report = 1, .carrier = mod_freq }; ir_raw_event_store(rr3->rc, &ev); } /* process each rr3 encoded byte into an int */ sig_size = be16_to_cpu(rr3->irdata.sig_size); for (i = 0; i < sig_size; i++) { offset = rr3->irdata.sigdata[i]; val = get_unaligned_be16(&rr3->irdata.lens[offset]); /* we should always get pulse/space/pulse/space samples */ if (i % 2) rawir.pulse = false; else rawir.pulse = true; rawir.duration = redrat3_len_to_us(val); /* cap the value to IR_MAX_DURATION */ rawir.duration = (rawir.duration > IR_MAX_DURATION) ? IR_MAX_DURATION : rawir.duration; dev_dbg(dev, "storing %s with duration %d (i: %d)\n", rawir.pulse ? "pulse" : "space", rawir.duration, i); ir_raw_event_store_with_filter(rr3->rc, &rawir); } /* add a trailing space */ rawir.pulse = false; rawir.timeout = true; rawir.duration = rr3->rc->timeout; dev_dbg(dev, "storing trailing timeout with duration %d\n", rawir.duration); ir_raw_event_store_with_filter(rr3->rc, &rawir); dev_dbg(dev, "calling ir_raw_event_handle\n"); ir_raw_event_handle(rr3->rc); } /* Util fn to send rr3 cmds */ static int redrat3_send_cmd(int cmd, struct redrat3_dev *rr3) { struct usb_device *udev; u8 *data; int res; data = kzalloc(sizeof(u8), GFP_KERNEL); if (!data) return -ENOMEM; udev = rr3->udev; res = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), cmd, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, 0x0000, 0x0000, data, sizeof(u8), 10000); if (res < 0) { dev_err(rr3->dev, "%s: Error sending rr3 cmd res %d, data %d", __func__, res, *data); res = -EIO; } else res = data[0]; kfree(data); return res; } /* Enables the long range detector and starts async receive */ static int redrat3_enable_detector(struct redrat3_dev *rr3) { struct device *dev = rr3->dev; u8 ret; ret = redrat3_send_cmd(RR3_RC_DET_ENABLE, rr3); if (ret != 0) dev_dbg(dev, "%s: unexpected ret of %d\n", __func__, ret); ret = redrat3_send_cmd(RR3_RC_DET_STATUS, rr3); if (ret != 1) { dev_err(dev, "%s: detector status: %d, should be 1\n", __func__, ret); return -EIO; } ret = usb_submit_urb(rr3->narrow_urb, GFP_KERNEL); if (ret) { dev_err(rr3->dev, "narrow band urb failed: %d", ret); return ret; } ret = usb_submit_urb(rr3->wide_urb, GFP_KERNEL); if (ret) dev_err(rr3->dev, "wide band urb failed: %d", ret); return ret; } static inline void redrat3_delete(struct redrat3_dev *rr3, struct usb_device *udev) { usb_kill_urb(rr3->narrow_urb); usb_kill_urb(rr3->wide_urb); usb_kill_urb(rr3->flash_urb); usb_kill_urb(rr3->learn_urb); usb_free_urb(rr3->narrow_urb); usb_free_urb(rr3->wide_urb); usb_free_urb(rr3->flash_urb); usb_free_urb(rr3->learn_urb); usb_free_coherent(udev, le16_to_cpu(rr3->ep_narrow->wMaxPacketSize), rr3->bulk_in_buf, rr3->dma_in); kfree(rr3); } static u32 redrat3_get_timeout(struct redrat3_dev *rr3) { __be32 *tmp; u32 timeout = MS_TO_US(150); /* a sane default, if things go haywire */ int len, ret, pipe; len = sizeof(*tmp); tmp = kzalloc(len, GFP_KERNEL); if (!tmp) return timeout; pipe = usb_rcvctrlpipe(rr3->udev, 0); ret = usb_control_msg(rr3->udev, pipe, RR3_GET_IR_PARAM, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, RR3_IR_IO_SIG_TIMEOUT, 0, tmp, len, 5000); if (ret != len) dev_warn(rr3->dev, "Failed to read timeout from hardware\n"); else { timeout = redrat3_len_to_us(be32_to_cpup(tmp)); dev_dbg(rr3->dev, "Got timeout of %d ms\n", timeout / 1000); } kfree(tmp); return timeout; } static int redrat3_set_timeout(struct rc_dev *rc_dev, unsigned int timeoutus) { struct redrat3_dev *rr3 = rc_dev->priv; struct usb_device *udev = rr3->udev; struct device *dev = rr3->dev; __be32 *timeout; int ret; timeout = kmalloc(sizeof(*timeout), GFP_KERNEL); if (!timeout) return -ENOMEM; *timeout = cpu_to_be32(redrat3_us_to_len(timeoutus)); ret = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), RR3_SET_IR_PARAM, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_OUT, RR3_IR_IO_SIG_TIMEOUT, 0, timeout, sizeof(*timeout), 25000); dev_dbg(dev, "set ir parm timeout %d ret 0x%02x\n", be32_to_cpu(*timeout), ret); if (ret == sizeof(*timeout)) ret = 0; else if (ret >= 0) ret = -EIO; kfree(timeout); return ret; } static void redrat3_reset(struct redrat3_dev *rr3) { struct usb_device *udev = rr3->udev; struct device *dev = rr3->dev; int rc, rxpipe, txpipe; u8 *val; size_t const len = sizeof(*val); rxpipe = usb_rcvctrlpipe(udev, 0); txpipe = usb_sndctrlpipe(udev, 0); val = kmalloc(len, GFP_KERNEL); if (!val) return; *val = 0x01; rc = usb_control_msg(udev, rxpipe, RR3_RESET, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, RR3_CPUCS_REG_ADDR, 0, val, len, 25000); dev_dbg(dev, "reset returned 0x%02x\n", rc); *val = length_fuzz; rc = usb_control_msg(udev, txpipe, RR3_SET_IR_PARAM, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_OUT, RR3_IR_IO_LENGTH_FUZZ, 0, val, len, 25000); dev_dbg(dev, "set ir parm len fuzz %d rc 0x%02x\n", *val, rc); *val = (65536 - (minimum_pause * 2000)) / 256; rc = usb_control_msg(udev, txpipe, RR3_SET_IR_PARAM, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_OUT, RR3_IR_IO_MIN_PAUSE, 0, val, len, 25000); dev_dbg(dev, "set ir parm min pause %d rc 0x%02x\n", *val, rc); *val = periods_measure_carrier; rc = usb_control_msg(udev, txpipe, RR3_SET_IR_PARAM, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_OUT, RR3_IR_IO_PERIODS_MF, 0, val, len, 25000); dev_dbg(dev, "set ir parm periods measure carrier %d rc 0x%02x", *val, rc); *val = RR3_DRIVER_MAXLENS; rc = usb_control_msg(udev, txpipe, RR3_SET_IR_PARAM, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_OUT, RR3_IR_IO_MAX_LENGTHS, 0, val, len, 25000); dev_dbg(dev, "set ir parm max lens %d rc 0x%02x\n", *val, rc); kfree(val); } static void redrat3_get_firmware_rev(struct redrat3_dev *rr3) { int rc; char *buffer; buffer = kcalloc(RR3_FW_VERSION_LEN + 1, sizeof(*buffer), GFP_KERNEL); if (!buffer) return; rc = usb_control_msg(rr3->udev, usb_rcvctrlpipe(rr3->udev, 0), RR3_FW_VERSION, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, 0, 0, buffer, RR3_FW_VERSION_LEN, 5000); if (rc >= 0) dev_info(rr3->dev, "Firmware rev: %s", buffer); else dev_err(rr3->dev, "Problem fetching firmware ID\n"); kfree(buffer); } static void redrat3_read_packet_start(struct redrat3_dev *rr3, unsigned len) { struct redrat3_header *header = rr3->bulk_in_buf; unsigned pktlen, pkttype; /* grab the Length and type of transfer */ pktlen = be16_to_cpu(header->length); pkttype = be16_to_cpu(header->transfer_type); if (pktlen > sizeof(rr3->irdata)) { dev_warn(rr3->dev, "packet length %u too large\n", pktlen); return; } switch (pkttype) { case RR3_ERROR: if (len >= sizeof(struct redrat3_error)) { struct redrat3_error *error = rr3->bulk_in_buf; unsigned fw_error = be16_to_cpu(error->fw_error); redrat3_dump_fw_error(rr3, fw_error); } break; case RR3_MOD_SIGNAL_IN: memcpy(&rr3->irdata, rr3->bulk_in_buf, len); rr3->bytes_read = len; dev_dbg(rr3->dev, "bytes_read %d, pktlen %d\n", rr3->bytes_read, pktlen); break; default: dev_dbg(rr3->dev, "ignoring packet with type 0x%02x, len of %d, 0x%02x\n", pkttype, len, pktlen); break; } } static void redrat3_read_packet_continue(struct redrat3_dev *rr3, unsigned len) { void *irdata = &rr3->irdata; if (len + rr3->bytes_read > sizeof(rr3->irdata)) { dev_warn(rr3->dev, "too much data for packet\n"); rr3->bytes_read = 0; return; } memcpy(irdata + rr3->bytes_read, rr3->bulk_in_buf, len); rr3->bytes_read += len; dev_dbg(rr3->dev, "bytes_read %d, pktlen %d\n", rr3->bytes_read, be16_to_cpu(rr3->irdata.header.length)); } /* gather IR data from incoming urb, process it when we have enough */ static int redrat3_get_ir_data(struct redrat3_dev *rr3, unsigned len) { struct device *dev = rr3->dev; unsigned pkttype; int ret = 0; if (rr3->bytes_read == 0 && len >= sizeof(struct redrat3_header)) { redrat3_read_packet_start(rr3, len); } else if (rr3->bytes_read != 0) { redrat3_read_packet_continue(rr3, len); } else if (rr3->bytes_read == 0) { dev_err(dev, "error: no packet data read\n"); ret = -ENODATA; goto out; } if (rr3->bytes_read < be16_to_cpu(rr3->irdata.header.length) + sizeof(struct redrat3_header)) /* we're still accumulating data */ return 0; /* if we get here, we've got IR data to decode */ pkttype = be16_to_cpu(rr3->irdata.header.transfer_type); if (pkttype == RR3_MOD_SIGNAL_IN) redrat3_process_ir_data(rr3); else dev_dbg(dev, "discarding non-signal data packet (type 0x%02x)\n", pkttype); out: rr3->bytes_read = 0; return ret; } /* callback function from USB when async USB request has completed */ static void redrat3_handle_async(struct urb *urb) { struct redrat3_dev *rr3 = urb->context; int ret; switch (urb->status) { case 0: ret = redrat3_get_ir_data(rr3, urb->actual_length); if (!ret && rr3->wideband && !rr3->learn_urb->hcpriv) { ret = usb_submit_urb(rr3->learn_urb, GFP_ATOMIC); if (ret) dev_err(rr3->dev, "Failed to submit learning urb: %d", ret); } if (!ret) { /* no error, prepare to read more */ ret = usb_submit_urb(urb, GFP_ATOMIC); if (ret) dev_err(rr3->dev, "Failed to resubmit urb: %d", ret); } break; case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: usb_unlink_urb(urb); return; case -EPIPE: default: dev_warn(rr3->dev, "Error: urb status = %d\n", urb->status); rr3->bytes_read = 0; break; } } static u16 mod_freq_to_val(unsigned int mod_freq) { int mult = 6000000; /* Clk used in mod. freq. generation is CLK24/4. */ return 65536 - (mult / mod_freq); } static int redrat3_set_tx_carrier(struct rc_dev *rcdev, u32 carrier) { struct redrat3_dev *rr3 = rcdev->priv; struct device *dev = rr3->dev; dev_dbg(dev, "Setting modulation frequency to %u", carrier); if (carrier == 0) return -EINVAL; rr3->carrier = carrier; return 0; } static int redrat3_transmit_ir(struct rc_dev *rcdev, unsigned *txbuf, unsigned count) { struct redrat3_dev *rr3 = rcdev->priv; struct device *dev = rr3->dev; struct redrat3_irdata *irdata = NULL; int ret, ret_len; int lencheck, cur_sample_len, pipe; int *sample_lens = NULL; u8 curlencheck = 0; unsigned i, sendbuf_len; if (rr3->transmitting) { dev_warn(dev, "%s: transmitter already in use\n", __func__); return -EAGAIN; } if (count > RR3_MAX_SIG_SIZE - RR3_TX_TRAILER_LEN) return -EINVAL; /* rr3 will disable rc detector on transmit */ rr3->transmitting = true; sample_lens = kcalloc(RR3_DRIVER_MAXLENS, sizeof(*sample_lens), GFP_KERNEL); if (!sample_lens) return -ENOMEM; irdata = kzalloc(sizeof(*irdata), GFP_KERNEL); if (!irdata) { ret = -ENOMEM; goto out; } for (i = 0; i < count; i++) { cur_sample_len = redrat3_us_to_len(txbuf[i]); if (cur_sample_len > 0xffff) { dev_warn(dev, "transmit period of %uus truncated to %uus\n", txbuf[i], redrat3_len_to_us(0xffff)); cur_sample_len = 0xffff; } for (lencheck = 0; lencheck < curlencheck; lencheck++) { if (sample_lens[lencheck] == cur_sample_len) break; } if (lencheck == curlencheck) { dev_dbg(dev, "txbuf[%d]=%u, pos %d, enc %u\n", i, txbuf[i], curlencheck, cur_sample_len); if (curlencheck < RR3_DRIVER_MAXLENS) { /* now convert the value to a proper * rr3 value.. */ sample_lens[curlencheck] = cur_sample_len; put_unaligned_be16(cur_sample_len, &irdata->lens[curlencheck]); curlencheck++; } else { ret = -EINVAL; goto out; } } irdata->sigdata[i] = lencheck; } irdata->sigdata[count] = RR3_END_OF_SIGNAL; irdata->sigdata[count + 1] = RR3_END_OF_SIGNAL; sendbuf_len = offsetof(struct redrat3_irdata, sigdata[count + RR3_TX_TRAILER_LEN]); /* fill in our packet header */ irdata->header.length = cpu_to_be16(sendbuf_len - sizeof(struct redrat3_header)); irdata->header.transfer_type = cpu_to_be16(RR3_MOD_SIGNAL_OUT); irdata->pause = cpu_to_be32(redrat3_len_to_us(100)); irdata->mod_freq_count = cpu_to_be16(mod_freq_to_val(rr3->carrier)); irdata->no_lengths = curlencheck; irdata->sig_size = cpu_to_be16(count + RR3_TX_TRAILER_LEN); pipe = usb_sndbulkpipe(rr3->udev, rr3->ep_out->bEndpointAddress); ret = usb_bulk_msg(rr3->udev, pipe, irdata, sendbuf_len, &ret_len, 10000); dev_dbg(dev, "sent %d bytes, (ret %d)\n", ret_len, ret); /* now tell the hardware to transmit what we sent it */ pipe = usb_rcvctrlpipe(rr3->udev, 0); ret = usb_control_msg(rr3->udev, pipe, RR3_TX_SEND_SIGNAL, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, 0, 0, irdata, 2, 10000); if (ret < 0) dev_err(dev, "Error: control msg send failed, rc %d\n", ret); else ret = count; out: kfree(irdata); kfree(sample_lens); rr3->transmitting = false; /* rr3 re-enables rc detector because it was enabled before */ return ret; } static void redrat3_brightness_set(struct led_classdev *led_dev, enum led_brightness brightness) { struct redrat3_dev *rr3 = container_of(led_dev, struct redrat3_dev, led); if (brightness != LED_OFF && atomic_cmpxchg(&rr3->flash, 0, 1) == 0) { int ret = usb_submit_urb(rr3->flash_urb, GFP_ATOMIC); if (ret != 0) { dev_dbg(rr3->dev, "%s: unexpected ret of %d\n", __func__, ret); atomic_set(&rr3->flash, 0); } } } static int redrat3_wideband_receiver(struct rc_dev *rcdev, int enable) { struct redrat3_dev *rr3 = rcdev->priv; int ret = 0; rr3->wideband = enable != 0; if (enable) { ret = usb_submit_urb(rr3->learn_urb, GFP_KERNEL); if (ret) dev_err(rr3->dev, "Failed to submit learning urb: %d", ret); } return ret; } static void redrat3_learn_complete(struct urb *urb) { struct redrat3_dev *rr3 = urb->context; switch (urb->status) { case 0: break; case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: usb_unlink_urb(urb); return; case -EPIPE: default: dev_err(rr3->dev, "Error: learn urb status = %d", urb->status); break; } } static void redrat3_led_complete(struct urb *urb) { struct redrat3_dev *rr3 = urb->context; switch (urb->status) { case 0: break; case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: usb_unlink_urb(urb); return; case -EPIPE: default: dev_dbg(rr3->dev, "Error: urb status = %d\n", urb->status); break; } rr3->led.brightness = LED_OFF; atomic_dec(&rr3->flash); } static struct rc_dev *redrat3_init_rc_dev(struct redrat3_dev *rr3) { struct device *dev = rr3->dev; struct rc_dev *rc; int ret; u16 prod = le16_to_cpu(rr3->udev->descriptor.idProduct); rc = rc_allocate_device(RC_DRIVER_IR_RAW); if (!rc) return NULL; snprintf(rr3->name, sizeof(rr3->name), "RedRat3%s Infrared Remote Transceiver", prod == USB_RR3IIUSB_PRODUCT_ID ? "-II" : ""); usb_make_path(rr3->udev, rr3->phys, sizeof(rr3->phys)); rc->device_name = rr3->name; rc->input_phys = rr3->phys; usb_to_input_id(rr3->udev, &rc->input_id); rc->dev.parent = dev; rc->priv = rr3; rc->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER; rc->min_timeout = MS_TO_US(RR3_RX_MIN_TIMEOUT); rc->max_timeout = MS_TO_US(RR3_RX_MAX_TIMEOUT); rc->timeout = redrat3_get_timeout(rr3); rc->s_timeout = redrat3_set_timeout; rc->tx_ir = redrat3_transmit_ir; rc->s_tx_carrier = redrat3_set_tx_carrier; rc->s_carrier_report = redrat3_wideband_receiver; rc->driver_name = DRIVER_NAME; rc->rx_resolution = 2; rc->map_name = RC_MAP_HAUPPAUGE; ret = rc_register_device(rc); if (ret < 0) { dev_err(dev, "remote dev registration failed\n"); goto out; } return rc; out: rc_free_device(rc); return NULL; } static int redrat3_dev_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usb_device *udev = interface_to_usbdev(intf); struct device *dev = &intf->dev; struct usb_host_interface *uhi; struct redrat3_dev *rr3; struct usb_endpoint_descriptor *ep; struct usb_endpoint_descriptor *ep_narrow = NULL; struct usb_endpoint_descriptor *ep_wide = NULL; struct usb_endpoint_descriptor *ep_out = NULL; u8 addr, attrs; int pipe, i; int retval = -ENOMEM; uhi = intf->cur_altsetting; /* find our bulk-in and bulk-out endpoints */ for (i = 0; i < uhi->desc.bNumEndpoints; ++i) { ep = &uhi->endpoint[i].desc; addr = ep->bEndpointAddress; attrs = ep->bmAttributes; if (((addr & USB_ENDPOINT_DIR_MASK) == USB_DIR_IN) && ((attrs & USB_ENDPOINT_XFERTYPE_MASK) == USB_ENDPOINT_XFER_BULK)) { dev_dbg(dev, "found bulk-in endpoint at 0x%02x\n", ep->bEndpointAddress); /* data comes in on 0x82, 0x81 is for learning */ if (ep->bEndpointAddress == RR3_NARROW_IN_EP_ADDR) ep_narrow = ep; if (ep->bEndpointAddress == RR3_WIDE_IN_EP_ADDR) ep_wide = ep; } if ((ep_out == NULL) && ((addr & USB_ENDPOINT_DIR_MASK) == USB_DIR_OUT) && ((attrs & USB_ENDPOINT_XFERTYPE_MASK) == USB_ENDPOINT_XFER_BULK)) { dev_dbg(dev, "found bulk-out endpoint at 0x%02x\n", ep->bEndpointAddress); ep_out = ep; } } if (!ep_narrow || !ep_out || !ep_wide) { dev_err(dev, "Couldn't find all endpoints\n"); retval = -ENODEV; goto no_endpoints; } /* allocate memory for our device state and initialize it */ rr3 = kzalloc(sizeof(*rr3), GFP_KERNEL); if (!rr3) goto no_endpoints; rr3->dev = &intf->dev; rr3->ep_narrow = ep_narrow; rr3->ep_out = ep_out; rr3->udev = udev; /* set up bulk-in endpoint */ rr3->narrow_urb = usb_alloc_urb(0, GFP_KERNEL); if (!rr3->narrow_urb) goto redrat_free; rr3->wide_urb = usb_alloc_urb(0, GFP_KERNEL); if (!rr3->wide_urb) goto redrat_free; rr3->bulk_in_buf = usb_alloc_coherent(udev, le16_to_cpu(ep_narrow->wMaxPacketSize), GFP_KERNEL, &rr3->dma_in); if (!rr3->bulk_in_buf) goto redrat_free; pipe = usb_rcvbulkpipe(udev, ep_narrow->bEndpointAddress); usb_fill_bulk_urb(rr3->narrow_urb, udev, pipe, rr3->bulk_in_buf, le16_to_cpu(ep_narrow->wMaxPacketSize), redrat3_handle_async, rr3); rr3->narrow_urb->transfer_dma = rr3->dma_in; rr3->narrow_urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; pipe = usb_rcvbulkpipe(udev, ep_wide->bEndpointAddress); usb_fill_bulk_urb(rr3->wide_urb, udev, pipe, rr3->bulk_in_buf, le16_to_cpu(ep_narrow->wMaxPacketSize), redrat3_handle_async, rr3); rr3->wide_urb->transfer_dma = rr3->dma_in; rr3->wide_urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; redrat3_reset(rr3); redrat3_get_firmware_rev(rr3); /* default.. will get overridden by any sends with a freq defined */ rr3->carrier = 38000; atomic_set(&rr3->flash, 0); rr3->flash_urb = usb_alloc_urb(0, GFP_KERNEL); if (!rr3->flash_urb) goto redrat_free; /* learn urb */ rr3->learn_urb = usb_alloc_urb(0, GFP_KERNEL); if (!rr3->learn_urb) goto redrat_free; /* setup packet is 'c0 b2 0000 0000 0001' */ rr3->learn_control.bRequestType = 0xc0; rr3->learn_control.bRequest = RR3_MODSIG_CAPTURE; rr3->learn_control.wLength = cpu_to_le16(1); usb_fill_control_urb(rr3->learn_urb, udev, usb_rcvctrlpipe(udev, 0), (unsigned char *)&rr3->learn_control, &rr3->learn_buf, sizeof(rr3->learn_buf), redrat3_learn_complete, rr3); /* setup packet is 'c0 b9 0000 0000 0001' */ rr3->flash_control.bRequestType = 0xc0; rr3->flash_control.bRequest = RR3_BLINK_LED; rr3->flash_control.wLength = cpu_to_le16(1); usb_fill_control_urb(rr3->flash_urb, udev, usb_rcvctrlpipe(udev, 0), (unsigned char *)&rr3->flash_control, &rr3->flash_in_buf, sizeof(rr3->flash_in_buf), redrat3_led_complete, rr3); /* led control */ rr3->led.name = "redrat3:red:feedback"; rr3->led.default_trigger = "rc-feedback"; rr3->led.brightness_set = redrat3_brightness_set; retval = led_classdev_register(&intf->dev, &rr3->led); if (retval) goto redrat_free; rr3->rc = redrat3_init_rc_dev(rr3); if (!rr3->rc) { retval = -ENOMEM; goto led_free; } /* might be all we need to do? */ retval = redrat3_enable_detector(rr3); if (retval < 0) goto led_free; /* we can register the device now, as it is ready */ usb_set_intfdata(intf, rr3); return 0; led_free: led_classdev_unregister(&rr3->led); redrat_free: redrat3_delete(rr3, rr3->udev); no_endpoints: return retval; } static void redrat3_dev_disconnect(struct usb_interface *intf) { struct usb_device *udev = interface_to_usbdev(intf); struct redrat3_dev *rr3 = usb_get_intfdata(intf); usb_set_intfdata(intf, NULL); rc_unregister_device(rr3->rc); led_classdev_unregister(&rr3->led); redrat3_delete(rr3, udev); } static int redrat3_dev_suspend(struct usb_interface *intf, pm_message_t message) { struct redrat3_dev *rr3 = usb_get_intfdata(intf); led_classdev_suspend(&rr3->led); usb_kill_urb(rr3->narrow_urb); usb_kill_urb(rr3->wide_urb); usb_kill_urb(rr3->flash_urb); return 0; } static int redrat3_dev_resume(struct usb_interface *intf) { struct redrat3_dev *rr3 = usb_get_intfdata(intf); if (usb_submit_urb(rr3->narrow_urb, GFP_NOIO)) return -EIO; if (usb_submit_urb(rr3->wide_urb, GFP_NOIO)) return -EIO; led_classdev_resume(&rr3->led); return 0; } static struct usb_driver redrat3_dev_driver = { .name = DRIVER_NAME, .probe = redrat3_dev_probe, .disconnect = redrat3_dev_disconnect, .suspend = redrat3_dev_suspend, .resume = redrat3_dev_resume, .reset_resume = redrat3_dev_resume, .id_table = redrat3_dev_table }; module_usb_driver(redrat3_dev_driver); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_AUTHOR(DRIVER_AUTHOR2); MODULE_LICENSE("GPL"); MODULE_DEVICE_TABLE(usb, redrat3_dev_table); |
| 132 132 107 108 122 122 120 122 5 5 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 | // SPDX-License-Identifier: GPL-2.0-or-later /* * net/l3mdev/l3mdev.c - L3 master device implementation * Copyright (c) 2015 Cumulus Networks * Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com> */ #include <linux/netdevice.h> #include <net/fib_rules.h> #include <net/l3mdev.h> static DEFINE_SPINLOCK(l3mdev_lock); struct l3mdev_handler { lookup_by_table_id_t dev_lookup; }; static struct l3mdev_handler l3mdev_handlers[L3MDEV_TYPE_MAX + 1]; static int l3mdev_check_type(enum l3mdev_type l3type) { if (l3type <= L3MDEV_TYPE_UNSPEC || l3type > L3MDEV_TYPE_MAX) return -EINVAL; return 0; } int l3mdev_table_lookup_register(enum l3mdev_type l3type, lookup_by_table_id_t fn) { struct l3mdev_handler *hdlr; int res; res = l3mdev_check_type(l3type); if (res) return res; hdlr = &l3mdev_handlers[l3type]; spin_lock(&l3mdev_lock); if (hdlr->dev_lookup) { res = -EBUSY; goto unlock; } hdlr->dev_lookup = fn; res = 0; unlock: spin_unlock(&l3mdev_lock); return res; } EXPORT_SYMBOL_GPL(l3mdev_table_lookup_register); void l3mdev_table_lookup_unregister(enum l3mdev_type l3type, lookup_by_table_id_t fn) { struct l3mdev_handler *hdlr; if (l3mdev_check_type(l3type)) return; hdlr = &l3mdev_handlers[l3type]; spin_lock(&l3mdev_lock); if (hdlr->dev_lookup == fn) hdlr->dev_lookup = NULL; spin_unlock(&l3mdev_lock); } EXPORT_SYMBOL_GPL(l3mdev_table_lookup_unregister); int l3mdev_ifindex_lookup_by_table_id(enum l3mdev_type l3type, struct net *net, u32 table_id) { lookup_by_table_id_t lookup; struct l3mdev_handler *hdlr; int ifindex = -EINVAL; int res; res = l3mdev_check_type(l3type); if (res) return res; hdlr = &l3mdev_handlers[l3type]; spin_lock(&l3mdev_lock); lookup = hdlr->dev_lookup; if (!lookup) goto unlock; ifindex = lookup(net, table_id); unlock: spin_unlock(&l3mdev_lock); return ifindex; } EXPORT_SYMBOL_GPL(l3mdev_ifindex_lookup_by_table_id); /** * l3mdev_master_ifindex_rcu - get index of L3 master device * @dev: targeted interface */ int l3mdev_master_ifindex_rcu(const struct net_device *dev) { int ifindex = 0; if (!dev) return 0; if (netif_is_l3_master(dev)) { ifindex = dev->ifindex; } else if (netif_is_l3_slave(dev)) { struct net_device *master; struct net_device *_dev = (struct net_device *)dev; /* netdev_master_upper_dev_get_rcu calls * list_first_or_null_rcu to walk the upper dev list. * list_first_or_null_rcu does not handle a const arg. We aren't * making changes, just want the master device from that list so * typecast to remove the const */ master = netdev_master_upper_dev_get_rcu(_dev); if (master) ifindex = master->ifindex; } return ifindex; } EXPORT_SYMBOL_GPL(l3mdev_master_ifindex_rcu); /** * l3mdev_master_upper_ifindex_by_index_rcu - get index of upper l3 master * device * @net: network namespace for device index lookup * @ifindex: targeted interface */ int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex) { struct net_device *dev; dev = dev_get_by_index_rcu(net, ifindex); while (dev && !netif_is_l3_master(dev)) dev = netdev_master_upper_dev_get_rcu(dev); return dev ? dev->ifindex : 0; } EXPORT_SYMBOL_GPL(l3mdev_master_upper_ifindex_by_index_rcu); /** * l3mdev_fib_table_rcu - get FIB table id associated with an L3 * master interface * @dev: targeted interface */ u32 l3mdev_fib_table_rcu(const struct net_device *dev) { u32 tb_id = 0; if (!dev) return 0; if (netif_is_l3_master(dev)) { if (dev->l3mdev_ops->l3mdev_fib_table) tb_id = dev->l3mdev_ops->l3mdev_fib_table(dev); } else if (netif_is_l3_slave(dev)) { /* Users of netdev_master_upper_dev_get_rcu need non-const, * but current inet_*type functions take a const */ struct net_device *_dev = (struct net_device *) dev; const struct net_device *master; master = netdev_master_upper_dev_get_rcu(_dev); if (master && master->l3mdev_ops->l3mdev_fib_table) tb_id = master->l3mdev_ops->l3mdev_fib_table(master); } return tb_id; } EXPORT_SYMBOL_GPL(l3mdev_fib_table_rcu); u32 l3mdev_fib_table_by_index(struct net *net, int ifindex) { struct net_device *dev; u32 tb_id = 0; if (!ifindex) return 0; rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); if (dev) tb_id = l3mdev_fib_table_rcu(dev); rcu_read_unlock(); return tb_id; } EXPORT_SYMBOL_GPL(l3mdev_fib_table_by_index); /** * l3mdev_link_scope_lookup - IPv6 route lookup based on flow for link * local and multicast addresses * @net: network namespace for device index lookup * @fl6: IPv6 flow struct for lookup * This function does not hold refcnt on the returned dst. * Caller must hold rcu_read_lock(). */ struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6) { struct dst_entry *dst = NULL; struct net_device *dev; WARN_ON_ONCE(!rcu_read_lock_held()); if (fl6->flowi6_oif) { dev = dev_get_by_index_rcu(net, fl6->flowi6_oif); if (dev && netif_is_l3_slave(dev)) dev = netdev_master_upper_dev_get_rcu(dev); if (dev && netif_is_l3_master(dev) && dev->l3mdev_ops->l3mdev_link_scope_lookup) dst = dev->l3mdev_ops->l3mdev_link_scope_lookup(dev, fl6); } return dst; } EXPORT_SYMBOL_GPL(l3mdev_link_scope_lookup); /** * l3mdev_fib_rule_match - Determine if flowi references an * L3 master device * @net: network namespace for device index lookup * @fl: flow struct * @arg: store the table the rule matched with here */ int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, struct fib_lookup_arg *arg) { struct net_device *dev; int rc = 0; /* update flow ensures flowi_l3mdev is set when relevant */ if (!fl->flowi_l3mdev) return 0; rcu_read_lock(); dev = dev_get_by_index_rcu(net, fl->flowi_l3mdev); if (dev && netif_is_l3_master(dev) && dev->l3mdev_ops->l3mdev_fib_table) { arg->table = dev->l3mdev_ops->l3mdev_fib_table(dev); rc = 1; } rcu_read_unlock(); return rc; } void l3mdev_update_flow(struct net *net, struct flowi *fl) { struct net_device *dev; rcu_read_lock(); if (fl->flowi_oif) { dev = dev_get_by_index_rcu(net, fl->flowi_oif); if (dev) { if (!fl->flowi_l3mdev) fl->flowi_l3mdev = l3mdev_master_ifindex_rcu(dev); /* oif set to L3mdev directs lookup to its table; * reset to avoid oif match in fib_lookup */ if (netif_is_l3_master(dev)) fl->flowi_oif = 0; goto out; } } if (fl->flowi_iif > LOOPBACK_IFINDEX && !fl->flowi_l3mdev) { dev = dev_get_by_index_rcu(net, fl->flowi_iif); if (dev) fl->flowi_l3mdev = l3mdev_master_ifindex_rcu(dev); } out: rcu_read_unlock(); } EXPORT_SYMBOL_GPL(l3mdev_update_flow); |
| 12 12 12 12 12 12 11 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 | // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2014 Protonic Holland, * David Jander * Copyright (C) 2014-2021, 2023 Pengutronix, * Marc Kleine-Budde <kernel@pengutronix.de> */ #include <linux/can/dev.h> #include <linux/can/rx-offload.h> struct can_rx_offload_cb { u32 timestamp; }; static inline struct can_rx_offload_cb * can_rx_offload_get_cb(struct sk_buff *skb) { BUILD_BUG_ON(sizeof(struct can_rx_offload_cb) > sizeof(skb->cb)); return (struct can_rx_offload_cb *)skb->cb; } static inline bool can_rx_offload_le(struct can_rx_offload *offload, unsigned int a, unsigned int b) { if (offload->inc) return a <= b; else return a >= b; } static inline unsigned int can_rx_offload_inc(struct can_rx_offload *offload, unsigned int *val) { if (offload->inc) return (*val)++; else return (*val)--; } static int can_rx_offload_napi_poll(struct napi_struct *napi, int quota) { struct can_rx_offload *offload = container_of(napi, struct can_rx_offload, napi); struct net_device *dev = offload->dev; struct net_device_stats *stats = &dev->stats; struct sk_buff *skb; int work_done = 0; while ((work_done < quota) && (skb = skb_dequeue(&offload->skb_queue))) { struct can_frame *cf = (struct can_frame *)skb->data; work_done++; if (!(cf->can_id & CAN_ERR_FLAG)) { stats->rx_packets++; if (!(cf->can_id & CAN_RTR_FLAG)) stats->rx_bytes += cf->len; } netif_receive_skb(skb); } if (work_done < quota) { napi_complete_done(napi, work_done); /* Check if there was another interrupt */ if (!skb_queue_empty(&offload->skb_queue)) napi_reschedule(&offload->napi); } return work_done; } static inline void __skb_queue_add_sort(struct sk_buff_head *head, struct sk_buff *new, int (*compare)(struct sk_buff *a, struct sk_buff *b)) { struct sk_buff *pos, *insert = NULL; skb_queue_reverse_walk(head, pos) { const struct can_rx_offload_cb *cb_pos, *cb_new; cb_pos = can_rx_offload_get_cb(pos); cb_new = can_rx_offload_get_cb(new); netdev_dbg(new->dev, "%s: pos=0x%08x, new=0x%08x, diff=%10d, queue_len=%d\n", __func__, cb_pos->timestamp, cb_new->timestamp, cb_new->timestamp - cb_pos->timestamp, skb_queue_len(head)); if (compare(pos, new) < 0) continue; insert = pos; break; } if (!insert) __skb_queue_head(head, new); else __skb_queue_after(head, insert, new); } static int can_rx_offload_compare(struct sk_buff *a, struct sk_buff *b) { const struct can_rx_offload_cb *cb_a, *cb_b; cb_a = can_rx_offload_get_cb(a); cb_b = can_rx_offload_get_cb(b); /* Subtract two u32 and return result as int, to keep * difference steady around the u32 overflow. */ return cb_b->timestamp - cb_a->timestamp; } /** * can_rx_offload_offload_one() - Read one CAN frame from HW * @offload: pointer to rx_offload context * @n: number of mailbox to read * * The task of this function is to read a CAN frame from mailbox @n * from the device and return the mailbox's content as a struct * sk_buff. * * If the struct can_rx_offload::skb_queue exceeds the maximal queue * length (struct can_rx_offload::skb_queue_len_max) or no skb can be * allocated, the mailbox contents is discarded by reading it into an * overflow buffer. This way the mailbox is marked as free by the * driver. * * Return: A pointer to skb containing the CAN frame on success. * * NULL if the mailbox @n is empty. * * ERR_PTR() in case of an error */ static struct sk_buff * can_rx_offload_offload_one(struct can_rx_offload *offload, unsigned int n) { struct sk_buff *skb; struct can_rx_offload_cb *cb; bool drop = false; u32 timestamp; /* If queue is full drop frame */ if (unlikely(skb_queue_len(&offload->skb_queue) > offload->skb_queue_len_max)) drop = true; skb = offload->mailbox_read(offload, n, ×tamp, drop); /* Mailbox was empty. */ if (unlikely(!skb)) return NULL; /* There was a problem reading the mailbox, propagate * error value. */ if (IS_ERR(skb)) { offload->dev->stats.rx_dropped++; offload->dev->stats.rx_fifo_errors++; return skb; } /* Mailbox was read. */ cb = can_rx_offload_get_cb(skb); cb->timestamp = timestamp; return skb; } int can_rx_offload_irq_offload_timestamp(struct can_rx_offload *offload, u64 pending) { unsigned int i; int received = 0; for (i = offload->mb_first; can_rx_offload_le(offload, i, offload->mb_last); can_rx_offload_inc(offload, &i)) { struct sk_buff *skb; if (!(pending & BIT_ULL(i))) continue; skb = can_rx_offload_offload_one(offload, i); if (IS_ERR_OR_NULL(skb)) continue; __skb_queue_add_sort(&offload->skb_irq_queue, skb, can_rx_offload_compare); received++; } return received; } EXPORT_SYMBOL_GPL(can_rx_offload_irq_offload_timestamp); int can_rx_offload_irq_offload_fifo(struct can_rx_offload *offload) { struct sk_buff *skb; int received = 0; while (1) { skb = can_rx_offload_offload_one(offload, 0); if (IS_ERR(skb)) continue; if (!skb) break; __skb_queue_tail(&offload->skb_irq_queue, skb); received++; } return received; } EXPORT_SYMBOL_GPL(can_rx_offload_irq_offload_fifo); int can_rx_offload_queue_timestamp(struct can_rx_offload *offload, struct sk_buff *skb, u32 timestamp) { struct can_rx_offload_cb *cb; if (skb_queue_len(&offload->skb_queue) > offload->skb_queue_len_max) { dev_kfree_skb_any(skb); return -ENOBUFS; } cb = can_rx_offload_get_cb(skb); cb->timestamp = timestamp; __skb_queue_add_sort(&offload->skb_irq_queue, skb, can_rx_offload_compare); return 0; } EXPORT_SYMBOL_GPL(can_rx_offload_queue_timestamp); unsigned int can_rx_offload_get_echo_skb_queue_timestamp(struct can_rx_offload *offload, unsigned int idx, u32 timestamp, unsigned int *frame_len_ptr) { struct net_device *dev = offload->dev; struct net_device_stats *stats = &dev->stats; struct sk_buff *skb; unsigned int len; int err; skb = __can_get_echo_skb(dev, idx, &len, frame_len_ptr); if (!skb) return 0; err = can_rx_offload_queue_timestamp(offload, skb, timestamp); if (err) { stats->rx_errors++; stats->tx_fifo_errors++; } return len; } EXPORT_SYMBOL_GPL(can_rx_offload_get_echo_skb_queue_timestamp); int can_rx_offload_queue_tail(struct can_rx_offload *offload, struct sk_buff *skb) { if (skb_queue_len(&offload->skb_queue) > offload->skb_queue_len_max) { dev_kfree_skb_any(skb); return -ENOBUFS; } __skb_queue_tail(&offload->skb_irq_queue, skb); return 0; } EXPORT_SYMBOL_GPL(can_rx_offload_queue_tail); unsigned int can_rx_offload_get_echo_skb_queue_tail(struct can_rx_offload *offload, unsigned int idx, unsigned int *frame_len_ptr) { struct net_device *dev = offload->dev; struct net_device_stats *stats = &dev->stats; struct sk_buff *skb; unsigned int len; int err; skb = __can_get_echo_skb(dev, idx, &len, frame_len_ptr); if (!skb) return 0; err = can_rx_offload_queue_tail(offload, skb); if (err) { stats->rx_errors++; stats->tx_fifo_errors++; } return len; } EXPORT_SYMBOL_GPL(can_rx_offload_get_echo_skb_queue_tail); void can_rx_offload_irq_finish(struct can_rx_offload *offload) { unsigned long flags; int queue_len; if (skb_queue_empty_lockless(&offload->skb_irq_queue)) return; spin_lock_irqsave(&offload->skb_queue.lock, flags); skb_queue_splice_tail_init(&offload->skb_irq_queue, &offload->skb_queue); spin_unlock_irqrestore(&offload->skb_queue.lock, flags); queue_len = skb_queue_len(&offload->skb_queue); if (queue_len > offload->skb_queue_len_max / 8) netdev_dbg(offload->dev, "%s: queue_len=%d\n", __func__, queue_len); napi_schedule(&offload->napi); } EXPORT_SYMBOL_GPL(can_rx_offload_irq_finish); void can_rx_offload_threaded_irq_finish(struct can_rx_offload *offload) { unsigned long flags; int queue_len; if (skb_queue_empty_lockless(&offload->skb_irq_queue)) return; spin_lock_irqsave(&offload->skb_queue.lock, flags); skb_queue_splice_tail_init(&offload->skb_irq_queue, &offload->skb_queue); spin_unlock_irqrestore(&offload->skb_queue.lock, flags); queue_len = skb_queue_len(&offload->skb_queue); if (queue_len > offload->skb_queue_len_max / 8) netdev_dbg(offload->dev, "%s: queue_len=%d\n", __func__, queue_len); local_bh_disable(); napi_schedule(&offload->napi); local_bh_enable(); } EXPORT_SYMBOL_GPL(can_rx_offload_threaded_irq_finish); static int can_rx_offload_init_queue(struct net_device *dev, struct can_rx_offload *offload, unsigned int weight) { offload->dev = dev; /* Limit queue len to 4x the weight (rounded to next power of two) */ offload->skb_queue_len_max = 2 << fls(weight); offload->skb_queue_len_max *= 4; skb_queue_head_init(&offload->skb_queue); __skb_queue_head_init(&offload->skb_irq_queue); netif_napi_add_weight(dev, &offload->napi, can_rx_offload_napi_poll, weight); dev_dbg(dev->dev.parent, "%s: skb_queue_len_max=%d\n", __func__, offload->skb_queue_len_max); return 0; } int can_rx_offload_add_timestamp(struct net_device *dev, struct can_rx_offload *offload) { unsigned int weight; if (offload->mb_first > BITS_PER_LONG_LONG || offload->mb_last > BITS_PER_LONG_LONG || !offload->mailbox_read) return -EINVAL; if (offload->mb_first < offload->mb_last) { offload->inc = true; weight = offload->mb_last - offload->mb_first; } else { offload->inc = false; weight = offload->mb_first - offload->mb_last; } return can_rx_offload_init_queue(dev, offload, weight); } EXPORT_SYMBOL_GPL(can_rx_offload_add_timestamp); int can_rx_offload_add_fifo(struct net_device *dev, struct can_rx_offload *offload, unsigned int weight) { if (!offload->mailbox_read) return -EINVAL; return can_rx_offload_init_queue(dev, offload, weight); } EXPORT_SYMBOL_GPL(can_rx_offload_add_fifo); int can_rx_offload_add_manual(struct net_device *dev, struct can_rx_offload *offload, unsigned int weight) { if (offload->mailbox_read) return -EINVAL; return can_rx_offload_init_queue(dev, offload, weight); } EXPORT_SYMBOL_GPL(can_rx_offload_add_manual); void can_rx_offload_enable(struct can_rx_offload *offload) { napi_enable(&offload->napi); } EXPORT_SYMBOL_GPL(can_rx_offload_enable); void can_rx_offload_del(struct can_rx_offload *offload) { netif_napi_del(&offload->napi); skb_queue_purge(&offload->skb_queue); __skb_queue_purge(&offload->skb_irq_queue); } EXPORT_SYMBOL_GPL(can_rx_offload_del); |
| 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 | // SPDX-License-Identifier: GPL-2.0+ /* * usbdux.c * Copyright (C) 2003-2014 Bernd Porr, mail@berndporr.me.uk */ /* * Driver: usbdux * Description: University of Stirling USB DAQ & INCITE Technology Limited * Devices: [ITL] USB-DUX (usbdux) * Author: Bernd Porr <mail@berndporr.me.uk> * Updated: 10 Oct 2014 * Status: Stable * * Connection scheme for the counter at the digital port: * 0=/CLK0, 1=UP/DOWN0, 2=RESET0, 4=/CLK1, 5=UP/DOWN1, 6=RESET1. * The sampling rate of the counter is approximately 500Hz. * * Note that under USB2.0 the length of the channel list determines * the max sampling rate. If you sample only one channel you get 8kHz * sampling rate. If you sample two channels you get 4kHz and so on. */ /* * I must give credit here to Chris Baugher who * wrote the driver for AT-MIO-16d. I used some parts of this * driver. I also must give credits to David Brownell * who supported me with the USB development. * * Bernd Porr * * * Revision history: * 0.94: D/A output should work now with any channel list combinations * 0.95: .owner commented out for kernel vers below 2.4.19 * sanity checks in ai/ao_cmd * 0.96: trying to get it working with 2.6, moved all memory alloc to comedi's * attach final USB IDs * moved memory allocation completely to the corresponding comedi * functions firmware upload is by fxload and no longer by comedi (due to * enumeration) * 0.97: USB IDs received, adjusted table * 0.98: SMP, locking, memory alloc: moved all usb memory alloc * to the usb subsystem and moved all comedi related memory * alloc to comedi. * | kernel | registration | usbdux-usb | usbdux-comedi | comedi | * 0.99: USB 2.0: changed protocol to isochronous transfer * IRQ transfer is too buggy and too risky in 2.0 * for the high speed ISO transfer is now a working version * available * 0.99b: Increased the iso transfer buffer for high sp.to 10 buffers. Some VIA * chipsets miss out IRQs. Deeper buffering is needed. * 1.00: full USB 2.0 support for the A/D converter. Now: max 8kHz sampling * rate. * Firmware vers 1.00 is needed for this. * Two 16 bit up/down/reset counter with a sampling rate of 1kHz * And loads of cleaning up, in particular streamlining the * bulk transfers. * 1.1: moved EP4 transfers to EP1 to make space for a PWM output on EP4 * 1.2: added PWM support via EP4 * 2.0: PWM seems to be stable and is not interfering with the other functions * 2.1: changed PWM API * 2.2: added firmware kernel request to fix an udev problem * 2.3: corrected a bug in bulk timeouts which were far too short * 2.4: fixed a bug which causes the driver to hang when it ran out of data. * Thanks to Jan-Matthias Braun and Ian to spot the bug and fix it. * */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/input.h> #include <linux/fcntl.h> #include <linux/compiler.h> #include <linux/comedi/comedi_usb.h> /* constants for firmware upload and download */ #define USBDUX_FIRMWARE "usbdux_firmware.bin" #define USBDUX_FIRMWARE_MAX_LEN 0x2000 #define USBDUX_FIRMWARE_CMD 0xa0 #define VENDOR_DIR_IN 0xc0 #define VENDOR_DIR_OUT 0x40 #define USBDUX_CPU_CS 0xe600 /* usbdux bulk transfer commands */ #define USBDUX_CMD_MULT_AI 0 #define USBDUX_CMD_AO 1 #define USBDUX_CMD_DIO_CFG 2 #define USBDUX_CMD_DIO_BITS 3 #define USBDUX_CMD_SINGLE_AI 4 #define USBDUX_CMD_TIMER_RD 5 #define USBDUX_CMD_TIMER_WR 6 #define USBDUX_CMD_PWM_ON 7 #define USBDUX_CMD_PWM_OFF 8 /* timeout for the USB-transfer in ms */ #define BULK_TIMEOUT 1000 /* 300Hz max frequ under PWM */ #define MIN_PWM_PERIOD ((long)(1E9 / 300)) /* Default PWM frequency */ #define PWM_DEFAULT_PERIOD ((long)(1E9 / 100)) /* Size of one A/D value */ #define SIZEADIN ((sizeof(u16))) /* * Size of the input-buffer IN BYTES * Always multiple of 8 for 8 microframes which is needed in the highspeed mode */ #define SIZEINBUF (8 * SIZEADIN) /* 16 bytes. */ #define SIZEINSNBUF 16 /* size of one value for the D/A converter: channel and value */ #define SIZEDAOUT ((sizeof(u8) + sizeof(u16))) /* * Size of the output-buffer in bytes * Actually only the first 4 triplets are used but for the * high speed mode we need to pad it to 8 (microframes). */ #define SIZEOUTBUF (8 * SIZEDAOUT) /* * Size of the buffer for the dux commands: just now max size is determined * by the analogue out + command byte + panic bytes... */ #define SIZEOFDUXBUFFER (8 * SIZEDAOUT + 2) /* Number of in-URBs which receive the data: min=2 */ #define NUMOFINBUFFERSFULL 5 /* Number of out-URBs which send the data: min=2 */ #define NUMOFOUTBUFFERSFULL 5 /* Number of in-URBs which receive the data: min=5 */ /* must have more buffers due to buggy USB ctr */ #define NUMOFINBUFFERSHIGH 10 /* Number of out-URBs which send the data: min=5 */ /* must have more buffers due to buggy USB ctr */ #define NUMOFOUTBUFFERSHIGH 10 /* number of retries to get the right dux command */ #define RETRIES 10 static const struct comedi_lrange range_usbdux_ai_range = { 4, { BIP_RANGE(4.096), BIP_RANGE(4.096 / 2), UNI_RANGE(4.096), UNI_RANGE(4.096 / 2) } }; static const struct comedi_lrange range_usbdux_ao_range = { 2, { BIP_RANGE(4.096), UNI_RANGE(4.096) } }; struct usbdux_private { /* actual number of in-buffers */ int n_ai_urbs; /* actual number of out-buffers */ int n_ao_urbs; /* ISO-transfer handling: buffers */ struct urb **ai_urbs; struct urb **ao_urbs; /* pwm-transfer handling */ struct urb *pwm_urb; /* PWM period */ unsigned int pwm_period; /* PWM internal delay for the GPIF in the FX2 */ u8 pwm_delay; /* size of the PWM buffer which holds the bit pattern */ int pwm_buf_sz; /* input buffer for the ISO-transfer */ __le16 *in_buf; /* input buffer for single insn */ __le16 *insn_buf; unsigned int high_speed:1; unsigned int ai_cmd_running:1; unsigned int ao_cmd_running:1; unsigned int pwm_cmd_running:1; /* time between samples in units of the timer */ unsigned int ai_timer; unsigned int ao_timer; /* counter between aquisitions */ unsigned int ai_counter; unsigned int ao_counter; /* interval in frames/uframes */ unsigned int ai_interval; /* commands */ u8 *dux_commands; struct mutex mut; }; static void usbdux_unlink_urbs(struct urb **urbs, int num_urbs) { int i; for (i = 0; i < num_urbs; i++) usb_kill_urb(urbs[i]); } static void usbdux_ai_stop(struct comedi_device *dev, int do_unlink) { struct usbdux_private *devpriv = dev->private; if (do_unlink && devpriv->ai_urbs) usbdux_unlink_urbs(devpriv->ai_urbs, devpriv->n_ai_urbs); devpriv->ai_cmd_running = 0; } static int usbdux_ai_cancel(struct comedi_device *dev, struct comedi_subdevice *s) { struct usbdux_private *devpriv = dev->private; /* prevent other CPUs from submitting new commands just now */ mutex_lock(&devpriv->mut); /* unlink only if the urb really has been submitted */ usbdux_ai_stop(dev, devpriv->ai_cmd_running); mutex_unlock(&devpriv->mut); return 0; } static void usbduxsub_ai_handle_urb(struct comedi_device *dev, struct comedi_subdevice *s, struct urb *urb) { struct usbdux_private *devpriv = dev->private; struct comedi_async *async = s->async; struct comedi_cmd *cmd = &async->cmd; int ret; int i; devpriv->ai_counter--; if (devpriv->ai_counter == 0) { devpriv->ai_counter = devpriv->ai_timer; /* get the data from the USB bus and hand it over to comedi */ for (i = 0; i < cmd->chanlist_len; i++) { unsigned int range = CR_RANGE(cmd->chanlist[i]); u16 val = le16_to_cpu(devpriv->in_buf[i]); /* bipolar data is two's-complement */ if (comedi_range_is_bipolar(s, range)) val = comedi_offset_munge(s, val); /* transfer data */ if (!comedi_buf_write_samples(s, &val, 1)) return; } if (cmd->stop_src == TRIG_COUNT && async->scans_done >= cmd->stop_arg) async->events |= COMEDI_CB_EOA; } /* if command is still running, resubmit urb */ if (!(async->events & COMEDI_CB_CANCEL_MASK)) { urb->dev = comedi_to_usb_dev(dev); ret = usb_submit_urb(urb, GFP_ATOMIC); if (ret < 0) { dev_err(dev->class_dev, "urb resubmit failed in int-context! err=%d\n", ret); if (ret == -EL2NSYNC) dev_err(dev->class_dev, "buggy USB host controller or bug in IRQ handler!\n"); async->events |= COMEDI_CB_ERROR; } } } static void usbduxsub_ai_isoc_irq(struct urb *urb) { struct comedi_device *dev = urb->context; struct comedi_subdevice *s = dev->read_subdev; struct comedi_async *async = s->async; struct usbdux_private *devpriv = dev->private; /* exit if not running a command, do not resubmit urb */ if (!devpriv->ai_cmd_running) return; switch (urb->status) { case 0: /* copy the result in the transfer buffer */ memcpy(devpriv->in_buf, urb->transfer_buffer, SIZEINBUF); usbduxsub_ai_handle_urb(dev, s, urb); break; case -EILSEQ: /* * error in the ISOchronous data * we don't copy the data into the transfer buffer * and recycle the last data byte */ dev_dbg(dev->class_dev, "CRC error in ISO IN stream\n"); usbduxsub_ai_handle_urb(dev, s, urb); break; case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: case -ECONNABORTED: /* after an unlink command, unplug, ... etc */ async->events |= COMEDI_CB_ERROR; break; default: /* a real error */ dev_err(dev->class_dev, "Non-zero urb status received in ai intr context: %d\n", urb->status); async->events |= COMEDI_CB_ERROR; break; } /* * comedi_handle_events() cannot be used in this driver. The (*cancel) * operation would unlink the urb. */ if (async->events & COMEDI_CB_CANCEL_MASK) usbdux_ai_stop(dev, 0); comedi_event(dev, s); } static void usbdux_ao_stop(struct comedi_device *dev, int do_unlink) { struct usbdux_private *devpriv = dev->private; if (do_unlink && devpriv->ao_urbs) usbdux_unlink_urbs(devpriv->ao_urbs, devpriv->n_ao_urbs); devpriv->ao_cmd_running = 0; } static int usbdux_ao_cancel(struct comedi_device *dev, struct comedi_subdevice *s) { struct usbdux_private *devpriv = dev->private; /* prevent other CPUs from submitting a command just now */ mutex_lock(&devpriv->mut); /* unlink only if it is really running */ usbdux_ao_stop(dev, devpriv->ao_cmd_running); mutex_unlock(&devpriv->mut); return 0; } static void usbduxsub_ao_handle_urb(struct comedi_device *dev, struct comedi_subdevice *s, struct urb *urb) { struct usbdux_private *devpriv = dev->private; struct comedi_async *async = s->async; struct comedi_cmd *cmd = &async->cmd; u8 *datap; int ret; int i; devpriv->ao_counter--; if (devpriv->ao_counter == 0) { devpriv->ao_counter = devpriv->ao_timer; if (cmd->stop_src == TRIG_COUNT && async->scans_done >= cmd->stop_arg) { async->events |= COMEDI_CB_EOA; return; } /* transmit data to the USB bus */ datap = urb->transfer_buffer; *datap++ = cmd->chanlist_len; for (i = 0; i < cmd->chanlist_len; i++) { unsigned int chan = CR_CHAN(cmd->chanlist[i]); unsigned short val; if (!comedi_buf_read_samples(s, &val, 1)) { dev_err(dev->class_dev, "buffer underflow\n"); async->events |= COMEDI_CB_OVERFLOW; return; } /* pointer to the DA */ *datap++ = val & 0xff; *datap++ = (val >> 8) & 0xff; *datap++ = chan << 6; s->readback[chan] = val; } } /* if command is still running, resubmit urb for BULK transfer */ if (!(async->events & COMEDI_CB_CANCEL_MASK)) { urb->transfer_buffer_length = SIZEOUTBUF; urb->dev = comedi_to_usb_dev(dev); urb->status = 0; if (devpriv->high_speed) urb->interval = 8; /* uframes */ else urb->interval = 1; /* frames */ urb->number_of_packets = 1; urb->iso_frame_desc[0].offset = 0; urb->iso_frame_desc[0].length = SIZEOUTBUF; urb->iso_frame_desc[0].status = 0; ret = usb_submit_urb(urb, GFP_ATOMIC); if (ret < 0) { dev_err(dev->class_dev, "ao urb resubm failed in int-cont. ret=%d", ret); if (ret == -EL2NSYNC) dev_err(dev->class_dev, "buggy USB host controller or bug in IRQ handling!\n"); async->events |= COMEDI_CB_ERROR; } } } static void usbduxsub_ao_isoc_irq(struct urb *urb) { struct comedi_device *dev = urb->context; struct comedi_subdevice *s = dev->write_subdev; struct comedi_async *async = s->async; struct usbdux_private *devpriv = dev->private; /* exit if not running a command, do not resubmit urb */ if (!devpriv->ao_cmd_running) return; switch (urb->status) { case 0: usbduxsub_ao_handle_urb(dev, s, urb); break; case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: case -ECONNABORTED: /* after an unlink command, unplug, ... etc */ async->events |= COMEDI_CB_ERROR; break; default: /* a real error */ dev_err(dev->class_dev, "Non-zero urb status received in ao intr context: %d\n", urb->status); async->events |= COMEDI_CB_ERROR; break; } /* * comedi_handle_events() cannot be used in this driver. The (*cancel) * operation would unlink the urb. */ if (async->events & COMEDI_CB_CANCEL_MASK) usbdux_ao_stop(dev, 0); comedi_event(dev, s); } static int usbdux_submit_urbs(struct comedi_device *dev, struct urb **urbs, int num_urbs, int input_urb) { struct usb_device *usb = comedi_to_usb_dev(dev); struct usbdux_private *devpriv = dev->private; struct urb *urb; int ret; int i; /* Submit all URBs and start the transfer on the bus */ for (i = 0; i < num_urbs; i++) { urb = urbs[i]; /* in case of a resubmission after an unlink... */ if (input_urb) urb->interval = devpriv->ai_interval; urb->context = dev; urb->dev = usb; urb->status = 0; urb->transfer_flags = URB_ISO_ASAP; ret = usb_submit_urb(urb, GFP_ATOMIC); if (ret) return ret; } return 0; } static int usbdux_ai_cmdtest(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_cmd *cmd) { struct usbdux_private *devpriv = dev->private; int err = 0; /* Step 1 : check if triggers are trivially valid */ err |= comedi_check_trigger_src(&cmd->start_src, TRIG_NOW | TRIG_INT); err |= comedi_check_trigger_src(&cmd->scan_begin_src, TRIG_TIMER); err |= comedi_check_trigger_src(&cmd->convert_src, TRIG_NOW); err |= comedi_check_trigger_src(&cmd->scan_end_src, TRIG_COUNT); err |= comedi_check_trigger_src(&cmd->stop_src, TRIG_COUNT | TRIG_NONE); if (err) return 1; /* Step 2a : make sure trigger sources are unique */ err |= comedi_check_trigger_is_unique(cmd->start_src); err |= comedi_check_trigger_is_unique(cmd->stop_src); /* Step 2b : and mutually compatible */ if (err) return 2; /* Step 3: check if arguments are trivially valid */ err |= comedi_check_trigger_arg_is(&cmd->start_arg, 0); if (cmd->scan_begin_src == TRIG_FOLLOW) /* internal trigger */ err |= comedi_check_trigger_arg_is(&cmd->scan_begin_arg, 0); if (cmd->scan_begin_src == TRIG_TIMER) { /* full speed does 1kHz scans every USB frame */ unsigned int arg = 1000000; unsigned int min_arg = arg; if (devpriv->high_speed) { /* * In high speed mode microframes are possible. * However, during one microframe we can roughly * sample one channel. Thus, the more channels * are in the channel list the more time we need. */ int i = 1; /* find a power of 2 for the number of channels */ while (i < cmd->chanlist_len) i = i * 2; arg /= 8; min_arg = arg * i; } err |= comedi_check_trigger_arg_min(&cmd->scan_begin_arg, min_arg); /* calc the real sampling rate with the rounding errors */ arg = (cmd->scan_begin_arg / arg) * arg; err |= comedi_check_trigger_arg_is(&cmd->scan_begin_arg, arg); } err |= comedi_check_trigger_arg_is(&cmd->scan_end_arg, cmd->chanlist_len); if (cmd->stop_src == TRIG_COUNT) err |= comedi_check_trigger_arg_min(&cmd->stop_arg, 1); else /* TRIG_NONE */ err |= comedi_check_trigger_arg_is(&cmd->stop_arg, 0); if (err) return 3; return 0; } /* * creates the ADC command for the MAX1271 * range is the range value from comedi */ static u8 create_adc_command(unsigned int chan, unsigned int range) { u8 p = (range <= 1); u8 r = ((range % 2) == 0); return (chan << 4) | ((p == 1) << 2) | ((r == 1) << 3); } static int send_dux_commands(struct comedi_device *dev, unsigned int cmd_type) { struct usb_device *usb = comedi_to_usb_dev(dev); struct usbdux_private *devpriv = dev->private; int nsent; devpriv->dux_commands[0] = cmd_type; return usb_bulk_msg(usb, usb_sndbulkpipe(usb, 1), devpriv->dux_commands, SIZEOFDUXBUFFER, &nsent, BULK_TIMEOUT); } static int receive_dux_commands(struct comedi_device *dev, unsigned int command) { struct usb_device *usb = comedi_to_usb_dev(dev); struct usbdux_private *devpriv = dev->private; int ret; int nrec; int i; for (i = 0; i < RETRIES; i++) { ret = usb_bulk_msg(usb, usb_rcvbulkpipe(usb, 8), devpriv->insn_buf, SIZEINSNBUF, &nrec, BULK_TIMEOUT); if (ret < 0) return ret; if (le16_to_cpu(devpriv->insn_buf[0]) == command) return ret; } /* command not received */ return -EFAULT; } static int usbdux_ai_inttrig(struct comedi_device *dev, struct comedi_subdevice *s, unsigned int trig_num) { struct usbdux_private *devpriv = dev->private; struct comedi_cmd *cmd = &s->async->cmd; int ret; if (trig_num != cmd->start_arg) return -EINVAL; mutex_lock(&devpriv->mut); if (!devpriv->ai_cmd_running) { devpriv->ai_cmd_running = 1; ret = usbdux_submit_urbs(dev, devpriv->ai_urbs, devpriv->n_ai_urbs, 1); if (ret < 0) { devpriv->ai_cmd_running = 0; goto ai_trig_exit; } s->async->inttrig = NULL; } else { ret = -EBUSY; } ai_trig_exit: mutex_unlock(&devpriv->mut); return ret; } static int usbdux_ai_cmd(struct comedi_device *dev, struct comedi_subdevice *s) { struct usbdux_private *devpriv = dev->private; struct comedi_cmd *cmd = &s->async->cmd; int len = cmd->chanlist_len; int ret = -EBUSY; int i; /* block other CPUs from starting an ai_cmd */ mutex_lock(&devpriv->mut); if (devpriv->ai_cmd_running) goto ai_cmd_exit; devpriv->dux_commands[1] = len; for (i = 0; i < len; ++i) { unsigned int chan = CR_CHAN(cmd->chanlist[i]); unsigned int range = CR_RANGE(cmd->chanlist[i]); devpriv->dux_commands[i + 2] = create_adc_command(chan, range); } ret = send_dux_commands(dev, USBDUX_CMD_MULT_AI); if (ret < 0) goto ai_cmd_exit; if (devpriv->high_speed) { /* * every channel gets a time window of 125us. Thus, if we * sample all 8 channels we need 1ms. If we sample only one * channel we need only 125us */ devpriv->ai_interval = 1; /* find a power of 2 for the interval */ while (devpriv->ai_interval < len) devpriv->ai_interval *= 2; devpriv->ai_timer = cmd->scan_begin_arg / (125000 * devpriv->ai_interval); } else { /* interval always 1ms */ devpriv->ai_interval = 1; devpriv->ai_timer = cmd->scan_begin_arg / 1000000; } if (devpriv->ai_timer < 1) { ret = -EINVAL; goto ai_cmd_exit; } devpriv->ai_counter = devpriv->ai_timer; if (cmd->start_src == TRIG_NOW) { /* enable this acquisition operation */ devpriv->ai_cmd_running = 1; ret = usbdux_submit_urbs(dev, devpriv->ai_urbs, devpriv->n_ai_urbs, 1); if (ret < 0) { devpriv->ai_cmd_running = 0; /* fixme: unlink here?? */ goto ai_cmd_exit; } s->async->inttrig = NULL; } else { /* TRIG_INT */ /* don't enable the acquision operation */ /* wait for an internal signal */ s->async->inttrig = usbdux_ai_inttrig; } ai_cmd_exit: mutex_unlock(&devpriv->mut); return ret; } /* Mode 0 is used to get a single conversion on demand */ static int usbdux_ai_insn_read(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_insn *insn, unsigned int *data) { struct usbdux_private *devpriv = dev->private; unsigned int chan = CR_CHAN(insn->chanspec); unsigned int range = CR_RANGE(insn->chanspec); unsigned int val; int ret = -EBUSY; int i; mutex_lock(&devpriv->mut); if (devpriv->ai_cmd_running) goto ai_read_exit; /* set command for the first channel */ devpriv->dux_commands[1] = create_adc_command(chan, range); /* adc commands */ ret = send_dux_commands(dev, USBDUX_CMD_SINGLE_AI); if (ret < 0) goto ai_read_exit; for (i = 0; i < insn->n; i++) { ret = receive_dux_commands(dev, USBDUX_CMD_SINGLE_AI); if (ret < 0) goto ai_read_exit; val = le16_to_cpu(devpriv->insn_buf[1]); /* bipolar data is two's-complement */ if (comedi_range_is_bipolar(s, range)) val = comedi_offset_munge(s, val); data[i] = val; } ai_read_exit: mutex_unlock(&devpriv->mut); return ret ? ret : insn->n; } static int usbdux_ao_insn_read(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_insn *insn, unsigned int *data) { struct usbdux_private *devpriv = dev->private; int ret; mutex_lock(&devpriv->mut); ret = comedi_readback_insn_read(dev, s, insn, data); mutex_unlock(&devpriv->mut); return ret; } static int usbdux_ao_insn_write(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_insn *insn, unsigned int *data) { struct usbdux_private *devpriv = dev->private; unsigned int chan = CR_CHAN(insn->chanspec); __le16 *p = (__le16 *)&devpriv->dux_commands[2]; int ret = -EBUSY; int i; mutex_lock(&devpriv->mut); if (devpriv->ao_cmd_running) goto ao_write_exit; /* number of channels: 1 */ devpriv->dux_commands[1] = 1; /* channel number */ devpriv->dux_commands[4] = chan << 6; for (i = 0; i < insn->n; i++) { unsigned int val = data[i]; /* one 16 bit value */ *p = cpu_to_le16(val); ret = send_dux_commands(dev, USBDUX_CMD_AO); if (ret < 0) goto ao_write_exit; s->readback[chan] = val; } ao_write_exit: mutex_unlock(&devpriv->mut); return ret ? ret : insn->n; } static int usbdux_ao_inttrig(struct comedi_device *dev, struct comedi_subdevice *s, unsigned int trig_num) { struct usbdux_private *devpriv = dev->private; struct comedi_cmd *cmd = &s->async->cmd; int ret; if (trig_num != cmd->start_arg) return -EINVAL; mutex_lock(&devpriv->mut); if (!devpriv->ao_cmd_running) { devpriv->ao_cmd_running = 1; ret = usbdux_submit_urbs(dev, devpriv->ao_urbs, devpriv->n_ao_urbs, 0); if (ret < 0) { devpriv->ao_cmd_running = 0; goto ao_trig_exit; } s->async->inttrig = NULL; } else { ret = -EBUSY; } ao_trig_exit: mutex_unlock(&devpriv->mut); return ret; } static int usbdux_ao_cmdtest(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_cmd *cmd) { int err = 0; unsigned int flags; /* Step 1 : check if triggers are trivially valid */ err |= comedi_check_trigger_src(&cmd->start_src, TRIG_NOW | TRIG_INT); if (0) { /* (devpriv->high_speed) */ /* the sampling rate is set by the coversion rate */ flags = TRIG_FOLLOW; } else { /* start a new scan (output at once) with a timer */ flags = TRIG_TIMER; } err |= comedi_check_trigger_src(&cmd->scan_begin_src, flags); if (0) { /* (devpriv->high_speed) */ /* * in usb-2.0 only one conversion it transmitted * but with 8kHz/n */ flags = TRIG_TIMER; } else { /* * all conversion events happen simultaneously with * a rate of 1kHz/n */ flags = TRIG_NOW; } err |= comedi_check_trigger_src(&cmd->convert_src, flags); err |= comedi_check_trigger_src(&cmd->scan_end_src, TRIG_COUNT); err |= comedi_check_trigger_src(&cmd->stop_src, TRIG_COUNT | TRIG_NONE); if (err) return 1; /* Step 2a : make sure trigger sources are unique */ err |= comedi_check_trigger_is_unique(cmd->start_src); err |= comedi_check_trigger_is_unique(cmd->stop_src); /* Step 2b : and mutually compatible */ if (err) return 2; /* Step 3: check if arguments are trivially valid */ err |= comedi_check_trigger_arg_is(&cmd->start_arg, 0); if (cmd->scan_begin_src == TRIG_FOLLOW) /* internal trigger */ err |= comedi_check_trigger_arg_is(&cmd->scan_begin_arg, 0); if (cmd->scan_begin_src == TRIG_TIMER) { err |= comedi_check_trigger_arg_min(&cmd->scan_begin_arg, 1000000); } /* not used now, is for later use */ if (cmd->convert_src == TRIG_TIMER) err |= comedi_check_trigger_arg_min(&cmd->convert_arg, 125000); err |= comedi_check_trigger_arg_is(&cmd->scan_end_arg, cmd->chanlist_len); if (cmd->stop_src == TRIG_COUNT) err |= comedi_check_trigger_arg_min(&cmd->stop_arg, 1); else /* TRIG_NONE */ err |= comedi_check_trigger_arg_is(&cmd->stop_arg, 0); if (err) return 3; return 0; } static int usbdux_ao_cmd(struct comedi_device *dev, struct comedi_subdevice *s) { struct usbdux_private *devpriv = dev->private; struct comedi_cmd *cmd = &s->async->cmd; int ret = -EBUSY; mutex_lock(&devpriv->mut); if (devpriv->ao_cmd_running) goto ao_cmd_exit; /* we count in steps of 1ms (125us) */ /* 125us mode not used yet */ if (0) { /* (devpriv->high_speed) */ /* 125us */ /* timing of the conversion itself: every 125 us */ devpriv->ao_timer = cmd->convert_arg / 125000; } else { /* 1ms */ /* timing of the scan: we get all channels at once */ devpriv->ao_timer = cmd->scan_begin_arg / 1000000; if (devpriv->ao_timer < 1) { ret = -EINVAL; goto ao_cmd_exit; } } devpriv->ao_counter = devpriv->ao_timer; if (cmd->start_src == TRIG_NOW) { /* enable this acquisition operation */ devpriv->ao_cmd_running = 1; ret = usbdux_submit_urbs(dev, devpriv->ao_urbs, devpriv->n_ao_urbs, 0); if (ret < 0) { devpriv->ao_cmd_running = 0; /* fixme: unlink here?? */ goto ao_cmd_exit; } s->async->inttrig = NULL; } else { /* TRIG_INT */ /* submit the urbs later */ /* wait for an internal signal */ s->async->inttrig = usbdux_ao_inttrig; } ao_cmd_exit: mutex_unlock(&devpriv->mut); return ret; } static int usbdux_dio_insn_config(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_insn *insn, unsigned int *data) { int ret; ret = comedi_dio_insn_config(dev, s, insn, data, 0); if (ret) return ret; /* * We don't tell the firmware here as it would take 8 frames * to submit the information. We do it in the insn_bits. */ return insn->n; } static int usbdux_dio_insn_bits(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_insn *insn, unsigned int *data) { struct usbdux_private *devpriv = dev->private; int ret; mutex_lock(&devpriv->mut); comedi_dio_update_state(s, data); /* Always update the hardware. See the (*insn_config). */ devpriv->dux_commands[1] = s->io_bits; devpriv->dux_commands[2] = s->state; /* * This command also tells the firmware to return * the digital input lines. */ ret = send_dux_commands(dev, USBDUX_CMD_DIO_BITS); if (ret < 0) goto dio_exit; ret = receive_dux_commands(dev, USBDUX_CMD_DIO_BITS); if (ret < 0) goto dio_exit; data[1] = le16_to_cpu(devpriv->insn_buf[1]); dio_exit: mutex_unlock(&devpriv->mut); return ret ? ret : insn->n; } static int usbdux_counter_read(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_insn *insn, unsigned int *data) { struct usbdux_private *devpriv = dev->private; unsigned int chan = CR_CHAN(insn->chanspec); int ret = 0; int i; mutex_lock(&devpriv->mut); for (i = 0; i < insn->n; i++) { ret = send_dux_commands(dev, USBDUX_CMD_TIMER_RD); if (ret < 0) goto counter_read_exit; ret = receive_dux_commands(dev, USBDUX_CMD_TIMER_RD); if (ret < 0) goto counter_read_exit; data[i] = le16_to_cpu(devpriv->insn_buf[chan + 1]); } counter_read_exit: mutex_unlock(&devpriv->mut); return ret ? ret : insn->n; } static int usbdux_counter_write(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_insn *insn, unsigned int *data) { struct usbdux_private *devpriv = dev->private; unsigned int chan = CR_CHAN(insn->chanspec); __le16 *p = (__le16 *)&devpriv->dux_commands[2]; int ret = 0; int i; mutex_lock(&devpriv->mut); devpriv->dux_commands[1] = chan; for (i = 0; i < insn->n; i++) { *p = cpu_to_le16(data[i]); ret = send_dux_commands(dev, USBDUX_CMD_TIMER_WR); if (ret < 0) break; } mutex_unlock(&devpriv->mut); return ret ? ret : insn->n; } static int usbdux_counter_config(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_insn *insn, unsigned int *data) { /* nothing to do so far */ return 2; } static void usbduxsub_unlink_pwm_urbs(struct comedi_device *dev) { struct usbdux_private *devpriv = dev->private; usb_kill_urb(devpriv->pwm_urb); } static void usbdux_pwm_stop(struct comedi_device *dev, int do_unlink) { struct usbdux_private *devpriv = dev->private; if (do_unlink) usbduxsub_unlink_pwm_urbs(dev); devpriv->pwm_cmd_running = 0; } static int usbdux_pwm_cancel(struct comedi_device *dev, struct comedi_subdevice *s) { struct usbdux_private *devpriv = dev->private; int ret; mutex_lock(&devpriv->mut); /* unlink only if it is really running */ usbdux_pwm_stop(dev, devpriv->pwm_cmd_running); ret = send_dux_commands(dev, USBDUX_CMD_PWM_OFF); mutex_unlock(&devpriv->mut); return ret; } static void usbduxsub_pwm_irq(struct urb *urb) { struct comedi_device *dev = urb->context; struct usbdux_private *devpriv = dev->private; int ret; switch (urb->status) { case 0: /* success */ break; case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: case -ECONNABORTED: /* * after an unlink command, unplug, ... etc * no unlink needed here. Already shutting down. */ if (devpriv->pwm_cmd_running) usbdux_pwm_stop(dev, 0); return; default: /* a real error */ if (devpriv->pwm_cmd_running) { dev_err(dev->class_dev, "Non-zero urb status received in pwm intr context: %d\n", urb->status); usbdux_pwm_stop(dev, 0); } return; } /* are we actually running? */ if (!devpriv->pwm_cmd_running) return; urb->transfer_buffer_length = devpriv->pwm_buf_sz; urb->dev = comedi_to_usb_dev(dev); urb->status = 0; if (devpriv->pwm_cmd_running) { ret = usb_submit_urb(urb, GFP_ATOMIC); if (ret < 0) { dev_err(dev->class_dev, "pwm urb resubm failed in int-cont. ret=%d", ret); if (ret == -EL2NSYNC) dev_err(dev->class_dev, "buggy USB host controller or bug in IRQ handling!\n"); /* don't do an unlink here */ usbdux_pwm_stop(dev, 0); } } } static int usbduxsub_submit_pwm_urbs(struct comedi_device *dev) { struct usb_device *usb = comedi_to_usb_dev(dev); struct usbdux_private *devpriv = dev->private; struct urb *urb = devpriv->pwm_urb; /* in case of a resubmission after an unlink... */ usb_fill_bulk_urb(urb, usb, usb_sndbulkpipe(usb, 4), urb->transfer_buffer, devpriv->pwm_buf_sz, usbduxsub_pwm_irq, dev); return usb_submit_urb(urb, GFP_ATOMIC); } static int usbdux_pwm_period(struct comedi_device *dev, struct comedi_subdevice *s, unsigned int period) { struct usbdux_private *devpriv = dev->private; int fx2delay; if (period < MIN_PWM_PERIOD) return -EAGAIN; fx2delay = (period / (6 * 512 * 1000 / 33)) - 6; if (fx2delay > 255) return -EAGAIN; devpriv->pwm_delay = fx2delay; devpriv->pwm_period = period; return 0; } static int usbdux_pwm_start(struct comedi_device *dev, struct comedi_subdevice *s) { struct usbdux_private *devpriv = dev->private; int ret = 0; mutex_lock(&devpriv->mut); if (devpriv->pwm_cmd_running) goto pwm_start_exit; devpriv->dux_commands[1] = devpriv->pwm_delay; ret = send_dux_commands(dev, USBDUX_CMD_PWM_ON); if (ret < 0) goto pwm_start_exit; /* initialise the buffer */ memset(devpriv->pwm_urb->transfer_buffer, 0, devpriv->pwm_buf_sz); devpriv->pwm_cmd_running = 1; ret = usbduxsub_submit_pwm_urbs(dev); if (ret < 0) devpriv->pwm_cmd_running = 0; pwm_start_exit: mutex_unlock(&devpriv->mut); return ret; } static void usbdux_pwm_pattern(struct comedi_device *dev, struct comedi_subdevice *s, unsigned int chan, unsigned int value, unsigned int sign) { struct usbdux_private *devpriv = dev->private; char pwm_mask = (1 << chan); /* DIO bit for the PWM data */ char sgn_mask = (16 << chan); /* DIO bit for the sign */ char *buf = (char *)(devpriv->pwm_urb->transfer_buffer); int szbuf = devpriv->pwm_buf_sz; int i; for (i = 0; i < szbuf; i++) { char c = *buf; c &= ~pwm_mask; if (i < value) c |= pwm_mask; if (!sign) c &= ~sgn_mask; else c |= sgn_mask; *buf++ = c; } } static int usbdux_pwm_write(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_insn *insn, unsigned int *data) { unsigned int chan = CR_CHAN(insn->chanspec); /* * It doesn't make sense to support more than one value here * because it would just overwrite the PWM buffer. */ if (insn->n != 1) return -EINVAL; /* * The sign is set via a special INSN only, this gives us 8 bits * for normal operation, sign is 0 by default. */ usbdux_pwm_pattern(dev, s, chan, data[0], 0); return insn->n; } static int usbdux_pwm_config(struct comedi_device *dev, struct comedi_subdevice *s, struct comedi_insn *insn, unsigned int *data) { struct usbdux_private *devpriv = dev->private; unsigned int chan = CR_CHAN(insn->chanspec); switch (data[0]) { case INSN_CONFIG_ARM: /* * if not zero the PWM is limited to a certain time which is * not supported here */ if (data[1] != 0) return -EINVAL; return usbdux_pwm_start(dev, s); case INSN_CONFIG_DISARM: return usbdux_pwm_cancel(dev, s); case INSN_CONFIG_GET_PWM_STATUS: data[1] = devpriv->pwm_cmd_running; return 0; case INSN_CONFIG_PWM_SET_PERIOD: return usbdux_pwm_period(dev, s, data[1]); case INSN_CONFIG_PWM_GET_PERIOD: data[1] = devpriv->pwm_period; return 0; case INSN_CONFIG_PWM_SET_H_BRIDGE: /* * data[1] = value * data[2] = sign (for a relay) */ usbdux_pwm_pattern(dev, s, chan, data[1], (data[2] != 0)); return 0; case INSN_CONFIG_PWM_GET_H_BRIDGE: /* values are not kept in this driver, nothing to return here */ return -EINVAL; } return -EINVAL; } static int usbdux_firmware_upload(struct comedi_device *dev, const u8 *data, size_t size, unsigned long context) { struct usb_device *usb = comedi_to_usb_dev(dev); u8 *buf; u8 *tmp; int ret; if (!data) return 0; if (size > USBDUX_FIRMWARE_MAX_LEN) { dev_err(dev->class_dev, "usbdux firmware binary it too large for FX2.\n"); return -ENOMEM; } /* we generate a local buffer for the firmware */ buf = kmemdup(data, size, GFP_KERNEL); if (!buf) return -ENOMEM; /* we need a malloc'ed buffer for usb_control_msg() */ tmp = kmalloc(1, GFP_KERNEL); if (!tmp) { kfree(buf); return -ENOMEM; } /* stop the current firmware on the device */ *tmp = 1; /* 7f92 to one */ ret = usb_control_msg(usb, usb_sndctrlpipe(usb, 0), USBDUX_FIRMWARE_CMD, VENDOR_DIR_OUT, USBDUX_CPU_CS, 0x0000, tmp, 1, BULK_TIMEOUT); if (ret < 0) { dev_err(dev->class_dev, "can not stop firmware\n"); goto done; } /* upload the new firmware to the device */ ret = usb_control_msg(usb, usb_sndctrlpipe(usb, 0), USBDUX_FIRMWARE_CMD, VENDOR_DIR_OUT, 0, 0x0000, buf, size, BULK_TIMEOUT); if (ret < 0) { dev_err(dev->class_dev, "firmware upload failed\n"); goto done; } /* start the new firmware on the device */ *tmp = 0; /* 7f92 to zero */ ret = usb_control_msg(usb, usb_sndctrlpipe(usb, 0), USBDUX_FIRMWARE_CMD, VENDOR_DIR_OUT, USBDUX_CPU_CS, 0x0000, tmp, 1, BULK_TIMEOUT); if (ret < 0) dev_err(dev->class_dev, "can not start firmware\n"); done: kfree(tmp); kfree(buf); return ret; } static int usbdux_alloc_usb_buffers(struct comedi_device *dev) { struct usb_device *usb = comedi_to_usb_dev(dev); struct usbdux_private *devpriv = dev->private; struct urb *urb; int i; devpriv->dux_commands = kzalloc(SIZEOFDUXBUFFER, GFP_KERNEL); devpriv->in_buf = kzalloc(SIZEINBUF, GFP_KERNEL); devpriv->insn_buf = kzalloc(SIZEINSNBUF, GFP_KERNEL); devpriv->ai_urbs = kcalloc(devpriv->n_ai_urbs, sizeof(void *), GFP_KERNEL); devpriv->ao_urbs = kcalloc(devpriv->n_ao_urbs, sizeof(void *), GFP_KERNEL); if (!devpriv->dux_commands || !devpriv->in_buf || !devpriv->insn_buf || !devpriv->ai_urbs || !devpriv->ao_urbs) return -ENOMEM; for (i = 0; i < devpriv->n_ai_urbs; i++) { /* one frame: 1ms */ urb = usb_alloc_urb(1, GFP_KERNEL); if (!urb) return -ENOMEM; devpriv->ai_urbs[i] = urb; urb->dev = usb; urb->context = dev; urb->pipe = usb_rcvisocpipe(usb, 6); urb->transfer_flags = URB_ISO_ASAP; urb->transfer_buffer = kzalloc(SIZEINBUF, GFP_KERNEL); if (!urb->transfer_buffer) return -ENOMEM; urb->complete = usbduxsub_ai_isoc_irq; urb->number_of_packets = 1; urb->transfer_buffer_length = SIZEINBUF; urb->iso_frame_desc[0].offset = 0; urb->iso_frame_desc[0].length = SIZEINBUF; } for (i = 0; i < devpriv->n_ao_urbs; i++) { /* one frame: 1ms */ urb = usb_alloc_urb(1, GFP_KERNEL); if (!urb) return -ENOMEM; devpriv->ao_urbs[i] = urb; urb->dev = usb; urb->context = dev; urb->pipe = usb_sndisocpipe(usb, 2); urb->transfer_flags = URB_ISO_ASAP; urb->transfer_buffer = kzalloc(SIZEOUTBUF, GFP_KERNEL); if (!urb->transfer_buffer) return -ENOMEM; urb->complete = usbduxsub_ao_isoc_irq; urb->number_of_packets = 1; urb->transfer_buffer_length = SIZEOUTBUF; urb->iso_frame_desc[0].offset = 0; urb->iso_frame_desc[0].length = SIZEOUTBUF; if (devpriv->high_speed) urb->interval = 8; /* uframes */ else urb->interval = 1; /* frames */ } /* pwm */ if (devpriv->pwm_buf_sz) { urb = usb_alloc_urb(0, GFP_KERNEL); if (!urb) return -ENOMEM; devpriv->pwm_urb = urb; /* max bulk ep size in high speed */ urb->transfer_buffer = kzalloc(devpriv->pwm_buf_sz, GFP_KERNEL); if (!urb->transfer_buffer) return -ENOMEM; } return 0; } static void usbdux_free_usb_buffers(struct comedi_device *dev) { struct usbdux_private *devpriv = dev->private; struct urb *urb; int i; urb = devpriv->pwm_urb; if (urb) { kfree(urb->transfer_buffer); usb_free_urb(urb); } if (devpriv->ao_urbs) { for (i = 0; i < devpriv->n_ao_urbs; i++) { urb = devpriv->ao_urbs[i]; if (urb) { kfree(urb->transfer_buffer); usb_free_urb(urb); } } kfree(devpriv->ao_urbs); } if (devpriv->ai_urbs) { for (i = 0; i < devpriv->n_ai_urbs; i++) { urb = devpriv->ai_urbs[i]; if (urb) { kfree(urb->transfer_buffer); usb_free_urb(urb); } } kfree(devpriv->ai_urbs); } kfree(devpriv->insn_buf); kfree(devpriv->in_buf); kfree(devpriv->dux_commands); } static int usbdux_auto_attach(struct comedi_device *dev, unsigned long context_unused) { struct usb_interface *intf = comedi_to_usb_interface(dev); struct usb_device *usb = comedi_to_usb_dev(dev); struct usbdux_private *devpriv; struct comedi_subdevice *s; int ret; devpriv = comedi_alloc_devpriv(dev, sizeof(*devpriv)); if (!devpriv) return -ENOMEM; mutex_init(&devpriv->mut); usb_set_intfdata(intf, devpriv); devpriv->high_speed = (usb->speed == USB_SPEED_HIGH); if (devpriv->high_speed) { devpriv->n_ai_urbs = NUMOFINBUFFERSHIGH; devpriv->n_ao_urbs = NUMOFOUTBUFFERSHIGH; devpriv->pwm_buf_sz = 512; } else { devpriv->n_ai_urbs = NUMOFINBUFFERSFULL; devpriv->n_ao_urbs = NUMOFOUTBUFFERSFULL; } ret = usbdux_alloc_usb_buffers(dev); if (ret) return ret; /* setting to alternate setting 3: enabling iso ep and bulk ep. */ ret = usb_set_interface(usb, intf->altsetting->desc.bInterfaceNumber, 3); if (ret < 0) { dev_err(dev->class_dev, "could not set alternate setting 3 in high speed\n"); return ret; } ret = comedi_load_firmware(dev, &usb->dev, USBDUX_FIRMWARE, usbdux_firmware_upload, 0); if (ret < 0) return ret; ret = comedi_alloc_subdevices(dev, (devpriv->high_speed) ? 5 : 4); if (ret) return ret; /* Analog Input subdevice */ s = &dev->subdevices[0]; dev->read_subdev = s; s->type = COMEDI_SUBD_AI; s->subdev_flags = SDF_READABLE | SDF_GROUND | SDF_CMD_READ; s->n_chan = 8; s->maxdata = 0x0fff; s->len_chanlist = 8; s->range_table = &range_usbdux_ai_range; s->insn_read = usbdux_ai_insn_read; s->do_cmdtest = usbdux_ai_cmdtest; s->do_cmd = usbdux_ai_cmd; s->cancel = usbdux_ai_cancel; /* Analog Output subdevice */ s = &dev->subdevices[1]; dev->write_subdev = s; s->type = COMEDI_SUBD_AO; s->subdev_flags = SDF_WRITABLE | SDF_GROUND | SDF_CMD_WRITE; s->n_chan = 4; s->maxdata = 0x0fff; s->len_chanlist = s->n_chan; s->range_table = &range_usbdux_ao_range; s->do_cmdtest = usbdux_ao_cmdtest; s->do_cmd = usbdux_ao_cmd; s->cancel = usbdux_ao_cancel; s->insn_read = usbdux_ao_insn_read; s->insn_write = usbdux_ao_insn_write; ret = comedi_alloc_subdev_readback(s); if (ret) return ret; /* Digital I/O subdevice */ s = &dev->subdevices[2]; s->type = COMEDI_SUBD_DIO; s->subdev_flags = SDF_READABLE | SDF_WRITABLE; s->n_chan = 8; s->maxdata = 1; s->range_table = &range_digital; s->insn_bits = usbdux_dio_insn_bits; s->insn_config = usbdux_dio_insn_config; /* Counter subdevice */ s = &dev->subdevices[3]; s->type = COMEDI_SUBD_COUNTER; s->subdev_flags = SDF_WRITABLE | SDF_READABLE; s->n_chan = 4; s->maxdata = 0xffff; s->insn_read = usbdux_counter_read; s->insn_write = usbdux_counter_write; s->insn_config = usbdux_counter_config; if (devpriv->high_speed) { /* PWM subdevice */ s = &dev->subdevices[4]; s->type = COMEDI_SUBD_PWM; s->subdev_flags = SDF_WRITABLE | SDF_PWM_HBRIDGE; s->n_chan = 8; s->maxdata = devpriv->pwm_buf_sz; s->insn_write = usbdux_pwm_write; s->insn_config = usbdux_pwm_config; usbdux_pwm_period(dev, s, PWM_DEFAULT_PERIOD); } return 0; } static void usbdux_detach(struct comedi_device *dev) { struct usb_interface *intf = comedi_to_usb_interface(dev); struct usbdux_private *devpriv = dev->private; usb_set_intfdata(intf, NULL); if (!devpriv) return; mutex_lock(&devpriv->mut); /* force unlink all urbs */ usbdux_pwm_stop(dev, 1); usbdux_ao_stop(dev, 1); usbdux_ai_stop(dev, 1); usbdux_free_usb_buffers(dev); mutex_unlock(&devpriv->mut); mutex_destroy(&devpriv->mut); } static struct comedi_driver usbdux_driver = { .driver_name = "usbdux", .module = THIS_MODULE, .auto_attach = usbdux_auto_attach, .detach = usbdux_detach, }; static int usbdux_usb_probe(struct usb_interface *intf, const struct usb_device_id *id) { return comedi_usb_auto_config(intf, &usbdux_driver, 0); } static const struct usb_device_id usbdux_usb_table[] = { { USB_DEVICE(0x13d8, 0x0001) }, { USB_DEVICE(0x13d8, 0x0002) }, { } }; MODULE_DEVICE_TABLE(usb, usbdux_usb_table); static struct usb_driver usbdux_usb_driver = { .name = "usbdux", .probe = usbdux_usb_probe, .disconnect = comedi_usb_auto_unconfig, .id_table = usbdux_usb_table, }; module_comedi_usb_driver(usbdux_driver, usbdux_usb_driver); MODULE_AUTHOR("Bernd Porr, BerndPorr@f2s.com"); MODULE_DESCRIPTION("Stirling/ITL USB-DUX -- Bernd.Porr@f2s.com"); MODULE_LICENSE("GPL"); MODULE_FIRMWARE(USBDUX_FIRMWARE); |
| 56 3 10 10 10 14 10 13 12 11 264 14 14 1 14 12 13 13 13 264 10 6 5 4 3 7 69 69 69 10 13 13 3 3 3 1 28 12 28 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 | // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * RAW - implementation of IP "raw" sockets. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * * Fixes: * Alan Cox : verify_area() fixed up * Alan Cox : ICMP error handling * Alan Cox : EMSGSIZE if you send too big a packet * Alan Cox : Now uses generic datagrams and shared * skbuff library. No more peek crashes, * no more backlogs * Alan Cox : Checks sk->broadcast. * Alan Cox : Uses skb_free_datagram/skb_copy_datagram * Alan Cox : Raw passes ip options too * Alan Cox : Setsocketopt added * Alan Cox : Fixed error return for broadcasts * Alan Cox : Removed wake_up calls * Alan Cox : Use ttl/tos * Alan Cox : Cleaned up old debugging * Alan Cox : Use new kernel side addresses * Arnt Gulbrandsen : Fixed MSG_DONTROUTE in raw sockets. * Alan Cox : BSD style RAW socket demultiplexing. * Alan Cox : Beginnings of mrouted support. * Alan Cox : Added IP_HDRINCL option. * Alan Cox : Skip broadcast check if BSDism set. * David S. Miller : New socket lookup architecture. */ #include <linux/types.h> #include <linux/atomic.h> #include <asm/byteorder.h> #include <asm/current.h> #include <linux/uaccess.h> #include <asm/ioctls.h> #include <linux/stddef.h> #include <linux/slab.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/spinlock.h> #include <linux/sockios.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/mroute.h> #include <linux/netdevice.h> #include <linux/in_route.h> #include <linux/route.h> #include <linux/skbuff.h> #include <linux/igmp.h> #include <net/net_namespace.h> #include <net/dst.h> #include <net/sock.h> #include <linux/ip.h> #include <linux/net.h> #include <net/ip.h> #include <net/icmp.h> #include <net/udp.h> #include <net/raw.h> #include <net/snmp.h> #include <net/tcp_states.h> #include <net/inet_common.h> #include <net/checksum.h> #include <net/xfrm.h> #include <linux/rtnetlink.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/compat.h> #include <linux/uio.h> struct raw_frag_vec { struct msghdr *msg; union { struct icmphdr icmph; char c[1]; } hdr; int hlen; }; struct raw_hashinfo raw_v4_hashinfo; EXPORT_SYMBOL_GPL(raw_v4_hashinfo); int raw_hash_sk(struct sock *sk) { struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; struct hlist_head *hlist; hlist = &h->ht[raw_hashfunc(sock_net(sk), inet_sk(sk)->inet_num)]; spin_lock(&h->lock); sk_add_node_rcu(sk, hlist); sock_set_flag(sk, SOCK_RCU_FREE); spin_unlock(&h->lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); return 0; } EXPORT_SYMBOL_GPL(raw_hash_sk); void raw_unhash_sk(struct sock *sk) { struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; spin_lock(&h->lock); if (sk_del_node_init_rcu(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_unlock(&h->lock); } EXPORT_SYMBOL_GPL(raw_unhash_sk); bool raw_v4_match(struct net *net, const struct sock *sk, unsigned short num, __be32 raddr, __be32 laddr, int dif, int sdif) { const struct inet_sock *inet = inet_sk(sk); if (net_eq(sock_net(sk), net) && inet->inet_num == num && !(inet->inet_daddr && inet->inet_daddr != raddr) && !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) && raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return true; return false; } EXPORT_SYMBOL_GPL(raw_v4_match); /* * 0 - deliver * 1 - block */ static int icmp_filter(const struct sock *sk, const struct sk_buff *skb) { struct icmphdr _hdr; const struct icmphdr *hdr; hdr = skb_header_pointer(skb, skb_transport_offset(skb), sizeof(_hdr), &_hdr); if (!hdr) return 1; if (hdr->type < 32) { __u32 data = raw_sk(sk)->filter.data; return ((1U << hdr->type) & data) != 0; } /* Do not block unknown ICMP types */ return 0; } /* IP input processing comes here for RAW socket delivery. * Caller owns SKB, so we must make clones. * * RFC 1122: SHOULD pass TOS value up to the transport layer. * -> It does. And not only TOS, but all IP header. */ static int raw_v4_input(struct net *net, struct sk_buff *skb, const struct iphdr *iph, int hash) { int sdif = inet_sdif(skb); struct hlist_head *hlist; int dif = inet_iif(skb); int delivered = 0; struct sock *sk; hlist = &raw_v4_hashinfo.ht[hash]; rcu_read_lock(); sk_for_each_rcu(sk, hlist) { if (!raw_v4_match(net, sk, iph->protocol, iph->saddr, iph->daddr, dif, sdif)) continue; delivered = 1; if ((iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) && ip_mc_sf_allow(sk, iph->daddr, iph->saddr, skb->dev->ifindex, sdif)) { struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); /* Not releasing hash table! */ if (clone) raw_rcv(sk, clone); } } rcu_read_unlock(); return delivered; } int raw_local_deliver(struct sk_buff *skb, int protocol) { struct net *net = dev_net(skb->dev); return raw_v4_input(net, skb, ip_hdr(skb), raw_hashfunc(net, protocol)); } static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) { struct inet_sock *inet = inet_sk(sk); const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; int harderr = 0; bool recverr; int err = 0; if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) ipv4_sk_update_pmtu(skb, sk, info); else if (type == ICMP_REDIRECT) { ipv4_sk_redirect(skb, sk); return; } /* Report error on raw socket, if: 1. User requested ip_recverr. 2. Socket is connected (otherwise the error indication is useless without ip_recverr and error is hard. */ recverr = inet_test_bit(RECVERR, sk); if (!recverr && sk->sk_state != TCP_ESTABLISHED) return; switch (type) { default: case ICMP_TIME_EXCEEDED: err = EHOSTUNREACH; break; case ICMP_SOURCE_QUENCH: return; case ICMP_PARAMETERPROB: err = EPROTO; harderr = 1; break; case ICMP_DEST_UNREACH: err = EHOSTUNREACH; if (code > NR_ICMP_UNREACH) break; if (code == ICMP_FRAG_NEEDED) { harderr = inet->pmtudisc != IP_PMTUDISC_DONT; err = EMSGSIZE; } else { err = icmp_err_convert[code].errno; harderr = icmp_err_convert[code].fatal; } } if (recverr) { const struct iphdr *iph = (const struct iphdr *)skb->data; u8 *payload = skb->data + (iph->ihl << 2); if (inet_test_bit(HDRINCL, sk)) payload = skb->data; ip_icmp_error(sk, skb, err, 0, info, payload); } if (recverr || harderr) { sk->sk_err = err; sk_error_report(sk); } } void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) { struct net *net = dev_net(skb->dev); int dif = skb->dev->ifindex; int sdif = inet_sdif(skb); struct hlist_head *hlist; const struct iphdr *iph; struct sock *sk; int hash; hash = raw_hashfunc(net, protocol); hlist = &raw_v4_hashinfo.ht[hash]; rcu_read_lock(); sk_for_each_rcu(sk, hlist) { iph = (const struct iphdr *)skb->data; if (!raw_v4_match(net, sk, iph->protocol, iph->daddr, iph->saddr, dif, sdif)) continue; raw_err(sk, skb, info); } rcu_read_unlock(); } static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb) { enum skb_drop_reason reason; /* Charge it to the socket. */ ipv4_pktinfo_prepare(sk, skb); if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) { kfree_skb_reason(skb, reason); return NET_RX_DROP; } return NET_RX_SUCCESS; } int raw_rcv(struct sock *sk, struct sk_buff *skb) { if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { atomic_inc(&sk->sk_drops); kfree_skb_reason(skb, SKB_DROP_REASON_XFRM_POLICY); return NET_RX_DROP; } nf_reset_ct(skb); skb_push(skb, skb->data - skb_network_header(skb)); raw_rcv_skb(sk, skb); return 0; } static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, struct msghdr *msg, size_t length, struct rtable **rtp, unsigned int flags, const struct sockcm_cookie *sockc) { struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); struct iphdr *iph; struct sk_buff *skb; unsigned int iphlen; int err; struct rtable *rt = *rtp; int hlen, tlen; if (length > rt->dst.dev->mtu) { ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, rt->dst.dev->mtu); return -EMSGSIZE; } if (length < sizeof(struct iphdr)) return -EINVAL; if (flags&MSG_PROBE) goto out; hlen = LL_RESERVED_SPACE(rt->dst.dev); tlen = rt->dst.dev->needed_tailroom; skb = sock_alloc_send_skb(sk, length + hlen + tlen + 15, flags & MSG_DONTWAIT, &err); if (!skb) goto error; skb_reserve(skb, hlen); skb->priority = READ_ONCE(sk->sk_priority); skb->mark = sockc->mark; skb->tstamp = sockc->transmit_time; skb_dst_set(skb, &rt->dst); *rtp = NULL; skb_reset_network_header(skb); iph = ip_hdr(skb); skb_put(skb, length); skb->ip_summed = CHECKSUM_NONE; skb_setup_tx_timestamp(skb, sockc->tsflags); if (flags & MSG_CONFIRM) skb_set_dst_pending_confirm(skb, 1); skb->transport_header = skb->network_header; err = -EFAULT; if (memcpy_from_msg(iph, msg, length)) goto error_free; iphlen = iph->ihl * 4; /* * We don't want to modify the ip header, but we do need to * be sure that it won't cause problems later along the network * stack. Specifically we want to make sure that iph->ihl is a * sane value. If ihl points beyond the length of the buffer passed * in, reject the frame as invalid */ err = -EINVAL; if (iphlen > length) goto error_free; if (iphlen >= sizeof(*iph)) { if (!iph->saddr) iph->saddr = fl4->saddr; iph->check = 0; iph->tot_len = htons(length); if (!iph->id) ip_select_ident(net, skb, NULL); iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); skb->transport_header += iphlen; if (iph->protocol == IPPROTO_ICMP && length >= iphlen + sizeof(struct icmphdr)) icmp_out_count(net, ((struct icmphdr *) skb_transport_header(skb))->type); } err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, rt->dst.dev, dst_output); if (err > 0) err = net_xmit_errno(err); if (err) goto error; out: return 0; error_free: kfree_skb(skb); error: IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); if (err == -ENOBUFS && !inet_test_bit(RECVERR, sk)) err = 0; return err; } static int raw_probe_proto_opt(struct raw_frag_vec *rfv, struct flowi4 *fl4) { int err; if (fl4->flowi4_proto != IPPROTO_ICMP) return 0; /* We only need the first two bytes. */ rfv->hlen = 2; err = memcpy_from_msg(rfv->hdr.c, rfv->msg, rfv->hlen); if (err) return err; fl4->fl4_icmp_type = rfv->hdr.icmph.type; fl4->fl4_icmp_code = rfv->hdr.icmph.code; return 0; } static int raw_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { struct raw_frag_vec *rfv = from; if (offset < rfv->hlen) { int copy = min(rfv->hlen - offset, len); if (skb->ip_summed == CHECKSUM_PARTIAL) memcpy(to, rfv->hdr.c + offset, copy); else skb->csum = csum_block_add( skb->csum, csum_partial_copy_nocheck(rfv->hdr.c + offset, to, copy), odd); odd = 0; offset += copy; to += copy; len -= copy; if (!len) return 0; } offset -= rfv->hlen; return ip_generic_getfrag(rfv->msg, to, offset, len, odd, skb); } static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); struct ipcm_cookie ipc; struct rtable *rt = NULL; struct flowi4 fl4; u8 tos, scope; int free = 0; __be32 daddr; __be32 saddr; int err; struct ip_options_data opt_copy; struct raw_frag_vec rfv; int hdrincl; err = -EMSGSIZE; if (len > 0xFFFF) goto out; hdrincl = inet_test_bit(HDRINCL, sk); /* * Check the flags. */ err = -EOPNOTSUPP; if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message */ goto out; /* compatibility */ /* * Get and verify the address. */ if (msg->msg_namelen) { DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); err = -EINVAL; if (msg->msg_namelen < sizeof(*usin)) goto out; if (usin->sin_family != AF_INET) { pr_info_once("%s: %s forgot to set AF_INET. Fix it!\n", __func__, current->comm); err = -EAFNOSUPPORT; if (usin->sin_family) goto out; } daddr = usin->sin_addr.s_addr; /* ANK: I did not forget to get protocol from port field. * I just do not know, who uses this weirdness. * IP_HDRINCL is much more convenient. */ } else { err = -EDESTADDRREQ; if (sk->sk_state != TCP_ESTABLISHED) goto out; daddr = inet->inet_daddr; } ipcm_init_sk(&ipc, inet); /* Keep backward compat */ if (hdrincl) ipc.protocol = IPPROTO_RAW; if (msg->msg_controllen) { err = ip_cmsg_send(sk, msg, &ipc, false); if (unlikely(err)) { kfree(ipc.opt); goto out; } if (ipc.opt) free = 1; } saddr = ipc.addr; ipc.addr = daddr; if (!ipc.opt) { struct ip_options_rcu *inet_opt; rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { memcpy(&opt_copy, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen); ipc.opt = &opt_copy.opt; } rcu_read_unlock(); } if (ipc.opt) { err = -EINVAL; /* Linux does not mangle headers on raw sockets, * so that IP options + IP_HDRINCL is non-sense. */ if (hdrincl) goto done; if (ipc.opt->opt.srr) { if (!daddr) goto done; daddr = ipc.opt->opt.faddr; } } tos = get_rttos(&ipc, inet); scope = ip_sendmsg_scope(inet, &ipc, msg); if (ipv4_is_multicast(daddr)) { if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) ipc.oif = inet->mc_index; if (!saddr) saddr = inet->mc_addr; } else if (!ipc.oif) { ipc.oif = inet->uc_index; } else if (ipv4_is_lbcast(daddr) && inet->uc_index) { /* oif is set, packet is to local broadcast * and uc_index is set. oif is most likely set * by sk_bound_dev_if. If uc_index != oif check if the * oif is an L3 master and uc_index is an L3 slave. * If so, we want to allow the send using the uc_index. */ if (ipc.oif != inet->uc_index && ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk), inet->uc_index)) { ipc.oif = inet->uc_index; } } flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, scope, hdrincl ? ipc.protocol : sk->sk_protocol, inet_sk_flowi_flags(sk) | (hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), daddr, saddr, 0, 0, sk->sk_uid); if (!hdrincl) { rfv.msg = msg; rfv.hlen = 0; err = raw_probe_proto_opt(&rfv, &fl4); if (err) goto done; } security_sk_classify_flow(sk, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; goto done; } err = -EACCES; if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST)) goto done; if (msg->msg_flags & MSG_CONFIRM) goto do_confirm; back_from_confirm: if (hdrincl) err = raw_send_hdrinc(sk, &fl4, msg, len, &rt, msg->msg_flags, &ipc.sockc); else { if (!ipc.addr) ipc.addr = fl4.daddr; lock_sock(sk); err = ip_append_data(sk, &fl4, raw_getfrag, &rfv, len, 0, &ipc, &rt, msg->msg_flags); if (err) ip_flush_pending_frames(sk); else if (!(msg->msg_flags & MSG_MORE)) { err = ip_push_pending_frames(sk, &fl4); if (err == -ENOBUFS && !inet_test_bit(RECVERR, sk)) err = 0; } release_sock(sk); } done: if (free) kfree(ipc.opt); ip_rt_put(rt); out: if (err < 0) return err; return len; do_confirm: if (msg->msg_flags & MSG_PROBE) dst_confirm_neigh(&rt->dst, &fl4.daddr); if (!(msg->msg_flags & MSG_PROBE) || len) goto back_from_confirm; err = 0; goto done; } static void raw_close(struct sock *sk, long timeout) { /* * Raw sockets may have direct kernel references. Kill them. */ ip_ra_control(sk, 0, NULL); sk_common_release(sk); } static void raw_destroy(struct sock *sk) { lock_sock(sk); ip_flush_pending_frames(sk); release_sock(sk); } /* This gets rid of all the nasties in af_inet. -DaveM */ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; struct net *net = sock_net(sk); u32 tb_id = RT_TABLE_LOCAL; int ret = -EINVAL; int chk_addr_ret; lock_sock(sk); if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in)) goto out; if (sk->sk_bound_dev_if) tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id; chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id); ret = -EADDRNOTAVAIL; if (!inet_addr_valid_or_nonlocal(net, inet, addr->sin_addr.s_addr, chk_addr_ret)) goto out; inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr; if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) inet->inet_saddr = 0; /* Use device */ sk_dst_reset(sk); ret = 0; out: release_sock(sk); return ret; } /* * This should be easy, if there is something there * we return it, otherwise we block. */ static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct inet_sock *inet = inet_sk(sk); size_t copied = 0; int err = -EOPNOTSUPP; DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct sk_buff *skb; if (flags & MSG_OOB) goto out; if (flags & MSG_ERRQUEUE) { err = ip_recv_error(sk, msg, len, addr_len); goto out; } skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto out; copied = skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; sock_recv_cmsgs(msg, sk, skb); /* Copy the address. */ if (sin) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; sin->sin_port = 0; memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin); } if (inet_cmsg_flags(inet)) ip_cmsg_recv(msg, skb); if (flags & MSG_TRUNC) copied = skb->len; done: skb_free_datagram(sk, skb); out: if (err) return err; return copied; } static int raw_sk_init(struct sock *sk) { struct raw_sock *rp = raw_sk(sk); if (inet_sk(sk)->inet_num == IPPROTO_ICMP) memset(&rp->filter, 0, sizeof(rp->filter)); return 0; } static int raw_seticmpfilter(struct sock *sk, sockptr_t optval, int optlen) { if (optlen > sizeof(struct icmp_filter)) optlen = sizeof(struct icmp_filter); if (copy_from_sockptr(&raw_sk(sk)->filter, optval, optlen)) return -EFAULT; return 0; } static int raw_geticmpfilter(struct sock *sk, char __user *optval, int __user *optlen) { int len, ret = -EFAULT; if (get_user(len, optlen)) goto out; ret = -EINVAL; if (len < 0) goto out; if (len > sizeof(struct icmp_filter)) len = sizeof(struct icmp_filter); ret = -EFAULT; if (put_user(len, optlen) || copy_to_user(optval, &raw_sk(sk)->filter, len)) goto out; ret = 0; out: return ret; } static int do_raw_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { if (optname == ICMP_FILTER) { if (inet_sk(sk)->inet_num != IPPROTO_ICMP) return -EOPNOTSUPP; else return raw_seticmpfilter(sk, optval, optlen); } return -ENOPROTOOPT; } static int raw_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { if (level != SOL_RAW) return ip_setsockopt(sk, level, optname, optval, optlen); return do_raw_setsockopt(sk, level, optname, optval, optlen); } static int do_raw_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { if (optname == ICMP_FILTER) { if (inet_sk(sk)->inet_num != IPPROTO_ICMP) return -EOPNOTSUPP; else return raw_geticmpfilter(sk, optval, optlen); } return -ENOPROTOOPT; } static int raw_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { if (level != SOL_RAW) return ip_getsockopt(sk, level, optname, optval, optlen); return do_raw_getsockopt(sk, level, optname, optval, optlen); } static int raw_ioctl(struct sock *sk, int cmd, int *karg) { switch (cmd) { case SIOCOUTQ: { *karg = sk_wmem_alloc_get(sk); return 0; } case SIOCINQ: { struct sk_buff *skb; spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb) *karg = skb->len; else *karg = 0; spin_unlock_bh(&sk->sk_receive_queue.lock); return 0; } default: #ifdef CONFIG_IP_MROUTE return ipmr_ioctl(sk, cmd, karg); #else return -ENOIOCTLCMD; #endif } } #ifdef CONFIG_COMPAT static int compat_raw_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg) { switch (cmd) { case SIOCOUTQ: case SIOCINQ: return -ENOIOCTLCMD; default: #ifdef CONFIG_IP_MROUTE return ipmr_compat_ioctl(sk, cmd, compat_ptr(arg)); #else return -ENOIOCTLCMD; #endif } } #endif int raw_abort(struct sock *sk, int err) { lock_sock(sk); sk->sk_err = err; sk_error_report(sk); __udp_disconnect(sk, 0); release_sock(sk); return 0; } EXPORT_SYMBOL_GPL(raw_abort); struct proto raw_prot = { .name = "RAW", .owner = THIS_MODULE, .close = raw_close, .destroy = raw_destroy, .connect = ip4_datagram_connect, .disconnect = __udp_disconnect, .ioctl = raw_ioctl, .init = raw_sk_init, .setsockopt = raw_setsockopt, .getsockopt = raw_getsockopt, .sendmsg = raw_sendmsg, .recvmsg = raw_recvmsg, .bind = raw_bind, .backlog_rcv = raw_rcv_skb, .release_cb = ip4_datagram_release_cb, .hash = raw_hash_sk, .unhash = raw_unhash_sk, .obj_size = sizeof(struct raw_sock), .useroffset = offsetof(struct raw_sock, filter), .usersize = sizeof_field(struct raw_sock, filter), .h.raw_hash = &raw_v4_hashinfo, #ifdef CONFIG_COMPAT .compat_ioctl = compat_raw_ioctl, #endif .diag_destroy = raw_abort, }; #ifdef CONFIG_PROC_FS static struct sock *raw_get_first(struct seq_file *seq, int bucket) { struct raw_hashinfo *h = pde_data(file_inode(seq->file)); struct raw_iter_state *state = raw_seq_private(seq); struct hlist_head *hlist; struct sock *sk; for (state->bucket = bucket; state->bucket < RAW_HTABLE_SIZE; ++state->bucket) { hlist = &h->ht[state->bucket]; sk_for_each(sk, hlist) { if (sock_net(sk) == seq_file_net(seq)) return sk; } } return NULL; } static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk) { struct raw_iter_state *state = raw_seq_private(seq); do { sk = sk_next(sk); } while (sk && sock_net(sk) != seq_file_net(seq)); if (!sk) return raw_get_first(seq, state->bucket + 1); return sk; } static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos) { struct sock *sk = raw_get_first(seq, 0); if (sk) while (pos && (sk = raw_get_next(seq, sk)) != NULL) --pos; return pos ? NULL : sk; } void *raw_seq_start(struct seq_file *seq, loff_t *pos) __acquires(&h->lock) { struct raw_hashinfo *h = pde_data(file_inode(seq->file)); spin_lock(&h->lock); return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } EXPORT_SYMBOL_GPL(raw_seq_start); void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sock *sk; if (v == SEQ_START_TOKEN) sk = raw_get_first(seq, 0); else sk = raw_get_next(seq, v); ++*pos; return sk; } EXPORT_SYMBOL_GPL(raw_seq_next); void raw_seq_stop(struct seq_file *seq, void *v) __releases(&h->lock) { struct raw_hashinfo *h = pde_data(file_inode(seq->file)); spin_unlock(&h->lock); } EXPORT_SYMBOL_GPL(raw_seq_stop); static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) { struct inet_sock *inet = inet_sk(sp); __be32 dest = inet->inet_daddr, src = inet->inet_rcv_saddr; __u16 destp = 0, srcp = inet->inet_num; seq_printf(seq, "%4d: %08X:%04X %08X:%04X" " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u\n", i, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), 0, 0L, 0, from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); } static int raw_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) seq_printf(seq, " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " "inode ref pointer drops\n"); else raw_sock_seq_show(seq, v, raw_seq_private(seq)->bucket); return 0; } static const struct seq_operations raw_seq_ops = { .start = raw_seq_start, .next = raw_seq_next, .stop = raw_seq_stop, .show = raw_seq_show, }; static __net_init int raw_init_net(struct net *net) { if (!proc_create_net_data("raw", 0444, net->proc_net, &raw_seq_ops, sizeof(struct raw_iter_state), &raw_v4_hashinfo)) return -ENOMEM; return 0; } static __net_exit void raw_exit_net(struct net *net) { remove_proc_entry("raw", net->proc_net); } static __net_initdata struct pernet_operations raw_net_ops = { .init = raw_init_net, .exit = raw_exit_net, }; int __init raw_proc_init(void) { return register_pernet_subsys(&raw_net_ops); } void __init raw_proc_exit(void) { unregister_pernet_subsys(&raw_net_ops); } #endif /* CONFIG_PROC_FS */ static void raw_sysctl_init_net(struct net *net) { #ifdef CONFIG_NET_L3_MASTER_DEV net->ipv4.sysctl_raw_l3mdev_accept = 1; #endif } static int __net_init raw_sysctl_init(struct net *net) { raw_sysctl_init_net(net); return 0; } static struct pernet_operations __net_initdata raw_sysctl_ops = { .init = raw_sysctl_init, }; void __init raw_init(void) { raw_sysctl_init_net(&init_net); if (register_pernet_subsys(&raw_sysctl_ops)) panic("RAW: failed to init sysctl parameters.\n"); } |
| 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 | // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001-2002 Intel Corp. * * This file is part of the SCTP kernel implementation * * These functions work with the state functions in sctp_sm_statefuns.c * to implement the state operations. These functions implement the * steps which require modifying existing data structures. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * C. Robin <chris@hundredacre.ac.uk> * Jon Grimm <jgrimm@us.ibm.com> * Xingang Guo <xingang.guo@intel.com> * Dajiang Zhang <dajiang.zhang@nokia.com> * Sridhar Samudrala <sri@us.ibm.com> * Daisy Chang <daisyc@us.ibm.com> * Ardelle Fan <ardelle.fan@intel.com> * Kevin Gao <kevin.gao@intel.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <crypto/hash.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/net.h> #include <linux/inet.h> #include <linux/scatterlist.h> #include <linux/slab.h> #include <net/sock.h> #include <linux/skbuff.h> #include <linux/random.h> /* for get_random_bytes */ #include <net/sctp/sctp.h> #include <net/sctp/sm.h> static struct sctp_chunk *sctp_make_control(const struct sctp_association *asoc, __u8 type, __u8 flags, int paylen, gfp_t gfp); static struct sctp_chunk *sctp_make_data(const struct sctp_association *asoc, __u8 flags, int paylen, gfp_t gfp); static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc, __u8 type, __u8 flags, int paylen, gfp_t gfp); static struct sctp_cookie_param *sctp_pack_cookie( const struct sctp_endpoint *ep, const struct sctp_association *asoc, const struct sctp_chunk *init_chunk, int *cookie_len, const __u8 *raw_addrs, int addrs_len); static int sctp_process_param(struct sctp_association *asoc, union sctp_params param, const union sctp_addr *peer_addr, gfp_t gfp); static void *sctp_addto_param(struct sctp_chunk *chunk, int len, const void *data); /* Control chunk destructor */ static void sctp_control_release_owner(struct sk_buff *skb) { struct sctp_chunk *chunk = skb_shinfo(skb)->destructor_arg; if (chunk->shkey) { struct sctp_shared_key *shkey = chunk->shkey; struct sctp_association *asoc = chunk->asoc; /* refcnt == 2 and !list_empty mean after this release, it's * not being used anywhere, and it's time to notify userland * that this shkey can be freed if it's been deactivated. */ if (shkey->deactivated && !list_empty(&shkey->key_list) && refcount_read(&shkey->refcnt) == 2) { struct sctp_ulpevent *ev; ev = sctp_ulpevent_make_authkey(asoc, shkey->key_id, SCTP_AUTH_FREE_KEY, GFP_KERNEL); if (ev) asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } sctp_auth_shkey_release(chunk->shkey); } } static void sctp_control_set_owner_w(struct sctp_chunk *chunk) { struct sctp_association *asoc = chunk->asoc; struct sk_buff *skb = chunk->skb; /* TODO: properly account for control chunks. * To do it right we'll need: * 1) endpoint if association isn't known. * 2) proper memory accounting. * * For now don't do anything for now. */ if (chunk->auth) { chunk->shkey = asoc->shkey; sctp_auth_shkey_hold(chunk->shkey); } skb->sk = asoc ? asoc->base.sk : NULL; skb_shinfo(skb)->destructor_arg = chunk; skb->destructor = sctp_control_release_owner; } /* What was the inbound interface for this chunk? */ int sctp_chunk_iif(const struct sctp_chunk *chunk) { struct sk_buff *skb = chunk->skb; return SCTP_INPUT_CB(skb)->af->skb_iif(skb); } /* RFC 2960 3.3.2 Initiation (INIT) (1) * * Note 2: The ECN capable field is reserved for future use of * Explicit Congestion Notification. */ static const struct sctp_paramhdr ecap_param = { SCTP_PARAM_ECN_CAPABLE, cpu_to_be16(sizeof(struct sctp_paramhdr)), }; static const struct sctp_paramhdr prsctp_param = { SCTP_PARAM_FWD_TSN_SUPPORT, cpu_to_be16(sizeof(struct sctp_paramhdr)), }; /* A helper to initialize an op error inside a provided chunk, as most * cause codes will be embedded inside an abort chunk. */ int sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code, size_t paylen) { struct sctp_errhdr err; __u16 len; /* Cause code constants are now defined in network order. */ err.cause = cause_code; len = sizeof(err) + paylen; err.length = htons(len); if (skb_tailroom(chunk->skb) < len) return -ENOSPC; chunk->subh.err_hdr = sctp_addto_chunk(chunk, sizeof(err), &err); return 0; } /* 3.3.2 Initiation (INIT) (1) * * This chunk is used to initiate a SCTP association between two * endpoints. The format of the INIT chunk is shown below: * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 1 | Chunk Flags | Chunk Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Initiate Tag | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Advertised Receiver Window Credit (a_rwnd) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Number of Outbound Streams | Number of Inbound Streams | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Initial TSN | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * \ \ * / Optional/Variable-Length Parameters / * \ \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * * The INIT chunk contains the following parameters. Unless otherwise * noted, each parameter MUST only be included once in the INIT chunk. * * Fixed Parameters Status * ---------------------------------------------- * Initiate Tag Mandatory * Advertised Receiver Window Credit Mandatory * Number of Outbound Streams Mandatory * Number of Inbound Streams Mandatory * Initial TSN Mandatory * * Variable Parameters Status Type Value * ------------------------------------------------------------- * IPv4 Address (Note 1) Optional 5 * IPv6 Address (Note 1) Optional 6 * Cookie Preservative Optional 9 * Reserved for ECN Capable (Note 2) Optional 32768 (0x8000) * Host Name Address (Note 3) Optional 11 * Supported Address Types (Note 4) Optional 12 */ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, const struct sctp_bind_addr *bp, gfp_t gfp, int vparam_len) { struct sctp_supported_ext_param ext_param; struct sctp_adaptation_ind_param aiparam; struct sctp_paramhdr *auth_chunks = NULL; struct sctp_paramhdr *auth_hmacs = NULL; struct sctp_supported_addrs_param sat; struct sctp_endpoint *ep = asoc->ep; struct sctp_chunk *retval = NULL; int num_types, addrs_len = 0; struct sctp_inithdr init; union sctp_params addrs; struct sctp_sock *sp; __u8 extensions[5]; size_t chunksize; __be16 types[2]; int num_ext = 0; /* RFC 2960 3.3.2 Initiation (INIT) (1) * * Note 1: The INIT chunks can contain multiple addresses that * can be IPv4 and/or IPv6 in any combination. */ /* Convert the provided bind address list to raw format. */ addrs = sctp_bind_addrs_to_raw(bp, &addrs_len, gfp); init.init_tag = htonl(asoc->c.my_vtag); init.a_rwnd = htonl(asoc->rwnd); init.num_outbound_streams = htons(asoc->c.sinit_num_ostreams); init.num_inbound_streams = htons(asoc->c.sinit_max_instreams); init.initial_tsn = htonl(asoc->c.initial_tsn); /* How many address types are needed? */ sp = sctp_sk(asoc->base.sk); num_types = sp->pf->supported_addrs(sp, types); chunksize = sizeof(init) + addrs_len; chunksize += SCTP_PAD4(SCTP_SAT_LEN(num_types)); if (asoc->ep->ecn_enable) chunksize += sizeof(ecap_param); if (asoc->ep->prsctp_enable) chunksize += sizeof(prsctp_param); /* ADDIP: Section 4.2.7: * An implementation supporting this extension [ADDIP] MUST list * the ASCONF,the ASCONF-ACK, and the AUTH chunks in its INIT and * INIT-ACK parameters. */ if (asoc->ep->asconf_enable) { extensions[num_ext] = SCTP_CID_ASCONF; extensions[num_ext+1] = SCTP_CID_ASCONF_ACK; num_ext += 2; } if (asoc->ep->reconf_enable) { extensions[num_ext] = SCTP_CID_RECONF; num_ext += 1; } if (sp->adaptation_ind) chunksize += sizeof(aiparam); if (asoc->ep->intl_enable) { extensions[num_ext] = SCTP_CID_I_DATA; num_ext += 1; } chunksize += vparam_len; /* Account for AUTH related parameters */ if (ep->auth_enable) { /* Add random parameter length*/ chunksize += sizeof(asoc->c.auth_random); /* Add HMACS parameter length if any were defined */ auth_hmacs = (struct sctp_paramhdr *)asoc->c.auth_hmacs; if (auth_hmacs->length) chunksize += SCTP_PAD4(ntohs(auth_hmacs->length)); else auth_hmacs = NULL; /* Add CHUNKS parameter length */ auth_chunks = (struct sctp_paramhdr *)asoc->c.auth_chunks; if (auth_chunks->length) chunksize += SCTP_PAD4(ntohs(auth_chunks->length)); else auth_chunks = NULL; extensions[num_ext] = SCTP_CID_AUTH; num_ext += 1; } /* If we have any extensions to report, account for that */ if (num_ext) chunksize += SCTP_PAD4(sizeof(ext_param) + num_ext); /* RFC 2960 3.3.2 Initiation (INIT) (1) * * Note 3: An INIT chunk MUST NOT contain more than one Host * Name address parameter. Moreover, the sender of the INIT * MUST NOT combine any other address types with the Host Name * address in the INIT. The receiver of INIT MUST ignore any * other address types if the Host Name address parameter is * present in the received INIT chunk. * * PLEASE DO NOT FIXME [This version does not support Host Name.] */ retval = sctp_make_control(asoc, SCTP_CID_INIT, 0, chunksize, gfp); if (!retval) goto nodata; retval->subh.init_hdr = sctp_addto_chunk(retval, sizeof(init), &init); retval->param_hdr.v = sctp_addto_chunk(retval, addrs_len, addrs.v); /* RFC 2960 3.3.2 Initiation (INIT) (1) * * Note 4: This parameter, when present, specifies all the * address types the sending endpoint can support. The absence * of this parameter indicates that the sending endpoint can * support any address type. */ sat.param_hdr.type = SCTP_PARAM_SUPPORTED_ADDRESS_TYPES; sat.param_hdr.length = htons(SCTP_SAT_LEN(num_types)); sctp_addto_chunk(retval, sizeof(sat), &sat); sctp_addto_chunk(retval, num_types * sizeof(__u16), &types); if (asoc->ep->ecn_enable) sctp_addto_chunk(retval, sizeof(ecap_param), &ecap_param); /* Add the supported extensions parameter. Be nice and add this * fist before addiding the parameters for the extensions themselves */ if (num_ext) { ext_param.param_hdr.type = SCTP_PARAM_SUPPORTED_EXT; ext_param.param_hdr.length = htons(sizeof(ext_param) + num_ext); sctp_addto_chunk(retval, sizeof(ext_param), &ext_param); sctp_addto_param(retval, num_ext, extensions); } if (asoc->ep->prsctp_enable) sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param); if (sp->adaptation_ind) { aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND; aiparam.param_hdr.length = htons(sizeof(aiparam)); aiparam.adaptation_ind = htonl(sp->adaptation_ind); sctp_addto_chunk(retval, sizeof(aiparam), &aiparam); } /* Add SCTP-AUTH chunks to the parameter list */ if (ep->auth_enable) { sctp_addto_chunk(retval, sizeof(asoc->c.auth_random), asoc->c.auth_random); if (auth_hmacs) sctp_addto_chunk(retval, ntohs(auth_hmacs->length), auth_hmacs); if (auth_chunks) sctp_addto_chunk(retval, ntohs(auth_chunks->length), auth_chunks); } nodata: kfree(addrs.v); return retval; } struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, const struct sctp_chunk *chunk, gfp_t gfp, int unkparam_len) { struct sctp_supported_ext_param ext_param; struct sctp_adaptation_ind_param aiparam; struct sctp_paramhdr *auth_chunks = NULL; struct sctp_paramhdr *auth_random = NULL; struct sctp_paramhdr *auth_hmacs = NULL; struct sctp_chunk *retval = NULL; struct sctp_cookie_param *cookie; struct sctp_inithdr initack; union sctp_params addrs; struct sctp_sock *sp; __u8 extensions[5]; size_t chunksize; int num_ext = 0; int cookie_len; int addrs_len; /* Note: there may be no addresses to embed. */ addrs = sctp_bind_addrs_to_raw(&asoc->base.bind_addr, &addrs_len, gfp); initack.init_tag = htonl(asoc->c.my_vtag); initack.a_rwnd = htonl(asoc->rwnd); initack.num_outbound_streams = htons(asoc->c.sinit_num_ostreams); initack.num_inbound_streams = htons(asoc->c.sinit_max_instreams); initack.initial_tsn = htonl(asoc->c.initial_tsn); /* FIXME: We really ought to build the cookie right * into the packet instead of allocating more fresh memory. */ cookie = sctp_pack_cookie(asoc->ep, asoc, chunk, &cookie_len, addrs.v, addrs_len); if (!cookie) goto nomem_cookie; /* Calculate the total size of allocation, include the reserved * space for reporting unknown parameters if it is specified. */ sp = sctp_sk(asoc->base.sk); chunksize = sizeof(initack) + addrs_len + cookie_len + unkparam_len; /* Tell peer that we'll do ECN only if peer advertised such cap. */ if (asoc->peer.ecn_capable) chunksize += sizeof(ecap_param); if (asoc->peer.prsctp_capable) chunksize += sizeof(prsctp_param); if (asoc->peer.asconf_capable) { extensions[num_ext] = SCTP_CID_ASCONF; extensions[num_ext+1] = SCTP_CID_ASCONF_ACK; num_ext += 2; } if (asoc->peer.reconf_capable) { extensions[num_ext] = SCTP_CID_RECONF; num_ext += 1; } if (sp->adaptation_ind) chunksize += sizeof(aiparam); if (asoc->peer.intl_capable) { extensions[num_ext] = SCTP_CID_I_DATA; num_ext += 1; } if (asoc->peer.auth_capable) { auth_random = (struct sctp_paramhdr *)asoc->c.auth_random; chunksize += ntohs(auth_random->length); auth_hmacs = (struct sctp_paramhdr *)asoc->c.auth_hmacs; if (auth_hmacs->length) chunksize += SCTP_PAD4(ntohs(auth_hmacs->length)); else auth_hmacs = NULL; auth_chunks = (struct sctp_paramhdr *)asoc->c.auth_chunks; if (auth_chunks->length) chunksize += SCTP_PAD4(ntohs(auth_chunks->length)); else auth_chunks = NULL; extensions[num_ext] = SCTP_CID_AUTH; num_ext += 1; } if (num_ext) chunksize += SCTP_PAD4(sizeof(ext_param) + num_ext); /* Now allocate and fill out the chunk. */ retval = sctp_make_control(asoc, SCTP_CID_INIT_ACK, 0, chunksize, gfp); if (!retval) goto nomem_chunk; /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it received the DATA or control chunk * to which it is replying. * * [INIT ACK back to where the INIT came from.] */ if (chunk->transport) retval->transport = sctp_assoc_lookup_paddr(asoc, &chunk->transport->ipaddr); retval->subh.init_hdr = sctp_addto_chunk(retval, sizeof(initack), &initack); retval->param_hdr.v = sctp_addto_chunk(retval, addrs_len, addrs.v); sctp_addto_chunk(retval, cookie_len, cookie); if (asoc->peer.ecn_capable) sctp_addto_chunk(retval, sizeof(ecap_param), &ecap_param); if (num_ext) { ext_param.param_hdr.type = SCTP_PARAM_SUPPORTED_EXT; ext_param.param_hdr.length = htons(sizeof(ext_param) + num_ext); sctp_addto_chunk(retval, sizeof(ext_param), &ext_param); sctp_addto_param(retval, num_ext, extensions); } if (asoc->peer.prsctp_capable) sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param); if (sp->adaptation_ind) { aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND; aiparam.param_hdr.length = htons(sizeof(aiparam)); aiparam.adaptation_ind = htonl(sp->adaptation_ind); sctp_addto_chunk(retval, sizeof(aiparam), &aiparam); } if (asoc->peer.auth_capable) { sctp_addto_chunk(retval, ntohs(auth_random->length), auth_random); if (auth_hmacs) sctp_addto_chunk(retval, ntohs(auth_hmacs->length), auth_hmacs); if (auth_chunks) sctp_addto_chunk(retval, ntohs(auth_chunks->length), auth_chunks); } /* We need to remove the const qualifier at this point. */ retval->asoc = (struct sctp_association *) asoc; nomem_chunk: kfree(cookie); nomem_cookie: kfree(addrs.v); return retval; } /* 3.3.11 Cookie Echo (COOKIE ECHO) (10): * * This chunk is used only during the initialization of an association. * It is sent by the initiator of an association to its peer to complete * the initialization process. This chunk MUST precede any DATA chunk * sent within the association, but MAY be bundled with one or more DATA * chunks in the same packet. * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 10 |Chunk Flags | Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * / Cookie / * \ \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Chunk Flags: 8 bit * * Set to zero on transmit and ignored on receipt. * * Length: 16 bits (unsigned integer) * * Set to the size of the chunk in bytes, including the 4 bytes of * the chunk header and the size of the Cookie. * * Cookie: variable size * * This field must contain the exact cookie received in the * State Cookie parameter from the previous INIT ACK. * * An implementation SHOULD make the cookie as small as possible * to insure interoperability. */ struct sctp_chunk *sctp_make_cookie_echo(const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_chunk *retval; int cookie_len; void *cookie; cookie = asoc->peer.cookie; cookie_len = asoc->peer.cookie_len; /* Build a cookie echo chunk. */ retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ECHO, 0, cookie_len, GFP_ATOMIC); if (!retval) goto nodata; retval->subh.cookie_hdr = sctp_addto_chunk(retval, cookie_len, cookie); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [COOKIE ECHO back to where the INIT ACK came from.] */ if (chunk) retval->transport = chunk->transport; nodata: return retval; } /* 3.3.12 Cookie Acknowledgement (COOKIE ACK) (11): * * This chunk is used only during the initialization of an * association. It is used to acknowledge the receipt of a COOKIE * ECHO chunk. This chunk MUST precede any DATA or SACK chunk sent * within the association, but MAY be bundled with one or more DATA * chunks or SACK chunk in the same SCTP packet. * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 11 |Chunk Flags | Length = 4 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Chunk Flags: 8 bits * * Set to zero on transmit and ignored on receipt. */ struct sctp_chunk *sctp_make_cookie_ack(const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ACK, 0, 0, GFP_ATOMIC); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [COOKIE ACK back to where the COOKIE ECHO came from.] */ if (retval && chunk && chunk->transport) retval->transport = sctp_assoc_lookup_paddr(asoc, &chunk->transport->ipaddr); return retval; } /* * Appendix A: Explicit Congestion Notification: * CWR: * * RFC 2481 details a specific bit for a sender to send in the header of * its next outbound TCP segment to indicate to its peer that it has * reduced its congestion window. This is termed the CWR bit. For * SCTP the same indication is made by including the CWR chunk. * This chunk contains one data element, i.e. the TSN number that * was sent in the ECNE chunk. This element represents the lowest * TSN number in the datagram that was originally marked with the * CE bit. * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Chunk Type=13 | Flags=00000000| Chunk Length = 8 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Lowest TSN Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Note: The CWR is considered a Control chunk. */ struct sctp_chunk *sctp_make_cwr(const struct sctp_association *asoc, const __u32 lowest_tsn, const struct sctp_chunk *chunk) { struct sctp_chunk *retval; struct sctp_cwrhdr cwr; cwr.lowest_tsn = htonl(lowest_tsn); retval = sctp_make_control(asoc, SCTP_CID_ECN_CWR, 0, sizeof(cwr), GFP_ATOMIC); if (!retval) goto nodata; retval->subh.ecn_cwr_hdr = sctp_addto_chunk(retval, sizeof(cwr), &cwr); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [Report a reduced congestion window back to where the ECNE * came from.] */ if (chunk) retval->transport = chunk->transport; nodata: return retval; } /* Make an ECNE chunk. This is a congestion experienced report. */ struct sctp_chunk *sctp_make_ecne(const struct sctp_association *asoc, const __u32 lowest_tsn) { struct sctp_chunk *retval; struct sctp_ecnehdr ecne; ecne.lowest_tsn = htonl(lowest_tsn); retval = sctp_make_control(asoc, SCTP_CID_ECN_ECNE, 0, sizeof(ecne), GFP_ATOMIC); if (!retval) goto nodata; retval->subh.ecne_hdr = sctp_addto_chunk(retval, sizeof(ecne), &ecne); nodata: return retval; } /* Make a DATA chunk for the given association from the provided * parameters. However, do not populate the data payload. */ struct sctp_chunk *sctp_make_datafrag_empty(const struct sctp_association *asoc, const struct sctp_sndrcvinfo *sinfo, int len, __u8 flags, gfp_t gfp) { struct sctp_chunk *retval; struct sctp_datahdr dp; /* We assign the TSN as LATE as possible, not here when * creating the chunk. */ memset(&dp, 0, sizeof(dp)); dp.ppid = sinfo->sinfo_ppid; dp.stream = htons(sinfo->sinfo_stream); /* Set the flags for an unordered send. */ if (sinfo->sinfo_flags & SCTP_UNORDERED) flags |= SCTP_DATA_UNORDERED; retval = sctp_make_data(asoc, flags, sizeof(dp) + len, gfp); if (!retval) return NULL; retval->subh.data_hdr = sctp_addto_chunk(retval, sizeof(dp), &dp); memcpy(&retval->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo)); return retval; } /* Create a selective ackowledgement (SACK) for the given * association. This reports on which TSN's we've seen to date, * including duplicates and gaps. */ struct sctp_chunk *sctp_make_sack(struct sctp_association *asoc) { struct sctp_tsnmap *map = (struct sctp_tsnmap *)&asoc->peer.tsn_map; struct sctp_gap_ack_block gabs[SCTP_MAX_GABS]; __u16 num_gabs, num_dup_tsns; struct sctp_transport *trans; struct sctp_chunk *retval; struct sctp_sackhdr sack; __u32 ctsn; int len; memset(gabs, 0, sizeof(gabs)); ctsn = sctp_tsnmap_get_ctsn(map); pr_debug("%s: sackCTSNAck sent:0x%x\n", __func__, ctsn); /* How much room is needed in the chunk? */ num_gabs = sctp_tsnmap_num_gabs(map, gabs); num_dup_tsns = sctp_tsnmap_num_dups(map); /* Initialize the SACK header. */ sack.cum_tsn_ack = htonl(ctsn); sack.a_rwnd = htonl(asoc->a_rwnd); sack.num_gap_ack_blocks = htons(num_gabs); sack.num_dup_tsns = htons(num_dup_tsns); len = sizeof(sack) + sizeof(struct sctp_gap_ack_block) * num_gabs + sizeof(__u32) * num_dup_tsns; /* Create the chunk. */ retval = sctp_make_control(asoc, SCTP_CID_SACK, 0, len, GFP_ATOMIC); if (!retval) goto nodata; /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, etc.) to the same destination transport * address from which it received the DATA or control chunk to * which it is replying. This rule should also be followed if * the endpoint is bundling DATA chunks together with the * reply chunk. * * However, when acknowledging multiple DATA chunks received * in packets from different source addresses in a single * SACK, the SACK chunk may be transmitted to one of the * destination transport addresses from which the DATA or * control chunks being acknowledged were received. * * [BUG: We do not implement the following paragraph. * Perhaps we should remember the last transport we used for a * SACK and avoid that (if possible) if we have seen any * duplicates. --piggy] * * When a receiver of a duplicate DATA chunk sends a SACK to a * multi- homed endpoint it MAY be beneficial to vary the * destination address and not use the source address of the * DATA chunk. The reason being that receiving a duplicate * from a multi-homed endpoint might indicate that the return * path (as specified in the source address of the DATA chunk) * for the SACK is broken. * * [Send to the address from which we last received a DATA chunk.] */ retval->transport = asoc->peer.last_data_from; retval->subh.sack_hdr = sctp_addto_chunk(retval, sizeof(sack), &sack); /* Add the gap ack block information. */ if (num_gabs) sctp_addto_chunk(retval, sizeof(__u32) * num_gabs, gabs); /* Add the duplicate TSN information. */ if (num_dup_tsns) { asoc->stats.idupchunks += num_dup_tsns; sctp_addto_chunk(retval, sizeof(__u32) * num_dup_tsns, sctp_tsnmap_get_dups(map)); } /* Once we have a sack generated, check to see what our sack * generation is, if its 0, reset the transports to 0, and reset * the association generation to 1 * * The idea is that zero is never used as a valid generation for the * association so no transport will match after a wrap event like this, * Until the next sack */ if (++asoc->peer.sack_generation == 0) { list_for_each_entry(trans, &asoc->peer.transport_addr_list, transports) trans->sack_generation = 0; asoc->peer.sack_generation = 1; } nodata: return retval; } /* Make a SHUTDOWN chunk. */ struct sctp_chunk *sctp_make_shutdown(const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_shutdownhdr shut; struct sctp_chunk *retval; __u32 ctsn; ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map); shut.cum_tsn_ack = htonl(ctsn); retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN, 0, sizeof(shut), GFP_ATOMIC); if (!retval) goto nodata; retval->subh.shutdown_hdr = sctp_addto_chunk(retval, sizeof(shut), &shut); if (chunk) retval->transport = chunk->transport; nodata: return retval; } struct sctp_chunk *sctp_make_shutdown_ack(const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_ACK, 0, 0, GFP_ATOMIC); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [ACK back to where the SHUTDOWN came from.] */ if (retval && chunk) retval->transport = chunk->transport; return retval; } struct sctp_chunk *sctp_make_shutdown_complete( const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_chunk *retval; __u8 flags = 0; /* Set the T-bit if we have no association (vtag will be * reflected) */ flags |= asoc ? 0 : SCTP_CHUNK_FLAG_T; retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_COMPLETE, flags, 0, GFP_ATOMIC); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [Report SHUTDOWN COMPLETE back to where the SHUTDOWN ACK * came from.] */ if (retval && chunk) retval->transport = chunk->transport; return retval; } /* Create an ABORT. Note that we set the T bit if we have no * association, except when responding to an INIT (sctpimpguide 2.41). */ struct sctp_chunk *sctp_make_abort(const struct sctp_association *asoc, const struct sctp_chunk *chunk, const size_t hint) { struct sctp_chunk *retval; __u8 flags = 0; /* Set the T-bit if we have no association and 'chunk' is not * an INIT (vtag will be reflected). */ if (!asoc) { if (chunk && chunk->chunk_hdr && chunk->chunk_hdr->type == SCTP_CID_INIT) flags = 0; else flags = SCTP_CHUNK_FLAG_T; } retval = sctp_make_control(asoc, SCTP_CID_ABORT, flags, hint, GFP_ATOMIC); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [ABORT back to where the offender came from.] */ if (retval && chunk) retval->transport = chunk->transport; return retval; } /* Helper to create ABORT with a NO_USER_DATA error. */ struct sctp_chunk *sctp_make_abort_no_data( const struct sctp_association *asoc, const struct sctp_chunk *chunk, __u32 tsn) { struct sctp_chunk *retval; __be32 payload; retval = sctp_make_abort(asoc, chunk, sizeof(struct sctp_errhdr) + sizeof(tsn)); if (!retval) goto no_mem; /* Put the tsn back into network byte order. */ payload = htonl(tsn); sctp_init_cause(retval, SCTP_ERROR_NO_DATA, sizeof(payload)); sctp_addto_chunk(retval, sizeof(payload), (const void *)&payload); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [ABORT back to where the offender came from.] */ if (chunk) retval->transport = chunk->transport; no_mem: return retval; } /* Helper to create ABORT with a SCTP_ERROR_USER_ABORT error. */ struct sctp_chunk *sctp_make_abort_user(const struct sctp_association *asoc, struct msghdr *msg, size_t paylen) { struct sctp_chunk *retval; void *payload = NULL; int err; retval = sctp_make_abort(asoc, NULL, sizeof(struct sctp_errhdr) + paylen); if (!retval) goto err_chunk; if (paylen) { /* Put the msg_iov together into payload. */ payload = kmalloc(paylen, GFP_KERNEL); if (!payload) goto err_payload; err = memcpy_from_msg(payload, msg, paylen); if (err < 0) goto err_copy; } sctp_init_cause(retval, SCTP_ERROR_USER_ABORT, paylen); sctp_addto_chunk(retval, paylen, payload); if (paylen) kfree(payload); return retval; err_copy: kfree(payload); err_payload: sctp_chunk_free(retval); retval = NULL; err_chunk: return retval; } /* Append bytes to the end of a parameter. Will panic if chunk is not big * enough. */ static void *sctp_addto_param(struct sctp_chunk *chunk, int len, const void *data) { int chunklen = ntohs(chunk->chunk_hdr->length); void *target; target = skb_put(chunk->skb, len); if (data) memcpy(target, data, len); else memset(target, 0, len); /* Adjust the chunk length field. */ chunk->chunk_hdr->length = htons(chunklen + len); chunk->chunk_end = skb_tail_pointer(chunk->skb); return target; } /* Make an ABORT chunk with a PROTOCOL VIOLATION cause code. */ struct sctp_chunk *sctp_make_abort_violation( const struct sctp_association *asoc, const struct sctp_chunk *chunk, const __u8 *payload, const size_t paylen) { struct sctp_chunk *retval; struct sctp_paramhdr phdr; retval = sctp_make_abort(asoc, chunk, sizeof(struct sctp_errhdr) + paylen + sizeof(phdr)); if (!retval) goto end; sctp_init_cause(retval, SCTP_ERROR_PROTO_VIOLATION, paylen + sizeof(phdr)); phdr.type = htons(chunk->chunk_hdr->type); phdr.length = chunk->chunk_hdr->length; sctp_addto_chunk(retval, paylen, payload); sctp_addto_param(retval, sizeof(phdr), &phdr); end: return retval; } struct sctp_chunk *sctp_make_violation_paramlen( const struct sctp_association *asoc, const struct sctp_chunk *chunk, struct sctp_paramhdr *param) { static const char error[] = "The following parameter had invalid length:"; size_t payload_len = sizeof(error) + sizeof(struct sctp_errhdr) + sizeof(*param); struct sctp_chunk *retval; retval = sctp_make_abort(asoc, chunk, payload_len); if (!retval) goto nodata; sctp_init_cause(retval, SCTP_ERROR_PROTO_VIOLATION, sizeof(error) + sizeof(*param)); sctp_addto_chunk(retval, sizeof(error), error); sctp_addto_param(retval, sizeof(*param), param); nodata: return retval; } struct sctp_chunk *sctp_make_violation_max_retrans( const struct sctp_association *asoc, const struct sctp_chunk *chunk) { static const char error[] = "Association exceeded its max_retrans count"; size_t payload_len = sizeof(error) + sizeof(struct sctp_errhdr); struct sctp_chunk *retval; retval = sctp_make_abort(asoc, chunk, payload_len); if (!retval) goto nodata; sctp_init_cause(retval, SCTP_ERROR_PROTO_VIOLATION, sizeof(error)); sctp_addto_chunk(retval, sizeof(error), error); nodata: return retval; } struct sctp_chunk *sctp_make_new_encap_port(const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_new_encap_port_hdr nep; struct sctp_chunk *retval; retval = sctp_make_abort(asoc, chunk, sizeof(struct sctp_errhdr) + sizeof(nep)); if (!retval) goto nodata; sctp_init_cause(retval, SCTP_ERROR_NEW_ENCAP_PORT, sizeof(nep)); nep.cur_port = SCTP_INPUT_CB(chunk->skb)->encap_port; nep.new_port = chunk->transport->encap_port; sctp_addto_chunk(retval, sizeof(nep), &nep); nodata: return retval; } /* Make a HEARTBEAT chunk. */ struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc, const struct sctp_transport *transport, __u32 probe_size) { struct sctp_sender_hb_info hbinfo = {}; struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_HEARTBEAT, 0, sizeof(hbinfo), GFP_ATOMIC); if (!retval) goto nodata; hbinfo.param_hdr.type = SCTP_PARAM_HEARTBEAT_INFO; hbinfo.param_hdr.length = htons(sizeof(hbinfo)); hbinfo.daddr = transport->ipaddr; hbinfo.sent_at = jiffies; hbinfo.hb_nonce = transport->hb_nonce; hbinfo.probe_size = probe_size; /* Cast away the 'const', as this is just telling the chunk * what transport it belongs to. */ retval->transport = (struct sctp_transport *) transport; retval->subh.hbs_hdr = sctp_addto_chunk(retval, sizeof(hbinfo), &hbinfo); retval->pmtu_probe = !!probe_size; nodata: return retval; } struct sctp_chunk *sctp_make_heartbeat_ack(const struct sctp_association *asoc, const struct sctp_chunk *chunk, const void *payload, const size_t paylen) { struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_HEARTBEAT_ACK, 0, paylen, GFP_ATOMIC); if (!retval) goto nodata; retval->subh.hbs_hdr = sctp_addto_chunk(retval, paylen, payload); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [HBACK back to where the HEARTBEAT came from.] */ if (chunk) retval->transport = chunk->transport; nodata: return retval; } /* RFC4820 3. Padding Chunk (PAD) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 0x84 | Flags=0 | Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | | * \ Padding Data / * / \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_pad(const struct sctp_association *asoc, int len) { struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_PAD, 0, len, GFP_ATOMIC); if (!retval) return NULL; skb_put_zero(retval->skb, len); retval->chunk_hdr->length = htons(ntohs(retval->chunk_hdr->length) + len); retval->chunk_end = skb_tail_pointer(retval->skb); return retval; } /* Create an Operation Error chunk with the specified space reserved. * This routine can be used for containing multiple causes in the chunk. */ static struct sctp_chunk *sctp_make_op_error_space( const struct sctp_association *asoc, const struct sctp_chunk *chunk, size_t size) { struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_ERROR, 0, sizeof(struct sctp_errhdr) + size, GFP_ATOMIC); if (!retval) goto nodata; /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, etc.) to the same destination transport * address from which it received the DATA or control chunk * to which it is replying. * */ if (chunk) retval->transport = chunk->transport; nodata: return retval; } /* Create an Operation Error chunk of a fixed size, specifically, * min(asoc->pathmtu, SCTP_DEFAULT_MAXSEGMENT) - overheads. * This is a helper function to allocate an error chunk for those * invalid parameter codes in which we may not want to report all the * errors, if the incoming chunk is large. If it can't fit in a single * packet, we ignore it. */ static inline struct sctp_chunk *sctp_make_op_error_limited( const struct sctp_association *asoc, const struct sctp_chunk *chunk) { size_t size = SCTP_DEFAULT_MAXSEGMENT; struct sctp_sock *sp = NULL; if (asoc) { size = min_t(size_t, size, asoc->pathmtu); sp = sctp_sk(asoc->base.sk); } size = sctp_mtu_payload(sp, size, sizeof(struct sctp_errhdr)); return sctp_make_op_error_space(asoc, chunk, size); } /* Create an Operation Error chunk. */ struct sctp_chunk *sctp_make_op_error(const struct sctp_association *asoc, const struct sctp_chunk *chunk, __be16 cause_code, const void *payload, size_t paylen, size_t reserve_tail) { struct sctp_chunk *retval; retval = sctp_make_op_error_space(asoc, chunk, paylen + reserve_tail); if (!retval) goto nodata; sctp_init_cause(retval, cause_code, paylen + reserve_tail); sctp_addto_chunk(retval, paylen, payload); if (reserve_tail) sctp_addto_param(retval, reserve_tail, NULL); nodata: return retval; } struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc, __u16 key_id) { struct sctp_authhdr auth_hdr; struct sctp_hmac *hmac_desc; struct sctp_chunk *retval; /* Get the first hmac that the peer told us to use */ hmac_desc = sctp_auth_asoc_get_hmac(asoc); if (unlikely(!hmac_desc)) return NULL; retval = sctp_make_control(asoc, SCTP_CID_AUTH, 0, hmac_desc->hmac_len + sizeof(auth_hdr), GFP_ATOMIC); if (!retval) return NULL; auth_hdr.hmac_id = htons(hmac_desc->hmac_id); auth_hdr.shkey_id = htons(key_id); retval->subh.auth_hdr = sctp_addto_chunk(retval, sizeof(auth_hdr), &auth_hdr); skb_put_zero(retval->skb, hmac_desc->hmac_len); /* Adjust the chunk header to include the empty MAC */ retval->chunk_hdr->length = htons(ntohs(retval->chunk_hdr->length) + hmac_desc->hmac_len); retval->chunk_end = skb_tail_pointer(retval->skb); return retval; } /******************************************************************** * 2nd Level Abstractions ********************************************************************/ /* Turn an skb into a chunk. * FIXME: Eventually move the structure directly inside the skb->cb[]. * * sctpimpguide-05.txt Section 2.8.2 * M1) Each time a new DATA chunk is transmitted * set the 'TSN.Missing.Report' count for that TSN to 0. The * 'TSN.Missing.Report' count will be used to determine missing chunks * and when to fast retransmit. * */ struct sctp_chunk *sctp_chunkify(struct sk_buff *skb, const struct sctp_association *asoc, struct sock *sk, gfp_t gfp) { struct sctp_chunk *retval; retval = kmem_cache_zalloc(sctp_chunk_cachep, gfp); if (!retval) goto nodata; if (!sk) pr_debug("%s: chunkifying skb:%p w/o an sk\n", __func__, skb); INIT_LIST_HEAD(&retval->list); retval->skb = skb; retval->asoc = (struct sctp_association *)asoc; retval->singleton = 1; retval->fast_retransmit = SCTP_CAN_FRTX; /* Polish the bead hole. */ INIT_LIST_HEAD(&retval->transmitted_list); INIT_LIST_HEAD(&retval->frag_list); SCTP_DBG_OBJCNT_INC(chunk); refcount_set(&retval->refcnt, 1); nodata: return retval; } /* Set chunk->source and dest based on the IP header in chunk->skb. */ void sctp_init_addrs(struct sctp_chunk *chunk, union sctp_addr *src, union sctp_addr *dest) { memcpy(&chunk->source, src, sizeof(union sctp_addr)); memcpy(&chunk->dest, dest, sizeof(union sctp_addr)); } /* Extract the source address from a chunk. */ const union sctp_addr *sctp_source(const struct sctp_chunk *chunk) { /* If we have a known transport, use that. */ if (chunk->transport) { return &chunk->transport->ipaddr; } else { /* Otherwise, extract it from the IP header. */ return &chunk->source; } } /* Create a new chunk, setting the type and flags headers from the * arguments, reserving enough space for a 'paylen' byte payload. */ static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc, __u8 type, __u8 flags, int paylen, gfp_t gfp) { struct sctp_chunkhdr *chunk_hdr; struct sctp_chunk *retval; struct sk_buff *skb; struct sock *sk; int chunklen; chunklen = SCTP_PAD4(sizeof(*chunk_hdr) + paylen); if (chunklen > SCTP_MAX_CHUNK_LEN) goto nodata; /* No need to allocate LL here, as this is only a chunk. */ skb = alloc_skb(chunklen, gfp); if (!skb) goto nodata; /* Make room for the chunk header. */ chunk_hdr = (struct sctp_chunkhdr *)skb_put(skb, sizeof(*chunk_hdr)); chunk_hdr->type = type; chunk_hdr->flags = flags; chunk_hdr->length = htons(sizeof(*chunk_hdr)); sk = asoc ? asoc->base.sk : NULL; retval = sctp_chunkify(skb, asoc, sk, gfp); if (!retval) { kfree_skb(skb); goto nodata; } retval->chunk_hdr = chunk_hdr; retval->chunk_end = ((__u8 *)chunk_hdr) + sizeof(*chunk_hdr); /* Determine if the chunk needs to be authenticated */ if (sctp_auth_send_cid(type, asoc)) retval->auth = 1; return retval; nodata: return NULL; } static struct sctp_chunk *sctp_make_data(const struct sctp_association *asoc, __u8 flags, int paylen, gfp_t gfp) { return _sctp_make_chunk(asoc, SCTP_CID_DATA, flags, paylen, gfp); } struct sctp_chunk *sctp_make_idata(const struct sctp_association *asoc, __u8 flags, int paylen, gfp_t gfp) { return _sctp_make_chunk(asoc, SCTP_CID_I_DATA, flags, paylen, gfp); } static struct sctp_chunk *sctp_make_control(const struct sctp_association *asoc, __u8 type, __u8 flags, int paylen, gfp_t gfp) { struct sctp_chunk *chunk; chunk = _sctp_make_chunk(asoc, type, flags, paylen, gfp); if (chunk) sctp_control_set_owner_w(chunk); return chunk; } /* Release the memory occupied by a chunk. */ static void sctp_chunk_destroy(struct sctp_chunk *chunk) { BUG_ON(!list_empty(&chunk->list)); list_del_init(&chunk->transmitted_list); consume_skb(chunk->skb); consume_skb(chunk->auth_chunk); SCTP_DBG_OBJCNT_DEC(chunk); kmem_cache_free(sctp_chunk_cachep, chunk); } /* Possibly, free the chunk. */ void sctp_chunk_free(struct sctp_chunk *chunk) { /* Release our reference on the message tracker. */ if (chunk->msg) sctp_datamsg_put(chunk->msg); sctp_chunk_put(chunk); } /* Grab a reference to the chunk. */ void sctp_chunk_hold(struct sctp_chunk *ch) { refcount_inc(&ch->refcnt); } /* Release a reference to the chunk. */ void sctp_chunk_put(struct sctp_chunk *ch) { if (refcount_dec_and_test(&ch->refcnt)) sctp_chunk_destroy(ch); } /* Append bytes to the end of a chunk. Will panic if chunk is not big * enough. */ void *sctp_addto_chunk(struct sctp_chunk *chunk, int len, const void *data) { int chunklen = ntohs(chunk->chunk_hdr->length); int padlen = SCTP_PAD4(chunklen) - chunklen; void *target; skb_put_zero(chunk->skb, padlen); target = skb_put_data(chunk->skb, data, len); /* Adjust the chunk length field. */ chunk->chunk_hdr->length = htons(chunklen + padlen + len); chunk->chunk_end = skb_tail_pointer(chunk->skb); return target; } /* Append bytes from user space to the end of a chunk. Will panic if * chunk is not big enough. * Returns a kernel err value. */ int sctp_user_addto_chunk(struct sctp_chunk *chunk, int len, struct iov_iter *from) { void *target; /* Make room in chunk for data. */ target = skb_put(chunk->skb, len); /* Copy data (whole iovec) into chunk */ if (!copy_from_iter_full(target, len, from)) return -EFAULT; /* Adjust the chunk length field. */ chunk->chunk_hdr->length = htons(ntohs(chunk->chunk_hdr->length) + len); chunk->chunk_end = skb_tail_pointer(chunk->skb); return 0; } /* Helper function to assign a TSN if needed. This assumes that both * the data_hdr and association have already been assigned. */ void sctp_chunk_assign_ssn(struct sctp_chunk *chunk) { struct sctp_stream *stream; struct sctp_chunk *lchunk; struct sctp_datamsg *msg; __u16 ssn, sid; if (chunk->has_ssn) return; /* All fragments will be on the same stream */ sid = ntohs(chunk->subh.data_hdr->stream); stream = &chunk->asoc->stream; /* Now assign the sequence number to the entire message. * All fragments must have the same stream sequence number. */ msg = chunk->msg; list_for_each_entry(lchunk, &msg->chunks, frag_list) { if (lchunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) { ssn = 0; } else { if (lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG) ssn = sctp_ssn_next(stream, out, sid); else ssn = sctp_ssn_peek(stream, out, sid); } lchunk->subh.data_hdr->ssn = htons(ssn); lchunk->has_ssn = 1; } } /* Helper function to assign a TSN if needed. This assumes that both * the data_hdr and association have already been assigned. */ void sctp_chunk_assign_tsn(struct sctp_chunk *chunk) { if (!chunk->has_tsn) { /* This is the last possible instant to * assign a TSN. */ chunk->subh.data_hdr->tsn = htonl(sctp_association_get_next_tsn(chunk->asoc)); chunk->has_tsn = 1; } } /* Create a CLOSED association to use with an incoming packet. */ struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep, struct sctp_chunk *chunk, gfp_t gfp) { struct sctp_association *asoc; enum sctp_scope scope; struct sk_buff *skb; /* Create the bare association. */ scope = sctp_scope(sctp_source(chunk)); asoc = sctp_association_new(ep, ep->base.sk, scope, gfp); if (!asoc) goto nodata; asoc->temp = 1; skb = chunk->skb; /* Create an entry for the source address of the packet. */ SCTP_INPUT_CB(skb)->af->from_skb(&asoc->c.peer_addr, skb, 1); nodata: return asoc; } /* Build a cookie representing asoc. * This INCLUDES the param header needed to put the cookie in the INIT ACK. */ static struct sctp_cookie_param *sctp_pack_cookie( const struct sctp_endpoint *ep, const struct sctp_association *asoc, const struct sctp_chunk *init_chunk, int *cookie_len, const __u8 *raw_addrs, int addrs_len) { struct sctp_signed_cookie *cookie; struct sctp_cookie_param *retval; int headersize, bodysize; /* Header size is static data prior to the actual cookie, including * any padding. */ headersize = sizeof(struct sctp_paramhdr) + (sizeof(struct sctp_signed_cookie) - sizeof(struct sctp_cookie)); bodysize = sizeof(struct sctp_cookie) + ntohs(init_chunk->chunk_hdr->length) + addrs_len; /* Pad out the cookie to a multiple to make the signature * functions simpler to write. */ if (bodysize % SCTP_COOKIE_MULTIPLE) bodysize += SCTP_COOKIE_MULTIPLE - (bodysize % SCTP_COOKIE_MULTIPLE); *cookie_len = headersize + bodysize; /* Clear this memory since we are sending this data structure * out on the network. */ retval = kzalloc(*cookie_len, GFP_ATOMIC); if (!retval) goto nodata; cookie = (struct sctp_signed_cookie *) retval->body; /* Set up the parameter header. */ retval->p.type = SCTP_PARAM_STATE_COOKIE; retval->p.length = htons(*cookie_len); /* Copy the cookie part of the association itself. */ cookie->c = asoc->c; /* Save the raw address list length in the cookie. */ cookie->c.raw_addr_list_len = addrs_len; /* Remember PR-SCTP capability. */ cookie->c.prsctp_capable = asoc->peer.prsctp_capable; /* Save adaptation indication in the cookie. */ cookie->c.adaptation_ind = asoc->peer.adaptation_ind; /* Set an expiration time for the cookie. */ cookie->c.expiration = ktime_add(asoc->cookie_life, ktime_get_real()); /* Copy the peer's init packet. */ memcpy(cookie + 1, init_chunk->chunk_hdr, ntohs(init_chunk->chunk_hdr->length)); /* Copy the raw local address list of the association. */ memcpy((__u8 *)(cookie + 1) + ntohs(init_chunk->chunk_hdr->length), raw_addrs, addrs_len); if (sctp_sk(ep->base.sk)->hmac) { struct crypto_shash *tfm = sctp_sk(ep->base.sk)->hmac; int err; /* Sign the message. */ err = crypto_shash_setkey(tfm, ep->secret_key, sizeof(ep->secret_key)) ?: crypto_shash_tfm_digest(tfm, (u8 *)&cookie->c, bodysize, cookie->signature); if (err) goto free_cookie; } return retval; free_cookie: kfree(retval); nodata: *cookie_len = 0; return NULL; } /* Unpack the cookie from COOKIE ECHO chunk, recreating the association. */ struct sctp_association *sctp_unpack_cookie( const struct sctp_endpoint *ep, const struct sctp_association *asoc, struct sctp_chunk *chunk, gfp_t gfp, int *error, struct sctp_chunk **errp) { struct sctp_association *retval = NULL; int headersize, bodysize, fixed_size; struct sctp_signed_cookie *cookie; struct sk_buff *skb = chunk->skb; struct sctp_cookie *bear_cookie; __u8 *digest = ep->digest; enum sctp_scope scope; unsigned int len; ktime_t kt; /* Header size is static data prior to the actual cookie, including * any padding. */ headersize = sizeof(struct sctp_chunkhdr) + (sizeof(struct sctp_signed_cookie) - sizeof(struct sctp_cookie)); bodysize = ntohs(chunk->chunk_hdr->length) - headersize; fixed_size = headersize + sizeof(struct sctp_cookie); /* Verify that the chunk looks like it even has a cookie. * There must be enough room for our cookie and our peer's * INIT chunk. */ len = ntohs(chunk->chunk_hdr->length); if (len < fixed_size + sizeof(struct sctp_chunkhdr)) goto malformed; /* Verify that the cookie has been padded out. */ if (bodysize % SCTP_COOKIE_MULTIPLE) goto malformed; /* Process the cookie. */ cookie = chunk->subh.cookie_hdr; bear_cookie = &cookie->c; if (!sctp_sk(ep->base.sk)->hmac) goto no_hmac; /* Check the signature. */ { struct crypto_shash *tfm = sctp_sk(ep->base.sk)->hmac; int err; err = crypto_shash_setkey(tfm, ep->secret_key, sizeof(ep->secret_key)) ?: crypto_shash_tfm_digest(tfm, (u8 *)bear_cookie, bodysize, digest); if (err) { *error = -SCTP_IERROR_NOMEM; goto fail; } } if (memcmp(digest, cookie->signature, SCTP_SIGNATURE_SIZE)) { *error = -SCTP_IERROR_BAD_SIG; goto fail; } no_hmac: /* IG Section 2.35.2: * 3) Compare the port numbers and the verification tag contained * within the COOKIE ECHO chunk to the actual port numbers and the * verification tag within the SCTP common header of the received * packet. If these values do not match the packet MUST be silently * discarded, */ if (ntohl(chunk->sctp_hdr->vtag) != bear_cookie->my_vtag) { *error = -SCTP_IERROR_BAD_TAG; goto fail; } if (chunk->sctp_hdr->source != bear_cookie->peer_addr.v4.sin_port || ntohs(chunk->sctp_hdr->dest) != bear_cookie->my_port) { *error = -SCTP_IERROR_BAD_PORTS; goto fail; } /* Check to see if the cookie is stale. If there is already * an association, there is no need to check cookie's expiration * for init collision case of lost COOKIE ACK. * If skb has been timestamped, then use the stamp, otherwise * use current time. This introduces a small possibility that * a cookie may be considered expired, but this would only slow * down the new association establishment instead of every packet. */ if (sock_flag(ep->base.sk, SOCK_TIMESTAMP)) kt = skb_get_ktime(skb); else kt = ktime_get_real(); if (!asoc && ktime_before(bear_cookie->expiration, kt)) { suseconds_t usecs = ktime_to_us(ktime_sub(kt, bear_cookie->expiration)); __be32 n = htonl(usecs); /* * Section 3.3.10.3 Stale Cookie Error (3) * * Cause of error * --------------- * Stale Cookie Error: Indicates the receipt of a valid State * Cookie that has expired. */ *errp = sctp_make_op_error(asoc, chunk, SCTP_ERROR_STALE_COOKIE, &n, sizeof(n), 0); if (*errp) *error = -SCTP_IERROR_STALE_COOKIE; else *error = -SCTP_IERROR_NOMEM; goto fail; } /* Make a new base association. */ scope = sctp_scope(sctp_source(chunk)); retval = sctp_association_new(ep, ep->base.sk, scope, gfp); if (!retval) { *error = -SCTP_IERROR_NOMEM; goto fail; } /* Set up our peer's port number. */ retval->peer.port = ntohs(chunk->sctp_hdr->source); /* Populate the association from the cookie. */ memcpy(&retval->c, bear_cookie, sizeof(*bear_cookie)); if (sctp_assoc_set_bind_addr_from_cookie(retval, bear_cookie, GFP_ATOMIC) < 0) { *error = -SCTP_IERROR_NOMEM; goto fail; } /* Also, add the destination address. */ if (list_empty(&retval->base.bind_addr.address_list)) { sctp_add_bind_addr(&retval->base.bind_addr, &chunk->dest, sizeof(chunk->dest), SCTP_ADDR_SRC, GFP_ATOMIC); } retval->next_tsn = retval->c.initial_tsn; retval->ctsn_ack_point = retval->next_tsn - 1; retval->addip_serial = retval->c.initial_tsn; retval->strreset_outseq = retval->c.initial_tsn; retval->adv_peer_ack_point = retval->ctsn_ack_point; retval->peer.prsctp_capable = retval->c.prsctp_capable; retval->peer.adaptation_ind = retval->c.adaptation_ind; /* The INIT stuff will be done by the side effects. */ return retval; fail: if (retval) sctp_association_free(retval); return NULL; malformed: /* Yikes! The packet is either corrupt or deliberately * malformed. */ *error = -SCTP_IERROR_MALFORMED; goto fail; } /******************************************************************** * 3rd Level Abstractions ********************************************************************/ struct __sctp_missing { __be32 num_missing; __be16 type; } __packed; /* * Report a missing mandatory parameter. */ static int sctp_process_missing_param(const struct sctp_association *asoc, enum sctp_param paramtype, struct sctp_chunk *chunk, struct sctp_chunk **errp) { struct __sctp_missing report; __u16 len; len = SCTP_PAD4(sizeof(report)); /* Make an ERROR chunk, preparing enough room for * returning multiple unknown parameters. */ if (!*errp) *errp = sctp_make_op_error_space(asoc, chunk, len); if (*errp) { report.num_missing = htonl(1); report.type = paramtype; sctp_init_cause(*errp, SCTP_ERROR_MISS_PARAM, sizeof(report)); sctp_addto_chunk(*errp, sizeof(report), &report); } /* Stop processing this chunk. */ return 0; } /* Report an Invalid Mandatory Parameter. */ static int sctp_process_inv_mandatory(const struct sctp_association *asoc, struct sctp_chunk *chunk, struct sctp_chunk **errp) { /* Invalid Mandatory Parameter Error has no payload. */ if (!*errp) *errp = sctp_make_op_error_space(asoc, chunk, 0); if (*errp) sctp_init_cause(*errp, SCTP_ERROR_INV_PARAM, 0); /* Stop processing this chunk. */ return 0; } static int sctp_process_inv_paramlength(const struct sctp_association *asoc, struct sctp_paramhdr *param, const struct sctp_chunk *chunk, struct sctp_chunk **errp) { /* This is a fatal error. Any accumulated non-fatal errors are * not reported. */ if (*errp) sctp_chunk_free(*errp); /* Create an error chunk and fill it in with our payload. */ *errp = sctp_make_violation_paramlen(asoc, chunk, param); return 0; } /* Do not attempt to handle the HOST_NAME parm. However, do * send back an indicator to the peer. */ static int sctp_process_hn_param(const struct sctp_association *asoc, union sctp_params param, struct sctp_chunk *chunk, struct sctp_chunk **errp) { __u16 len = ntohs(param.p->length); /* Processing of the HOST_NAME parameter will generate an * ABORT. If we've accumulated any non-fatal errors, they * would be unrecognized parameters and we should not include * them in the ABORT. */ if (*errp) sctp_chunk_free(*errp); *errp = sctp_make_op_error(asoc, chunk, SCTP_ERROR_DNS_FAILED, param.v, len, 0); /* Stop processing this chunk. */ return 0; } static int sctp_verify_ext_param(struct net *net, const struct sctp_endpoint *ep, union sctp_params param) { __u16 num_ext = ntohs(param.p->length) - sizeof(struct sctp_paramhdr); int have_asconf = 0; int have_auth = 0; int i; for (i = 0; i < num_ext; i++) { switch (param.ext->chunks[i]) { case SCTP_CID_AUTH: have_auth = 1; break; case SCTP_CID_ASCONF: case SCTP_CID_ASCONF_ACK: have_asconf = 1; break; } } /* ADD-IP Security: The draft requires us to ABORT or ignore the * INIT/INIT-ACK if ADD-IP is listed, but AUTH is not. Do this * only if ADD-IP is turned on and we are not backward-compatible * mode. */ if (net->sctp.addip_noauth) return 1; if (ep->asconf_enable && !have_auth && have_asconf) return 0; return 1; } static void sctp_process_ext_param(struct sctp_association *asoc, union sctp_params param) { __u16 num_ext = ntohs(param.p->length) - sizeof(struct sctp_paramhdr); int i; for (i = 0; i < num_ext; i++) { switch (param.ext->chunks[i]) { case SCTP_CID_RECONF: if (asoc->ep->reconf_enable) asoc->peer.reconf_capable = 1; break; case SCTP_CID_FWD_TSN: if (asoc->ep->prsctp_enable) asoc->peer.prsctp_capable = 1; break; case SCTP_CID_AUTH: /* if the peer reports AUTH, assume that he * supports AUTH. */ if (asoc->ep->auth_enable) asoc->peer.auth_capable = 1; break; case SCTP_CID_ASCONF: case SCTP_CID_ASCONF_ACK: if (asoc->ep->asconf_enable) asoc->peer.asconf_capable = 1; break; case SCTP_CID_I_DATA: if (asoc->ep->intl_enable) asoc->peer.intl_capable = 1; break; default: break; } } } /* RFC 3.2.1 & the Implementers Guide 2.2. * * The Parameter Types are encoded such that the * highest-order two bits specify the action that must be * taken if the processing endpoint does not recognize the * Parameter Type. * * 00 - Stop processing this parameter; do not process any further * parameters within this chunk * * 01 - Stop processing this parameter, do not process any further * parameters within this chunk, and report the unrecognized * parameter in an 'Unrecognized Parameter' ERROR chunk. * * 10 - Skip this parameter and continue processing. * * 11 - Skip this parameter and continue processing but * report the unrecognized parameter in an * 'Unrecognized Parameter' ERROR chunk. * * Return value: * SCTP_IERROR_NO_ERROR - continue with the chunk * SCTP_IERROR_ERROR - stop and report an error. * SCTP_IERROR_NOMEME - out of memory. */ static enum sctp_ierror sctp_process_unk_param( const struct sctp_association *asoc, union sctp_params param, struct sctp_chunk *chunk, struct sctp_chunk **errp) { int retval = SCTP_IERROR_NO_ERROR; switch (param.p->type & SCTP_PARAM_ACTION_MASK) { case SCTP_PARAM_ACTION_DISCARD: retval = SCTP_IERROR_ERROR; break; case SCTP_PARAM_ACTION_SKIP: break; case SCTP_PARAM_ACTION_DISCARD_ERR: retval = SCTP_IERROR_ERROR; fallthrough; case SCTP_PARAM_ACTION_SKIP_ERR: /* Make an ERROR chunk, preparing enough room for * returning multiple unknown parameters. */ if (!*errp) { *errp = sctp_make_op_error_limited(asoc, chunk); if (!*errp) { /* If there is no memory for generating the * ERROR report as specified, an ABORT will be * triggered to the peer and the association * won't be established. */ retval = SCTP_IERROR_NOMEM; break; } } if (!sctp_init_cause(*errp, SCTP_ERROR_UNKNOWN_PARAM, ntohs(param.p->length))) sctp_addto_chunk(*errp, ntohs(param.p->length), param.v); break; default: break; } return retval; } /* Verify variable length parameters * Return values: * SCTP_IERROR_ABORT - trigger an ABORT * SCTP_IERROR_NOMEM - out of memory (abort) * SCTP_IERROR_ERROR - stop processing, trigger an ERROR * SCTP_IERROR_NO_ERROR - continue with the chunk */ static enum sctp_ierror sctp_verify_param(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, union sctp_params param, enum sctp_cid cid, struct sctp_chunk *chunk, struct sctp_chunk **err_chunk) { struct sctp_hmac_algo_param *hmacs; int retval = SCTP_IERROR_NO_ERROR; __u16 n_elt, id = 0; int i; /* FIXME - This routine is not looking at each parameter per the * chunk type, i.e., unrecognized parameters should be further * identified based on the chunk id. */ switch (param.p->type) { case SCTP_PARAM_IPV4_ADDRESS: case SCTP_PARAM_IPV6_ADDRESS: case SCTP_PARAM_COOKIE_PRESERVATIVE: case SCTP_PARAM_SUPPORTED_ADDRESS_TYPES: case SCTP_PARAM_STATE_COOKIE: case SCTP_PARAM_HEARTBEAT_INFO: case SCTP_PARAM_UNRECOGNIZED_PARAMETERS: case SCTP_PARAM_ECN_CAPABLE: case SCTP_PARAM_ADAPTATION_LAYER_IND: break; case SCTP_PARAM_SUPPORTED_EXT: if (!sctp_verify_ext_param(net, ep, param)) return SCTP_IERROR_ABORT; break; case SCTP_PARAM_SET_PRIMARY: if (!ep->asconf_enable) goto unhandled; if (ntohs(param.p->length) < sizeof(struct sctp_addip_param) + sizeof(struct sctp_paramhdr)) { sctp_process_inv_paramlength(asoc, param.p, chunk, err_chunk); retval = SCTP_IERROR_ABORT; } break; case SCTP_PARAM_HOST_NAME_ADDRESS: /* This param has been Deprecated, send ABORT. */ sctp_process_hn_param(asoc, param, chunk, err_chunk); retval = SCTP_IERROR_ABORT; break; case SCTP_PARAM_FWD_TSN_SUPPORT: if (ep->prsctp_enable) break; goto unhandled; case SCTP_PARAM_RANDOM: if (!ep->auth_enable) goto unhandled; /* SCTP-AUTH: Secion 6.1 * If the random number is not 32 byte long the association * MUST be aborted. The ABORT chunk SHOULD contain the error * cause 'Protocol Violation'. */ if (SCTP_AUTH_RANDOM_LENGTH != ntohs(param.p->length) - sizeof(struct sctp_paramhdr)) { sctp_process_inv_paramlength(asoc, param.p, chunk, err_chunk); retval = SCTP_IERROR_ABORT; } break; case SCTP_PARAM_CHUNKS: if (!ep->auth_enable) goto unhandled; /* SCTP-AUTH: Section 3.2 * The CHUNKS parameter MUST be included once in the INIT or * INIT-ACK chunk if the sender wants to receive authenticated * chunks. Its maximum length is 260 bytes. */ if (260 < ntohs(param.p->length)) { sctp_process_inv_paramlength(asoc, param.p, chunk, err_chunk); retval = SCTP_IERROR_ABORT; } break; case SCTP_PARAM_HMAC_ALGO: if (!ep->auth_enable) goto unhandled; hmacs = (struct sctp_hmac_algo_param *)param.p; n_elt = (ntohs(param.p->length) - sizeof(struct sctp_paramhdr)) >> 1; /* SCTP-AUTH: Section 6.1 * The HMAC algorithm based on SHA-1 MUST be supported and * included in the HMAC-ALGO parameter. */ for (i = 0; i < n_elt; i++) { id = ntohs(hmacs->hmac_ids[i]); if (id == SCTP_AUTH_HMAC_ID_SHA1) break; } if (id != SCTP_AUTH_HMAC_ID_SHA1) { sctp_process_inv_paramlength(asoc, param.p, chunk, err_chunk); retval = SCTP_IERROR_ABORT; } break; unhandled: default: pr_debug("%s: unrecognized param:%d for chunk:%d\n", __func__, ntohs(param.p->type), cid); retval = sctp_process_unk_param(asoc, param, chunk, err_chunk); break; } return retval; } /* Verify the INIT packet before we process it. */ int sctp_verify_init(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, enum sctp_cid cid, struct sctp_init_chunk *peer_init, struct sctp_chunk *chunk, struct sctp_chunk **errp) { union sctp_params param; bool has_cookie = false; int result; /* Check for missing mandatory parameters. Note: Initial TSN is * also mandatory, but is not checked here since the valid range * is 0..2**32-1. RFC4960, section 3.3.3. */ if (peer_init->init_hdr.num_outbound_streams == 0 || peer_init->init_hdr.num_inbound_streams == 0 || peer_init->init_hdr.init_tag == 0 || ntohl(peer_init->init_hdr.a_rwnd) < SCTP_DEFAULT_MINWINDOW) return sctp_process_inv_mandatory(asoc, chunk, errp); sctp_walk_params(param, peer_init) { if (param.p->type == SCTP_PARAM_STATE_COOKIE) has_cookie = true; } /* There is a possibility that a parameter length was bad and * in that case we would have stoped walking the parameters. * The current param.p would point at the bad one. * Current consensus on the mailing list is to generate a PROTOCOL * VIOLATION error. We build the ERROR chunk here and let the normal * error handling code build and send the packet. */ if (param.v != (void *)chunk->chunk_end) return sctp_process_inv_paramlength(asoc, param.p, chunk, errp); /* The only missing mandatory param possible today is * the state cookie for an INIT-ACK chunk. */ if ((SCTP_CID_INIT_ACK == cid) && !has_cookie) return sctp_process_missing_param(asoc, SCTP_PARAM_STATE_COOKIE, chunk, errp); /* Verify all the variable length parameters */ sctp_walk_params(param, peer_init) { result = sctp_verify_param(net, ep, asoc, param, cid, chunk, errp); switch (result) { case SCTP_IERROR_ABORT: case SCTP_IERROR_NOMEM: return 0; case SCTP_IERROR_ERROR: return 1; case SCTP_IERROR_NO_ERROR: default: break; } } /* for (loop through all parameters) */ return 1; } /* Unpack the parameters in an INIT packet into an association. * Returns 0 on failure, else success. * FIXME: This is an association method. */ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk, const union sctp_addr *peer_addr, struct sctp_init_chunk *peer_init, gfp_t gfp) { struct sctp_transport *transport; struct list_head *pos, *temp; union sctp_params param; union sctp_addr addr; struct sctp_af *af; int src_match = 0; /* We must include the address that the INIT packet came from. * This is the only address that matters for an INIT packet. * When processing a COOKIE ECHO, we retrieve the from address * of the INIT from the cookie. */ /* This implementation defaults to making the first transport * added as the primary transport. The source address seems to * be a better choice than any of the embedded addresses. */ asoc->encap_port = SCTP_INPUT_CB(chunk->skb)->encap_port; if (!sctp_assoc_add_peer(asoc, peer_addr, gfp, SCTP_ACTIVE)) goto nomem; if (sctp_cmp_addr_exact(sctp_source(chunk), peer_addr)) src_match = 1; /* Process the initialization parameters. */ sctp_walk_params(param, peer_init) { if (!src_match && (param.p->type == SCTP_PARAM_IPV4_ADDRESS || param.p->type == SCTP_PARAM_IPV6_ADDRESS)) { af = sctp_get_af_specific(param_type2af(param.p->type)); if (!af->from_addr_param(&addr, param.addr, chunk->sctp_hdr->source, 0)) continue; if (sctp_cmp_addr_exact(sctp_source(chunk), &addr)) src_match = 1; } if (!sctp_process_param(asoc, param, peer_addr, gfp)) goto clean_up; } /* source address of chunk may not match any valid address */ if (!src_match) goto clean_up; /* AUTH: After processing the parameters, make sure that we * have all the required info to potentially do authentications. */ if (asoc->peer.auth_capable && (!asoc->peer.peer_random || !asoc->peer.peer_hmacs)) asoc->peer.auth_capable = 0; /* In a non-backward compatible mode, if the peer claims * support for ADD-IP but not AUTH, the ADD-IP spec states * that we MUST ABORT the association. Section 6. The section * also give us an option to silently ignore the packet, which * is what we'll do here. */ if (!asoc->base.net->sctp.addip_noauth && (asoc->peer.asconf_capable && !asoc->peer.auth_capable)) { asoc->peer.addip_disabled_mask |= (SCTP_PARAM_ADD_IP | SCTP_PARAM_DEL_IP | SCTP_PARAM_SET_PRIMARY); asoc->peer.asconf_capable = 0; goto clean_up; } /* Walk list of transports, removing transports in the UNKNOWN state. */ list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { transport = list_entry(pos, struct sctp_transport, transports); if (transport->state == SCTP_UNKNOWN) { sctp_assoc_rm_peer(asoc, transport); } } /* The fixed INIT headers are always in network byte * order. */ asoc->peer.i.init_tag = ntohl(peer_init->init_hdr.init_tag); asoc->peer.i.a_rwnd = ntohl(peer_init->init_hdr.a_rwnd); asoc->peer.i.num_outbound_streams = ntohs(peer_init->init_hdr.num_outbound_streams); asoc->peer.i.num_inbound_streams = ntohs(peer_init->init_hdr.num_inbound_streams); asoc->peer.i.initial_tsn = ntohl(peer_init->init_hdr.initial_tsn); asoc->strreset_inseq = asoc->peer.i.initial_tsn; /* Apply the upper bounds for output streams based on peer's * number of inbound streams. */ if (asoc->c.sinit_num_ostreams > ntohs(peer_init->init_hdr.num_inbound_streams)) { asoc->c.sinit_num_ostreams = ntohs(peer_init->init_hdr.num_inbound_streams); } if (asoc->c.sinit_max_instreams > ntohs(peer_init->init_hdr.num_outbound_streams)) { asoc->c.sinit_max_instreams = ntohs(peer_init->init_hdr.num_outbound_streams); } /* Copy Initiation tag from INIT to VT_peer in cookie. */ asoc->c.peer_vtag = asoc->peer.i.init_tag; /* Peer Rwnd : Current calculated value of the peer's rwnd. */ asoc->peer.rwnd = asoc->peer.i.a_rwnd; /* RFC 2960 7.2.1 The initial value of ssthresh MAY be arbitrarily * high (for example, implementations MAY use the size of the receiver * advertised window). */ list_for_each_entry(transport, &asoc->peer.transport_addr_list, transports) { transport->ssthresh = asoc->peer.i.a_rwnd; } /* Set up the TSN tracking pieces. */ if (!sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL, asoc->peer.i.initial_tsn, gfp)) goto clean_up; /* RFC 2960 6.5 Stream Identifier and Stream Sequence Number * * The stream sequence number in all the streams shall start * from 0 when the association is established. Also, when the * stream sequence number reaches the value 65535 the next * stream sequence number shall be set to 0. */ if (sctp_stream_init(&asoc->stream, asoc->c.sinit_num_ostreams, asoc->c.sinit_max_instreams, gfp)) goto clean_up; /* Update frag_point when stream_interleave may get changed. */ sctp_assoc_update_frag_point(asoc); if (!asoc->temp && sctp_assoc_set_id(asoc, gfp)) goto clean_up; /* ADDIP Section 4.1 ASCONF Chunk Procedures * * When an endpoint has an ASCONF signaled change to be sent to the * remote endpoint it should do the following: * ... * A2) A serial number should be assigned to the Chunk. The serial * number should be a monotonically increasing number. All serial * numbers are defined to be initialized at the start of the * association to the same value as the Initial TSN. */ asoc->peer.addip_serial = asoc->peer.i.initial_tsn - 1; return 1; clean_up: /* Release the transport structures. */ list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { transport = list_entry(pos, struct sctp_transport, transports); if (transport->state != SCTP_ACTIVE) sctp_assoc_rm_peer(asoc, transport); } nomem: return 0; } /* Update asoc with the option described in param. * * RFC2960 3.3.2.1 Optional/Variable Length Parameters in INIT * * asoc is the association to update. * param is the variable length parameter to use for update. * cid tells us if this is an INIT, INIT ACK or COOKIE ECHO. * If the current packet is an INIT we want to minimize the amount of * work we do. In particular, we should not build transport * structures for the addresses. */ static int sctp_process_param(struct sctp_association *asoc, union sctp_params param, const union sctp_addr *peer_addr, gfp_t gfp) { struct sctp_endpoint *ep = asoc->ep; union sctp_addr_param *addr_param; struct net *net = asoc->base.net; struct sctp_transport *t; enum sctp_scope scope; union sctp_addr addr; struct sctp_af *af; int retval = 1, i; u32 stale; __u16 sat; /* We maintain all INIT parameters in network byte order all the * time. This allows us to not worry about whether the parameters * came from a fresh INIT, and INIT ACK, or were stored in a cookie. */ switch (param.p->type) { case SCTP_PARAM_IPV6_ADDRESS: if (PF_INET6 != asoc->base.sk->sk_family) break; goto do_addr_param; case SCTP_PARAM_IPV4_ADDRESS: /* v4 addresses are not allowed on v6-only socket */ if (ipv6_only_sock(asoc->base.sk)) break; do_addr_param: af = sctp_get_af_specific(param_type2af(param.p->type)); if (!af->from_addr_param(&addr, param.addr, htons(asoc->peer.port), 0)) break; scope = sctp_scope(peer_addr); if (sctp_in_scope(net, &addr, scope)) if (!sctp_assoc_add_peer(asoc, &addr, gfp, SCTP_UNCONFIRMED)) return 0; break; case SCTP_PARAM_COOKIE_PRESERVATIVE: if (!net->sctp.cookie_preserve_enable) break; stale = ntohl(param.life->lifespan_increment); /* Suggested Cookie Life span increment's unit is msec, * (1/1000sec). */ asoc->cookie_life = ktime_add_ms(asoc->cookie_life, stale); break; case SCTP_PARAM_SUPPORTED_ADDRESS_TYPES: /* Turn off the default values first so we'll know which * ones are really set by the peer. */ asoc->peer.ipv4_address = 0; asoc->peer.ipv6_address = 0; /* Assume that peer supports the address family * by which it sends a packet. */ if (peer_addr->sa.sa_family == AF_INET6) asoc->peer.ipv6_address = 1; else if (peer_addr->sa.sa_family == AF_INET) asoc->peer.ipv4_address = 1; /* Cycle through address types; avoid divide by 0. */ sat = ntohs(param.p->length) - sizeof(struct sctp_paramhdr); if (sat) sat /= sizeof(__u16); for (i = 0; i < sat; ++i) { switch (param.sat->types[i]) { case SCTP_PARAM_IPV4_ADDRESS: asoc->peer.ipv4_address = 1; break; case SCTP_PARAM_IPV6_ADDRESS: if (PF_INET6 == asoc->base.sk->sk_family) asoc->peer.ipv6_address = 1; break; default: /* Just ignore anything else. */ break; } } break; case SCTP_PARAM_STATE_COOKIE: asoc->peer.cookie_len = ntohs(param.p->length) - sizeof(struct sctp_paramhdr); kfree(asoc->peer.cookie); asoc->peer.cookie = kmemdup(param.cookie->body, asoc->peer.cookie_len, gfp); if (!asoc->peer.cookie) retval = 0; break; case SCTP_PARAM_HEARTBEAT_INFO: /* Would be odd to receive, but it causes no problems. */ break; case SCTP_PARAM_UNRECOGNIZED_PARAMETERS: /* Rejected during verify stage. */ break; case SCTP_PARAM_ECN_CAPABLE: if (asoc->ep->ecn_enable) { asoc->peer.ecn_capable = 1; break; } /* Fall Through */ goto fall_through; case SCTP_PARAM_ADAPTATION_LAYER_IND: asoc->peer.adaptation_ind = ntohl(param.aind->adaptation_ind); break; case SCTP_PARAM_SET_PRIMARY: if (!ep->asconf_enable) goto fall_through; addr_param = param.v + sizeof(struct sctp_addip_param); af = sctp_get_af_specific(param_type2af(addr_param->p.type)); if (!af) break; if (!af->from_addr_param(&addr, addr_param, htons(asoc->peer.port), 0)) break; if (!af->addr_valid(&addr, NULL, NULL)) break; t = sctp_assoc_lookup_paddr(asoc, &addr); if (!t) break; sctp_assoc_set_primary(asoc, t); break; case SCTP_PARAM_SUPPORTED_EXT: sctp_process_ext_param(asoc, param); break; case SCTP_PARAM_FWD_TSN_SUPPORT: if (asoc->ep->prsctp_enable) { asoc->peer.prsctp_capable = 1; break; } /* Fall Through */ goto fall_through; case SCTP_PARAM_RANDOM: if (!ep->auth_enable) goto fall_through; /* Save peer's random parameter */ kfree(asoc->peer.peer_random); asoc->peer.peer_random = kmemdup(param.p, ntohs(param.p->length), gfp); if (!asoc->peer.peer_random) { retval = 0; break; } break; case SCTP_PARAM_HMAC_ALGO: if (!ep->auth_enable) goto fall_through; /* Save peer's HMAC list */ kfree(asoc->peer.peer_hmacs); asoc->peer.peer_hmacs = kmemdup(param.p, ntohs(param.p->length), gfp); if (!asoc->peer.peer_hmacs) { retval = 0; break; } /* Set the default HMAC the peer requested*/ sctp_auth_asoc_set_default_hmac(asoc, param.hmac_algo); break; case SCTP_PARAM_CHUNKS: if (!ep->auth_enable) goto fall_through; kfree(asoc->peer.peer_chunks); asoc->peer.peer_chunks = kmemdup(param.p, ntohs(param.p->length), gfp); if (!asoc->peer.peer_chunks) retval = 0; break; fall_through: default: /* Any unrecognized parameters should have been caught * and handled by sctp_verify_param() which should be * called prior to this routine. Simply log the error * here. */ pr_debug("%s: ignoring param:%d for association:%p.\n", __func__, ntohs(param.p->type), asoc); break; } return retval; } /* Select a new verification tag. */ __u32 sctp_generate_tag(const struct sctp_endpoint *ep) { /* I believe that this random number generator complies with RFC1750. * A tag of 0 is reserved for special cases (e.g. INIT). */ __u32 x; do { get_random_bytes(&x, sizeof(__u32)); } while (x == 0); return x; } /* Select an initial TSN to send during startup. */ __u32 sctp_generate_tsn(const struct sctp_endpoint *ep) { __u32 retval; get_random_bytes(&retval, sizeof(__u32)); return retval; } /* * ADDIP 3.1.1 Address Configuration Change Chunk (ASCONF) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 0xC1 | Chunk Flags | Chunk Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Serial Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Address Parameter | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF Parameter #1 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * \ \ * / .... / * \ \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF Parameter #N | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Address Parameter and other parameter will not be wrapped in this function */ static struct sctp_chunk *sctp_make_asconf(struct sctp_association *asoc, union sctp_addr *addr, int vparam_len) { struct sctp_addiphdr asconf; struct sctp_chunk *retval; int length = sizeof(asconf) + vparam_len; union sctp_addr_param addrparam; int addrlen; struct sctp_af *af = sctp_get_af_specific(addr->v4.sin_family); addrlen = af->to_addr_param(addr, &addrparam); if (!addrlen) return NULL; length += addrlen; /* Create the chunk. */ retval = sctp_make_control(asoc, SCTP_CID_ASCONF, 0, length, GFP_ATOMIC); if (!retval) return NULL; asconf.serial = htonl(asoc->addip_serial++); retval->subh.addip_hdr = sctp_addto_chunk(retval, sizeof(asconf), &asconf); retval->param_hdr.v = sctp_addto_chunk(retval, addrlen, &addrparam); return retval; } /* ADDIP * 3.2.1 Add IP Address * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 0xC001 | Length = Variable | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF-Request Correlation ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Address Parameter | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * 3.2.2 Delete IP Address * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 0xC002 | Length = Variable | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF-Request Correlation ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Address Parameter | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * */ struct sctp_chunk *sctp_make_asconf_update_ip(struct sctp_association *asoc, union sctp_addr *laddr, struct sockaddr *addrs, int addrcnt, __be16 flags) { union sctp_addr_param addr_param; struct sctp_addip_param param; int paramlen = sizeof(param); struct sctp_chunk *retval; int addr_param_len = 0; union sctp_addr *addr; int totallen = 0, i; int del_pickup = 0; struct sctp_af *af; void *addr_buf; /* Get total length of all the address parameters. */ addr_buf = addrs; for (i = 0; i < addrcnt; i++) { addr = addr_buf; af = sctp_get_af_specific(addr->v4.sin_family); addr_param_len = af->to_addr_param(addr, &addr_param); totallen += paramlen; totallen += addr_param_len; addr_buf += af->sockaddr_len; if (asoc->asconf_addr_del_pending && !del_pickup) { /* reuse the parameter length from the same scope one */ totallen += paramlen; totallen += addr_param_len; del_pickup = 1; pr_debug("%s: picked same-scope del_pending addr, " "totallen for all addresses is %d\n", __func__, totallen); } } /* Create an asconf chunk with the required length. */ retval = sctp_make_asconf(asoc, laddr, totallen); if (!retval) return NULL; /* Add the address parameters to the asconf chunk. */ addr_buf = addrs; for (i = 0; i < addrcnt; i++) { addr = addr_buf; af = sctp_get_af_specific(addr->v4.sin_family); addr_param_len = af->to_addr_param(addr, &addr_param); param.param_hdr.type = flags; param.param_hdr.length = htons(paramlen + addr_param_len); param.crr_id = htonl(i); sctp_addto_chunk(retval, paramlen, ¶m); sctp_addto_chunk(retval, addr_param_len, &addr_param); addr_buf += af->sockaddr_len; } if (flags == SCTP_PARAM_ADD_IP && del_pickup) { addr = asoc->asconf_addr_del_pending; af = sctp_get_af_specific(addr->v4.sin_family); addr_param_len = af->to_addr_param(addr, &addr_param); param.param_hdr.type = SCTP_PARAM_DEL_IP; param.param_hdr.length = htons(paramlen + addr_param_len); param.crr_id = htonl(i); sctp_addto_chunk(retval, paramlen, ¶m); sctp_addto_chunk(retval, addr_param_len, &addr_param); } return retval; } /* ADDIP * 3.2.4 Set Primary IP Address * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type =0xC004 | Length = Variable | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF-Request Correlation ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Address Parameter | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Create an ASCONF chunk with Set Primary IP address parameter. */ struct sctp_chunk *sctp_make_asconf_set_prim(struct sctp_association *asoc, union sctp_addr *addr) { struct sctp_af *af = sctp_get_af_specific(addr->v4.sin_family); union sctp_addr_param addrparam; struct sctp_addip_param param; struct sctp_chunk *retval; int len = sizeof(param); int addrlen; addrlen = af->to_addr_param(addr, &addrparam); if (!addrlen) return NULL; len += addrlen; /* Create the chunk and make asconf header. */ retval = sctp_make_asconf(asoc, addr, len); if (!retval) return NULL; param.param_hdr.type = SCTP_PARAM_SET_PRIMARY; param.param_hdr.length = htons(len); param.crr_id = 0; sctp_addto_chunk(retval, sizeof(param), ¶m); sctp_addto_chunk(retval, addrlen, &addrparam); return retval; } /* ADDIP 3.1.2 Address Configuration Acknowledgement Chunk (ASCONF-ACK) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 0x80 | Chunk Flags | Chunk Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Serial Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF Parameter Response#1 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * \ \ * / .... / * \ \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF Parameter Response#N | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Create an ASCONF_ACK chunk with enough space for the parameter responses. */ static struct sctp_chunk *sctp_make_asconf_ack(const struct sctp_association *asoc, __u32 serial, int vparam_len) { struct sctp_addiphdr asconf; struct sctp_chunk *retval; int length = sizeof(asconf) + vparam_len; /* Create the chunk. */ retval = sctp_make_control(asoc, SCTP_CID_ASCONF_ACK, 0, length, GFP_ATOMIC); if (!retval) return NULL; asconf.serial = htonl(serial); retval->subh.addip_hdr = sctp_addto_chunk(retval, sizeof(asconf), &asconf); return retval; } /* Add response parameters to an ASCONF_ACK chunk. */ static void sctp_add_asconf_response(struct sctp_chunk *chunk, __be32 crr_id, __be16 err_code, struct sctp_addip_param *asconf_param) { struct sctp_addip_param ack_param; struct sctp_errhdr err_param; int asconf_param_len = 0; int err_param_len = 0; __be16 response_type; if (SCTP_ERROR_NO_ERROR == err_code) { response_type = SCTP_PARAM_SUCCESS_REPORT; } else { response_type = SCTP_PARAM_ERR_CAUSE; err_param_len = sizeof(err_param); if (asconf_param) asconf_param_len = ntohs(asconf_param->param_hdr.length); } /* Add Success Indication or Error Cause Indication parameter. */ ack_param.param_hdr.type = response_type; ack_param.param_hdr.length = htons(sizeof(ack_param) + err_param_len + asconf_param_len); ack_param.crr_id = crr_id; sctp_addto_chunk(chunk, sizeof(ack_param), &ack_param); if (SCTP_ERROR_NO_ERROR == err_code) return; /* Add Error Cause parameter. */ err_param.cause = err_code; err_param.length = htons(err_param_len + asconf_param_len); sctp_addto_chunk(chunk, err_param_len, &err_param); /* Add the failed TLV copied from ASCONF chunk. */ if (asconf_param) sctp_addto_chunk(chunk, asconf_param_len, asconf_param); } /* Process a asconf parameter. */ static __be16 sctp_process_asconf_param(struct sctp_association *asoc, struct sctp_chunk *asconf, struct sctp_addip_param *asconf_param) { union sctp_addr_param *addr_param; struct sctp_transport *peer; union sctp_addr addr; struct sctp_af *af; addr_param = (void *)asconf_param + sizeof(*asconf_param); if (asconf_param->param_hdr.type != SCTP_PARAM_ADD_IP && asconf_param->param_hdr.type != SCTP_PARAM_DEL_IP && asconf_param->param_hdr.type != SCTP_PARAM_SET_PRIMARY) return SCTP_ERROR_UNKNOWN_PARAM; switch (addr_param->p.type) { case SCTP_PARAM_IPV6_ADDRESS: if (!asoc->peer.ipv6_address) return SCTP_ERROR_DNS_FAILED; break; case SCTP_PARAM_IPV4_ADDRESS: if (!asoc->peer.ipv4_address) return SCTP_ERROR_DNS_FAILED; break; default: return SCTP_ERROR_DNS_FAILED; } af = sctp_get_af_specific(param_type2af(addr_param->p.type)); if (unlikely(!af)) return SCTP_ERROR_DNS_FAILED; if (!af->from_addr_param(&addr, addr_param, htons(asoc->peer.port), 0)) return SCTP_ERROR_DNS_FAILED; /* ADDIP 4.2.1 This parameter MUST NOT contain a broadcast * or multicast address. * (note: wildcard is permitted and requires special handling so * make sure we check for that) */ if (!af->is_any(&addr) && !af->addr_valid(&addr, NULL, asconf->skb)) return SCTP_ERROR_DNS_FAILED; switch (asconf_param->param_hdr.type) { case SCTP_PARAM_ADD_IP: /* Section 4.2.1: * If the address 0.0.0.0 or ::0 is provided, the source * address of the packet MUST be added. */ if (af->is_any(&addr)) memcpy(&addr, &asconf->source, sizeof(addr)); if (security_sctp_bind_connect(asoc->ep->base.sk, SCTP_PARAM_ADD_IP, (struct sockaddr *)&addr, af->sockaddr_len)) return SCTP_ERROR_REQ_REFUSED; /* ADDIP 4.3 D9) If an endpoint receives an ADD IP address * request and does not have the local resources to add this * new address to the association, it MUST return an Error * Cause TLV set to the new error code 'Operation Refused * Due to Resource Shortage'. */ peer = sctp_assoc_add_peer(asoc, &addr, GFP_ATOMIC, SCTP_UNCONFIRMED); if (!peer) return SCTP_ERROR_RSRC_LOW; /* Start the heartbeat timer. */ sctp_transport_reset_hb_timer(peer); asoc->new_transport = peer; break; case SCTP_PARAM_DEL_IP: /* ADDIP 4.3 D7) If a request is received to delete the * last remaining IP address of a peer endpoint, the receiver * MUST send an Error Cause TLV with the error cause set to the * new error code 'Request to Delete Last Remaining IP Address'. */ if (asoc->peer.transport_count == 1) return SCTP_ERROR_DEL_LAST_IP; /* ADDIP 4.3 D8) If a request is received to delete an IP * address which is also the source address of the IP packet * which contained the ASCONF chunk, the receiver MUST reject * this request. To reject the request the receiver MUST send * an Error Cause TLV set to the new error code 'Request to * Delete Source IP Address' */ if (sctp_cmp_addr_exact(&asconf->source, &addr)) return SCTP_ERROR_DEL_SRC_IP; /* Section 4.2.2 * If the address 0.0.0.0 or ::0 is provided, all * addresses of the peer except the source address of the * packet MUST be deleted. */ if (af->is_any(&addr)) { sctp_assoc_set_primary(asoc, asconf->transport); sctp_assoc_del_nonprimary_peers(asoc, asconf->transport); return SCTP_ERROR_NO_ERROR; } /* If the address is not part of the association, the * ASCONF-ACK with Error Cause Indication Parameter * which including cause of Unresolvable Address should * be sent. */ peer = sctp_assoc_lookup_paddr(asoc, &addr); if (!peer) return SCTP_ERROR_DNS_FAILED; sctp_assoc_rm_peer(asoc, peer); break; case SCTP_PARAM_SET_PRIMARY: /* ADDIP Section 4.2.4 * If the address 0.0.0.0 or ::0 is provided, the receiver * MAY mark the source address of the packet as its * primary. */ if (af->is_any(&addr)) memcpy(&addr, sctp_source(asconf), sizeof(addr)); if (security_sctp_bind_connect(asoc->ep->base.sk, SCTP_PARAM_SET_PRIMARY, (struct sockaddr *)&addr, af->sockaddr_len)) return SCTP_ERROR_REQ_REFUSED; peer = sctp_assoc_lookup_paddr(asoc, &addr); if (!peer) return SCTP_ERROR_DNS_FAILED; sctp_assoc_set_primary(asoc, peer); break; } return SCTP_ERROR_NO_ERROR; } /* Verify the ASCONF packet before we process it. */ bool sctp_verify_asconf(const struct sctp_association *asoc, struct sctp_chunk *chunk, bool addr_param_needed, struct sctp_paramhdr **errp) { struct sctp_addip_chunk *addip; bool addr_param_seen = false; union sctp_params param; addip = (struct sctp_addip_chunk *)chunk->chunk_hdr; sctp_walk_params(param, addip) { size_t length = ntohs(param.p->length); *errp = param.p; switch (param.p->type) { case SCTP_PARAM_ERR_CAUSE: break; case SCTP_PARAM_IPV4_ADDRESS: if (length != sizeof(struct sctp_ipv4addr_param)) return false; /* ensure there is only one addr param and it's in the * beginning of addip_hdr params, or we reject it. */ if (param.v != (addip + 1)) return false; addr_param_seen = true; break; case SCTP_PARAM_IPV6_ADDRESS: if (length != sizeof(struct sctp_ipv6addr_param)) return false; if (param.v != (addip + 1)) return false; addr_param_seen = true; break; case SCTP_PARAM_ADD_IP: case SCTP_PARAM_DEL_IP: case SCTP_PARAM_SET_PRIMARY: /* In ASCONF chunks, these need to be first. */ if (addr_param_needed && !addr_param_seen) return false; length = ntohs(param.addip->param_hdr.length); if (length < sizeof(struct sctp_addip_param) + sizeof(**errp)) return false; break; case SCTP_PARAM_SUCCESS_REPORT: case SCTP_PARAM_ADAPTATION_LAYER_IND: if (length != sizeof(struct sctp_addip_param)) return false; break; default: /* This is unknown to us, reject! */ return false; } } /* Remaining sanity checks. */ if (addr_param_needed && !addr_param_seen) return false; if (!addr_param_needed && addr_param_seen) return false; if (param.v != chunk->chunk_end) return false; return true; } /* Process an incoming ASCONF chunk with the next expected serial no. and * return an ASCONF_ACK chunk to be sent in response. */ struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc, struct sctp_chunk *asconf) { union sctp_addr_param *addr_param; struct sctp_addip_chunk *addip; struct sctp_chunk *asconf_ack; bool all_param_pass = true; struct sctp_addiphdr *hdr; int length = 0, chunk_len; union sctp_params param; __be16 err_code; __u32 serial; addip = (struct sctp_addip_chunk *)asconf->chunk_hdr; chunk_len = ntohs(asconf->chunk_hdr->length) - sizeof(struct sctp_chunkhdr); hdr = (struct sctp_addiphdr *)asconf->skb->data; serial = ntohl(hdr->serial); /* Skip the addiphdr and store a pointer to address parameter. */ length = sizeof(*hdr); addr_param = (union sctp_addr_param *)(asconf->skb->data + length); chunk_len -= length; /* Skip the address parameter and store a pointer to the first * asconf parameter. */ length = ntohs(addr_param->p.length); chunk_len -= length; /* create an ASCONF_ACK chunk. * Based on the definitions of parameters, we know that the size of * ASCONF_ACK parameters are less than or equal to the fourfold of ASCONF * parameters. */ asconf_ack = sctp_make_asconf_ack(asoc, serial, chunk_len * 4); if (!asconf_ack) goto done; /* Process the TLVs contained within the ASCONF chunk. */ sctp_walk_params(param, addip) { /* Skip preceeding address parameters. */ if (param.p->type == SCTP_PARAM_IPV4_ADDRESS || param.p->type == SCTP_PARAM_IPV6_ADDRESS) continue; err_code = sctp_process_asconf_param(asoc, asconf, param.addip); /* ADDIP 4.1 A7) * If an error response is received for a TLV parameter, * all TLVs with no response before the failed TLV are * considered successful if not reported. All TLVs after * the failed response are considered unsuccessful unless * a specific success indication is present for the parameter. */ if (err_code != SCTP_ERROR_NO_ERROR) all_param_pass = false; if (!all_param_pass) sctp_add_asconf_response(asconf_ack, param.addip->crr_id, err_code, param.addip); /* ADDIP 4.3 D11) When an endpoint receiving an ASCONF to add * an IP address sends an 'Out of Resource' in its response, it * MUST also fail any subsequent add or delete requests bundled * in the ASCONF. */ if (err_code == SCTP_ERROR_RSRC_LOW) goto done; } done: asoc->peer.addip_serial++; /* If we are sending a new ASCONF_ACK hold a reference to it in assoc * after freeing the reference to old asconf ack if any. */ if (asconf_ack) { sctp_chunk_hold(asconf_ack); list_add_tail(&asconf_ack->transmitted_list, &asoc->asconf_ack_list); } return asconf_ack; } /* Process a asconf parameter that is successfully acked. */ static void sctp_asconf_param_success(struct sctp_association *asoc, struct sctp_addip_param *asconf_param) { struct sctp_bind_addr *bp = &asoc->base.bind_addr; union sctp_addr_param *addr_param; struct sctp_sockaddr_entry *saddr; struct sctp_transport *transport; union sctp_addr addr; struct sctp_af *af; addr_param = (void *)asconf_param + sizeof(*asconf_param); /* We have checked the packet before, so we do not check again. */ af = sctp_get_af_specific(param_type2af(addr_param->p.type)); if (!af->from_addr_param(&addr, addr_param, htons(bp->port), 0)) return; switch (asconf_param->param_hdr.type) { case SCTP_PARAM_ADD_IP: /* This is always done in BH context with a socket lock * held, so the list can not change. */ local_bh_disable(); list_for_each_entry(saddr, &bp->address_list, list) { if (sctp_cmp_addr_exact(&saddr->a, &addr)) saddr->state = SCTP_ADDR_SRC; } local_bh_enable(); list_for_each_entry(transport, &asoc->peer.transport_addr_list, transports) { sctp_transport_dst_release(transport); } break; case SCTP_PARAM_DEL_IP: local_bh_disable(); sctp_del_bind_addr(bp, &addr); if (asoc->asconf_addr_del_pending != NULL && sctp_cmp_addr_exact(asoc->asconf_addr_del_pending, &addr)) { kfree(asoc->asconf_addr_del_pending); asoc->asconf_addr_del_pending = NULL; } local_bh_enable(); list_for_each_entry(transport, &asoc->peer.transport_addr_list, transports) { sctp_transport_dst_release(transport); } break; default: break; } } /* Get the corresponding ASCONF response error code from the ASCONF_ACK chunk * for the given asconf parameter. If there is no response for this parameter, * return the error code based on the third argument 'no_err'. * ADDIP 4.1 * A7) If an error response is received for a TLV parameter, all TLVs with no * response before the failed TLV are considered successful if not reported. * All TLVs after the failed response are considered unsuccessful unless a * specific success indication is present for the parameter. */ static __be16 sctp_get_asconf_response(struct sctp_chunk *asconf_ack, struct sctp_addip_param *asconf_param, int no_err) { struct sctp_addip_param *asconf_ack_param; struct sctp_errhdr *err_param; int asconf_ack_len; __be16 err_code; int length; if (no_err) err_code = SCTP_ERROR_NO_ERROR; else err_code = SCTP_ERROR_REQ_REFUSED; asconf_ack_len = ntohs(asconf_ack->chunk_hdr->length) - sizeof(struct sctp_chunkhdr); /* Skip the addiphdr from the asconf_ack chunk and store a pointer to * the first asconf_ack parameter. */ length = sizeof(struct sctp_addiphdr); asconf_ack_param = (struct sctp_addip_param *)(asconf_ack->skb->data + length); asconf_ack_len -= length; while (asconf_ack_len > 0) { if (asconf_ack_param->crr_id == asconf_param->crr_id) { switch (asconf_ack_param->param_hdr.type) { case SCTP_PARAM_SUCCESS_REPORT: return SCTP_ERROR_NO_ERROR; case SCTP_PARAM_ERR_CAUSE: length = sizeof(*asconf_ack_param); err_param = (void *)asconf_ack_param + length; asconf_ack_len -= length; if (asconf_ack_len > 0) return err_param->cause; else return SCTP_ERROR_INV_PARAM; break; default: return SCTP_ERROR_INV_PARAM; } } length = ntohs(asconf_ack_param->param_hdr.length); asconf_ack_param = (void *)asconf_ack_param + length; asconf_ack_len -= length; } return err_code; } /* Process an incoming ASCONF_ACK chunk against the cached last ASCONF chunk. */ int sctp_process_asconf_ack(struct sctp_association *asoc, struct sctp_chunk *asconf_ack) { struct sctp_chunk *asconf = asoc->addip_last_asconf; struct sctp_addip_param *asconf_param; __be16 err_code = SCTP_ERROR_NO_ERROR; union sctp_addr_param *addr_param; int asconf_len = asconf->skb->len; int all_param_pass = 0; int length = 0; int no_err = 1; int retval = 0; /* Skip the chunkhdr and addiphdr from the last asconf sent and store * a pointer to address parameter. */ length = sizeof(struct sctp_addip_chunk); addr_param = (union sctp_addr_param *)(asconf->skb->data + length); asconf_len -= length; /* Skip the address parameter in the last asconf sent and store a * pointer to the first asconf parameter. */ length = ntohs(addr_param->p.length); asconf_param = (void *)addr_param + length; asconf_len -= length; /* ADDIP 4.1 * A8) If there is no response(s) to specific TLV parameter(s), and no * failures are indicated, then all request(s) are considered * successful. */ if (asconf_ack->skb->len == sizeof(struct sctp_addiphdr)) all_param_pass = 1; /* Process the TLVs contained in the last sent ASCONF chunk. */ while (asconf_len > 0) { if (all_param_pass) err_code = SCTP_ERROR_NO_ERROR; else { err_code = sctp_get_asconf_response(asconf_ack, asconf_param, no_err); if (no_err && (SCTP_ERROR_NO_ERROR != err_code)) no_err = 0; } switch (err_code) { case SCTP_ERROR_NO_ERROR: sctp_asconf_param_success(asoc, asconf_param); break; case SCTP_ERROR_RSRC_LOW: retval = 1; break; case SCTP_ERROR_UNKNOWN_PARAM: /* Disable sending this type of asconf parameter in * future. */ asoc->peer.addip_disabled_mask |= asconf_param->param_hdr.type; break; case SCTP_ERROR_REQ_REFUSED: case SCTP_ERROR_DEL_LAST_IP: case SCTP_ERROR_DEL_SRC_IP: default: break; } /* Skip the processed asconf parameter and move to the next * one. */ length = ntohs(asconf_param->param_hdr.length); asconf_param = (void *)asconf_param + length; asconf_len -= length; } if (no_err && asoc->src_out_of_asoc_ok) { asoc->src_out_of_asoc_ok = 0; sctp_transport_immediate_rtx(asoc->peer.primary_path); } /* Free the cached last sent asconf chunk. */ list_del_init(&asconf->transmitted_list); sctp_chunk_free(asconf); asoc->addip_last_asconf = NULL; return retval; } /* Make a FWD TSN chunk. */ struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc, __u32 new_cum_tsn, size_t nstreams, struct sctp_fwdtsn_skip *skiplist) { struct sctp_chunk *retval = NULL; struct sctp_fwdtsn_hdr ftsn_hdr; struct sctp_fwdtsn_skip skip; size_t hint; int i; hint = (nstreams + 1) * sizeof(__u32); retval = sctp_make_control(asoc, SCTP_CID_FWD_TSN, 0, hint, GFP_ATOMIC); if (!retval) return NULL; ftsn_hdr.new_cum_tsn = htonl(new_cum_tsn); retval->subh.fwdtsn_hdr = sctp_addto_chunk(retval, sizeof(ftsn_hdr), &ftsn_hdr); for (i = 0; i < nstreams; i++) { skip.stream = skiplist[i].stream; skip.ssn = skiplist[i].ssn; sctp_addto_chunk(retval, sizeof(skip), &skip); } return retval; } struct sctp_chunk *sctp_make_ifwdtsn(const struct sctp_association *asoc, __u32 new_cum_tsn, size_t nstreams, struct sctp_ifwdtsn_skip *skiplist) { struct sctp_chunk *retval = NULL; struct sctp_ifwdtsn_hdr ftsn_hdr; size_t hint; hint = (nstreams + 1) * sizeof(__u32); retval = sctp_make_control(asoc, SCTP_CID_I_FWD_TSN, 0, hint, GFP_ATOMIC); if (!retval) return NULL; ftsn_hdr.new_cum_tsn = htonl(new_cum_tsn); retval->subh.ifwdtsn_hdr = sctp_addto_chunk(retval, sizeof(ftsn_hdr), &ftsn_hdr); sctp_addto_chunk(retval, nstreams * sizeof(skiplist[0]), skiplist); return retval; } /* RE-CONFIG 3.1 (RE-CONFIG chunk) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 130 | Chunk Flags | Chunk Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * \ \ * / Re-configuration Parameter / * \ \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * \ \ * / Re-configuration Parameter (optional) / * \ \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ static struct sctp_chunk *sctp_make_reconf(const struct sctp_association *asoc, int length) { struct sctp_reconf_chunk *reconf; struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_RECONF, 0, length, GFP_ATOMIC); if (!retval) return NULL; reconf = (struct sctp_reconf_chunk *)retval->chunk_hdr; retval->param_hdr.v = (u8 *)(reconf + 1); return retval; } /* RE-CONFIG 4.1 (STREAM OUT RESET) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Parameter Type = 13 | Parameter Length = 16 + 2 * N | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Request Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Response Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Sender's Last Assigned TSN | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Stream Number 1 (optional) | Stream Number 2 (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * / ...... / * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Stream Number N-1 (optional) | Stream Number N (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * RE-CONFIG 4.2 (STREAM IN RESET) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Parameter Type = 14 | Parameter Length = 8 + 2 * N | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Request Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Stream Number 1 (optional) | Stream Number 2 (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * / ...... / * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Stream Number N-1 (optional) | Stream Number N (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_strreset_req( const struct sctp_association *asoc, __u16 stream_num, __be16 *stream_list, bool out, bool in) { __u16 stream_len = stream_num * sizeof(__u16); struct sctp_strreset_outreq outreq; struct sctp_strreset_inreq inreq; struct sctp_chunk *retval; __u16 outlen, inlen; outlen = (sizeof(outreq) + stream_len) * out; inlen = (sizeof(inreq) + stream_len) * in; retval = sctp_make_reconf(asoc, SCTP_PAD4(outlen) + SCTP_PAD4(inlen)); if (!retval) return NULL; if (outlen) { outreq.param_hdr.type = SCTP_PARAM_RESET_OUT_REQUEST; outreq.param_hdr.length = htons(outlen); outreq.request_seq = htonl(asoc->strreset_outseq); outreq.response_seq = htonl(asoc->strreset_inseq - 1); outreq.send_reset_at_tsn = htonl(asoc->next_tsn - 1); sctp_addto_chunk(retval, sizeof(outreq), &outreq); if (stream_len) sctp_addto_chunk(retval, stream_len, stream_list); } if (inlen) { inreq.param_hdr.type = SCTP_PARAM_RESET_IN_REQUEST; inreq.param_hdr.length = htons(inlen); inreq.request_seq = htonl(asoc->strreset_outseq + out); sctp_addto_chunk(retval, sizeof(inreq), &inreq); if (stream_len) sctp_addto_chunk(retval, stream_len, stream_list); } return retval; } /* RE-CONFIG 4.3 (SSN/TSN RESET ALL) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Parameter Type = 15 | Parameter Length = 8 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Request Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_strreset_tsnreq( const struct sctp_association *asoc) { struct sctp_strreset_tsnreq tsnreq; __u16 length = sizeof(tsnreq); struct sctp_chunk *retval; retval = sctp_make_reconf(asoc, length); if (!retval) return NULL; tsnreq.param_hdr.type = SCTP_PARAM_RESET_TSN_REQUEST; tsnreq.param_hdr.length = htons(length); tsnreq.request_seq = htonl(asoc->strreset_outseq); sctp_addto_chunk(retval, sizeof(tsnreq), &tsnreq); return retval; } /* RE-CONFIG 4.5/4.6 (ADD STREAM) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Parameter Type = 17 | Parameter Length = 12 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Request Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Number of new streams | Reserved | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_strreset_addstrm( const struct sctp_association *asoc, __u16 out, __u16 in) { struct sctp_strreset_addstrm addstrm; __u16 size = sizeof(addstrm); struct sctp_chunk *retval; retval = sctp_make_reconf(asoc, (!!out + !!in) * size); if (!retval) return NULL; if (out) { addstrm.param_hdr.type = SCTP_PARAM_RESET_ADD_OUT_STREAMS; addstrm.param_hdr.length = htons(size); addstrm.number_of_streams = htons(out); addstrm.request_seq = htonl(asoc->strreset_outseq); addstrm.reserved = 0; sctp_addto_chunk(retval, size, &addstrm); } if (in) { addstrm.param_hdr.type = SCTP_PARAM_RESET_ADD_IN_STREAMS; addstrm.param_hdr.length = htons(size); addstrm.number_of_streams = htons(in); addstrm.request_seq = htonl(asoc->strreset_outseq + !!out); addstrm.reserved = 0; sctp_addto_chunk(retval, size, &addstrm); } return retval; } /* RE-CONFIG 4.4 (RESP) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Parameter Type = 16 | Parameter Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Response Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Result | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_strreset_resp(const struct sctp_association *asoc, __u32 result, __u32 sn) { struct sctp_strreset_resp resp; __u16 length = sizeof(resp); struct sctp_chunk *retval; retval = sctp_make_reconf(asoc, length); if (!retval) return NULL; resp.param_hdr.type = SCTP_PARAM_RESET_RESPONSE; resp.param_hdr.length = htons(length); resp.response_seq = htonl(sn); resp.result = htonl(result); sctp_addto_chunk(retval, sizeof(resp), &resp); return retval; } /* RE-CONFIG 4.4 OPTIONAL (TSNRESP) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Parameter Type = 16 | Parameter Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Response Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Result | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Sender's Next TSN (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Receiver's Next TSN (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_strreset_tsnresp(struct sctp_association *asoc, __u32 result, __u32 sn, __u32 sender_tsn, __u32 receiver_tsn) { struct sctp_strreset_resptsn tsnresp; __u16 length = sizeof(tsnresp); struct sctp_chunk *retval; retval = sctp_make_reconf(asoc, length); if (!retval) return NULL; tsnresp.param_hdr.type = SCTP_PARAM_RESET_RESPONSE; tsnresp.param_hdr.length = htons(length); tsnresp.response_seq = htonl(sn); tsnresp.result = htonl(result); tsnresp.senders_next_tsn = htonl(sender_tsn); tsnresp.receivers_next_tsn = htonl(receiver_tsn); sctp_addto_chunk(retval, sizeof(tsnresp), &tsnresp); return retval; } bool sctp_verify_reconf(const struct sctp_association *asoc, struct sctp_chunk *chunk, struct sctp_paramhdr **errp) { struct sctp_reconf_chunk *hdr; union sctp_params param; __be16 last = 0; __u16 cnt = 0; hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr; sctp_walk_params(param, hdr) { __u16 length = ntohs(param.p->length); *errp = param.p; if (cnt++ > 2) return false; switch (param.p->type) { case SCTP_PARAM_RESET_OUT_REQUEST: if (length < sizeof(struct sctp_strreset_outreq) || (last && last != SCTP_PARAM_RESET_RESPONSE && last != SCTP_PARAM_RESET_IN_REQUEST)) return false; break; case SCTP_PARAM_RESET_IN_REQUEST: if (length < sizeof(struct sctp_strreset_inreq) || (last && last != SCTP_PARAM_RESET_OUT_REQUEST)) return false; break; case SCTP_PARAM_RESET_RESPONSE: if ((length != sizeof(struct sctp_strreset_resp) && length != sizeof(struct sctp_strreset_resptsn)) || (last && last != SCTP_PARAM_RESET_RESPONSE && last != SCTP_PARAM_RESET_OUT_REQUEST)) return false; break; case SCTP_PARAM_RESET_TSN_REQUEST: if (length != sizeof(struct sctp_strreset_tsnreq) || last) return false; break; case SCTP_PARAM_RESET_ADD_IN_STREAMS: if (length != sizeof(struct sctp_strreset_addstrm) || (last && last != SCTP_PARAM_RESET_ADD_OUT_STREAMS)) return false; break; case SCTP_PARAM_RESET_ADD_OUT_STREAMS: if (length != sizeof(struct sctp_strreset_addstrm) || (last && last != SCTP_PARAM_RESET_ADD_IN_STREAMS)) return false; break; default: return false; } last = param.p->type; } return true; } |
| 67 70 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 | /* * Copyright (c) 2016 Intel Corporation * * Permission to use, copy, modify, distribute, and sell this software and its * documentation for any purpose is hereby granted without fee, provided that * the above copyright notice appear in all copies and that both that copyright * notice and this permission notice appear in supporting documentation, and * that the name of the copyright holders not be used in advertising or * publicity pertaining to distribution of the software without specific, * written prior permission. The copyright holders make no representations * about the suitability of this software for any purpose. It is provided "as * is" without express or implied warranty. * * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THIS SOFTWARE. */ #ifndef __DRM_PLANE_H__ #define __DRM_PLANE_H__ #include <linux/list.h> #include <linux/ctype.h> #include <drm/drm_mode_object.h> #include <drm/drm_color_mgmt.h> #include <drm/drm_rect.h> #include <drm/drm_modeset_lock.h> #include <drm/drm_util.h> struct drm_crtc; struct drm_printer; struct drm_modeset_acquire_ctx; enum drm_scaling_filter { DRM_SCALING_FILTER_DEFAULT, DRM_SCALING_FILTER_NEAREST_NEIGHBOR, }; /** * struct drm_plane_state - mutable plane state * * Please note that the destination coordinates @crtc_x, @crtc_y, @crtc_h and * @crtc_w and the source coordinates @src_x, @src_y, @src_h and @src_w are the * raw coordinates provided by userspace. Drivers should use * drm_atomic_helper_check_plane_state() and only use the derived rectangles in * @src and @dst to program the hardware. */ struct drm_plane_state { /** @plane: backpointer to the plane */ struct drm_plane *plane; /** * @crtc: * * Currently bound CRTC, NULL if disabled. Do not write this directly, * use drm_atomic_set_crtc_for_plane() */ struct drm_crtc *crtc; /** * @fb: * * Currently bound framebuffer. Do not write this directly, use * drm_atomic_set_fb_for_plane() */ struct drm_framebuffer *fb; /** * @fence: * * Optional fence to wait for before scanning out @fb. The core atomic * code will set this when userspace is using explicit fencing. Do not * write this field directly for a driver's implicit fence. * * Drivers should store any implicit fence in this from their * &drm_plane_helper_funcs.prepare_fb callback. See * drm_gem_plane_helper_prepare_fb() for a suitable helper. */ struct dma_fence *fence; /** * @crtc_x: * * Left position of visible portion of plane on crtc, signed dest * location allows it to be partially off screen. */ int32_t crtc_x; /** * @crtc_y: * * Upper position of visible portion of plane on crtc, signed dest * location allows it to be partially off screen. */ int32_t crtc_y; /** @crtc_w: width of visible portion of plane on crtc */ /** @crtc_h: height of visible portion of plane on crtc */ uint32_t crtc_w, crtc_h; /** * @src_x: left position of visible portion of plane within plane (in * 16.16 fixed point). */ uint32_t src_x; /** * @src_y: upper position of visible portion of plane within plane (in * 16.16 fixed point). */ uint32_t src_y; /** @src_w: width of visible portion of plane (in 16.16) */ /** @src_h: height of visible portion of plane (in 16.16) */ uint32_t src_h, src_w; /** * @alpha: * Opacity of the plane with 0 as completely transparent and 0xffff as * completely opaque. See drm_plane_create_alpha_property() for more * details. */ u16 alpha; /** * @pixel_blend_mode: * The alpha blending equation selection, describing how the pixels from * the current plane are composited with the background. Value can be * one of DRM_MODE_BLEND_* */ uint16_t pixel_blend_mode; /** * @rotation: * Rotation of the plane. See drm_plane_create_rotation_property() for * more details. */ unsigned int rotation; /** * @zpos: * Priority of the given plane on crtc (optional). * * User-space may set mutable zpos properties so that multiple active * planes on the same CRTC have identical zpos values. This is a * user-space bug, but drivers can solve the conflict by comparing the * plane object IDs; the plane with a higher ID is stacked on top of a * plane with a lower ID. * * See drm_plane_create_zpos_property() and * drm_plane_create_zpos_immutable_property() for more details. */ unsigned int zpos; /** * @normalized_zpos: * Normalized value of zpos: unique, range from 0 to N-1 where N is the * number of active planes for given crtc. Note that the driver must set * &drm_mode_config.normalize_zpos or call drm_atomic_normalize_zpos() to * update this before it can be trusted. */ unsigned int normalized_zpos; /** * @color_encoding: * * Color encoding for non RGB formats */ enum drm_color_encoding color_encoding; /** * @color_range: * * Color range for non RGB formats */ enum drm_color_range color_range; /** * @fb_damage_clips: * * Blob representing damage (area in plane framebuffer that changed * since last plane update) as an array of &drm_mode_rect in framebuffer * coodinates of the attached framebuffer. Note that unlike plane src, * damage clips are not in 16.16 fixed point. * * See drm_plane_get_damage_clips() and * drm_plane_get_damage_clips_count() for accessing these. */ struct drm_property_blob *fb_damage_clips; /** * @src: * * source coordinates of the plane (in 16.16). * * When using drm_atomic_helper_check_plane_state(), * the coordinates are clipped, but the driver may choose * to use unclipped coordinates instead when the hardware * performs the clipping automatically. */ /** * @dst: * * clipped destination coordinates of the plane. * * When using drm_atomic_helper_check_plane_state(), * the coordinates are clipped, but the driver may choose * to use unclipped coordinates instead when the hardware * performs the clipping automatically. */ struct drm_rect src, dst; /** * @visible: * * Visibility of the plane. This can be false even if fb!=NULL and * crtc!=NULL, due to clipping. */ bool visible; /** * @scaling_filter: * * Scaling filter to be applied */ enum drm_scaling_filter scaling_filter; /** * @commit: Tracks the pending commit to prevent use-after-free conditions, * and for async plane updates. * * May be NULL. */ struct drm_crtc_commit *commit; /** @state: backpointer to global drm_atomic_state */ struct drm_atomic_state *state; }; static inline struct drm_rect drm_plane_state_src(const struct drm_plane_state *state) { struct drm_rect src = { .x1 = state->src_x, .y1 = state->src_y, .x2 = state->src_x + state->src_w, .y2 = state->src_y + state->src_h, }; return src; } static inline struct drm_rect drm_plane_state_dest(const struct drm_plane_state *state) { struct drm_rect dest = { .x1 = state->crtc_x, .y1 = state->crtc_y, .x2 = state->crtc_x + state->crtc_w, .y2 = state->crtc_y + state->crtc_h, }; return dest; } /** * struct drm_plane_funcs - driver plane control functions */ struct drm_plane_funcs { /** * @update_plane: * * This is the legacy entry point to enable and configure the plane for * the given CRTC and framebuffer. It is never called to disable the * plane, i.e. the passed-in crtc and fb paramters are never NULL. * * The source rectangle in frame buffer memory coordinates is given by * the src_x, src_y, src_w and src_h parameters (as 16.16 fixed point * values). Devices that don't support subpixel plane coordinates can * ignore the fractional part. * * The destination rectangle in CRTC coordinates is given by the * crtc_x, crtc_y, crtc_w and crtc_h parameters (as integer values). * Devices scale the source rectangle to the destination rectangle. If * scaling is not supported, and the source rectangle size doesn't match * the destination rectangle size, the driver must return a * -<errorname>EINVAL</errorname> error. * * Drivers implementing atomic modeset should use * drm_atomic_helper_update_plane() to implement this hook. * * RETURNS: * * 0 on success or a negative error code on failure. */ int (*update_plane)(struct drm_plane *plane, struct drm_crtc *crtc, struct drm_framebuffer *fb, int crtc_x, int crtc_y, unsigned int crtc_w, unsigned int crtc_h, uint32_t src_x, uint32_t src_y, uint32_t src_w, uint32_t src_h, struct drm_modeset_acquire_ctx *ctx); /** * @disable_plane: * * This is the legacy entry point to disable the plane. The DRM core * calls this method in response to a DRM_IOCTL_MODE_SETPLANE IOCTL call * with the frame buffer ID set to 0. Disabled planes must not be * processed by the CRTC. * * Drivers implementing atomic modeset should use * drm_atomic_helper_disable_plane() to implement this hook. * * RETURNS: * * 0 on success or a negative error code on failure. */ int (*disable_plane)(struct drm_plane *plane, struct drm_modeset_acquire_ctx *ctx); /** * @destroy: * * Clean up plane resources. This is only called at driver unload time * through drm_mode_config_cleanup() since a plane cannot be hotplugged * in DRM. */ void (*destroy)(struct drm_plane *plane); /** * @reset: * * Reset plane hardware and software state to off. This function isn't * called by the core directly, only through drm_mode_config_reset(). * It's not a helper hook only for historical reasons. * * Atomic drivers can use drm_atomic_helper_plane_reset() to reset * atomic state using this hook. */ void (*reset)(struct drm_plane *plane); /** * @set_property: * * This is the legacy entry point to update a property attached to the * plane. * * This callback is optional if the driver does not support any legacy * driver-private properties. For atomic drivers it is not used because * property handling is done entirely in the DRM core. * * RETURNS: * * 0 on success or a negative error code on failure. */ int (*set_property)(struct drm_plane *plane, struct drm_property *property, uint64_t val); /** * @atomic_duplicate_state: * * Duplicate the current atomic state for this plane and return it. * The core and helpers guarantee that any atomic state duplicated with * this hook and still owned by the caller (i.e. not transferred to the * driver by calling &drm_mode_config_funcs.atomic_commit) will be * cleaned up by calling the @atomic_destroy_state hook in this * structure. * * This callback is mandatory for atomic drivers. * * Atomic drivers which don't subclass &struct drm_plane_state should use * drm_atomic_helper_plane_duplicate_state(). Drivers that subclass the * state structure to extend it with driver-private state should use * __drm_atomic_helper_plane_duplicate_state() to make sure shared state is * duplicated in a consistent fashion across drivers. * * It is an error to call this hook before &drm_plane.state has been * initialized correctly. * * NOTE: * * If the duplicate state references refcounted resources this hook must * acquire a reference for each of them. The driver must release these * references again in @atomic_destroy_state. * * RETURNS: * * Duplicated atomic state or NULL when the allocation failed. */ struct drm_plane_state *(*atomic_duplicate_state)(struct drm_plane *plane); /** * @atomic_destroy_state: * * Destroy a state duplicated with @atomic_duplicate_state and release * or unreference all resources it references * * This callback is mandatory for atomic drivers. */ void (*atomic_destroy_state)(struct drm_plane *plane, struct drm_plane_state *state); /** * @atomic_set_property: * * Decode a driver-private property value and store the decoded value * into the passed-in state structure. Since the atomic core decodes all * standardized properties (even for extensions beyond the core set of * properties which might not be implemented by all drivers) this * requires drivers to subclass the state structure. * * Such driver-private properties should really only be implemented for * truly hardware/vendor specific state. Instead it is preferred to * standardize atomic extension and decode the properties used to expose * such an extension in the core. * * Do not call this function directly, use * drm_atomic_plane_set_property() instead. * * This callback is optional if the driver does not support any * driver-private atomic properties. * * NOTE: * * This function is called in the state assembly phase of atomic * modesets, which can be aborted for any reason (including on * userspace's request to just check whether a configuration would be * possible). Drivers MUST NOT touch any persistent state (hardware or * software) or data structures except the passed in @state parameter. * * Also since userspace controls in which order properties are set this * function must not do any input validation (since the state update is * incomplete and hence likely inconsistent). Instead any such input * validation must be done in the various atomic_check callbacks. * * RETURNS: * * 0 if the property has been found, -EINVAL if the property isn't * implemented by the driver (which shouldn't ever happen, the core only * asks for properties attached to this plane). No other validation is * allowed by the driver. The core already checks that the property * value is within the range (integer, valid enum value, ...) the driver * set when registering the property. */ int (*atomic_set_property)(struct drm_plane *plane, struct drm_plane_state *state, struct drm_property *property, uint64_t val); /** * @atomic_get_property: * * Reads out the decoded driver-private property. This is used to * implement the GETPLANE IOCTL. * * Do not call this function directly, use * drm_atomic_plane_get_property() instead. * * This callback is optional if the driver does not support any * driver-private atomic properties. * * RETURNS: * * 0 on success, -EINVAL if the property isn't implemented by the * driver (which should never happen, the core only asks for * properties attached to this plane). */ int (*atomic_get_property)(struct drm_plane *plane, const struct drm_plane_state *state, struct drm_property *property, uint64_t *val); /** * @late_register: * * This optional hook can be used to register additional userspace * interfaces attached to the plane like debugfs interfaces. * It is called late in the driver load sequence from drm_dev_register(). * Everything added from this callback should be unregistered in * the early_unregister callback. * * Returns: * * 0 on success, or a negative error code on failure. */ int (*late_register)(struct drm_plane *plane); /** * @early_unregister: * * This optional hook should be used to unregister the additional * userspace interfaces attached to the plane from * @late_register. It is called from drm_dev_unregister(), * early in the driver unload sequence to disable userspace access * before data structures are torndown. */ void (*early_unregister)(struct drm_plane *plane); /** * @atomic_print_state: * * If driver subclasses &struct drm_plane_state, it should implement * this optional hook for printing additional driver specific state. * * Do not call this directly, use drm_atomic_plane_print_state() * instead. */ void (*atomic_print_state)(struct drm_printer *p, const struct drm_plane_state *state); /** * @format_mod_supported: * * This optional hook is used for the DRM to determine if the given * format/modifier combination is valid for the plane. This allows the * DRM to generate the correct format bitmask (which formats apply to * which modifier), and to validate modifiers at atomic_check time. * * If not present, then any modifier in the plane's modifier * list is allowed with any of the plane's formats. * * Returns: * * True if the given modifier is valid for that format on the plane. * False otherwise. */ bool (*format_mod_supported)(struct drm_plane *plane, uint32_t format, uint64_t modifier); }; /** * enum drm_plane_type - uapi plane type enumeration * * For historical reasons not all planes are made the same. This enumeration is * used to tell the different types of planes apart to implement the different * uapi semantics for them. For userspace which is universal plane aware and * which is using that atomic IOCTL there's no difference between these planes * (beyong what the driver and hardware can support of course). * * For compatibility with legacy userspace, only overlay planes are made * available to userspace by default. Userspace clients may set the * &DRM_CLIENT_CAP_UNIVERSAL_PLANES client capability bit to indicate that they * wish to receive a universal plane list containing all plane types. See also * drm_for_each_legacy_plane(). * * In addition to setting each plane's type, drivers need to setup the * &drm_crtc.primary and optionally &drm_crtc.cursor pointers for legacy * IOCTLs. See drm_crtc_init_with_planes(). * * WARNING: The values of this enum is UABI since they're exposed in the "type" * property. */ enum drm_plane_type { /** * @DRM_PLANE_TYPE_OVERLAY: * * Overlay planes represent all non-primary, non-cursor planes. Some * drivers refer to these types of planes as "sprites" internally. */ DRM_PLANE_TYPE_OVERLAY, /** * @DRM_PLANE_TYPE_PRIMARY: * * A primary plane attached to a CRTC is the most likely to be able to * light up the CRTC when no scaling/cropping is used and the plane * covers the whole CRTC. */ DRM_PLANE_TYPE_PRIMARY, /** * @DRM_PLANE_TYPE_CURSOR: * * A cursor plane attached to a CRTC is more likely to be able to be * enabled when no scaling/cropping is used and the framebuffer has the * size indicated by &drm_mode_config.cursor_width and * &drm_mode_config.cursor_height. Additionally, if the driver doesn't * support modifiers, the framebuffer should have a linear layout. */ DRM_PLANE_TYPE_CURSOR, }; /** * struct drm_plane - central DRM plane control structure * * Planes represent the scanout hardware of a display block. They receive their * input data from a &drm_framebuffer and feed it to a &drm_crtc. Planes control * the color conversion, see `Plane Composition Properties`_ for more details, * and are also involved in the color conversion of input pixels, see `Color * Management Properties`_ for details on that. */ struct drm_plane { /** @dev: DRM device this plane belongs to */ struct drm_device *dev; /** * @head: * * List of all planes on @dev, linked from &drm_mode_config.plane_list. * Invariant over the lifetime of @dev and therefore does not need * locking. */ struct list_head head; /** @name: human readable name, can be overwritten by the driver */ char *name; /** * @mutex: * * Protects modeset plane state, together with the &drm_crtc.mutex of * CRTC this plane is linked to (when active, getting activated or * getting disabled). * * For atomic drivers specifically this protects @state. */ struct drm_modeset_lock mutex; /** @base: base mode object */ struct drm_mode_object base; /** * @possible_crtcs: pipes this plane can be bound to constructed from * drm_crtc_mask() */ uint32_t possible_crtcs; /** @format_types: array of formats supported by this plane */ uint32_t *format_types; /** @format_count: Size of the array pointed at by @format_types. */ unsigned int format_count; /** * @format_default: driver hasn't supplied supported formats for the * plane. Used by the non-atomic driver compatibility wrapper only. */ bool format_default; /** @modifiers: array of modifiers supported by this plane */ uint64_t *modifiers; /** @modifier_count: Size of the array pointed at by @modifier_count. */ unsigned int modifier_count; /** * @crtc: * * Currently bound CRTC, only meaningful for non-atomic drivers. For * atomic drivers this is forced to be NULL, atomic drivers should * instead check &drm_plane_state.crtc. */ struct drm_crtc *crtc; /** * @fb: * * Currently bound framebuffer, only meaningful for non-atomic drivers. * For atomic drivers this is forced to be NULL, atomic drivers should * instead check &drm_plane_state.fb. */ struct drm_framebuffer *fb; /** * @old_fb: * * Temporary tracking of the old fb while a modeset is ongoing. Only * used by non-atomic drivers, forced to be NULL for atomic drivers. */ struct drm_framebuffer *old_fb; /** @funcs: plane control functions */ const struct drm_plane_funcs *funcs; /** @properties: property tracking for this plane */ struct drm_object_properties properties; /** @type: Type of plane, see &enum drm_plane_type for details. */ enum drm_plane_type type; /** * @index: Position inside the mode_config.list, can be used as an array * index. It is invariant over the lifetime of the plane. */ unsigned index; /** @helper_private: mid-layer private data */ const struct drm_plane_helper_funcs *helper_private; /** * @state: * * Current atomic state for this plane. * * This is protected by @mutex. Note that nonblocking atomic commits * access the current plane state without taking locks. Either by going * through the &struct drm_atomic_state pointers, see * for_each_oldnew_plane_in_state(), for_each_old_plane_in_state() and * for_each_new_plane_in_state(). Or through careful ordering of atomic * commit operations as implemented in the atomic helpers, see * &struct drm_crtc_commit. */ struct drm_plane_state *state; /** * @alpha_property: * Optional alpha property for this plane. See * drm_plane_create_alpha_property(). */ struct drm_property *alpha_property; /** * @zpos_property: * Optional zpos property for this plane. See * drm_plane_create_zpos_property(). */ struct drm_property *zpos_property; /** * @rotation_property: * Optional rotation property for this plane. See * drm_plane_create_rotation_property(). */ struct drm_property *rotation_property; /** * @blend_mode_property: * Optional "pixel blend mode" enum property for this plane. * Blend mode property represents the alpha blending equation selection, * describing how the pixels from the current plane are composited with * the background. */ struct drm_property *blend_mode_property; /** * @color_encoding_property: * * Optional "COLOR_ENCODING" enum property for specifying * color encoding for non RGB formats. * See drm_plane_create_color_properties(). */ struct drm_property *color_encoding_property; /** * @color_range_property: * * Optional "COLOR_RANGE" enum property for specifying * color range for non RGB formats. * See drm_plane_create_color_properties(). */ struct drm_property *color_range_property; /** * @scaling_filter_property: property to apply a particular filter while * scaling. */ struct drm_property *scaling_filter_property; }; #define obj_to_plane(x) container_of(x, struct drm_plane, base) __printf(9, 10) int drm_universal_plane_init(struct drm_device *dev, struct drm_plane *plane, uint32_t possible_crtcs, const struct drm_plane_funcs *funcs, const uint32_t *formats, unsigned int format_count, const uint64_t *format_modifiers, enum drm_plane_type type, const char *name, ...); void drm_plane_cleanup(struct drm_plane *plane); __printf(10, 11) void *__drmm_universal_plane_alloc(struct drm_device *dev, size_t size, size_t offset, uint32_t possible_crtcs, const struct drm_plane_funcs *funcs, const uint32_t *formats, unsigned int format_count, const uint64_t *format_modifiers, enum drm_plane_type plane_type, const char *name, ...); /** * drmm_universal_plane_alloc - Allocate and initialize an universal plane object * @dev: DRM device * @type: the type of the struct which contains struct &drm_plane * @member: the name of the &drm_plane within @type * @possible_crtcs: bitmask of possible CRTCs * @funcs: callbacks for the new plane * @formats: array of supported formats (DRM_FORMAT\_\*) * @format_count: number of elements in @formats * @format_modifiers: array of struct drm_format modifiers terminated by * DRM_FORMAT_MOD_INVALID * @plane_type: type of plane (overlay, primary, cursor) * @name: printf style format string for the plane name, or NULL for default name * * Allocates and initializes a plane object of type @type. Cleanup is * automatically handled through registering drm_plane_cleanup() with * drmm_add_action(). * * The @drm_plane_funcs.destroy hook must be NULL. * * Drivers that only support the DRM_FORMAT_MOD_LINEAR modifier support may set * @format_modifiers to NULL. The plane will advertise the linear modifier. * * Returns: * Pointer to new plane, or ERR_PTR on failure. */ #define drmm_universal_plane_alloc(dev, type, member, possible_crtcs, funcs, formats, \ format_count, format_modifiers, plane_type, name, ...) \ ((type *)__drmm_universal_plane_alloc(dev, sizeof(type), \ offsetof(type, member), \ possible_crtcs, funcs, formats, \ format_count, format_modifiers, \ plane_type, name, ##__VA_ARGS__)) __printf(10, 11) void *__drm_universal_plane_alloc(struct drm_device *dev, size_t size, size_t offset, uint32_t possible_crtcs, const struct drm_plane_funcs *funcs, const uint32_t *formats, unsigned int format_count, const uint64_t *format_modifiers, enum drm_plane_type plane_type, const char *name, ...); /** * drm_universal_plane_alloc() - Allocate and initialize an universal plane object * @dev: DRM device * @type: the type of the struct which contains struct &drm_plane * @member: the name of the &drm_plane within @type * @possible_crtcs: bitmask of possible CRTCs * @funcs: callbacks for the new plane * @formats: array of supported formats (DRM_FORMAT\_\*) * @format_count: number of elements in @formats * @format_modifiers: array of struct drm_format modifiers terminated by * DRM_FORMAT_MOD_INVALID * @plane_type: type of plane (overlay, primary, cursor) * @name: printf style format string for the plane name, or NULL for default name * * Allocates and initializes a plane object of type @type. The caller * is responsible for releasing the allocated memory with kfree(). * * Drivers are encouraged to use drmm_universal_plane_alloc() instead. * * Drivers that only support the DRM_FORMAT_MOD_LINEAR modifier support may set * @format_modifiers to NULL. The plane will advertise the linear modifier. * * Returns: * Pointer to new plane, or ERR_PTR on failure. */ #define drm_universal_plane_alloc(dev, type, member, possible_crtcs, funcs, formats, \ format_count, format_modifiers, plane_type, name, ...) \ ((type *)__drm_universal_plane_alloc(dev, sizeof(type), \ offsetof(type, member), \ possible_crtcs, funcs, formats, \ format_count, format_modifiers, \ plane_type, name, ##__VA_ARGS__)) /** * drm_plane_index - find the index of a registered plane * @plane: plane to find index for * * Given a registered plane, return the index of that plane within a DRM * device's list of planes. */ static inline unsigned int drm_plane_index(const struct drm_plane *plane) { return plane->index; } /** * drm_plane_mask - find the mask of a registered plane * @plane: plane to find mask for */ static inline u32 drm_plane_mask(const struct drm_plane *plane) { return 1 << drm_plane_index(plane); } struct drm_plane * drm_plane_from_index(struct drm_device *dev, int idx); void drm_plane_force_disable(struct drm_plane *plane); int drm_mode_plane_set_obj_prop(struct drm_plane *plane, struct drm_property *property, uint64_t value); /** * drm_plane_find - find a &drm_plane * @dev: DRM device * @file_priv: drm file to check for lease against. * @id: plane id * * Returns the plane with @id, NULL if it doesn't exist. Simple wrapper around * drm_mode_object_find(). */ static inline struct drm_plane *drm_plane_find(struct drm_device *dev, struct drm_file *file_priv, uint32_t id) { struct drm_mode_object *mo; mo = drm_mode_object_find(dev, file_priv, id, DRM_MODE_OBJECT_PLANE); return mo ? obj_to_plane(mo) : NULL; } /** * drm_for_each_plane_mask - iterate over planes specified by bitmask * @plane: the loop cursor * @dev: the DRM device * @plane_mask: bitmask of plane indices * * Iterate over all planes specified by bitmask. */ #define drm_for_each_plane_mask(plane, dev, plane_mask) \ list_for_each_entry((plane), &(dev)->mode_config.plane_list, head) \ for_each_if ((plane_mask) & drm_plane_mask(plane)) /** * drm_for_each_legacy_plane - iterate over all planes for legacy userspace * @plane: the loop cursor * @dev: the DRM device * * Iterate over all legacy planes of @dev, excluding primary and cursor planes. * This is useful for implementing userspace apis when userspace is not * universal plane aware. See also &enum drm_plane_type. */ #define drm_for_each_legacy_plane(plane, dev) \ list_for_each_entry(plane, &(dev)->mode_config.plane_list, head) \ for_each_if (plane->type == DRM_PLANE_TYPE_OVERLAY) /** * drm_for_each_plane - iterate over all planes * @plane: the loop cursor * @dev: the DRM device * * Iterate over all planes of @dev, include primary and cursor planes. */ #define drm_for_each_plane(plane, dev) \ list_for_each_entry(plane, &(dev)->mode_config.plane_list, head) bool drm_any_plane_has_format(struct drm_device *dev, u32 format, u64 modifier); void drm_plane_enable_fb_damage_clips(struct drm_plane *plane); unsigned int drm_plane_get_damage_clips_count(const struct drm_plane_state *state); struct drm_mode_rect * drm_plane_get_damage_clips(const struct drm_plane_state *state); int drm_plane_create_scaling_filter_property(struct drm_plane *plane, unsigned int supported_filters); #endif |
| 39 39 38 3 1 2 38 38 39 39 38 38 43 43 43 43 43 43 43 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2012 Red Hat * * based in parts on udlfb.c: * Copyright (C) 2009 Roberto De Ioris <roberto@unbit.it> * Copyright (C) 2009 Jaya Kumar <jayakumar.lkml@gmail.com> * Copyright (C) 2009 Bernie Thompson <bernie@plugable.com> */ #include <linux/bitfield.h> #include <drm/drm_atomic.h> #include <drm/drm_atomic_helper.h> #include <drm/drm_crtc_helper.h> #include <drm/drm_damage_helper.h> #include <drm/drm_drv.h> #include <drm/drm_edid.h> #include <drm/drm_fourcc.h> #include <drm/drm_gem_atomic_helper.h> #include <drm/drm_gem_framebuffer_helper.h> #include <drm/drm_gem_shmem_helper.h> #include <drm/drm_modeset_helper_vtables.h> #include <drm/drm_plane_helper.h> #include <drm/drm_probe_helper.h> #include <drm/drm_vblank.h> #include "udl_drv.h" #include "udl_proto.h" /* * All DisplayLink bulk operations start with 0xaf (UDL_MSG_BULK), followed by * a specific command code. All operations are written to a command buffer, which * the driver sends to the device. */ static char *udl_set_register(char *buf, u8 reg, u8 val) { *buf++ = UDL_MSG_BULK; *buf++ = UDL_CMD_WRITEREG; *buf++ = reg; *buf++ = val; return buf; } static char *udl_vidreg_lock(char *buf) { return udl_set_register(buf, UDL_REG_VIDREG, UDL_VIDREG_LOCK); } static char *udl_vidreg_unlock(char *buf) { return udl_set_register(buf, UDL_REG_VIDREG, UDL_VIDREG_UNLOCK); } static char *udl_set_blank_mode(char *buf, u8 mode) { return udl_set_register(buf, UDL_REG_BLANKMODE, mode); } static char *udl_set_color_depth(char *buf, u8 selection) { return udl_set_register(buf, UDL_REG_COLORDEPTH, selection); } static char *udl_set_base16bpp(char *buf, u32 base) { /* the base pointer is 24 bits wide, 0x20 is hi byte. */ u8 reg20 = FIELD_GET(UDL_BASE_ADDR2_MASK, base); u8 reg21 = FIELD_GET(UDL_BASE_ADDR1_MASK, base); u8 reg22 = FIELD_GET(UDL_BASE_ADDR0_MASK, base); buf = udl_set_register(buf, UDL_REG_BASE16BPP_ADDR2, reg20); buf = udl_set_register(buf, UDL_REG_BASE16BPP_ADDR1, reg21); buf = udl_set_register(buf, UDL_REG_BASE16BPP_ADDR0, reg22); return buf; } /* * DisplayLink HW has separate 16bpp and 8bpp framebuffers. * In 24bpp modes, the low 323 RGB bits go in the 8bpp framebuffer */ static char *udl_set_base8bpp(char *buf, u32 base) { /* the base pointer is 24 bits wide, 0x26 is hi byte. */ u8 reg26 = FIELD_GET(UDL_BASE_ADDR2_MASK, base); u8 reg27 = FIELD_GET(UDL_BASE_ADDR1_MASK, base); u8 reg28 = FIELD_GET(UDL_BASE_ADDR0_MASK, base); buf = udl_set_register(buf, UDL_REG_BASE8BPP_ADDR2, reg26); buf = udl_set_register(buf, UDL_REG_BASE8BPP_ADDR1, reg27); buf = udl_set_register(buf, UDL_REG_BASE8BPP_ADDR0, reg28); return buf; } static char *udl_set_register_16(char *wrptr, u8 reg, u16 value) { wrptr = udl_set_register(wrptr, reg, value >> 8); return udl_set_register(wrptr, reg+1, value); } /* * This is kind of weird because the controller takes some * register values in a different byte order than other registers. */ static char *udl_set_register_16be(char *wrptr, u8 reg, u16 value) { wrptr = udl_set_register(wrptr, reg, value); return udl_set_register(wrptr, reg+1, value >> 8); } /* * LFSR is linear feedback shift register. The reason we have this is * because the display controller needs to minimize the clock depth of * various counters used in the display path. So this code reverses the * provided value into the lfsr16 value by counting backwards to get * the value that needs to be set in the hardware comparator to get the * same actual count. This makes sense once you read above a couple of * times and think about it from a hardware perspective. */ static u16 udl_lfsr16(u16 actual_count) { u32 lv = 0xFFFF; /* This is the lfsr value that the hw starts with */ while (actual_count--) { lv = ((lv << 1) | (((lv >> 15) ^ (lv >> 4) ^ (lv >> 2) ^ (lv >> 1)) & 1)) & 0xFFFF; } return (u16) lv; } /* * This does LFSR conversion on the value that is to be written. * See LFSR explanation above for more detail. */ static char *udl_set_register_lfsr16(char *wrptr, u8 reg, u16 value) { return udl_set_register_16(wrptr, reg, udl_lfsr16(value)); } /* * Takes a DRM display mode and converts it into the DisplayLink * equivalent register commands. */ static char *udl_set_display_mode(char *buf, struct drm_display_mode *mode) { u16 reg01 = mode->crtc_htotal - mode->crtc_hsync_start; u16 reg03 = reg01 + mode->crtc_hdisplay; u16 reg05 = mode->crtc_vtotal - mode->crtc_vsync_start; u16 reg07 = reg05 + mode->crtc_vdisplay; u16 reg09 = mode->crtc_htotal - 1; u16 reg0b = 1; /* libdlo hardcodes hsync start to 1 */ u16 reg0d = mode->crtc_hsync_end - mode->crtc_hsync_start + 1; u16 reg0f = mode->hdisplay; u16 reg11 = mode->crtc_vtotal; u16 reg13 = 0; /* libdlo hardcodes vsync start to 0 */ u16 reg15 = mode->crtc_vsync_end - mode->crtc_vsync_start; u16 reg17 = mode->crtc_vdisplay; u16 reg1b = mode->clock / 5; buf = udl_set_register_lfsr16(buf, UDL_REG_XDISPLAYSTART, reg01); buf = udl_set_register_lfsr16(buf, UDL_REG_XDISPLAYEND, reg03); buf = udl_set_register_lfsr16(buf, UDL_REG_YDISPLAYSTART, reg05); buf = udl_set_register_lfsr16(buf, UDL_REG_YDISPLAYEND, reg07); buf = udl_set_register_lfsr16(buf, UDL_REG_XENDCOUNT, reg09); buf = udl_set_register_lfsr16(buf, UDL_REG_HSYNCSTART, reg0b); buf = udl_set_register_lfsr16(buf, UDL_REG_HSYNCEND, reg0d); buf = udl_set_register_16(buf, UDL_REG_HPIXELS, reg0f); buf = udl_set_register_lfsr16(buf, UDL_REG_YENDCOUNT, reg11); buf = udl_set_register_lfsr16(buf, UDL_REG_VSYNCSTART, reg13); buf = udl_set_register_lfsr16(buf, UDL_REG_VSYNCEND, reg15); buf = udl_set_register_16(buf, UDL_REG_VPIXELS, reg17); buf = udl_set_register_16be(buf, UDL_REG_PIXELCLOCK5KHZ, reg1b); return buf; } static char *udl_dummy_render(char *wrptr) { *wrptr++ = UDL_MSG_BULK; *wrptr++ = UDL_CMD_WRITECOPY16; *wrptr++ = 0x00; /* from addr */ *wrptr++ = 0x00; *wrptr++ = 0x00; *wrptr++ = 0x01; /* one pixel */ *wrptr++ = 0x00; /* to address */ *wrptr++ = 0x00; *wrptr++ = 0x00; return wrptr; } static long udl_log_cpp(unsigned int cpp) { if (WARN_ON(!is_power_of_2(cpp))) return -EINVAL; return __ffs(cpp); } static int udl_handle_damage(struct drm_framebuffer *fb, const struct iosys_map *map, const struct drm_rect *clip) { struct drm_device *dev = fb->dev; void *vaddr = map->vaddr; /* TODO: Use mapping abstraction properly */ int i, ret; char *cmd; struct urb *urb; int log_bpp; ret = udl_log_cpp(fb->format->cpp[0]); if (ret < 0) return ret; log_bpp = ret; urb = udl_get_urb(dev); if (!urb) return -ENOMEM; cmd = urb->transfer_buffer; for (i = clip->y1; i < clip->y2; i++) { const int line_offset = fb->pitches[0] * i; const int byte_offset = line_offset + (clip->x1 << log_bpp); const int dev_byte_offset = (fb->width * i + clip->x1) << log_bpp; const int byte_width = drm_rect_width(clip) << log_bpp; ret = udl_render_hline(dev, log_bpp, &urb, (char *)vaddr, &cmd, byte_offset, dev_byte_offset, byte_width); if (ret) return ret; } if (cmd > (char *)urb->transfer_buffer) { /* Send partial buffer remaining before exiting */ int len; if (cmd < (char *)urb->transfer_buffer + urb->transfer_buffer_length) *cmd++ = UDL_MSG_BULK; len = cmd - (char *)urb->transfer_buffer; ret = udl_submit_urb(dev, urb, len); } else { udl_urb_completion(urb); } return 0; } /* * Primary plane */ static const uint32_t udl_primary_plane_formats[] = { DRM_FORMAT_RGB565, DRM_FORMAT_XRGB8888, }; static const uint64_t udl_primary_plane_fmtmods[] = { DRM_FORMAT_MOD_LINEAR, DRM_FORMAT_MOD_INVALID }; static void udl_primary_plane_helper_atomic_update(struct drm_plane *plane, struct drm_atomic_state *state) { struct drm_device *dev = plane->dev; struct drm_plane_state *plane_state = drm_atomic_get_new_plane_state(state, plane); struct drm_shadow_plane_state *shadow_plane_state = to_drm_shadow_plane_state(plane_state); struct drm_framebuffer *fb = plane_state->fb; struct drm_plane_state *old_plane_state = drm_atomic_get_old_plane_state(state, plane); struct drm_atomic_helper_damage_iter iter; struct drm_rect damage; int ret, idx; if (!fb) return; /* no framebuffer; plane is disabled */ ret = drm_gem_fb_begin_cpu_access(fb, DMA_FROM_DEVICE); if (ret) return; if (!drm_dev_enter(dev, &idx)) goto out_drm_gem_fb_end_cpu_access; drm_atomic_helper_damage_iter_init(&iter, old_plane_state, plane_state); drm_atomic_for_each_plane_damage(&iter, &damage) { udl_handle_damage(fb, &shadow_plane_state->data[0], &damage); } drm_dev_exit(idx); out_drm_gem_fb_end_cpu_access: drm_gem_fb_end_cpu_access(fb, DMA_FROM_DEVICE); } static const struct drm_plane_helper_funcs udl_primary_plane_helper_funcs = { DRM_GEM_SHADOW_PLANE_HELPER_FUNCS, .atomic_check = drm_plane_helper_atomic_check, .atomic_update = udl_primary_plane_helper_atomic_update, }; static const struct drm_plane_funcs udl_primary_plane_funcs = { .update_plane = drm_atomic_helper_update_plane, .disable_plane = drm_atomic_helper_disable_plane, .destroy = drm_plane_cleanup, DRM_GEM_SHADOW_PLANE_FUNCS, }; /* * CRTC */ static void udl_crtc_helper_atomic_enable(struct drm_crtc *crtc, struct drm_atomic_state *state) { struct drm_device *dev = crtc->dev; struct drm_crtc_state *crtc_state = drm_atomic_get_new_crtc_state(state, crtc); struct drm_display_mode *mode = &crtc_state->mode; struct urb *urb; char *buf; int idx; if (!drm_dev_enter(dev, &idx)) return; urb = udl_get_urb(dev); if (!urb) goto out; buf = (char *)urb->transfer_buffer; buf = udl_vidreg_lock(buf); buf = udl_set_color_depth(buf, UDL_COLORDEPTH_16BPP); /* set base for 16bpp segment to 0 */ buf = udl_set_base16bpp(buf, 0); /* set base for 8bpp segment to end of fb */ buf = udl_set_base8bpp(buf, 2 * mode->vdisplay * mode->hdisplay); buf = udl_set_display_mode(buf, mode); buf = udl_set_blank_mode(buf, UDL_BLANKMODE_ON); buf = udl_vidreg_unlock(buf); buf = udl_dummy_render(buf); udl_submit_urb(dev, urb, buf - (char *)urb->transfer_buffer); out: drm_dev_exit(idx); } static void udl_crtc_helper_atomic_disable(struct drm_crtc *crtc, struct drm_atomic_state *state) { struct drm_device *dev = crtc->dev; struct urb *urb; char *buf; int idx; if (!drm_dev_enter(dev, &idx)) return; urb = udl_get_urb(dev); if (!urb) goto out; buf = (char *)urb->transfer_buffer; buf = udl_vidreg_lock(buf); buf = udl_set_blank_mode(buf, UDL_BLANKMODE_POWERDOWN); buf = udl_vidreg_unlock(buf); buf = udl_dummy_render(buf); udl_submit_urb(dev, urb, buf - (char *)urb->transfer_buffer); out: drm_dev_exit(idx); } static const struct drm_crtc_helper_funcs udl_crtc_helper_funcs = { .atomic_check = drm_crtc_helper_atomic_check, .atomic_enable = udl_crtc_helper_atomic_enable, .atomic_disable = udl_crtc_helper_atomic_disable, }; static const struct drm_crtc_funcs udl_crtc_funcs = { .reset = drm_atomic_helper_crtc_reset, .destroy = drm_crtc_cleanup, .set_config = drm_atomic_helper_set_config, .page_flip = drm_atomic_helper_page_flip, .atomic_duplicate_state = drm_atomic_helper_crtc_duplicate_state, .atomic_destroy_state = drm_atomic_helper_crtc_destroy_state, }; /* * Encoder */ static const struct drm_encoder_funcs udl_encoder_funcs = { .destroy = drm_encoder_cleanup, }; /* * Connector */ static int udl_connector_helper_get_modes(struct drm_connector *connector) { struct udl_connector *udl_connector = to_udl_connector(connector); drm_connector_update_edid_property(connector, udl_connector->edid); if (udl_connector->edid) return drm_add_edid_modes(connector, udl_connector->edid); return 0; } static const struct drm_connector_helper_funcs udl_connector_helper_funcs = { .get_modes = udl_connector_helper_get_modes, }; static int udl_get_edid_block(void *data, u8 *buf, unsigned int block, size_t len) { struct udl_device *udl = data; struct drm_device *dev = &udl->drm; struct usb_device *udev = udl_to_usb_device(udl); u8 *read_buff; int ret; size_t i; read_buff = kmalloc(2, GFP_KERNEL); if (!read_buff) return -ENOMEM; for (i = 0; i < len; i++) { int bval = (i + block * EDID_LENGTH) << 8; ret = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), 0x02, (0x80 | (0x02 << 5)), bval, 0xA1, read_buff, 2, USB_CTRL_GET_TIMEOUT); if (ret < 0) { drm_err(dev, "Read EDID byte %zu failed err %x\n", i, ret); goto err_kfree; } else if (ret < 1) { ret = -EIO; drm_err(dev, "Read EDID byte %zu failed\n", i); goto err_kfree; } buf[i] = read_buff[1]; } kfree(read_buff); return 0; err_kfree: kfree(read_buff); return ret; } static enum drm_connector_status udl_connector_detect(struct drm_connector *connector, bool force) { struct drm_device *dev = connector->dev; struct udl_device *udl = to_udl(dev); struct udl_connector *udl_connector = to_udl_connector(connector); enum drm_connector_status status = connector_status_disconnected; int idx; /* cleanup previous EDID */ kfree(udl_connector->edid); udl_connector->edid = NULL; if (!drm_dev_enter(dev, &idx)) return connector_status_disconnected; udl_connector->edid = drm_do_get_edid(connector, udl_get_edid_block, udl); if (udl_connector->edid) status = connector_status_connected; drm_dev_exit(idx); return status; } static void udl_connector_destroy(struct drm_connector *connector) { struct udl_connector *udl_connector = to_udl_connector(connector); drm_connector_cleanup(connector); kfree(udl_connector->edid); kfree(udl_connector); } static const struct drm_connector_funcs udl_connector_funcs = { .reset = drm_atomic_helper_connector_reset, .detect = udl_connector_detect, .fill_modes = drm_helper_probe_single_connector_modes, .destroy = udl_connector_destroy, .atomic_duplicate_state = drm_atomic_helper_connector_duplicate_state, .atomic_destroy_state = drm_atomic_helper_connector_destroy_state, }; struct drm_connector *udl_connector_init(struct drm_device *dev) { struct udl_connector *udl_connector; struct drm_connector *connector; int ret; udl_connector = kzalloc(sizeof(*udl_connector), GFP_KERNEL); if (!udl_connector) return ERR_PTR(-ENOMEM); connector = &udl_connector->connector; ret = drm_connector_init(dev, connector, &udl_connector_funcs, DRM_MODE_CONNECTOR_VGA); if (ret) goto err_kfree; drm_connector_helper_add(connector, &udl_connector_helper_funcs); connector->polled = DRM_CONNECTOR_POLL_HPD | DRM_CONNECTOR_POLL_CONNECT | DRM_CONNECTOR_POLL_DISCONNECT; return connector; err_kfree: kfree(udl_connector); return ERR_PTR(ret); } /* * Modesetting */ static enum drm_mode_status udl_mode_config_mode_valid(struct drm_device *dev, const struct drm_display_mode *mode) { struct udl_device *udl = to_udl(dev); if (udl->sku_pixel_limit) { if (mode->vdisplay * mode->hdisplay > udl->sku_pixel_limit) return MODE_MEM; } return MODE_OK; } static const struct drm_mode_config_funcs udl_mode_config_funcs = { .fb_create = drm_gem_fb_create_with_dirty, .mode_valid = udl_mode_config_mode_valid, .atomic_check = drm_atomic_helper_check, .atomic_commit = drm_atomic_helper_commit, }; int udl_modeset_init(struct drm_device *dev) { struct udl_device *udl = to_udl(dev); struct drm_plane *primary_plane; struct drm_crtc *crtc; struct drm_encoder *encoder; struct drm_connector *connector; int ret; ret = drmm_mode_config_init(dev); if (ret) return ret; dev->mode_config.min_width = 640; dev->mode_config.min_height = 480; dev->mode_config.max_width = 2048; dev->mode_config.max_height = 2048; dev->mode_config.preferred_depth = 16; dev->mode_config.funcs = &udl_mode_config_funcs; primary_plane = &udl->primary_plane; ret = drm_universal_plane_init(dev, primary_plane, 0, &udl_primary_plane_funcs, udl_primary_plane_formats, ARRAY_SIZE(udl_primary_plane_formats), udl_primary_plane_fmtmods, DRM_PLANE_TYPE_PRIMARY, NULL); if (ret) return ret; drm_plane_helper_add(primary_plane, &udl_primary_plane_helper_funcs); drm_plane_enable_fb_damage_clips(primary_plane); crtc = &udl->crtc; ret = drm_crtc_init_with_planes(dev, crtc, primary_plane, NULL, &udl_crtc_funcs, NULL); if (ret) return ret; drm_crtc_helper_add(crtc, &udl_crtc_helper_funcs); encoder = &udl->encoder; ret = drm_encoder_init(dev, encoder, &udl_encoder_funcs, DRM_MODE_ENCODER_DAC, NULL); if (ret) return ret; encoder->possible_crtcs = drm_crtc_mask(crtc); connector = udl_connector_init(dev); if (IS_ERR(connector)) return PTR_ERR(connector); ret = drm_connector_attach_encoder(connector, encoder); if (ret) return ret; drm_mode_config_reset(dev); return 0; } |
| 2692 2692 2695 2695 2693 2694 2694 2694 2691 2691 506 506 505 506 506 506 506 506 2693 506 2262 506 285 384 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/export.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/user_namespace.h> #include <linux/proc_ns.h> #include <linux/highuid.h> #include <linux/cred.h> #include <linux/securebits.h> #include <linux/security.h> #include <linux/keyctl.h> #include <linux/key-type.h> #include <keys/user-type.h> #include <linux/seq_file.h> #include <linux/fs.h> #include <linux/uaccess.h> #include <linux/ctype.h> #include <linux/projid.h> #include <linux/fs_struct.h> #include <linux/bsearch.h> #include <linux/sort.h> static struct kmem_cache *user_ns_cachep __read_mostly; static DEFINE_MUTEX(userns_state_mutex); static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *map); static void free_user_ns(struct work_struct *work); static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid) { return inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES); } static void dec_user_namespaces(struct ucounts *ucounts) { return dec_ucount(ucounts, UCOUNT_USER_NAMESPACES); } static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) { /* Start with the same capabilities as init but useless for doing * anything as the capabilities are bound to the new user namespace. */ cred->securebits = SECUREBITS_DEFAULT; cred->cap_inheritable = CAP_EMPTY_SET; cred->cap_permitted = CAP_FULL_SET; cred->cap_effective = CAP_FULL_SET; cred->cap_ambient = CAP_EMPTY_SET; cred->cap_bset = CAP_FULL_SET; #ifdef CONFIG_KEYS key_put(cred->request_key_auth); cred->request_key_auth = NULL; #endif /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ cred->user_ns = user_ns; } static unsigned long enforced_nproc_rlimit(void) { unsigned long limit = RLIM_INFINITY; /* Is RLIMIT_NPROC currently enforced? */ if (!uid_eq(current_uid(), GLOBAL_ROOT_UID) || (current_user_ns() != &init_user_ns)) limit = rlimit(RLIMIT_NPROC); return limit; } /* * Create a new user namespace, deriving the creator from the user in the * passed credentials, and replacing that user with the new root user for the * new namespace. * * This is called by copy_creds(), which will finish setting the target task's * credentials. */ int create_user_ns(struct cred *new) { struct user_namespace *ns, *parent_ns = new->user_ns; kuid_t owner = new->euid; kgid_t group = new->egid; struct ucounts *ucounts; int ret, i; ret = -ENOSPC; if (parent_ns->level > 32) goto fail; ucounts = inc_user_namespaces(parent_ns, owner); if (!ucounts) goto fail; /* * Verify that we can not violate the policy of which files * may be accessed that is specified by the root directory, * by verifying that the root directory is at the root of the * mount namespace which allows all files to be accessed. */ ret = -EPERM; if (current_chrooted()) goto fail_dec; /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who * created a user_namespace. */ ret = -EPERM; if (!kuid_has_mapping(parent_ns, owner) || !kgid_has_mapping(parent_ns, group)) goto fail_dec; ret = security_create_user_ns(new); if (ret < 0) goto fail_dec; ret = -ENOMEM; ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); if (!ns) goto fail_dec; ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP); ret = ns_alloc_inum(&ns->ns); if (ret) goto fail_free; ns->ns.ops = &userns_operations; refcount_set(&ns->ns.count, 1); /* Leave the new->user_ns reference with the new user namespace. */ ns->parent = parent_ns; ns->level = parent_ns->level + 1; ns->owner = owner; ns->group = group; INIT_WORK(&ns->work, free_user_ns); for (i = 0; i < UCOUNT_COUNTS; i++) { ns->ucount_max[i] = INT_MAX; } set_userns_rlimit_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit()); set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE)); set_userns_rlimit_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING)); set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK)); ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ mutex_lock(&userns_state_mutex); ns->flags = parent_ns->flags; mutex_unlock(&userns_state_mutex); #ifdef CONFIG_KEYS INIT_LIST_HEAD(&ns->keyring_name_list); init_rwsem(&ns->keyring_sem); #endif ret = -ENOMEM; if (!setup_userns_sysctls(ns)) goto fail_keyring; set_cred_user_ns(new, ns); return 0; fail_keyring: #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif ns_free_inum(&ns->ns); fail_free: kmem_cache_free(user_ns_cachep, ns); fail_dec: dec_user_namespaces(ucounts); fail: return ret; } int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) { struct cred *cred; int err = -ENOMEM; if (!(unshare_flags & CLONE_NEWUSER)) return 0; cred = prepare_creds(); if (cred) { err = create_user_ns(cred); if (err) put_cred(cred); else *new_cred = cred; } return err; } static void free_user_ns(struct work_struct *work) { struct user_namespace *parent, *ns = container_of(work, struct user_namespace, work); do { struct ucounts *ucounts = ns->ucounts; parent = ns->parent; if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->gid_map.forward); kfree(ns->gid_map.reverse); } if (ns->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->uid_map.forward); kfree(ns->uid_map.reverse); } if (ns->projid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->projid_map.forward); kfree(ns->projid_map.reverse); } retire_userns_sysctls(ns); key_free_user_ns(ns); ns_free_inum(&ns->ns); kmem_cache_free(user_ns_cachep, ns); dec_user_namespaces(ucounts); ns = parent; } while (refcount_dec_and_test(&parent->ns.count)); } void __put_user_ns(struct user_namespace *ns) { schedule_work(&ns->work); } EXPORT_SYMBOL(__put_user_ns); /** * struct idmap_key - holds the information necessary to find an idmapping in a * sorted idmap array. It is passed to cmp_map_id() as first argument. */ struct idmap_key { bool map_up; /* true -> id from kid; false -> kid from id */ u32 id; /* id to find */ u32 count; /* == 0 unless used with map_id_range_down() */ }; /** * cmp_map_id - Function to be passed to bsearch() to find the requested * idmapping. Expects struct idmap_key to be passed via @k. */ static int cmp_map_id(const void *k, const void *e) { u32 first, last, id2; const struct idmap_key *key = k; const struct uid_gid_extent *el = e; id2 = key->id + key->count - 1; /* handle map_id_{down,up}() */ if (key->map_up) first = el->lower_first; else first = el->first; last = first + el->count - 1; if (key->id >= first && key->id <= last && (id2 >= first && id2 <= last)) return 0; if (key->id < first || id2 < first) return -1; return 1; } /** * map_id_range_down_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { struct idmap_key key; key.map_up = false; key.count = count; key.id = id; return bsearch(&key, map->forward, extents, sizeof(struct uid_gid_extent), cmp_map_id); } /** * map_id_range_down_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_down_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { unsigned idx; u32 first, last, id2; id2 = id + count - 1; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last && (id2 >= first && id2 <= last)) return &map->extent[idx]; } return NULL; } static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = map_id_range_down_base(extents, map, id, count); else extent = map_id_range_down_max(extents, map, id, count); /* Map the id or note failure */ if (extent) id = (id - extent->first) + extent->lower_first; else id = (u32) -1; return id; } static u32 map_id_down(struct uid_gid_map *map, u32 id) { return map_id_range_down(map, id, 1); } /** * map_id_up_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id) { unsigned idx; u32 first, last; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].lower_first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last) return &map->extent[idx]; } return NULL; } /** * map_id_up_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id) { struct idmap_key key; key.map_up = true; key.count = 1; key.id = id; return bsearch(&key, map->reverse, extents, sizeof(struct uid_gid_extent), cmp_map_id); } static u32 map_id_up(struct uid_gid_map *map, u32 id) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = map_id_up_base(extents, map, id); else extent = map_id_up_max(extents, map, id); /* Map the id or note failure */ if (extent) id = (id - extent->lower_first) + extent->first; else id = (u32) -1; return id; } /** * make_kuid - Map a user-namespace uid pair into a kuid. * @ns: User namespace that the uid is in * @uid: User identifier * * Maps a user-namespace uid pair into a kernel internal kuid, * and returns that kuid. * * When there is no mapping defined for the user-namespace uid * pair INVALID_UID is returned. Callers are expected to test * for and handle INVALID_UID being returned. INVALID_UID * may be tested for using uid_valid(). */ kuid_t make_kuid(struct user_namespace *ns, uid_t uid) { /* Map the uid to a global kernel uid */ return KUIDT_INIT(map_id_down(&ns->uid_map, uid)); } EXPORT_SYMBOL(make_kuid); /** * from_kuid - Create a uid from a kuid user-namespace pair. * @targ: The user namespace we want a uid in. * @kuid: The kernel internal uid to start with. * * Map @kuid into the user-namespace specified by @targ and * return the resulting uid. * * There is always a mapping into the initial user_namespace. * * If @kuid has no mapping in @targ (uid_t)-1 is returned. */ uid_t from_kuid(struct user_namespace *targ, kuid_t kuid) { /* Map the uid from a global kernel uid */ return map_id_up(&targ->uid_map, __kuid_val(kuid)); } EXPORT_SYMBOL(from_kuid); /** * from_kuid_munged - Create a uid from a kuid user-namespace pair. * @targ: The user namespace we want a uid in. * @kuid: The kernel internal uid to start with. * * Map @kuid into the user-namespace specified by @targ and * return the resulting uid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kuid from_kuid_munged never fails and always * returns a valid uid. This makes from_kuid_munged appropriate * for use in syscalls like stat and getuid where failing the * system call and failing to provide a valid uid are not an * options. * * If @kuid has no mapping in @targ overflowuid is returned. */ uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid) { uid_t uid; uid = from_kuid(targ, kuid); if (uid == (uid_t) -1) uid = overflowuid; return uid; } EXPORT_SYMBOL(from_kuid_munged); /** * make_kgid - Map a user-namespace gid pair into a kgid. * @ns: User namespace that the gid is in * @gid: group identifier * * Maps a user-namespace gid pair into a kernel internal kgid, * and returns that kgid. * * When there is no mapping defined for the user-namespace gid * pair INVALID_GID is returned. Callers are expected to test * for and handle INVALID_GID being returned. INVALID_GID may be * tested for using gid_valid(). */ kgid_t make_kgid(struct user_namespace *ns, gid_t gid) { /* Map the gid to a global kernel gid */ return KGIDT_INIT(map_id_down(&ns->gid_map, gid)); } EXPORT_SYMBOL(make_kgid); /** * from_kgid - Create a gid from a kgid user-namespace pair. * @targ: The user namespace we want a gid in. * @kgid: The kernel internal gid to start with. * * Map @kgid into the user-namespace specified by @targ and * return the resulting gid. * * There is always a mapping into the initial user_namespace. * * If @kgid has no mapping in @targ (gid_t)-1 is returned. */ gid_t from_kgid(struct user_namespace *targ, kgid_t kgid) { /* Map the gid from a global kernel gid */ return map_id_up(&targ->gid_map, __kgid_val(kgid)); } EXPORT_SYMBOL(from_kgid); /** * from_kgid_munged - Create a gid from a kgid user-namespace pair. * @targ: The user namespace we want a gid in. * @kgid: The kernel internal gid to start with. * * Map @kgid into the user-namespace specified by @targ and * return the resulting gid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kgid from_kgid_munged never fails and always * returns a valid gid. This makes from_kgid_munged appropriate * for use in syscalls like stat and getgid where failing the * system call and failing to provide a valid gid are not options. * * If @kgid has no mapping in @targ overflowgid is returned. */ gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid) { gid_t gid; gid = from_kgid(targ, kgid); if (gid == (gid_t) -1) gid = overflowgid; return gid; } EXPORT_SYMBOL(from_kgid_munged); /** * make_kprojid - Map a user-namespace projid pair into a kprojid. * @ns: User namespace that the projid is in * @projid: Project identifier * * Maps a user-namespace uid pair into a kernel internal kuid, * and returns that kuid. * * When there is no mapping defined for the user-namespace projid * pair INVALID_PROJID is returned. Callers are expected to test * for and handle INVALID_PROJID being returned. INVALID_PROJID * may be tested for using projid_valid(). */ kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid) { /* Map the uid to a global kernel uid */ return KPROJIDT_INIT(map_id_down(&ns->projid_map, projid)); } EXPORT_SYMBOL(make_kprojid); /** * from_kprojid - Create a projid from a kprojid user-namespace pair. * @targ: The user namespace we want a projid in. * @kprojid: The kernel internal project identifier to start with. * * Map @kprojid into the user-namespace specified by @targ and * return the resulting projid. * * There is always a mapping into the initial user_namespace. * * If @kprojid has no mapping in @targ (projid_t)-1 is returned. */ projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid) { /* Map the uid from a global kernel uid */ return map_id_up(&targ->projid_map, __kprojid_val(kprojid)); } EXPORT_SYMBOL(from_kprojid); /** * from_kprojid_munged - Create a projiid from a kprojid user-namespace pair. * @targ: The user namespace we want a projid in. * @kprojid: The kernel internal projid to start with. * * Map @kprojid into the user-namespace specified by @targ and * return the resulting projid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kprojid from_kprojid_munged never fails and always * returns a valid projid. This makes from_kprojid_munged * appropriate for use in syscalls like stat and where * failing the system call and failing to provide a valid projid are * not an options. * * If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned. */ projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid) { projid_t projid; projid = from_kprojid(targ, kprojid); if (projid == (projid_t) -1) projid = OVERFLOW_PROJID; return projid; } EXPORT_SYMBOL(from_kprojid_munged); static int uid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; uid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static int gid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; gid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static int projid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; projid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) { loff_t pos = *ppos; unsigned extents = map->nr_extents; smp_rmb(); if (pos >= extents) return NULL; if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) return &map->extent[pos]; return &map->forward[pos]; } static void *uid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->uid_map); } static void *gid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->gid_map); } static void *projid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->projid_map); } static void *m_next(struct seq_file *seq, void *v, loff_t *pos) { (*pos)++; return seq->op->start(seq, pos); } static void m_stop(struct seq_file *seq, void *v) { return; } const struct seq_operations proc_uid_seq_operations = { .start = uid_m_start, .stop = m_stop, .next = m_next, .show = uid_m_show, }; const struct seq_operations proc_gid_seq_operations = { .start = gid_m_start, .stop = m_stop, .next = m_next, .show = gid_m_show, }; const struct seq_operations proc_projid_seq_operations = { .start = projid_m_start, .stop = m_stop, .next = m_next, .show = projid_m_show, }; static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent) { u32 upper_first, lower_first, upper_last, lower_last; unsigned idx; upper_first = extent->first; lower_first = extent->lower_first; upper_last = upper_first + extent->count - 1; lower_last = lower_first + extent->count - 1; for (idx = 0; idx < new_map->nr_extents; idx++) { u32 prev_upper_first, prev_lower_first; u32 prev_upper_last, prev_lower_last; struct uid_gid_extent *prev; if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) prev = &new_map->extent[idx]; else prev = &new_map->forward[idx]; prev_upper_first = prev->first; prev_lower_first = prev->lower_first; prev_upper_last = prev_upper_first + prev->count - 1; prev_lower_last = prev_lower_first + prev->count - 1; /* Does the upper range intersect a previous extent? */ if ((prev_upper_first <= upper_last) && (prev_upper_last >= upper_first)) return true; /* Does the lower range intersect a previous extent? */ if ((prev_lower_first <= lower_last) && (prev_lower_last >= lower_first)) return true; } return false; } /** * insert_extent - Safely insert a new idmap extent into struct uid_gid_map. * Takes care to allocate a 4K block of memory if the number of mappings exceeds * UID_GID_MAP_MAX_BASE_EXTENTS. */ static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent) { struct uid_gid_extent *dest; if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) { struct uid_gid_extent *forward; /* Allocate memory for 340 mappings. */ forward = kmalloc_array(UID_GID_MAP_MAX_EXTENTS, sizeof(struct uid_gid_extent), GFP_KERNEL); if (!forward) return -ENOMEM; /* Copy over memory. Only set up memory for the forward pointer. * Defer the memory setup for the reverse pointer. */ memcpy(forward, map->extent, map->nr_extents * sizeof(map->extent[0])); map->forward = forward; map->reverse = NULL; } if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS) dest = &map->extent[map->nr_extents]; else dest = &map->forward[map->nr_extents]; *dest = *extent; map->nr_extents++; return 0; } /* cmp function to sort() forward mappings */ static int cmp_extents_forward(const void *a, const void *b) { const struct uid_gid_extent *e1 = a; const struct uid_gid_extent *e2 = b; if (e1->first < e2->first) return -1; if (e1->first > e2->first) return 1; return 0; } /* cmp function to sort() reverse mappings */ static int cmp_extents_reverse(const void *a, const void *b) { const struct uid_gid_extent *e1 = a; const struct uid_gid_extent *e2 = b; if (e1->lower_first < e2->lower_first) return -1; if (e1->lower_first > e2->lower_first) return 1; return 0; } /** * sort_idmaps - Sorts an array of idmap entries. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static int sort_idmaps(struct uid_gid_map *map) { if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) return 0; /* Sort forward array. */ sort(map->forward, map->nr_extents, sizeof(struct uid_gid_extent), cmp_extents_forward, NULL); /* Only copy the memory from forward we actually need. */ map->reverse = kmemdup(map->forward, map->nr_extents * sizeof(struct uid_gid_extent), GFP_KERNEL); if (!map->reverse) return -ENOMEM; /* Sort reverse array. */ sort(map->reverse, map->nr_extents, sizeof(struct uid_gid_extent), cmp_extents_reverse, NULL); return 0; } /** * verify_root_map() - check the uid 0 mapping * @file: idmapping file * @map_ns: user namespace of the target process * @new_map: requested idmap * * If a process requests mapping parent uid 0 into the new ns, verify that the * process writing the map had the CAP_SETFCAP capability as the target process * will be able to write fscaps that are valid in ancestor user namespaces. * * Return: true if the mapping is allowed, false if not. */ static bool verify_root_map(const struct file *file, struct user_namespace *map_ns, struct uid_gid_map *new_map) { int idx; const struct user_namespace *file_ns = file->f_cred->user_ns; struct uid_gid_extent *extent0 = NULL; for (idx = 0; idx < new_map->nr_extents; idx++) { if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent0 = &new_map->extent[idx]; else extent0 = &new_map->forward[idx]; if (extent0->lower_first == 0) break; extent0 = NULL; } if (!extent0) return true; if (map_ns == file_ns) { /* The process unshared its ns and is writing to its own * /proc/self/uid_map. User already has full capabilites in * the new namespace. Verify that the parent had CAP_SETFCAP * when it unshared. * */ if (!file_ns->parent_could_setfcap) return false; } else { /* Process p1 is writing to uid_map of p2, who is in a child * user namespace to p1's. Verify that the opener of the map * file has CAP_SETFCAP against the parent of the new map * namespace */ if (!file_ns_capable(file, map_ns->parent, CAP_SETFCAP)) return false; } return true; } static ssize_t map_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos, int cap_setid, struct uid_gid_map *map, struct uid_gid_map *parent_map) { struct seq_file *seq = file->private_data; struct user_namespace *map_ns = seq->private; struct uid_gid_map new_map; unsigned idx; struct uid_gid_extent extent; char *kbuf = NULL, *pos, *next_line; ssize_t ret; /* Only allow < page size writes at the beginning of the file */ if ((*ppos != 0) || (count >= PAGE_SIZE)) return -EINVAL; /* Slurp in the user data */ kbuf = memdup_user_nul(buf, count); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); /* * The userns_state_mutex serializes all writes to any given map. * * Any map is only ever written once. * * An id map fits within 1 cache line on most architectures. * * On read nothing needs to be done unless you are on an * architecture with a crazy cache coherency model like alpha. * * There is a one time data dependency between reading the * count of the extents and the values of the extents. The * desired behavior is to see the values of the extents that * were written before the count of the extents. * * To achieve this smp_wmb() is used on guarantee the write * order and smp_rmb() is guaranteed that we don't have crazy * architectures returning stale data. */ mutex_lock(&userns_state_mutex); memset(&new_map, 0, sizeof(struct uid_gid_map)); ret = -EPERM; /* Only allow one successful write to the map */ if (map->nr_extents != 0) goto out; /* * Adjusting namespace settings requires capabilities on the target. */ if (cap_valid(cap_setid) && !file_ns_capable(file, map_ns, CAP_SYS_ADMIN)) goto out; /* Parse the user data */ ret = -EINVAL; pos = kbuf; for (; pos; pos = next_line) { /* Find the end of line and ensure I don't look past it */ next_line = strchr(pos, '\n'); if (next_line) { *next_line = '\0'; next_line++; if (*next_line == '\0') next_line = NULL; } pos = skip_spaces(pos); extent.first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); extent.lower_first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); extent.count = simple_strtoul(pos, &pos, 10); if (*pos && !isspace(*pos)) goto out; /* Verify there is not trailing junk on the line */ pos = skip_spaces(pos); if (*pos != '\0') goto out; /* Verify we have been given valid starting values */ if ((extent.first == (u32) -1) || (extent.lower_first == (u32) -1)) goto out; /* Verify count is not zero and does not cause the * extent to wrap */ if ((extent.first + extent.count) <= extent.first) goto out; if ((extent.lower_first + extent.count) <= extent.lower_first) goto out; /* Do the ranges in extent overlap any previous extents? */ if (mappings_overlap(&new_map, &extent)) goto out; if ((new_map.nr_extents + 1) == UID_GID_MAP_MAX_EXTENTS && (next_line != NULL)) goto out; ret = insert_extent(&new_map, &extent); if (ret < 0) goto out; ret = -EINVAL; } /* Be very certain the new map actually exists */ if (new_map.nr_extents == 0) goto out; ret = -EPERM; /* Validate the user is allowed to use user id's mapped to. */ if (!new_idmap_permitted(file, map_ns, cap_setid, &new_map)) goto out; ret = -EPERM; /* Map the lower ids from the parent user namespace to the * kernel global id space. */ for (idx = 0; idx < new_map.nr_extents; idx++) { struct uid_gid_extent *e; u32 lower_first; if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) e = &new_map.extent[idx]; else e = &new_map.forward[idx]; lower_first = map_id_range_down(parent_map, e->lower_first, e->count); /* Fail if we can not map the specified extent to * the kernel global id space. */ if (lower_first == (u32) -1) goto out; e->lower_first = lower_first; } /* * If we want to use binary search for lookup, this clones the extent * array and sorts both copies. */ ret = sort_idmaps(&new_map); if (ret < 0) goto out; /* Install the map */ if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { memcpy(map->extent, new_map.extent, new_map.nr_extents * sizeof(new_map.extent[0])); } else { map->forward = new_map.forward; map->reverse = new_map.reverse; } smp_wmb(); map->nr_extents = new_map.nr_extents; *ppos = count; ret = count; out: if (ret < 0 && new_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(new_map.forward); kfree(new_map.reverse); map->forward = NULL; map->reverse = NULL; map->nr_extents = 0; } mutex_unlock(&userns_state_mutex); kfree(kbuf); return ret; } ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; return map_write(file, buf, size, ppos, CAP_SETUID, &ns->uid_map, &ns->parent->uid_map); } ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; return map_write(file, buf, size, ppos, CAP_SETGID, &ns->gid_map, &ns->parent->gid_map); } ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; /* Anyone can set any valid project id no capability needed */ return map_write(file, buf, size, ppos, -1, &ns->projid_map, &ns->parent->projid_map); } static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { const struct cred *cred = file->f_cred; if (cap_setid == CAP_SETUID && !verify_root_map(file, ns, new_map)) return false; /* Don't allow mappings that would allow anything that wouldn't * be allowed without the establishment of unprivileged mappings. */ if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) && uid_eq(ns->owner, cred->euid)) { u32 id = new_map->extent[0].lower_first; if (cap_setid == CAP_SETUID) { kuid_t uid = make_kuid(ns->parent, id); if (uid_eq(uid, cred->euid)) return true; } else if (cap_setid == CAP_SETGID) { kgid_t gid = make_kgid(ns->parent, id); if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) && gid_eq(gid, cred->egid)) return true; } } /* Allow anyone to set a mapping that doesn't require privilege */ if (!cap_valid(cap_setid)) return true; /* Allow the specified ids if we have the appropriate capability * (CAP_SETUID or CAP_SETGID) over the parent user namespace. * And the opener of the id file also has the appropriate capability. */ if (ns_capable(ns->parent, cap_setid) && file_ns_capable(file, ns->parent, cap_setid)) return true; return false; } int proc_setgroups_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; unsigned long userns_flags = READ_ONCE(ns->flags); seq_printf(seq, "%s\n", (userns_flags & USERNS_SETGROUPS_ALLOWED) ? "allow" : "deny"); return 0; } ssize_t proc_setgroups_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; char kbuf[8], *pos; bool setgroups_allowed; ssize_t ret; /* Only allow a very narrow range of strings to be written */ ret = -EINVAL; if ((*ppos != 0) || (count >= sizeof(kbuf))) goto out; /* What was written? */ ret = -EFAULT; if (copy_from_user(kbuf, buf, count)) goto out; kbuf[count] = '\0'; pos = kbuf; /* What is being requested? */ ret = -EINVAL; if (strncmp(pos, "allow", 5) == 0) { pos += 5; setgroups_allowed = true; } else if (strncmp(pos, "deny", 4) == 0) { pos += 4; setgroups_allowed = false; } else goto out; /* Verify there is not trailing junk on the line */ pos = skip_spaces(pos); if (*pos != '\0') goto out; ret = -EPERM; mutex_lock(&userns_state_mutex); if (setgroups_allowed) { /* Enabling setgroups after setgroups has been disabled * is not allowed. */ if (!(ns->flags & USERNS_SETGROUPS_ALLOWED)) goto out_unlock; } else { /* Permanently disabling setgroups after setgroups has * been enabled by writing the gid_map is not allowed. */ if (ns->gid_map.nr_extents != 0) goto out_unlock; ns->flags &= ~USERNS_SETGROUPS_ALLOWED; } mutex_unlock(&userns_state_mutex); /* Report a successful write */ *ppos = count; ret = count; out: return ret; out_unlock: mutex_unlock(&userns_state_mutex); goto out; } bool userns_may_setgroups(const struct user_namespace *ns) { bool allowed; mutex_lock(&userns_state_mutex); /* It is not safe to use setgroups until a gid mapping in * the user namespace has been established. */ allowed = ns->gid_map.nr_extents != 0; /* Is setgroups allowed? */ allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED); mutex_unlock(&userns_state_mutex); return allowed; } /* * Returns true if @child is the same namespace or a descendant of * @ancestor. */ bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child) { const struct user_namespace *ns; for (ns = child; ns->level > ancestor->level; ns = ns->parent) ; return (ns == ancestor); } bool current_in_userns(const struct user_namespace *target_ns) { return in_userns(target_ns, current_user_ns()); } EXPORT_SYMBOL(current_in_userns); static inline struct user_namespace *to_user_ns(struct ns_common *ns) { return container_of(ns, struct user_namespace, ns); } static struct ns_common *userns_get(struct task_struct *task) { struct user_namespace *user_ns; rcu_read_lock(); user_ns = get_user_ns(__task_cred(task)->user_ns); rcu_read_unlock(); return user_ns ? &user_ns->ns : NULL; } static void userns_put(struct ns_common *ns) { put_user_ns(to_user_ns(ns)); } static int userns_install(struct nsset *nsset, struct ns_common *ns) { struct user_namespace *user_ns = to_user_ns(ns); struct cred *cred; /* Don't allow gaining capabilities by reentering * the same user namespace. */ if (user_ns == current_user_ns()) return -EINVAL; /* Tasks that share a thread group must share a user namespace */ if (!thread_group_empty(current)) return -EINVAL; if (current->fs->users != 1) return -EINVAL; if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; cred = nsset_cred(nsset); if (!cred) return -EINVAL; put_user_ns(cred->user_ns); set_cred_user_ns(cred, get_user_ns(user_ns)); if (set_cred_ucounts(cred) < 0) return -EINVAL; return 0; } struct ns_common *ns_get_owner(struct ns_common *ns) { struct user_namespace *my_user_ns = current_user_ns(); struct user_namespace *owner, *p; /* See if the owner is in the current user namespace */ owner = p = ns->ops->owner(ns); for (;;) { if (!p) return ERR_PTR(-EPERM); if (p == my_user_ns) break; p = p->parent; } return &get_user_ns(owner)->ns; } static struct user_namespace *userns_owner(struct ns_common *ns) { return to_user_ns(ns)->parent; } const struct proc_ns_operations userns_operations = { .name = "user", .type = CLONE_NEWUSER, .get = userns_get, .put = userns_put, .install = userns_install, .owner = userns_owner, .get_parent = ns_get_owner, }; static __init int user_namespaces_init(void) { user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT); return 0; } subsys_initcall(user_namespaces_init); |
| 62 53 53 52 52 52 63 1047 1047 1047 1047 1047 1047 1047 1047 1047 2692 406 406 406 406 2691 2692 2692 2691 2692 1047 1047 1047 1047 1047 63 1047 63 1047 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/swap.c * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds */ /* * This file contains the default values for the operation of the * Linux VM subsystem. Fine-tuning documentation can be found in * Documentation/admin-guide/sysctl/vm.rst. * Started 18.12.91 * Swap aging added 23.2.95, Stephen Tweedie. * Buffermem limits added 12.3.98, Rik van Riel. */ #include <linux/mm.h> #include <linux/sched.h> #include <linux/kernel_stat.h> #include <linux/swap.h> #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/pagevec.h> #include <linux/init.h> #include <linux/export.h> #include <linux/mm_inline.h> #include <linux/percpu_counter.h> #include <linux/memremap.h> #include <linux/percpu.h> #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/backing-dev.h> #include <linux/memcontrol.h> #include <linux/gfp.h> #include <linux/uio.h> #include <linux/hugetlb.h> #include <linux/page_idle.h> #include <linux/local_lock.h> #include <linux/buffer_head.h> #include "internal.h" #define CREATE_TRACE_POINTS #include <trace/events/pagemap.h> /* How many pages do we try to swap or page in/out together? As a power of 2 */ int page_cluster; const int page_cluster_max = 31; /* Protecting only lru_rotate.fbatch which requires disabling interrupts */ struct lru_rotate { local_lock_t lock; struct folio_batch fbatch; }; static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = { .lock = INIT_LOCAL_LOCK(lock), }; /* * The following folio batches are grouped together because they are protected * by disabling preemption (and interrupts remain enabled). */ struct cpu_fbatches { local_lock_t lock; struct folio_batch lru_add; struct folio_batch lru_deactivate_file; struct folio_batch lru_deactivate; struct folio_batch lru_lazyfree; #ifdef CONFIG_SMP struct folio_batch activate; #endif }; static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = { .lock = INIT_LOCAL_LOCK(lock), }; /* * This path almost never happens for VM activity - pages are normally freed * in batches. But it gets used by networking - and for compound pages. */ static void __page_cache_release(struct folio *folio) { if (folio_test_lru(folio)) { struct lruvec *lruvec; unsigned long flags; lruvec = folio_lruvec_lock_irqsave(folio, &flags); lruvec_del_folio(lruvec, folio); __folio_clear_lru_flags(folio); unlock_page_lruvec_irqrestore(lruvec, flags); } /* See comment on folio_test_mlocked in release_pages() */ if (unlikely(folio_test_mlocked(folio))) { long nr_pages = folio_nr_pages(folio); __folio_clear_mlocked(folio); zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages); count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages); } } static void __folio_put_small(struct folio *folio) { __page_cache_release(folio); mem_cgroup_uncharge(folio); free_unref_page(&folio->page, 0); } static void __folio_put_large(struct folio *folio) { /* * __page_cache_release() is supposed to be called for thp, not for * hugetlb. This is because hugetlb page does never have PageLRU set * (it's never listed to any LRU lists) and no memcg routines should * be called for hugetlb (it has a separate hugetlb_cgroup.) */ if (!folio_test_hugetlb(folio)) __page_cache_release(folio); destroy_large_folio(folio); } void __folio_put(struct folio *folio) { if (unlikely(folio_is_zone_device(folio))) free_zone_device_page(&folio->page); else if (unlikely(folio_test_large(folio))) __folio_put_large(folio); else __folio_put_small(folio); } EXPORT_SYMBOL(__folio_put); /** * put_pages_list() - release a list of pages * @pages: list of pages threaded on page->lru * * Release a list of pages which are strung together on page.lru. */ void put_pages_list(struct list_head *pages) { struct folio *folio, *next; list_for_each_entry_safe(folio, next, pages, lru) { if (!folio_put_testzero(folio)) { list_del(&folio->lru); continue; } if (folio_test_large(folio)) { list_del(&folio->lru); __folio_put_large(folio); continue; } /* LRU flag must be clear because it's passed using the lru */ } free_unref_page_list(pages); INIT_LIST_HEAD(pages); } EXPORT_SYMBOL(put_pages_list); typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio); static void lru_add_fn(struct lruvec *lruvec, struct folio *folio) { int was_unevictable = folio_test_clear_unevictable(folio); long nr_pages = folio_nr_pages(folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); /* * Is an smp_mb__after_atomic() still required here, before * folio_evictable() tests the mlocked flag, to rule out the possibility * of stranding an evictable folio on an unevictable LRU? I think * not, because __munlock_folio() only clears the mlocked flag * while the LRU lock is held. * * (That is not true of __page_cache_release(), and not necessarily * true of release_pages(): but those only clear the mlocked flag after * folio_put_testzero() has excluded any other users of the folio.) */ if (folio_evictable(folio)) { if (was_unevictable) __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); } else { folio_clear_active(folio); folio_set_unevictable(folio); /* * folio->mlock_count = !!folio_test_mlocked(folio)? * But that leaves __mlock_folio() in doubt whether another * actor has already counted the mlock or not. Err on the * safe side, underestimate, let page reclaim fix it, rather * than leaving a page on the unevictable LRU indefinitely. */ folio->mlock_count = 0; if (!was_unevictable) __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); } lruvec_add_folio(lruvec, folio); trace_mm_lru_insertion(folio); } static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) { int i; struct lruvec *lruvec = NULL; unsigned long flags = 0; for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; /* block memcg migration while the folio moves between lru */ if (move_fn != lru_add_fn && !folio_test_clear_lru(folio)) continue; lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags); move_fn(lruvec, folio); folio_set_lru(folio); } if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); folios_put(fbatch->folios, folio_batch_count(fbatch)); folio_batch_reinit(fbatch); } static void folio_batch_add_and_move(struct folio_batch *fbatch, struct folio *folio, move_fn_t move_fn) { if (folio_batch_add(fbatch, folio) && !folio_test_large(folio) && !lru_cache_disabled()) return; folio_batch_move_lru(fbatch, move_fn); } static void lru_move_tail_fn(struct lruvec *lruvec, struct folio *folio) { if (!folio_test_unevictable(folio)) { lruvec_del_folio(lruvec, folio); folio_clear_active(folio); lruvec_add_folio_tail(lruvec, folio); __count_vm_events(PGROTATED, folio_nr_pages(folio)); } } /* * Writeback is about to end against a folio which has been marked for * immediate reclaim. If it still appears to be reclaimable, move it * to the tail of the inactive list. * * folio_rotate_reclaimable() must disable IRQs, to prevent nasty races. */ void folio_rotate_reclaimable(struct folio *folio) { if (!folio_test_locked(folio) && !folio_test_dirty(folio) && !folio_test_unevictable(folio) && folio_test_lru(folio)) { struct folio_batch *fbatch; unsigned long flags; folio_get(folio); local_lock_irqsave(&lru_rotate.lock, flags); fbatch = this_cpu_ptr(&lru_rotate.fbatch); folio_batch_add_and_move(fbatch, folio, lru_move_tail_fn); local_unlock_irqrestore(&lru_rotate.lock, flags); } } void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_io, unsigned int nr_rotated) { unsigned long cost; /* * Reflect the relative cost of incurring IO and spending CPU * time on rotations. This doesn't attempt to make a precise * comparison, it just says: if reloads are about comparable * between the LRU lists, or rotations are overwhelmingly * different between them, adjust scan balance for CPU work. */ cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated; do { unsigned long lrusize; /* * Hold lruvec->lru_lock is safe here, since * 1) The pinned lruvec in reclaim, or * 2) From a pre-LRU page during refault (which also holds the * rcu lock, so would be safe even if the page was on the LRU * and could move simultaneously to a new lruvec). */ spin_lock_irq(&lruvec->lru_lock); /* Record cost event */ if (file) lruvec->file_cost += cost; else lruvec->anon_cost += cost; /* * Decay previous events * * Because workloads change over time (and to avoid * overflow) we keep these statistics as a floating * average, which ends up weighing recent refaults * more than old ones. */ lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) + lruvec_page_state(lruvec, NR_ACTIVE_ANON) + lruvec_page_state(lruvec, NR_INACTIVE_FILE) + lruvec_page_state(lruvec, NR_ACTIVE_FILE); if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) { lruvec->file_cost /= 2; lruvec->anon_cost /= 2; } spin_unlock_irq(&lruvec->lru_lock); } while ((lruvec = parent_lruvec(lruvec))); } void lru_note_cost_refault(struct folio *folio) { lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio), folio_nr_pages(folio), 0); } static void folio_activate_fn(struct lruvec *lruvec, struct folio *folio) { if (!folio_test_active(folio) && !folio_test_unevictable(folio)) { long nr_pages = folio_nr_pages(folio); lruvec_del_folio(lruvec, folio); folio_set_active(folio); lruvec_add_folio(lruvec, folio); trace_mm_lru_activate(folio); __count_vm_events(PGACTIVATE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr_pages); } } #ifdef CONFIG_SMP static void folio_activate_drain(int cpu) { struct folio_batch *fbatch = &per_cpu(cpu_fbatches.activate, cpu); if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, folio_activate_fn); } void folio_activate(struct folio *folio) { if (folio_test_lru(folio) && !folio_test_active(folio) && !folio_test_unevictable(folio)) { struct folio_batch *fbatch; folio_get(folio); local_lock(&cpu_fbatches.lock); fbatch = this_cpu_ptr(&cpu_fbatches.activate); folio_batch_add_and_move(fbatch, folio, folio_activate_fn); local_unlock(&cpu_fbatches.lock); } } #else static inline void folio_activate_drain(int cpu) { } void folio_activate(struct folio *folio) { struct lruvec *lruvec; if (folio_test_clear_lru(folio)) { lruvec = folio_lruvec_lock_irq(folio); folio_activate_fn(lruvec, folio); unlock_page_lruvec_irq(lruvec); folio_set_lru(folio); } } #endif static void __lru_cache_activate_folio(struct folio *folio) { struct folio_batch *fbatch; int i; local_lock(&cpu_fbatches.lock); fbatch = this_cpu_ptr(&cpu_fbatches.lru_add); /* * Search backwards on the optimistic assumption that the folio being * activated has just been added to this batch. Note that only * the local batch is examined as a !LRU folio could be in the * process of being released, reclaimed, migrated or on a remote * batch that is currently being drained. Furthermore, marking * a remote batch's folio active potentially hits a race where * a folio is marked active just after it is added to the inactive * list causing accounting errors and BUG_ON checks to trigger. */ for (i = folio_batch_count(fbatch) - 1; i >= 0; i--) { struct folio *batch_folio = fbatch->folios[i]; if (batch_folio == folio) { folio_set_active(folio); break; } } local_unlock(&cpu_fbatches.lock); } #ifdef CONFIG_LRU_GEN static void folio_inc_refs(struct folio *folio) { unsigned long new_flags, old_flags = READ_ONCE(folio->flags); if (folio_test_unevictable(folio)) return; if (!folio_test_referenced(folio)) { folio_set_referenced(folio); return; } if (!folio_test_workingset(folio)) { folio_set_workingset(folio); return; } /* see the comment on MAX_NR_TIERS */ do { new_flags = old_flags & LRU_REFS_MASK; if (new_flags == LRU_REFS_MASK) break; new_flags += BIT(LRU_REFS_PGOFF); new_flags |= old_flags & ~LRU_REFS_MASK; } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); } #else static void folio_inc_refs(struct folio *folio) { } #endif /* CONFIG_LRU_GEN */ /* * Mark a page as having seen activity. * * inactive,unreferenced -> inactive,referenced * inactive,referenced -> active,unreferenced * active,unreferenced -> active,referenced * * When a newly allocated page is not yet visible, so safe for non-atomic ops, * __SetPageReferenced(page) may be substituted for mark_page_accessed(page). */ void folio_mark_accessed(struct folio *folio) { if (lru_gen_enabled()) { folio_inc_refs(folio); return; } if (!folio_test_referenced(folio)) { folio_set_referenced(folio); } else if (folio_test_unevictable(folio)) { /* * Unevictable pages are on the "LRU_UNEVICTABLE" list. But, * this list is never rotated or maintained, so marking an * unevictable page accessed has no effect. */ } else if (!folio_test_active(folio)) { /* * If the folio is on the LRU, queue it for activation via * cpu_fbatches.activate. Otherwise, assume the folio is in a * folio_batch, mark it active and it'll be moved to the active * LRU on the next drain. */ if (folio_test_lru(folio)) folio_activate(folio); else __lru_cache_activate_folio(folio); folio_clear_referenced(folio); workingset_activation(folio); } if (folio_test_idle(folio)) folio_clear_idle(folio); } EXPORT_SYMBOL(folio_mark_accessed); /** * folio_add_lru - Add a folio to an LRU list. * @folio: The folio to be added to the LRU. * * Queue the folio for addition to the LRU. The decision on whether * to add the page to the [in]active [file|anon] list is deferred until the * folio_batch is drained. This gives a chance for the caller of folio_add_lru() * have the folio added to the active list using folio_mark_accessed(). */ void folio_add_lru(struct folio *folio) { struct folio_batch *fbatch; VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); /* see the comment in lru_gen_add_folio() */ if (lru_gen_enabled() && !folio_test_unevictable(folio) && lru_gen_in_fault() && !(current->flags & PF_MEMALLOC)) folio_set_active(folio); folio_get(folio); local_lock(&cpu_fbatches.lock); fbatch = this_cpu_ptr(&cpu_fbatches.lru_add); folio_batch_add_and_move(fbatch, folio, lru_add_fn); local_unlock(&cpu_fbatches.lock); } EXPORT_SYMBOL(folio_add_lru); /** * folio_add_lru_vma() - Add a folio to the appropate LRU list for this VMA. * @folio: The folio to be added to the LRU. * @vma: VMA in which the folio is mapped. * * If the VMA is mlocked, @folio is added to the unevictable list. * Otherwise, it is treated the same way as folio_add_lru(). */ void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma) { VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED)) mlock_new_folio(folio); else folio_add_lru(folio); } /* * If the folio cannot be invalidated, it is moved to the * inactive list to speed up its reclaim. It is moved to the * head of the list, rather than the tail, to give the flusher * threads some time to write it out, as this is much more * effective than the single-page writeout from reclaim. * * If the folio isn't mapped and dirty/writeback, the folio * could be reclaimed asap using the reclaim flag. * * 1. active, mapped folio -> none * 2. active, dirty/writeback folio -> inactive, head, reclaim * 3. inactive, mapped folio -> none * 4. inactive, dirty/writeback folio -> inactive, head, reclaim * 5. inactive, clean -> inactive, tail * 6. Others -> none * * In 4, it moves to the head of the inactive list so the folio is * written out by flusher threads as this is much more efficient * than the single-page writeout from reclaim. */ static void lru_deactivate_file_fn(struct lruvec *lruvec, struct folio *folio) { bool active = folio_test_active(folio); long nr_pages = folio_nr_pages(folio); if (folio_test_unevictable(folio)) return; /* Some processes are using the folio */ if (folio_mapped(folio)) return; lruvec_del_folio(lruvec, folio); folio_clear_active(folio); folio_clear_referenced(folio); if (folio_test_writeback(folio) || folio_test_dirty(folio)) { /* * Setting the reclaim flag could race with * folio_end_writeback() and confuse readahead. But the * race window is _really_ small and it's not a critical * problem. */ lruvec_add_folio(lruvec, folio); folio_set_reclaim(folio); } else { /* * The folio's writeback ended while it was in the batch. * We move that folio to the tail of the inactive list. */ lruvec_add_folio_tail(lruvec, folio); __count_vm_events(PGROTATED, nr_pages); } if (active) { __count_vm_events(PGDEACTIVATE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); } } static void lru_deactivate_fn(struct lruvec *lruvec, struct folio *folio) { if (!folio_test_unevictable(folio) && (folio_test_active(folio) || lru_gen_enabled())) { long nr_pages = folio_nr_pages(folio); lruvec_del_folio(lruvec, folio); folio_clear_active(folio); folio_clear_referenced(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(PGDEACTIVATE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); } } static void lru_lazyfree_fn(struct lruvec *lruvec, struct folio *folio) { if (folio_test_anon(folio) && folio_test_swapbacked(folio) && !folio_test_swapcache(folio) && !folio_test_unevictable(folio)) { long nr_pages = folio_nr_pages(folio); lruvec_del_folio(lruvec, folio); folio_clear_active(folio); folio_clear_referenced(folio); /* * Lazyfree folios are clean anonymous folios. They have * the swapbacked flag cleared, to distinguish them from normal * anonymous folios */ folio_clear_swapbacked(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(PGLAZYFREE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages); } } /* * Drain pages out of the cpu's folio_batch. * Either "cpu" is the current CPU, and preemption has already been * disabled; or "cpu" is being hot-unplugged, and is already dead. */ void lru_add_drain_cpu(int cpu) { struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu); struct folio_batch *fbatch = &fbatches->lru_add; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_add_fn); fbatch = &per_cpu(lru_rotate.fbatch, cpu); /* Disabling interrupts below acts as a compiler barrier. */ if (data_race(folio_batch_count(fbatch))) { unsigned long flags; /* No harm done if a racing interrupt already did this */ local_lock_irqsave(&lru_rotate.lock, flags); folio_batch_move_lru(fbatch, lru_move_tail_fn); local_unlock_irqrestore(&lru_rotate.lock, flags); } fbatch = &fbatches->lru_deactivate_file; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_deactivate_file_fn); fbatch = &fbatches->lru_deactivate; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_deactivate_fn); fbatch = &fbatches->lru_lazyfree; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_lazyfree_fn); folio_activate_drain(cpu); } /** * deactivate_file_folio() - Deactivate a file folio. * @folio: Folio to deactivate. * * This function hints to the VM that @folio is a good reclaim candidate, * for example if its invalidation fails due to the folio being dirty * or under writeback. * * Context: Caller holds a reference on the folio. */ void deactivate_file_folio(struct folio *folio) { struct folio_batch *fbatch; /* Deactivating an unevictable folio will not accelerate reclaim */ if (folio_test_unevictable(folio)) return; folio_get(folio); local_lock(&cpu_fbatches.lock); fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate_file); folio_batch_add_and_move(fbatch, folio, lru_deactivate_file_fn); local_unlock(&cpu_fbatches.lock); } /* * folio_deactivate - deactivate a folio * @folio: folio to deactivate * * folio_deactivate() moves @folio to the inactive list if @folio was on the * active list and was not unevictable. This is done to accelerate the * reclaim of @folio. */ void folio_deactivate(struct folio *folio) { if (folio_test_lru(folio) && !folio_test_unevictable(folio) && (folio_test_active(folio) || lru_gen_enabled())) { struct folio_batch *fbatch; folio_get(folio); local_lock(&cpu_fbatches.lock); fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate); folio_batch_add_and_move(fbatch, folio, lru_deactivate_fn); local_unlock(&cpu_fbatches.lock); } } /** * folio_mark_lazyfree - make an anon folio lazyfree * @folio: folio to deactivate * * folio_mark_lazyfree() moves @folio to the inactive file list. * This is done to accelerate the reclaim of @folio. */ void folio_mark_lazyfree(struct folio *folio) { if (folio_test_lru(folio) && folio_test_anon(folio) && folio_test_swapbacked(folio) && !folio_test_swapcache(folio) && !folio_test_unevictable(folio)) { struct folio_batch *fbatch; folio_get(folio); local_lock(&cpu_fbatches.lock); fbatch = this_cpu_ptr(&cpu_fbatches.lru_lazyfree); folio_batch_add_and_move(fbatch, folio, lru_lazyfree_fn); local_unlock(&cpu_fbatches.lock); } } void lru_add_drain(void) { local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); local_unlock(&cpu_fbatches.lock); mlock_drain_local(); } /* * It's called from per-cpu workqueue context in SMP case so * lru_add_drain_cpu and invalidate_bh_lrus_cpu should run on * the same cpu. It shouldn't be a problem in !SMP case since * the core is only one and the locks will disable preemption. */ static void lru_add_and_bh_lrus_drain(void) { local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); local_unlock(&cpu_fbatches.lock); invalidate_bh_lrus_cpu(); mlock_drain_local(); } void lru_add_drain_cpu_zone(struct zone *zone) { local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); drain_local_pages(zone); local_unlock(&cpu_fbatches.lock); mlock_drain_local(); } #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); static void lru_add_drain_per_cpu(struct work_struct *dummy) { lru_add_and_bh_lrus_drain(); } static bool cpu_needs_drain(unsigned int cpu) { struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu); /* Check these in order of likelihood that they're not zero */ return folio_batch_count(&fbatches->lru_add) || data_race(folio_batch_count(&per_cpu(lru_rotate.fbatch, cpu))) || folio_batch_count(&fbatches->lru_deactivate_file) || folio_batch_count(&fbatches->lru_deactivate) || folio_batch_count(&fbatches->lru_lazyfree) || folio_batch_count(&fbatches->activate) || need_mlock_drain(cpu) || has_bh_in_lru(cpu, NULL); } /* * Doesn't need any cpu hotplug locking because we do rely on per-cpu * kworkers being shut down before our page_alloc_cpu_dead callback is * executed on the offlined cpu. * Calling this function with cpu hotplug locks held can actually lead * to obscure indirect dependencies via WQ context. */ static inline void __lru_add_drain_all(bool force_all_cpus) { /* * lru_drain_gen - Global pages generation number * * (A) Definition: global lru_drain_gen = x implies that all generations * 0 < n <= x are already *scheduled* for draining. * * This is an optimization for the highly-contended use case where a * user space workload keeps constantly generating a flow of pages for * each CPU. */ static unsigned int lru_drain_gen; static struct cpumask has_work; static DEFINE_MUTEX(lock); unsigned cpu, this_gen; /* * Make sure nobody triggers this path before mm_percpu_wq is fully * initialized. */ if (WARN_ON(!mm_percpu_wq)) return; /* * Guarantee folio_batch counter stores visible by this CPU * are visible to other CPUs before loading the current drain * generation. */ smp_mb(); /* * (B) Locally cache global LRU draining generation number * * The read barrier ensures that the counter is loaded before the mutex * is taken. It pairs with smp_mb() inside the mutex critical section * at (D). */ this_gen = smp_load_acquire(&lru_drain_gen); mutex_lock(&lock); /* * (C) Exit the draining operation if a newer generation, from another * lru_add_drain_all(), was already scheduled for draining. Check (A). */ if (unlikely(this_gen != lru_drain_gen && !force_all_cpus)) goto done; /* * (D) Increment global generation number * * Pairs with smp_load_acquire() at (B), outside of the critical * section. Use a full memory barrier to guarantee that the * new global drain generation number is stored before loading * folio_batch counters. * * This pairing must be done here, before the for_each_online_cpu loop * below which drains the page vectors. * * Let x, y, and z represent some system CPU numbers, where x < y < z. * Assume CPU #z is in the middle of the for_each_online_cpu loop * below and has already reached CPU #y's per-cpu data. CPU #x comes * along, adds some pages to its per-cpu vectors, then calls * lru_add_drain_all(). * * If the paired barrier is done at any later step, e.g. after the * loop, CPU #x will just exit at (C) and miss flushing out all of its * added pages. */ WRITE_ONCE(lru_drain_gen, lru_drain_gen + 1); smp_mb(); cpumask_clear(&has_work); for_each_online_cpu(cpu) { struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); if (cpu_needs_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); queue_work_on(cpu, mm_percpu_wq, work); __cpumask_set_cpu(cpu, &has_work); } } for_each_cpu(cpu, &has_work) flush_work(&per_cpu(lru_add_drain_work, cpu)); done: mutex_unlock(&lock); } void lru_add_drain_all(void) { __lru_add_drain_all(false); } #else void lru_add_drain_all(void) { lru_add_drain(); } #endif /* CONFIG_SMP */ atomic_t lru_disable_count = ATOMIC_INIT(0); /* * lru_cache_disable() needs to be called before we start compiling * a list of pages to be migrated using isolate_lru_page(). * It drains pages on LRU cache and then disable on all cpus until * lru_cache_enable is called. * * Must be paired with a call to lru_cache_enable(). */ void lru_cache_disable(void) { atomic_inc(&lru_disable_count); /* * Readers of lru_disable_count are protected by either disabling * preemption or rcu_read_lock: * * preempt_disable, local_irq_disable [bh_lru_lock()] * rcu_read_lock [rt_spin_lock CONFIG_PREEMPT_RT] * preempt_disable [local_lock !CONFIG_PREEMPT_RT] * * Since v5.1 kernel, synchronize_rcu() is guaranteed to wait on * preempt_disable() regions of code. So any CPU which sees * lru_disable_count = 0 will have exited the critical * section when synchronize_rcu() returns. */ synchronize_rcu_expedited(); #ifdef CONFIG_SMP __lru_add_drain_all(true); #else lru_add_and_bh_lrus_drain(); #endif } /** * release_pages - batched put_page() * @arg: array of pages to release * @nr: number of pages * * Decrement the reference count on all the pages in @arg. If it * fell to zero, remove the page from the LRU and free it. * * Note that the argument can be an array of pages, encoded pages, * or folio pointers. We ignore any encoded bits, and turn any of * them into just a folio that gets free'd. */ void release_pages(release_pages_arg arg, int nr) { int i; struct encoded_page **encoded = arg.encoded_pages; LIST_HEAD(pages_to_free); struct lruvec *lruvec = NULL; unsigned long flags = 0; unsigned int lock_batch; for (i = 0; i < nr; i++) { struct folio *folio; /* Turn any of the argument types into a folio */ folio = page_folio(encoded_page_ptr(encoded[i])); /* * Make sure the IRQ-safe lock-holding time does not get * excessive with a continuous string of pages from the * same lruvec. The lock is held only if lruvec != NULL. */ if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) { unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } if (is_huge_zero_page(&folio->page)) continue; if (folio_is_zone_device(folio)) { if (lruvec) { unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } if (put_devmap_managed_page(&folio->page)) continue; if (folio_put_testzero(folio)) free_zone_device_page(&folio->page); continue; } if (!folio_put_testzero(folio)) continue; if (folio_test_large(folio)) { if (lruvec) { unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } __folio_put_large(folio); continue; } if (folio_test_lru(folio)) { struct lruvec *prev_lruvec = lruvec; lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags); if (prev_lruvec != lruvec) lock_batch = 0; lruvec_del_folio(lruvec, folio); __folio_clear_lru_flags(folio); } /* * In rare cases, when truncation or holepunching raced with * munlock after VM_LOCKED was cleared, Mlocked may still be * found set here. This does not indicate a problem, unless * "unevictable_pgs_cleared" appears worryingly large. */ if (unlikely(folio_test_mlocked(folio))) { __folio_clear_mlocked(folio); zone_stat_sub_folio(folio, NR_MLOCK); count_vm_event(UNEVICTABLE_PGCLEARED); } list_add(&folio->lru, &pages_to_free); } if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); mem_cgroup_uncharge_list(&pages_to_free); free_unref_page_list(&pages_to_free); } EXPORT_SYMBOL(release_pages); /* * The folios which we're about to release may be in the deferred lru-addition * queues. That would prevent them from really being freed right now. That's * OK from a correctness point of view but is inefficient - those folios may be * cache-warm and we want to give them back to the page allocator ASAP. * * So __folio_batch_release() will drain those queues here. * folio_batch_move_lru() calls folios_put() directly to avoid * mutual recursion. */ void __folio_batch_release(struct folio_batch *fbatch) { if (!fbatch->percpu_pvec_drained) { lru_add_drain(); fbatch->percpu_pvec_drained = true; } release_pages(fbatch->folios, folio_batch_count(fbatch)); folio_batch_reinit(fbatch); } EXPORT_SYMBOL(__folio_batch_release); /** * folio_batch_remove_exceptionals() - Prune non-folios from a batch. * @fbatch: The batch to prune * * find_get_entries() fills a batch with both folios and shadow/swap/DAX * entries. This function prunes all the non-folio entries from @fbatch * without leaving holes, so that it can be passed on to folio-only batch * operations. */ void folio_batch_remove_exceptionals(struct folio_batch *fbatch) { unsigned int i, j; for (i = 0, j = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; if (!xa_is_value(folio)) fbatch->folios[j++] = folio; } fbatch->nr = j; } /* * Perform any setup for the swap system */ void __init swap_setup(void) { unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT); /* Use a smaller cluster for small-memory machines */ if (megs < 16) page_cluster = 2; else page_cluster = 3; /* * Right now other parts of the system means that we * _really_ don't want to cluster much more */ } |
| 3708 3709 113 3698 2 2 1 3695 1 1 1 3695 3674 61 61 60 1 1 61 3696 3696 3696 2 3696 3694 3875 3874 181 14 3874 170 40 130 141 3875 1544 709 3873 2931 3874 2931 3875 3875 3874 3601 3601 3693 3696 3695 106 3695 3696 3694 3694 3695 3695 1 1 1 3695 3695 3694 3694 3694 268 3695 806 3669 3670 3670 3670 113 113 113 1 112 112 112 1 1 3720 22 3515 3089 3089 3089 3516 3516 3516 373 3386 3335 3335 3384 3386 3386 3386 3386 3386 3335 105 3386 3386 296 296 296 555 555 362 442 442 428 427 428 428 428 427 11 427 427 16 428 428 428 428 866 864 3441 157 157 157 157 157 3360 3384 3384 3384 3384 3383 3384 45 3384 3547 3531 3384 1 278 277 277 278 179 179 541 3540 3538 498 3531 498 112 112 103 707 378 397 9 9 9 981 982 981 982 10 3694 268 712 555 555 555 555 555 555 289 528 166 166 555 179 555 464 439 47 321 464 555 106 106 555 3442 556 555 3442 3383 3547 3483 1 816 277 3539 555 528 289 289 21 21 448 21 21 21 21 448 3670 446 446 3460 3458 445 3459 3673 3799 3966 157 157 157 3693 3686 3695 3696 3615 3695 3322 3693 2 3694 3694 3694 3695 21 21 3685 3695 3665 3664 3693 3693 493 3665 3664 493 3663 2 2 3695 3695 3695 2 3696 4 3696 3695 3694 3696 3696 3322 3322 494 448 448 54 21 21 21 468 243 1 242 247 247 172 172 18 162 247 106 106 1 106 106 61 541 541 541 541 541 541 541 541 541 541 541 157 157 157 157 106 17 302 2985 2985 3592 3592 68 58 2985 2984 913 3590 3589 3593 124 124 124 301 301 300 868 868 867 741 176 3 3 867 745 362 745 301 87 301 301 1 744 741 740 740 285 285 283 357 8 3613 2 2 3611 3333 1 3334 3334 456 458 424 458 867 571 456 456 868 861 8 283 570 861 283 3419 3419 120 3598 3407 3598 3597 454 449 3593 3590 283 3393 130 3591 3587 3586 3373 3373 424 424 129 7 6 2 2 7 2 2 7 7 6 135 61 78 3661 3660 3655 3616 3616 3429 3570 194 3387 3659 594 3661 1 3659 3660 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/namei.c * * Copyright (C) 1991, 1992 Linus Torvalds */ /* * Some corrections by tytso. */ /* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname * lookup logic. */ /* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture. */ #include <linux/init.h> #include <linux/export.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/fs.h> #include <linux/filelock.h> #include <linux/namei.h> #include <linux/pagemap.h> #include <linux/sched/mm.h> #include <linux/fsnotify.h> #include <linux/personality.h> #include <linux/security.h> #include <linux/ima.h> #include <linux/syscalls.h> #include <linux/mount.h> #include <linux/audit.h> #include <linux/capability.h> #include <linux/file.h> #include <linux/fcntl.h> #include <linux/device_cgroup.h> #include <linux/fs_struct.h> #include <linux/posix_acl.h> #include <linux/hash.h> #include <linux/bitops.h> #include <linux/init_task.h> #include <linux/uaccess.h> #include "internal.h" #include "mount.h" /* [Feb-1997 T. Schoebel-Theuer] * Fundamental changes in the pathname lookup mechanisms (namei) * were necessary because of omirr. The reason is that omirr needs * to know the _real_ pathname, not the user-supplied one, in case * of symlinks (and also when transname replacements occur). * * The new code replaces the old recursive symlink resolution with * an iterative one (in case of non-nested symlink chains). It does * this with calls to <fs>_follow_link(). * As a side effect, dir_namei(), _namei() and follow_link() are now * replaced with a single function lookup_dentry() that can handle all * the special cases of the former code. * * With the new dcache, the pathname is stored at each inode, at least as * long as the refcount of the inode is positive. As a side effect, the * size of the dcache depends on the inode cache and thus is dynamic. * * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink * resolution to correspond with current state of the code. * * Note that the symlink resolution is not *completely* iterative. * There is still a significant amount of tail- and mid- recursion in * the algorithm. Also, note that <fs>_readlink() is not used in * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink() * may return different results than <fs>_follow_link(). Many virtual * filesystems (including /proc) exhibit this behavior. */ /* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation: * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL * and the name already exists in form of a symlink, try to create the new * name indicated by the symlink. The old code always complained that the * name already exists, due to not following the symlink even if its target * is nonexistent. The new semantics affects also mknod() and link() when * the name is a symlink pointing to a non-existent name. * * I don't know which semantics is the right one, since I have no access * to standards. But I found by trial that HP-UX 9.0 has the full "new" * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the * "old" one. Personally, I think the new semantics is much more logical. * Note that "ln old new" where "new" is a symlink pointing to a non-existing * file does succeed in both HP-UX and SunOs, but not in Solaris * and in the old Linux semantics. */ /* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink * semantics. See the comments in "open_namei" and "do_link" below. * * [10-Sep-98 Alan Modra] Another symlink change. */ /* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks: * inside the path - always follow. * in the last component in creation/removal/renaming - never follow. * if LOOKUP_FOLLOW passed - follow. * if the pathname has trailing slashes - follow. * otherwise - don't follow. * (applied in that order). * * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT * restored for 2.4. This is the last surviving part of old 4.2BSD bug. * During the 2.4 we need to fix the userland stuff depending on it - * hopefully we will be able to get rid of that wart in 2.5. So far only * XEmacs seems to be relying on it... */ /* * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland) * implemented. Let's see if raised priority of ->s_vfs_rename_mutex gives * any extra contention... */ /* In order to reduce some races, while at the same time doing additional * checking and hopefully speeding things up, we copy filenames to the * kernel data space before using them.. * * POSIX.1 2.4: an empty pathname is invalid (ENOENT). * PATH_MAX includes the nul terminator --RR. */ #define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname)) struct filename * getname_flags(const char __user *filename, int flags, int *empty) { struct filename *result; char *kname; int len; result = audit_reusename(filename); if (result) return result; result = __getname(); if (unlikely(!result)) return ERR_PTR(-ENOMEM); /* * First, try to embed the struct filename inside the names_cache * allocation */ kname = (char *)result->iname; result->name = kname; len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX); if (unlikely(len < 0)) { __putname(result); return ERR_PTR(len); } /* * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a * separate struct filename so we can dedicate the entire * names_cache allocation for the pathname, and re-do the copy from * userland. */ if (unlikely(len == EMBEDDED_NAME_MAX)) { const size_t size = offsetof(struct filename, iname[1]); kname = (char *)result; /* * size is chosen that way we to guarantee that * result->iname[0] is within the same object and that * kname can't be equal to result->iname, no matter what. */ result = kzalloc(size, GFP_KERNEL); if (unlikely(!result)) { __putname(kname); return ERR_PTR(-ENOMEM); } result->name = kname; len = strncpy_from_user(kname, filename, PATH_MAX); if (unlikely(len < 0)) { __putname(kname); kfree(result); return ERR_PTR(len); } if (unlikely(len == PATH_MAX)) { __putname(kname); kfree(result); return ERR_PTR(-ENAMETOOLONG); } } result->refcnt = 1; /* The empty path is special. */ if (unlikely(!len)) { if (empty) *empty = 1; if (!(flags & LOOKUP_EMPTY)) { putname(result); return ERR_PTR(-ENOENT); } } result->uptr = filename; result->aname = NULL; audit_getname(result); return result; } struct filename * getname_uflags(const char __user *filename, int uflags) { int flags = (uflags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0; return getname_flags(filename, flags, NULL); } struct filename * getname(const char __user * filename) { return getname_flags(filename, 0, NULL); } struct filename * getname_kernel(const char * filename) { struct filename *result; int len = strlen(filename) + 1; result = __getname(); if (unlikely(!result)) return ERR_PTR(-ENOMEM); if (len <= EMBEDDED_NAME_MAX) { result->name = (char *)result->iname; } else if (len <= PATH_MAX) { const size_t size = offsetof(struct filename, iname[1]); struct filename *tmp; tmp = kmalloc(size, GFP_KERNEL); if (unlikely(!tmp)) { __putname(result); return ERR_PTR(-ENOMEM); } tmp->name = (char *)result; result = tmp; } else { __putname(result); return ERR_PTR(-ENAMETOOLONG); } memcpy((char *)result->name, filename, len); result->uptr = NULL; result->aname = NULL; result->refcnt = 1; audit_getname(result); return result; } EXPORT_SYMBOL(getname_kernel); void putname(struct filename *name) { if (IS_ERR(name)) return; BUG_ON(name->refcnt <= 0); if (--name->refcnt > 0) return; if (name->name != name->iname) { __putname(name->name); kfree(name); } else __putname(name); } EXPORT_SYMBOL(putname); /** * check_acl - perform ACL permission checking * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * * This function performs the ACL permission checking. Since this function * retrieve POSIX acls it needs to know whether it is called from a blocking or * non-blocking context and thus cares about the MAY_NOT_BLOCK bit. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply passs @nop_mnt_idmap. */ static int check_acl(struct mnt_idmap *idmap, struct inode *inode, int mask) { #ifdef CONFIG_FS_POSIX_ACL struct posix_acl *acl; if (mask & MAY_NOT_BLOCK) { acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS); if (!acl) return -EAGAIN; /* no ->get_inode_acl() calls in RCU mode... */ if (is_uncached_acl(acl)) return -ECHILD; return posix_acl_permission(idmap, inode, acl, mask); } acl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { int error = posix_acl_permission(idmap, inode, acl, mask); posix_acl_release(acl); return error; } #endif return -EAGAIN; } /** * acl_permission_check - perform basic UNIX permission checking * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * * This function performs the basic UNIX permission checking. Since this * function may retrieve POSIX acls it needs to know whether it is called from a * blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply passs @nop_mnt_idmap. */ static int acl_permission_check(struct mnt_idmap *idmap, struct inode *inode, int mask) { unsigned int mode = inode->i_mode; vfsuid_t vfsuid; /* Are we the owner? If so, ACL's don't matter */ vfsuid = i_uid_into_vfsuid(idmap, inode); if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) { mask &= 7; mode >>= 6; return (mask & ~mode) ? -EACCES : 0; } /* Do we have ACL's? */ if (IS_POSIXACL(inode) && (mode & S_IRWXG)) { int error = check_acl(idmap, inode, mask); if (error != -EAGAIN) return error; } /* Only RWX matters for group/other mode bits */ mask &= 7; /* * Are the group permissions different from * the other permissions in the bits we care * about? Need to check group ownership if so. */ if (mask & (mode ^ (mode >> 3))) { vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); if (vfsgid_in_group_p(vfsgid)) mode >>= 3; } /* Bits in 'mode' clear that we require? */ return (mask & ~mode) ? -EACCES : 0; } /** * generic_permission - check for access rights on a Posix-like filesystem * @idmap: idmap of the mount the inode was found from * @inode: inode to check access rights for * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, * %MAY_NOT_BLOCK ...) * * Used to check for read/write/execute permissions on a file. * We use "fsuid" for this, letting us set arbitrary permissions * for filesystem access without changing the "normal" uids which * are used for other things. * * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk * request cannot be satisfied (eg. requires blocking or too much complexity). * It would then be called again in ref-walk mode. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply passs @nop_mnt_idmap. */ int generic_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int ret; /* * Do the basic permission checks. */ ret = acl_permission_check(idmap, inode, mask); if (ret != -EACCES) return ret; if (S_ISDIR(inode->i_mode)) { /* DACs are overridable for directories */ if (!(mask & MAY_WRITE)) if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_READ_SEARCH)) return 0; if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_OVERRIDE)) return 0; return -EACCES; } /* * Searching includes executable on directories, else just read. */ mask &= MAY_READ | MAY_WRITE | MAY_EXEC; if (mask == MAY_READ) if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_READ_SEARCH)) return 0; /* * Read/write DACs are always overridable. * Executable DACs are overridable when there is * at least one exec bit set. */ if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_OVERRIDE)) return 0; return -EACCES; } EXPORT_SYMBOL(generic_permission); /** * do_inode_permission - UNIX permission checking * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * * We _really_ want to just do "generic_permission()" without * even looking at the inode->i_op values. So we keep a cache * flag in inode->i_opflags, that says "this has not special * permission function, use the fast case". */ static inline int do_inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) { if (likely(inode->i_op->permission)) return inode->i_op->permission(idmap, inode, mask); /* This gets set once for the inode lifetime */ spin_lock(&inode->i_lock); inode->i_opflags |= IOP_FASTPERM; spin_unlock(&inode->i_lock); } return generic_permission(idmap, inode, mask); } /** * sb_permission - Check superblock-level permissions * @sb: Superblock of inode to check permission on * @inode: Inode to check permission on * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * * Separate out file-system wide checks from inode-specific permission checks. */ static int sb_permission(struct super_block *sb, struct inode *inode, int mask) { if (unlikely(mask & MAY_WRITE)) { umode_t mode = inode->i_mode; /* Nobody gets write access to a read-only fs. */ if (sb_rdonly(sb) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) return -EROFS; } return 0; } /** * inode_permission - Check for access rights to a given inode * @idmap: idmap of the mount the inode was found from * @inode: Inode to check permission on * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * * Check for read/write/execute permissions on an inode. We use fs[ug]id for * this, letting us set arbitrary permissions for filesystem access without * changing the "normal" UIDs which are used for other things. * * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask. */ int inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int retval; retval = sb_permission(inode->i_sb, inode, mask); if (retval) return retval; if (unlikely(mask & MAY_WRITE)) { /* * Nobody gets write access to an immutable file. */ if (IS_IMMUTABLE(inode)) return -EPERM; /* * Updating mtime will likely cause i_uid and i_gid to be * written back improperly if their true value is unknown * to the vfs. */ if (HAS_UNMAPPED_ID(idmap, inode)) return -EACCES; } retval = do_inode_permission(idmap, inode, mask); if (retval) return retval; retval = devcgroup_inode_permission(inode, mask); if (retval) return retval; return security_inode_permission(inode, mask); } EXPORT_SYMBOL(inode_permission); /** * path_get - get a reference to a path * @path: path to get the reference to * * Given a path increment the reference count to the dentry and the vfsmount. */ void path_get(const struct path *path) { mntget(path->mnt); dget(path->dentry); } EXPORT_SYMBOL(path_get); /** * path_put - put a reference to a path * @path: path to put the reference to * * Given a path decrement the reference count to the dentry and the vfsmount. */ void path_put(const struct path *path) { dput(path->dentry); mntput(path->mnt); } EXPORT_SYMBOL(path_put); #define EMBEDDED_LEVELS 2 struct nameidata { struct path path; struct qstr last; struct path root; struct inode *inode; /* path.dentry.d_inode */ unsigned int flags, state; unsigned seq, next_seq, m_seq, r_seq; int last_type; unsigned depth; int total_link_count; struct saved { struct path link; struct delayed_call done; const char *name; unsigned seq; } *stack, internal[EMBEDDED_LEVELS]; struct filename *name; struct nameidata *saved; unsigned root_seq; int dfd; vfsuid_t dir_vfsuid; umode_t dir_mode; } __randomize_layout; #define ND_ROOT_PRESET 1 #define ND_ROOT_GRABBED 2 #define ND_JUMPED 4 static void __set_nameidata(struct nameidata *p, int dfd, struct filename *name) { struct nameidata *old = current->nameidata; p->stack = p->internal; p->depth = 0; p->dfd = dfd; p->name = name; p->path.mnt = NULL; p->path.dentry = NULL; p->total_link_count = old ? old->total_link_count : 0; p->saved = old; current->nameidata = p; } static inline void set_nameidata(struct nameidata *p, int dfd, struct filename *name, const struct path *root) { __set_nameidata(p, dfd, name); p->state = 0; if (unlikely(root)) { p->state = ND_ROOT_PRESET; p->root = *root; } } static void restore_nameidata(void) { struct nameidata *now = current->nameidata, *old = now->saved; current->nameidata = old; if (old) old->total_link_count = now->total_link_count; if (now->stack != now->internal) kfree(now->stack); } static bool nd_alloc_stack(struct nameidata *nd) { struct saved *p; p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved), nd->flags & LOOKUP_RCU ? GFP_ATOMIC : GFP_KERNEL); if (unlikely(!p)) return false; memcpy(p, nd->internal, sizeof(nd->internal)); nd->stack = p; return true; } /** * path_connected - Verify that a dentry is below mnt.mnt_root * @mnt: The mountpoint to check. * @dentry: The dentry to check. * * Rename can sometimes move a file or directory outside of a bind * mount, path_connected allows those cases to be detected. */ static bool path_connected(struct vfsmount *mnt, struct dentry *dentry) { struct super_block *sb = mnt->mnt_sb; /* Bind mounts can have disconnected paths */ if (mnt->mnt_root == sb->s_root) return true; return is_subdir(dentry, mnt->mnt_root); } static void drop_links(struct nameidata *nd) { int i = nd->depth; while (i--) { struct saved *last = nd->stack + i; do_delayed_call(&last->done); clear_delayed_call(&last->done); } } static void leave_rcu(struct nameidata *nd) { nd->flags &= ~LOOKUP_RCU; nd->seq = nd->next_seq = 0; rcu_read_unlock(); } static void terminate_walk(struct nameidata *nd) { drop_links(nd); if (!(nd->flags & LOOKUP_RCU)) { int i; path_put(&nd->path); for (i = 0; i < nd->depth; i++) path_put(&nd->stack[i].link); if (nd->state & ND_ROOT_GRABBED) { path_put(&nd->root); nd->state &= ~ND_ROOT_GRABBED; } } else { leave_rcu(nd); } nd->depth = 0; nd->path.mnt = NULL; nd->path.dentry = NULL; } /* path_put is needed afterwards regardless of success or failure */ static bool __legitimize_path(struct path *path, unsigned seq, unsigned mseq) { int res = __legitimize_mnt(path->mnt, mseq); if (unlikely(res)) { if (res > 0) path->mnt = NULL; path->dentry = NULL; return false; } if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) { path->dentry = NULL; return false; } return !read_seqcount_retry(&path->dentry->d_seq, seq); } static inline bool legitimize_path(struct nameidata *nd, struct path *path, unsigned seq) { return __legitimize_path(path, seq, nd->m_seq); } static bool legitimize_links(struct nameidata *nd) { int i; if (unlikely(nd->flags & LOOKUP_CACHED)) { drop_links(nd); nd->depth = 0; return false; } for (i = 0; i < nd->depth; i++) { struct saved *last = nd->stack + i; if (unlikely(!legitimize_path(nd, &last->link, last->seq))) { drop_links(nd); nd->depth = i + 1; return false; } } return true; } static bool legitimize_root(struct nameidata *nd) { /* Nothing to do if nd->root is zero or is managed by the VFS user. */ if (!nd->root.mnt || (nd->state & ND_ROOT_PRESET)) return true; nd->state |= ND_ROOT_GRABBED; return legitimize_path(nd, &nd->root, nd->root_seq); } /* * Path walking has 2 modes, rcu-walk and ref-walk (see * Documentation/filesystems/path-lookup.txt). In situations when we can't * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab * normal reference counts on dentries and vfsmounts to transition to ref-walk * mode. Refcounts are grabbed at the last known good point before rcu-walk * got stuck, so ref-walk may continue from there. If this is not successful * (eg. a seqcount has changed), then failure is returned and it's up to caller * to restart the path walk from the beginning in ref-walk mode. */ /** * try_to_unlazy - try to switch to ref-walk mode. * @nd: nameidata pathwalk data * Returns: true on success, false on failure * * try_to_unlazy attempts to legitimize the current nd->path and nd->root * for ref-walk mode. * Must be called from rcu-walk context. * Nothing should touch nameidata between try_to_unlazy() failure and * terminate_walk(). */ static bool try_to_unlazy(struct nameidata *nd) { struct dentry *parent = nd->path.dentry; BUG_ON(!(nd->flags & LOOKUP_RCU)); if (unlikely(!legitimize_links(nd))) goto out1; if (unlikely(!legitimize_path(nd, &nd->path, nd->seq))) goto out; if (unlikely(!legitimize_root(nd))) goto out; leave_rcu(nd); BUG_ON(nd->inode != parent->d_inode); return true; out1: nd->path.mnt = NULL; nd->path.dentry = NULL; out: leave_rcu(nd); return false; } /** * try_to_unlazy_next - try to switch to ref-walk mode. * @nd: nameidata pathwalk data * @dentry: next dentry to step into * Returns: true on success, false on failure * * Similar to try_to_unlazy(), but here we have the next dentry already * picked by rcu-walk and want to legitimize that in addition to the current * nd->path and nd->root for ref-walk mode. Must be called from rcu-walk context. * Nothing should touch nameidata between try_to_unlazy_next() failure and * terminate_walk(). */ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry) { int res; BUG_ON(!(nd->flags & LOOKUP_RCU)); if (unlikely(!legitimize_links(nd))) goto out2; res = __legitimize_mnt(nd->path.mnt, nd->m_seq); if (unlikely(res)) { if (res > 0) goto out2; goto out1; } if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref))) goto out1; /* * We need to move both the parent and the dentry from the RCU domain * to be properly refcounted. And the sequence number in the dentry * validates *both* dentry counters, since we checked the sequence * number of the parent after we got the child sequence number. So we * know the parent must still be valid if the child sequence number is */ if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) goto out; if (read_seqcount_retry(&dentry->d_seq, nd->next_seq)) goto out_dput; /* * Sequence counts matched. Now make sure that the root is * still valid and get it if required. */ if (unlikely(!legitimize_root(nd))) goto out_dput; leave_rcu(nd); return true; out2: nd->path.mnt = NULL; out1: nd->path.dentry = NULL; out: leave_rcu(nd); return false; out_dput: leave_rcu(nd); dput(dentry); return false; } static inline int d_revalidate(struct dentry *dentry, unsigned int flags) { if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) return dentry->d_op->d_revalidate(dentry, flags); else return 1; } /** * complete_walk - successful completion of path walk * @nd: pointer nameidata * * If we had been in RCU mode, drop out of it and legitimize nd->path. * Revalidate the final result, unless we'd already done that during * the path walk or the filesystem doesn't ask for it. Return 0 on * success, -error on failure. In case of failure caller does not * need to drop nd->path. */ static int complete_walk(struct nameidata *nd) { struct dentry *dentry = nd->path.dentry; int status; if (nd->flags & LOOKUP_RCU) { /* * We don't want to zero nd->root for scoped-lookups or * externally-managed nd->root. */ if (!(nd->state & ND_ROOT_PRESET)) if (!(nd->flags & LOOKUP_IS_SCOPED)) nd->root.mnt = NULL; nd->flags &= ~LOOKUP_CACHED; if (!try_to_unlazy(nd)) return -ECHILD; } if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) { /* * While the guarantee of LOOKUP_IS_SCOPED is (roughly) "don't * ever step outside the root during lookup" and should already * be guaranteed by the rest of namei, we want to avoid a namei * BUG resulting in userspace being given a path that was not * scoped within the root at some point during the lookup. * * So, do a final sanity-check to make sure that in the * worst-case scenario (a complete bypass of LOOKUP_IS_SCOPED) * we won't silently return an fd completely outside of the * requested root to userspace. * * Userspace could move the path outside the root after this * check, but as discussed elsewhere this is not a concern (the * resolved file was inside the root at some point). */ if (!path_is_under(&nd->path, &nd->root)) return -EXDEV; } if (likely(!(nd->state & ND_JUMPED))) return 0; if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE))) return 0; status = dentry->d_op->d_weak_revalidate(dentry, nd->flags); if (status > 0) return 0; if (!status) status = -ESTALE; return status; } static int set_root(struct nameidata *nd) { struct fs_struct *fs = current->fs; /* * Jumping to the real root in a scoped-lookup is a BUG in namei, but we * still have to ensure it doesn't happen because it will cause a breakout * from the dirfd. */ if (WARN_ON(nd->flags & LOOKUP_IS_SCOPED)) return -ENOTRECOVERABLE; if (nd->flags & LOOKUP_RCU) { unsigned seq; do { seq = read_seqcount_begin(&fs->seq); nd->root = fs->root; nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq); } while (read_seqcount_retry(&fs->seq, seq)); } else { get_fs_root(fs, &nd->root); nd->state |= ND_ROOT_GRABBED; } return 0; } static int nd_jump_root(struct nameidata *nd) { if (unlikely(nd->flags & LOOKUP_BENEATH)) return -EXDEV; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) { /* Absolute path arguments to path_init() are allowed. */ if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt) return -EXDEV; } if (!nd->root.mnt) { int error = set_root(nd); if (error) return error; } if (nd->flags & LOOKUP_RCU) { struct dentry *d; nd->path = nd->root; d = nd->path.dentry; nd->inode = d->d_inode; nd->seq = nd->root_seq; if (read_seqcount_retry(&d->d_seq, nd->seq)) return -ECHILD; } else { path_put(&nd->path); nd->path = nd->root; path_get(&nd->path); nd->inode = nd->path.dentry->d_inode; } nd->state |= ND_JUMPED; return 0; } /* * Helper to directly jump to a known parsed path from ->get_link, * caller must have taken a reference to path beforehand. */ int nd_jump_link(const struct path *path) { int error = -ELOOP; struct nameidata *nd = current->nameidata; if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS)) goto err; error = -EXDEV; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) { if (nd->path.mnt != path->mnt) goto err; } /* Not currently safe for scoped-lookups. */ if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) goto err; path_put(&nd->path); nd->path = *path; nd->inode = nd->path.dentry->d_inode; nd->state |= ND_JUMPED; return 0; err: path_put(path); return error; } static inline void put_link(struct nameidata *nd) { struct saved *last = nd->stack + --nd->depth; do_delayed_call(&last->done); if (!(nd->flags & LOOKUP_RCU)) path_put(&last->link); } static int sysctl_protected_symlinks __read_mostly; static int sysctl_protected_hardlinks __read_mostly; static int sysctl_protected_fifos __read_mostly; static int sysctl_protected_regular __read_mostly; #ifdef CONFIG_SYSCTL static struct ctl_table namei_sysctls[] = { { .procname = "protected_symlinks", .data = &sysctl_protected_symlinks, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "protected_hardlinks", .data = &sysctl_protected_hardlinks, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "protected_fifos", .data = &sysctl_protected_fifos, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { .procname = "protected_regular", .data = &sysctl_protected_regular, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { } }; static int __init init_fs_namei_sysctls(void) { register_sysctl_init("fs", namei_sysctls); return 0; } fs_initcall(init_fs_namei_sysctls); #endif /* CONFIG_SYSCTL */ /** * may_follow_link - Check symlink following for unsafe situations * @nd: nameidata pathwalk data * @inode: Used for idmapping. * * In the case of the sysctl_protected_symlinks sysctl being enabled, * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is * in a sticky world-writable directory. This is to protect privileged * processes from failing races against path names that may change out * from under them by way of other users creating malicious symlinks. * It will permit symlinks to be followed only when outside a sticky * world-writable directory, or when the uid of the symlink and follower * match, or when the directory owner matches the symlink's owner. * * Returns 0 if following the symlink is allowed, -ve on error. */ static inline int may_follow_link(struct nameidata *nd, const struct inode *inode) { struct mnt_idmap *idmap; vfsuid_t vfsuid; if (!sysctl_protected_symlinks) return 0; idmap = mnt_idmap(nd->path.mnt); vfsuid = i_uid_into_vfsuid(idmap, inode); /* Allowed if owner and follower match. */ if (vfsuid_eq_kuid(vfsuid, current_fsuid())) return 0; /* Allowed if parent directory not sticky and world-writable. */ if ((nd->dir_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH)) return 0; /* Allowed if parent directory and link owner match. */ if (vfsuid_valid(nd->dir_vfsuid) && vfsuid_eq(nd->dir_vfsuid, vfsuid)) return 0; if (nd->flags & LOOKUP_RCU) return -ECHILD; audit_inode(nd->name, nd->stack[0].link.dentry, 0); audit_log_path_denied(AUDIT_ANOM_LINK, "follow_link"); return -EACCES; } /** * safe_hardlink_source - Check for safe hardlink conditions * @idmap: idmap of the mount the inode was found from * @inode: the source inode to hardlink from * * Return false if at least one of the following conditions: * - inode is not a regular file * - inode is setuid * - inode is setgid and group-exec * - access failure for read and write * * Otherwise returns true. */ static bool safe_hardlink_source(struct mnt_idmap *idmap, struct inode *inode) { umode_t mode = inode->i_mode; /* Special files should not get pinned to the filesystem. */ if (!S_ISREG(mode)) return false; /* Setuid files should not get pinned to the filesystem. */ if (mode & S_ISUID) return false; /* Executable setgid files should not get pinned to the filesystem. */ if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) return false; /* Hardlinking to unreadable or unwritable sources is dangerous. */ if (inode_permission(idmap, inode, MAY_READ | MAY_WRITE)) return false; return true; } /** * may_linkat - Check permissions for creating a hardlink * @idmap: idmap of the mount the inode was found from * @link: the source to hardlink from * * Block hardlink when all of: * - sysctl_protected_hardlinks enabled * - fsuid does not match inode * - hardlink source is unsafe (see safe_hardlink_source() above) * - not CAP_FOWNER in a namespace with the inode owner uid mapped * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. * * Returns 0 if successful, -ve on error. */ int may_linkat(struct mnt_idmap *idmap, const struct path *link) { struct inode *inode = link->dentry->d_inode; /* Inode writeback is not safe when the uid or gid are invalid. */ if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) || !vfsgid_valid(i_gid_into_vfsgid(idmap, inode))) return -EOVERFLOW; if (!sysctl_protected_hardlinks) return 0; /* Source inode owner (or CAP_FOWNER) can hardlink all they like, * otherwise, it must be a safe source. */ if (safe_hardlink_source(idmap, inode) || inode_owner_or_capable(idmap, inode)) return 0; audit_log_path_denied(AUDIT_ANOM_LINK, "linkat"); return -EPERM; } /** * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory * should be allowed, or not, on files that already * exist. * @idmap: idmap of the mount the inode was found from * @nd: nameidata pathwalk data * @inode: the inode of the file to open * * Block an O_CREAT open of a FIFO (or a regular file) when: * - sysctl_protected_fifos (or sysctl_protected_regular) is enabled * - the file already exists * - we are in a sticky directory * - we don't own the file * - the owner of the directory doesn't own the file * - the directory is world writable * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2 * the directory doesn't have to be world writable: being group writable will * be enough. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. * * Returns 0 if the open is allowed, -ve on error. */ static int may_create_in_sticky(struct mnt_idmap *idmap, struct nameidata *nd, struct inode *const inode) { umode_t dir_mode = nd->dir_mode; vfsuid_t dir_vfsuid = nd->dir_vfsuid; if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) || (!sysctl_protected_regular && S_ISREG(inode->i_mode)) || likely(!(dir_mode & S_ISVTX)) || vfsuid_eq(i_uid_into_vfsuid(idmap, inode), dir_vfsuid) || vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), current_fsuid())) return 0; if (likely(dir_mode & 0002) || (dir_mode & 0020 && ((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) || (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) { const char *operation = S_ISFIFO(inode->i_mode) ? "sticky_create_fifo" : "sticky_create_regular"; audit_log_path_denied(AUDIT_ANOM_CREAT, operation); return -EACCES; } return 0; } /* * follow_up - Find the mountpoint of path's vfsmount * * Given a path, find the mountpoint of its source file system. * Replace @path with the path of the mountpoint in the parent mount. * Up is towards /. * * Return 1 if we went up a level and 0 if we were already at the * root. */ int follow_up(struct path *path) { struct mount *mnt = real_mount(path->mnt); struct mount *parent; struct dentry *mountpoint; read_seqlock_excl(&mount_lock); parent = mnt->mnt_parent; if (parent == mnt) { read_sequnlock_excl(&mount_lock); return 0; } mntget(&parent->mnt); mountpoint = dget(mnt->mnt_mountpoint); read_sequnlock_excl(&mount_lock); dput(path->dentry); path->dentry = mountpoint; mntput(path->mnt); path->mnt = &parent->mnt; return 1; } EXPORT_SYMBOL(follow_up); static bool choose_mountpoint_rcu(struct mount *m, const struct path *root, struct path *path, unsigned *seqp) { while (mnt_has_parent(m)) { struct dentry *mountpoint = m->mnt_mountpoint; m = m->mnt_parent; if (unlikely(root->dentry == mountpoint && root->mnt == &m->mnt)) break; if (mountpoint != m->mnt.mnt_root) { path->mnt = &m->mnt; path->dentry = mountpoint; *seqp = read_seqcount_begin(&mountpoint->d_seq); return true; } } return false; } static bool choose_mountpoint(struct mount *m, const struct path *root, struct path *path) { bool found; rcu_read_lock(); while (1) { unsigned seq, mseq = read_seqbegin(&mount_lock); found = choose_mountpoint_rcu(m, root, path, &seq); if (unlikely(!found)) { if (!read_seqretry(&mount_lock, mseq)) break; } else { if (likely(__legitimize_path(path, seq, mseq))) break; rcu_read_unlock(); path_put(path); rcu_read_lock(); } } rcu_read_unlock(); return found; } /* * Perform an automount * - return -EISDIR to tell follow_managed() to stop and return the path we * were called with. */ static int follow_automount(struct path *path, int *count, unsigned lookup_flags) { struct dentry *dentry = path->dentry; /* We don't want to mount if someone's just doing a stat - * unless they're stat'ing a directory and appended a '/' to * the name. * * We do, however, want to mount if someone wants to open or * create a file of any type under the mountpoint, wants to * traverse through the mountpoint or wants to open the * mounted directory. Also, autofs may mark negative dentries * as being automount points. These will need the attentions * of the daemon to instantiate them before they can be used. */ if (!(lookup_flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && dentry->d_inode) return -EISDIR; if (count && (*count)++ >= MAXSYMLINKS) return -ELOOP; return finish_automount(dentry->d_op->d_automount(path), path); } /* * mount traversal - out-of-line part. One note on ->d_flags accesses - * dentries are pinned but not locked here, so negative dentry can go * positive right under us. Use of smp_load_acquire() provides a barrier * sufficient for ->d_inode and ->d_flags consistency. */ static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped, int *count, unsigned lookup_flags) { struct vfsmount *mnt = path->mnt; bool need_mntput = false; int ret = 0; while (flags & DCACHE_MANAGED_DENTRY) { /* Allow the filesystem to manage the transit without i_mutex * being held. */ if (flags & DCACHE_MANAGE_TRANSIT) { ret = path->dentry->d_op->d_manage(path, false); flags = smp_load_acquire(&path->dentry->d_flags); if (ret < 0) break; } if (flags & DCACHE_MOUNTED) { // something's mounted on it.. struct vfsmount *mounted = lookup_mnt(path); if (mounted) { // ... in our namespace dput(path->dentry); if (need_mntput) mntput(path->mnt); path->mnt = mounted; path->dentry = dget(mounted->mnt_root); // here we know it's positive flags = path->dentry->d_flags; need_mntput = true; continue; } } if (!(flags & DCACHE_NEED_AUTOMOUNT)) break; // uncovered automount point ret = follow_automount(path, count, lookup_flags); flags = smp_load_acquire(&path->dentry->d_flags); if (ret < 0) break; } if (ret == -EISDIR) ret = 0; // possible if you race with several mount --move if (need_mntput && path->mnt == mnt) mntput(path->mnt); if (!ret && unlikely(d_flags_negative(flags))) ret = -ENOENT; *jumped = need_mntput; return ret; } static inline int traverse_mounts(struct path *path, bool *jumped, int *count, unsigned lookup_flags) { unsigned flags = smp_load_acquire(&path->dentry->d_flags); /* fastpath */ if (likely(!(flags & DCACHE_MANAGED_DENTRY))) { *jumped = false; if (unlikely(d_flags_negative(flags))) return -ENOENT; return 0; } return __traverse_mounts(path, flags, jumped, count, lookup_flags); } int follow_down_one(struct path *path) { struct vfsmount *mounted; mounted = lookup_mnt(path); if (mounted) { dput(path->dentry); mntput(path->mnt); path->mnt = mounted; path->dentry = dget(mounted->mnt_root); return 1; } return 0; } EXPORT_SYMBOL(follow_down_one); /* * Follow down to the covering mount currently visible to userspace. At each * point, the filesystem owning that dentry may be queried as to whether the * caller is permitted to proceed or not. */ int follow_down(struct path *path, unsigned int flags) { struct vfsmount *mnt = path->mnt; bool jumped; int ret = traverse_mounts(path, &jumped, NULL, flags); if (path->mnt != mnt) mntput(mnt); return ret; } EXPORT_SYMBOL(follow_down); /* * Try to skip to top of mountpoint pile in rcuwalk mode. Fail if * we meet a managed dentry that would need blocking. */ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path) { struct dentry *dentry = path->dentry; unsigned int flags = dentry->d_flags; if (likely(!(flags & DCACHE_MANAGED_DENTRY))) return true; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) return false; for (;;) { /* * Don't forget we might have a non-mountpoint managed dentry * that wants to block transit. */ if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) { int res = dentry->d_op->d_manage(path, true); if (res) return res == -EISDIR; flags = dentry->d_flags; } if (flags & DCACHE_MOUNTED) { struct mount *mounted = __lookup_mnt(path->mnt, dentry); if (mounted) { path->mnt = &mounted->mnt; dentry = path->dentry = mounted->mnt.mnt_root; nd->state |= ND_JUMPED; nd->next_seq = read_seqcount_begin(&dentry->d_seq); flags = dentry->d_flags; // makes sure that non-RCU pathwalk could reach // this state. if (read_seqretry(&mount_lock, nd->m_seq)) return false; continue; } if (read_seqretry(&mount_lock, nd->m_seq)) return false; } return !(flags & DCACHE_NEED_AUTOMOUNT); } } static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, struct path *path) { bool jumped; int ret; path->mnt = nd->path.mnt; path->dentry = dentry; if (nd->flags & LOOKUP_RCU) { unsigned int seq = nd->next_seq; if (likely(__follow_mount_rcu(nd, path))) return 0; // *path and nd->next_seq might've been clobbered path->mnt = nd->path.mnt; path->dentry = dentry; nd->next_seq = seq; if (!try_to_unlazy_next(nd, dentry)) return -ECHILD; } ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags); if (jumped) { if (unlikely(nd->flags & LOOKUP_NO_XDEV)) ret = -EXDEV; else nd->state |= ND_JUMPED; } if (unlikely(ret)) { dput(path->dentry); if (path->mnt != nd->path.mnt) mntput(path->mnt); } return ret; } /* * This looks up the name in dcache and possibly revalidates the found dentry. * NULL is returned if the dentry does not exist in the cache. */ static struct dentry *lookup_dcache(const struct qstr *name, struct dentry *dir, unsigned int flags) { struct dentry *dentry = d_lookup(dir, name); if (dentry) { int error = d_revalidate(dentry, flags); if (unlikely(error <= 0)) { if (!error) d_invalidate(dentry); dput(dentry); return ERR_PTR(error); } } return dentry; } /* * Parent directory has inode locked exclusive. This is one * and only case when ->lookup() gets called on non in-lookup * dentries - as the matter of fact, this only gets called * when directory is guaranteed to have no in-lookup children * at all. */ struct dentry *lookup_one_qstr_excl(const struct qstr *name, struct dentry *base, unsigned int flags) { struct dentry *dentry = lookup_dcache(name, base, flags); struct dentry *old; struct inode *dir = base->d_inode; if (dentry) return dentry; /* Don't create child dentry for a dead directory. */ if (unlikely(IS_DEADDIR(dir))) return ERR_PTR(-ENOENT); dentry = d_alloc(base, name); if (unlikely(!dentry)) return ERR_PTR(-ENOMEM); old = dir->i_op->lookup(dir, dentry, flags); if (unlikely(old)) { dput(dentry); dentry = old; } return dentry; } EXPORT_SYMBOL(lookup_one_qstr_excl); static struct dentry *lookup_fast(struct nameidata *nd) { struct dentry *dentry, *parent = nd->path.dentry; int status = 1; /* * Rename seqlock is not required here because in the off chance * of a false negative due to a concurrent rename, the caller is * going to fall back to non-racy lookup. */ if (nd->flags & LOOKUP_RCU) { dentry = __d_lookup_rcu(parent, &nd->last, &nd->next_seq); if (unlikely(!dentry)) { if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); return NULL; } /* * This sequence count validates that the parent had no * changes while we did the lookup of the dentry above. */ if (read_seqcount_retry(&parent->d_seq, nd->seq)) return ERR_PTR(-ECHILD); status = d_revalidate(dentry, nd->flags); if (likely(status > 0)) return dentry; if (!try_to_unlazy_next(nd, dentry)) return ERR_PTR(-ECHILD); if (status == -ECHILD) /* we'd been told to redo it in non-rcu mode */ status = d_revalidate(dentry, nd->flags); } else { dentry = __d_lookup(parent, &nd->last); if (unlikely(!dentry)) return NULL; status = d_revalidate(dentry, nd->flags); } if (unlikely(status <= 0)) { if (!status) d_invalidate(dentry); dput(dentry); return ERR_PTR(status); } return dentry; } /* Fast lookup failed, do it the slow way */ static struct dentry *__lookup_slow(const struct qstr *name, struct dentry *dir, unsigned int flags) { struct dentry *dentry, *old; struct inode *inode = dir->d_inode; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); /* Don't go there if it's already dead */ if (unlikely(IS_DEADDIR(inode))) return ERR_PTR(-ENOENT); again: dentry = d_alloc_parallel(dir, name, &wq); if (IS_ERR(dentry)) return dentry; if (unlikely(!d_in_lookup(dentry))) { int error = d_revalidate(dentry, flags); if (unlikely(error <= 0)) { if (!error) { d_invalidate(dentry); dput(dentry); goto again; } dput(dentry); dentry = ERR_PTR(error); } } else { old = inode->i_op->lookup(inode, dentry, flags); d_lookup_done(dentry); if (unlikely(old)) { dput(dentry); dentry = old; } } return dentry; } static struct dentry *lookup_slow(const struct qstr *name, struct dentry *dir, unsigned int flags) { struct inode *inode = dir->d_inode; struct dentry *res; inode_lock_shared(inode); res = __lookup_slow(name, dir, flags); inode_unlock_shared(inode); return res; } static inline int may_lookup(struct mnt_idmap *idmap, struct nameidata *nd) { if (nd->flags & LOOKUP_RCU) { int err = inode_permission(idmap, nd->inode, MAY_EXEC|MAY_NOT_BLOCK); if (err != -ECHILD || !try_to_unlazy(nd)) return err; } return inode_permission(idmap, nd->inode, MAY_EXEC); } static int reserve_stack(struct nameidata *nd, struct path *link) { if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) return -ELOOP; if (likely(nd->depth != EMBEDDED_LEVELS)) return 0; if (likely(nd->stack != nd->internal)) return 0; if (likely(nd_alloc_stack(nd))) return 0; if (nd->flags & LOOKUP_RCU) { // we need to grab link before we do unlazy. And we can't skip // unlazy even if we fail to grab the link - cleanup needs it bool grabbed_link = legitimize_path(nd, link, nd->next_seq); if (!try_to_unlazy(nd) || !grabbed_link) return -ECHILD; if (nd_alloc_stack(nd)) return 0; } return -ENOMEM; } enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4}; static const char *pick_link(struct nameidata *nd, struct path *link, struct inode *inode, int flags) { struct saved *last; const char *res; int error = reserve_stack(nd, link); if (unlikely(error)) { if (!(nd->flags & LOOKUP_RCU)) path_put(link); return ERR_PTR(error); } last = nd->stack + nd->depth++; last->link = *link; clear_delayed_call(&last->done); last->seq = nd->next_seq; if (flags & WALK_TRAILING) { error = may_follow_link(nd, inode); if (unlikely(error)) return ERR_PTR(error); } if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) || unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW)) return ERR_PTR(-ELOOP); if (!(nd->flags & LOOKUP_RCU)) { touch_atime(&last->link); cond_resched(); } else if (atime_needs_update(&last->link, inode)) { if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); touch_atime(&last->link); } error = security_inode_follow_link(link->dentry, inode, nd->flags & LOOKUP_RCU); if (unlikely(error)) return ERR_PTR(error); res = READ_ONCE(inode->i_link); if (!res) { const char * (*get)(struct dentry *, struct inode *, struct delayed_call *); get = inode->i_op->get_link; if (nd->flags & LOOKUP_RCU) { res = get(NULL, inode, &last->done); if (res == ERR_PTR(-ECHILD) && try_to_unlazy(nd)) res = get(link->dentry, inode, &last->done); } else { res = get(link->dentry, inode, &last->done); } if (!res) goto all_done; if (IS_ERR(res)) return res; } if (*res == '/') { error = nd_jump_root(nd); if (unlikely(error)) return ERR_PTR(error); while (unlikely(*++res == '/')) ; } if (*res) return res; all_done: // pure jump put_link(nd); return NULL; } /* * Do we need to follow links? We _really_ want to be able * to do this check without having to look at inode->i_op, * so we keep a cache of "no, this doesn't need follow_link" * for the common case. * * NOTE: dentry must be what nd->next_seq had been sampled from. */ static const char *step_into(struct nameidata *nd, int flags, struct dentry *dentry) { struct path path; struct inode *inode; int err = handle_mounts(nd, dentry, &path); if (err < 0) return ERR_PTR(err); inode = path.dentry->d_inode; if (likely(!d_is_symlink(path.dentry)) || ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) || (flags & WALK_NOFOLLOW)) { /* not a symlink or should not follow */ if (nd->flags & LOOKUP_RCU) { if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq)) return ERR_PTR(-ECHILD); if (unlikely(!inode)) return ERR_PTR(-ENOENT); } else { dput(nd->path.dentry); if (nd->path.mnt != path.mnt) mntput(nd->path.mnt); } nd->path = path; nd->inode = inode; nd->seq = nd->next_seq; return NULL; } if (nd->flags & LOOKUP_RCU) { /* make sure that d_is_symlink above matches inode */ if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq)) return ERR_PTR(-ECHILD); } else { if (path.mnt == nd->path.mnt) mntget(path.mnt); } return pick_link(nd, &path, inode, flags); } static struct dentry *follow_dotdot_rcu(struct nameidata *nd) { struct dentry *parent, *old; if (path_equal(&nd->path, &nd->root)) goto in_root; if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) { struct path path; unsigned seq; if (!choose_mountpoint_rcu(real_mount(nd->path.mnt), &nd->root, &path, &seq)) goto in_root; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) return ERR_PTR(-ECHILD); nd->path = path; nd->inode = path.dentry->d_inode; nd->seq = seq; // makes sure that non-RCU pathwalk could reach this state if (read_seqretry(&mount_lock, nd->m_seq)) return ERR_PTR(-ECHILD); /* we know that mountpoint was pinned */ } old = nd->path.dentry; parent = old->d_parent; nd->next_seq = read_seqcount_begin(&parent->d_seq); // makes sure that non-RCU pathwalk could reach this state if (read_seqcount_retry(&old->d_seq, nd->seq)) return ERR_PTR(-ECHILD); if (unlikely(!path_connected(nd->path.mnt, parent))) return ERR_PTR(-ECHILD); return parent; in_root: if (read_seqretry(&mount_lock, nd->m_seq)) return ERR_PTR(-ECHILD); if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-ECHILD); nd->next_seq = nd->seq; return nd->path.dentry; } static struct dentry *follow_dotdot(struct nameidata *nd) { struct dentry *parent; if (path_equal(&nd->path, &nd->root)) goto in_root; if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) { struct path path; if (!choose_mountpoint(real_mount(nd->path.mnt), &nd->root, &path)) goto in_root; path_put(&nd->path); nd->path = path; nd->inode = path.dentry->d_inode; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) return ERR_PTR(-EXDEV); } /* rare case of legitimate dget_parent()... */ parent = dget_parent(nd->path.dentry); if (unlikely(!path_connected(nd->path.mnt, parent))) { dput(parent); return ERR_PTR(-ENOENT); } return parent; in_root: if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-EXDEV); return dget(nd->path.dentry); } static const char *handle_dots(struct nameidata *nd, int type) { if (type == LAST_DOTDOT) { const char *error = NULL; struct dentry *parent; if (!nd->root.mnt) { error = ERR_PTR(set_root(nd)); if (error) return error; } if (nd->flags & LOOKUP_RCU) parent = follow_dotdot_rcu(nd); else parent = follow_dotdot(nd); if (IS_ERR(parent)) return ERR_CAST(parent); error = step_into(nd, WALK_NOFOLLOW, parent); if (unlikely(error)) return error; if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) { /* * If there was a racing rename or mount along our * path, then we can't be sure that ".." hasn't jumped * above nd->root (and so userspace should retry or use * some fallback). */ smp_rmb(); if (__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq)) return ERR_PTR(-EAGAIN); if (__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq)) return ERR_PTR(-EAGAIN); } } return NULL; } static const char *walk_component(struct nameidata *nd, int flags) { struct dentry *dentry; /* * "." and ".." are special - ".." especially so because it has * to be able to know about the current root directory and * parent relationships. */ if (unlikely(nd->last_type != LAST_NORM)) { if (!(flags & WALK_MORE) && nd->depth) put_link(nd); return handle_dots(nd, nd->last_type); } dentry = lookup_fast(nd); if (IS_ERR(dentry)) return ERR_CAST(dentry); if (unlikely(!dentry)) { dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags); if (IS_ERR(dentry)) return ERR_CAST(dentry); } if (!(flags & WALK_MORE) && nd->depth) put_link(nd); return step_into(nd, flags, dentry); } /* * We can do the critical dentry name comparison and hashing * operations one word at a time, but we are limited to: * * - Architectures with fast unaligned word accesses. We could * do a "get_unaligned()" if this helps and is sufficiently * fast. * * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we * do not trap on the (extremely unlikely) case of a page * crossing operation. * * - Furthermore, we need an efficient 64-bit compile for the * 64-bit case in order to generate the "number of bytes in * the final mask". Again, that could be replaced with a * efficient population count instruction or similar. */ #ifdef CONFIG_DCACHE_WORD_ACCESS #include <asm/word-at-a-time.h> #ifdef HASH_MIX /* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */ #elif defined(CONFIG_64BIT) /* * Register pressure in the mixing function is an issue, particularly * on 32-bit x86, but almost any function requires one state value and * one temporary. Instead, use a function designed for two state values * and no temporaries. * * This function cannot create a collision in only two iterations, so * we have two iterations to achieve avalanche. In those two iterations, * we have six layers of mixing, which is enough to spread one bit's * influence out to 2^6 = 64 state bits. * * Rotate constants are scored by considering either 64 one-bit input * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the * probability of that delta causing a change to each of the 128 output * bits, using a sample of random initial states. * * The Shannon entropy of the computed probabilities is then summed * to produce a score. Ideally, any input change has a 50% chance of * toggling any given output bit. * * Mixing scores (in bits) for (12,45): * Input delta: 1-bit 2-bit * 1 round: 713.3 42542.6 * 2 rounds: 2753.7 140389.8 * 3 rounds: 5954.1 233458.2 * 4 rounds: 7862.6 256672.2 * Perfect: 8192 258048 * (64*128) (64*63/2 * 128) */ #define HASH_MIX(x, y, a) \ ( x ^= (a), \ y ^= x, x = rol64(x,12),\ x += y, y = rol64(y,45),\ y *= 9 ) /* * Fold two longs into one 32-bit hash value. This must be fast, but * latency isn't quite as critical, as there is a fair bit of additional * work done before the hash value is used. */ static inline unsigned int fold_hash(unsigned long x, unsigned long y) { y ^= x * GOLDEN_RATIO_64; y *= GOLDEN_RATIO_64; return y >> 32; } #else /* 32-bit case */ /* * Mixing scores (in bits) for (7,20): * Input delta: 1-bit 2-bit * 1 round: 330.3 9201.6 * 2 rounds: 1246.4 25475.4 * 3 rounds: 1907.1 31295.1 * 4 rounds: 2042.3 31718.6 * Perfect: 2048 31744 * (32*64) (32*31/2 * 64) */ #define HASH_MIX(x, y, a) \ ( x ^= (a), \ y ^= x, x = rol32(x, 7),\ x += y, y = rol32(y,20),\ y *= 9 ) static inline unsigned int fold_hash(unsigned long x, unsigned long y) { /* Use arch-optimized multiply if one exists */ return __hash_32(y ^ __hash_32(x)); } #endif /* * Return the hash of a string of known length. This is carfully * designed to match hash_name(), which is the more critical function. * In particular, we must end by hashing a final word containing 0..7 * payload bytes, to match the way that hash_name() iterates until it * finds the delimiter after the name. */ unsigned int full_name_hash(const void *salt, const char *name, unsigned int len) { unsigned long a, x = 0, y = (unsigned long)salt; for (;;) { if (!len) goto done; a = load_unaligned_zeropad(name); if (len < sizeof(unsigned long)) break; HASH_MIX(x, y, a); name += sizeof(unsigned long); len -= sizeof(unsigned long); } x ^= a & bytemask_from_count(len); done: return fold_hash(x, y); } EXPORT_SYMBOL(full_name_hash); /* Return the "hash_len" (hash and length) of a null-terminated string */ u64 hashlen_string(const void *salt, const char *name) { unsigned long a = 0, x = 0, y = (unsigned long)salt; unsigned long adata, mask, len; const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; len = 0; goto inside; do { HASH_MIX(x, y, a); len += sizeof(unsigned long); inside: a = load_unaligned_zeropad(name+len); } while (!has_zero(a, &adata, &constants)); adata = prep_zero_mask(a, adata, &constants); mask = create_zero_mask(adata); x ^= a & zero_bytemask(mask); return hashlen_create(fold_hash(x, y), len + find_zero(mask)); } EXPORT_SYMBOL(hashlen_string); /* * Calculate the length and hash of the path component, and * return the "hash_len" as the result. */ static inline u64 hash_name(const void *salt, const char *name) { unsigned long a = 0, b, x = 0, y = (unsigned long)salt; unsigned long adata, bdata, mask, len; const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; len = 0; goto inside; do { HASH_MIX(x, y, a); len += sizeof(unsigned long); inside: a = load_unaligned_zeropad(name+len); b = a ^ REPEAT_BYTE('/'); } while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants))); adata = prep_zero_mask(a, adata, &constants); bdata = prep_zero_mask(b, bdata, &constants); mask = create_zero_mask(adata | bdata); x ^= a & zero_bytemask(mask); return hashlen_create(fold_hash(x, y), len + find_zero(mask)); } #else /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */ /* Return the hash of a string of known length */ unsigned int full_name_hash(const void *salt, const char *name, unsigned int len) { unsigned long hash = init_name_hash(salt); while (len--) hash = partial_name_hash((unsigned char)*name++, hash); return end_name_hash(hash); } EXPORT_SYMBOL(full_name_hash); /* Return the "hash_len" (hash and length) of a null-terminated string */ u64 hashlen_string(const void *salt, const char *name) { unsigned long hash = init_name_hash(salt); unsigned long len = 0, c; c = (unsigned char)*name; while (c) { len++; hash = partial_name_hash(c, hash); c = (unsigned char)name[len]; } return hashlen_create(end_name_hash(hash), len); } EXPORT_SYMBOL(hashlen_string); /* * We know there's a real path component here of at least * one character. */ static inline u64 hash_name(const void *salt, const char *name) { unsigned long hash = init_name_hash(salt); unsigned long len = 0, c; c = (unsigned char)*name; do { len++; hash = partial_name_hash(c, hash); c = (unsigned char)name[len]; } while (c && c != '/'); return hashlen_create(end_name_hash(hash), len); } #endif /* * Name resolution. * This is the basic name resolution function, turning a pathname into * the final dentry. We expect 'base' to be positive and a directory. * * Returns 0 and nd will have valid dentry and mnt on success. * Returns error and drops reference to input namei data on failure. */ static int link_path_walk(const char *name, struct nameidata *nd) { int depth = 0; // depth <= nd->depth int err; nd->last_type = LAST_ROOT; nd->flags |= LOOKUP_PARENT; if (IS_ERR(name)) return PTR_ERR(name); while (*name=='/') name++; if (!*name) { nd->dir_mode = 0; // short-circuit the 'hardening' idiocy return 0; } /* At this point we know we have a real path component. */ for(;;) { struct mnt_idmap *idmap; const char *link; u64 hash_len; int type; idmap = mnt_idmap(nd->path.mnt); err = may_lookup(idmap, nd); if (err) return err; hash_len = hash_name(nd->path.dentry, name); type = LAST_NORM; if (name[0] == '.') switch (hashlen_len(hash_len)) { case 2: if (name[1] == '.') { type = LAST_DOTDOT; nd->state |= ND_JUMPED; } break; case 1: type = LAST_DOT; } if (likely(type == LAST_NORM)) { struct dentry *parent = nd->path.dentry; nd->state &= ~ND_JUMPED; if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { struct qstr this = { { .hash_len = hash_len }, .name = name }; err = parent->d_op->d_hash(parent, &this); if (err < 0) return err; hash_len = this.hash_len; name = this.name; } } nd->last.hash_len = hash_len; nd->last.name = name; nd->last_type = type; name += hashlen_len(hash_len); if (!*name) goto OK; /* * If it wasn't NUL, we know it was '/'. Skip that * slash, and continue until no more slashes. */ do { name++; } while (unlikely(*name == '/')); if (unlikely(!*name)) { OK: /* pathname or trailing symlink, done */ if (!depth) { nd->dir_vfsuid = i_uid_into_vfsuid(idmap, nd->inode); nd->dir_mode = nd->inode->i_mode; nd->flags &= ~LOOKUP_PARENT; return 0; } /* last component of nested symlink */ name = nd->stack[--depth].name; link = walk_component(nd, 0); } else { /* not the last component */ link = walk_component(nd, WALK_MORE); } if (unlikely(link)) { if (IS_ERR(link)) return PTR_ERR(link); /* a symlink to follow */ nd->stack[depth++].name = name; name = link; continue; } if (unlikely(!d_can_lookup(nd->path.dentry))) { if (nd->flags & LOOKUP_RCU) { if (!try_to_unlazy(nd)) return -ECHILD; } return -ENOTDIR; } } } /* must be paired with terminate_walk() */ static const char *path_init(struct nameidata *nd, unsigned flags) { int error; const char *s = nd->name->name; /* LOOKUP_CACHED requires RCU, ask caller to retry */ if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED) return ERR_PTR(-EAGAIN); if (!*s) flags &= ~LOOKUP_RCU; if (flags & LOOKUP_RCU) rcu_read_lock(); else nd->seq = nd->next_seq = 0; nd->flags = flags; nd->state |= ND_JUMPED; nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount); nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount); smp_rmb(); if (nd->state & ND_ROOT_PRESET) { struct dentry *root = nd->root.dentry; struct inode *inode = root->d_inode; if (*s && unlikely(!d_can_lookup(root))) return ERR_PTR(-ENOTDIR); nd->path = nd->root; nd->inode = inode; if (flags & LOOKUP_RCU) { nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); nd->root_seq = nd->seq; } else { path_get(&nd->path); } return s; } nd->root.mnt = NULL; /* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */ if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) { error = nd_jump_root(nd); if (unlikely(error)) return ERR_PTR(error); return s; } /* Relative pathname -- get the starting-point it is relative to. */ if (nd->dfd == AT_FDCWD) { if (flags & LOOKUP_RCU) { struct fs_struct *fs = current->fs; unsigned seq; do { seq = read_seqcount_begin(&fs->seq); nd->path = fs->pwd; nd->inode = nd->path.dentry->d_inode; nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); } while (read_seqcount_retry(&fs->seq, seq)); } else { get_fs_pwd(current->fs, &nd->path); nd->inode = nd->path.dentry->d_inode; } } else { /* Caller must check execute permissions on the starting path component */ struct fd f = fdget_raw(nd->dfd); struct dentry *dentry; if (!f.file) return ERR_PTR(-EBADF); dentry = f.file->f_path.dentry; if (*s && unlikely(!d_can_lookup(dentry))) { fdput(f); return ERR_PTR(-ENOTDIR); } nd->path = f.file->f_path; if (flags & LOOKUP_RCU) { nd->inode = nd->path.dentry->d_inode; nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); } else { path_get(&nd->path); nd->inode = nd->path.dentry->d_inode; } fdput(f); } /* For scoped-lookups we need to set the root to the dirfd as well. */ if (flags & LOOKUP_IS_SCOPED) { nd->root = nd->path; if (flags & LOOKUP_RCU) { nd->root_seq = nd->seq; } else { path_get(&nd->root); nd->state |= ND_ROOT_GRABBED; } } return s; } static inline const char *lookup_last(struct nameidata *nd) { if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; return walk_component(nd, WALK_TRAILING); } static int handle_lookup_down(struct nameidata *nd) { if (!(nd->flags & LOOKUP_RCU)) dget(nd->path.dentry); nd->next_seq = nd->seq; return PTR_ERR(step_into(nd, WALK_NOFOLLOW, nd->path.dentry)); } /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path) { const char *s = path_init(nd, flags); int err; if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(s)) { err = handle_lookup_down(nd); if (unlikely(err < 0)) s = ERR_PTR(err); } while (!(err = link_path_walk(s, nd)) && (s = lookup_last(nd)) != NULL) ; if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) { err = handle_lookup_down(nd); nd->state &= ~ND_JUMPED; // no d_weak_revalidate(), please... } if (!err) err = complete_walk(nd); if (!err && nd->flags & LOOKUP_DIRECTORY) if (!d_can_lookup(nd->path.dentry)) err = -ENOTDIR; if (!err) { *path = nd->path; nd->path.mnt = NULL; nd->path.dentry = NULL; } terminate_walk(nd); return err; } int filename_lookup(int dfd, struct filename *name, unsigned flags, struct path *path, struct path *root) { int retval; struct nameidata nd; if (IS_ERR(name)) return PTR_ERR(name); set_nameidata(&nd, dfd, name, root); retval = path_lookupat(&nd, flags | LOOKUP_RCU, path); if (unlikely(retval == -ECHILD)) retval = path_lookupat(&nd, flags, path); if (unlikely(retval == -ESTALE)) retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path); if (likely(!retval)) audit_inode(name, path->dentry, flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : 0); restore_nameidata(); return retval; } /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ static int path_parentat(struct nameidata *nd, unsigned flags, struct path *parent) { const char *s = path_init(nd, flags); int err = link_path_walk(s, nd); if (!err) err = complete_walk(nd); if (!err) { *parent = nd->path; nd->path.mnt = NULL; nd->path.dentry = NULL; } terminate_walk(nd); return err; } /* Note: this does not consume "name" */ static int __filename_parentat(int dfd, struct filename *name, unsigned int flags, struct path *parent, struct qstr *last, int *type, const struct path *root) { int retval; struct nameidata nd; if (IS_ERR(name)) return PTR_ERR(name); set_nameidata(&nd, dfd, name, root); retval = path_parentat(&nd, flags | LOOKUP_RCU, parent); if (unlikely(retval == -ECHILD)) retval = path_parentat(&nd, flags, parent); if (unlikely(retval == -ESTALE)) retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent); if (likely(!retval)) { *last = nd.last; *type = nd.last_type; audit_inode(name, parent->dentry, AUDIT_INODE_PARENT); } restore_nameidata(); return retval; } static int filename_parentat(int dfd, struct filename *name, unsigned int flags, struct path *parent, struct qstr *last, int *type) { return __filename_parentat(dfd, name, flags, parent, last, type, NULL); } /* does lookup, returns the object with parent locked */ static struct dentry *__kern_path_locked(struct filename *name, struct path *path) { struct dentry *d; struct qstr last; int type, error; error = filename_parentat(AT_FDCWD, name, 0, path, &last, &type); if (error) return ERR_PTR(error); if (unlikely(type != LAST_NORM)) { path_put(path); return ERR_PTR(-EINVAL); } inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); d = lookup_one_qstr_excl(&last, path->dentry, 0); if (IS_ERR(d)) { inode_unlock(path->dentry->d_inode); path_put(path); } return d; } struct dentry *kern_path_locked(const char *name, struct path *path) { struct filename *filename = getname_kernel(name); struct dentry *res = __kern_path_locked(filename, path); putname(filename); return res; } int kern_path(const char *name, unsigned int flags, struct path *path) { struct filename *filename = getname_kernel(name); int ret = filename_lookup(AT_FDCWD, filename, flags, path, NULL); putname(filename); return ret; } EXPORT_SYMBOL(kern_path); /** * vfs_path_parent_lookup - lookup a parent path relative to a dentry-vfsmount pair * @filename: filename structure * @flags: lookup flags * @parent: pointer to struct path to fill * @last: last component * @type: type of the last component * @root: pointer to struct path of the base directory */ int vfs_path_parent_lookup(struct filename *filename, unsigned int flags, struct path *parent, struct qstr *last, int *type, const struct path *root) { return __filename_parentat(AT_FDCWD, filename, flags, parent, last, type, root); } EXPORT_SYMBOL(vfs_path_parent_lookup); /** * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair * @dentry: pointer to dentry of the base directory * @mnt: pointer to vfs mount of the base directory * @name: pointer to file name * @flags: lookup flags * @path: pointer to struct path to fill */ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, const char *name, unsigned int flags, struct path *path) { struct filename *filename; struct path root = {.mnt = mnt, .dentry = dentry}; int ret; filename = getname_kernel(name); /* the first argument of filename_lookup() is ignored with root */ ret = filename_lookup(AT_FDCWD, filename, flags, path, &root); putname(filename); return ret; } EXPORT_SYMBOL(vfs_path_lookup); static int lookup_one_common(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len, struct qstr *this) { this->name = name; this->len = len; this->hash = full_name_hash(base, name, len); if (!len) return -EACCES; if (unlikely(name[0] == '.')) { if (len < 2 || (len == 2 && name[1] == '.')) return -EACCES; } while (len--) { unsigned int c = *(const unsigned char *)name++; if (c == '/' || c == '\0') return -EACCES; } /* * See if the low-level filesystem might want * to use its own hash.. */ if (base->d_flags & DCACHE_OP_HASH) { int err = base->d_op->d_hash(base, this); if (err < 0) return err; } return inode_permission(idmap, base->d_inode, MAY_EXEC); } /** * try_lookup_one_len - filesystem helper to lookup single pathname component * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * Look up a dentry by name in the dcache, returning NULL if it does not * currently exist. The function does not try to create a dentry. * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * The caller must hold base->i_mutex. */ struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len) { struct qstr this; int err; WARN_ON_ONCE(!inode_is_locked(base->d_inode)); err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this); if (err) return ERR_PTR(err); return lookup_dcache(&this, base, 0); } EXPORT_SYMBOL(try_lookup_one_len); /** * lookup_one_len - filesystem helper to lookup single pathname component * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * The caller must hold base->i_mutex. */ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) { struct dentry *dentry; struct qstr this; int err; WARN_ON_ONCE(!inode_is_locked(base->d_inode)); err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this); if (err) return ERR_PTR(err); dentry = lookup_dcache(&this, base, 0); return dentry ? dentry : __lookup_slow(&this, base, 0); } EXPORT_SYMBOL(lookup_one_len); /** * lookup_one - filesystem helper to lookup single pathname component * @idmap: idmap of the mount the lookup is performed from * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * The caller must hold base->i_mutex. */ struct dentry *lookup_one(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len) { struct dentry *dentry; struct qstr this; int err; WARN_ON_ONCE(!inode_is_locked(base->d_inode)); err = lookup_one_common(idmap, name, base, len, &this); if (err) return ERR_PTR(err); dentry = lookup_dcache(&this, base, 0); return dentry ? dentry : __lookup_slow(&this, base, 0); } EXPORT_SYMBOL(lookup_one); /** * lookup_one_unlocked - filesystem helper to lookup single pathname component * @idmap: idmap of the mount the lookup is performed from * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * Unlike lookup_one_len, it should be called without the parent * i_mutex held, and will take the i_mutex itself if necessary. */ struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len) { struct qstr this; int err; struct dentry *ret; err = lookup_one_common(idmap, name, base, len, &this); if (err) return ERR_PTR(err); ret = lookup_dcache(&this, base, 0); if (!ret) ret = lookup_slow(&this, base, 0); return ret; } EXPORT_SYMBOL(lookup_one_unlocked); /** * lookup_one_positive_unlocked - filesystem helper to lookup single * pathname component * @idmap: idmap of the mount the lookup is performed from * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns * known positive or ERR_PTR(). This is what most of the users want. * * Note that pinned negative with unlocked parent _can_ become positive at any * time, so callers of lookup_one_unlocked() need to be very careful; pinned * positives have >d_inode stable, so this one avoids such problems. * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * The helper should be called without i_mutex held. */ struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len) { struct dentry *ret = lookup_one_unlocked(idmap, name, base, len); if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { dput(ret); ret = ERR_PTR(-ENOENT); } return ret; } EXPORT_SYMBOL(lookup_one_positive_unlocked); /** * lookup_one_len_unlocked - filesystem helper to lookup single pathname component * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * Unlike lookup_one_len, it should be called without the parent * i_mutex held, and will take the i_mutex itself if necessary. */ struct dentry *lookup_one_len_unlocked(const char *name, struct dentry *base, int len) { return lookup_one_unlocked(&nop_mnt_idmap, name, base, len); } EXPORT_SYMBOL(lookup_one_len_unlocked); /* * Like lookup_one_len_unlocked(), except that it yields ERR_PTR(-ENOENT) * on negatives. Returns known positive or ERR_PTR(); that's what * most of the users want. Note that pinned negative with unlocked parent * _can_ become positive at any time, so callers of lookup_one_len_unlocked() * need to be very careful; pinned positives have ->d_inode stable, so * this one avoids such problems. */ struct dentry *lookup_positive_unlocked(const char *name, struct dentry *base, int len) { return lookup_one_positive_unlocked(&nop_mnt_idmap, name, base, len); } EXPORT_SYMBOL(lookup_positive_unlocked); #ifdef CONFIG_UNIX98_PTYS int path_pts(struct path *path) { /* Find something mounted on "pts" in the same directory as * the input path. */ struct dentry *parent = dget_parent(path->dentry); struct dentry *child; struct qstr this = QSTR_INIT("pts", 3); if (unlikely(!path_connected(path->mnt, parent))) { dput(parent); return -ENOENT; } dput(path->dentry); path->dentry = parent; child = d_hash_and_lookup(parent, &this); if (IS_ERR_OR_NULL(child)) return -ENOENT; path->dentry = child; dput(parent); follow_down(path, 0); return 0; } #endif int user_path_at_empty(int dfd, const char __user *name, unsigned flags, struct path *path, int *empty) { struct filename *filename = getname_flags(name, flags, empty); int ret = filename_lookup(dfd, filename, flags, path, NULL); putname(filename); return ret; } EXPORT_SYMBOL(user_path_at_empty); int __check_sticky(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode) { kuid_t fsuid = current_fsuid(); if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), fsuid)) return 0; if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, dir), fsuid)) return 0; return !capable_wrt_inode_uidgid(idmap, inode, CAP_FOWNER); } EXPORT_SYMBOL(__check_sticky); /* * Check whether we can remove a link victim from directory dir, check * whether the type of victim is right. * 1. We can't do it if dir is read-only (done in permission()) * 2. We should have write and exec permissions on dir * 3. We can't remove anything from append-only dir * 4. We can't do anything with immutable dir (done in permission()) * 5. If the sticky bit on dir is set we should either * a. be owner of dir, or * b. be owner of victim, or * c. have CAP_FOWNER capability * 6. If the victim is append-only or immutable we can't do antyhing with * links pointing to it. * 7. If the victim has an unknown uid or gid we can't change the inode. * 8. If we were asked to remove a directory and victim isn't one - ENOTDIR. * 9. If we were asked to remove a non-directory and victim isn't one - EISDIR. * 10. We can't remove a root or mountpoint. * 11. We don't allow removal of NFS sillyrenamed files; it's handled by * nfs_async_unlink(). */ static int may_delete(struct mnt_idmap *idmap, struct inode *dir, struct dentry *victim, bool isdir) { struct inode *inode = d_backing_inode(victim); int error; if (d_is_negative(victim)) return -ENOENT; BUG_ON(!inode); BUG_ON(victim->d_parent->d_inode != dir); /* Inode writeback is not safe when the uid or gid are invalid. */ if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) || !vfsgid_valid(i_gid_into_vfsgid(idmap, inode))) return -EOVERFLOW; audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (IS_APPEND(dir)) return -EPERM; if (check_sticky(idmap, dir, inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || HAS_UNMAPPED_ID(idmap, inode)) return -EPERM; if (isdir) { if (!d_is_dir(victim)) return -ENOTDIR; if (IS_ROOT(victim)) return -EBUSY; } else if (d_is_dir(victim)) return -EISDIR; if (IS_DEADDIR(dir)) return -ENOENT; if (victim->d_flags & DCACHE_NFSFS_RENAMED) return -EBUSY; return 0; } /* Check whether we can create an object with dentry child in directory * dir. * 1. We can't do it if child already exists (open has special treatment for * this case, but since we are inlined it's OK) * 2. We can't do it if dir is read-only (done in permission()) * 3. We can't do it if the fs can't represent the fsuid or fsgid. * 4. We should have write and exec permissions on dir * 5. We can't do it if dir is immutable (done in permission()) */ static inline int may_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *child) { audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE); if (child->d_inode) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; if (!fsuidgid_has_mapping(dir->i_sb, idmap)) return -EOVERFLOW; return inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); } static struct dentry *lock_two_directories(struct dentry *p1, struct dentry *p2) { struct dentry *p; p = d_ancestor(p2, p1); if (p) { inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); inode_lock_nested(p1->d_inode, I_MUTEX_CHILD); return p; } p = d_ancestor(p1, p2); if (p) { inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); inode_lock_nested(p2->d_inode, I_MUTEX_CHILD); return p; } lock_two_inodes(p1->d_inode, p2->d_inode, I_MUTEX_PARENT, I_MUTEX_PARENT2); return NULL; } /* * p1 and p2 should be directories on the same fs. */ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) { if (p1 == p2) { inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); return NULL; } mutex_lock(&p1->d_sb->s_vfs_rename_mutex); return lock_two_directories(p1, p2); } EXPORT_SYMBOL(lock_rename); /* * c1 and p2 should be on the same fs. */ struct dentry *lock_rename_child(struct dentry *c1, struct dentry *p2) { if (READ_ONCE(c1->d_parent) == p2) { /* * hopefully won't need to touch ->s_vfs_rename_mutex at all. */ inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); /* * now that p2 is locked, nobody can move in or out of it, * so the test below is safe. */ if (likely(c1->d_parent == p2)) return NULL; /* * c1 got moved out of p2 while we'd been taking locks; * unlock and fall back to slow case. */ inode_unlock(p2->d_inode); } mutex_lock(&c1->d_sb->s_vfs_rename_mutex); /* * nobody can move out of any directories on this fs. */ if (likely(c1->d_parent != p2)) return lock_two_directories(c1->d_parent, p2); /* * c1 got moved into p2 while we were taking locks; * we need p2 locked and ->s_vfs_rename_mutex unlocked, * for consistency with lock_rename(). */ inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); mutex_unlock(&c1->d_sb->s_vfs_rename_mutex); return NULL; } EXPORT_SYMBOL(lock_rename_child); void unlock_rename(struct dentry *p1, struct dentry *p2) { inode_unlock(p1->d_inode); if (p1 != p2) { inode_unlock(p2->d_inode); mutex_unlock(&p1->d_sb->s_vfs_rename_mutex); } } EXPORT_SYMBOL(unlock_rename); /** * mode_strip_umask - handle vfs umask stripping * @dir: parent directory of the new inode * @mode: mode of the new inode to be created in @dir * * Umask stripping depends on whether or not the filesystem supports POSIX * ACLs. If the filesystem doesn't support it umask stripping is done directly * in here. If the filesystem does support POSIX ACLs umask stripping is * deferred until the filesystem calls posix_acl_create(). * * Returns: mode */ static inline umode_t mode_strip_umask(const struct inode *dir, umode_t mode) { if (!IS_POSIXACL(dir)) mode &= ~current_umask(); return mode; } /** * vfs_prepare_mode - prepare the mode to be used for a new inode * @idmap: idmap of the mount the inode was found from * @dir: parent directory of the new inode * @mode: mode of the new inode * @mask_perms: allowed permission by the vfs * @type: type of file to be created * * This helper consolidates and enforces vfs restrictions on the @mode of a new * object to be created. * * Umask stripping depends on whether the filesystem supports POSIX ACLs (see * the kernel documentation for mode_strip_umask()). Moving umask stripping * after setgid stripping allows the same ordering for both non-POSIX ACL and * POSIX ACL supporting filesystems. * * Note that it's currently valid for @type to be 0 if a directory is created. * Filesystems raise that flag individually and we need to check whether each * filesystem can deal with receiving S_IFDIR from the vfs before we enforce a * non-zero type. * * Returns: mode to be passed to the filesystem */ static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap, const struct inode *dir, umode_t mode, umode_t mask_perms, umode_t type) { mode = mode_strip_sgid(idmap, dir, mode); mode = mode_strip_umask(dir, mode); /* * Apply the vfs mandated allowed permission mask and set the type of * file to be created before we call into the filesystem. */ mode &= (mask_perms & ~S_IFMT); mode |= (type & S_IFMT); return mode; } /** * vfs_create - create new file * @idmap: idmap of the mount the inode was found from * @dir: inode of @dentry * @dentry: pointer to dentry of the base directory * @mode: mode of the new file * @want_excl: whether the file must not yet exist * * Create a new file. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply passs @nop_mnt_idmap. */ int vfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool want_excl) { int error; error = may_create(idmap, dir, dentry); if (error) return error; if (!dir->i_op->create) return -EACCES; /* shouldn't it be ENOSYS? */ mode = vfs_prepare_mode(idmap, dir, mode, S_IALLUGO, S_IFREG); error = security_inode_create(dir, dentry, mode); if (error) return error; error = dir->i_op->create(idmap, dir, dentry, mode, want_excl); if (!error) fsnotify_create(dir, dentry); return error; } EXPORT_SYMBOL(vfs_create); int vfs_mkobj(struct dentry *dentry, umode_t mode, int (*f)(struct dentry *, umode_t, void *), void *arg) { struct inode *dir = dentry->d_parent->d_inode; int error = may_create(&nop_mnt_idmap, dir, dentry); if (error) return error; mode &= S_IALLUGO; mode |= S_IFREG; error = security_inode_create(dir, dentry, mode); if (error) return error; error = f(dentry, mode, arg); if (!error) fsnotify_create(dir, dentry); return error; } EXPORT_SYMBOL(vfs_mkobj); bool may_open_dev(const struct path *path) { return !(path->mnt->mnt_flags & MNT_NODEV) && !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV); } static int may_open(struct mnt_idmap *idmap, const struct path *path, int acc_mode, int flag) { struct dentry *dentry = path->dentry; struct inode *inode = dentry->d_inode; int error; if (!inode) return -ENOENT; switch (inode->i_mode & S_IFMT) { case S_IFLNK: return -ELOOP; case S_IFDIR: if (acc_mode & MAY_WRITE) return -EISDIR; if (acc_mode & MAY_EXEC) return -EACCES; break; case S_IFBLK: case S_IFCHR: if (!may_open_dev(path)) return -EACCES; fallthrough; case S_IFIFO: case S_IFSOCK: if (acc_mode & MAY_EXEC) return -EACCES; flag &= ~O_TRUNC; break; case S_IFREG: if ((acc_mode & MAY_EXEC) && path_noexec(path)) return -EACCES; break; } error = inode_permission(idmap, inode, MAY_OPEN | acc_mode); if (error) return error; /* * An append-only file must be opened in append mode for writing. */ if (IS_APPEND(inode)) { if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND)) return -EPERM; if (flag & O_TRUNC) return -EPERM; } /* O_NOATIME can only be set by the owner or superuser */ if (flag & O_NOATIME && !inode_owner_or_capable(idmap, inode)) return -EPERM; return 0; } static int handle_truncate(struct mnt_idmap *idmap, struct file *filp) { const struct path *path = &filp->f_path; struct inode *inode = path->dentry->d_inode; int error = get_write_access(inode); if (error) return error; error = security_file_truncate(filp); if (!error) { error = do_truncate(idmap, path->dentry, 0, ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, filp); } put_write_access(inode); return error; } static inline int open_to_namei_flags(int flag) { if ((flag & O_ACCMODE) == 3) flag--; return flag; } static int may_o_create(struct mnt_idmap *idmap, const struct path *dir, struct dentry *dentry, umode_t mode) { int error = security_path_mknod(dir, dentry, mode, 0); if (error) return error; if (!fsuidgid_has_mapping(dir->dentry->d_sb, idmap)) return -EOVERFLOW; error = inode_permission(idmap, dir->dentry->d_inode, MAY_WRITE | MAY_EXEC); if (error) return error; return security_inode_create(dir->dentry->d_inode, dentry, mode); } /* * Attempt to atomically look up, create and open a file from a negative * dentry. * * Returns 0 if successful. The file will have been created and attached to * @file by the filesystem calling finish_open(). * * If the file was looked up only or didn't need creating, FMODE_OPENED won't * be set. The caller will need to perform the open themselves. @path will * have been updated to point to the new dentry. This may be negative. * * Returns an error code otherwise. */ static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry, struct file *file, int open_flag, umode_t mode) { struct dentry *const DENTRY_NOT_SET = (void *) -1UL; struct inode *dir = nd->path.dentry->d_inode; int error; if (nd->flags & LOOKUP_DIRECTORY) open_flag |= O_DIRECTORY; file->f_path.dentry = DENTRY_NOT_SET; file->f_path.mnt = nd->path.mnt; error = dir->i_op->atomic_open(dir, dentry, file, open_to_namei_flags(open_flag), mode); d_lookup_done(dentry); if (!error) { if (file->f_mode & FMODE_OPENED) { if (unlikely(dentry != file->f_path.dentry)) { dput(dentry); dentry = dget(file->f_path.dentry); } } else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) { error = -EIO; } else { if (file->f_path.dentry) { dput(dentry); dentry = file->f_path.dentry; } if (unlikely(d_is_negative(dentry))) error = -ENOENT; } } if (error) { dput(dentry); dentry = ERR_PTR(error); } return dentry; } /* * Look up and maybe create and open the last component. * * Must be called with parent locked (exclusive in O_CREAT case). * * Returns 0 on success, that is, if * the file was successfully atomically created (if necessary) and opened, or * the file was not completely opened at this time, though lookups and * creations were performed. * These case are distinguished by presence of FMODE_OPENED on file->f_mode. * In the latter case dentry returned in @path might be negative if O_CREAT * hadn't been specified. * * An error code is returned on failure. */ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, const struct open_flags *op, bool got_write) { struct mnt_idmap *idmap; struct dentry *dir = nd->path.dentry; struct inode *dir_inode = dir->d_inode; int open_flag = op->open_flag; struct dentry *dentry; int error, create_error = 0; umode_t mode = op->mode; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); if (unlikely(IS_DEADDIR(dir_inode))) return ERR_PTR(-ENOENT); file->f_mode &= ~FMODE_CREATED; dentry = d_lookup(dir, &nd->last); for (;;) { if (!dentry) { dentry = d_alloc_parallel(dir, &nd->last, &wq); if (IS_ERR(dentry)) return dentry; } if (d_in_lookup(dentry)) break; error = d_revalidate(dentry, nd->flags); if (likely(error > 0)) break; if (error) goto out_dput; d_invalidate(dentry); dput(dentry); dentry = NULL; } if (dentry->d_inode) { /* Cached positive dentry: will open in f_op->open */ return dentry; } /* * Checking write permission is tricky, bacuse we don't know if we are * going to actually need it: O_CREAT opens should work as long as the * file exists. But checking existence breaks atomicity. The trick is * to check access and if not granted clear O_CREAT from the flags. * * Another problem is returing the "right" error value (e.g. for an * O_EXCL open we want to return EEXIST not EROFS). */ if (unlikely(!got_write)) open_flag &= ~O_TRUNC; idmap = mnt_idmap(nd->path.mnt); if (open_flag & O_CREAT) { if (open_flag & O_EXCL) open_flag &= ~O_TRUNC; mode = vfs_prepare_mode(idmap, dir->d_inode, mode, mode, mode); if (likely(got_write)) create_error = may_o_create(idmap, &nd->path, dentry, mode); else create_error = -EROFS; } if (create_error) open_flag &= ~O_CREAT; if (dir_inode->i_op->atomic_open) { dentry = atomic_open(nd, dentry, file, open_flag, mode); if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT)) dentry = ERR_PTR(create_error); return dentry; } if (d_in_lookup(dentry)) { struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry, nd->flags); d_lookup_done(dentry); if (unlikely(res)) { if (IS_ERR(res)) { error = PTR_ERR(res); goto out_dput; } dput(dentry); dentry = res; } } /* Negative dentry, just create the file */ if (!dentry->d_inode && (open_flag & O_CREAT)) { file->f_mode |= FMODE_CREATED; audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE); if (!dir_inode->i_op->create) { error = -EACCES; goto out_dput; } error = dir_inode->i_op->create(idmap, dir_inode, dentry, mode, open_flag & O_EXCL); if (error) goto out_dput; } if (unlikely(create_error) && !dentry->d_inode) { error = create_error; goto out_dput; } return dentry; out_dput: dput(dentry); return ERR_PTR(error); } static const char *open_last_lookups(struct nameidata *nd, struct file *file, const struct open_flags *op) { struct dentry *dir = nd->path.dentry; int open_flag = op->open_flag; bool got_write = false; struct dentry *dentry; const char *res; nd->flags |= op->intent; if (nd->last_type != LAST_NORM) { if (nd->depth) put_link(nd); return handle_dots(nd, nd->last_type); } if (!(open_flag & O_CREAT)) { if (nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; /* we _can_ be in RCU mode here */ dentry = lookup_fast(nd); if (IS_ERR(dentry)) return ERR_CAST(dentry); if (likely(dentry)) goto finish_lookup; BUG_ON(nd->flags & LOOKUP_RCU); } else { /* create side of things */ if (nd->flags & LOOKUP_RCU) { if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); } audit_inode(nd->name, dir, AUDIT_INODE_PARENT); /* trailing slashes? */ if (unlikely(nd->last.name[nd->last.len])) return ERR_PTR(-EISDIR); } if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { got_write = !mnt_want_write(nd->path.mnt); /* * do _not_ fail yet - we might not need that or fail with * a different error; let lookup_open() decide; we'll be * dropping this one anyway. */ } if (open_flag & O_CREAT) inode_lock(dir->d_inode); else inode_lock_shared(dir->d_inode); dentry = lookup_open(nd, file, op, got_write); if (!IS_ERR(dentry) && (file->f_mode & FMODE_CREATED)) fsnotify_create(dir->d_inode, dentry); if (open_flag & O_CREAT) inode_unlock(dir->d_inode); else inode_unlock_shared(dir->d_inode); if (got_write) mnt_drop_write(nd->path.mnt); if (IS_ERR(dentry)) return ERR_CAST(dentry); if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) { dput(nd->path.dentry); nd->path.dentry = dentry; return NULL; } finish_lookup: if (nd->depth) put_link(nd); res = step_into(nd, WALK_TRAILING, dentry); if (unlikely(res)) nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); return res; } /* * Handle the last step of open() */ static int do_open(struct nameidata *nd, struct file *file, const struct open_flags *op) { struct mnt_idmap *idmap; int open_flag = op->open_flag; bool do_truncate; int acc_mode; int error; if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) { error = complete_walk(nd); if (error) return error; } if (!(file->f_mode & FMODE_CREATED)) audit_inode(nd->name, nd->path.dentry, 0); idmap = mnt_idmap(nd->path.mnt); if (open_flag & O_CREAT) { if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED)) return -EEXIST; if (d_is_dir(nd->path.dentry)) return -EISDIR; error = may_create_in_sticky(idmap, nd, d_backing_inode(nd->path.dentry)); if (unlikely(error)) return error; } if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry)) return -ENOTDIR; do_truncate = false; acc_mode = op->acc_mode; if (file->f_mode & FMODE_CREATED) { /* Don't check for write permission, don't truncate */ open_flag &= ~O_TRUNC; acc_mode = 0; } else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) { error = mnt_want_write(nd->path.mnt); if (error) return error; do_truncate = true; } error = may_open(idmap, &nd->path, acc_mode, open_flag); if (!error && !(file->f_mode & FMODE_OPENED)) error = vfs_open(&nd->path, file); if (!error) error = ima_file_check(file, op->acc_mode); if (!error && do_truncate) error = handle_truncate(idmap, file); if (unlikely(error > 0)) { WARN_ON(1); error = -EINVAL; } if (do_truncate) mnt_drop_write(nd->path.mnt); return error; } /** * vfs_tmpfile - create tmpfile * @idmap: idmap of the mount the inode was found from * @parentpath: pointer to the path of the base directory * @file: file descriptor of the new tmpfile * @mode: mode of the new tmpfile * * Create a temporary file. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply passs @nop_mnt_idmap. */ static int vfs_tmpfile(struct mnt_idmap *idmap, const struct path *parentpath, struct file *file, umode_t mode) { struct dentry *child; struct inode *dir = d_inode(parentpath->dentry); struct inode *inode; int error; int open_flag = file->f_flags; /* we want directory to be writable */ error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (!dir->i_op->tmpfile) return -EOPNOTSUPP; child = d_alloc(parentpath->dentry, &slash_name); if (unlikely(!child)) return -ENOMEM; file->f_path.mnt = parentpath->mnt; file->f_path.dentry = child; mode = vfs_prepare_mode(idmap, dir, mode, mode, mode); error = dir->i_op->tmpfile(idmap, dir, file, mode); dput(child); if (error) return error; /* Don't check for other permissions, the inode was just created */ error = may_open(idmap, &file->f_path, 0, file->f_flags); if (error) return error; inode = file_inode(file); if (!(open_flag & O_EXCL)) { spin_lock(&inode->i_lock); inode->i_state |= I_LINKABLE; spin_unlock(&inode->i_lock); } ima_post_create_tmpfile(idmap, inode); return 0; } /** * kernel_tmpfile_open - open a tmpfile for kernel internal use * @idmap: idmap of the mount the inode was found from * @parentpath: path of the base directory * @mode: mode of the new tmpfile * @open_flag: flags * @cred: credentials for open * * Create and open a temporary file. The file is not accounted in nr_files, * hence this is only for kernel internal use, and must not be installed into * file tables or such. */ struct file *kernel_tmpfile_open(struct mnt_idmap *idmap, const struct path *parentpath, umode_t mode, int open_flag, const struct cred *cred) { struct file *file; int error; file = alloc_empty_file_noaccount(open_flag, cred); if (IS_ERR(file)) return file; error = vfs_tmpfile(idmap, parentpath, file, mode); if (error) { fput(file); file = ERR_PTR(error); } return file; } EXPORT_SYMBOL(kernel_tmpfile_open); static int do_tmpfile(struct nameidata *nd, unsigned flags, const struct open_flags *op, struct file *file) { struct path path; int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path); if (unlikely(error)) return error; error = mnt_want_write(path.mnt); if (unlikely(error)) goto out; error = vfs_tmpfile(mnt_idmap(path.mnt), &path, file, op->mode); if (error) goto out2; audit_inode(nd->name, file->f_path.dentry, 0); out2: mnt_drop_write(path.mnt); out: path_put(&path); return error; } static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file) { struct path path; int error = path_lookupat(nd, flags, &path); if (!error) { audit_inode(nd->name, path.dentry, 0); error = vfs_open(&path, file); path_put(&path); } return error; } static struct file *path_openat(struct nameidata *nd, const struct open_flags *op, unsigned flags) { struct file *file; int error; file = alloc_empty_file(op->open_flag, current_cred()); if (IS_ERR(file)) return file; if (unlikely(file->f_flags & __O_TMPFILE)) { error = do_tmpfile(nd, flags, op, file); } else if (unlikely(file->f_flags & O_PATH)) { error = do_o_path(nd, flags, file); } else { const char *s = path_init(nd, flags); while (!(error = link_path_walk(s, nd)) && (s = open_last_lookups(nd, file, op)) != NULL) ; if (!error) error = do_open(nd, file, op); terminate_walk(nd); } if (likely(!error)) { if (likely(file->f_mode & FMODE_OPENED)) return file; WARN_ON(1); error = -EINVAL; } fput(file); if (error == -EOPENSTALE) { if (flags & LOOKUP_RCU) error = -ECHILD; else error = -ESTALE; } return ERR_PTR(error); } struct file *do_filp_open(int dfd, struct filename *pathname, const struct open_flags *op) { struct nameidata nd; int flags = op->lookup_flags; struct file *filp; set_nameidata(&nd, dfd, pathname, NULL); filp = path_openat(&nd, op, flags | LOOKUP_RCU); if (unlikely(filp == ERR_PTR(-ECHILD))) filp = path_openat(&nd, op, flags); if (unlikely(filp == ERR_PTR(-ESTALE))) filp = path_openat(&nd, op, flags | LOOKUP_REVAL); restore_nameidata(); return filp; } struct file *do_file_open_root(const struct path *root, const char *name, const struct open_flags *op) { struct nameidata nd; struct file *file; struct filename *filename; int flags = op->lookup_flags; if (d_is_symlink(root->dentry) && op->intent & LOOKUP_OPEN) return ERR_PTR(-ELOOP); filename = getname_kernel(name); if (IS_ERR(filename)) return ERR_CAST(filename); set_nameidata(&nd, -1, filename, root); file = path_openat(&nd, op, flags | LOOKUP_RCU); if (unlikely(file == ERR_PTR(-ECHILD))) file = path_openat(&nd, op, flags); if (unlikely(file == ERR_PTR(-ESTALE))) file = path_openat(&nd, op, flags | LOOKUP_REVAL); restore_nameidata(); putname(filename); return file; } static struct dentry *filename_create(int dfd, struct filename *name, struct path *path, unsigned int lookup_flags) { struct dentry *dentry = ERR_PTR(-EEXIST); struct qstr last; bool want_dir = lookup_flags & LOOKUP_DIRECTORY; unsigned int reval_flag = lookup_flags & LOOKUP_REVAL; unsigned int create_flags = LOOKUP_CREATE | LOOKUP_EXCL; int type; int err2; int error; error = filename_parentat(dfd, name, reval_flag, path, &last, &type); if (error) return ERR_PTR(error); /* * Yucky last component or no last component at all? * (foo/., foo/.., /////) */ if (unlikely(type != LAST_NORM)) goto out; /* don't fail immediately if it's r/o, at least try to report other errors */ err2 = mnt_want_write(path->mnt); /* * Do the final lookup. Suppress 'create' if there is a trailing * '/', and a directory wasn't requested. */ if (last.name[last.len] && !want_dir) create_flags = 0; inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); dentry = lookup_one_qstr_excl(&last, path->dentry, reval_flag | create_flags); if (IS_ERR(dentry)) goto unlock; error = -EEXIST; if (d_is_positive(dentry)) goto fail; /* * Special case - lookup gave negative, but... we had foo/bar/ * From the vfs_mknod() POV we just have a negative dentry - * all is fine. Let's be bastards - you had / on the end, you've * been asking for (non-existent) directory. -ENOENT for you. */ if (unlikely(!create_flags)) { error = -ENOENT; goto fail; } if (unlikely(err2)) { error = err2; goto fail; } return dentry; fail: dput(dentry); dentry = ERR_PTR(error); unlock: inode_unlock(path->dentry->d_inode); if (!err2) mnt_drop_write(path->mnt); out: path_put(path); return dentry; } struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path, unsigned int lookup_flags) { struct filename *filename = getname_kernel(pathname); struct dentry *res = filename_create(dfd, filename, path, lookup_flags); putname(filename); return res; } EXPORT_SYMBOL(kern_path_create); void done_path_create(struct path *path, struct dentry *dentry) { dput(dentry); inode_unlock(path->dentry->d_inode); mnt_drop_write(path->mnt); path_put(path); } EXPORT_SYMBOL(done_path_create); inline struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, unsigned int lookup_flags) { struct filename *filename = getname(pathname); struct dentry *res = filename_create(dfd, filename, path, lookup_flags); putname(filename); return res; } EXPORT_SYMBOL(user_path_create); /** * vfs_mknod - create device node or file * @idmap: idmap of the mount the inode was found from * @dir: inode of @dentry * @dentry: pointer to dentry of the base directory * @mode: mode of the new device node or file * @dev: device number of device to create * * Create a device node or file. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply passs @nop_mnt_idmap. */ int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV; int error = may_create(idmap, dir, dentry); if (error) return error; if ((S_ISCHR(mode) || S_ISBLK(mode)) && !is_whiteout && !capable(CAP_MKNOD)) return -EPERM; if (!dir->i_op->mknod) return -EPERM; mode = vfs_prepare_mode(idmap, dir, mode, mode, mode); error = devcgroup_inode_mknod(mode, dev); if (error) return error; error = security_inode_mknod(dir, dentry, mode, dev); if (error) return error; error = dir->i_op->mknod(idmap, dir, dentry, mode, dev); if (!error) fsnotify_create(dir, dentry); return error; } EXPORT_SYMBOL(vfs_mknod); static int may_mknod(umode_t mode) { switch (mode & S_IFMT) { case S_IFREG: case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK: case 0: /* zero mode translates to S_IFREG */ return 0; case S_IFDIR: return -EPERM; default: return -EINVAL; } } static int do_mknodat(int dfd, struct filename *name, umode_t mode, unsigned int dev) { struct mnt_idmap *idmap; struct dentry *dentry; struct path path; int error; unsigned int lookup_flags = 0; error = may_mknod(mode); if (error) goto out1; retry: dentry = filename_create(dfd, name, &path, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out1; error = security_path_mknod(&path, dentry, mode_strip_umask(path.dentry->d_inode, mode), dev); if (error) goto out2; idmap = mnt_idmap(path.mnt); switch (mode & S_IFMT) { case 0: case S_IFREG: error = vfs_create(idmap, path.dentry->d_inode, dentry, mode, true); if (!error) ima_post_path_mknod(idmap, dentry); break; case S_IFCHR: case S_IFBLK: error = vfs_mknod(idmap, path.dentry->d_inode, dentry, mode, new_decode_dev(dev)); break; case S_IFIFO: case S_IFSOCK: error = vfs_mknod(idmap, path.dentry->d_inode, dentry, mode, 0); break; } out2: done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out1: putname(name); return error; } SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, unsigned int, dev) { return do_mknodat(dfd, getname(filename), mode, dev); } SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev) { return do_mknodat(AT_FDCWD, getname(filename), mode, dev); } /** * vfs_mkdir - create directory * @idmap: idmap of the mount the inode was found from * @dir: inode of @dentry * @dentry: pointer to dentry of the base directory * @mode: mode of the new directory * * Create a directory. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply passs @nop_mnt_idmap. */ int vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { int error; unsigned max_links = dir->i_sb->s_max_links; error = may_create(idmap, dir, dentry); if (error) return error; if (!dir->i_op->mkdir) return -EPERM; mode = vfs_prepare_mode(idmap, dir, mode, S_IRWXUGO | S_ISVTX, 0); error = security_inode_mkdir(dir, dentry, mode); if (error) return error; if (max_links && dir->i_nlink >= max_links) return -EMLINK; error = dir->i_op->mkdir(idmap, dir, dentry, mode); if (!error) fsnotify_mkdir(dir, dentry); return error; } EXPORT_SYMBOL(vfs_mkdir); int do_mkdirat(int dfd, struct filename *name, umode_t mode) { struct dentry *dentry; struct path path; int error; unsigned int lookup_flags = LOOKUP_DIRECTORY; retry: dentry = filename_create(dfd, name, &path, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_putname; error = security_path_mkdir(&path, dentry, mode_strip_umask(path.dentry->d_inode, mode)); if (!error) { error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, mode); } done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out_putname: putname(name); return error; } SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) { return do_mkdirat(dfd, getname(pathname), mode); } SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) { return do_mkdirat(AT_FDCWD, getname(pathname), mode); } /** * vfs_rmdir - remove directory * @idmap: idmap of the mount the inode was found from * @dir: inode of @dentry * @dentry: pointer to dentry of the base directory * * Remove a directory. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply passs @nop_mnt_idmap. */ int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry) { int error = may_delete(idmap, dir, dentry, 1); if (error) return error; if (!dir->i_op->rmdir) return -EPERM; dget(dentry); inode_lock(dentry->d_inode); error = -EBUSY; if (is_local_mountpoint(dentry) || (dentry->d_inode->i_flags & S_KERNEL_FILE)) goto out; error = security_inode_rmdir(dir, dentry); if (error) goto out; error = dir->i_op->rmdir(dir, dentry); if (error) goto out; shrink_dcache_parent(dentry); dentry->d_inode->i_flags |= S_DEAD; dont_mount(dentry); detach_mounts(dentry); out: inode_unlock(dentry->d_inode); dput(dentry); if (!error) d_delete_notify(dir, dentry); return error; } EXPORT_SYMBOL(vfs_rmdir); int do_rmdir(int dfd, struct filename *name) { int error; struct dentry *dentry; struct path path; struct qstr last; int type; unsigned int lookup_flags = 0; retry: error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type); if (error) goto exit1; switch (type) { case LAST_DOTDOT: error = -ENOTEMPTY; goto exit2; case LAST_DOT: error = -EINVAL; goto exit2; case LAST_ROOT: error = -EBUSY; goto exit2; } error = mnt_want_write(path.mnt); if (error) goto exit2; inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto exit3; if (!dentry->d_inode) { error = -ENOENT; goto exit4; } error = security_path_rmdir(&path, dentry); if (error) goto exit4; error = vfs_rmdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry); exit4: dput(dentry); exit3: inode_unlock(path.dentry->d_inode); mnt_drop_write(path.mnt); exit2: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } exit1: putname(name); return error; } SYSCALL_DEFINE1(rmdir, const char __user *, pathname) { return do_rmdir(AT_FDCWD, getname(pathname)); } /** * vfs_unlink - unlink a filesystem object * @idmap: idmap of the mount the inode was found from * @dir: parent directory * @dentry: victim * @delegated_inode: returns victim inode, if the inode is delegated. * * The caller must hold dir->i_mutex. * * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and * return a reference to the inode in delegated_inode. The caller * should then break the delegation on that inode and retry. Because * breaking a delegation may take a long time, the caller should drop * dir->i_mutex before doing so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply passs @nop_mnt_idmap. */ int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, struct inode **delegated_inode) { struct inode *target = dentry->d_inode; int error = may_delete(idmap, dir, dentry, 0); if (error) return error; if (!dir->i_op->unlink) return -EPERM; inode_lock(target); if (IS_SWAPFILE(target)) error = -EPERM; else if (is_local_mountpoint(dentry)) error = -EBUSY; else { error = security_inode_unlink(dir, dentry); if (!error) { error = try_break_deleg(target, delegated_inode); if (error) goto out; error = dir->i_op->unlink(dir, dentry); if (!error) { dont_mount(dentry); detach_mounts(dentry); } } } out: inode_unlock(target); /* We don't d_delete() NFS sillyrenamed files--they still exist. */ if (!error && dentry->d_flags & DCACHE_NFSFS_RENAMED) { fsnotify_unlink(dir, dentry); } else if (!error) { fsnotify_link_count(target); d_delete_notify(dir, dentry); } return error; } EXPORT_SYMBOL(vfs_unlink); /* * Make sure that the actual truncation of the file will occur outside its * directory's i_mutex. Truncate can take a long time if there is a lot of * writeout happening, and we don't want to prevent access to the directory * while waiting on the I/O. */ int do_unlinkat(int dfd, struct filename *name) { int error; struct dentry *dentry; struct path path; struct qstr last; int type; struct inode *inode = NULL; struct inode *delegated_inode = NULL; unsigned int lookup_flags = 0; retry: error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type); if (error) goto exit1; error = -EISDIR; if (type != LAST_NORM) goto exit2; error = mnt_want_write(path.mnt); if (error) goto exit2; retry_deleg: inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { /* Why not before? Because we want correct error value */ if (last.name[last.len]) goto slashes; inode = dentry->d_inode; if (d_is_negative(dentry)) goto slashes; ihold(inode); error = security_path_unlink(&path, dentry); if (error) goto exit3; error = vfs_unlink(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, &delegated_inode); exit3: dput(dentry); } inode_unlock(path.dentry->d_inode); if (inode) iput(inode); /* truncate the inode here */ inode = NULL; if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } mnt_drop_write(path.mnt); exit2: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; inode = NULL; goto retry; } exit1: putname(name); return error; slashes: if (d_is_negative(dentry)) error = -ENOENT; else if (d_is_dir(dentry)) error = -EISDIR; else error = -ENOTDIR; goto exit3; } SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag) { if ((flag & ~AT_REMOVEDIR) != 0) return -EINVAL; if (flag & AT_REMOVEDIR) return do_rmdir(dfd, getname(pathname)); return do_unlinkat(dfd, getname(pathname)); } SYSCALL_DEFINE1(unlink, const char __user *, pathname) { return do_unlinkat(AT_FDCWD, getname(pathname)); } /** * vfs_symlink - create symlink * @idmap: idmap of the mount the inode was found from * @dir: inode of @dentry * @dentry: pointer to dentry of the base directory * @oldname: name of the file to link to * * Create a symlink. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply passs @nop_mnt_idmap. */ int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *oldname) { int error; error = may_create(idmap, dir, dentry); if (error) return error; if (!dir->i_op->symlink) return -EPERM; error = security_inode_symlink(dir, dentry, oldname); if (error) return error; error = dir->i_op->symlink(idmap, dir, dentry, oldname); if (!error) fsnotify_create(dir, dentry); return error; } EXPORT_SYMBOL(vfs_symlink); int do_symlinkat(struct filename *from, int newdfd, struct filename *to) { int error; struct dentry *dentry; struct path path; unsigned int lookup_flags = 0; if (IS_ERR(from)) { error = PTR_ERR(from); goto out_putnames; } retry: dentry = filename_create(newdfd, to, &path, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_putnames; error = security_path_symlink(&path, dentry, from->name); if (!error) error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, from->name); done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out_putnames: putname(to); putname(from); return error; } SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, int, newdfd, const char __user *, newname) { return do_symlinkat(getname(oldname), newdfd, getname(newname)); } SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname) { return do_symlinkat(getname(oldname), AT_FDCWD, getname(newname)); } /** * vfs_link - create a new link * @old_dentry: object to be linked * @idmap: idmap of the mount * @dir: new parent * @new_dentry: where to create the new link * @delegated_inode: returns inode needing a delegation break * * The caller must hold dir->i_mutex * * If vfs_link discovers a delegation on the to-be-linked file in need * of breaking, it will return -EWOULDBLOCK and return a reference to the * inode in delegated_inode. The caller should then break the delegation * and retry. Because breaking a delegation may take a long time, the * caller should drop the i_mutex before doing so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply passs @nop_mnt_idmap. */ int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode) { struct inode *inode = old_dentry->d_inode; unsigned max_links = dir->i_sb->s_max_links; int error; if (!inode) return -ENOENT; error = may_create(idmap, dir, new_dentry); if (error) return error; if (dir->i_sb != inode->i_sb) return -EXDEV; /* * A link to an append-only or immutable file cannot be created. */ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return -EPERM; /* * Updating the link count will likely cause i_uid and i_gid to * be writen back improperly if their true value is unknown to * the vfs. */ if (HAS_UNMAPPED_ID(idmap, inode)) return -EPERM; if (!dir->i_op->link) return -EPERM; if (S_ISDIR(inode->i_mode)) return -EPERM; error = security_inode_link(old_dentry, dir, new_dentry); if (error) return error; inode_lock(inode); /* Make sure we don't allow creating hardlink to an unlinked file */ if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) error = -ENOENT; else if (max_links && inode->i_nlink >= max_links) error = -EMLINK; else { error = try_break_deleg(inode, delegated_inode); if (!error) error = dir->i_op->link(old_dentry, dir, new_dentry); } if (!error && (inode->i_state & I_LINKABLE)) { spin_lock(&inode->i_lock); inode->i_state &= ~I_LINKABLE; spin_unlock(&inode->i_lock); } inode_unlock(inode); if (!error) fsnotify_link(dir, inode, new_dentry); return error; } EXPORT_SYMBOL(vfs_link); /* * Hardlinks are often used in delicate situations. We avoid * security-related surprises by not following symlinks on the * newname. --KAB * * We don't follow them on the oldname either to be compatible * with linux 2.0, and to avoid hard-linking to directories * and other special files. --ADM */ int do_linkat(int olddfd, struct filename *old, int newdfd, struct filename *new, int flags) { struct mnt_idmap *idmap; struct dentry *new_dentry; struct path old_path, new_path; struct inode *delegated_inode = NULL; int how = 0; int error; if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) { error = -EINVAL; goto out_putnames; } /* * To use null names we require CAP_DAC_READ_SEARCH * This ensures that not everyone will be able to create * handlink using the passed filedescriptor. */ if (flags & AT_EMPTY_PATH && !capable(CAP_DAC_READ_SEARCH)) { error = -ENOENT; goto out_putnames; } if (flags & AT_SYMLINK_FOLLOW) how |= LOOKUP_FOLLOW; retry: error = filename_lookup(olddfd, old, how, &old_path, NULL); if (error) goto out_putnames; new_dentry = filename_create(newdfd, new, &new_path, (how & LOOKUP_REVAL)); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) goto out_putpath; error = -EXDEV; if (old_path.mnt != new_path.mnt) goto out_dput; idmap = mnt_idmap(new_path.mnt); error = may_linkat(idmap, &old_path); if (unlikely(error)) goto out_dput; error = security_path_link(old_path.dentry, &new_path, new_dentry); if (error) goto out_dput; error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode, new_dentry, &delegated_inode); out_dput: done_path_create(&new_path, new_dentry); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) { path_put(&old_path); goto retry; } } if (retry_estale(error, how)) { path_put(&old_path); how |= LOOKUP_REVAL; goto retry; } out_putpath: path_put(&old_path); out_putnames: putname(old); putname(new); return error; } SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname, int, flags) { return do_linkat(olddfd, getname_uflags(oldname, flags), newdfd, getname(newname), flags); } SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname) { return do_linkat(AT_FDCWD, getname(oldname), AT_FDCWD, getname(newname), 0); } /** * vfs_rename - rename a filesystem object * @rd: pointer to &struct renamedata info * * The caller must hold multiple mutexes--see lock_rename()). * * If vfs_rename discovers a delegation in need of breaking at either * the source or destination, it will return -EWOULDBLOCK and return a * reference to the inode in delegated_inode. The caller should then * break the delegation and retry. Because breaking a delegation may * take a long time, the caller should drop all locks before doing * so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. * * The worst of all namespace operations - renaming directory. "Perverted" * doesn't even start to describe it. Somebody in UCB had a heck of a trip... * Problems: * * a) we can get into loop creation. * b) race potential - two innocent renames can create a loop together. * That's where 4.4 screws up. Current fix: serialization on * sb->s_vfs_rename_mutex. We might be more accurate, but that's another * story. * c) we have to lock _four_ objects - parents and victim (if it exists), * and source. * And that - after we got ->i_mutex on parents (until then we don't know * whether the target exists). Solution: try to be smart with locking * order for inodes. We rely on the fact that tree topology may change * only under ->s_vfs_rename_mutex _and_ that parent of the object we * move will be locked. Thus we can rank directories by the tree * (ancestors first) and rank all non-directories after them. * That works since everybody except rename does "lock parent, lookup, * lock child" and rename is under ->s_vfs_rename_mutex. * HOWEVER, it relies on the assumption that any object with ->lookup() * has no more than 1 dentry. If "hybrid" objects will ever appear, * we'd better make sure that there's no link(2) for them. * d) conversion from fhandle to dentry may come in the wrong moment - when * we are removing the target. Solution: we will have to grab ->i_mutex * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on * ->i_mutex on parents, which works but leads to some truly excessive * locking]. */ int vfs_rename(struct renamedata *rd) { int error; struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir; struct dentry *old_dentry = rd->old_dentry; struct dentry *new_dentry = rd->new_dentry; struct inode **delegated_inode = rd->delegated_inode; unsigned int flags = rd->flags; bool is_dir = d_is_dir(old_dentry); struct inode *source = old_dentry->d_inode; struct inode *target = new_dentry->d_inode; bool new_is_dir = false; unsigned max_links = new_dir->i_sb->s_max_links; struct name_snapshot old_name; if (source == target) return 0; error = may_delete(rd->old_mnt_idmap, old_dir, old_dentry, is_dir); if (error) return error; if (!target) { error = may_create(rd->new_mnt_idmap, new_dir, new_dentry); } else { new_is_dir = d_is_dir(new_dentry); if (!(flags & RENAME_EXCHANGE)) error = may_delete(rd->new_mnt_idmap, new_dir, new_dentry, is_dir); else error = may_delete(rd->new_mnt_idmap, new_dir, new_dentry, new_is_dir); } if (error) return error; if (!old_dir->i_op->rename) return -EPERM; /* * If we are going to change the parent - check write permissions, * we'll need to flip '..'. */ if (new_dir != old_dir) { if (is_dir) { error = inode_permission(rd->old_mnt_idmap, source, MAY_WRITE); if (error) return error; } if ((flags & RENAME_EXCHANGE) && new_is_dir) { error = inode_permission(rd->new_mnt_idmap, target, MAY_WRITE); if (error) return error; } } error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry, flags); if (error) return error; take_dentry_name_snapshot(&old_name, old_dentry); dget(new_dentry); /* * Lock all moved children. Moved directories may need to change parent * pointer so they need the lock to prevent against concurrent * directory changes moving parent pointer. For regular files we've * historically always done this. The lockdep locking subclasses are * somewhat arbitrary but RENAME_EXCHANGE in particular can swap * regular files and directories so it's difficult to tell which * subclasses to use. */ lock_two_inodes(source, target, I_MUTEX_NORMAL, I_MUTEX_NONDIR2); error = -EPERM; if (IS_SWAPFILE(source) || (target && IS_SWAPFILE(target))) goto out; error = -EBUSY; if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry)) goto out; if (max_links && new_dir != old_dir) { error = -EMLINK; if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links) goto out; if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir && old_dir->i_nlink >= max_links) goto out; } if (!is_dir) { error = try_break_deleg(source, delegated_inode); if (error) goto out; } if (target && !new_is_dir) { error = try_break_deleg(target, delegated_inode); if (error) goto out; } error = old_dir->i_op->rename(rd->new_mnt_idmap, old_dir, old_dentry, new_dir, new_dentry, flags); if (error) goto out; if (!(flags & RENAME_EXCHANGE) && target) { if (is_dir) { shrink_dcache_parent(new_dentry); target->i_flags |= S_DEAD; } dont_mount(new_dentry); detach_mounts(new_dentry); } if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) { if (!(flags & RENAME_EXCHANGE)) d_move(old_dentry, new_dentry); else d_exchange(old_dentry, new_dentry); } out: inode_unlock(source); if (target) inode_unlock(target); dput(new_dentry); if (!error) { fsnotify_move(old_dir, new_dir, &old_name.name, is_dir, !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry); if (flags & RENAME_EXCHANGE) { fsnotify_move(new_dir, old_dir, &old_dentry->d_name, new_is_dir, NULL, new_dentry); } } release_dentry_name_snapshot(&old_name); return error; } EXPORT_SYMBOL(vfs_rename); int do_renameat2(int olddfd, struct filename *from, int newdfd, struct filename *to, unsigned int flags) { struct renamedata rd; struct dentry *old_dentry, *new_dentry; struct dentry *trap; struct path old_path, new_path; struct qstr old_last, new_last; int old_type, new_type; struct inode *delegated_inode = NULL; unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET; bool should_retry = false; int error = -EINVAL; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) goto put_names; if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) && (flags & RENAME_EXCHANGE)) goto put_names; if (flags & RENAME_EXCHANGE) target_flags = 0; retry: error = filename_parentat(olddfd, from, lookup_flags, &old_path, &old_last, &old_type); if (error) goto put_names; error = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last, &new_type); if (error) goto exit1; error = -EXDEV; if (old_path.mnt != new_path.mnt) goto exit2; error = -EBUSY; if (old_type != LAST_NORM) goto exit2; if (flags & RENAME_NOREPLACE) error = -EEXIST; if (new_type != LAST_NORM) goto exit2; error = mnt_want_write(old_path.mnt); if (error) goto exit2; retry_deleg: trap = lock_rename(new_path.dentry, old_path.dentry); old_dentry = lookup_one_qstr_excl(&old_last, old_path.dentry, lookup_flags); error = PTR_ERR(old_dentry); if (IS_ERR(old_dentry)) goto exit3; /* source must exist */ error = -ENOENT; if (d_is_negative(old_dentry)) goto exit4; new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry, lookup_flags | target_flags); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) goto exit4; error = -EEXIST; if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry)) goto exit5; if (flags & RENAME_EXCHANGE) { error = -ENOENT; if (d_is_negative(new_dentry)) goto exit5; if (!d_is_dir(new_dentry)) { error = -ENOTDIR; if (new_last.name[new_last.len]) goto exit5; } } /* unless the source is a directory trailing slashes give -ENOTDIR */ if (!d_is_dir(old_dentry)) { error = -ENOTDIR; if (old_last.name[old_last.len]) goto exit5; if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len]) goto exit5; } /* source should not be ancestor of target */ error = -EINVAL; if (old_dentry == trap) goto exit5; /* target should not be an ancestor of source */ if (!(flags & RENAME_EXCHANGE)) error = -ENOTEMPTY; if (new_dentry == trap) goto exit5; error = security_path_rename(&old_path, old_dentry, &new_path, new_dentry, flags); if (error) goto exit5; rd.old_dir = old_path.dentry->d_inode; rd.old_dentry = old_dentry; rd.old_mnt_idmap = mnt_idmap(old_path.mnt); rd.new_dir = new_path.dentry->d_inode; rd.new_dentry = new_dentry; rd.new_mnt_idmap = mnt_idmap(new_path.mnt); rd.delegated_inode = &delegated_inode; rd.flags = flags; error = vfs_rename(&rd); exit5: dput(new_dentry); exit4: dput(old_dentry); exit3: unlock_rename(new_path.dentry, old_path.dentry); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } mnt_drop_write(old_path.mnt); exit2: if (retry_estale(error, lookup_flags)) should_retry = true; path_put(&new_path); exit1: path_put(&old_path); if (should_retry) { should_retry = false; lookup_flags |= LOOKUP_REVAL; goto retry; } put_names: putname(from); putname(to); return error; } SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname, unsigned int, flags) { return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname), flags); } SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname) { return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname), 0); } SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) { return do_renameat2(AT_FDCWD, getname(oldname), AT_FDCWD, getname(newname), 0); } int readlink_copy(char __user *buffer, int buflen, const char *link) { int len = PTR_ERR(link); if (IS_ERR(link)) goto out; len = strlen(link); if (len > (unsigned) buflen) len = buflen; if (copy_to_user(buffer, link, len)) len = -EFAULT; out: return len; } /** * vfs_readlink - copy symlink body into userspace buffer * @dentry: dentry on which to get symbolic link * @buffer: user memory pointer * @buflen: size of buffer * * Does not touch atime. That's up to the caller if necessary * * Does not call security hook. */ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) { struct inode *inode = d_inode(dentry); DEFINE_DELAYED_CALL(done); const char *link; int res; if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) { if (unlikely(inode->i_op->readlink)) return inode->i_op->readlink(dentry, buffer, buflen); if (!d_is_symlink(dentry)) return -EINVAL; spin_lock(&inode->i_lock); inode->i_opflags |= IOP_DEFAULT_READLINK; spin_unlock(&inode->i_lock); } link = READ_ONCE(inode->i_link); if (!link) { link = inode->i_op->get_link(dentry, inode, &done); if (IS_ERR(link)) return PTR_ERR(link); } res = readlink_copy(buffer, buflen, link); do_delayed_call(&done); return res; } EXPORT_SYMBOL(vfs_readlink); /** * vfs_get_link - get symlink body * @dentry: dentry on which to get symbolic link * @done: caller needs to free returned data with this * * Calls security hook and i_op->get_link() on the supplied inode. * * It does not touch atime. That's up to the caller if necessary. * * Does not work on "special" symlinks like /proc/$$/fd/N */ const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done) { const char *res = ERR_PTR(-EINVAL); struct inode *inode = d_inode(dentry); if (d_is_symlink(dentry)) { res = ERR_PTR(security_inode_readlink(dentry)); if (!res) res = inode->i_op->get_link(dentry, inode, done); } return res; } EXPORT_SYMBOL(vfs_get_link); /* get the link contents into pagecache */ const char *page_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *callback) { char *kaddr; struct page *page; struct address_space *mapping = inode->i_mapping; if (!dentry) { page = find_get_page(mapping, 0); if (!page) return ERR_PTR(-ECHILD); if (!PageUptodate(page)) { put_page(page); return ERR_PTR(-ECHILD); } } else { page = read_mapping_page(mapping, 0, NULL); if (IS_ERR(page)) return (char*)page; } set_delayed_call(callback, page_put_link, page); BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM); kaddr = page_address(page); nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1); return kaddr; } EXPORT_SYMBOL(page_get_link); void page_put_link(void *arg) { put_page(arg); } EXPORT_SYMBOL(page_put_link); int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) { DEFINE_DELAYED_CALL(done); int res = readlink_copy(buffer, buflen, page_get_link(dentry, d_inode(dentry), &done)); do_delayed_call(&done); return res; } EXPORT_SYMBOL(page_readlink); int page_symlink(struct inode *inode, const char *symname, int len) { struct address_space *mapping = inode->i_mapping; const struct address_space_operations *aops = mapping->a_ops; bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS); struct page *page; void *fsdata = NULL; int err; unsigned int flags; retry: if (nofs) flags = memalloc_nofs_save(); err = aops->write_begin(NULL, mapping, 0, len-1, &page, &fsdata); if (nofs) memalloc_nofs_restore(flags); if (err) goto fail; memcpy(page_address(page), symname, len-1); err = aops->write_end(NULL, mapping, 0, len-1, len-1, page, fsdata); if (err < 0) goto fail; if (err < len-1) goto retry; mark_inode_dirty(inode); return 0; fail: return err; } EXPORT_SYMBOL(page_symlink); const struct inode_operations page_symlink_inode_operations = { .get_link = page_get_link, }; EXPORT_SYMBOL(page_symlink_inode_operations); |
| 633 911 27 952 6 2 17 7 681 952 636 681 685 685 32 3 956 951 907 685 682 2 682 200 925 952 251 926 926 926 230 681 681 535 535 535 17 675 6 636 31 635 29 633 29 633 636 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Resizable, Scalable, Concurrent Hash Table * * Copyright (c) 2015-2016 Herbert Xu <herbert@gondor.apana.org.au> * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch> * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net> * * Code partially derived from nft_hash * Rewritten with rehash code from br_multicast plus single list * pointer as suggested by Josh Triplett * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ #ifndef _LINUX_RHASHTABLE_H #define _LINUX_RHASHTABLE_H #include <linux/err.h> #include <linux/errno.h> #include <linux/jhash.h> #include <linux/list_nulls.h> #include <linux/workqueue.h> #include <linux/rculist.h> #include <linux/bit_spinlock.h> #include <linux/rhashtable-types.h> /* * Objects in an rhashtable have an embedded struct rhash_head * which is linked into as hash chain from the hash table - or one * of two or more hash tables when the rhashtable is being resized. * The end of the chain is marked with a special nulls marks which has * the least significant bit set but otherwise stores the address of * the hash bucket. This allows us to be sure we've found the end * of the right list. * The value stored in the hash bucket has BIT(0) used as a lock bit. * This bit must be atomically set before any changes are made to * the chain. To avoid dereferencing this pointer without clearing * the bit first, we use an opaque 'struct rhash_lock_head *' for the * pointer stored in the bucket. This struct needs to be defined so * that rcu_dereference() works on it, but it has no content so a * cast is needed for it to be useful. This ensures it isn't * used by mistake with clearing the lock bit first. */ struct rhash_lock_head {}; /* Maximum chain length before rehash * * The maximum (not average) chain length grows with the size of the hash * table, at a rate of (log N)/(log log N). * * The value of 16 is selected so that even if the hash table grew to * 2^32 you would not expect the maximum chain length to exceed it * unless we are under attack (or extremely unlucky). * * As this limit is only to detect attacks, we don't need to set it to a * lower value as you'd need the chain length to vastly exceed 16 to have * any real effect on the system. */ #define RHT_ELASTICITY 16u /** * struct bucket_table - Table of hash buckets * @size: Number of hash buckets * @nest: Number of bits of first-level nested table. * @rehash: Current bucket being rehashed * @hash_rnd: Random seed to fold into hash * @walkers: List of active walkers * @rcu: RCU structure for freeing the table * @future_tbl: Table under construction during rehashing * @ntbl: Nested table used when out of memory. * @buckets: size * hash buckets */ struct bucket_table { unsigned int size; unsigned int nest; u32 hash_rnd; struct list_head walkers; struct rcu_head rcu; struct bucket_table __rcu *future_tbl; struct lockdep_map dep_map; struct rhash_lock_head __rcu *buckets[] ____cacheline_aligned_in_smp; }; /* * NULLS_MARKER() expects a hash value with the low * bits mostly likely to be significant, and it discards * the msb. * We give it an address, in which the bottom bit is * always 0, and the msb might be significant. * So we shift the address down one bit to align with * expectations and avoid losing a significant bit. * * We never store the NULLS_MARKER in the hash table * itself as we need the lsb for locking. * Instead we store a NULL */ #define RHT_NULLS_MARKER(ptr) \ ((void *)NULLS_MARKER(((unsigned long) (ptr)) >> 1)) #define INIT_RHT_NULLS_HEAD(ptr) \ ((ptr) = NULL) static inline bool rht_is_a_nulls(const struct rhash_head *ptr) { return ((unsigned long) ptr & 1); } static inline void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he) { return (char *)he - ht->p.head_offset; } static inline unsigned int rht_bucket_index(const struct bucket_table *tbl, unsigned int hash) { return hash & (tbl->size - 1); } static inline unsigned int rht_key_get_hash(struct rhashtable *ht, const void *key, const struct rhashtable_params params, unsigned int hash_rnd) { unsigned int hash; /* params must be equal to ht->p if it isn't constant. */ if (!__builtin_constant_p(params.key_len)) hash = ht->p.hashfn(key, ht->key_len, hash_rnd); else if (params.key_len) { unsigned int key_len = params.key_len; if (params.hashfn) hash = params.hashfn(key, key_len, hash_rnd); else if (key_len & (sizeof(u32) - 1)) hash = jhash(key, key_len, hash_rnd); else hash = jhash2(key, key_len / sizeof(u32), hash_rnd); } else { unsigned int key_len = ht->p.key_len; if (params.hashfn) hash = params.hashfn(key, key_len, hash_rnd); else hash = jhash(key, key_len, hash_rnd); } return hash; } static inline unsigned int rht_key_hashfn( struct rhashtable *ht, const struct bucket_table *tbl, const void *key, const struct rhashtable_params params) { unsigned int hash = rht_key_get_hash(ht, key, params, tbl->hash_rnd); return rht_bucket_index(tbl, hash); } static inline unsigned int rht_head_hashfn( struct rhashtable *ht, const struct bucket_table *tbl, const struct rhash_head *he, const struct rhashtable_params params) { const char *ptr = rht_obj(ht, he); return likely(params.obj_hashfn) ? rht_bucket_index(tbl, params.obj_hashfn(ptr, params.key_len ?: ht->p.key_len, tbl->hash_rnd)) : rht_key_hashfn(ht, tbl, ptr + params.key_offset, params); } /** * rht_grow_above_75 - returns true if nelems > 0.75 * table-size * @ht: hash table * @tbl: current table */ static inline bool rht_grow_above_75(const struct rhashtable *ht, const struct bucket_table *tbl) { /* Expand table when exceeding 75% load */ return atomic_read(&ht->nelems) > (tbl->size / 4 * 3) && (!ht->p.max_size || tbl->size < ht->p.max_size); } /** * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size * @ht: hash table * @tbl: current table */ static inline bool rht_shrink_below_30(const struct rhashtable *ht, const struct bucket_table *tbl) { /* Shrink table beneath 30% load */ return atomic_read(&ht->nelems) < (tbl->size * 3 / 10) && tbl->size > ht->p.min_size; } /** * rht_grow_above_100 - returns true if nelems > table-size * @ht: hash table * @tbl: current table */ static inline bool rht_grow_above_100(const struct rhashtable *ht, const struct bucket_table *tbl) { return atomic_read(&ht->nelems) > tbl->size && (!ht->p.max_size || tbl->size < ht->p.max_size); } /** * rht_grow_above_max - returns true if table is above maximum * @ht: hash table * @tbl: current table */ static inline bool rht_grow_above_max(const struct rhashtable *ht, const struct bucket_table *tbl) { return atomic_read(&ht->nelems) >= ht->max_elems; } #ifdef CONFIG_PROVE_LOCKING int lockdep_rht_mutex_is_held(struct rhashtable *ht); int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash); #else static inline int lockdep_rht_mutex_is_held(struct rhashtable *ht) { return 1; } static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash) { return 1; } #endif /* CONFIG_PROVE_LOCKING */ void *rhashtable_insert_slow(struct rhashtable *ht, const void *key, struct rhash_head *obj); void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter); void rhashtable_walk_exit(struct rhashtable_iter *iter); int rhashtable_walk_start_check(struct rhashtable_iter *iter) __acquires(RCU); static inline void rhashtable_walk_start(struct rhashtable_iter *iter) { (void)rhashtable_walk_start_check(iter); } void *rhashtable_walk_next(struct rhashtable_iter *iter); void *rhashtable_walk_peek(struct rhashtable_iter *iter); void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU); void rhashtable_free_and_destroy(struct rhashtable *ht, void (*free_fn)(void *ptr, void *arg), void *arg); void rhashtable_destroy(struct rhashtable *ht); struct rhash_lock_head __rcu **rht_bucket_nested( const struct bucket_table *tbl, unsigned int hash); struct rhash_lock_head __rcu **__rht_bucket_nested( const struct bucket_table *tbl, unsigned int hash); struct rhash_lock_head __rcu **rht_bucket_nested_insert( struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash); #define rht_dereference(p, ht) \ rcu_dereference_protected(p, lockdep_rht_mutex_is_held(ht)) #define rht_dereference_rcu(p, ht) \ rcu_dereference_check(p, lockdep_rht_mutex_is_held(ht)) #define rht_dereference_bucket(p, tbl, hash) \ rcu_dereference_protected(p, lockdep_rht_bucket_is_held(tbl, hash)) #define rht_dereference_bucket_rcu(p, tbl, hash) \ rcu_dereference_check(p, lockdep_rht_bucket_is_held(tbl, hash)) #define rht_entry(tpos, pos, member) \ ({ tpos = container_of(pos, typeof(*tpos), member); 1; }) static inline struct rhash_lock_head __rcu *const *rht_bucket( const struct bucket_table *tbl, unsigned int hash) { return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) : &tbl->buckets[hash]; } static inline struct rhash_lock_head __rcu **rht_bucket_var( struct bucket_table *tbl, unsigned int hash) { return unlikely(tbl->nest) ? __rht_bucket_nested(tbl, hash) : &tbl->buckets[hash]; } static inline struct rhash_lock_head __rcu **rht_bucket_insert( struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash) { return unlikely(tbl->nest) ? rht_bucket_nested_insert(ht, tbl, hash) : &tbl->buckets[hash]; } /* * We lock a bucket by setting BIT(0) in the pointer - this is always * zero in real pointers. The NULLS mark is never stored in the bucket, * rather we store NULL if the bucket is empty. * bit_spin_locks do not handle contention well, but the whole point * of the hashtable design is to achieve minimum per-bucket contention. * A nested hash table might not have a bucket pointer. In that case * we cannot get a lock. For remove and replace the bucket cannot be * interesting and doesn't need locking. * For insert we allocate the bucket if this is the last bucket_table, * and then take the lock. * Sometimes we unlock a bucket by writing a new pointer there. In that * case we don't need to unlock, but we do need to reset state such as * local_bh. For that we have rht_assign_unlock(). As rcu_assign_pointer() * provides the same release semantics that bit_spin_unlock() provides, * this is safe. * When we write to a bucket without unlocking, we use rht_assign_locked(). */ static inline unsigned long rht_lock(struct bucket_table *tbl, struct rhash_lock_head __rcu **bkt) { unsigned long flags; local_irq_save(flags); bit_spin_lock(0, (unsigned long *)bkt); lock_map_acquire(&tbl->dep_map); return flags; } static inline unsigned long rht_lock_nested(struct bucket_table *tbl, struct rhash_lock_head __rcu **bucket, unsigned int subclass) { unsigned long flags; local_irq_save(flags); bit_spin_lock(0, (unsigned long *)bucket); lock_acquire_exclusive(&tbl->dep_map, subclass, 0, NULL, _THIS_IP_); return flags; } static inline void rht_unlock(struct bucket_table *tbl, struct rhash_lock_head __rcu **bkt, unsigned long flags) { lock_map_release(&tbl->dep_map); bit_spin_unlock(0, (unsigned long *)bkt); local_irq_restore(flags); } static inline struct rhash_head *__rht_ptr( struct rhash_lock_head *p, struct rhash_lock_head __rcu *const *bkt) { return (struct rhash_head *) ((unsigned long)p & ~BIT(0) ?: (unsigned long)RHT_NULLS_MARKER(bkt)); } /* * Where 'bkt' is a bucket and might be locked: * rht_ptr_rcu() dereferences that pointer and clears the lock bit. * rht_ptr() dereferences in a context where the bucket is locked. * rht_ptr_exclusive() dereferences in a context where exclusive * access is guaranteed, such as when destroying the table. */ static inline struct rhash_head *rht_ptr_rcu( struct rhash_lock_head __rcu *const *bkt) { return __rht_ptr(rcu_dereference(*bkt), bkt); } static inline struct rhash_head *rht_ptr( struct rhash_lock_head __rcu *const *bkt, struct bucket_table *tbl, unsigned int hash) { return __rht_ptr(rht_dereference_bucket(*bkt, tbl, hash), bkt); } static inline struct rhash_head *rht_ptr_exclusive( struct rhash_lock_head __rcu *const *bkt) { return __rht_ptr(rcu_dereference_protected(*bkt, 1), bkt); } static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt, struct rhash_head *obj) { if (rht_is_a_nulls(obj)) obj = NULL; rcu_assign_pointer(*bkt, (void *)((unsigned long)obj | BIT(0))); } static inline void rht_assign_unlock(struct bucket_table *tbl, struct rhash_lock_head __rcu **bkt, struct rhash_head *obj, unsigned long flags) { if (rht_is_a_nulls(obj)) obj = NULL; lock_map_release(&tbl->dep_map); rcu_assign_pointer(*bkt, (void *)obj); preempt_enable(); __release(bitlock); local_irq_restore(flags); } /** * rht_for_each_from - iterate over hash chain from given head * @pos: the &struct rhash_head to use as a loop cursor. * @head: the &struct rhash_head to start from * @tbl: the &struct bucket_table * @hash: the hash value / bucket index */ #define rht_for_each_from(pos, head, tbl, hash) \ for (pos = head; \ !rht_is_a_nulls(pos); \ pos = rht_dereference_bucket((pos)->next, tbl, hash)) /** * rht_for_each - iterate over hash chain * @pos: the &struct rhash_head to use as a loop cursor. * @tbl: the &struct bucket_table * @hash: the hash value / bucket index */ #define rht_for_each(pos, tbl, hash) \ rht_for_each_from(pos, rht_ptr(rht_bucket(tbl, hash), tbl, hash), \ tbl, hash) /** * rht_for_each_entry_from - iterate over hash chain from given head * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. * @head: the &struct rhash_head to start from * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. */ #define rht_for_each_entry_from(tpos, pos, head, tbl, hash, member) \ for (pos = head; \ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ pos = rht_dereference_bucket((pos)->next, tbl, hash)) /** * rht_for_each_entry - iterate over hash chain of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. */ #define rht_for_each_entry(tpos, pos, tbl, hash, member) \ rht_for_each_entry_from(tpos, pos, \ rht_ptr(rht_bucket(tbl, hash), tbl, hash), \ tbl, hash, member) /** * rht_for_each_entry_safe - safely iterate over hash chain of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. * @next: the &struct rhash_head to use as next in loop cursor. * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. * * This hash chain list-traversal primitive allows for the looped code to * remove the loop cursor from the list. */ #define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member) \ for (pos = rht_ptr(rht_bucket(tbl, hash), tbl, hash), \ next = !rht_is_a_nulls(pos) ? \ rht_dereference_bucket(pos->next, tbl, hash) : NULL; \ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ pos = next, \ next = !rht_is_a_nulls(pos) ? \ rht_dereference_bucket(pos->next, tbl, hash) : NULL) /** * rht_for_each_rcu_from - iterate over rcu hash chain from given head * @pos: the &struct rhash_head to use as a loop cursor. * @head: the &struct rhash_head to start from * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * * This hash chain list-traversal primitive may safely run concurrently with * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ #define rht_for_each_rcu_from(pos, head, tbl, hash) \ for (({barrier(); }), \ pos = head; \ !rht_is_a_nulls(pos); \ pos = rcu_dereference_raw(pos->next)) /** * rht_for_each_rcu - iterate over rcu hash chain * @pos: the &struct rhash_head to use as a loop cursor. * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * * This hash chain list-traversal primitive may safely run concurrently with * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ #define rht_for_each_rcu(pos, tbl, hash) \ for (({barrier(); }), \ pos = rht_ptr_rcu(rht_bucket(tbl, hash)); \ !rht_is_a_nulls(pos); \ pos = rcu_dereference_raw(pos->next)) /** * rht_for_each_entry_rcu_from - iterated over rcu hash chain from given head * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. * @head: the &struct rhash_head to start from * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. * * This hash chain list-traversal primitive may safely run concurrently with * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ #define rht_for_each_entry_rcu_from(tpos, pos, head, tbl, hash, member) \ for (({barrier(); }), \ pos = head; \ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ pos = rht_dereference_bucket_rcu(pos->next, tbl, hash)) /** * rht_for_each_entry_rcu - iterate over rcu hash chain of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. * * This hash chain list-traversal primitive may safely run concurrently with * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ #define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \ rht_for_each_entry_rcu_from(tpos, pos, \ rht_ptr_rcu(rht_bucket(tbl, hash)), \ tbl, hash, member) /** * rhl_for_each_rcu - iterate over rcu hash table list * @pos: the &struct rlist_head to use as a loop cursor. * @list: the head of the list * * This hash chain list-traversal primitive should be used on the * list returned by rhltable_lookup. */ #define rhl_for_each_rcu(pos, list) \ for (pos = list; pos; pos = rcu_dereference_raw(pos->next)) /** * rhl_for_each_entry_rcu - iterate over rcu hash table list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct rlist_head to use as a loop cursor. * @list: the head of the list * @member: name of the &struct rlist_head within the hashable struct. * * This hash chain list-traversal primitive should be used on the * list returned by rhltable_lookup. */ #define rhl_for_each_entry_rcu(tpos, pos, list, member) \ for (pos = list; pos && rht_entry(tpos, pos, member); \ pos = rcu_dereference_raw(pos->next)) static inline int rhashtable_compare(struct rhashtable_compare_arg *arg, const void *obj) { struct rhashtable *ht = arg->ht; const char *ptr = obj; return memcmp(ptr + ht->p.key_offset, arg->key, ht->p.key_len); } /* Internal function, do not use. */ static inline struct rhash_head *__rhashtable_lookup( struct rhashtable *ht, const void *key, const struct rhashtable_params params) { struct rhashtable_compare_arg arg = { .ht = ht, .key = key, }; struct rhash_lock_head __rcu *const *bkt; struct bucket_table *tbl; struct rhash_head *he; unsigned int hash; tbl = rht_dereference_rcu(ht->tbl, ht); restart: hash = rht_key_hashfn(ht, tbl, key, params); bkt = rht_bucket(tbl, hash); do { rht_for_each_rcu_from(he, rht_ptr_rcu(bkt), tbl, hash) { if (params.obj_cmpfn ? params.obj_cmpfn(&arg, rht_obj(ht, he)) : rhashtable_compare(&arg, rht_obj(ht, he))) continue; return he; } /* An object might have been moved to a different hash chain, * while we walk along it - better check and retry. */ } while (he != RHT_NULLS_MARKER(bkt)); /* Ensure we see any new tables. */ smp_rmb(); tbl = rht_dereference_rcu(tbl->future_tbl, ht); if (unlikely(tbl)) goto restart; return NULL; } /** * rhashtable_lookup - search hash table * @ht: hash table * @key: the pointer to the key * @params: hash table parameters * * Computes the hash value for the key and traverses the bucket chain looking * for a entry with an identical key. The first matching entry is returned. * * This must only be called under the RCU read lock. * * Returns the first entry on which the compare function returned true. */ static inline void *rhashtable_lookup( struct rhashtable *ht, const void *key, const struct rhashtable_params params) { struct rhash_head *he = __rhashtable_lookup(ht, key, params); return he ? rht_obj(ht, he) : NULL; } /** * rhashtable_lookup_fast - search hash table, without RCU read lock * @ht: hash table * @key: the pointer to the key * @params: hash table parameters * * Computes the hash value for the key and traverses the bucket chain looking * for a entry with an identical key. The first matching entry is returned. * * Only use this function when you have other mechanisms guaranteeing * that the object won't go away after the RCU read lock is released. * * Returns the first entry on which the compare function returned true. */ static inline void *rhashtable_lookup_fast( struct rhashtable *ht, const void *key, const struct rhashtable_params params) { void *obj; rcu_read_lock(); obj = rhashtable_lookup(ht, key, params); rcu_read_unlock(); return obj; } /** * rhltable_lookup - search hash list table * @hlt: hash table * @key: the pointer to the key * @params: hash table parameters * * Computes the hash value for the key and traverses the bucket chain looking * for a entry with an identical key. All matching entries are returned * in a list. * * This must only be called under the RCU read lock. * * Returns the list of entries that match the given key. */ static inline struct rhlist_head *rhltable_lookup( struct rhltable *hlt, const void *key, const struct rhashtable_params params) { struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params); return he ? container_of(he, struct rhlist_head, rhead) : NULL; } /* Internal function, please use rhashtable_insert_fast() instead. This * function returns the existing element already in hashes in there is a clash, * otherwise it returns an error via ERR_PTR(). */ static inline void *__rhashtable_insert_fast( struct rhashtable *ht, const void *key, struct rhash_head *obj, const struct rhashtable_params params, bool rhlist) { struct rhashtable_compare_arg arg = { .ht = ht, .key = key, }; struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct bucket_table *tbl; struct rhash_head *head; unsigned long flags; unsigned int hash; int elasticity; void *data; rcu_read_lock(); tbl = rht_dereference_rcu(ht->tbl, ht); hash = rht_head_hashfn(ht, tbl, obj, params); elasticity = RHT_ELASTICITY; bkt = rht_bucket_insert(ht, tbl, hash); data = ERR_PTR(-ENOMEM); if (!bkt) goto out; pprev = NULL; flags = rht_lock(tbl, bkt); if (unlikely(rcu_access_pointer(tbl->future_tbl))) { slow_path: rht_unlock(tbl, bkt, flags); rcu_read_unlock(); return rhashtable_insert_slow(ht, key, obj); } rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) { struct rhlist_head *plist; struct rhlist_head *list; elasticity--; if (!key || (params.obj_cmpfn ? params.obj_cmpfn(&arg, rht_obj(ht, head)) : rhashtable_compare(&arg, rht_obj(ht, head)))) { pprev = &head->next; continue; } data = rht_obj(ht, head); if (!rhlist) goto out_unlock; list = container_of(obj, struct rhlist_head, rhead); plist = container_of(head, struct rhlist_head, rhead); RCU_INIT_POINTER(list->next, plist); head = rht_dereference_bucket(head->next, tbl, hash); RCU_INIT_POINTER(list->rhead.next, head); if (pprev) { rcu_assign_pointer(*pprev, obj); rht_unlock(tbl, bkt, flags); } else rht_assign_unlock(tbl, bkt, obj, flags); data = NULL; goto out; } if (elasticity <= 0) goto slow_path; data = ERR_PTR(-E2BIG); if (unlikely(rht_grow_above_max(ht, tbl))) goto out_unlock; if (unlikely(rht_grow_above_100(ht, tbl))) goto slow_path; /* Inserting at head of list makes unlocking free. */ head = rht_ptr(bkt, tbl, hash); RCU_INIT_POINTER(obj->next, head); if (rhlist) { struct rhlist_head *list; list = container_of(obj, struct rhlist_head, rhead); RCU_INIT_POINTER(list->next, NULL); } atomic_inc(&ht->nelems); rht_assign_unlock(tbl, bkt, obj, flags); if (rht_grow_above_75(ht, tbl)) schedule_work(&ht->run_work); data = NULL; out: rcu_read_unlock(); return data; out_unlock: rht_unlock(tbl, bkt, flags); goto out; } /** * rhashtable_insert_fast - insert object into hash table * @ht: hash table * @obj: pointer to hash head inside object * @params: hash table parameters * * Will take the per bucket bitlock to protect against mutual mutations * on the same bucket. Multiple insertions may occur in parallel unless * they map to the same bucket. * * It is safe to call this function from atomic context. * * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. */ static inline int rhashtable_insert_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { void *ret; ret = __rhashtable_insert_fast(ht, NULL, obj, params, false); if (IS_ERR(ret)) return PTR_ERR(ret); return ret == NULL ? 0 : -EEXIST; } /** * rhltable_insert_key - insert object into hash list table * @hlt: hash list table * @key: the pointer to the key * @list: pointer to hash list head inside object * @params: hash table parameters * * Will take the per bucket bitlock to protect against mutual mutations * on the same bucket. Multiple insertions may occur in parallel unless * they map to the same bucket. * * It is safe to call this function from atomic context. * * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. */ static inline int rhltable_insert_key( struct rhltable *hlt, const void *key, struct rhlist_head *list, const struct rhashtable_params params) { return PTR_ERR(__rhashtable_insert_fast(&hlt->ht, key, &list->rhead, params, true)); } /** * rhltable_insert - insert object into hash list table * @hlt: hash list table * @list: pointer to hash list head inside object * @params: hash table parameters * * Will take the per bucket bitlock to protect against mutual mutations * on the same bucket. Multiple insertions may occur in parallel unless * they map to the same bucket. * * It is safe to call this function from atomic context. * * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. */ static inline int rhltable_insert( struct rhltable *hlt, struct rhlist_head *list, const struct rhashtable_params params) { const char *key = rht_obj(&hlt->ht, &list->rhead); key += params.key_offset; return rhltable_insert_key(hlt, key, list, params); } /** * rhashtable_lookup_insert_fast - lookup and insert object into hash table * @ht: hash table * @obj: pointer to hash head inside object * @params: hash table parameters * * This lookup function may only be used for fixed key hash table (key_len * parameter set). It will BUG() if used inappropriately. * * It is safe to call this function from atomic context. * * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. */ static inline int rhashtable_lookup_insert_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { const char *key = rht_obj(ht, obj); void *ret; BUG_ON(ht->p.obj_hashfn); ret = __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params, false); if (IS_ERR(ret)) return PTR_ERR(ret); return ret == NULL ? 0 : -EEXIST; } /** * rhashtable_lookup_get_insert_fast - lookup and insert object into hash table * @ht: hash table * @obj: pointer to hash head inside object * @params: hash table parameters * * Just like rhashtable_lookup_insert_fast(), but this function returns the * object if it exists, NULL if it did not and the insertion was successful, * and an ERR_PTR otherwise. */ static inline void *rhashtable_lookup_get_insert_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { const char *key = rht_obj(ht, obj); BUG_ON(ht->p.obj_hashfn); return __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params, false); } /** * rhashtable_lookup_insert_key - search and insert object to hash table * with explicit key * @ht: hash table * @key: key * @obj: pointer to hash head inside object * @params: hash table parameters * * Lookups may occur in parallel with hashtable mutations and resizing. * * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. * * Returns zero on success. */ static inline int rhashtable_lookup_insert_key( struct rhashtable *ht, const void *key, struct rhash_head *obj, const struct rhashtable_params params) { void *ret; BUG_ON(!ht->p.obj_hashfn || !key); ret = __rhashtable_insert_fast(ht, key, obj, params, false); if (IS_ERR(ret)) return PTR_ERR(ret); return ret == NULL ? 0 : -EEXIST; } /** * rhashtable_lookup_get_insert_key - lookup and insert object into hash table * @ht: hash table * @key: key * @obj: pointer to hash head inside object * @params: hash table parameters * * Just like rhashtable_lookup_insert_key(), but this function returns the * object if it exists, NULL if it does not and the insertion was successful, * and an ERR_PTR otherwise. */ static inline void *rhashtable_lookup_get_insert_key( struct rhashtable *ht, const void *key, struct rhash_head *obj, const struct rhashtable_params params) { BUG_ON(!ht->p.obj_hashfn || !key); return __rhashtable_insert_fast(ht, key, obj, params, false); } /* Internal function, please use rhashtable_remove_fast() instead */ static inline int __rhashtable_remove_fast_one( struct rhashtable *ht, struct bucket_table *tbl, struct rhash_head *obj, const struct rhashtable_params params, bool rhlist) { struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct rhash_head *he; unsigned long flags; unsigned int hash; int err = -ENOENT; hash = rht_head_hashfn(ht, tbl, obj, params); bkt = rht_bucket_var(tbl, hash); if (!bkt) return -ENOENT; pprev = NULL; flags = rht_lock(tbl, bkt); rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) { struct rhlist_head *list; list = container_of(he, struct rhlist_head, rhead); if (he != obj) { struct rhlist_head __rcu **lpprev; pprev = &he->next; if (!rhlist) continue; do { lpprev = &list->next; list = rht_dereference_bucket(list->next, tbl, hash); } while (list && obj != &list->rhead); if (!list) continue; list = rht_dereference_bucket(list->next, tbl, hash); RCU_INIT_POINTER(*lpprev, list); err = 0; break; } obj = rht_dereference_bucket(obj->next, tbl, hash); err = 1; if (rhlist) { list = rht_dereference_bucket(list->next, tbl, hash); if (list) { RCU_INIT_POINTER(list->rhead.next, obj); obj = &list->rhead; err = 0; } } if (pprev) { rcu_assign_pointer(*pprev, obj); rht_unlock(tbl, bkt, flags); } else { rht_assign_unlock(tbl, bkt, obj, flags); } goto unlocked; } rht_unlock(tbl, bkt, flags); unlocked: if (err > 0) { atomic_dec(&ht->nelems); if (unlikely(ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl))) schedule_work(&ht->run_work); err = 0; } return err; } /* Internal function, please use rhashtable_remove_fast() instead */ static inline int __rhashtable_remove_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params, bool rhlist) { struct bucket_table *tbl; int err; rcu_read_lock(); tbl = rht_dereference_rcu(ht->tbl, ht); /* Because we have already taken (and released) the bucket * lock in old_tbl, if we find that future_tbl is not yet * visible then that guarantees the entry to still be in * the old tbl if it exists. */ while ((err = __rhashtable_remove_fast_one(ht, tbl, obj, params, rhlist)) && (tbl = rht_dereference_rcu(tbl->future_tbl, ht))) ; rcu_read_unlock(); return err; } /** * rhashtable_remove_fast - remove object from hash table * @ht: hash table * @obj: pointer to hash head inside object * @params: hash table parameters * * Since the hash chain is single linked, the removal operation needs to * walk the bucket chain upon removal. The removal operation is thus * considerable slow if the hash table is not correctly sized. * * Will automatically shrink the table if permitted when residency drops * below 30%. * * Returns zero on success, -ENOENT if the entry could not be found. */ static inline int rhashtable_remove_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { return __rhashtable_remove_fast(ht, obj, params, false); } /** * rhltable_remove - remove object from hash list table * @hlt: hash list table * @list: pointer to hash list head inside object * @params: hash table parameters * * Since the hash chain is single linked, the removal operation needs to * walk the bucket chain upon removal. The removal operation is thus * considerable slow if the hash table is not correctly sized. * * Will automatically shrink the table if permitted when residency drops * below 30% * * Returns zero on success, -ENOENT if the entry could not be found. */ static inline int rhltable_remove( struct rhltable *hlt, struct rhlist_head *list, const struct rhashtable_params params) { return __rhashtable_remove_fast(&hlt->ht, &list->rhead, params, true); } /* Internal function, please use rhashtable_replace_fast() instead */ static inline int __rhashtable_replace_fast( struct rhashtable *ht, struct bucket_table *tbl, struct rhash_head *obj_old, struct rhash_head *obj_new, const struct rhashtable_params params) { struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct rhash_head *he; unsigned long flags; unsigned int hash; int err = -ENOENT; /* Minimally, the old and new objects must have same hash * (which should mean identifiers are the same). */ hash = rht_head_hashfn(ht, tbl, obj_old, params); if (hash != rht_head_hashfn(ht, tbl, obj_new, params)) return -EINVAL; bkt = rht_bucket_var(tbl, hash); if (!bkt) return -ENOENT; pprev = NULL; flags = rht_lock(tbl, bkt); rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) { if (he != obj_old) { pprev = &he->next; continue; } rcu_assign_pointer(obj_new->next, obj_old->next); if (pprev) { rcu_assign_pointer(*pprev, obj_new); rht_unlock(tbl, bkt, flags); } else { rht_assign_unlock(tbl, bkt, obj_new, flags); } err = 0; goto unlocked; } rht_unlock(tbl, bkt, flags); unlocked: return err; } /** * rhashtable_replace_fast - replace an object in hash table * @ht: hash table * @obj_old: pointer to hash head inside object being replaced * @obj_new: pointer to hash head inside object which is new * @params: hash table parameters * * Replacing an object doesn't affect the number of elements in the hash table * or bucket, so we don't need to worry about shrinking or expanding the * table here. * * Returns zero on success, -ENOENT if the entry could not be found, * -EINVAL if hash is not the same for the old and new objects. */ static inline int rhashtable_replace_fast( struct rhashtable *ht, struct rhash_head *obj_old, struct rhash_head *obj_new, const struct rhashtable_params params) { struct bucket_table *tbl; int err; rcu_read_lock(); tbl = rht_dereference_rcu(ht->tbl, ht); /* Because we have already taken (and released) the bucket * lock in old_tbl, if we find that future_tbl is not yet * visible then that guarantees the entry to still be in * the old tbl if it exists. */ while ((err = __rhashtable_replace_fast(ht, tbl, obj_old, obj_new, params)) && (tbl = rht_dereference_rcu(tbl->future_tbl, ht))) ; rcu_read_unlock(); return err; } /** * rhltable_walk_enter - Initialise an iterator * @hlt: Table to walk over * @iter: Hash table Iterator * * This function prepares a hash table walk. * * Note that if you restart a walk after rhashtable_walk_stop you * may see the same object twice. Also, you may miss objects if * there are removals in between rhashtable_walk_stop and the next * call to rhashtable_walk_start. * * For a completely stable walk you should construct your own data * structure outside the hash table. * * This function may be called from any process context, including * non-preemptable context, but cannot be called from softirq or * hardirq context. * * You must call rhashtable_walk_exit after this function returns. */ static inline void rhltable_walk_enter(struct rhltable *hlt, struct rhashtable_iter *iter) { return rhashtable_walk_enter(&hlt->ht, iter); } /** * rhltable_free_and_destroy - free elements and destroy hash list table * @hlt: the hash list table to destroy * @free_fn: callback to release resources of element * @arg: pointer passed to free_fn * * See documentation for rhashtable_free_and_destroy. */ static inline void rhltable_free_and_destroy(struct rhltable *hlt, void (*free_fn)(void *ptr, void *arg), void *arg) { return rhashtable_free_and_destroy(&hlt->ht, free_fn, arg); } static inline void rhltable_destroy(struct rhltable *hlt) { return rhltable_free_and_destroy(hlt, NULL, NULL); } #endif /* _LINUX_RHASHTABLE_H */ |
| 87 87 87 87 87 87 87 87 87 87 87 87 1 86 1 86 17 86 76 58 2 75 2 2 86 2 11 14 86 3 1 2 3 47 1 67 46 48 48 48 45 36 76 25 76 26 76 36 22 36 87 87 87 1 87 86 70 67 2 19 65 12 67 1 48 36 36 36 86 50 28 28 28 28 28 27 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 | // SPDX-License-Identifier: GPL-2.0+ /* * Driver for USB Mass Storage compliant devices * * Current development and maintenance by: * (c) 1999-2003 Matthew Dharm (mdharm-usb@one-eyed-alien.net) * * Developed with the assistance of: * (c) 2000 David L. Brown, Jr. (usb-storage@davidb.org) * (c) 2003-2009 Alan Stern (stern@rowland.harvard.edu) * * Initial work by: * (c) 1999 Michael Gee (michael@linuxspecific.com) * * usb_device_id support by Adam J. Richter (adam@yggdrasil.com): * (c) 2000 Yggdrasil Computing, Inc. * * This driver is based on the 'USB Mass Storage Class' document. This * describes in detail the protocol used to communicate with such * devices. Clearly, the designers had SCSI and ATAPI commands in * mind when they created this document. The commands are all very * similar to commands in the SCSI-II and ATAPI specifications. * * It is important to note that in a number of cases this class * exhibits class-specific exemptions from the USB specification. * Notably the usage of NAK, STALL and ACK differs from the norm, in * that they are used to communicate wait, failed and OK on commands. * * Also, for certain devices, the interrupt endpoint is used to convey * status of a command. */ #ifdef CONFIG_USB_STORAGE_DEBUG #define DEBUG #endif #include <linux/sched.h> #include <linux/errno.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/kthread.h> #include <linux/mutex.h> #include <linux/utsname.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_device.h> #include "usb.h" #include "scsiglue.h" #include "transport.h" #include "protocol.h" #include "debug.h" #include "initializers.h" #include "sierra_ms.h" #include "option_ms.h" #if IS_ENABLED(CONFIG_USB_UAS) #include "uas-detect.h" #endif #define DRV_NAME "usb-storage" /* Some informational data */ MODULE_AUTHOR("Matthew Dharm <mdharm-usb@one-eyed-alien.net>"); MODULE_DESCRIPTION("USB Mass Storage driver for Linux"); MODULE_LICENSE("GPL"); static unsigned int delay_use = 1; module_param(delay_use, uint, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(delay_use, "seconds to delay before using a new device"); static char quirks[128]; module_param_string(quirks, quirks, sizeof(quirks), S_IRUGO | S_IWUSR); MODULE_PARM_DESC(quirks, "supplemental list of device IDs and their quirks"); /* * The entries in this table correspond, line for line, * with the entries in usb_storage_usb_ids[], defined in usual-tables.c. */ /* *The vendor name should be kept at eight characters or less, and * the product name should be kept at 16 characters or less. If a device * has the US_FL_FIX_INQUIRY flag, then the vendor and product names * normally generated by a device through the INQUIRY response will be * taken from this list, and this is the reason for the above size * restriction. However, if the flag is not present, then you * are free to use as many characters as you like. */ #define UNUSUAL_DEV(idVendor, idProduct, bcdDeviceMin, bcdDeviceMax, \ vendor_name, product_name, use_protocol, use_transport, \ init_function, Flags) \ { \ .vendorName = vendor_name, \ .productName = product_name, \ .useProtocol = use_protocol, \ .useTransport = use_transport, \ .initFunction = init_function, \ } #define COMPLIANT_DEV UNUSUAL_DEV #define USUAL_DEV(use_protocol, use_transport) \ { \ .useProtocol = use_protocol, \ .useTransport = use_transport, \ } #define UNUSUAL_VENDOR_INTF(idVendor, cl, sc, pr, \ vendor_name, product_name, use_protocol, use_transport, \ init_function, Flags) \ { \ .vendorName = vendor_name, \ .productName = product_name, \ .useProtocol = use_protocol, \ .useTransport = use_transport, \ .initFunction = init_function, \ } static const struct us_unusual_dev us_unusual_dev_list[] = { # include "unusual_devs.h" { } /* Terminating entry */ }; static const struct us_unusual_dev for_dynamic_ids = USUAL_DEV(USB_SC_SCSI, USB_PR_BULK); #undef UNUSUAL_DEV #undef COMPLIANT_DEV #undef USUAL_DEV #undef UNUSUAL_VENDOR_INTF #ifdef CONFIG_LOCKDEP static struct lock_class_key us_interface_key[USB_MAXINTERFACES]; static void us_set_lock_class(struct mutex *mutex, struct usb_interface *intf) { struct usb_device *udev = interface_to_usbdev(intf); struct usb_host_config *config = udev->actconfig; int i; for (i = 0; i < config->desc.bNumInterfaces; i++) { if (config->interface[i] == intf) break; } BUG_ON(i == config->desc.bNumInterfaces); lockdep_set_class(mutex, &us_interface_key[i]); } #else static void us_set_lock_class(struct mutex *mutex, struct usb_interface *intf) { } #endif #ifdef CONFIG_PM /* Minimal support for suspend and resume */ int usb_stor_suspend(struct usb_interface *iface, pm_message_t message) { struct us_data *us = usb_get_intfdata(iface); /* Wait until no command is running */ mutex_lock(&us->dev_mutex); if (us->suspend_resume_hook) (us->suspend_resume_hook)(us, US_SUSPEND); /* * When runtime PM is working, we'll set a flag to indicate * whether we should autoresume when a SCSI request arrives. */ mutex_unlock(&us->dev_mutex); return 0; } EXPORT_SYMBOL_GPL(usb_stor_suspend); int usb_stor_resume(struct usb_interface *iface) { struct us_data *us = usb_get_intfdata(iface); mutex_lock(&us->dev_mutex); if (us->suspend_resume_hook) (us->suspend_resume_hook)(us, US_RESUME); mutex_unlock(&us->dev_mutex); return 0; } EXPORT_SYMBOL_GPL(usb_stor_resume); int usb_stor_reset_resume(struct usb_interface *iface) { struct us_data *us = usb_get_intfdata(iface); /* Report the reset to the SCSI core */ usb_stor_report_bus_reset(us); /* * If any of the subdrivers implemented a reinitialization scheme, * this is where the callback would be invoked. */ return 0; } EXPORT_SYMBOL_GPL(usb_stor_reset_resume); #endif /* CONFIG_PM */ /* * The next two routines get called just before and just after * a USB port reset, whether from this driver or a different one. */ int usb_stor_pre_reset(struct usb_interface *iface) { struct us_data *us = usb_get_intfdata(iface); /* Make sure no command runs during the reset */ mutex_lock(&us->dev_mutex); return 0; } EXPORT_SYMBOL_GPL(usb_stor_pre_reset); int usb_stor_post_reset(struct usb_interface *iface) { struct us_data *us = usb_get_intfdata(iface); /* Report the reset to the SCSI core */ usb_stor_report_bus_reset(us); /* * If any of the subdrivers implemented a reinitialization scheme, * this is where the callback would be invoked. */ mutex_unlock(&us->dev_mutex); return 0; } EXPORT_SYMBOL_GPL(usb_stor_post_reset); /* * fill_inquiry_response takes an unsigned char array (which must * be at least 36 characters) and populates the vendor name, * product name, and revision fields. Then the array is copied * into the SCSI command's response buffer (oddly enough * called request_buffer). data_len contains the length of the * data array, which again must be at least 36. */ void fill_inquiry_response(struct us_data *us, unsigned char *data, unsigned int data_len) { if (data_len < 36) /* You lose. */ return; memset(data+8, ' ', 28); if (data[0]&0x20) { /* * USB device currently not connected. Return * peripheral qualifier 001b ("...however, the * physical device is not currently connected * to this logical unit") and leave vendor and * product identification empty. ("If the target * does store some of the INQUIRY data on the * device, it may return zeros or ASCII spaces * (20h) in those fields until the data is * available from the device."). */ } else { u16 bcdDevice = le16_to_cpu(us->pusb_dev->descriptor.bcdDevice); int n; n = strlen(us->unusual_dev->vendorName); memcpy(data+8, us->unusual_dev->vendorName, min(8, n)); n = strlen(us->unusual_dev->productName); memcpy(data+16, us->unusual_dev->productName, min(16, n)); data[32] = 0x30 + ((bcdDevice>>12) & 0x0F); data[33] = 0x30 + ((bcdDevice>>8) & 0x0F); data[34] = 0x30 + ((bcdDevice>>4) & 0x0F); data[35] = 0x30 + ((bcdDevice) & 0x0F); } usb_stor_set_xfer_buf(data, data_len, us->srb); } EXPORT_SYMBOL_GPL(fill_inquiry_response); static int usb_stor_control_thread(void * __us) { struct us_data *us = (struct us_data *)__us; struct Scsi_Host *host = us_to_host(us); struct scsi_cmnd *srb; for (;;) { usb_stor_dbg(us, "*** thread sleeping\n"); if (wait_for_completion_interruptible(&us->cmnd_ready)) break; usb_stor_dbg(us, "*** thread awakened\n"); /* lock the device pointers */ mutex_lock(&(us->dev_mutex)); /* lock access to the state */ scsi_lock(host); /* When we are called with no command pending, we're done */ srb = us->srb; if (srb == NULL) { scsi_unlock(host); mutex_unlock(&us->dev_mutex); usb_stor_dbg(us, "-- exiting\n"); break; } /* has the command timed out *already* ? */ if (test_bit(US_FLIDX_TIMED_OUT, &us->dflags)) { srb->result = DID_ABORT << 16; goto SkipForAbort; } scsi_unlock(host); /* * reject the command if the direction indicator * is UNKNOWN */ if (srb->sc_data_direction == DMA_BIDIRECTIONAL) { usb_stor_dbg(us, "UNKNOWN data direction\n"); srb->result = DID_ERROR << 16; } /* * reject if target != 0 or if LUN is higher than * the maximum known LUN */ else if (srb->device->id && !(us->fflags & US_FL_SCM_MULT_TARG)) { usb_stor_dbg(us, "Bad target number (%d:%llu)\n", srb->device->id, srb->device->lun); srb->result = DID_BAD_TARGET << 16; } else if (srb->device->lun > us->max_lun) { usb_stor_dbg(us, "Bad LUN (%d:%llu)\n", srb->device->id, srb->device->lun); srb->result = DID_BAD_TARGET << 16; } /* * Handle those devices which need us to fake * their inquiry data */ else if ((srb->cmnd[0] == INQUIRY) && (us->fflags & US_FL_FIX_INQUIRY)) { unsigned char data_ptr[36] = { 0x00, 0x80, 0x02, 0x02, 0x1F, 0x00, 0x00, 0x00}; usb_stor_dbg(us, "Faking INQUIRY command\n"); fill_inquiry_response(us, data_ptr, 36); srb->result = SAM_STAT_GOOD; } /* we've got a command, let's do it! */ else { US_DEBUG(usb_stor_show_command(us, srb)); us->proto_handler(srb, us); usb_mark_last_busy(us->pusb_dev); } /* lock access to the state */ scsi_lock(host); /* was the command aborted? */ if (srb->result == DID_ABORT << 16) { SkipForAbort: usb_stor_dbg(us, "scsi command aborted\n"); srb = NULL; /* Don't call scsi_done() */ } /* * If an abort request was received we need to signal that * the abort has finished. The proper test for this is * the TIMED_OUT flag, not srb->result == DID_ABORT, because * the timeout might have occurred after the command had * already completed with a different result code. */ if (test_bit(US_FLIDX_TIMED_OUT, &us->dflags)) { complete(&(us->notify)); /* Allow USB transfers to resume */ clear_bit(US_FLIDX_ABORTING, &us->dflags); clear_bit(US_FLIDX_TIMED_OUT, &us->dflags); } /* finished working on this command */ us->srb = NULL; scsi_unlock(host); /* unlock the device pointers */ mutex_unlock(&us->dev_mutex); /* now that the locks are released, notify the SCSI core */ if (srb) { usb_stor_dbg(us, "scsi cmd done, result=0x%x\n", srb->result); scsi_done_direct(srb); } } /* for (;;) */ /* Wait until we are told to stop */ for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (kthread_should_stop()) break; schedule(); } __set_current_state(TASK_RUNNING); return 0; } /*********************************************************************** * Device probing and disconnecting ***********************************************************************/ /* Associate our private data with the USB device */ static int associate_dev(struct us_data *us, struct usb_interface *intf) { /* Fill in the device-related fields */ us->pusb_dev = interface_to_usbdev(intf); us->pusb_intf = intf; us->ifnum = intf->cur_altsetting->desc.bInterfaceNumber; usb_stor_dbg(us, "Vendor: 0x%04x, Product: 0x%04x, Revision: 0x%04x\n", le16_to_cpu(us->pusb_dev->descriptor.idVendor), le16_to_cpu(us->pusb_dev->descriptor.idProduct), le16_to_cpu(us->pusb_dev->descriptor.bcdDevice)); usb_stor_dbg(us, "Interface Subclass: 0x%02x, Protocol: 0x%02x\n", intf->cur_altsetting->desc.bInterfaceSubClass, intf->cur_altsetting->desc.bInterfaceProtocol); /* Store our private data in the interface */ usb_set_intfdata(intf, us); /* Allocate the control/setup and DMA-mapped buffers */ us->cr = kmalloc(sizeof(*us->cr), GFP_KERNEL); if (!us->cr) return -ENOMEM; us->iobuf = usb_alloc_coherent(us->pusb_dev, US_IOBUF_SIZE, GFP_KERNEL, &us->iobuf_dma); if (!us->iobuf) { usb_stor_dbg(us, "I/O buffer allocation failed\n"); return -ENOMEM; } return 0; } /* Works only for digits and letters, but small and fast */ #define TOLOWER(x) ((x) | 0x20) /* Adjust device flags based on the "quirks=" module parameter */ void usb_stor_adjust_quirks(struct usb_device *udev, unsigned long *fflags) { char *p; u16 vid = le16_to_cpu(udev->descriptor.idVendor); u16 pid = le16_to_cpu(udev->descriptor.idProduct); unsigned f = 0; unsigned int mask = (US_FL_SANE_SENSE | US_FL_BAD_SENSE | US_FL_FIX_CAPACITY | US_FL_IGNORE_UAS | US_FL_CAPACITY_HEURISTICS | US_FL_IGNORE_DEVICE | US_FL_NOT_LOCKABLE | US_FL_MAX_SECTORS_64 | US_FL_CAPACITY_OK | US_FL_IGNORE_RESIDUE | US_FL_SINGLE_LUN | US_FL_NO_WP_DETECT | US_FL_NO_READ_DISC_INFO | US_FL_NO_READ_CAPACITY_16 | US_FL_INITIAL_READ10 | US_FL_WRITE_CACHE | US_FL_NO_ATA_1X | US_FL_NO_REPORT_OPCODES | US_FL_MAX_SECTORS_240 | US_FL_NO_REPORT_LUNS | US_FL_ALWAYS_SYNC); p = quirks; while (*p) { /* Each entry consists of VID:PID:flags */ if (vid == simple_strtoul(p, &p, 16) && *p == ':' && pid == simple_strtoul(p+1, &p, 16) && *p == ':') break; /* Move forward to the next entry */ while (*p) { if (*p++ == ',') break; } } if (!*p) /* No match */ return; /* Collect the flags */ while (*++p && *p != ',') { switch (TOLOWER(*p)) { case 'a': f |= US_FL_SANE_SENSE; break; case 'b': f |= US_FL_BAD_SENSE; break; case 'c': f |= US_FL_FIX_CAPACITY; break; case 'd': f |= US_FL_NO_READ_DISC_INFO; break; case 'e': f |= US_FL_NO_READ_CAPACITY_16; break; case 'f': f |= US_FL_NO_REPORT_OPCODES; break; case 'g': f |= US_FL_MAX_SECTORS_240; break; case 'h': f |= US_FL_CAPACITY_HEURISTICS; break; case 'i': f |= US_FL_IGNORE_DEVICE; break; case 'j': f |= US_FL_NO_REPORT_LUNS; break; case 'k': f |= US_FL_NO_SAME; break; case 'l': f |= US_FL_NOT_LOCKABLE; break; case 'm': f |= US_FL_MAX_SECTORS_64; break; case 'n': f |= US_FL_INITIAL_READ10; break; case 'o': f |= US_FL_CAPACITY_OK; break; case 'p': f |= US_FL_WRITE_CACHE; break; case 'r': f |= US_FL_IGNORE_RESIDUE; break; case 's': f |= US_FL_SINGLE_LUN; break; case 't': f |= US_FL_NO_ATA_1X; break; case 'u': f |= US_FL_IGNORE_UAS; break; case 'w': f |= US_FL_NO_WP_DETECT; break; case 'y': f |= US_FL_ALWAYS_SYNC; break; /* Ignore unrecognized flag characters */ } } *fflags = (*fflags & ~mask) | f; } EXPORT_SYMBOL_GPL(usb_stor_adjust_quirks); /* Get the unusual_devs entries and the string descriptors */ static int get_device_info(struct us_data *us, const struct usb_device_id *id, const struct us_unusual_dev *unusual_dev) { struct usb_device *dev = us->pusb_dev; struct usb_interface_descriptor *idesc = &us->pusb_intf->cur_altsetting->desc; struct device *pdev = &us->pusb_intf->dev; /* Store the entries */ us->unusual_dev = unusual_dev; us->subclass = (unusual_dev->useProtocol == USB_SC_DEVICE) ? idesc->bInterfaceSubClass : unusual_dev->useProtocol; us->protocol = (unusual_dev->useTransport == USB_PR_DEVICE) ? idesc->bInterfaceProtocol : unusual_dev->useTransport; us->fflags = id->driver_info; usb_stor_adjust_quirks(us->pusb_dev, &us->fflags); if (us->fflags & US_FL_IGNORE_DEVICE) { dev_info(pdev, "device ignored\n"); return -ENODEV; } /* * This flag is only needed when we're in high-speed, so let's * disable it if we're in full-speed */ if (dev->speed != USB_SPEED_HIGH) us->fflags &= ~US_FL_GO_SLOW; if (us->fflags) dev_info(pdev, "Quirks match for vid %04x pid %04x: %lx\n", le16_to_cpu(dev->descriptor.idVendor), le16_to_cpu(dev->descriptor.idProduct), us->fflags); /* * Log a message if a non-generic unusual_dev entry contains an * unnecessary subclass or protocol override. This may stimulate * reports from users that will help us remove unneeded entries * from the unusual_devs.h table. */ if (id->idVendor || id->idProduct) { static const char *msgs[3] = { "an unneeded SubClass entry", "an unneeded Protocol entry", "unneeded SubClass and Protocol entries"}; struct usb_device_descriptor *ddesc = &dev->descriptor; int msg = -1; if (unusual_dev->useProtocol != USB_SC_DEVICE && us->subclass == idesc->bInterfaceSubClass) msg += 1; if (unusual_dev->useTransport != USB_PR_DEVICE && us->protocol == idesc->bInterfaceProtocol) msg += 2; if (msg >= 0 && !(us->fflags & US_FL_NEED_OVERRIDE)) dev_notice(pdev, "This device " "(%04x,%04x,%04x S %02x P %02x)" " has %s in unusual_devs.h (kernel" " %s)\n" " Please send a copy of this message to " "<linux-usb@vger.kernel.org> and " "<usb-storage@lists.one-eyed-alien.net>\n", le16_to_cpu(ddesc->idVendor), le16_to_cpu(ddesc->idProduct), le16_to_cpu(ddesc->bcdDevice), idesc->bInterfaceSubClass, idesc->bInterfaceProtocol, msgs[msg], utsname()->release); } return 0; } /* Get the transport settings */ static void get_transport(struct us_data *us) { switch (us->protocol) { case USB_PR_CB: us->transport_name = "Control/Bulk"; us->transport = usb_stor_CB_transport; us->transport_reset = usb_stor_CB_reset; us->max_lun = 7; break; case USB_PR_CBI: us->transport_name = "Control/Bulk/Interrupt"; us->transport = usb_stor_CB_transport; us->transport_reset = usb_stor_CB_reset; us->max_lun = 7; break; case USB_PR_BULK: us->transport_name = "Bulk"; us->transport = usb_stor_Bulk_transport; us->transport_reset = usb_stor_Bulk_reset; break; } } /* Get the protocol settings */ static void get_protocol(struct us_data *us) { switch (us->subclass) { case USB_SC_RBC: us->protocol_name = "Reduced Block Commands (RBC)"; us->proto_handler = usb_stor_transparent_scsi_command; break; case USB_SC_8020: us->protocol_name = "8020i"; us->proto_handler = usb_stor_pad12_command; us->max_lun = 0; break; case USB_SC_QIC: us->protocol_name = "QIC-157"; us->proto_handler = usb_stor_pad12_command; us->max_lun = 0; break; case USB_SC_8070: us->protocol_name = "8070i"; us->proto_handler = usb_stor_pad12_command; us->max_lun = 0; break; case USB_SC_SCSI: us->protocol_name = "Transparent SCSI"; us->proto_handler = usb_stor_transparent_scsi_command; break; case USB_SC_UFI: us->protocol_name = "Uniform Floppy Interface (UFI)"; us->proto_handler = usb_stor_ufi_command; break; } } /* Get the pipe settings */ static int get_pipes(struct us_data *us) { struct usb_host_interface *alt = us->pusb_intf->cur_altsetting; struct usb_endpoint_descriptor *ep_in; struct usb_endpoint_descriptor *ep_out; struct usb_endpoint_descriptor *ep_int; int res; /* * Find the first endpoint of each type we need. * We are expecting a minimum of 2 endpoints - in and out (bulk). * An optional interrupt-in is OK (necessary for CBI protocol). * We will ignore any others. */ res = usb_find_common_endpoints(alt, &ep_in, &ep_out, NULL, NULL); if (res) { usb_stor_dbg(us, "bulk endpoints not found\n"); return res; } res = usb_find_int_in_endpoint(alt, &ep_int); if (res && us->protocol == USB_PR_CBI) { usb_stor_dbg(us, "interrupt endpoint not found\n"); return res; } /* Calculate and store the pipe values */ us->send_ctrl_pipe = usb_sndctrlpipe(us->pusb_dev, 0); us->recv_ctrl_pipe = usb_rcvctrlpipe(us->pusb_dev, 0); us->send_bulk_pipe = usb_sndbulkpipe(us->pusb_dev, usb_endpoint_num(ep_out)); us->recv_bulk_pipe = usb_rcvbulkpipe(us->pusb_dev, usb_endpoint_num(ep_in)); if (ep_int) { us->recv_intr_pipe = usb_rcvintpipe(us->pusb_dev, usb_endpoint_num(ep_int)); us->ep_bInterval = ep_int->bInterval; } return 0; } /* Initialize all the dynamic resources we need */ static int usb_stor_acquire_resources(struct us_data *us) { int p; struct task_struct *th; us->current_urb = usb_alloc_urb(0, GFP_KERNEL); if (!us->current_urb) return -ENOMEM; /* * Just before we start our control thread, initialize * the device if it needs initialization */ if (us->unusual_dev->initFunction) { p = us->unusual_dev->initFunction(us); if (p) return p; } /* Start up our control thread */ th = kthread_run(usb_stor_control_thread, us, "usb-storage"); if (IS_ERR(th)) { dev_warn(&us->pusb_intf->dev, "Unable to start control thread\n"); return PTR_ERR(th); } us->ctl_thread = th; return 0; } /* Release all our dynamic resources */ static void usb_stor_release_resources(struct us_data *us) { /* * Tell the control thread to exit. The SCSI host must * already have been removed and the DISCONNECTING flag set * so that we won't accept any more commands. */ usb_stor_dbg(us, "-- sending exit command to thread\n"); complete(&us->cmnd_ready); if (us->ctl_thread) kthread_stop(us->ctl_thread); /* Call the destructor routine, if it exists */ if (us->extra_destructor) { usb_stor_dbg(us, "-- calling extra_destructor()\n"); us->extra_destructor(us->extra); } /* Free the extra data and the URB */ kfree(us->extra); usb_free_urb(us->current_urb); } /* Dissociate from the USB device */ static void dissociate_dev(struct us_data *us) { /* Free the buffers */ kfree(us->cr); usb_free_coherent(us->pusb_dev, US_IOBUF_SIZE, us->iobuf, us->iobuf_dma); /* Remove our private data from the interface */ usb_set_intfdata(us->pusb_intf, NULL); } /* * First stage of disconnect processing: stop SCSI scanning, * remove the host, and stop accepting new commands */ static void quiesce_and_remove_host(struct us_data *us) { struct Scsi_Host *host = us_to_host(us); /* If the device is really gone, cut short reset delays */ if (us->pusb_dev->state == USB_STATE_NOTATTACHED) { set_bit(US_FLIDX_DISCONNECTING, &us->dflags); wake_up(&us->delay_wait); } /* * Prevent SCSI scanning (if it hasn't started yet) * or wait for the SCSI-scanning routine to stop. */ cancel_delayed_work_sync(&us->scan_dwork); /* Balance autopm calls if scanning was cancelled */ if (test_bit(US_FLIDX_SCAN_PENDING, &us->dflags)) usb_autopm_put_interface_no_suspend(us->pusb_intf); /* * Removing the host will perform an orderly shutdown: caches * synchronized, disks spun down, etc. */ scsi_remove_host(host); /* * Prevent any new commands from being accepted and cut short * reset delays. */ scsi_lock(host); set_bit(US_FLIDX_DISCONNECTING, &us->dflags); scsi_unlock(host); wake_up(&us->delay_wait); } /* Second stage of disconnect processing: deallocate all resources */ static void release_everything(struct us_data *us) { usb_stor_release_resources(us); dissociate_dev(us); /* * Drop our reference to the host; the SCSI core will free it * (and "us" along with it) when the refcount becomes 0. */ scsi_host_put(us_to_host(us)); } /* Delayed-work routine to carry out SCSI-device scanning */ static void usb_stor_scan_dwork(struct work_struct *work) { struct us_data *us = container_of(work, struct us_data, scan_dwork.work); struct device *dev = &us->pusb_intf->dev; dev_dbg(dev, "starting scan\n"); /* For bulk-only devices, determine the max LUN value */ if (us->protocol == USB_PR_BULK && !(us->fflags & US_FL_SINGLE_LUN) && !(us->fflags & US_FL_SCM_MULT_TARG)) { mutex_lock(&us->dev_mutex); us->max_lun = usb_stor_Bulk_max_lun(us); /* * Allow proper scanning of devices that present more than 8 LUNs * While not affecting other devices that may need the previous * behavior */ if (us->max_lun >= 8) us_to_host(us)->max_lun = us->max_lun+1; mutex_unlock(&us->dev_mutex); } scsi_scan_host(us_to_host(us)); dev_dbg(dev, "scan complete\n"); /* Should we unbind if no devices were detected? */ usb_autopm_put_interface(us->pusb_intf); clear_bit(US_FLIDX_SCAN_PENDING, &us->dflags); } static unsigned int usb_stor_sg_tablesize(struct usb_interface *intf) { struct usb_device *usb_dev = interface_to_usbdev(intf); if (usb_dev->bus->sg_tablesize) { return usb_dev->bus->sg_tablesize; } return SG_ALL; } /* First part of general USB mass-storage probing */ int usb_stor_probe1(struct us_data **pus, struct usb_interface *intf, const struct usb_device_id *id, const struct us_unusual_dev *unusual_dev, const struct scsi_host_template *sht) { struct Scsi_Host *host; struct us_data *us; int result; dev_info(&intf->dev, "USB Mass Storage device detected\n"); /* * Ask the SCSI layer to allocate a host structure, with extra * space at the end for our private us_data structure. */ host = scsi_host_alloc(sht, sizeof(*us)); if (!host) { dev_warn(&intf->dev, "Unable to allocate the scsi host\n"); return -ENOMEM; } /* * Allow 16-byte CDBs and thus > 2TB */ host->max_cmd_len = 16; host->sg_tablesize = usb_stor_sg_tablesize(intf); *pus = us = host_to_us(host); mutex_init(&(us->dev_mutex)); us_set_lock_class(&us->dev_mutex, intf); init_completion(&us->cmnd_ready); init_completion(&(us->notify)); init_waitqueue_head(&us->delay_wait); INIT_DELAYED_WORK(&us->scan_dwork, usb_stor_scan_dwork); /* Associate the us_data structure with the USB device */ result = associate_dev(us, intf); if (result) goto BadDevice; /* Get the unusual_devs entries and the descriptors */ result = get_device_info(us, id, unusual_dev); if (result) goto BadDevice; /* Get standard transport and protocol settings */ get_transport(us); get_protocol(us); /* * Give the caller a chance to fill in specialized transport * or protocol settings. */ return 0; BadDevice: usb_stor_dbg(us, "storage_probe() failed\n"); release_everything(us); return result; } EXPORT_SYMBOL_GPL(usb_stor_probe1); /* Second part of general USB mass-storage probing */ int usb_stor_probe2(struct us_data *us) { int result; struct device *dev = &us->pusb_intf->dev; /* Make sure the transport and protocol have both been set */ if (!us->transport || !us->proto_handler) { result = -ENXIO; goto BadDevice; } usb_stor_dbg(us, "Transport: %s\n", us->transport_name); usb_stor_dbg(us, "Protocol: %s\n", us->protocol_name); if (us->fflags & US_FL_SCM_MULT_TARG) { /* * SCM eUSCSI bridge devices can have different numbers * of LUNs on different targets; allow all to be probed. */ us->max_lun = 7; /* The eUSCSI itself has ID 7, so avoid scanning that */ us_to_host(us)->this_id = 7; /* max_id is 8 initially, so no need to set it here */ } else { /* In the normal case there is only a single target */ us_to_host(us)->max_id = 1; /* * Like Windows, we won't store the LUN bits in CDB[1] for * SCSI-2 devices using the Bulk-Only transport (even though * this violates the SCSI spec). */ if (us->transport == usb_stor_Bulk_transport) us_to_host(us)->no_scsi2_lun_in_cdb = 1; } /* fix for single-lun devices */ if (us->fflags & US_FL_SINGLE_LUN) us->max_lun = 0; /* Find the endpoints and calculate pipe values */ result = get_pipes(us); if (result) goto BadDevice; /* * If the device returns invalid data for the first READ(10) * command, indicate the command should be retried. */ if (us->fflags & US_FL_INITIAL_READ10) set_bit(US_FLIDX_REDO_READ10, &us->dflags); /* Acquire all the other resources and add the host */ result = usb_stor_acquire_resources(us); if (result) goto BadDevice; usb_autopm_get_interface_no_resume(us->pusb_intf); snprintf(us->scsi_name, sizeof(us->scsi_name), "usb-storage %s", dev_name(&us->pusb_intf->dev)); result = scsi_add_host(us_to_host(us), dev); if (result) { dev_warn(dev, "Unable to add the scsi host\n"); goto HostAddErr; } /* Submit the delayed_work for SCSI-device scanning */ set_bit(US_FLIDX_SCAN_PENDING, &us->dflags); if (delay_use > 0) dev_dbg(dev, "waiting for device to settle before scanning\n"); queue_delayed_work(system_freezable_wq, &us->scan_dwork, delay_use * HZ); return 0; /* We come here if there are any problems */ HostAddErr: usb_autopm_put_interface_no_suspend(us->pusb_intf); BadDevice: usb_stor_dbg(us, "storage_probe() failed\n"); release_everything(us); return result; } EXPORT_SYMBOL_GPL(usb_stor_probe2); /* Handle a USB mass-storage disconnect */ void usb_stor_disconnect(struct usb_interface *intf) { struct us_data *us = usb_get_intfdata(intf); quiesce_and_remove_host(us); release_everything(us); } EXPORT_SYMBOL_GPL(usb_stor_disconnect); static struct scsi_host_template usb_stor_host_template; /* The main probe routine for standard devices */ static int storage_probe(struct usb_interface *intf, const struct usb_device_id *id) { const struct us_unusual_dev *unusual_dev; struct us_data *us; int result; int size; /* If uas is enabled and this device can do uas then ignore it. */ #if IS_ENABLED(CONFIG_USB_UAS) if (uas_use_uas_driver(intf, id, NULL)) return -ENXIO; #endif /* * If the device isn't standard (is handled by a subdriver * module) then don't accept it. */ if (usb_usual_ignore_device(intf)) return -ENXIO; /* * Call the general probe procedures. * * The unusual_dev_list array is parallel to the usb_storage_usb_ids * table, so we use the index of the id entry to find the * corresponding unusual_devs entry. */ size = ARRAY_SIZE(us_unusual_dev_list); if (id >= usb_storage_usb_ids && id < usb_storage_usb_ids + size) { unusual_dev = (id - usb_storage_usb_ids) + us_unusual_dev_list; } else { unusual_dev = &for_dynamic_ids; dev_dbg(&intf->dev, "Use Bulk-Only transport with the Transparent SCSI protocol for dynamic id: 0x%04x 0x%04x\n", id->idVendor, id->idProduct); } result = usb_stor_probe1(&us, intf, id, unusual_dev, &usb_stor_host_template); if (result) return result; /* No special transport or protocol settings in the main module */ result = usb_stor_probe2(us); return result; } static struct usb_driver usb_storage_driver = { .name = DRV_NAME, .probe = storage_probe, .disconnect = usb_stor_disconnect, .suspend = usb_stor_suspend, .resume = usb_stor_resume, .reset_resume = usb_stor_reset_resume, .pre_reset = usb_stor_pre_reset, .post_reset = usb_stor_post_reset, .id_table = usb_storage_usb_ids, .supports_autosuspend = 1, .soft_unbind = 1, }; module_usb_stor_driver(usb_storage_driver, usb_stor_host_template, DRV_NAME); |
| 41 41 41 8 33 41 41 41 41 41 41 41 41 41 41 480 41 480 480 480 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 | // SPDX-License-Identifier: GPL-2.0-or-later /* */ #include <linux/gfp.h> #include <linux/init.h> #include <linux/ratelimit.h> #include <linux/usb.h> #include <linux/usb/audio.h> #include <linux/slab.h> #include <sound/core.h> #include <sound/pcm.h> #include <sound/pcm_params.h> #include "usbaudio.h" #include "helper.h" #include "card.h" #include "endpoint.h" #include "pcm.h" #include "clock.h" #include "quirks.h" enum { EP_STATE_STOPPED, EP_STATE_RUNNING, EP_STATE_STOPPING, }; /* interface refcounting */ struct snd_usb_iface_ref { unsigned char iface; bool need_setup; int opened; int altset; struct list_head list; }; /* clock refcounting */ struct snd_usb_clock_ref { unsigned char clock; atomic_t locked; int opened; int rate; bool need_setup; struct list_head list; }; /* * snd_usb_endpoint is a model that abstracts everything related to an * USB endpoint and its streaming. * * There are functions to activate and deactivate the streaming URBs and * optional callbacks to let the pcm logic handle the actual content of the * packets for playback and record. Thus, the bus streaming and the audio * handlers are fully decoupled. * * There are two different types of endpoints in audio applications. * * SND_USB_ENDPOINT_TYPE_DATA handles full audio data payload for both * inbound and outbound traffic. * * SND_USB_ENDPOINT_TYPE_SYNC endpoints are for inbound traffic only and * expect the payload to carry Q10.14 / Q16.16 formatted sync information * (3 or 4 bytes). * * Each endpoint has to be configured prior to being used by calling * snd_usb_endpoint_set_params(). * * The model incorporates a reference counting, so that multiple users * can call snd_usb_endpoint_start() and snd_usb_endpoint_stop(), and * only the first user will effectively start the URBs, and only the last * one to stop it will tear the URBs down again. */ /* * convert a sampling rate into our full speed format (fs/1000 in Q16.16) * this will overflow at approx 524 kHz */ static inline unsigned get_usb_full_speed_rate(unsigned int rate) { return ((rate << 13) + 62) / 125; } /* * convert a sampling rate into USB high speed format (fs/8000 in Q16.16) * this will overflow at approx 4 MHz */ static inline unsigned get_usb_high_speed_rate(unsigned int rate) { return ((rate << 10) + 62) / 125; } /* * release a urb data */ static void release_urb_ctx(struct snd_urb_ctx *u) { if (u->urb && u->buffer_size) usb_free_coherent(u->ep->chip->dev, u->buffer_size, u->urb->transfer_buffer, u->urb->transfer_dma); usb_free_urb(u->urb); u->urb = NULL; u->buffer_size = 0; } static const char *usb_error_string(int err) { switch (err) { case -ENODEV: return "no device"; case -ENOENT: return "endpoint not enabled"; case -EPIPE: return "endpoint stalled"; case -ENOSPC: return "not enough bandwidth"; case -ESHUTDOWN: return "device disabled"; case -EHOSTUNREACH: return "device suspended"; case -EINVAL: case -EAGAIN: case -EFBIG: case -EMSGSIZE: return "internal error"; default: return "unknown error"; } } static inline bool ep_state_running(struct snd_usb_endpoint *ep) { return atomic_read(&ep->state) == EP_STATE_RUNNING; } static inline bool ep_state_update(struct snd_usb_endpoint *ep, int old, int new) { return atomic_try_cmpxchg(&ep->state, &old, new); } /** * snd_usb_endpoint_implicit_feedback_sink: Report endpoint usage type * * @ep: The snd_usb_endpoint * * Determine whether an endpoint is driven by an implicit feedback * data endpoint source. */ int snd_usb_endpoint_implicit_feedback_sink(struct snd_usb_endpoint *ep) { return ep->implicit_fb_sync && usb_pipeout(ep->pipe); } /* * Return the number of samples to be sent in the next packet * for streaming based on information derived from sync endpoints * * This won't be used for implicit feedback which takes the packet size * returned from the sync source */ static int slave_next_packet_size(struct snd_usb_endpoint *ep, unsigned int avail) { unsigned long flags; unsigned int phase; int ret; if (ep->fill_max) return ep->maxframesize; spin_lock_irqsave(&ep->lock, flags); phase = (ep->phase & 0xffff) + (ep->freqm << ep->datainterval); ret = min(phase >> 16, ep->maxframesize); if (avail && ret >= avail) ret = -EAGAIN; else ep->phase = phase; spin_unlock_irqrestore(&ep->lock, flags); return ret; } /* * Return the number of samples to be sent in the next packet * for adaptive and synchronous endpoints */ static int next_packet_size(struct snd_usb_endpoint *ep, unsigned int avail) { unsigned int sample_accum; int ret; if (ep->fill_max) return ep->maxframesize; sample_accum = ep->sample_accum + ep->sample_rem; if (sample_accum >= ep->pps) { sample_accum -= ep->pps; ret = ep->packsize[1]; } else { ret = ep->packsize[0]; } if (avail && ret >= avail) ret = -EAGAIN; else ep->sample_accum = sample_accum; return ret; } /* * snd_usb_endpoint_next_packet_size: Return the number of samples to be sent * in the next packet * * If the size is equal or exceeds @avail, don't proceed but return -EAGAIN * Exception: @avail = 0 for skipping the check. */ int snd_usb_endpoint_next_packet_size(struct snd_usb_endpoint *ep, struct snd_urb_ctx *ctx, int idx, unsigned int avail) { unsigned int packet; packet = ctx->packet_size[idx]; if (packet) { if (avail && packet >= avail) return -EAGAIN; return packet; } if (ep->sync_source) return slave_next_packet_size(ep, avail); else return next_packet_size(ep, avail); } static void call_retire_callback(struct snd_usb_endpoint *ep, struct urb *urb) { struct snd_usb_substream *data_subs; data_subs = READ_ONCE(ep->data_subs); if (data_subs && ep->retire_data_urb) ep->retire_data_urb(data_subs, urb); } static void retire_outbound_urb(struct snd_usb_endpoint *ep, struct snd_urb_ctx *urb_ctx) { call_retire_callback(ep, urb_ctx->urb); } static void snd_usb_handle_sync_urb(struct snd_usb_endpoint *ep, struct snd_usb_endpoint *sender, const struct urb *urb); static void retire_inbound_urb(struct snd_usb_endpoint *ep, struct snd_urb_ctx *urb_ctx) { struct urb *urb = urb_ctx->urb; struct snd_usb_endpoint *sync_sink; if (unlikely(ep->skip_packets > 0)) { ep->skip_packets--; return; } sync_sink = READ_ONCE(ep->sync_sink); if (sync_sink) snd_usb_handle_sync_urb(sync_sink, ep, urb); call_retire_callback(ep, urb); } static inline bool has_tx_length_quirk(struct snd_usb_audio *chip) { return chip->quirk_flags & QUIRK_FLAG_TX_LENGTH; } static void prepare_silent_urb(struct snd_usb_endpoint *ep, struct snd_urb_ctx *ctx) { struct urb *urb = ctx->urb; unsigned int offs = 0; unsigned int extra = 0; __le32 packet_length; int i; /* For tx_length_quirk, put packet length at start of packet */ if (has_tx_length_quirk(ep->chip)) extra = sizeof(packet_length); for (i = 0; i < ctx->packets; ++i) { unsigned int offset; unsigned int length; int counts; counts = snd_usb_endpoint_next_packet_size(ep, ctx, i, 0); length = counts * ep->stride; /* number of silent bytes */ offset = offs * ep->stride + extra * i; urb->iso_frame_desc[i].offset = offset; urb->iso_frame_desc[i].length = length + extra; if (extra) { packet_length = cpu_to_le32(length); memcpy(urb->transfer_buffer + offset, &packet_length, sizeof(packet_length)); } memset(urb->transfer_buffer + offset + extra, ep->silence_value, length); offs += counts; } urb->number_of_packets = ctx->packets; urb->transfer_buffer_length = offs * ep->stride + ctx->packets * extra; ctx->queued = 0; } /* * Prepare a PLAYBACK urb for submission to the bus. */ static int prepare_outbound_urb(struct snd_usb_endpoint *ep, struct snd_urb_ctx *ctx, bool in_stream_lock) { struct urb *urb = ctx->urb; unsigned char *cp = urb->transfer_buffer; struct snd_usb_substream *data_subs; urb->dev = ep->chip->dev; /* we need to set this at each time */ switch (ep->type) { case SND_USB_ENDPOINT_TYPE_DATA: data_subs = READ_ONCE(ep->data_subs); if (data_subs && ep->prepare_data_urb) return ep->prepare_data_urb(data_subs, urb, in_stream_lock); /* no data provider, so send silence */ prepare_silent_urb(ep, ctx); break; case SND_USB_ENDPOINT_TYPE_SYNC: if (snd_usb_get_speed(ep->chip->dev) >= USB_SPEED_HIGH) { /* * fill the length and offset of each urb descriptor. * the fixed 12.13 frequency is passed as 16.16 through the pipe. */ urb->iso_frame_desc[0].length = 4; urb->iso_frame_desc[0].offset = 0; cp[0] = ep->freqn; cp[1] = ep->freqn >> 8; cp[2] = ep->freqn >> 16; cp[3] = ep->freqn >> 24; } else { /* * fill the length and offset of each urb descriptor. * the fixed 10.14 frequency is passed through the pipe. */ urb->iso_frame_desc[0].length = 3; urb->iso_frame_desc[0].offset = 0; cp[0] = ep->freqn >> 2; cp[1] = ep->freqn >> 10; cp[2] = ep->freqn >> 18; } break; } return 0; } /* * Prepare a CAPTURE or SYNC urb for submission to the bus. */ static int prepare_inbound_urb(struct snd_usb_endpoint *ep, struct snd_urb_ctx *urb_ctx) { int i, offs; struct urb *urb = urb_ctx->urb; urb->dev = ep->chip->dev; /* we need to set this at each time */ switch (ep->type) { case SND_USB_ENDPOINT_TYPE_DATA: offs = 0; for (i = 0; i < urb_ctx->packets; i++) { urb->iso_frame_desc[i].offset = offs; urb->iso_frame_desc[i].length = ep->curpacksize; offs += ep->curpacksize; } urb->transfer_buffer_length = offs; urb->number_of_packets = urb_ctx->packets; break; case SND_USB_ENDPOINT_TYPE_SYNC: urb->iso_frame_desc[0].length = min(4u, ep->syncmaxsize); urb->iso_frame_desc[0].offset = 0; break; } return 0; } /* notify an error as XRUN to the assigned PCM data substream */ static void notify_xrun(struct snd_usb_endpoint *ep) { struct snd_usb_substream *data_subs; data_subs = READ_ONCE(ep->data_subs); if (data_subs && data_subs->pcm_substream) snd_pcm_stop_xrun(data_subs->pcm_substream); } static struct snd_usb_packet_info * next_packet_fifo_enqueue(struct snd_usb_endpoint *ep) { struct snd_usb_packet_info *p; p = ep->next_packet + (ep->next_packet_head + ep->next_packet_queued) % ARRAY_SIZE(ep->next_packet); ep->next_packet_queued++; return p; } static struct snd_usb_packet_info * next_packet_fifo_dequeue(struct snd_usb_endpoint *ep) { struct snd_usb_packet_info *p; p = ep->next_packet + ep->next_packet_head; ep->next_packet_head++; ep->next_packet_head %= ARRAY_SIZE(ep->next_packet); ep->next_packet_queued--; return p; } static void push_back_to_ready_list(struct snd_usb_endpoint *ep, struct snd_urb_ctx *ctx) { unsigned long flags; spin_lock_irqsave(&ep->lock, flags); list_add_tail(&ctx->ready_list, &ep->ready_playback_urbs); spin_unlock_irqrestore(&ep->lock, flags); } /* * Send output urbs that have been prepared previously. URBs are dequeued * from ep->ready_playback_urbs and in case there aren't any available * or there are no packets that have been prepared, this function does * nothing. * * The reason why the functionality of sending and preparing URBs is separated * is that host controllers don't guarantee the order in which they return * inbound and outbound packets to their submitters. * * This function is used both for implicit feedback endpoints and in low- * latency playback mode. */ int snd_usb_queue_pending_output_urbs(struct snd_usb_endpoint *ep, bool in_stream_lock) { bool implicit_fb = snd_usb_endpoint_implicit_feedback_sink(ep); while (ep_state_running(ep)) { unsigned long flags; struct snd_usb_packet_info *packet; struct snd_urb_ctx *ctx = NULL; int err, i; spin_lock_irqsave(&ep->lock, flags); if ((!implicit_fb || ep->next_packet_queued > 0) && !list_empty(&ep->ready_playback_urbs)) { /* take URB out of FIFO */ ctx = list_first_entry(&ep->ready_playback_urbs, struct snd_urb_ctx, ready_list); list_del_init(&ctx->ready_list); if (implicit_fb) packet = next_packet_fifo_dequeue(ep); } spin_unlock_irqrestore(&ep->lock, flags); if (ctx == NULL) break; /* copy over the length information */ if (implicit_fb) { for (i = 0; i < packet->packets; i++) ctx->packet_size[i] = packet->packet_size[i]; } /* call the data handler to fill in playback data */ err = prepare_outbound_urb(ep, ctx, in_stream_lock); /* can be stopped during prepare callback */ if (unlikely(!ep_state_running(ep))) break; if (err < 0) { /* push back to ready list again for -EAGAIN */ if (err == -EAGAIN) { push_back_to_ready_list(ep, ctx); break; } if (!in_stream_lock) notify_xrun(ep); return -EPIPE; } if (!atomic_read(&ep->chip->shutdown)) err = usb_submit_urb(ctx->urb, GFP_ATOMIC); else err = -ENODEV; if (err < 0) { if (!atomic_read(&ep->chip->shutdown)) { usb_audio_err(ep->chip, "Unable to submit urb #%d: %d at %s\n", ctx->index, err, __func__); if (!in_stream_lock) notify_xrun(ep); } return -EPIPE; } set_bit(ctx->index, &ep->active_mask); atomic_inc(&ep->submitted_urbs); } return 0; } /* * complete callback for urbs */ static void snd_complete_urb(struct urb *urb) { struct snd_urb_ctx *ctx = urb->context; struct snd_usb_endpoint *ep = ctx->ep; int err; if (unlikely(urb->status == -ENOENT || /* unlinked */ urb->status == -ENODEV || /* device removed */ urb->status == -ECONNRESET || /* unlinked */ urb->status == -ESHUTDOWN)) /* device disabled */ goto exit_clear; /* device disconnected */ if (unlikely(atomic_read(&ep->chip->shutdown))) goto exit_clear; if (unlikely(!ep_state_running(ep))) goto exit_clear; if (usb_pipeout(ep->pipe)) { retire_outbound_urb(ep, ctx); /* can be stopped during retire callback */ if (unlikely(!ep_state_running(ep))) goto exit_clear; /* in low-latency and implicit-feedback modes, push back the * URB to ready list at first, then process as much as possible */ if (ep->lowlatency_playback || snd_usb_endpoint_implicit_feedback_sink(ep)) { push_back_to_ready_list(ep, ctx); clear_bit(ctx->index, &ep->active_mask); snd_usb_queue_pending_output_urbs(ep, false); atomic_dec(&ep->submitted_urbs); /* decrement at last */ return; } /* in non-lowlatency mode, no error handling for prepare */ prepare_outbound_urb(ep, ctx, false); /* can be stopped during prepare callback */ if (unlikely(!ep_state_running(ep))) goto exit_clear; } else { retire_inbound_urb(ep, ctx); /* can be stopped during retire callback */ if (unlikely(!ep_state_running(ep))) goto exit_clear; prepare_inbound_urb(ep, ctx); } if (!atomic_read(&ep->chip->shutdown)) err = usb_submit_urb(urb, GFP_ATOMIC); else err = -ENODEV; if (err == 0) return; if (!atomic_read(&ep->chip->shutdown)) { usb_audio_err(ep->chip, "cannot submit urb (err = %d)\n", err); notify_xrun(ep); } exit_clear: clear_bit(ctx->index, &ep->active_mask); atomic_dec(&ep->submitted_urbs); } /* * Find or create a refcount object for the given interface * * The objects are released altogether in snd_usb_endpoint_free_all() */ static struct snd_usb_iface_ref * iface_ref_find(struct snd_usb_audio *chip, int iface) { struct snd_usb_iface_ref *ip; list_for_each_entry(ip, &chip->iface_ref_list, list) if (ip->iface == iface) return ip; ip = kzalloc(sizeof(*ip), GFP_KERNEL); if (!ip) return NULL; ip->iface = iface; list_add_tail(&ip->list, &chip->iface_ref_list); return ip; } /* Similarly, a refcount object for clock */ static struct snd_usb_clock_ref * clock_ref_find(struct snd_usb_audio *chip, int clock) { struct snd_usb_clock_ref *ref; list_for_each_entry(ref, &chip->clock_ref_list, list) if (ref->clock == clock) return ref; ref = kzalloc(sizeof(*ref), GFP_KERNEL); if (!ref) return NULL; ref->clock = clock; atomic_set(&ref->locked, 0); list_add_tail(&ref->list, &chip->clock_ref_list); return ref; } /* * Get the existing endpoint object corresponding EP * Returns NULL if not present. */ struct snd_usb_endpoint * snd_usb_get_endpoint(struct snd_usb_audio *chip, int ep_num) { struct snd_usb_endpoint *ep; list_for_each_entry(ep, &chip->ep_list, list) { if (ep->ep_num == ep_num) return ep; } return NULL; } #define ep_type_name(type) \ (type == SND_USB_ENDPOINT_TYPE_DATA ? "data" : "sync") /** * snd_usb_add_endpoint: Add an endpoint to an USB audio chip * * @chip: The chip * @ep_num: The number of the endpoint to use * @type: SND_USB_ENDPOINT_TYPE_DATA or SND_USB_ENDPOINT_TYPE_SYNC * * If the requested endpoint has not been added to the given chip before, * a new instance is created. * * Returns zero on success or a negative error code. * * New endpoints will be added to chip->ep_list and freed by * calling snd_usb_endpoint_free_all(). * * For SND_USB_ENDPOINT_TYPE_SYNC, the caller needs to guarantee that * bNumEndpoints > 1 beforehand. */ int snd_usb_add_endpoint(struct snd_usb_audio *chip, int ep_num, int type) { struct snd_usb_endpoint *ep; bool is_playback; ep = snd_usb_get_endpoint(chip, ep_num); if (ep) return 0; usb_audio_dbg(chip, "Creating new %s endpoint #%x\n", ep_type_name(type), ep_num); ep = kzalloc(sizeof(*ep), GFP_KERNEL); if (!ep) return -ENOMEM; ep->chip = chip; spin_lock_init(&ep->lock); ep->type = type; ep->ep_num = ep_num; INIT_LIST_HEAD(&ep->ready_playback_urbs); atomic_set(&ep->submitted_urbs, 0); is_playback = ((ep_num & USB_ENDPOINT_DIR_MASK) == USB_DIR_OUT); ep_num &= USB_ENDPOINT_NUMBER_MASK; if (is_playback) ep->pipe = usb_sndisocpipe(chip->dev, ep_num); else ep->pipe = usb_rcvisocpipe(chip->dev, ep_num); list_add_tail(&ep->list, &chip->ep_list); return 0; } /* Set up syncinterval and maxsyncsize for a sync EP */ static void endpoint_set_syncinterval(struct snd_usb_audio *chip, struct snd_usb_endpoint *ep) { struct usb_host_interface *alts; struct usb_endpoint_descriptor *desc; alts = snd_usb_get_host_interface(chip, ep->iface, ep->altsetting); if (!alts) return; desc = get_endpoint(alts, ep->ep_idx); if (desc->bLength >= USB_DT_ENDPOINT_AUDIO_SIZE && desc->bRefresh >= 1 && desc->bRefresh <= 9) ep->syncinterval = desc->bRefresh; else if (snd_usb_get_speed(chip->dev) == USB_SPEED_FULL) ep->syncinterval = 1; else if (desc->bInterval >= 1 && desc->bInterval <= 16) ep->syncinterval = desc->bInterval - 1; else ep->syncinterval = 3; ep->syncmaxsize = le16_to_cpu(desc->wMaxPacketSize); } static bool endpoint_compatible(struct snd_usb_endpoint *ep, const struct audioformat *fp, const struct snd_pcm_hw_params *params) { if (!ep->opened) return false; if (ep->cur_audiofmt != fp) return false; if (ep->cur_rate != params_rate(params) || ep->cur_format != params_format(params) || ep->cur_period_frames != params_period_size(params) || ep->cur_buffer_periods != params_periods(params)) return false; return true; } /* * Check whether the given fp and hw params are compatible with the current * setup of the target EP for implicit feedback sync */ bool snd_usb_endpoint_compatible(struct snd_usb_audio *chip, struct snd_usb_endpoint *ep, const struct audioformat *fp, const struct snd_pcm_hw_params *params) { bool ret; mutex_lock(&chip->mutex); ret = endpoint_compatible(ep, fp, params); mutex_unlock(&chip->mutex); return ret; } /* * snd_usb_endpoint_open: Open the endpoint * * Called from hw_params to assign the endpoint to the substream. * It's reference-counted, and only the first opener is allowed to set up * arbitrary parameters. The later opener must be compatible with the * former opened parameters. * The endpoint needs to be closed via snd_usb_endpoint_close() later. * * Note that this function doesn't configure the endpoint. The substream * needs to set it up later via snd_usb_endpoint_set_params() and * snd_usb_endpoint_prepare(). */ struct snd_usb_endpoint * snd_usb_endpoint_open(struct snd_usb_audio *chip, const struct audioformat *fp, const struct snd_pcm_hw_params *params, bool is_sync_ep, bool fixed_rate) { struct snd_usb_endpoint *ep; int ep_num = is_sync_ep ? fp->sync_ep : fp->endpoint; mutex_lock(&chip->mutex); ep = snd_usb_get_endpoint(chip, ep_num); if (!ep) { usb_audio_err(chip, "Cannot find EP 0x%x to open\n", ep_num); goto unlock; } if (!ep->opened) { if (is_sync_ep) { ep->iface = fp->sync_iface; ep->altsetting = fp->sync_altsetting; ep->ep_idx = fp->sync_ep_idx; } else { ep->iface = fp->iface; ep->altsetting = fp->altsetting; ep->ep_idx = fp->ep_idx; } usb_audio_dbg(chip, "Open EP 0x%x, iface=%d:%d, idx=%d\n", ep_num, ep->iface, ep->altsetting, ep->ep_idx); ep->iface_ref = iface_ref_find(chip, ep->iface); if (!ep->iface_ref) { ep = NULL; goto unlock; } if (fp->protocol != UAC_VERSION_1) { ep->clock_ref = clock_ref_find(chip, fp->clock); if (!ep->clock_ref) { ep = NULL; goto unlock; } ep->clock_ref->opened++; } ep->cur_audiofmt = fp; ep->cur_channels = fp->channels; ep->cur_rate = params_rate(params); ep->cur_format = params_format(params); ep->cur_frame_bytes = snd_pcm_format_physical_width(ep->cur_format) * ep->cur_channels / 8; ep->cur_period_frames = params_period_size(params); ep->cur_period_bytes = ep->cur_period_frames * ep->cur_frame_bytes; ep->cur_buffer_periods = params_periods(params); if (ep->type == SND_USB_ENDPOINT_TYPE_SYNC) endpoint_set_syncinterval(chip, ep); ep->implicit_fb_sync = fp->implicit_fb; ep->need_setup = true; ep->need_prepare = true; ep->fixed_rate = fixed_rate; usb_audio_dbg(chip, " channels=%d, rate=%d, format=%s, period_bytes=%d, periods=%d, implicit_fb=%d\n", ep->cur_channels, ep->cur_rate, snd_pcm_format_name(ep->cur_format), ep->cur_period_bytes, ep->cur_buffer_periods, ep->implicit_fb_sync); } else { if (WARN_ON(!ep->iface_ref)) { ep = NULL; goto unlock; } if (!endpoint_compatible(ep, fp, params)) { usb_audio_err(chip, "Incompatible EP setup for 0x%x\n", ep_num); ep = NULL; goto unlock; } usb_audio_dbg(chip, "Reopened EP 0x%x (count %d)\n", ep_num, ep->opened); } if (!ep->iface_ref->opened++) ep->iface_ref->need_setup = true; ep->opened++; unlock: mutex_unlock(&chip->mutex); return ep; } /* * snd_usb_endpoint_set_sync: Link data and sync endpoints * * Pass NULL to sync_ep to unlink again */ void snd_usb_endpoint_set_sync(struct snd_usb_audio *chip, struct snd_usb_endpoint *data_ep, struct snd_usb_endpoint *sync_ep) { data_ep->sync_source = sync_ep; } /* * Set data endpoint callbacks and the assigned data stream * * Called at PCM trigger and cleanups. * Pass NULL to deactivate each callback. */ void snd_usb_endpoint_set_callback(struct snd_usb_endpoint *ep, int (*prepare)(struct snd_usb_substream *subs, struct urb *urb, bool in_stream_lock), void (*retire)(struct snd_usb_substream *subs, struct urb *urb), struct snd_usb_substream *data_subs) { ep->prepare_data_urb = prepare; ep->retire_data_urb = retire; if (data_subs) ep->lowlatency_playback = data_subs->lowlatency_playback; else ep->lowlatency_playback = false; WRITE_ONCE(ep->data_subs, data_subs); } static int endpoint_set_interface(struct snd_usb_audio *chip, struct snd_usb_endpoint *ep, bool set) { int altset = set ? ep->altsetting : 0; int err; if (ep->iface_ref->altset == altset) return 0; usb_audio_dbg(chip, "Setting usb interface %d:%d for EP 0x%x\n", ep->iface, altset, ep->ep_num); err = usb_set_interface(chip->dev, ep->iface, altset); if (err < 0) { usb_audio_err_ratelimited( chip, "%d:%d: usb_set_interface failed (%d)\n", ep->iface, altset, err); return err; } if (chip->quirk_flags & QUIRK_FLAG_IFACE_DELAY) msleep(50); ep->iface_ref->altset = altset; return 0; } /* * snd_usb_endpoint_close: Close the endpoint * * Unreference the already opened endpoint via snd_usb_endpoint_open(). */ void snd_usb_endpoint_close(struct snd_usb_audio *chip, struct snd_usb_endpoint *ep) { mutex_lock(&chip->mutex); usb_audio_dbg(chip, "Closing EP 0x%x (count %d)\n", ep->ep_num, ep->opened); if (!--ep->iface_ref->opened && !(chip->quirk_flags & QUIRK_FLAG_IFACE_SKIP_CLOSE)) endpoint_set_interface(chip, ep, false); if (!--ep->opened) { if (ep->clock_ref) { if (!--ep->clock_ref->opened) ep->clock_ref->rate = 0; } ep->iface = 0; ep->altsetting = 0; ep->cur_audiofmt = NULL; ep->cur_rate = 0; ep->iface_ref = NULL; ep->clock_ref = NULL; usb_audio_dbg(chip, "EP 0x%x closed\n", ep->ep_num); } mutex_unlock(&chip->mutex); } /* Prepare for suspening EP, called from the main suspend handler */ void snd_usb_endpoint_suspend(struct snd_usb_endpoint *ep) { ep->need_prepare = true; if (ep->iface_ref) ep->iface_ref->need_setup = true; if (ep->clock_ref) ep->clock_ref->rate = 0; } /* * wait until all urbs are processed. */ static int wait_clear_urbs(struct snd_usb_endpoint *ep) { unsigned long end_time = jiffies + msecs_to_jiffies(1000); int alive; if (atomic_read(&ep->state) != EP_STATE_STOPPING) return 0; do { alive = atomic_read(&ep->submitted_urbs); if (!alive) break; schedule_timeout_uninterruptible(1); } while (time_before(jiffies, end_time)); if (alive) usb_audio_err(ep->chip, "timeout: still %d active urbs on EP #%x\n", alive, ep->ep_num); if (ep_state_update(ep, EP_STATE_STOPPING, EP_STATE_STOPPED)) { ep->sync_sink = NULL; snd_usb_endpoint_set_callback(ep, NULL, NULL, NULL); } return 0; } /* sync the pending stop operation; * this function itself doesn't trigger the stop operation */ void snd_usb_endpoint_sync_pending_stop(struct snd_usb_endpoint *ep) { if (ep) wait_clear_urbs(ep); } /* * Stop active urbs * * This function moves the EP to STOPPING state if it's being RUNNING. */ static int stop_urbs(struct snd_usb_endpoint *ep, bool force, bool keep_pending) { unsigned int i; unsigned long flags; if (!force && atomic_read(&ep->running)) return -EBUSY; if (!ep_state_update(ep, EP_STATE_RUNNING, EP_STATE_STOPPING)) return 0; spin_lock_irqsave(&ep->lock, flags); INIT_LIST_HEAD(&ep->ready_playback_urbs); ep->next_packet_head = 0; ep->next_packet_queued = 0; spin_unlock_irqrestore(&ep->lock, flags); if (keep_pending) return 0; for (i = 0; i < ep->nurbs; i++) { if (test_bit(i, &ep->active_mask)) { if (!test_and_set_bit(i, &ep->unlink_mask)) { struct urb *u = ep->urb[i].urb; usb_unlink_urb(u); } } } return 0; } /* * release an endpoint's urbs */ static int release_urbs(struct snd_usb_endpoint *ep, bool force) { int i, err; /* route incoming urbs to nirvana */ snd_usb_endpoint_set_callback(ep, NULL, NULL, NULL); /* stop and unlink urbs */ err = stop_urbs(ep, force, false); if (err) return err; wait_clear_urbs(ep); for (i = 0; i < ep->nurbs; i++) release_urb_ctx(&ep->urb[i]); usb_free_coherent(ep->chip->dev, SYNC_URBS * 4, ep->syncbuf, ep->sync_dma); ep->syncbuf = NULL; ep->nurbs = 0; return 0; } /* * configure a data endpoint */ static int data_ep_set_params(struct snd_usb_endpoint *ep) { struct snd_usb_audio *chip = ep->chip; unsigned int maxsize, minsize, packs_per_ms, max_packs_per_urb; unsigned int max_packs_per_period, urbs_per_period, urb_packs; unsigned int max_urbs, i; const struct audioformat *fmt = ep->cur_audiofmt; int frame_bits = ep->cur_frame_bytes * 8; int tx_length_quirk = (has_tx_length_quirk(chip) && usb_pipeout(ep->pipe)); usb_audio_dbg(chip, "Setting params for data EP 0x%x, pipe 0x%x\n", ep->ep_num, ep->pipe); if (ep->cur_format == SNDRV_PCM_FORMAT_DSD_U16_LE && fmt->dsd_dop) { /* * When operating in DSD DOP mode, the size of a sample frame * in hardware differs from the actual physical format width * because we need to make room for the DOP markers. */ frame_bits += ep->cur_channels << 3; } ep->datainterval = fmt->datainterval; ep->stride = frame_bits >> 3; switch (ep->cur_format) { case SNDRV_PCM_FORMAT_U8: ep->silence_value = 0x80; break; case SNDRV_PCM_FORMAT_DSD_U8: case SNDRV_PCM_FORMAT_DSD_U16_LE: case SNDRV_PCM_FORMAT_DSD_U32_LE: case SNDRV_PCM_FORMAT_DSD_U16_BE: case SNDRV_PCM_FORMAT_DSD_U32_BE: ep->silence_value = 0x69; break; default: ep->silence_value = 0; } /* assume max. frequency is 50% higher than nominal */ ep->freqmax = ep->freqn + (ep->freqn >> 1); /* Round up freqmax to nearest integer in order to calculate maximum * packet size, which must represent a whole number of frames. * This is accomplished by adding 0x0.ffff before converting the * Q16.16 format into integer. * In order to accurately calculate the maximum packet size when * the data interval is more than 1 (i.e. ep->datainterval > 0), * multiply by the data interval prior to rounding. For instance, * a freqmax of 41 kHz will result in a max packet size of 6 (5.125) * frames with a data interval of 1, but 11 (10.25) frames with a * data interval of 2. * (ep->freqmax << ep->datainterval overflows at 8.192 MHz for the * maximum datainterval value of 3, at USB full speed, higher for * USB high speed, noting that ep->freqmax is in units of * frames per packet in Q16.16 format.) */ maxsize = (((ep->freqmax << ep->datainterval) + 0xffff) >> 16) * (frame_bits >> 3); if (tx_length_quirk) maxsize += sizeof(__le32); /* Space for length descriptor */ /* but wMaxPacketSize might reduce this */ if (ep->maxpacksize && ep->maxpacksize < maxsize) { /* whatever fits into a max. size packet */ unsigned int data_maxsize = maxsize = ep->maxpacksize; if (tx_length_quirk) /* Need to remove the length descriptor to calc freq */ data_maxsize -= sizeof(__le32); ep->freqmax = (data_maxsize / (frame_bits >> 3)) << (16 - ep->datainterval); } if (ep->fill_max) ep->curpacksize = ep->maxpacksize; else ep->curpacksize = maxsize; if (snd_usb_get_speed(chip->dev) != USB_SPEED_FULL) { packs_per_ms = 8 >> ep->datainterval; max_packs_per_urb = MAX_PACKS_HS; } else { packs_per_ms = 1; max_packs_per_urb = MAX_PACKS; } if (ep->sync_source && !ep->implicit_fb_sync) max_packs_per_urb = min(max_packs_per_urb, 1U << ep->sync_source->syncinterval); max_packs_per_urb = max(1u, max_packs_per_urb >> ep->datainterval); /* * Capture endpoints need to use small URBs because there's no way * to tell in advance where the next period will end, and we don't * want the next URB to complete much after the period ends. * * Playback endpoints with implicit sync much use the same parameters * as their corresponding capture endpoint. */ if (usb_pipein(ep->pipe) || ep->implicit_fb_sync) { /* make capture URBs <= 1 ms and smaller than a period */ urb_packs = min(max_packs_per_urb, packs_per_ms); while (urb_packs > 1 && urb_packs * maxsize >= ep->cur_period_bytes) urb_packs >>= 1; ep->nurbs = MAX_URBS; /* * Playback endpoints without implicit sync are adjusted so that * a period fits as evenly as possible in the smallest number of * URBs. The total number of URBs is adjusted to the size of the * ALSA buffer, subject to the MAX_URBS and MAX_QUEUE limits. */ } else { /* determine how small a packet can be */ minsize = (ep->freqn >> (16 - ep->datainterval)) * (frame_bits >> 3); /* with sync from device, assume it can be 12% lower */ if (ep->sync_source) minsize -= minsize >> 3; minsize = max(minsize, 1u); /* how many packets will contain an entire ALSA period? */ max_packs_per_period = DIV_ROUND_UP(ep->cur_period_bytes, minsize); /* how many URBs will contain a period? */ urbs_per_period = DIV_ROUND_UP(max_packs_per_period, max_packs_per_urb); /* how many packets are needed in each URB? */ urb_packs = DIV_ROUND_UP(max_packs_per_period, urbs_per_period); /* limit the number of frames in a single URB */ ep->max_urb_frames = DIV_ROUND_UP(ep->cur_period_frames, urbs_per_period); /* try to use enough URBs to contain an entire ALSA buffer */ max_urbs = min((unsigned) MAX_URBS, MAX_QUEUE * packs_per_ms / urb_packs); ep->nurbs = min(max_urbs, urbs_per_period * ep->cur_buffer_periods); } /* allocate and initialize data urbs */ for (i = 0; i < ep->nurbs; i++) { struct snd_urb_ctx *u = &ep->urb[i]; u->index = i; u->ep = ep; u->packets = urb_packs; u->buffer_size = maxsize * u->packets; if (fmt->fmt_type == UAC_FORMAT_TYPE_II) u->packets++; /* for transfer delimiter */ u->urb = usb_alloc_urb(u->packets, GFP_KERNEL); if (!u->urb) goto out_of_memory; u->urb->transfer_buffer = usb_alloc_coherent(chip->dev, u->buffer_size, GFP_KERNEL, &u->urb->transfer_dma); if (!u->urb->transfer_buffer) goto out_of_memory; u->urb->pipe = ep->pipe; u->urb->transfer_flags = URB_NO_TRANSFER_DMA_MAP; u->urb->interval = 1 << ep->datainterval; u->urb->context = u; u->urb->complete = snd_complete_urb; INIT_LIST_HEAD(&u->ready_list); } return 0; out_of_memory: release_urbs(ep, false); return -ENOMEM; } /* * configure a sync endpoint */ static int sync_ep_set_params(struct snd_usb_endpoint *ep) { struct snd_usb_audio *chip = ep->chip; int i; usb_audio_dbg(chip, "Setting params for sync EP 0x%x, pipe 0x%x\n", ep->ep_num, ep->pipe); ep->syncbuf = usb_alloc_coherent(chip->dev, SYNC_URBS * 4, GFP_KERNEL, &ep->sync_dma); if (!ep->syncbuf) return -ENOMEM; ep->nurbs = SYNC_URBS; for (i = 0; i < SYNC_URBS; i++) { struct snd_urb_ctx *u = &ep->urb[i]; u->index = i; u->ep = ep; u->packets = 1; u->urb = usb_alloc_urb(1, GFP_KERNEL); if (!u->urb) goto out_of_memory; u->urb->transfer_buffer = ep->syncbuf + i * 4; u->urb->transfer_dma = ep->sync_dma + i * 4; u->urb->transfer_buffer_length = 4; u->urb->pipe = ep->pipe; u->urb->transfer_flags = URB_NO_TRANSFER_DMA_MAP; u->urb->number_of_packets = 1; u->urb->interval = 1 << ep->syncinterval; u->urb->context = u; u->urb->complete = snd_complete_urb; } return 0; out_of_memory: release_urbs(ep, false); return -ENOMEM; } /* update the rate of the referred clock; return the actual rate */ static int update_clock_ref_rate(struct snd_usb_audio *chip, struct snd_usb_endpoint *ep) { struct snd_usb_clock_ref *clock = ep->clock_ref; int rate = ep->cur_rate; if (!clock || clock->rate == rate) return rate; if (clock->rate) { if (atomic_read(&clock->locked)) return clock->rate; if (clock->rate != rate) { usb_audio_err(chip, "Mismatched sample rate %d vs %d for EP 0x%x\n", clock->rate, rate, ep->ep_num); return clock->rate; } } clock->rate = rate; clock->need_setup = true; return rate; } /* * snd_usb_endpoint_set_params: configure an snd_usb_endpoint * * It's called either from hw_params callback. * Determine the number of URBs to be used on this endpoint. * An endpoint must be configured before it can be started. * An endpoint that is already running can not be reconfigured. */ int snd_usb_endpoint_set_params(struct snd_usb_audio *chip, struct snd_usb_endpoint *ep) { const struct audioformat *fmt = ep->cur_audiofmt; int err = 0; mutex_lock(&chip->mutex); if (!ep->need_setup) goto unlock; /* release old buffers, if any */ err = release_urbs(ep, false); if (err < 0) goto unlock; ep->datainterval = fmt->datainterval; ep->maxpacksize = fmt->maxpacksize; ep->fill_max = !!(fmt->attributes & UAC_EP_CS_ATTR_FILL_MAX); if (snd_usb_get_speed(chip->dev) == USB_SPEED_FULL) { ep->freqn = get_usb_full_speed_rate(ep->cur_rate); ep->pps = 1000 >> ep->datainterval; } else { ep->freqn = get_usb_high_speed_rate(ep->cur_rate); ep->pps = 8000 >> ep->datainterval; } ep->sample_rem = ep->cur_rate % ep->pps; ep->packsize[0] = ep->cur_rate / ep->pps; ep->packsize[1] = (ep->cur_rate + (ep->pps - 1)) / ep->pps; /* calculate the frequency in 16.16 format */ ep->freqm = ep->freqn; ep->freqshift = INT_MIN; ep->phase = 0; switch (ep->type) { case SND_USB_ENDPOINT_TYPE_DATA: err = data_ep_set_params(ep); break; case SND_USB_ENDPOINT_TYPE_SYNC: err = sync_ep_set_params(ep); break; default: err = -EINVAL; } usb_audio_dbg(chip, "Set up %d URBS, ret=%d\n", ep->nurbs, err); if (err < 0) goto unlock; /* some unit conversions in runtime */ ep->maxframesize = ep->maxpacksize / ep->cur_frame_bytes; ep->curframesize = ep->curpacksize / ep->cur_frame_bytes; err = update_clock_ref_rate(chip, ep); if (err >= 0) { ep->need_setup = false; err = 0; } unlock: mutex_unlock(&chip->mutex); return err; } static int init_sample_rate(struct snd_usb_audio *chip, struct snd_usb_endpoint *ep) { struct snd_usb_clock_ref *clock = ep->clock_ref; int rate, err; rate = update_clock_ref_rate(chip, ep); if (rate < 0) return rate; if (clock && !clock->need_setup) return 0; if (!ep->fixed_rate) { err = snd_usb_init_sample_rate(chip, ep->cur_audiofmt, rate); if (err < 0) { if (clock) clock->rate = 0; /* reset rate */ return err; } } if (clock) clock->need_setup = false; return 0; } /* * snd_usb_endpoint_prepare: Prepare the endpoint * * This function sets up the EP to be fully usable state. * It's called either from prepare callback. * The function checks need_setup flag, and performs nothing unless needed, * so it's safe to call this multiple times. * * This returns zero if unchanged, 1 if the configuration has changed, * or a negative error code. */ int snd_usb_endpoint_prepare(struct snd_usb_audio *chip, struct snd_usb_endpoint *ep) { bool iface_first; int err = 0; mutex_lock(&chip->mutex); if (WARN_ON(!ep->iface_ref)) goto unlock; if (!ep->need_prepare) goto unlock; /* If the interface has been already set up, just set EP parameters */ if (!ep->iface_ref->need_setup) { /* sample rate setup of UAC1 is per endpoint, and we need * to update at each EP configuration */ if (ep->cur_audiofmt->protocol == UAC_VERSION_1) { err = init_sample_rate(chip, ep); if (err < 0) goto unlock; } goto done; } /* Need to deselect altsetting at first */ endpoint_set_interface(chip, ep, false); /* Some UAC1 devices (e.g. Yamaha THR10) need the host interface * to be set up before parameter setups */ iface_first = ep->cur_audiofmt->protocol == UAC_VERSION_1; /* Workaround for devices that require the interface setup at first like UAC1 */ if (chip->quirk_flags & QUIRK_FLAG_SET_IFACE_FIRST) iface_first = true; if (iface_first) { err = endpoint_set_interface(chip, ep, true); if (err < 0) goto unlock; } err = snd_usb_init_pitch(chip, ep->cur_audiofmt); if (err < 0) goto unlock; err = init_sample_rate(chip, ep); if (err < 0) goto unlock; err = snd_usb_select_mode_quirk(chip, ep->cur_audiofmt); if (err < 0) goto unlock; /* for UAC2/3, enable the interface altset here at last */ if (!iface_first) { err = endpoint_set_interface(chip, ep, true); if (err < 0) goto unlock; } ep->iface_ref->need_setup = false; done: ep->need_prepare = false; err = 1; unlock: mutex_unlock(&chip->mutex); return err; } /* get the current rate set to the given clock by any endpoint */ int snd_usb_endpoint_get_clock_rate(struct snd_usb_audio *chip, int clock) { struct snd_usb_clock_ref *ref; int rate = 0; if (!clock) return 0; mutex_lock(&chip->mutex); list_for_each_entry(ref, &chip->clock_ref_list, list) { if (ref->clock == clock) { rate = ref->rate; break; } } mutex_unlock(&chip->mutex); return rate; } /** * snd_usb_endpoint_start: start an snd_usb_endpoint * * @ep: the endpoint to start * * A call to this function will increment the running count of the endpoint. * In case it is not already running, the URBs for this endpoint will be * submitted. Otherwise, this function does nothing. * * Must be balanced to calls of snd_usb_endpoint_stop(). * * Returns an error if the URB submission failed, 0 in all other cases. */ int snd_usb_endpoint_start(struct snd_usb_endpoint *ep) { bool is_playback = usb_pipeout(ep->pipe); int err; unsigned int i; if (atomic_read(&ep->chip->shutdown)) return -EBADFD; if (ep->sync_source) WRITE_ONCE(ep->sync_source->sync_sink, ep); usb_audio_dbg(ep->chip, "Starting %s EP 0x%x (running %d)\n", ep_type_name(ep->type), ep->ep_num, atomic_read(&ep->running)); /* already running? */ if (atomic_inc_return(&ep->running) != 1) return 0; if (ep->clock_ref) atomic_inc(&ep->clock_ref->locked); ep->active_mask = 0; ep->unlink_mask = 0; ep->phase = 0; ep->sample_accum = 0; snd_usb_endpoint_start_quirk(ep); /* * If this endpoint has a data endpoint as implicit feedback source, * don't start the urbs here. Instead, mark them all as available, * wait for the record urbs to return and queue the playback urbs * from that context. */ if (!ep_state_update(ep, EP_STATE_STOPPED, EP_STATE_RUNNING)) goto __error; if (snd_usb_endpoint_implicit_feedback_sink(ep) && !(ep->chip->quirk_flags & QUIRK_FLAG_PLAYBACK_FIRST)) { usb_audio_dbg(ep->chip, "No URB submission due to implicit fb sync\n"); i = 0; goto fill_rest; } for (i = 0; i < ep->nurbs; i++) { struct urb *urb = ep->urb[i].urb; if (snd_BUG_ON(!urb)) goto __error; if (is_playback) err = prepare_outbound_urb(ep, urb->context, true); else err = prepare_inbound_urb(ep, urb->context); if (err < 0) { /* stop filling at applptr */ if (err == -EAGAIN) break; usb_audio_dbg(ep->chip, "EP 0x%x: failed to prepare urb: %d\n", ep->ep_num, err); goto __error; } if (!atomic_read(&ep->chip->shutdown)) err = usb_submit_urb(urb, GFP_ATOMIC); else err = -ENODEV; if (err < 0) { if (!atomic_read(&ep->chip->shutdown)) usb_audio_err(ep->chip, "cannot submit urb %d, error %d: %s\n", i, err, usb_error_string(err)); goto __error; } set_bit(i, &ep->active_mask); atomic_inc(&ep->submitted_urbs); } if (!i) { usb_audio_dbg(ep->chip, "XRUN at starting EP 0x%x\n", ep->ep_num); goto __error; } usb_audio_dbg(ep->chip, "%d URBs submitted for EP 0x%x\n", i, ep->ep_num); fill_rest: /* put the remaining URBs to ready list */ if (is_playback) { for (; i < ep->nurbs; i++) push_back_to_ready_list(ep, ep->urb + i); } return 0; __error: snd_usb_endpoint_stop(ep, false); return -EPIPE; } /** * snd_usb_endpoint_stop: stop an snd_usb_endpoint * * @ep: the endpoint to stop (may be NULL) * @keep_pending: keep in-flight URBs * * A call to this function will decrement the running count of the endpoint. * In case the last user has requested the endpoint stop, the URBs will * actually be deactivated. * * Must be balanced to calls of snd_usb_endpoint_start(). * * The caller needs to synchronize the pending stop operation via * snd_usb_endpoint_sync_pending_stop(). */ void snd_usb_endpoint_stop(struct snd_usb_endpoint *ep, bool keep_pending) { if (!ep) return; usb_audio_dbg(ep->chip, "Stopping %s EP 0x%x (running %d)\n", ep_type_name(ep->type), ep->ep_num, atomic_read(&ep->running)); if (snd_BUG_ON(!atomic_read(&ep->running))) return; if (!atomic_dec_return(&ep->running)) { if (ep->sync_source) WRITE_ONCE(ep->sync_source->sync_sink, NULL); stop_urbs(ep, false, keep_pending); if (ep->clock_ref) atomic_dec(&ep->clock_ref->locked); if (ep->chip->quirk_flags & QUIRK_FLAG_FORCE_IFACE_RESET && usb_pipeout(ep->pipe)) { ep->need_prepare = true; if (ep->iface_ref) ep->iface_ref->need_setup = true; } } } /** * snd_usb_endpoint_release: Tear down an snd_usb_endpoint * * @ep: the endpoint to release * * This function does not care for the endpoint's running count but will tear * down all the streaming URBs immediately. */ void snd_usb_endpoint_release(struct snd_usb_endpoint *ep) { release_urbs(ep, true); } /** * snd_usb_endpoint_free_all: Free the resources of an snd_usb_endpoint * @chip: The chip * * This free all endpoints and those resources */ void snd_usb_endpoint_free_all(struct snd_usb_audio *chip) { struct snd_usb_endpoint *ep, *en; struct snd_usb_iface_ref *ip, *in; struct snd_usb_clock_ref *cp, *cn; list_for_each_entry_safe(ep, en, &chip->ep_list, list) kfree(ep); list_for_each_entry_safe(ip, in, &chip->iface_ref_list, list) kfree(ip); list_for_each_entry_safe(cp, cn, &chip->clock_ref_list, list) kfree(cp); } /* * snd_usb_handle_sync_urb: parse an USB sync packet * * @ep: the endpoint to handle the packet * @sender: the sending endpoint * @urb: the received packet * * This function is called from the context of an endpoint that received * the packet and is used to let another endpoint object handle the payload. */ static void snd_usb_handle_sync_urb(struct snd_usb_endpoint *ep, struct snd_usb_endpoint *sender, const struct urb *urb) { int shift; unsigned int f; unsigned long flags; snd_BUG_ON(ep == sender); /* * In case the endpoint is operating in implicit feedback mode, prepare * a new outbound URB that has the same layout as the received packet * and add it to the list of pending urbs. queue_pending_output_urbs() * will take care of them later. */ if (snd_usb_endpoint_implicit_feedback_sink(ep) && atomic_read(&ep->running)) { /* implicit feedback case */ int i, bytes = 0; struct snd_urb_ctx *in_ctx; struct snd_usb_packet_info *out_packet; in_ctx = urb->context; /* Count overall packet size */ for (i = 0; i < in_ctx->packets; i++) if (urb->iso_frame_desc[i].status == 0) bytes += urb->iso_frame_desc[i].actual_length; /* * skip empty packets. At least M-Audio's Fast Track Ultra stops * streaming once it received a 0-byte OUT URB */ if (bytes == 0) return; spin_lock_irqsave(&ep->lock, flags); if (ep->next_packet_queued >= ARRAY_SIZE(ep->next_packet)) { spin_unlock_irqrestore(&ep->lock, flags); usb_audio_err(ep->chip, "next package FIFO overflow EP 0x%x\n", ep->ep_num); notify_xrun(ep); return; } out_packet = next_packet_fifo_enqueue(ep); /* * Iterate through the inbound packet and prepare the lengths * for the output packet. The OUT packet we are about to send * will have the same amount of payload bytes per stride as the * IN packet we just received. Since the actual size is scaled * by the stride, use the sender stride to calculate the length * in case the number of channels differ between the implicitly * fed-back endpoint and the synchronizing endpoint. */ out_packet->packets = in_ctx->packets; for (i = 0; i < in_ctx->packets; i++) { if (urb->iso_frame_desc[i].status == 0) out_packet->packet_size[i] = urb->iso_frame_desc[i].actual_length / sender->stride; else out_packet->packet_size[i] = 0; } spin_unlock_irqrestore(&ep->lock, flags); snd_usb_queue_pending_output_urbs(ep, false); return; } /* * process after playback sync complete * * Full speed devices report feedback values in 10.14 format as samples * per frame, high speed devices in 16.16 format as samples per * microframe. * * Because the Audio Class 1 spec was written before USB 2.0, many high * speed devices use a wrong interpretation, some others use an * entirely different format. * * Therefore, we cannot predict what format any particular device uses * and must detect it automatically. */ if (urb->iso_frame_desc[0].status != 0 || urb->iso_frame_desc[0].actual_length < 3) return; f = le32_to_cpup(urb->transfer_buffer); if (urb->iso_frame_desc[0].actual_length == 3) f &= 0x00ffffff; else f &= 0x0fffffff; if (f == 0) return; if (unlikely(sender->tenor_fb_quirk)) { /* * Devices based on Tenor 8802 chipsets (TEAC UD-H01 * and others) sometimes change the feedback value * by +/- 0x1.0000. */ if (f < ep->freqn - 0x8000) f += 0xf000; else if (f > ep->freqn + 0x8000) f -= 0xf000; } else if (unlikely(ep->freqshift == INT_MIN)) { /* * The first time we see a feedback value, determine its format * by shifting it left or right until it matches the nominal * frequency value. This assumes that the feedback does not * differ from the nominal value more than +50% or -25%. */ shift = 0; while (f < ep->freqn - ep->freqn / 4) { f <<= 1; shift++; } while (f > ep->freqn + ep->freqn / 2) { f >>= 1; shift--; } ep->freqshift = shift; } else if (ep->freqshift >= 0) f <<= ep->freqshift; else f >>= -ep->freqshift; if (likely(f >= ep->freqn - ep->freqn / 8 && f <= ep->freqmax)) { /* * If the frequency looks valid, set it. * This value is referred to in prepare_playback_urb(). */ spin_lock_irqsave(&ep->lock, flags); ep->freqm = f; spin_unlock_irqrestore(&ep->lock, flags); } else { /* * Out of range; maybe the shift value is wrong. * Reset it so that we autodetect again the next time. */ ep->freqshift = INT_MIN; } } |
| 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Apple "Magic" Wireless Mouse driver * * Copyright (c) 2010 Michael Poole <mdpoole@troilus.org> * Copyright (c) 2010 Chase Douglas <chase.douglas@canonical.com> */ /* */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/device.h> #include <linux/hid.h> #include <linux/input/mt.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/workqueue.h> #include "hid-ids.h" static bool emulate_3button = true; module_param(emulate_3button, bool, 0644); MODULE_PARM_DESC(emulate_3button, "Emulate a middle button"); static int middle_button_start = -350; static int middle_button_stop = +350; static bool emulate_scroll_wheel = true; module_param(emulate_scroll_wheel, bool, 0644); MODULE_PARM_DESC(emulate_scroll_wheel, "Emulate a scroll wheel"); static unsigned int scroll_speed = 32; static int param_set_scroll_speed(const char *val, const struct kernel_param *kp) { unsigned long speed; if (!val || kstrtoul(val, 0, &speed) || speed > 63) return -EINVAL; scroll_speed = speed; return 0; } module_param_call(scroll_speed, param_set_scroll_speed, param_get_uint, &scroll_speed, 0644); MODULE_PARM_DESC(scroll_speed, "Scroll speed, value from 0 (slow) to 63 (fast)"); static bool scroll_acceleration = false; module_param(scroll_acceleration, bool, 0644); MODULE_PARM_DESC(scroll_acceleration, "Accelerate sequential scroll events"); static bool report_undeciphered; module_param(report_undeciphered, bool, 0644); MODULE_PARM_DESC(report_undeciphered, "Report undeciphered multi-touch state field using a MSC_RAW event"); #define TRACKPAD2_2021_BT_VERSION 0x110 #define TRACKPAD_REPORT_ID 0x28 #define TRACKPAD2_USB_REPORT_ID 0x02 #define TRACKPAD2_BT_REPORT_ID 0x31 #define MOUSE_REPORT_ID 0x29 #define MOUSE2_REPORT_ID 0x12 #define DOUBLE_REPORT_ID 0xf7 #define USB_BATTERY_TIMEOUT_MS 60000 /* These definitions are not precise, but they're close enough. (Bits * 0x03 seem to indicate the aspect ratio of the touch, bits 0x70 seem * to be some kind of bit mask -- 0x20 may be a near-field reading, * and 0x40 is actual contact, and 0x10 may be a start/stop or change * indication.) */ #define TOUCH_STATE_MASK 0xf0 #define TOUCH_STATE_NONE 0x00 #define TOUCH_STATE_START 0x30 #define TOUCH_STATE_DRAG 0x40 /* Number of high-resolution events for each low-resolution detent. */ #define SCROLL_HR_STEPS 10 #define SCROLL_HR_MULT (120 / SCROLL_HR_STEPS) #define SCROLL_HR_THRESHOLD 90 /* units */ #define SCROLL_ACCEL_DEFAULT 7 /* Touch surface information. Dimension is in hundredths of a mm, min and max * are in units. */ #define MOUSE_DIMENSION_X (float)9056 #define MOUSE_MIN_X -1100 #define MOUSE_MAX_X 1258 #define MOUSE_RES_X ((MOUSE_MAX_X - MOUSE_MIN_X) / (MOUSE_DIMENSION_X / 100)) #define MOUSE_DIMENSION_Y (float)5152 #define MOUSE_MIN_Y -1589 #define MOUSE_MAX_Y 2047 #define MOUSE_RES_Y ((MOUSE_MAX_Y - MOUSE_MIN_Y) / (MOUSE_DIMENSION_Y / 100)) #define TRACKPAD_DIMENSION_X (float)13000 #define TRACKPAD_MIN_X -2909 #define TRACKPAD_MAX_X 3167 #define TRACKPAD_RES_X \ ((TRACKPAD_MAX_X - TRACKPAD_MIN_X) / (TRACKPAD_DIMENSION_X / 100)) #define TRACKPAD_DIMENSION_Y (float)11000 #define TRACKPAD_MIN_Y -2456 #define TRACKPAD_MAX_Y 2565 #define TRACKPAD_RES_Y \ ((TRACKPAD_MAX_Y - TRACKPAD_MIN_Y) / (TRACKPAD_DIMENSION_Y / 100)) #define TRACKPAD2_DIMENSION_X (float)16000 #define TRACKPAD2_MIN_X -3678 #define TRACKPAD2_MAX_X 3934 #define TRACKPAD2_RES_X \ ((TRACKPAD2_MAX_X - TRACKPAD2_MIN_X) / (TRACKPAD2_DIMENSION_X / 100)) #define TRACKPAD2_DIMENSION_Y (float)11490 #define TRACKPAD2_MIN_Y -2478 #define TRACKPAD2_MAX_Y 2587 #define TRACKPAD2_RES_Y \ ((TRACKPAD2_MAX_Y - TRACKPAD2_MIN_Y) / (TRACKPAD2_DIMENSION_Y / 100)) /** * struct magicmouse_sc - Tracks Magic Mouse-specific data. * @input: Input device through which we report events. * @quirks: Currently unused. * @ntouches: Number of touches in most recent touch report. * @scroll_accel: Number of consecutive scroll motions. * @scroll_jiffies: Time of last scroll motion. * @touches: Most recent data for a touch, indexed by tracking ID. * @tracking_ids: Mapping of current touch input data to @touches. */ struct magicmouse_sc { struct input_dev *input; unsigned long quirks; int ntouches; int scroll_accel; unsigned long scroll_jiffies; struct { short x; short y; short scroll_x; short scroll_y; short scroll_x_hr; short scroll_y_hr; u8 size; bool scroll_x_active; bool scroll_y_active; } touches[16]; int tracking_ids[16]; struct hid_device *hdev; struct delayed_work work; struct timer_list battery_timer; }; static int magicmouse_firm_touch(struct magicmouse_sc *msc) { int touch = -1; int ii; /* If there is only one "firm" touch, set touch to its * tracking ID. */ for (ii = 0; ii < msc->ntouches; ii++) { int idx = msc->tracking_ids[ii]; if (msc->touches[idx].size < 8) { /* Ignore this touch. */ } else if (touch >= 0) { touch = -1; break; } else { touch = idx; } } return touch; } static void magicmouse_emit_buttons(struct magicmouse_sc *msc, int state) { int last_state = test_bit(BTN_LEFT, msc->input->key) << 0 | test_bit(BTN_RIGHT, msc->input->key) << 1 | test_bit(BTN_MIDDLE, msc->input->key) << 2; if (emulate_3button) { int id; /* If some button was pressed before, keep it held * down. Otherwise, if there's exactly one firm * touch, use that to override the mouse's guess. */ if (state == 0) { /* The button was released. */ } else if (last_state != 0) { state = last_state; } else if ((id = magicmouse_firm_touch(msc)) >= 0) { int x = msc->touches[id].x; if (x < middle_button_start) state = 1; else if (x > middle_button_stop) state = 2; else state = 4; } /* else: we keep the mouse's guess */ input_report_key(msc->input, BTN_MIDDLE, state & 4); } input_report_key(msc->input, BTN_LEFT, state & 1); input_report_key(msc->input, BTN_RIGHT, state & 2); if (state != last_state) msc->scroll_accel = SCROLL_ACCEL_DEFAULT; } static void magicmouse_emit_touch(struct magicmouse_sc *msc, int raw_id, u8 *tdata) { struct input_dev *input = msc->input; int id, x, y, size, orientation, touch_major, touch_minor, state, down; int pressure = 0; if (input->id.product == USB_DEVICE_ID_APPLE_MAGICMOUSE || input->id.product == USB_DEVICE_ID_APPLE_MAGICMOUSE2) { id = (tdata[6] << 2 | tdata[5] >> 6) & 0xf; x = (tdata[1] << 28 | tdata[0] << 20) >> 20; y = -((tdata[2] << 24 | tdata[1] << 16) >> 20); size = tdata[5] & 0x3f; orientation = (tdata[6] >> 2) - 32; touch_major = tdata[3]; touch_minor = tdata[4]; state = tdata[7] & TOUCH_STATE_MASK; down = state != TOUCH_STATE_NONE; } else if (input->id.product == USB_DEVICE_ID_APPLE_MAGICTRACKPAD2) { id = tdata[8] & 0xf; x = (tdata[1] << 27 | tdata[0] << 19) >> 19; y = -((tdata[3] << 30 | tdata[2] << 22 | tdata[1] << 14) >> 19); size = tdata[6]; orientation = (tdata[8] >> 5) - 4; touch_major = tdata[4]; touch_minor = tdata[5]; pressure = tdata[7]; state = tdata[3] & 0xC0; down = state == 0x80; } else { /* USB_DEVICE_ID_APPLE_MAGICTRACKPAD */ id = (tdata[7] << 2 | tdata[6] >> 6) & 0xf; x = (tdata[1] << 27 | tdata[0] << 19) >> 19; y = -((tdata[3] << 30 | tdata[2] << 22 | tdata[1] << 14) >> 19); size = tdata[6] & 0x3f; orientation = (tdata[7] >> 2) - 32; touch_major = tdata[4]; touch_minor = tdata[5]; state = tdata[8] & TOUCH_STATE_MASK; down = state != TOUCH_STATE_NONE; } /* Store tracking ID and other fields. */ msc->tracking_ids[raw_id] = id; msc->touches[id].x = x; msc->touches[id].y = y; msc->touches[id].size = size; /* If requested, emulate a scroll wheel by detecting small * vertical touch motions. */ if (emulate_scroll_wheel && (input->id.product != USB_DEVICE_ID_APPLE_MAGICTRACKPAD2)) { unsigned long now = jiffies; int step_x = msc->touches[id].scroll_x - x; int step_y = msc->touches[id].scroll_y - y; int step_hr = max_t(int, ((64 - (int)scroll_speed) * msc->scroll_accel) / SCROLL_HR_STEPS, 1); int step_x_hr = msc->touches[id].scroll_x_hr - x; int step_y_hr = msc->touches[id].scroll_y_hr - y; /* Calculate and apply the scroll motion. */ switch (state) { case TOUCH_STATE_START: msc->touches[id].scroll_x = x; msc->touches[id].scroll_y = y; msc->touches[id].scroll_x_hr = x; msc->touches[id].scroll_y_hr = y; msc->touches[id].scroll_x_active = false; msc->touches[id].scroll_y_active = false; /* Reset acceleration after half a second. */ if (scroll_acceleration && time_before(now, msc->scroll_jiffies + HZ / 2)) msc->scroll_accel = max_t(int, msc->scroll_accel - 1, 1); else msc->scroll_accel = SCROLL_ACCEL_DEFAULT; break; case TOUCH_STATE_DRAG: step_x /= (64 - (int)scroll_speed) * msc->scroll_accel; if (step_x != 0) { msc->touches[id].scroll_x -= step_x * (64 - scroll_speed) * msc->scroll_accel; msc->scroll_jiffies = now; input_report_rel(input, REL_HWHEEL, -step_x); } step_y /= (64 - (int)scroll_speed) * msc->scroll_accel; if (step_y != 0) { msc->touches[id].scroll_y -= step_y * (64 - scroll_speed) * msc->scroll_accel; msc->scroll_jiffies = now; input_report_rel(input, REL_WHEEL, step_y); } if (!msc->touches[id].scroll_x_active && abs(step_x_hr) > SCROLL_HR_THRESHOLD) { msc->touches[id].scroll_x_active = true; msc->touches[id].scroll_x_hr = x; step_x_hr = 0; } step_x_hr /= step_hr; if (step_x_hr != 0 && msc->touches[id].scroll_x_active) { msc->touches[id].scroll_x_hr -= step_x_hr * step_hr; input_report_rel(input, REL_HWHEEL_HI_RES, -step_x_hr * SCROLL_HR_MULT); } if (!msc->touches[id].scroll_y_active && abs(step_y_hr) > SCROLL_HR_THRESHOLD) { msc->touches[id].scroll_y_active = true; msc->touches[id].scroll_y_hr = y; step_y_hr = 0; } step_y_hr /= step_hr; if (step_y_hr != 0 && msc->touches[id].scroll_y_active) { msc->touches[id].scroll_y_hr -= step_y_hr * step_hr; input_report_rel(input, REL_WHEEL_HI_RES, step_y_hr * SCROLL_HR_MULT); } break; } } if (down) msc->ntouches++; input_mt_slot(input, id); input_mt_report_slot_state(input, MT_TOOL_FINGER, down); /* Generate the input events for this touch. */ if (down) { input_report_abs(input, ABS_MT_TOUCH_MAJOR, touch_major << 2); input_report_abs(input, ABS_MT_TOUCH_MINOR, touch_minor << 2); input_report_abs(input, ABS_MT_ORIENTATION, -orientation); input_report_abs(input, ABS_MT_POSITION_X, x); input_report_abs(input, ABS_MT_POSITION_Y, y); if (input->id.product == USB_DEVICE_ID_APPLE_MAGICTRACKPAD2) input_report_abs(input, ABS_MT_PRESSURE, pressure); if (report_undeciphered) { if (input->id.product == USB_DEVICE_ID_APPLE_MAGICMOUSE || input->id.product == USB_DEVICE_ID_APPLE_MAGICMOUSE2) input_event(input, EV_MSC, MSC_RAW, tdata[7]); else if (input->id.product != USB_DEVICE_ID_APPLE_MAGICTRACKPAD2) input_event(input, EV_MSC, MSC_RAW, tdata[8]); } } } static int magicmouse_raw_event(struct hid_device *hdev, struct hid_report *report, u8 *data, int size) { struct magicmouse_sc *msc = hid_get_drvdata(hdev); struct input_dev *input = msc->input; int x = 0, y = 0, ii, clicks = 0, npoints; switch (data[0]) { case TRACKPAD_REPORT_ID: case TRACKPAD2_BT_REPORT_ID: /* Expect four bytes of prefix, and N*9 bytes of touch data. */ if (size < 4 || ((size - 4) % 9) != 0) return 0; npoints = (size - 4) / 9; if (npoints > 15) { hid_warn(hdev, "invalid size value (%d) for TRACKPAD_REPORT_ID\n", size); return 0; } msc->ntouches = 0; for (ii = 0; ii < npoints; ii++) magicmouse_emit_touch(msc, ii, data + ii * 9 + 4); clicks = data[1]; /* The following bits provide a device specific timestamp. They * are unused here. * * ts = data[1] >> 6 | data[2] << 2 | data[3] << 10; */ break; case TRACKPAD2_USB_REPORT_ID: /* Expect twelve bytes of prefix and N*9 bytes of touch data. */ if (size < 12 || ((size - 12) % 9) != 0) return 0; npoints = (size - 12) / 9; if (npoints > 15) { hid_warn(hdev, "invalid size value (%d) for TRACKPAD2_USB_REPORT_ID\n", size); return 0; } msc->ntouches = 0; for (ii = 0; ii < npoints; ii++) magicmouse_emit_touch(msc, ii, data + ii * 9 + 12); clicks = data[1]; break; case MOUSE_REPORT_ID: /* Expect six bytes of prefix, and N*8 bytes of touch data. */ if (size < 6 || ((size - 6) % 8) != 0) return 0; npoints = (size - 6) / 8; if (npoints > 15) { hid_warn(hdev, "invalid size value (%d) for MOUSE_REPORT_ID\n", size); return 0; } msc->ntouches = 0; for (ii = 0; ii < npoints; ii++) magicmouse_emit_touch(msc, ii, data + ii * 8 + 6); /* When emulating three-button mode, it is important * to have the current touch information before * generating a click event. */ x = (int)(((data[3] & 0x0c) << 28) | (data[1] << 22)) >> 22; y = (int)(((data[3] & 0x30) << 26) | (data[2] << 22)) >> 22; clicks = data[3]; /* The following bits provide a device specific timestamp. They * are unused here. * * ts = data[3] >> 6 | data[4] << 2 | data[5] << 10; */ break; case MOUSE2_REPORT_ID: /* Size is either 8 or (14 + 8 * N) */ if (size != 8 && (size < 14 || (size - 14) % 8 != 0)) return 0; npoints = (size - 14) / 8; if (npoints > 15) { hid_warn(hdev, "invalid size value (%d) for MOUSE2_REPORT_ID\n", size); return 0; } msc->ntouches = 0; for (ii = 0; ii < npoints; ii++) magicmouse_emit_touch(msc, ii, data + ii * 8 + 14); /* When emulating three-button mode, it is important * to have the current touch information before * generating a click event. */ x = (int)((data[3] << 24) | (data[2] << 16)) >> 16; y = (int)((data[5] << 24) | (data[4] << 16)) >> 16; clicks = data[1]; /* The following bits provide a device specific timestamp. They * are unused here. * * ts = data[11] >> 6 | data[12] << 2 | data[13] << 10; */ break; case DOUBLE_REPORT_ID: /* Sometimes the trackpad sends two touch reports in one * packet. */ magicmouse_raw_event(hdev, report, data + 2, data[1]); magicmouse_raw_event(hdev, report, data + 2 + data[1], size - 2 - data[1]); return 0; default: return 0; } if (input->id.product == USB_DEVICE_ID_APPLE_MAGICMOUSE || input->id.product == USB_DEVICE_ID_APPLE_MAGICMOUSE2) { magicmouse_emit_buttons(msc, clicks & 3); input_report_rel(input, REL_X, x); input_report_rel(input, REL_Y, y); } else if (input->id.product == USB_DEVICE_ID_APPLE_MAGICTRACKPAD2) { input_mt_sync_frame(input); input_report_key(input, BTN_MOUSE, clicks & 1); } else { /* USB_DEVICE_ID_APPLE_MAGICTRACKPAD */ input_report_key(input, BTN_MOUSE, clicks & 1); input_mt_report_pointer_emulation(input, true); } input_sync(input); return 1; } static int magicmouse_event(struct hid_device *hdev, struct hid_field *field, struct hid_usage *usage, __s32 value) { struct magicmouse_sc *msc = hid_get_drvdata(hdev); if (msc->input->id.product == USB_DEVICE_ID_APPLE_MAGICMOUSE2 && field->report->id == MOUSE2_REPORT_ID) { /* * magic_mouse_raw_event has done all the work. Skip hidinput. * * Specifically, hidinput may modify BTN_LEFT and BTN_RIGHT, * breaking emulate_3button. */ return 1; } return 0; } static int magicmouse_setup_input(struct input_dev *input, struct hid_device *hdev) { int error; int mt_flags = 0; __set_bit(EV_KEY, input->evbit); if (input->id.product == USB_DEVICE_ID_APPLE_MAGICMOUSE || input->id.product == USB_DEVICE_ID_APPLE_MAGICMOUSE2) { __set_bit(BTN_LEFT, input->keybit); __set_bit(BTN_RIGHT, input->keybit); if (emulate_3button) __set_bit(BTN_MIDDLE, input->keybit); __set_bit(EV_REL, input->evbit); __set_bit(REL_X, input->relbit); __set_bit(REL_Y, input->relbit); if (emulate_scroll_wheel) { __set_bit(REL_WHEEL, input->relbit); __set_bit(REL_HWHEEL, input->relbit); __set_bit(REL_WHEEL_HI_RES, input->relbit); __set_bit(REL_HWHEEL_HI_RES, input->relbit); } } else if (input->id.product == USB_DEVICE_ID_APPLE_MAGICTRACKPAD2) { /* If the trackpad has been connected to a Mac, the name is * automatically personalized, e.g., "José Expósito's Trackpad". * When connected through Bluetooth, the personalized name is * reported, however, when connected through USB the generic * name is reported. * Set the device name to ensure the same driver settings get * loaded, whether connected through bluetooth or USB. */ if (hdev->vendor == BT_VENDOR_ID_APPLE) { if (input->id.version == TRACKPAD2_2021_BT_VERSION) input->name = "Apple Inc. Magic Trackpad"; else input->name = "Apple Inc. Magic Trackpad 2"; } else { /* USB_VENDOR_ID_APPLE */ input->name = hdev->name; } __clear_bit(EV_MSC, input->evbit); __clear_bit(BTN_0, input->keybit); __clear_bit(BTN_RIGHT, input->keybit); __clear_bit(BTN_MIDDLE, input->keybit); __set_bit(BTN_MOUSE, input->keybit); __set_bit(INPUT_PROP_BUTTONPAD, input->propbit); __set_bit(BTN_TOOL_FINGER, input->keybit); mt_flags = INPUT_MT_POINTER | INPUT_MT_DROP_UNUSED | INPUT_MT_TRACK; } else { /* USB_DEVICE_ID_APPLE_MAGICTRACKPAD */ /* input->keybit is initialized with incorrect button info * for Magic Trackpad. There really is only one physical * button (BTN_LEFT == BTN_MOUSE). Make sure we don't * advertise buttons that don't exist... */ __clear_bit(BTN_RIGHT, input->keybit); __clear_bit(BTN_MIDDLE, input->keybit); __set_bit(BTN_MOUSE, input->keybit); __set_bit(BTN_TOOL_FINGER, input->keybit); __set_bit(BTN_TOOL_DOUBLETAP, input->keybit); __set_bit(BTN_TOOL_TRIPLETAP, input->keybit); __set_bit(BTN_TOOL_QUADTAP, input->keybit); __set_bit(BTN_TOOL_QUINTTAP, input->keybit); __set_bit(BTN_TOUCH, input->keybit); __set_bit(INPUT_PROP_POINTER, input->propbit); __set_bit(INPUT_PROP_BUTTONPAD, input->propbit); } __set_bit(EV_ABS, input->evbit); error = input_mt_init_slots(input, 16, mt_flags); if (error) return error; input_set_abs_params(input, ABS_MT_TOUCH_MAJOR, 0, 255 << 2, 4, 0); input_set_abs_params(input, ABS_MT_TOUCH_MINOR, 0, 255 << 2, 4, 0); /* Note: Touch Y position from the device is inverted relative * to how pointer motion is reported (and relative to how USB * HID recommends the coordinates work). This driver keeps * the origin at the same position, and just uses the additive * inverse of the reported Y. */ if (input->id.product == USB_DEVICE_ID_APPLE_MAGICMOUSE || input->id.product == USB_DEVICE_ID_APPLE_MAGICMOUSE2) { input_set_abs_params(input, ABS_MT_ORIENTATION, -31, 32, 1, 0); input_set_abs_params(input, ABS_MT_POSITION_X, MOUSE_MIN_X, MOUSE_MAX_X, 4, 0); input_set_abs_params(input, ABS_MT_POSITION_Y, MOUSE_MIN_Y, MOUSE_MAX_Y, 4, 0); input_abs_set_res(input, ABS_MT_POSITION_X, MOUSE_RES_X); input_abs_set_res(input, ABS_MT_POSITION_Y, MOUSE_RES_Y); } else if (input->id.product == USB_DEVICE_ID_APPLE_MAGICTRACKPAD2) { input_set_abs_params(input, ABS_MT_PRESSURE, 0, 253, 0, 0); input_set_abs_params(input, ABS_PRESSURE, 0, 253, 0, 0); input_set_abs_params(input, ABS_MT_ORIENTATION, -3, 4, 0, 0); input_set_abs_params(input, ABS_X, TRACKPAD2_MIN_X, TRACKPAD2_MAX_X, 0, 0); input_set_abs_params(input, ABS_Y, TRACKPAD2_MIN_Y, TRACKPAD2_MAX_Y, 0, 0); input_set_abs_params(input, ABS_MT_POSITION_X, TRACKPAD2_MIN_X, TRACKPAD2_MAX_X, 0, 0); input_set_abs_params(input, ABS_MT_POSITION_Y, TRACKPAD2_MIN_Y, TRACKPAD2_MAX_Y, 0, 0); input_abs_set_res(input, ABS_X, TRACKPAD2_RES_X); input_abs_set_res(input, ABS_Y, TRACKPAD2_RES_Y); input_abs_set_res(input, ABS_MT_POSITION_X, TRACKPAD2_RES_X); input_abs_set_res(input, ABS_MT_POSITION_Y, TRACKPAD2_RES_Y); } else { /* USB_DEVICE_ID_APPLE_MAGICTRACKPAD */ input_set_abs_params(input, ABS_MT_ORIENTATION, -31, 32, 1, 0); input_set_abs_params(input, ABS_X, TRACKPAD_MIN_X, TRACKPAD_MAX_X, 4, 0); input_set_abs_params(input, ABS_Y, TRACKPAD_MIN_Y, TRACKPAD_MAX_Y, 4, 0); input_set_abs_params(input, ABS_MT_POSITION_X, TRACKPAD_MIN_X, TRACKPAD_MAX_X, 4, 0); input_set_abs_params(input, ABS_MT_POSITION_Y, TRACKPAD_MIN_Y, TRACKPAD_MAX_Y, 4, 0); input_abs_set_res(input, ABS_X, TRACKPAD_RES_X); input_abs_set_res(input, ABS_Y, TRACKPAD_RES_Y); input_abs_set_res(input, ABS_MT_POSITION_X, TRACKPAD_RES_X); input_abs_set_res(input, ABS_MT_POSITION_Y, TRACKPAD_RES_Y); } input_set_events_per_packet(input, 60); if (report_undeciphered && input->id.product != USB_DEVICE_ID_APPLE_MAGICTRACKPAD2) { __set_bit(EV_MSC, input->evbit); __set_bit(MSC_RAW, input->mscbit); } /* * hid-input may mark device as using autorepeat, but neither * the trackpad, nor the mouse actually want it. */ __clear_bit(EV_REP, input->evbit); return 0; } static int magicmouse_input_mapping(struct hid_device *hdev, struct hid_input *hi, struct hid_field *field, struct hid_usage *usage, unsigned long **bit, int *max) { struct magicmouse_sc *msc = hid_get_drvdata(hdev); if (!msc->input) msc->input = hi->input; /* Magic Trackpad does not give relative data after switching to MT */ if ((hi->input->id.product == USB_DEVICE_ID_APPLE_MAGICTRACKPAD || hi->input->id.product == USB_DEVICE_ID_APPLE_MAGICTRACKPAD2) && field->flags & HID_MAIN_ITEM_RELATIVE) return -1; return 0; } static int magicmouse_input_configured(struct hid_device *hdev, struct hid_input *hi) { struct magicmouse_sc *msc = hid_get_drvdata(hdev); int ret; ret = magicmouse_setup_input(msc->input, hdev); if (ret) { hid_err(hdev, "magicmouse setup input failed (%d)\n", ret); /* clean msc->input to notify probe() of the failure */ msc->input = NULL; return ret; } return 0; } static int magicmouse_enable_multitouch(struct hid_device *hdev) { const u8 *feature; const u8 feature_mt[] = { 0xD7, 0x01 }; const u8 feature_mt_mouse2[] = { 0xF1, 0x02, 0x01 }; const u8 feature_mt_trackpad2_usb[] = { 0x02, 0x01 }; const u8 feature_mt_trackpad2_bt[] = { 0xF1, 0x02, 0x01 }; u8 *buf; int ret; int feature_size; if (hdev->product == USB_DEVICE_ID_APPLE_MAGICTRACKPAD2) { if (hdev->vendor == BT_VENDOR_ID_APPLE) { feature_size = sizeof(feature_mt_trackpad2_bt); feature = feature_mt_trackpad2_bt; } else { /* USB_VENDOR_ID_APPLE */ feature_size = sizeof(feature_mt_trackpad2_usb); feature = feature_mt_trackpad2_usb; } } else if (hdev->product == USB_DEVICE_ID_APPLE_MAGICMOUSE2) { feature_size = sizeof(feature_mt_mouse2); feature = feature_mt_mouse2; } else { feature_size = sizeof(feature_mt); feature = feature_mt; } buf = kmemdup(feature, feature_size, GFP_KERNEL); if (!buf) return -ENOMEM; ret = hid_hw_raw_request(hdev, buf[0], buf, feature_size, HID_FEATURE_REPORT, HID_REQ_SET_REPORT); kfree(buf); return ret; } static void magicmouse_enable_mt_work(struct work_struct *work) { struct magicmouse_sc *msc = container_of(work, struct magicmouse_sc, work.work); int ret; ret = magicmouse_enable_multitouch(msc->hdev); if (ret < 0) hid_err(msc->hdev, "unable to request touch data (%d)\n", ret); } static int magicmouse_fetch_battery(struct hid_device *hdev) { #ifdef CONFIG_HID_BATTERY_STRENGTH struct hid_report_enum *report_enum; struct hid_report *report; if (!hdev->battery || hdev->vendor != USB_VENDOR_ID_APPLE || (hdev->product != USB_DEVICE_ID_APPLE_MAGICMOUSE2 && hdev->product != USB_DEVICE_ID_APPLE_MAGICTRACKPAD2)) return -1; report_enum = &hdev->report_enum[hdev->battery_report_type]; report = report_enum->report_id_hash[hdev->battery_report_id]; if (!report || report->maxfield < 1) return -1; if (hdev->battery_capacity == hdev->battery_max) return -1; hid_hw_request(hdev, report, HID_REQ_GET_REPORT); return 0; #else return -1; #endif } static void magicmouse_battery_timer_tick(struct timer_list *t) { struct magicmouse_sc *msc = from_timer(msc, t, battery_timer); struct hid_device *hdev = msc->hdev; if (magicmouse_fetch_battery(hdev) == 0) { mod_timer(&msc->battery_timer, jiffies + msecs_to_jiffies(USB_BATTERY_TIMEOUT_MS)); } } static int magicmouse_probe(struct hid_device *hdev, const struct hid_device_id *id) { struct magicmouse_sc *msc; struct hid_report *report; int ret; msc = devm_kzalloc(&hdev->dev, sizeof(*msc), GFP_KERNEL); if (msc == NULL) { hid_err(hdev, "can't alloc magicmouse descriptor\n"); return -ENOMEM; } msc->scroll_accel = SCROLL_ACCEL_DEFAULT; msc->hdev = hdev; INIT_DEFERRABLE_WORK(&msc->work, magicmouse_enable_mt_work); msc->quirks = id->driver_data; hid_set_drvdata(hdev, msc); ret = hid_parse(hdev); if (ret) { hid_err(hdev, "magicmouse hid parse failed\n"); return ret; } ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT); if (ret) { hid_err(hdev, "magicmouse hw start failed\n"); return ret; } timer_setup(&msc->battery_timer, magicmouse_battery_timer_tick, 0); mod_timer(&msc->battery_timer, jiffies + msecs_to_jiffies(USB_BATTERY_TIMEOUT_MS)); magicmouse_fetch_battery(hdev); if (id->vendor == USB_VENDOR_ID_APPLE && (id->product == USB_DEVICE_ID_APPLE_MAGICMOUSE2 || (id->product == USB_DEVICE_ID_APPLE_MAGICTRACKPAD2 && hdev->type != HID_TYPE_USBMOUSE))) return 0; if (!msc->input) { hid_err(hdev, "magicmouse input not registered\n"); ret = -ENOMEM; goto err_stop_hw; } if (id->product == USB_DEVICE_ID_APPLE_MAGICMOUSE) report = hid_register_report(hdev, HID_INPUT_REPORT, MOUSE_REPORT_ID, 0); else if (id->product == USB_DEVICE_ID_APPLE_MAGICMOUSE2) report = hid_register_report(hdev, HID_INPUT_REPORT, MOUSE2_REPORT_ID, 0); else if (id->product == USB_DEVICE_ID_APPLE_MAGICTRACKPAD2) { if (id->vendor == BT_VENDOR_ID_APPLE) report = hid_register_report(hdev, HID_INPUT_REPORT, TRACKPAD2_BT_REPORT_ID, 0); else /* USB_VENDOR_ID_APPLE */ report = hid_register_report(hdev, HID_INPUT_REPORT, TRACKPAD2_USB_REPORT_ID, 0); } else { /* USB_DEVICE_ID_APPLE_MAGICTRACKPAD */ report = hid_register_report(hdev, HID_INPUT_REPORT, TRACKPAD_REPORT_ID, 0); report = hid_register_report(hdev, HID_INPUT_REPORT, DOUBLE_REPORT_ID, 0); } if (!report) { hid_err(hdev, "unable to register touch report\n"); ret = -ENOMEM; goto err_stop_hw; } report->size = 6; /* * Some devices repond with 'invalid report id' when feature * report switching it into multitouch mode is sent to it. * * This results in -EIO from the _raw low-level transport callback, * but there seems to be no other way of switching the mode. * Thus the super-ugly hacky success check below. */ ret = magicmouse_enable_multitouch(hdev); if (ret != -EIO && ret < 0) { hid_err(hdev, "unable to request touch data (%d)\n", ret); goto err_stop_hw; } if (ret == -EIO && id->product == USB_DEVICE_ID_APPLE_MAGICMOUSE2) { schedule_delayed_work(&msc->work, msecs_to_jiffies(500)); } return 0; err_stop_hw: del_timer_sync(&msc->battery_timer); hid_hw_stop(hdev); return ret; } static void magicmouse_remove(struct hid_device *hdev) { struct magicmouse_sc *msc = hid_get_drvdata(hdev); if (msc) { cancel_delayed_work_sync(&msc->work); del_timer_sync(&msc->battery_timer); } hid_hw_stop(hdev); } static __u8 *magicmouse_report_fixup(struct hid_device *hdev, __u8 *rdesc, unsigned int *rsize) { /* * Change the usage from: * 0x06, 0x00, 0xff, // Usage Page (Vendor Defined Page 1) 0 * 0x09, 0x0b, // Usage (Vendor Usage 0x0b) 3 * To: * 0x05, 0x01, // Usage Page (Generic Desktop) 0 * 0x09, 0x02, // Usage (Mouse) 2 */ if (hdev->vendor == USB_VENDOR_ID_APPLE && (hdev->product == USB_DEVICE_ID_APPLE_MAGICMOUSE2 || hdev->product == USB_DEVICE_ID_APPLE_MAGICTRACKPAD2) && *rsize == 83 && rdesc[46] == 0x84 && rdesc[58] == 0x85) { hid_info(hdev, "fixing up magicmouse battery report descriptor\n"); *rsize = *rsize - 1; rdesc = kmemdup(rdesc + 1, *rsize, GFP_KERNEL); if (!rdesc) return NULL; rdesc[0] = 0x05; rdesc[1] = 0x01; rdesc[2] = 0x09; rdesc[3] = 0x02; } return rdesc; } static const struct hid_device_id magic_mice[] = { { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGICMOUSE), .driver_data = 0 }, { HID_BLUETOOTH_DEVICE(BT_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGICMOUSE2), .driver_data = 0 }, { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGICMOUSE2), .driver_data = 0 }, { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGICTRACKPAD), .driver_data = 0 }, { HID_BLUETOOTH_DEVICE(BT_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGICTRACKPAD2), .driver_data = 0 }, { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGICTRACKPAD2), .driver_data = 0 }, { } }; MODULE_DEVICE_TABLE(hid, magic_mice); static struct hid_driver magicmouse_driver = { .name = "magicmouse", .id_table = magic_mice, .probe = magicmouse_probe, .remove = magicmouse_remove, .report_fixup = magicmouse_report_fixup, .raw_event = magicmouse_raw_event, .event = magicmouse_event, .input_mapping = magicmouse_input_mapping, .input_configured = magicmouse_input_configured, }; module_hid_driver(magicmouse_driver); MODULE_LICENSE("GPL"); |
| 4331 2990 2663 4369 2777 2777 616 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PGTABLE_H #define _LINUX_PGTABLE_H #include <linux/pfn.h> #include <asm/pgtable.h> #define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT) #define PUD_ORDER (PUD_SHIFT - PAGE_SHIFT) #ifndef __ASSEMBLY__ #ifdef CONFIG_MMU #include <linux/mm_types.h> #include <linux/bug.h> #include <linux/errno.h> #include <asm-generic/pgtable_uffd.h> #include <linux/page_table_check.h> #if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) - \ defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS #error CONFIG_PGTABLE_LEVELS is not consistent with __PAGETABLE_{P4D,PUD,PMD}_FOLDED #endif /* * On almost all architectures and configurations, 0 can be used as the * upper ceiling to free_pgtables(): on many architectures it has the same * effect as using TASK_SIZE. However, there is one configuration which * must impose a more careful limit, to avoid freeing kernel pgtables. */ #ifndef USER_PGTABLES_CEILING #define USER_PGTABLES_CEILING 0UL #endif /* * This defines the first usable user address. Platforms * can override its value with custom FIRST_USER_ADDRESS * defined in their respective <asm/pgtable.h>. */ #ifndef FIRST_USER_ADDRESS #define FIRST_USER_ADDRESS 0UL #endif /* * This defines the generic helper for accessing PMD page * table page. Although platforms can still override this * via their respective <asm/pgtable.h>. */ #ifndef pmd_pgtable #define pmd_pgtable(pmd) pmd_page(pmd) #endif /* * A page table page can be thought of an array like this: pXd_t[PTRS_PER_PxD] * * The pXx_index() functions return the index of the entry in the page * table page which would control the given virtual address * * As these functions may be used by the same code for different levels of * the page table folding, they are always available, regardless of * CONFIG_PGTABLE_LEVELS value. For the folded levels they simply return 0 * because in such cases PTRS_PER_PxD equals 1. */ static inline unsigned long pte_index(unsigned long address) { return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); } #ifndef pmd_index static inline unsigned long pmd_index(unsigned long address) { return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1); } #define pmd_index pmd_index #endif #ifndef pud_index static inline unsigned long pud_index(unsigned long address) { return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); } #define pud_index pud_index #endif #ifndef pgd_index /* Must be a compile-time constant, so implement it as a macro */ #define pgd_index(a) (((a) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)) #endif #ifndef pte_offset_kernel static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) { return (pte_t *)pmd_page_vaddr(*pmd) + pte_index(address); } #define pte_offset_kernel pte_offset_kernel #endif #ifdef CONFIG_HIGHPTE #define __pte_map(pmd, address) \ ((pte_t *)kmap_local_page(pmd_page(*(pmd))) + pte_index((address))) #define pte_unmap(pte) do { \ kunmap_local((pte)); \ rcu_read_unlock(); \ } while (0) #else static inline pte_t *__pte_map(pmd_t *pmd, unsigned long address) { return pte_offset_kernel(pmd, address); } static inline void pte_unmap(pte_t *pte) { rcu_read_unlock(); } #endif void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable); /* Find an entry in the second-level page table.. */ #ifndef pmd_offset static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) { return pud_pgtable(*pud) + pmd_index(address); } #define pmd_offset pmd_offset #endif #ifndef pud_offset static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) { return p4d_pgtable(*p4d) + pud_index(address); } #define pud_offset pud_offset #endif static inline pgd_t *pgd_offset_pgd(pgd_t *pgd, unsigned long address) { return (pgd + pgd_index(address)); }; /* * a shortcut to get a pgd_t in a given mm */ #ifndef pgd_offset #define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address)) #endif /* * a shortcut which implies the use of the kernel's pgd, instead * of a process's */ #ifndef pgd_offset_k #define pgd_offset_k(address) pgd_offset(&init_mm, (address)) #endif /* * In many cases it is known that a virtual address is mapped at PMD or PTE * level, so instead of traversing all the page table levels, we can get a * pointer to the PMD entry in user or kernel page table or translate a virtual * address to the pointer in the PTE in the kernel page tables with simple * helpers. */ static inline pmd_t *pmd_off(struct mm_struct *mm, unsigned long va) { return pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, va), va), va), va); } static inline pmd_t *pmd_off_k(unsigned long va) { return pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va); } static inline pte_t *virt_to_kpte(unsigned long vaddr) { pmd_t *pmd = pmd_off_k(vaddr); return pmd_none(*pmd) ? NULL : pte_offset_kernel(pmd, vaddr); } #ifndef pmd_young static inline int pmd_young(pmd_t pmd) { return 0; } #endif /* * A facility to provide lazy MMU batching. This allows PTE updates and * page invalidations to be delayed until a call to leave lazy MMU mode * is issued. Some architectures may benefit from doing this, and it is * beneficial for both shadow and direct mode hypervisors, which may batch * the PTE updates which happen during this window. Note that using this * interface requires that read hazards be removed from the code. A read * hazard could result in the direct mode hypervisor case, since the actual * write to the page tables may not yet have taken place, so reads though * a raw PTE pointer after it has been modified are not guaranteed to be * up to date. This mode can only be entered and left under the protection of * the page table locks for all page tables which may be modified. In the UP * case, this is required so that preemption is disabled, and in the SMP case, * it must synchronize the delayed page table writes properly on other CPUs. */ #ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE #define arch_enter_lazy_mmu_mode() do {} while (0) #define arch_leave_lazy_mmu_mode() do {} while (0) #define arch_flush_lazy_mmu_mode() do {} while (0) #endif #ifndef set_ptes /** * set_ptes - Map consecutive pages to a contiguous range of addresses. * @mm: Address space to map the pages into. * @addr: Address to map the first page at. * @ptep: Page table pointer for the first entry. * @pte: Page table entry for the first page. * @nr: Number of pages to map. * * May be overridden by the architecture, or the architecture can define * set_pte() and PFN_PTE_SHIFT. * * Context: The caller holds the page table lock. The pages all belong * to the same folio. The PTEs are all in the same PMD. */ static inline void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr) { page_table_check_ptes_set(mm, ptep, pte, nr); arch_enter_lazy_mmu_mode(); for (;;) { set_pte(ptep, pte); if (--nr == 0) break; ptep++; pte = __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT)); } arch_leave_lazy_mmu_mode(); } #endif #define set_pte_at(mm, addr, ptep, pte) set_ptes(mm, addr, ptep, pte, 1) #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty); #endif #ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty); extern int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t entry, int dirty); #else static inline int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty) { BUILD_BUG(); return 0; } static inline int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t entry, int dirty) { BUILD_BUG(); return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef ptep_get static inline pte_t ptep_get(pte_t *ptep) { return READ_ONCE(*ptep); } #endif #ifndef pmdp_get static inline pmd_t pmdp_get(pmd_t *pmdp) { return READ_ONCE(*pmdp); } #endif #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { pte_t pte = ptep_get(ptep); int r = 1; if (!pte_young(pte)) r = 0; else set_pte_at(vma->vm_mm, address, ptep, pte_mkold(pte)); return r; } #endif #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { pmd_t pmd = *pmdp; int r = 1; if (!pmd_young(pmd)) r = 0; else set_pmd_at(vma->vm_mm, address, pmdp, pmd_mkold(pmd)); return r; } #else static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */ #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); #endif #ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #else /* * Despite relevant to THP only, this API is called from generic rmap code * under PageTransHuge(), hence needs a dummy implementation for !THP */ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef arch_has_hw_nonleaf_pmd_young /* * Return whether the accessed bit in non-leaf PMD entries is supported on the * local CPU. */ static inline bool arch_has_hw_nonleaf_pmd_young(void) { return IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG); } #endif #ifndef arch_has_hw_pte_young /* * Return whether the accessed bit is supported on the local CPU. * * This stub assumes accessing through an old PTE triggers a page fault. * Architectures that automatically set the access bit should overwrite it. */ static inline bool arch_has_hw_pte_young(void) { return false; } #endif #ifndef arch_check_zapped_pte static inline void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte) { } #endif #ifndef arch_check_zapped_pmd static inline void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd) { } #endif #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep) { pte_t pte = ptep_get(ptep); pte_clear(mm, address, ptep); page_table_check_pte_clear(mm, pte); return pte; } #endif static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { ptep_get_and_clear(mm, addr, ptep); } #ifdef CONFIG_GUP_GET_PXX_LOW_HIGH /* * For walking the pagetables without holding any locks. Some architectures * (eg x86-32 PAE) cannot load the entries atomically without using expensive * instructions. We are guaranteed that a PTE will only either go from not * present to present, or present to not present -- it will not switch to a * completely different present page without a TLB flush inbetween; which we * are blocking by holding interrupts off. * * Setting ptes from not present to present goes: * * ptep->pte_high = h; * smp_wmb(); * ptep->pte_low = l; * * And present to not present goes: * * ptep->pte_low = 0; * smp_wmb(); * ptep->pte_high = 0; * * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'. * We load pte_high *after* loading pte_low, which ensures we don't see an older * value of pte_high. *Then* we recheck pte_low, which ensures that we haven't * picked up a changed pte high. We might have gotten rubbish values from * pte_low and pte_high, but we are guaranteed that pte_low will not have the * present bit set *unless* it is 'l'. Because get_user_pages_fast() only * operates on present ptes we're safe. */ static inline pte_t ptep_get_lockless(pte_t *ptep) { pte_t pte; do { pte.pte_low = ptep->pte_low; smp_rmb(); pte.pte_high = ptep->pte_high; smp_rmb(); } while (unlikely(pte.pte_low != ptep->pte_low)); return pte; } #define ptep_get_lockless ptep_get_lockless #if CONFIG_PGTABLE_LEVELS > 2 static inline pmd_t pmdp_get_lockless(pmd_t *pmdp) { pmd_t pmd; do { pmd.pmd_low = pmdp->pmd_low; smp_rmb(); pmd.pmd_high = pmdp->pmd_high; smp_rmb(); } while (unlikely(pmd.pmd_low != pmdp->pmd_low)); return pmd; } #define pmdp_get_lockless pmdp_get_lockless #define pmdp_get_lockless_sync() tlb_remove_table_sync_one() #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #endif /* CONFIG_GUP_GET_PXX_LOW_HIGH */ /* * We require that the PTE can be read atomically. */ #ifndef ptep_get_lockless static inline pte_t ptep_get_lockless(pte_t *ptep) { return ptep_get(ptep); } #endif #ifndef pmdp_get_lockless static inline pmd_t pmdp_get_lockless(pmd_t *pmdp) { return pmdp_get(pmdp); } static inline void pmdp_get_lockless_sync(void) { } #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE #ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { pmd_t pmd = *pmdp; pmd_clear(pmdp); page_table_check_pmd_clear(mm, pmd); return pmd; } #endif /* __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR */ #ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, unsigned long address, pud_t *pudp) { pud_t pud = *pudp; pud_clear(pudp); page_table_check_pud_clear(mm, pud); return pud; } #endif /* __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE #ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, int full) { return pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); } #endif #ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL static inline pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, int full) { return pudp_huge_get_and_clear(vma->vm_mm, address, pudp); } #endif #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long address, pte_t *ptep, int full) { return ptep_get_and_clear(mm, address, ptep); } #endif /* * If two threads concurrently fault at the same page, the thread that * won the race updates the PTE and its local TLB/Cache. The other thread * gives up, simply does nothing, and continues; on architectures where * software can update TLB, local TLB can be updated here to avoid next page * fault. This function updates TLB only, do nothing with cache or others. * It is the difference with function update_mmu_cache. */ #ifndef __HAVE_ARCH_UPDATE_MMU_TLB static inline void update_mmu_tlb(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { } #define __HAVE_ARCH_UPDATE_MMU_TLB #endif /* * Some architectures may be able to avoid expensive synchronization * primitives when modifications are made to PTE's which are already * not present, or in the process of an address space destruction. */ #ifndef __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL static inline void pte_clear_not_present_full(struct mm_struct *mm, unsigned long address, pte_t *ptep, int full) { pte_clear(mm, address, ptep); } #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH extern pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); #endif #ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH extern pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); extern pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, pud_t *pudp); #endif #ifndef pte_mkwrite static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma) { return pte_mkwrite_novma(pte); } #endif #if defined(CONFIG_ARCH_WANT_PMD_MKWRITE) && !defined(pmd_mkwrite) static inline pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { return pmd_mkwrite_novma(pmd); } #endif #ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT struct mm_struct; static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) { pte_t old_pte = ptep_get(ptep); set_pte_at(mm, address, ptep, pte_wrprotect(old_pte)); } #endif /* * On some architectures hardware does not set page access bit when accessing * memory page, it is responsibility of software setting this bit. It brings * out extra page fault penalty to track page access bit. For optimization page * access bit can be set during all page fault flow on these arches. * To be differentiate with macro pte_mkyoung, this macro is used on platforms * where software maintains page access bit. */ #ifndef pte_sw_mkyoung static inline pte_t pte_sw_mkyoung(pte_t pte) { return pte; } #define pte_sw_mkyoung pte_sw_mkyoung #endif #ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { pmd_t old_pmd = *pmdp; set_pmd_at(mm, address, pmdp, pmd_wrprotect(old_pmd)); } #else static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PUDP_SET_WRPROTECT #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void pudp_set_wrprotect(struct mm_struct *mm, unsigned long address, pud_t *pudp) { pud_t old_pud = *pudp; set_pud_at(mm, address, pudp, pud_wrprotect(old_pud)); } #else static inline void pudp_set_wrprotect(struct mm_struct *mm, unsigned long address, pud_t *pudp) { BUILD_BUG(); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ #endif #ifndef pmdp_collapse_flush #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #else static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); return *pmdp; } #define pmdp_collapse_flush pmdp_collapse_flush #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable); #endif #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); #endif #ifndef arch_needs_pgtable_deposit #define arch_needs_pgtable_deposit() (false) #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * This is an implementation of pmdp_establish() that is only suitable for an * architecture that doesn't have hardware dirty/accessed bits. In this case we * can't race with CPU which sets these bits and non-atomic approach is fine. */ static inline pmd_t generic_pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { pmd_t old_pmd = *pmdp; set_pmd_at(vma->vm_mm, address, pmdp, pmd); return old_pmd; } #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD /* * pmdp_invalidate_ad() invalidates the PMD while changing a transparent * hugepage mapping in the page tables. This function is similar to * pmdp_invalidate(), but should only be used if the access and dirty bits would * not be cleared by the software in the new PMD value. The function ensures * that hardware changes of the access and dirty bits updates would not be lost. * * Doing so can allow in certain architectures to avoid a TLB flush in most * cases. Yet, another TLB flush might be necessary later if the PMD update * itself requires such flush (e.g., if protection was set to be stricter). Yet, * even when a TLB flush is needed because of the update, the caller may be able * to batch these TLB flushing operations, so fewer TLB flush operations are * needed. */ extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #endif #ifndef __HAVE_ARCH_PTE_SAME static inline int pte_same(pte_t pte_a, pte_t pte_b) { return pte_val(pte_a) == pte_val(pte_b); } #endif #ifndef __HAVE_ARCH_PTE_UNUSED /* * Some architectures provide facilities to virtualization guests * so that they can flag allocated pages as unused. This allows the * host to transparently reclaim unused pages. This function returns * whether the pte's page is unused. */ static inline int pte_unused(pte_t pte) { return 0; } #endif #ifndef pte_access_permitted #define pte_access_permitted(pte, write) \ (pte_present(pte) && (!(write) || pte_write(pte))) #endif #ifndef pmd_access_permitted #define pmd_access_permitted(pmd, write) \ (pmd_present(pmd) && (!(write) || pmd_write(pmd))) #endif #ifndef pud_access_permitted #define pud_access_permitted(pud, write) \ (pud_present(pud) && (!(write) || pud_write(pud))) #endif #ifndef p4d_access_permitted #define p4d_access_permitted(p4d, write) \ (p4d_present(p4d) && (!(write) || p4d_write(p4d))) #endif #ifndef pgd_access_permitted #define pgd_access_permitted(pgd, write) \ (pgd_present(pgd) && (!(write) || pgd_write(pgd))) #endif #ifndef __HAVE_ARCH_PMD_SAME static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) { return pmd_val(pmd_a) == pmd_val(pmd_b); } #endif #ifndef pud_same static inline int pud_same(pud_t pud_a, pud_t pud_b) { return pud_val(pud_a) == pud_val(pud_b); } #define pud_same pud_same #endif #ifndef __HAVE_ARCH_P4D_SAME static inline int p4d_same(p4d_t p4d_a, p4d_t p4d_b) { return p4d_val(p4d_a) == p4d_val(p4d_b); } #endif #ifndef __HAVE_ARCH_PGD_SAME static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b) { return pgd_val(pgd_a) == pgd_val(pgd_b); } #endif /* * Use set_p*_safe(), and elide TLB flushing, when confident that *no* * TLB flush will be required as a result of the "set". For example, use * in scenarios where it is known ahead of time that the routine is * setting non-present entries, or re-setting an existing entry to the * same value. Otherwise, use the typical "set" helpers and flush the * TLB. */ #define set_pte_safe(ptep, pte) \ ({ \ WARN_ON_ONCE(pte_present(*ptep) && !pte_same(*ptep, pte)); \ set_pte(ptep, pte); \ }) #define set_pmd_safe(pmdp, pmd) \ ({ \ WARN_ON_ONCE(pmd_present(*pmdp) && !pmd_same(*pmdp, pmd)); \ set_pmd(pmdp, pmd); \ }) #define set_pud_safe(pudp, pud) \ ({ \ WARN_ON_ONCE(pud_present(*pudp) && !pud_same(*pudp, pud)); \ set_pud(pudp, pud); \ }) #define set_p4d_safe(p4dp, p4d) \ ({ \ WARN_ON_ONCE(p4d_present(*p4dp) && !p4d_same(*p4dp, p4d)); \ set_p4d(p4dp, p4d); \ }) #define set_pgd_safe(pgdp, pgd) \ ({ \ WARN_ON_ONCE(pgd_present(*pgdp) && !pgd_same(*pgdp, pgd)); \ set_pgd(pgdp, pgd); \ }) #ifndef __HAVE_ARCH_DO_SWAP_PAGE /* * Some architectures support metadata associated with a page. When a * page is being swapped out, this metadata must be saved so it can be * restored when the page is swapped back in. SPARC M7 and newer * processors support an ADI (Application Data Integrity) tag for the * page as metadata for the page. arch_do_swap_page() can restore this * metadata when a page is swapped back in. */ static inline void arch_do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t pte, pte_t oldpte) { } #endif #ifndef __HAVE_ARCH_UNMAP_ONE /* * Some architectures support metadata associated with a page. When a * page is being swapped out, this metadata must be saved so it can be * restored when the page is swapped back in. SPARC M7 and newer * processors support an ADI (Application Data Integrity) tag for the * page as metadata for the page. arch_unmap_one() can save this * metadata on a swap-out of a page. */ static inline int arch_unmap_one(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t orig_pte) { return 0; } #endif /* * Allow architectures to preserve additional metadata associated with * swapped-out pages. The corresponding __HAVE_ARCH_SWAP_* macros and function * prototypes must be defined in the arch-specific asm/pgtable.h file. */ #ifndef __HAVE_ARCH_PREPARE_TO_SWAP static inline int arch_prepare_to_swap(struct page *page) { return 0; } #endif #ifndef __HAVE_ARCH_SWAP_INVALIDATE static inline void arch_swap_invalidate_page(int type, pgoff_t offset) { } static inline void arch_swap_invalidate_area(int type) { } #endif #ifndef __HAVE_ARCH_SWAP_RESTORE static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) { } #endif #ifndef __HAVE_ARCH_PGD_OFFSET_GATE #define pgd_offset_gate(mm, addr) pgd_offset(mm, addr) #endif #ifndef __HAVE_ARCH_MOVE_PTE #define move_pte(pte, prot, old_addr, new_addr) (pte) #endif #ifndef pte_accessible # define pte_accessible(mm, pte) ((void)(pte), 1) #endif #ifndef flush_tlb_fix_spurious_fault #define flush_tlb_fix_spurious_fault(vma, address, ptep) flush_tlb_page(vma, address) #endif /* * When walking page tables, get the address of the next boundary, * or the end address of the range if that comes earlier. Although no * vma end wraps to 0, rounded up __boundary may wrap to 0 throughout. */ #define pgd_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #ifndef p4d_addr_end #define p4d_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + P4D_SIZE) & P4D_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #endif #ifndef pud_addr_end #define pud_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + PUD_SIZE) & PUD_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #endif #ifndef pmd_addr_end #define pmd_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + PMD_SIZE) & PMD_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #endif /* * When walking page tables, we usually want to skip any p?d_none entries; * and any p?d_bad entries - reporting the error before resetting to none. * Do the tests inline, but report and clear the bad entry in mm/memory.c. */ void pgd_clear_bad(pgd_t *); #ifndef __PAGETABLE_P4D_FOLDED void p4d_clear_bad(p4d_t *); #else #define p4d_clear_bad(p4d) do { } while (0) #endif #ifndef __PAGETABLE_PUD_FOLDED void pud_clear_bad(pud_t *); #else #define pud_clear_bad(p4d) do { } while (0) #endif void pmd_clear_bad(pmd_t *); static inline int pgd_none_or_clear_bad(pgd_t *pgd) { if (pgd_none(*pgd)) return 1; if (unlikely(pgd_bad(*pgd))) { pgd_clear_bad(pgd); return 1; } return 0; } static inline int p4d_none_or_clear_bad(p4d_t *p4d) { if (p4d_none(*p4d)) return 1; if (unlikely(p4d_bad(*p4d))) { p4d_clear_bad(p4d); return 1; } return 0; } static inline int pud_none_or_clear_bad(pud_t *pud) { if (pud_none(*pud)) return 1; if (unlikely(pud_bad(*pud))) { pud_clear_bad(pud); return 1; } return 0; } static inline int pmd_none_or_clear_bad(pmd_t *pmd) { if (pmd_none(*pmd)) return 1; if (unlikely(pmd_bad(*pmd))) { pmd_clear_bad(pmd); return 1; } return 0; } static inline pte_t __ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { /* * Get the current pte state, but zero it out to make it * non-present, preventing the hardware from asynchronously * updating it. */ return ptep_get_and_clear(vma->vm_mm, addr, ptep); } static inline void __ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte) { /* * The pte is non-present, so there's no hardware state to * preserve. */ set_pte_at(vma->vm_mm, addr, ptep, pte); } #ifndef __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION /* * Start a pte protection read-modify-write transaction, which * protects against asynchronous hardware modifications to the pte. * The intention is not to prevent the hardware from making pte * updates, but to prevent any updates it may make from being lost. * * This does not protect against other software modifications of the * pte; the appropriate pte lock must be held over the transaction. * * Note that this interface is intended to be batchable, meaning that * ptep_modify_prot_commit may not actually update the pte, but merely * queue the update to be done at some later time. The update must be * actually committed before the pte lock is released, however. */ static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { return __ptep_modify_prot_start(vma, addr, ptep); } /* * Commit an update to a pte, leaving any hardware-controlled bits in * the PTE unmodified. */ static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) { __ptep_modify_prot_commit(vma, addr, ptep, pte); } #endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */ #endif /* CONFIG_MMU */ /* * No-op macros that just return the current protection value. Defined here * because these macros can be used even if CONFIG_MMU is not defined. */ #ifndef pgprot_nx #define pgprot_nx(prot) (prot) #endif #ifndef pgprot_noncached #define pgprot_noncached(prot) (prot) #endif #ifndef pgprot_writecombine #define pgprot_writecombine pgprot_noncached #endif #ifndef pgprot_writethrough #define pgprot_writethrough pgprot_noncached #endif #ifndef pgprot_device #define pgprot_device pgprot_noncached #endif #ifndef pgprot_mhp #define pgprot_mhp(prot) (prot) #endif #ifdef CONFIG_MMU #ifndef pgprot_modify #define pgprot_modify pgprot_modify static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) { if (pgprot_val(oldprot) == pgprot_val(pgprot_noncached(oldprot))) newprot = pgprot_noncached(newprot); if (pgprot_val(oldprot) == pgprot_val(pgprot_writecombine(oldprot))) newprot = pgprot_writecombine(newprot); if (pgprot_val(oldprot) == pgprot_val(pgprot_device(oldprot))) newprot = pgprot_device(newprot); return newprot; } #endif #endif /* CONFIG_MMU */ #ifndef pgprot_encrypted #define pgprot_encrypted(prot) (prot) #endif #ifndef pgprot_decrypted #define pgprot_decrypted(prot) (prot) #endif /* * A facility to provide batching of the reload of page tables and * other process state with the actual context switch code for * paravirtualized guests. By convention, only one of the batched * update (lazy) modes (CPU, MMU) should be active at any given time, * entry should never be nested, and entry and exits should always be * paired. This is for sanity of maintaining and reasoning about the * kernel code. In this case, the exit (end of the context switch) is * in architecture-specific code, and so doesn't need a generic * definition. */ #ifndef __HAVE_ARCH_START_CONTEXT_SWITCH #define arch_start_context_switch(prev) do {} while (0) #endif #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY #ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) { return pmd; } static inline int pmd_swp_soft_dirty(pmd_t pmd) { return 0; } static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) { return pmd; } #endif #else /* !CONFIG_HAVE_ARCH_SOFT_DIRTY */ static inline int pte_soft_dirty(pte_t pte) { return 0; } static inline int pmd_soft_dirty(pmd_t pmd) { return 0; } static inline pte_t pte_mksoft_dirty(pte_t pte) { return pte; } static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) { return pmd; } static inline pte_t pte_clear_soft_dirty(pte_t pte) { return pte; } static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd) { return pmd; } static inline pte_t pte_swp_mksoft_dirty(pte_t pte) { return pte; } static inline int pte_swp_soft_dirty(pte_t pte) { return 0; } static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) { return pte; } static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) { return pmd; } static inline int pmd_swp_soft_dirty(pmd_t pmd) { return 0; } static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) { return pmd; } #endif #ifndef __HAVE_PFNMAP_TRACKING /* * Interfaces that can be used by architecture code to keep track of * memory type of pfn mappings specified by the remap_pfn_range, * vmf_insert_pfn. */ /* * track_pfn_remap is called when a _new_ pfn mapping is being established * by remap_pfn_range() for physical range indicated by pfn and size. */ static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long addr, unsigned long size) { return 0; } /* * track_pfn_insert is called when a _new_ single pfn is established * by vmf_insert_pfn(). */ static inline void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn) { } /* * track_pfn_copy is called when vma that is covering the pfnmap gets * copied through copy_page_range(). */ static inline int track_pfn_copy(struct vm_area_struct *vma) { return 0; } /* * untrack_pfn is called while unmapping a pfnmap for a region. * untrack can be called for a specific region indicated by pfn and size or * can be for the entire vma (in which case pfn, size are zero). */ static inline void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, unsigned long size, bool mm_wr_locked) { } /* * untrack_pfn_clear is called while mremapping a pfnmap for a new region * or fails to copy pgtable during duplicate vm area. */ static inline void untrack_pfn_clear(struct vm_area_struct *vma) { } #else extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long addr, unsigned long size); extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn); extern int track_pfn_copy(struct vm_area_struct *vma); extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, unsigned long size, bool mm_wr_locked); extern void untrack_pfn_clear(struct vm_area_struct *vma); #endif #ifdef CONFIG_MMU #ifdef __HAVE_COLOR_ZERO_PAGE static inline int is_zero_pfn(unsigned long pfn) { extern unsigned long zero_pfn; unsigned long offset_from_zero_pfn = pfn - zero_pfn; return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT); } #define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr)) #else static inline int is_zero_pfn(unsigned long pfn) { extern unsigned long zero_pfn; return pfn == zero_pfn; } static inline unsigned long my_zero_pfn(unsigned long addr) { extern unsigned long zero_pfn; return zero_pfn; } #endif #else static inline int is_zero_pfn(unsigned long pfn) { return 0; } static inline unsigned long my_zero_pfn(unsigned long addr) { return 0; } #endif /* CONFIG_MMU */ #ifdef CONFIG_MMU #ifndef CONFIG_TRANSPARENT_HUGEPAGE static inline int pmd_trans_huge(pmd_t pmd) { return 0; } #ifndef pmd_write static inline int pmd_write(pmd_t pmd) { BUG(); return 0; } #endif /* pmd_write */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifndef pud_write static inline int pud_write(pud_t pud) { BUG(); return 0; } #endif /* pud_write */ #if !defined(CONFIG_ARCH_HAS_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE) static inline int pmd_devmap(pmd_t pmd) { return 0; } static inline int pud_devmap(pud_t pud) { return 0; } static inline int pgd_devmap(pgd_t pgd) { return 0; } #endif #if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \ !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) static inline int pud_trans_huge(pud_t pud) { return 0; } #endif static inline int pud_trans_unstable(pud_t *pud) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) pud_t pudval = READ_ONCE(*pud); if (pud_none(pudval) || pud_trans_huge(pudval) || pud_devmap(pudval)) return 1; if (unlikely(pud_bad(pudval))) { pud_clear_bad(pud); return 1; } #endif return 0; } #ifndef CONFIG_NUMA_BALANCING /* * In an inaccessible (PROT_NONE) VMA, pte_protnone() may indicate "yes". It is * perfectly valid to indicate "no" in that case, which is why our default * implementation defaults to "always no". * * In an accessible VMA, however, pte_protnone() reliably indicates PROT_NONE * page protection due to NUMA hinting. NUMA hinting faults only apply in * accessible VMAs. * * So, to reliably identify PROT_NONE PTEs that require a NUMA hinting fault, * looking at the VMA accessibility is sufficient. */ static inline int pte_protnone(pte_t pte) { return 0; } static inline int pmd_protnone(pmd_t pmd) { return 0; } #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_MMU */ #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP #ifndef __PAGETABLE_P4D_FOLDED int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot); void p4d_clear_huge(p4d_t *p4d); #else static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) { return 0; } static inline void p4d_clear_huge(p4d_t *p4d) { } #endif /* !__PAGETABLE_P4D_FOLDED */ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot); int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot); int pud_clear_huge(pud_t *pud); int pmd_clear_huge(pmd_t *pmd); int p4d_free_pud_page(p4d_t *p4d, unsigned long addr); int pud_free_pmd_page(pud_t *pud, unsigned long addr); int pmd_free_pte_page(pmd_t *pmd, unsigned long addr); #else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) { return 0; } static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) { return 0; } static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) { return 0; } static inline void p4d_clear_huge(p4d_t *p4d) { } static inline int pud_clear_huge(pud_t *pud) { return 0; } static inline int pmd_clear_huge(pmd_t *pmd) { return 0; } static inline int p4d_free_pud_page(p4d_t *p4d, unsigned long addr) { return 0; } static inline int pud_free_pmd_page(pud_t *pud, unsigned long addr) { return 0; } static inline int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) { return 0; } #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ #ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * ARCHes with special requirements for evicting THP backing TLB entries can * implement this. Otherwise also, it can help optimize normal TLB flush in * THP regime. Stock flush_tlb_range() typically has optimization to nuke the * entire TLB if flush span is greater than a threshold, which will * likely be true for a single huge page. Thus a single THP flush will * invalidate the entire TLB which is not desirable. * e.g. see arch/arc: flush_pmd_tlb_range */ #define flush_pmd_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) #define flush_pud_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) #else #define flush_pmd_tlb_range(vma, addr, end) BUILD_BUG() #define flush_pud_tlb_range(vma, addr, end) BUILD_BUG() #endif #endif struct file; int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, unsigned long size, pgprot_t *vma_prot); #ifndef CONFIG_X86_ESPFIX64 static inline void init_espfix_bsp(void) { } #endif extern void __init pgtable_cache_init(void); #ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) { return true; } static inline bool arch_has_pfn_modify_check(void) { return false; } #endif /* !_HAVE_ARCH_PFN_MODIFY_ALLOWED */ /* * Architecture PAGE_KERNEL_* fallbacks * * Some architectures don't define certain PAGE_KERNEL_* flags. This is either * because they really don't support them, or the port needs to be updated to * reflect the required functionality. Below are a set of relatively safe * fallbacks, as best effort, which we can count on in lieu of the architectures * not defining them on their own yet. */ #ifndef PAGE_KERNEL_RO # define PAGE_KERNEL_RO PAGE_KERNEL #endif #ifndef PAGE_KERNEL_EXEC # define PAGE_KERNEL_EXEC PAGE_KERNEL #endif /* * Page Table Modification bits for pgtbl_mod_mask. * * These are used by the p?d_alloc_track*() set of functions an in the generic * vmalloc/ioremap code to track at which page-table levels entries have been * modified. Based on that the code can better decide when vmalloc and ioremap * mapping changes need to be synchronized to other page-tables in the system. */ #define __PGTBL_PGD_MODIFIED 0 #define __PGTBL_P4D_MODIFIED 1 #define __PGTBL_PUD_MODIFIED 2 #define __PGTBL_PMD_MODIFIED 3 #define __PGTBL_PTE_MODIFIED 4 #define PGTBL_PGD_MODIFIED BIT(__PGTBL_PGD_MODIFIED) #define PGTBL_P4D_MODIFIED BIT(__PGTBL_P4D_MODIFIED) #define PGTBL_PUD_MODIFIED BIT(__PGTBL_PUD_MODIFIED) #define PGTBL_PMD_MODIFIED BIT(__PGTBL_PMD_MODIFIED) #define PGTBL_PTE_MODIFIED BIT(__PGTBL_PTE_MODIFIED) /* Page-Table Modification Mask */ typedef unsigned int pgtbl_mod_mask; #endif /* !__ASSEMBLY__ */ #if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT) #ifdef CONFIG_PHYS_ADDR_T_64BIT /* * ZSMALLOC needs to know the highest PFN on 32-bit architectures * with physical address space extension, but falls back to * BITS_PER_LONG otherwise. */ #error Missing MAX_POSSIBLE_PHYSMEM_BITS definition #else #define MAX_POSSIBLE_PHYSMEM_BITS 32 #endif #endif #ifndef has_transparent_hugepage #define has_transparent_hugepage() IS_BUILTIN(CONFIG_TRANSPARENT_HUGEPAGE) #endif #ifndef has_transparent_pud_hugepage #define has_transparent_pud_hugepage() IS_BUILTIN(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) #endif /* * On some architectures it depends on the mm if the p4d/pud or pmd * layer of the page table hierarchy is folded or not. */ #ifndef mm_p4d_folded #define mm_p4d_folded(mm) __is_defined(__PAGETABLE_P4D_FOLDED) #endif #ifndef mm_pud_folded #define mm_pud_folded(mm) __is_defined(__PAGETABLE_PUD_FOLDED) #endif #ifndef mm_pmd_folded #define mm_pmd_folded(mm) __is_defined(__PAGETABLE_PMD_FOLDED) #endif #ifndef p4d_offset_lockless #define p4d_offset_lockless(pgdp, pgd, address) p4d_offset(&(pgd), address) #endif #ifndef pud_offset_lockless #define pud_offset_lockless(p4dp, p4d, address) pud_offset(&(p4d), address) #endif #ifndef pmd_offset_lockless #define pmd_offset_lockless(pudp, pud, address) pmd_offset(&(pud), address) #endif /* * p?d_leaf() - true if this entry is a final mapping to a physical address. * This differs from p?d_huge() by the fact that they are always available (if * the architecture supports large pages at the appropriate level) even * if CONFIG_HUGETLB_PAGE is not defined. * Only meaningful when called on a valid entry. */ #ifndef pgd_leaf #define pgd_leaf(x) 0 #endif #ifndef p4d_leaf #define p4d_leaf(x) 0 #endif #ifndef pud_leaf #define pud_leaf(x) 0 #endif #ifndef pmd_leaf #define pmd_leaf(x) 0 #endif #ifndef pgd_leaf_size #define pgd_leaf_size(x) (1ULL << PGDIR_SHIFT) #endif #ifndef p4d_leaf_size #define p4d_leaf_size(x) P4D_SIZE #endif #ifndef pud_leaf_size #define pud_leaf_size(x) PUD_SIZE #endif #ifndef pmd_leaf_size #define pmd_leaf_size(x) PMD_SIZE #endif #ifndef pte_leaf_size #define pte_leaf_size(x) PAGE_SIZE #endif /* * Some architectures have MMUs that are configurable or selectable at boot * time. These lead to variable PTRS_PER_x. For statically allocated arrays it * helps to have a static maximum value. */ #ifndef MAX_PTRS_PER_PTE #define MAX_PTRS_PER_PTE PTRS_PER_PTE #endif #ifndef MAX_PTRS_PER_PMD #define MAX_PTRS_PER_PMD PTRS_PER_PMD #endif #ifndef MAX_PTRS_PER_PUD #define MAX_PTRS_PER_PUD PTRS_PER_PUD #endif #ifndef MAX_PTRS_PER_P4D #define MAX_PTRS_PER_P4D PTRS_PER_P4D #endif /* description of effects of mapping type and prot in current implementation. * this is due to the limited x86 page protection hardware. The expected * behavior is in parens: * * map_type prot * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes * w: (no) no w: (no) no w: (yes) yes w: (no) no * x: (no) no x: (no) yes x: (no) yes x: (yes) yes * * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes * w: (no) no w: (no) no w: (copy) copy w: (no) no * x: (no) no x: (no) yes x: (no) yes x: (yes) yes * * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and * MAP_PRIVATE (with Enhanced PAN supported): * r: (no) no * w: (no) no * x: (yes) yes */ #define DECLARE_VM_GET_PAGE_PROT \ pgprot_t vm_get_page_prot(unsigned long vm_flags) \ { \ return protection_map[vm_flags & \ (VM_READ | VM_WRITE | VM_EXEC | VM_SHARED)]; \ } \ EXPORT_SYMBOL(vm_get_page_prot); #endif /* _LINUX_PGTABLE_H */ |
| 2 2 2 2 2 2 2 1010 1010 1010 40 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 | /* * Generic process-grouping system. * * Based originally on the cpuset system, extracted by Paul Menage * Copyright (C) 2006 Google, Inc * * Notifications support * Copyright (C) 2009 Nokia Corporation * Author: Kirill A. Shutemov * * Copyright notices from the original cpuset code: * -------------------------------------------------- * Copyright (C) 2003 BULL SA. * Copyright (C) 2004-2006 Silicon Graphics, Inc. * * Portions derived from Patrick Mochel's sysfs code. * sysfs is Copyright (c) 2001-3 Patrick Mochel * * 2003-10-10 Written by Simon Derr. * 2003-10-22 Updates by Stephen Hemminger. * 2004 May-July Rework by Paul Jackson. * --------------------------------------------------- * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of the Linux * distribution for more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "cgroup-internal.h" #include <linux/bpf-cgroup.h> #include <linux/cred.h> #include <linux/errno.h> #include <linux/init_task.h> #include <linux/kernel.h> #include <linux/magic.h> #include <linux/mutex.h> #include <linux/mount.h> #include <linux/pagemap.h> #include <linux/proc_fs.h> #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/percpu-rwsem.h> #include <linux/string.h> #include <linux/hashtable.h> #include <linux/idr.h> #include <linux/kthread.h> #include <linux/atomic.h> #include <linux/cpuset.h> #include <linux/proc_ns.h> #include <linux/nsproxy.h> #include <linux/file.h> #include <linux/fs_parser.h> #include <linux/sched/cputime.h> #include <linux/sched/deadline.h> #include <linux/psi.h> #include <net/sock.h> #define CREATE_TRACE_POINTS #include <trace/events/cgroup.h> #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ MAX_CFTYPE_NAME + 2) /* let's not notify more than 100 times per second */ #define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100) /* * To avoid confusing the compiler (and generating warnings) with code * that attempts to access what would be a 0-element array (i.e. sized * to a potentially empty array when CGROUP_SUBSYS_COUNT == 0), this * constant expression can be added. */ #define CGROUP_HAS_SUBSYS_CONFIG (CGROUP_SUBSYS_COUNT > 0) /* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. * * css_set_lock protects task->cgroups pointer, the list of css_set * objects, and the chain of tasks off each css_set. * * These locks are exported if CONFIG_PROVE_RCU so that accessors in * cgroup.h can use them for lockdep annotations. */ DEFINE_MUTEX(cgroup_mutex); DEFINE_SPINLOCK(css_set_lock); #ifdef CONFIG_PROVE_RCU EXPORT_SYMBOL_GPL(cgroup_mutex); EXPORT_SYMBOL_GPL(css_set_lock); #endif DEFINE_SPINLOCK(trace_cgroup_path_lock); char trace_cgroup_path[TRACE_CGROUP_PATH_LEN]; static bool cgroup_debug __read_mostly; /* * Protects cgroup_idr and css_idr so that IDs can be released without * grabbing cgroup_mutex. */ static DEFINE_SPINLOCK(cgroup_idr_lock); /* * Protects cgroup_file->kn for !self csses. It synchronizes notifications * against file removal/re-creation across css hiding. */ static DEFINE_SPINLOCK(cgroup_file_kn_lock); DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem); #define cgroup_assert_mutex_or_rcu_locked() \ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&cgroup_mutex), \ "cgroup_mutex or RCU read lock required"); /* * cgroup destruction makes heavy use of work items and there can be a lot * of concurrent destructions. Use a separate workqueue so that cgroup * destruction work items don't end up filling up max_active of system_wq * which may lead to deadlock. */ static struct workqueue_struct *cgroup_destroy_wq; /* generate an array of cgroup subsystem pointers */ #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, struct cgroup_subsys *cgroup_subsys[] = { #include <linux/cgroup_subsys.h> }; #undef SUBSYS /* array of cgroup subsystem names */ #define SUBSYS(_x) [_x ## _cgrp_id] = #_x, static const char *cgroup_subsys_name[] = { #include <linux/cgroup_subsys.h> }; #undef SUBSYS /* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */ #define SUBSYS(_x) \ DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key); \ DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key); \ EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key); \ EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key); #include <linux/cgroup_subsys.h> #undef SUBSYS #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key, static struct static_key_true *cgroup_subsys_enabled_key[] = { #include <linux/cgroup_subsys.h> }; #undef SUBSYS #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key, static struct static_key_true *cgroup_subsys_on_dfl_key[] = { #include <linux/cgroup_subsys.h> }; #undef SUBSYS static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu); /* the default hierarchy */ struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu }; EXPORT_SYMBOL_GPL(cgrp_dfl_root); /* * The default hierarchy always exists but is hidden until mounted for the * first time. This is for backward compatibility. */ static bool cgrp_dfl_visible; /* some controllers are not supported in the default hierarchy */ static u16 cgrp_dfl_inhibit_ss_mask; /* some controllers are implicitly enabled on the default hierarchy */ static u16 cgrp_dfl_implicit_ss_mask; /* some controllers can be threaded on the default hierarchy */ static u16 cgrp_dfl_threaded_ss_mask; /* The list of hierarchy roots */ LIST_HEAD(cgroup_roots); static int cgroup_root_count; /* hierarchy ID allocation and mapping, protected by cgroup_mutex */ static DEFINE_IDR(cgroup_hierarchy_idr); /* * Assign a monotonically increasing serial number to csses. It guarantees * cgroups with bigger numbers are newer than those with smaller numbers. * Also, as csses are always appended to the parent's ->children list, it * guarantees that sibling csses are always sorted in the ascending serial * number order on the list. Protected by cgroup_mutex. */ static u64 css_serial_nr_next = 1; /* * These bitmasks identify subsystems with specific features to avoid * having to do iterative checks repeatedly. */ static u16 have_fork_callback __read_mostly; static u16 have_exit_callback __read_mostly; static u16 have_release_callback __read_mostly; static u16 have_canfork_callback __read_mostly; /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { .ns.count = REFCOUNT_INIT(2), .user_ns = &init_user_ns, .ns.ops = &cgroupns_operations, .ns.inum = PROC_CGROUP_INIT_INO, .root_cset = &init_css_set, }; static struct file_system_type cgroup2_fs_type; static struct cftype cgroup_base_files[]; static struct cftype cgroup_psi_files[]; /* cgroup optional features */ enum cgroup_opt_features { #ifdef CONFIG_PSI OPT_FEATURE_PRESSURE, #endif OPT_FEATURE_COUNT }; static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = { #ifdef CONFIG_PSI "pressure", #endif }; static u16 cgroup_feature_disable_mask __read_mostly; static int cgroup_apply_control(struct cgroup *cgrp); static void cgroup_finalize_control(struct cgroup *cgrp, int ret); static void css_task_iter_skip(struct css_task_iter *it, struct task_struct *task); static int cgroup_destroy_locked(struct cgroup *cgrp); static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, struct cgroup_subsys *ss); static void css_release(struct percpu_ref *ref); static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype cfts[], bool is_add); #ifdef CONFIG_DEBUG_CGROUP_REF #define CGROUP_REF_FN_ATTRS noinline #define CGROUP_REF_EXPORT(fn) EXPORT_SYMBOL_GPL(fn); #include <linux/cgroup_refcnt.h> #endif /** * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID * @ssid: subsys ID of interest * * cgroup_subsys_enabled() can only be used with literal subsys names which * is fine for individual subsystems but unsuitable for cgroup core. This * is slower static_key_enabled() based test indexed by @ssid. */ bool cgroup_ssid_enabled(int ssid) { if (!CGROUP_HAS_SUBSYS_CONFIG) return false; return static_key_enabled(cgroup_subsys_enabled_key[ssid]); } /** * cgroup_on_dfl - test whether a cgroup is on the default hierarchy * @cgrp: the cgroup of interest * * The default hierarchy is the v2 interface of cgroup and this function * can be used to test whether a cgroup is on the default hierarchy for * cases where a subsystem should behave differently depending on the * interface version. * * List of changed behaviors: * * - Mount options "noprefix", "xattr", "clone_children", "release_agent" * and "name" are disallowed. * * - When mounting an existing superblock, mount options should match. * * - rename(2) is disallowed. * * - "tasks" is removed. Everything should be at process granularity. Use * "cgroup.procs" instead. * * - "cgroup.procs" is not sorted. pids will be unique unless they got * recycled in-between reads. * * - "release_agent" and "notify_on_release" are removed. Replacement * notification mechanism will be implemented. * * - "cgroup.clone_children" is removed. * * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup * and its descendants contain no task; otherwise, 1. The file also * generates kernfs notification which can be monitored through poll and * [di]notify when the value of the file changes. * * - cpuset: tasks will be kept in empty cpusets when hotplug happens and * take masks of ancestors with non-empty cpus/mems, instead of being * moved to an ancestor. * * - cpuset: a task can be moved into an empty cpuset, and again it takes * masks of ancestors. * * - blkcg: blk-throttle becomes properly hierarchical. */ bool cgroup_on_dfl(const struct cgroup *cgrp) { return cgrp->root == &cgrp_dfl_root; } /* IDR wrappers which synchronize using cgroup_idr_lock */ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask) { int ret; idr_preload(gfp_mask); spin_lock_bh(&cgroup_idr_lock); ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM); spin_unlock_bh(&cgroup_idr_lock); idr_preload_end(); return ret; } static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id) { void *ret; spin_lock_bh(&cgroup_idr_lock); ret = idr_replace(idr, ptr, id); spin_unlock_bh(&cgroup_idr_lock); return ret; } static void cgroup_idr_remove(struct idr *idr, int id) { spin_lock_bh(&cgroup_idr_lock); idr_remove(idr, id); spin_unlock_bh(&cgroup_idr_lock); } static bool cgroup_has_tasks(struct cgroup *cgrp) { return cgrp->nr_populated_csets; } static bool cgroup_is_threaded(struct cgroup *cgrp) { return cgrp->dom_cgrp != cgrp; } /* can @cgrp host both domain and threaded children? */ static bool cgroup_is_mixable(struct cgroup *cgrp) { /* * Root isn't under domain level resource control exempting it from * the no-internal-process constraint, so it can serve as a thread * root and a parent of resource domains at the same time. */ return !cgroup_parent(cgrp); } /* can @cgrp become a thread root? Should always be true for a thread root */ static bool cgroup_can_be_thread_root(struct cgroup *cgrp) { /* mixables don't care */ if (cgroup_is_mixable(cgrp)) return true; /* domain roots can't be nested under threaded */ if (cgroup_is_threaded(cgrp)) return false; /* can only have either domain or threaded children */ if (cgrp->nr_populated_domain_children) return false; /* and no domain controllers can be enabled */ if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask) return false; return true; } /* is @cgrp root of a threaded subtree? */ static bool cgroup_is_thread_root(struct cgroup *cgrp) { /* thread root should be a domain */ if (cgroup_is_threaded(cgrp)) return false; /* a domain w/ threaded children is a thread root */ if (cgrp->nr_threaded_children) return true; /* * A domain which has tasks and explicit threaded controllers * enabled is a thread root. */ if (cgroup_has_tasks(cgrp) && (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask)) return true; return false; } /* a domain which isn't connected to the root w/o brekage can't be used */ static bool cgroup_is_valid_domain(struct cgroup *cgrp) { /* the cgroup itself can be a thread root */ if (cgroup_is_threaded(cgrp)) return false; /* but the ancestors can't be unless mixable */ while ((cgrp = cgroup_parent(cgrp))) { if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp)) return false; if (cgroup_is_threaded(cgrp)) return false; } return true; } /* subsystems visibly enabled on a cgroup */ static u16 cgroup_control(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); u16 root_ss_mask = cgrp->root->subsys_mask; if (parent) { u16 ss_mask = parent->subtree_control; /* threaded cgroups can only have threaded controllers */ if (cgroup_is_threaded(cgrp)) ss_mask &= cgrp_dfl_threaded_ss_mask; return ss_mask; } if (cgroup_on_dfl(cgrp)) root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask | cgrp_dfl_implicit_ss_mask); return root_ss_mask; } /* subsystems enabled on a cgroup */ static u16 cgroup_ss_mask(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); if (parent) { u16 ss_mask = parent->subtree_ss_mask; /* threaded cgroups can only have threaded controllers */ if (cgroup_is_threaded(cgrp)) ss_mask &= cgrp_dfl_threaded_ss_mask; return ss_mask; } return cgrp->root->subsys_mask; } /** * cgroup_css - obtain a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest (%NULL returns @cgrp->self) * * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This * function must be called either under cgroup_mutex or rcu_read_lock() and * the caller is responsible for pinning the returned css if it wants to * keep accessing it outside the said locks. This function may return * %NULL if @cgrp doesn't have @subsys_id enabled. */ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { if (CGROUP_HAS_SUBSYS_CONFIG && ss) return rcu_dereference_check(cgrp->subsys[ss->id], lockdep_is_held(&cgroup_mutex)); else return &cgrp->self; } /** * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss * @cgrp: the cgroup of interest * @ss: the subsystem of interest (%NULL returns @cgrp->self) * * Similar to cgroup_css() but returns the effective css, which is defined * as the matching css of the nearest ancestor including self which has @ss * enabled. If @ss is associated with the hierarchy @cgrp is on, this * function is guaranteed to return non-NULL css. */ static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp, struct cgroup_subsys *ss) { lockdep_assert_held(&cgroup_mutex); if (!ss) return &cgrp->self; /* * This function is used while updating css associations and thus * can't test the csses directly. Test ss_mask. */ while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) { cgrp = cgroup_parent(cgrp); if (!cgrp) return NULL; } return cgroup_css(cgrp, ss); } /** * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest * * Find and get the effective css of @cgrp for @ss. The effective css is * defined as the matching css of the nearest ancestor including self which * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on, * the root css is returned, so this function always returns a valid css. * * The returned css is not guaranteed to be online, and therefore it is the * callers responsibility to try get a reference for it. */ struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; if (!CGROUP_HAS_SUBSYS_CONFIG) return NULL; do { css = cgroup_css(cgrp, ss); if (css) return css; cgrp = cgroup_parent(cgrp); } while (cgrp); return init_css_set.subsys[ss->id]; } /** * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest * * Find and get the effective css of @cgrp for @ss. The effective css is * defined as the matching css of the nearest ancestor including self which * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on, * the root css is returned, so this function always returns a valid css. * The returned css must be put using css_put(). */ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; if (!CGROUP_HAS_SUBSYS_CONFIG) return NULL; rcu_read_lock(); do { css = cgroup_css(cgrp, ss); if (css && css_tryget_online(css)) goto out_unlock; cgrp = cgroup_parent(cgrp); } while (cgrp); css = init_css_set.subsys[ss->id]; css_get(css); out_unlock: rcu_read_unlock(); return css; } EXPORT_SYMBOL_GPL(cgroup_get_e_css); static void cgroup_get_live(struct cgroup *cgrp) { WARN_ON_ONCE(cgroup_is_dead(cgrp)); cgroup_get(cgrp); } /** * __cgroup_task_count - count the number of tasks in a cgroup. The caller * is responsible for taking the css_set_lock. * @cgrp: the cgroup in question */ int __cgroup_task_count(const struct cgroup *cgrp) { int count = 0; struct cgrp_cset_link *link; lockdep_assert_held(&css_set_lock); list_for_each_entry(link, &cgrp->cset_links, cset_link) count += link->cset->nr_tasks; return count; } /** * cgroup_task_count - count the number of tasks in a cgroup. * @cgrp: the cgroup in question */ int cgroup_task_count(const struct cgroup *cgrp) { int count; spin_lock_irq(&css_set_lock); count = __cgroup_task_count(cgrp); spin_unlock_irq(&css_set_lock); return count; } struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { struct cgroup *cgrp = of->kn->parent->priv; struct cftype *cft = of_cft(of); /* * This is open and unprotected implementation of cgroup_css(). * seq_css() is only called from a kernfs file operation which has * an active reference on the file. Because all the subsystem * files are drained before a css is disassociated with a cgroup, * the matching css from the cgroup's subsys table is guaranteed to * be and stay valid until the enclosing operation is complete. */ if (CGROUP_HAS_SUBSYS_CONFIG && cft->ss) return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); else return &cgrp->self; } EXPORT_SYMBOL_GPL(of_css); /** * for_each_css - iterate all css's of a cgroup * @css: the iteration cursor * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end * @cgrp: the target cgroup to iterate css's of * * Should be called under cgroup_mutex. */ #define for_each_css(css, ssid, cgrp) \ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ if (!((css) = rcu_dereference_check( \ (cgrp)->subsys[(ssid)], \ lockdep_is_held(&cgroup_mutex)))) { } \ else /** * do_each_subsys_mask - filter for_each_subsys with a bitmask * @ss: the iteration cursor * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end * @ss_mask: the bitmask * * The block will only run for cases where the ssid-th bit (1 << ssid) of * @ss_mask is set. */ #define do_each_subsys_mask(ss, ssid, ss_mask) do { \ unsigned long __ss_mask = (ss_mask); \ if (!CGROUP_HAS_SUBSYS_CONFIG) { \ (ssid) = 0; \ break; \ } \ for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) { \ (ss) = cgroup_subsys[ssid]; \ { #define while_each_subsys_mask() \ } \ } \ } while (false) /* iterate over child cgrps, lock should be held throughout iteration */ #define cgroup_for_each_live_child(child, cgrp) \ list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ if (({ lockdep_assert_held(&cgroup_mutex); \ cgroup_is_dead(child); })) \ ; \ else /* walk live descendants in pre order */ #define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \ css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \ if (({ lockdep_assert_held(&cgroup_mutex); \ (dsct) = (d_css)->cgroup; \ cgroup_is_dead(dsct); })) \ ; \ else /* walk live descendants in postorder */ #define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \ css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \ if (({ lockdep_assert_held(&cgroup_mutex); \ (dsct) = (d_css)->cgroup; \ cgroup_is_dead(dsct); })) \ ; \ else /* * The default css_set - used by init and its children prior to any * hierarchies being mounted. It contains a pointer to the root state * for each subsystem. Also used to anchor the list of css_sets. Not * reference-counted, to improve performance when child cgroups * haven't been created. */ struct css_set init_css_set = { .refcount = REFCOUNT_INIT(1), .dom_cset = &init_css_set, .tasks = LIST_HEAD_INIT(init_css_set.tasks), .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks), .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), .mg_src_preload_node = LIST_HEAD_INIT(init_css_set.mg_src_preload_node), .mg_dst_preload_node = LIST_HEAD_INIT(init_css_set.mg_dst_preload_node), .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), /* * The following field is re-initialized when this cset gets linked * in cgroup_init(). However, let's initialize the field * statically too so that the default cgroup can be accessed safely * early during boot. */ .dfl_cgrp = &cgrp_dfl_root.cgrp, }; static int css_set_count = 1; /* 1 for init_css_set */ static bool css_set_threaded(struct css_set *cset) { return cset->dom_cset != cset; } /** * css_set_populated - does a css_set contain any tasks? * @cset: target css_set * * css_set_populated() should be the same as !!cset->nr_tasks at steady * state. However, css_set_populated() can be called while a task is being * added to or removed from the linked list before the nr_tasks is * properly updated. Hence, we can't just look at ->nr_tasks here. */ static bool css_set_populated(struct css_set *cset) { lockdep_assert_held(&css_set_lock); return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks); } /** * cgroup_update_populated - update the populated count of a cgroup * @cgrp: the target cgroup * @populated: inc or dec populated count * * One of the css_sets associated with @cgrp is either getting its first * task or losing the last. Update @cgrp->nr_populated_* accordingly. The * count is propagated towards root so that a given cgroup's * nr_populated_children is zero iff none of its descendants contain any * tasks. * * @cgrp's interface file "cgroup.populated" is zero if both * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and * 1 otherwise. When the sum changes from or to zero, userland is notified * that the content of the interface file has changed. This can be used to * detect when @cgrp and its descendants become populated or empty. */ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) { struct cgroup *child = NULL; int adj = populated ? 1 : -1; lockdep_assert_held(&css_set_lock); do { bool was_populated = cgroup_is_populated(cgrp); if (!child) { cgrp->nr_populated_csets += adj; } else { if (cgroup_is_threaded(child)) cgrp->nr_populated_threaded_children += adj; else cgrp->nr_populated_domain_children += adj; } if (was_populated == cgroup_is_populated(cgrp)) break; cgroup1_check_for_release(cgrp); TRACE_CGROUP_PATH(notify_populated, cgrp, cgroup_is_populated(cgrp)); cgroup_file_notify(&cgrp->events_file); child = cgrp; cgrp = cgroup_parent(cgrp); } while (cgrp); } /** * css_set_update_populated - update populated state of a css_set * @cset: target css_set * @populated: whether @cset is populated or depopulated * * @cset is either getting the first task or losing the last. Update the * populated counters of all associated cgroups accordingly. */ static void css_set_update_populated(struct css_set *cset, bool populated) { struct cgrp_cset_link *link; lockdep_assert_held(&css_set_lock); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) cgroup_update_populated(link->cgrp, populated); } /* * @task is leaving, advance task iterators which are pointing to it so * that they can resume at the next position. Advancing an iterator might * remove it from the list, use safe walk. See css_task_iter_skip() for * details. */ static void css_set_skip_task_iters(struct css_set *cset, struct task_struct *task) { struct css_task_iter *it, *pos; list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node) css_task_iter_skip(it, task); } /** * css_set_move_task - move a task from one css_set to another * @task: task being moved * @from_cset: css_set @task currently belongs to (may be NULL) * @to_cset: new css_set @task is being moved to (may be NULL) * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks * * Move @task from @from_cset to @to_cset. If @task didn't belong to any * css_set, @from_cset can be NULL. If @task is being disassociated * instead of moved, @to_cset can be NULL. * * This function automatically handles populated counter updates and * css_task_iter adjustments but the caller is responsible for managing * @from_cset and @to_cset's reference counts. */ static void css_set_move_task(struct task_struct *task, struct css_set *from_cset, struct css_set *to_cset, bool use_mg_tasks) { lockdep_assert_held(&css_set_lock); if (to_cset && !css_set_populated(to_cset)) css_set_update_populated(to_cset, true); if (from_cset) { WARN_ON_ONCE(list_empty(&task->cg_list)); css_set_skip_task_iters(from_cset, task); list_del_init(&task->cg_list); if (!css_set_populated(from_cset)) css_set_update_populated(from_cset, false); } else { WARN_ON_ONCE(!list_empty(&task->cg_list)); } if (to_cset) { /* * We are synchronized through cgroup_threadgroup_rwsem * against PF_EXITING setting such that we can't race * against cgroup_exit()/cgroup_free() dropping the css_set. */ WARN_ON_ONCE(task->flags & PF_EXITING); cgroup_move_task(task, to_cset); list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks : &to_cset->tasks); } } /* * hash table for cgroup groups. This improves the performance to find * an existing css_set. This hash doesn't (currently) take into * account cgroups in empty hierarchies. */ #define CSS_SET_HASH_BITS 7 static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS); static unsigned long css_set_hash(struct cgroup_subsys_state **css) { unsigned long key = 0UL; struct cgroup_subsys *ss; int i; for_each_subsys(ss, i) key += (unsigned long)css[i]; key = (key >> 16) ^ key; return key; } void put_css_set_locked(struct css_set *cset) { struct cgrp_cset_link *link, *tmp_link; struct cgroup_subsys *ss; int ssid; lockdep_assert_held(&css_set_lock); if (!refcount_dec_and_test(&cset->refcount)) return; WARN_ON_ONCE(!list_empty(&cset->threaded_csets)); /* This css_set is dead. Unlink it and release cgroup and css refs */ for_each_subsys(ss, ssid) { list_del(&cset->e_cset_node[ssid]); css_put(cset->subsys[ssid]); } hash_del(&cset->hlist); css_set_count--; list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) { list_del(&link->cset_link); list_del(&link->cgrp_link); if (cgroup_parent(link->cgrp)) cgroup_put(link->cgrp); kfree(link); } if (css_set_threaded(cset)) { list_del(&cset->threaded_csets_node); put_css_set_locked(cset->dom_cset); } kfree_rcu(cset, rcu_head); } /** * compare_css_sets - helper function for find_existing_css_set(). * @cset: candidate css_set being tested * @old_cset: existing css_set for a task * @new_cgrp: cgroup that's being entered by the task * @template: desired set of css pointers in css_set (pre-calculated) * * Returns true if "cset" matches "old_cset" except for the hierarchy * which "new_cgrp" belongs to, for which it should match "new_cgrp". */ static bool compare_css_sets(struct css_set *cset, struct css_set *old_cset, struct cgroup *new_cgrp, struct cgroup_subsys_state *template[]) { struct cgroup *new_dfl_cgrp; struct list_head *l1, *l2; /* * On the default hierarchy, there can be csets which are * associated with the same set of cgroups but different csses. * Let's first ensure that csses match. */ if (memcmp(template, cset->subsys, sizeof(cset->subsys))) return false; /* @cset's domain should match the default cgroup's */ if (cgroup_on_dfl(new_cgrp)) new_dfl_cgrp = new_cgrp; else new_dfl_cgrp = old_cset->dfl_cgrp; if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp) return false; /* * Compare cgroup pointers in order to distinguish between * different cgroups in hierarchies. As different cgroups may * share the same effective css, this comparison is always * necessary. */ l1 = &cset->cgrp_links; l2 = &old_cset->cgrp_links; while (1) { struct cgrp_cset_link *link1, *link2; struct cgroup *cgrp1, *cgrp2; l1 = l1->next; l2 = l2->next; /* See if we reached the end - both lists are equal length. */ if (l1 == &cset->cgrp_links) { BUG_ON(l2 != &old_cset->cgrp_links); break; } else { BUG_ON(l2 == &old_cset->cgrp_links); } /* Locate the cgroups associated with these links. */ link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link); link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link); cgrp1 = link1->cgrp; cgrp2 = link2->cgrp; /* Hierarchies should be linked in the same order. */ BUG_ON(cgrp1->root != cgrp2->root); /* * If this hierarchy is the hierarchy of the cgroup * that's changing, then we need to check that this * css_set points to the new cgroup; if it's any other * hierarchy, then this css_set should point to the * same cgroup as the old css_set. */ if (cgrp1->root == new_cgrp->root) { if (cgrp1 != new_cgrp) return false; } else { if (cgrp1 != cgrp2) return false; } } return true; } /** * find_existing_css_set - init css array and find the matching css_set * @old_cset: the css_set that we're using before the cgroup transition * @cgrp: the cgroup that we're moving into * @template: out param for the new set of csses, should be clear on entry */ static struct css_set *find_existing_css_set(struct css_set *old_cset, struct cgroup *cgrp, struct cgroup_subsys_state **template) { struct cgroup_root *root = cgrp->root; struct cgroup_subsys *ss; struct css_set *cset; unsigned long key; int i; /* * Build the set of subsystem state objects that we want to see in the * new css_set. While subsystems can change globally, the entries here * won't change, so no need for locking. */ for_each_subsys(ss, i) { if (root->subsys_mask & (1UL << i)) { /* * @ss is in this hierarchy, so we want the * effective css from @cgrp. */ template[i] = cgroup_e_css_by_mask(cgrp, ss); } else { /* * @ss is not in this hierarchy, so we don't want * to change the css. */ template[i] = old_cset->subsys[i]; } } key = css_set_hash(template); hash_for_each_possible(css_set_table, cset, hlist, key) { if (!compare_css_sets(cset, old_cset, cgrp, template)) continue; /* This css_set matches what we need */ return cset; } /* No existing cgroup group matched */ return NULL; } static void free_cgrp_cset_links(struct list_head *links_to_free) { struct cgrp_cset_link *link, *tmp_link; list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) { list_del(&link->cset_link); kfree(link); } } /** * allocate_cgrp_cset_links - allocate cgrp_cset_links * @count: the number of links to allocate * @tmp_links: list_head the allocated links are put on * * Allocate @count cgrp_cset_link structures and chain them on @tmp_links * through ->cset_link. Returns 0 on success or -errno. */ static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links) { struct cgrp_cset_link *link; int i; INIT_LIST_HEAD(tmp_links); for (i = 0; i < count; i++) { link = kzalloc(sizeof(*link), GFP_KERNEL); if (!link) { free_cgrp_cset_links(tmp_links); return -ENOMEM; } list_add(&link->cset_link, tmp_links); } return 0; } /** * link_css_set - a helper function to link a css_set to a cgroup * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links() * @cset: the css_set to be linked * @cgrp: the destination cgroup */ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, struct cgroup *cgrp) { struct cgrp_cset_link *link; BUG_ON(list_empty(tmp_links)); if (cgroup_on_dfl(cgrp)) cset->dfl_cgrp = cgrp; link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); link->cset = cset; link->cgrp = cgrp; /* * Always add links to the tail of the lists so that the lists are * in chronological order. */ list_move_tail(&link->cset_link, &cgrp->cset_links); list_add_tail(&link->cgrp_link, &cset->cgrp_links); if (cgroup_parent(cgrp)) cgroup_get_live(cgrp); } /** * find_css_set - return a new css_set with one cgroup updated * @old_cset: the baseline css_set * @cgrp: the cgroup to be updated * * Return a new css_set that's equivalent to @old_cset, but with @cgrp * substituted into the appropriate hierarchy. */ static struct css_set *find_css_set(struct css_set *old_cset, struct cgroup *cgrp) { struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { }; struct css_set *cset; struct list_head tmp_links; struct cgrp_cset_link *link; struct cgroup_subsys *ss; unsigned long key; int ssid; lockdep_assert_held(&cgroup_mutex); /* First see if we already have a cgroup group that matches * the desired set */ spin_lock_irq(&css_set_lock); cset = find_existing_css_set(old_cset, cgrp, template); if (cset) get_css_set(cset); spin_unlock_irq(&css_set_lock); if (cset) return cset; cset = kzalloc(sizeof(*cset), GFP_KERNEL); if (!cset) return NULL; /* Allocate all the cgrp_cset_link objects that we'll need */ if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) { kfree(cset); return NULL; } refcount_set(&cset->refcount, 1); cset->dom_cset = cset; INIT_LIST_HEAD(&cset->tasks); INIT_LIST_HEAD(&cset->mg_tasks); INIT_LIST_HEAD(&cset->dying_tasks); INIT_LIST_HEAD(&cset->task_iters); INIT_LIST_HEAD(&cset->threaded_csets); INIT_HLIST_NODE(&cset->hlist); INIT_LIST_HEAD(&cset->cgrp_links); INIT_LIST_HEAD(&cset->mg_src_preload_node); INIT_LIST_HEAD(&cset->mg_dst_preload_node); INIT_LIST_HEAD(&cset->mg_node); /* Copy the set of subsystem state objects generated in * find_existing_css_set() */ memcpy(cset->subsys, template, sizeof(cset->subsys)); spin_lock_irq(&css_set_lock); /* Add reference counts and links from the new css_set. */ list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; if (c->root == cgrp->root) c = cgrp; link_css_set(&tmp_links, cset, c); } BUG_ON(!list_empty(&tmp_links)); css_set_count++; /* Add @cset to the hash table */ key = css_set_hash(cset->subsys); hash_add(css_set_table, &cset->hlist, key); for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cset->subsys[ssid]; list_add_tail(&cset->e_cset_node[ssid], &css->cgroup->e_csets[ssid]); css_get(css); } spin_unlock_irq(&css_set_lock); /* * If @cset should be threaded, look up the matching dom_cset and * link them up. We first fully initialize @cset then look for the * dom_cset. It's simpler this way and safe as @cset is guaranteed * to stay empty until we return. */ if (cgroup_is_threaded(cset->dfl_cgrp)) { struct css_set *dcset; dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp); if (!dcset) { put_css_set(cset); return NULL; } spin_lock_irq(&css_set_lock); cset->dom_cset = dcset; list_add_tail(&cset->threaded_csets_node, &dcset->threaded_csets); spin_unlock_irq(&css_set_lock); } return cset; } struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) { struct cgroup *root_cgrp = kernfs_root_to_node(kf_root)->priv; return root_cgrp->root; } void cgroup_favor_dynmods(struct cgroup_root *root, bool favor) { bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS; /* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */ if (favor && !favoring) { rcu_sync_enter(&cgroup_threadgroup_rwsem.rss); root->flags |= CGRP_ROOT_FAVOR_DYNMODS; } else if (!favor && favoring) { rcu_sync_exit(&cgroup_threadgroup_rwsem.rss); root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS; } } static int cgroup_init_root_id(struct cgroup_root *root) { int id; lockdep_assert_held(&cgroup_mutex); id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL); if (id < 0) return id; root->hierarchy_id = id; return 0; } static void cgroup_exit_root_id(struct cgroup_root *root) { lockdep_assert_held(&cgroup_mutex); idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); } void cgroup_free_root(struct cgroup_root *root) { kfree(root); } static void cgroup_destroy_root(struct cgroup_root *root) { struct cgroup *cgrp = &root->cgrp; struct cgrp_cset_link *link, *tmp_link; trace_cgroup_destroy_root(root); cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); BUG_ON(atomic_read(&root->nr_cgrps)); BUG_ON(!list_empty(&cgrp->self.children)); /* Rebind all subsystems back to the default hierarchy */ WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask)); /* * Release all the links from cset_links to this hierarchy's * root cgroup */ spin_lock_irq(&css_set_lock); list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { list_del(&link->cset_link); list_del(&link->cgrp_link); kfree(link); } spin_unlock_irq(&css_set_lock); if (!list_empty(&root->root_list)) { list_del(&root->root_list); cgroup_root_count--; } cgroup_favor_dynmods(root, false); cgroup_exit_root_id(root); cgroup_unlock(); cgroup_rstat_exit(cgrp); kernfs_destroy_root(root->kf_root); cgroup_free_root(root); } /* * Returned cgroup is without refcount but it's valid as long as cset pins it. */ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root) { struct cgroup *res_cgroup = NULL; if (cset == &init_css_set) { res_cgroup = &root->cgrp; } else if (root == &cgrp_dfl_root) { res_cgroup = cset->dfl_cgrp; } else { struct cgrp_cset_link *link; lockdep_assert_held(&css_set_lock); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; if (c->root == root) { res_cgroup = c; break; } } } BUG_ON(!res_cgroup); return res_cgroup; } /* * look up cgroup associated with current task's cgroup namespace on the * specified hierarchy */ static struct cgroup * current_cgns_cgroup_from_root(struct cgroup_root *root) { struct cgroup *res = NULL; struct css_set *cset; lockdep_assert_held(&css_set_lock); rcu_read_lock(); cset = current->nsproxy->cgroup_ns->root_cset; res = __cset_cgroup_from_root(cset, root); rcu_read_unlock(); return res; } /* * Look up cgroup associated with current task's cgroup namespace on the default * hierarchy. * * Unlike current_cgns_cgroup_from_root(), this doesn't need locks: * - Internal rcu_read_lock is unnecessary because we don't dereference any rcu * pointers. * - css_set_lock is not needed because we just read cset->dfl_cgrp. * - As a bonus returned cgrp is pinned with the current because it cannot * switch cgroup_ns asynchronously. */ static struct cgroup *current_cgns_cgroup_dfl(void) { struct css_set *cset; if (current->nsproxy) { cset = current->nsproxy->cgroup_ns->root_cset; return __cset_cgroup_from_root(cset, &cgrp_dfl_root); } else { /* * NOTE: This function may be called from bpf_cgroup_from_id() * on a task which has already passed exit_task_namespaces() and * nsproxy == NULL. Fall back to cgrp_dfl_root which will make all * cgroups visible for lookups. */ return &cgrp_dfl_root.cgrp; } } /* look up cgroup associated with given css_set on the specified hierarchy */ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root) { lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&css_set_lock); return __cset_cgroup_from_root(cset, root); } /* * Return the cgroup for "task" from the given hierarchy. Must be * called with cgroup_mutex and css_set_lock held. */ struct cgroup *task_cgroup_from_root(struct task_struct *task, struct cgroup_root *root) { /* * No need to lock the task - since we hold css_set_lock the * task can't change groups. */ return cset_cgroup_from_root(task_css_set(task), root); } /* * A task must hold cgroup_mutex to modify cgroups. * * Any task can increment and decrement the count field without lock. * So in general, code holding cgroup_mutex can't rely on the count * field not changing. However, if the count goes to zero, then only * cgroup_attach_task() can increment it again. Because a count of zero * means that no tasks are currently attached, therefore there is no * way a task attached to that cgroup can fork (the other way to * increment the count). So code holding cgroup_mutex can safely * assume that if the count is zero, it will stay zero. Similarly, if * a task holds cgroup_mutex on a cgroup with zero count, it * knows that the cgroup won't be removed, as cgroup_rmdir() * needs that mutex. * * A cgroup can only be deleted if both its 'count' of using tasks * is zero, and its list of 'children' cgroups is empty. Since all * tasks in the system use _some_ cgroup, and since there is always at * least one task in the system (init, pid == 1), therefore, root cgroup * always has either children cgroups and/or using tasks. So we don't * need a special hack to ensure that root cgroup cannot be deleted. * * P.S. One more locking exception. RCU is used to guard the * update of a tasks cgroup pointer by cgroup_attach_task() */ static struct kernfs_syscall_ops cgroup_kf_syscall_ops; static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, char *buf) { struct cgroup_subsys *ss = cft->ss; if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { const char *dbg = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : ""; snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s", dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, cft->name); } else { strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX); } return buf; } /** * cgroup_file_mode - deduce file mode of a control file * @cft: the control file in question * * S_IRUGO for read, S_IWUSR for write. */ static umode_t cgroup_file_mode(const struct cftype *cft) { umode_t mode = 0; if (cft->read_u64 || cft->read_s64 || cft->seq_show) mode |= S_IRUGO; if (cft->write_u64 || cft->write_s64 || cft->write) { if (cft->flags & CFTYPE_WORLD_WRITABLE) mode |= S_IWUGO; else mode |= S_IWUSR; } return mode; } /** * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask * @subtree_control: the new subtree_control mask to consider * @this_ss_mask: available subsystems * * On the default hierarchy, a subsystem may request other subsystems to be * enabled together through its ->depends_on mask. In such cases, more * subsystems than specified in "cgroup.subtree_control" may be enabled. * * This function calculates which subsystems need to be enabled if * @subtree_control is to be applied while restricted to @this_ss_mask. */ static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask) { u16 cur_ss_mask = subtree_control; struct cgroup_subsys *ss; int ssid; lockdep_assert_held(&cgroup_mutex); cur_ss_mask |= cgrp_dfl_implicit_ss_mask; while (true) { u16 new_ss_mask = cur_ss_mask; do_each_subsys_mask(ss, ssid, cur_ss_mask) { new_ss_mask |= ss->depends_on; } while_each_subsys_mask(); /* * Mask out subsystems which aren't available. This can * happen only if some depended-upon subsystems were bound * to non-default hierarchies. */ new_ss_mask &= this_ss_mask; if (new_ss_mask == cur_ss_mask) break; cur_ss_mask = new_ss_mask; } return cur_ss_mask; } /** * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods * @kn: the kernfs_node being serviced * * This helper undoes cgroup_kn_lock_live() and should be invoked before * the method finishes if locking succeeded. Note that once this function * returns the cgroup returned by cgroup_kn_lock_live() may become * inaccessible any time. If the caller intends to continue to access the * cgroup, it should pin it before invoking this function. */ void cgroup_kn_unlock(struct kernfs_node *kn) { struct cgroup *cgrp; if (kernfs_type(kn) == KERNFS_DIR) cgrp = kn->priv; else cgrp = kn->parent->priv; cgroup_unlock(); kernfs_unbreak_active_protection(kn); cgroup_put(cgrp); } /** * cgroup_kn_lock_live - locking helper for cgroup kernfs methods * @kn: the kernfs_node being serviced * @drain_offline: perform offline draining on the cgroup * * This helper is to be used by a cgroup kernfs method currently servicing * @kn. It breaks the active protection, performs cgroup locking and * verifies that the associated cgroup is alive. Returns the cgroup if * alive; otherwise, %NULL. A successful return should be undone by a * matching cgroup_kn_unlock() invocation. If @drain_offline is %true, the * cgroup is drained of offlining csses before return. * * Any cgroup kernfs method implementation which requires locking the * associated cgroup should use this helper. It avoids nesting cgroup * locking under kernfs active protection and allows all kernfs operations * including self-removal. */ struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline) { struct cgroup *cgrp; if (kernfs_type(kn) == KERNFS_DIR) cgrp = kn->priv; else cgrp = kn->parent->priv; /* * We're gonna grab cgroup_mutex which nests outside kernfs * active_ref. cgroup liveliness check alone provides enough * protection against removal. Ensure @cgrp stays accessible and * break the active_ref protection. */ if (!cgroup_tryget(cgrp)) return NULL; kernfs_break_active_protection(kn); if (drain_offline) cgroup_lock_and_drain_offline(cgrp); else cgroup_lock(); if (!cgroup_is_dead(cgrp)) return cgrp; cgroup_kn_unlock(kn); return NULL; } static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) { char name[CGROUP_FILE_NAME_MAX]; lockdep_assert_held(&cgroup_mutex); if (cft->file_offset) { struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss); struct cgroup_file *cfile = (void *)css + cft->file_offset; spin_lock_irq(&cgroup_file_kn_lock); cfile->kn = NULL; spin_unlock_irq(&cgroup_file_kn_lock); del_timer_sync(&cfile->notify_timer); } kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); } /** * css_clear_dir - remove subsys files in a cgroup directory * @css: target css */ static void css_clear_dir(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; struct cftype *cfts; if (!(css->flags & CSS_VISIBLE)) return; css->flags &= ~CSS_VISIBLE; if (!css->ss) { if (cgroup_on_dfl(cgrp)) { cgroup_addrm_files(css, cgrp, cgroup_base_files, false); if (cgroup_psi_enabled()) cgroup_addrm_files(css, cgrp, cgroup_psi_files, false); } else { cgroup_addrm_files(css, cgrp, cgroup1_base_files, false); } } else { list_for_each_entry(cfts, &css->ss->cfts, node) cgroup_addrm_files(css, cgrp, cfts, false); } } /** * css_populate_dir - create subsys files in a cgroup directory * @css: target css * * On failure, no file is added. */ static int css_populate_dir(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; struct cftype *cfts, *failed_cfts; int ret; if (css->flags & CSS_VISIBLE) return 0; if (!css->ss) { if (cgroup_on_dfl(cgrp)) { ret = cgroup_addrm_files(&cgrp->self, cgrp, cgroup_base_files, true); if (ret < 0) return ret; if (cgroup_psi_enabled()) { ret = cgroup_addrm_files(&cgrp->self, cgrp, cgroup_psi_files, true); if (ret < 0) return ret; } } else { cgroup_addrm_files(css, cgrp, cgroup1_base_files, true); } } else { list_for_each_entry(cfts, &css->ss->cfts, node) { ret = cgroup_addrm_files(css, cgrp, cfts, true); if (ret < 0) { failed_cfts = cfts; goto err; } } } css->flags |= CSS_VISIBLE; return 0; err: list_for_each_entry(cfts, &css->ss->cfts, node) { if (cfts == failed_cfts) break; cgroup_addrm_files(css, cgrp, cfts, false); } return ret; } int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) { struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; int ssid, ret; u16 dfl_disable_ss_mask = 0; lockdep_assert_held(&cgroup_mutex); do_each_subsys_mask(ss, ssid, ss_mask) { /* * If @ss has non-root csses attached to it, can't move. * If @ss is an implicit controller, it is exempt from this * rule and can be stolen. */ if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) && !ss->implicit_on_dfl) return -EBUSY; /* can't move between two non-dummy roots either */ if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root) return -EBUSY; /* * Collect ssid's that need to be disabled from default * hierarchy. */ if (ss->root == &cgrp_dfl_root) dfl_disable_ss_mask |= 1 << ssid; } while_each_subsys_mask(); if (dfl_disable_ss_mask) { struct cgroup *scgrp = &cgrp_dfl_root.cgrp; /* * Controllers from default hierarchy that need to be rebound * are all disabled together in one go. */ cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask; WARN_ON(cgroup_apply_control(scgrp)); cgroup_finalize_control(scgrp, 0); } do_each_subsys_mask(ss, ssid, ss_mask) { struct cgroup_root *src_root = ss->root; struct cgroup *scgrp = &src_root->cgrp; struct cgroup_subsys_state *css = cgroup_css(scgrp, ss); struct css_set *cset, *cset_pos; struct css_task_iter *it; WARN_ON(!css || cgroup_css(dcgrp, ss)); if (src_root != &cgrp_dfl_root) { /* disable from the source */ src_root->subsys_mask &= ~(1 << ssid); WARN_ON(cgroup_apply_control(scgrp)); cgroup_finalize_control(scgrp, 0); } /* rebind */ RCU_INIT_POINTER(scgrp->subsys[ssid], NULL); rcu_assign_pointer(dcgrp->subsys[ssid], css); ss->root = dst_root; css->cgroup = dcgrp; spin_lock_irq(&css_set_lock); WARN_ON(!list_empty(&dcgrp->e_csets[ss->id])); list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id], e_cset_node[ss->id]) { list_move_tail(&cset->e_cset_node[ss->id], &dcgrp->e_csets[ss->id]); /* * all css_sets of scgrp together in same order to dcgrp, * patch in-flight iterators to preserve correct iteration. * since the iterator is always advanced right away and * finished when it->cset_pos meets it->cset_head, so only * update it->cset_head is enough here. */ list_for_each_entry(it, &cset->task_iters, iters_node) if (it->cset_head == &scgrp->e_csets[ss->id]) it->cset_head = &dcgrp->e_csets[ss->id]; } spin_unlock_irq(&css_set_lock); if (ss->css_rstat_flush) { list_del_rcu(&css->rstat_css_node); synchronize_rcu(); list_add_rcu(&css->rstat_css_node, &dcgrp->rstat_css_list); } /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; if (dst_root == &cgrp_dfl_root) { static_branch_enable(cgroup_subsys_on_dfl_key[ssid]); } else { dcgrp->subtree_control |= 1 << ssid; static_branch_disable(cgroup_subsys_on_dfl_key[ssid]); } ret = cgroup_apply_control(dcgrp); if (ret) pr_warn("partial failure to rebind %s controller (err=%d)\n", ss->name, ret); if (ss->bind) ss->bind(css); } while_each_subsys_mask(); kernfs_activate(dcgrp->kn); return 0; } int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, struct kernfs_root *kf_root) { int len = 0; char *buf = NULL; struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root); struct cgroup *ns_cgroup; buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) return -ENOMEM; spin_lock_irq(&css_set_lock); ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot); len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX); spin_unlock_irq(&css_set_lock); if (len >= PATH_MAX) len = -ERANGE; else if (len > 0) { seq_escape(sf, buf, " \t\n\\"); len = 0; } kfree(buf); return len; } enum cgroup2_param { Opt_nsdelegate, Opt_favordynmods, Opt_memory_localevents, Opt_memory_recursiveprot, nr__cgroup2_params }; static const struct fs_parameter_spec cgroup2_fs_parameters[] = { fsparam_flag("nsdelegate", Opt_nsdelegate), fsparam_flag("favordynmods", Opt_favordynmods), fsparam_flag("memory_localevents", Opt_memory_localevents), fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot), {} }; static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); struct fs_parse_result result; int opt; opt = fs_parse(fc, cgroup2_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_nsdelegate: ctx->flags |= CGRP_ROOT_NS_DELEGATE; return 0; case Opt_favordynmods: ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; return 0; case Opt_memory_localevents: ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; return 0; case Opt_memory_recursiveprot: ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT; return 0; } return -EINVAL; } static void apply_cgroup_root_flags(unsigned int root_flags) { if (current->nsproxy->cgroup_ns == &init_cgroup_ns) { if (root_flags & CGRP_ROOT_NS_DELEGATE) cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE; else cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE; cgroup_favor_dynmods(&cgrp_dfl_root, root_flags & CGRP_ROOT_FAVOR_DYNMODS); if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; else cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS; if (root_flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT) cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT; else cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT; } } static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root) { if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) seq_puts(seq, ",nsdelegate"); if (cgrp_dfl_root.flags & CGRP_ROOT_FAVOR_DYNMODS) seq_puts(seq, ",favordynmods"); if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) seq_puts(seq, ",memory_localevents"); if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT) seq_puts(seq, ",memory_recursiveprot"); return 0; } static int cgroup_reconfigure(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); apply_cgroup_root_flags(ctx->flags); return 0; } static void init_cgroup_housekeeping(struct cgroup *cgrp) { struct cgroup_subsys *ss; int ssid; INIT_LIST_HEAD(&cgrp->self.sibling); INIT_LIST_HEAD(&cgrp->self.children); INIT_LIST_HEAD(&cgrp->cset_links); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); cgrp->self.cgroup = cgrp; cgrp->self.flags |= CSS_ONLINE; cgrp->dom_cgrp = cgrp; cgrp->max_descendants = INT_MAX; cgrp->max_depth = INT_MAX; INIT_LIST_HEAD(&cgrp->rstat_css_list); prev_cputime_init(&cgrp->prev_cputime); for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); init_waitqueue_head(&cgrp->offline_waitq); INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent); } void init_cgroup_root(struct cgroup_fs_context *ctx) { struct cgroup_root *root = ctx->root; struct cgroup *cgrp = &root->cgrp; INIT_LIST_HEAD(&root->root_list); atomic_set(&root->nr_cgrps, 1); cgrp->root = root; init_cgroup_housekeeping(cgrp); /* DYNMODS must be modified through cgroup_favor_dynmods() */ root->flags = ctx->flags & ~CGRP_ROOT_FAVOR_DYNMODS; if (ctx->release_agent) strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX); if (ctx->name) strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN); if (ctx->cpuset_clone_children) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; struct kernfs_syscall_ops *kf_sops; struct css_set *cset; int i, ret; lockdep_assert_held(&cgroup_mutex); ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, GFP_KERNEL); if (ret) goto out; /* * We're accessing css_set_count without locking css_set_lock here, * but that's OK - it can only be increased by someone holding * cgroup_lock, and that's us. Later rebinding may disable * controllers on the default hierarchy and thus create new csets, * which can't be more than the existing ones. Allocate 2x. */ ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links); if (ret) goto cancel_ref; ret = cgroup_init_root_id(root); if (ret) goto cancel_ref; kf_sops = root == &cgrp_dfl_root ? &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops; root->kf_root = kernfs_create_root(kf_sops, KERNFS_ROOT_CREATE_DEACTIVATED | KERNFS_ROOT_SUPPORT_EXPORTOP | KERNFS_ROOT_SUPPORT_USER_XATTR, root_cgrp); if (IS_ERR(root->kf_root)) { ret = PTR_ERR(root->kf_root); goto exit_root_id; } root_cgrp->kn = kernfs_root_to_node(root->kf_root); WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1); root_cgrp->ancestors[0] = root_cgrp; ret = css_populate_dir(&root_cgrp->self); if (ret) goto destroy_root; ret = cgroup_rstat_init(root_cgrp); if (ret) goto destroy_root; ret = rebind_subsystems(root, ss_mask); if (ret) goto exit_stats; ret = cgroup_bpf_inherit(root_cgrp); WARN_ON_ONCE(ret); trace_cgroup_setup_root(root); /* * There must be no failure case after here, since rebinding takes * care of subsystems' refcounts, which are explicitly dropped in * the failure exit path. */ list_add(&root->root_list, &cgroup_roots); cgroup_root_count++; /* * Link the root cgroup in this hierarchy into all the css_set * objects. */ spin_lock_irq(&css_set_lock); hash_for_each(css_set_table, i, cset, hlist) { link_css_set(&tmp_links, cset, root_cgrp); if (css_set_populated(cset)) cgroup_update_populated(root_cgrp, true); } spin_unlock_irq(&css_set_lock); BUG_ON(!list_empty(&root_cgrp->self.children)); BUG_ON(atomic_read(&root->nr_cgrps) != 1); ret = 0; goto out; exit_stats: cgroup_rstat_exit(root_cgrp); destroy_root: kernfs_destroy_root(root->kf_root); root->kf_root = NULL; exit_root_id: cgroup_exit_root_id(root); cancel_ref: percpu_ref_exit(&root_cgrp->self.refcnt); out: free_cgrp_cset_links(&tmp_links); return ret; } int cgroup_do_get_tree(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); int ret; ctx->kfc.root = ctx->root->kf_root; if (fc->fs_type == &cgroup2_fs_type) ctx->kfc.magic = CGROUP2_SUPER_MAGIC; else ctx->kfc.magic = CGROUP_SUPER_MAGIC; ret = kernfs_get_tree(fc); /* * In non-init cgroup namespace, instead of root cgroup's dentry, * we return the dentry corresponding to the cgroupns->root_cgrp. */ if (!ret && ctx->ns != &init_cgroup_ns) { struct dentry *nsdentry; struct super_block *sb = fc->root->d_sb; struct cgroup *cgrp; cgroup_lock(); spin_lock_irq(&css_set_lock); cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root); spin_unlock_irq(&css_set_lock); cgroup_unlock(); nsdentry = kernfs_node_dentry(cgrp->kn, sb); dput(fc->root); if (IS_ERR(nsdentry)) { deactivate_locked_super(sb); ret = PTR_ERR(nsdentry); nsdentry = NULL; } fc->root = nsdentry; } if (!ctx->kfc.new_sb_created) cgroup_put(&ctx->root->cgrp); return ret; } /* * Destroy a cgroup filesystem context. */ static void cgroup_fs_context_free(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); kfree(ctx->name); kfree(ctx->release_agent); put_cgroup_ns(ctx->ns); kernfs_free_fs_context(fc); kfree(ctx); } static int cgroup_get_tree(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); int ret; WRITE_ONCE(cgrp_dfl_visible, true); cgroup_get_live(&cgrp_dfl_root.cgrp); ctx->root = &cgrp_dfl_root; ret = cgroup_do_get_tree(fc); if (!ret) apply_cgroup_root_flags(ctx->flags); return ret; } static const struct fs_context_operations cgroup_fs_context_ops = { .free = cgroup_fs_context_free, .parse_param = cgroup2_parse_param, .get_tree = cgroup_get_tree, .reconfigure = cgroup_reconfigure, }; static const struct fs_context_operations cgroup1_fs_context_ops = { .free = cgroup_fs_context_free, .parse_param = cgroup1_parse_param, .get_tree = cgroup1_get_tree, .reconfigure = cgroup1_reconfigure, }; /* * Initialise the cgroup filesystem creation/reconfiguration context. Notably, * we select the namespace we're going to use. */ static int cgroup_init_fs_context(struct fs_context *fc) { struct cgroup_fs_context *ctx; ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->ns = current->nsproxy->cgroup_ns; get_cgroup_ns(ctx->ns); fc->fs_private = &ctx->kfc; if (fc->fs_type == &cgroup2_fs_type) fc->ops = &cgroup_fs_context_ops; else fc->ops = &cgroup1_fs_context_ops; put_user_ns(fc->user_ns); fc->user_ns = get_user_ns(ctx->ns->user_ns); fc->global = true; #ifdef CONFIG_CGROUP_FAVOR_DYNMODS ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; #endif return 0; } static void cgroup_kill_sb(struct super_block *sb) { struct kernfs_root *kf_root = kernfs_root_from_sb(sb); struct cgroup_root *root = cgroup_root_from_kf(kf_root); /* * If @root doesn't have any children, start killing it. * This prevents new mounts by disabling percpu_ref_tryget_live(). * * And don't kill the default root. */ if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root && !percpu_ref_is_dying(&root->cgrp.self.refcnt)) { cgroup_bpf_offline(&root->cgrp); percpu_ref_kill(&root->cgrp.self.refcnt); } cgroup_put(&root->cgrp); kernfs_kill_sb(sb); } struct file_system_type cgroup_fs_type = { .name = "cgroup", .init_fs_context = cgroup_init_fs_context, .parameters = cgroup1_fs_parameters, .kill_sb = cgroup_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; static struct file_system_type cgroup2_fs_type = { .name = "cgroup2", .init_fs_context = cgroup_init_fs_context, .parameters = cgroup2_fs_parameters, .kill_sb = cgroup_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; #ifdef CONFIG_CPUSETS static const struct fs_context_operations cpuset_fs_context_ops = { .get_tree = cgroup1_get_tree, .free = cgroup_fs_context_free, }; /* * This is ugly, but preserves the userspace API for existing cpuset * users. If someone tries to mount the "cpuset" filesystem, we * silently switch it to mount "cgroup" instead */ static int cpuset_init_fs_context(struct fs_context *fc) { char *agent = kstrdup("/sbin/cpuset_release_agent", GFP_USER); struct cgroup_fs_context *ctx; int err; err = cgroup_init_fs_context(fc); if (err) { kfree(agent); return err; } fc->ops = &cpuset_fs_context_ops; ctx = cgroup_fc2context(fc); ctx->subsys_mask = 1 << cpuset_cgrp_id; ctx->flags |= CGRP_ROOT_NOPREFIX; ctx->release_agent = agent; get_filesystem(&cgroup_fs_type); put_filesystem(fc->fs_type); fc->fs_type = &cgroup_fs_type; return 0; } static struct file_system_type cpuset_fs_type = { .name = "cpuset", .init_fs_context = cpuset_init_fs_context, .fs_flags = FS_USERNS_MOUNT, }; #endif int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns) { struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); } int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns) { int ret; cgroup_lock(); spin_lock_irq(&css_set_lock); ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns); spin_unlock_irq(&css_set_lock); cgroup_unlock(); return ret; } EXPORT_SYMBOL_GPL(cgroup_path_ns); /** * cgroup_attach_lock - Lock for ->attach() * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem * * cgroup migration sometimes needs to stabilize threadgroups against forks and * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach() * implementations (e.g. cpuset), also need to disable CPU hotplug. * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can * lead to deadlocks. * * Bringing up a CPU may involve creating and destroying tasks which requires * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while * write-locking threadgroup_rwsem, the locking order is reversed and we end up * waiting for an on-going CPU hotplug operation which in turn is waiting for * the threadgroup_rwsem to be released to create new tasks. For more details: * * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu * * Resolve the situation by always acquiring cpus_read_lock() before optionally * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that * CPU hotplug is disabled on entry. */ void cgroup_attach_lock(bool lock_threadgroup) { cpus_read_lock(); if (lock_threadgroup) percpu_down_write(&cgroup_threadgroup_rwsem); } /** * cgroup_attach_unlock - Undo cgroup_attach_lock() * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem */ void cgroup_attach_unlock(bool lock_threadgroup) { if (lock_threadgroup) percpu_up_write(&cgroup_threadgroup_rwsem); cpus_read_unlock(); } /** * cgroup_migrate_add_task - add a migration target task to a migration context * @task: target task * @mgctx: target migration context * * Add @task, which is a migration target, to @mgctx->tset. This function * becomes noop if @task doesn't need to be migrated. @task's css_set * should have been added as a migration source and @task->cg_list will be * moved from the css_set's tasks list to mg_tasks one. */ static void cgroup_migrate_add_task(struct task_struct *task, struct cgroup_mgctx *mgctx) { struct css_set *cset; lockdep_assert_held(&css_set_lock); /* @task either already exited or can't exit until the end */ if (task->flags & PF_EXITING) return; /* cgroup_threadgroup_rwsem protects racing against forks */ WARN_ON_ONCE(list_empty(&task->cg_list)); cset = task_css_set(task); if (!cset->mg_src_cgrp) return; mgctx->tset.nr_tasks++; list_move_tail(&task->cg_list, &cset->mg_tasks); if (list_empty(&cset->mg_node)) list_add_tail(&cset->mg_node, &mgctx->tset.src_csets); if (list_empty(&cset->mg_dst_cset->mg_node)) list_add_tail(&cset->mg_dst_cset->mg_node, &mgctx->tset.dst_csets); } /** * cgroup_taskset_first - reset taskset and return the first task * @tset: taskset of interest * @dst_cssp: output variable for the destination css * * @tset iteration is initialized and the first task is returned. */ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp) { tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node); tset->cur_task = NULL; return cgroup_taskset_next(tset, dst_cssp); } /** * cgroup_taskset_next - iterate to the next task in taskset * @tset: taskset of interest * @dst_cssp: output variable for the destination css * * Return the next task in @tset. Iteration must have been initialized * with cgroup_taskset_first(). */ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp) { struct css_set *cset = tset->cur_cset; struct task_struct *task = tset->cur_task; while (CGROUP_HAS_SUBSYS_CONFIG && &cset->mg_node != tset->csets) { if (!task) task = list_first_entry(&cset->mg_tasks, struct task_struct, cg_list); else task = list_next_entry(task, cg_list); if (&task->cg_list != &cset->mg_tasks) { tset->cur_cset = cset; tset->cur_task = task; /* * This function may be called both before and * after cgroup_migrate_execute(). The two cases * can be distinguished by looking at whether @cset * has its ->mg_dst_cset set. */ if (cset->mg_dst_cset) *dst_cssp = cset->mg_dst_cset->subsys[tset->ssid]; else *dst_cssp = cset->subsys[tset->ssid]; return task; } cset = list_next_entry(cset, mg_node); task = NULL; } return NULL; } /** * cgroup_migrate_execute - migrate a taskset * @mgctx: migration context * * Migrate tasks in @mgctx as setup by migration preparation functions. * This function fails iff one of the ->can_attach callbacks fails and * guarantees that either all or none of the tasks in @mgctx are migrated. * @mgctx is consumed regardless of success. */ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) { struct cgroup_taskset *tset = &mgctx->tset; struct cgroup_subsys *ss; struct task_struct *task, *tmp_task; struct css_set *cset, *tmp_cset; int ssid, failed_ssid, ret; /* check that we can legitimately attach to the cgroup */ if (tset->nr_tasks) { do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ss->can_attach) { tset->ssid = ssid; ret = ss->can_attach(tset); if (ret) { failed_ssid = ssid; goto out_cancel_attach; } } } while_each_subsys_mask(); } /* * Now that we're guaranteed success, proceed to move all tasks to * the new cgroup. There are no failure cases after here, so this * is the commit point. */ spin_lock_irq(&css_set_lock); list_for_each_entry(cset, &tset->src_csets, mg_node) { list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) { struct css_set *from_cset = task_css_set(task); struct css_set *to_cset = cset->mg_dst_cset; get_css_set(to_cset); to_cset->nr_tasks++; css_set_move_task(task, from_cset, to_cset, true); from_cset->nr_tasks--; /* * If the source or destination cgroup is frozen, * the task might require to change its state. */ cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp, to_cset->dfl_cgrp); put_css_set_locked(from_cset); } } spin_unlock_irq(&css_set_lock); /* * Migration is committed, all target tasks are now on dst_csets. * Nothing is sensitive to fork() after this point. Notify * controllers that migration is complete. */ tset->csets = &tset->dst_csets; if (tset->nr_tasks) { do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ss->attach) { tset->ssid = ssid; ss->attach(tset); } } while_each_subsys_mask(); } ret = 0; goto out_release_tset; out_cancel_attach: if (tset->nr_tasks) { do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ssid == failed_ssid) break; if (ss->cancel_attach) { tset->ssid = ssid; ss->cancel_attach(tset); } } while_each_subsys_mask(); } out_release_tset: spin_lock_irq(&css_set_lock); list_splice_init(&tset->dst_csets, &tset->src_csets); list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) { list_splice_tail_init(&cset->mg_tasks, &cset->tasks); list_del_init(&cset->mg_node); } spin_unlock_irq(&css_set_lock); /* * Re-initialize the cgroup_taskset structure in case it is reused * again in another cgroup_migrate_add_task()/cgroup_migrate_execute() * iteration. */ tset->nr_tasks = 0; tset->csets = &tset->src_csets; return ret; } /** * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination * @dst_cgrp: destination cgroup to test * * On the default hierarchy, except for the mixable, (possible) thread root * and threaded cgroups, subtree_control must be zero for migration * destination cgroups with tasks so that child cgroups don't compete * against tasks. */ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp) { /* v1 doesn't have any restriction */ if (!cgroup_on_dfl(dst_cgrp)) return 0; /* verify @dst_cgrp can host resources */ if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp)) return -EOPNOTSUPP; /* * If @dst_cgrp is already or can become a thread root or is * threaded, it doesn't matter. */ if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp)) return 0; /* apply no-internal-process constraint */ if (dst_cgrp->subtree_control) return -EBUSY; return 0; } /** * cgroup_migrate_finish - cleanup after attach * @mgctx: migration context * * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See * those functions for details. */ void cgroup_migrate_finish(struct cgroup_mgctx *mgctx) { struct css_set *cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets, mg_src_preload_node) { cset->mg_src_cgrp = NULL; cset->mg_dst_cgrp = NULL; cset->mg_dst_cset = NULL; list_del_init(&cset->mg_src_preload_node); put_css_set_locked(cset); } list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets, mg_dst_preload_node) { cset->mg_src_cgrp = NULL; cset->mg_dst_cgrp = NULL; cset->mg_dst_cset = NULL; list_del_init(&cset->mg_dst_preload_node); put_css_set_locked(cset); } spin_unlock_irq(&css_set_lock); } /** * cgroup_migrate_add_src - add a migration source css_set * @src_cset: the source css_set to add * @dst_cgrp: the destination cgroup * @mgctx: migration context * * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin * @src_cset and add it to @mgctx->src_csets, which should later be cleaned * up by cgroup_migrate_finish(). * * This function may be called without holding cgroup_threadgroup_rwsem * even if the target is a process. Threads may be created and destroyed * but as long as cgroup_mutex is not dropped, no new css_set can be put * into play and the preloaded css_sets are guaranteed to cover all * migrations. */ void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, struct cgroup_mgctx *mgctx) { struct cgroup *src_cgrp; lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&css_set_lock); /* * If ->dead, @src_set is associated with one or more dead cgroups * and doesn't contain any migratable tasks. Ignore it early so * that the rest of migration path doesn't get confused by it. */ if (src_cset->dead) return; if (!list_empty(&src_cset->mg_src_preload_node)) return; src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); WARN_ON(src_cset->mg_src_cgrp); WARN_ON(src_cset->mg_dst_cgrp); WARN_ON(!list_empty(&src_cset->mg_tasks)); WARN_ON(!list_empty(&src_cset->mg_node)); src_cset->mg_src_cgrp = src_cgrp; src_cset->mg_dst_cgrp = dst_cgrp; get_css_set(src_cset); list_add_tail(&src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets); } /** * cgroup_migrate_prepare_dst - prepare destination css_sets for migration * @mgctx: migration context * * Tasks are about to be moved and all the source css_sets have been * preloaded to @mgctx->preloaded_src_csets. This function looks up and * pins all destination css_sets, links each to its source, and append them * to @mgctx->preloaded_dst_csets. * * This function must be called after cgroup_migrate_add_src() has been * called on each migration source css_set. After migration is performed * using cgroup_migrate(), cgroup_migrate_finish() must be called on * @mgctx. */ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) { struct css_set *src_cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); /* look up the dst cset for each src cset and link it to src */ list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets, mg_src_preload_node) { struct css_set *dst_cset; struct cgroup_subsys *ss; int ssid; dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp); if (!dst_cset) return -ENOMEM; WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); /* * If src cset equals dst, it's noop. Drop the src. * cgroup_migrate() will skip the cset too. Note that we * can't handle src == dst as some nodes are used by both. */ if (src_cset == dst_cset) { src_cset->mg_src_cgrp = NULL; src_cset->mg_dst_cgrp = NULL; list_del_init(&src_cset->mg_src_preload_node); put_css_set(src_cset); put_css_set(dst_cset); continue; } src_cset->mg_dst_cset = dst_cset; if (list_empty(&dst_cset->mg_dst_preload_node)) list_add_tail(&dst_cset->mg_dst_preload_node, &mgctx->preloaded_dst_csets); else put_css_set(dst_cset); for_each_subsys(ss, ssid) if (src_cset->subsys[ssid] != dst_cset->subsys[ssid]) mgctx->ss_mask |= 1 << ssid; } return 0; } /** * cgroup_migrate - migrate a process or task to a cgroup * @leader: the leader of the process or the task to migrate * @threadgroup: whether @leader points to the whole process or a single task * @mgctx: migration context * * Migrate a process or task denoted by @leader. If migrating a process, * the caller must be holding cgroup_threadgroup_rwsem. The caller is also * responsible for invoking cgroup_migrate_add_src() and * cgroup_migrate_prepare_dst() on the targets before invoking this * function and following up with cgroup_migrate_finish(). * * As long as a controller's ->can_attach() doesn't fail, this function is * guaranteed to succeed. This means that, excluding ->can_attach() * failure, when migrating multiple targets, the success or failure can be * decided for all targets by invoking group_migrate_prepare_dst() before * actually starting migrating. */ int cgroup_migrate(struct task_struct *leader, bool threadgroup, struct cgroup_mgctx *mgctx) { struct task_struct *task; /* * The following thread iteration should be inside an RCU critical * section to prevent tasks from being freed while taking the snapshot. * spin_lock_irq() implies RCU critical section here. */ spin_lock_irq(&css_set_lock); task = leader; do { cgroup_migrate_add_task(task, mgctx); if (!threadgroup) break; } while_each_thread(leader, task); spin_unlock_irq(&css_set_lock); return cgroup_migrate_execute(mgctx); } /** * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup * @dst_cgrp: the cgroup to attach to * @leader: the task or the leader of the threadgroup to be attached * @threadgroup: attach the whole threadgroup? * * Call holding cgroup_mutex and cgroup_threadgroup_rwsem. */ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup) { DEFINE_CGROUP_MGCTX(mgctx); struct task_struct *task; int ret = 0; /* look up all src csets */ spin_lock_irq(&css_set_lock); rcu_read_lock(); task = leader; do { cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx); if (!threadgroup) break; } while_each_thread(leader, task); rcu_read_unlock(); spin_unlock_irq(&css_set_lock); /* prepare dst csets and commit */ ret = cgroup_migrate_prepare_dst(&mgctx); if (!ret) ret = cgroup_migrate(leader, threadgroup, &mgctx); cgroup_migrate_finish(&mgctx); if (!ret) TRACE_CGROUP_PATH(attach_task, dst_cgrp, leader, threadgroup); return ret; } struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, bool *threadgroup_locked) { struct task_struct *tsk; pid_t pid; if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return ERR_PTR(-EINVAL); /* * If we migrate a single thread, we don't care about threadgroup * stability. If the thread is `current`, it won't exit(2) under our * hands or change PID through exec(2). We exclude * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write * callers by cgroup_mutex. * Therefore, we can skip the global lock. */ lockdep_assert_held(&cgroup_mutex); *threadgroup_locked = pid || threadgroup; cgroup_attach_lock(*threadgroup_locked); rcu_read_lock(); if (pid) { tsk = find_task_by_vpid(pid); if (!tsk) { tsk = ERR_PTR(-ESRCH); goto out_unlock_threadgroup; } } else { tsk = current; } if (threadgroup) tsk = tsk->group_leader; /* * kthreads may acquire PF_NO_SETAFFINITY during initialization. * If userland migrates such a kthread to a non-root cgroup, it can * become trapped in a cpuset, or RT kthread may be born in a * cgroup with no rt_runtime allocated. Just say no. */ if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) { tsk = ERR_PTR(-EINVAL); goto out_unlock_threadgroup; } get_task_struct(tsk); goto out_unlock_rcu; out_unlock_threadgroup: cgroup_attach_unlock(*threadgroup_locked); *threadgroup_locked = false; out_unlock_rcu: rcu_read_unlock(); return tsk; } void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked) { struct cgroup_subsys *ss; int ssid; /* release reference from cgroup_procs_write_start() */ put_task_struct(task); cgroup_attach_unlock(threadgroup_locked); for_each_subsys(ss, ssid) if (ss->post_attach) ss->post_attach(); } static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) { struct cgroup_subsys *ss; bool printed = false; int ssid; do_each_subsys_mask(ss, ssid, ss_mask) { if (printed) seq_putc(seq, ' '); seq_puts(seq, ss->name); printed = true; } while_each_subsys_mask(); if (printed) seq_putc(seq, '\n'); } /* show controllers which are enabled from the parent */ static int cgroup_controllers_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; cgroup_print_ss_mask(seq, cgroup_control(cgrp)); return 0; } /* show controllers which are enabled for a given cgroup's children */ static int cgroup_subtree_control_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; cgroup_print_ss_mask(seq, cgrp->subtree_control); return 0; } /** * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy * @cgrp: root of the subtree to update csses for * * @cgrp's control masks have changed and its subtree's css associations * need to be updated accordingly. This function looks up all css_sets * which are attached to the subtree, creates the matching updated css_sets * and migrates the tasks to the new ones. */ static int cgroup_update_dfl_csses(struct cgroup *cgrp) { DEFINE_CGROUP_MGCTX(mgctx); struct cgroup_subsys_state *d_css; struct cgroup *dsct; struct css_set *src_cset; bool has_tasks; int ret; lockdep_assert_held(&cgroup_mutex); /* look up all csses currently attached to @cgrp's subtree */ spin_lock_irq(&css_set_lock); cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { struct cgrp_cset_link *link; /* * As cgroup_update_dfl_csses() is only called by * cgroup_apply_control(). The csses associated with the * given cgrp will not be affected by changes made to * its subtree_control file. We can skip them. */ if (dsct == cgrp) continue; list_for_each_entry(link, &dsct->cset_links, cset_link) cgroup_migrate_add_src(link->cset, dsct, &mgctx); } spin_unlock_irq(&css_set_lock); /* * We need to write-lock threadgroup_rwsem while migrating tasks. * However, if there are no source csets for @cgrp, changing its * controllers isn't gonna produce any task migrations and the * write-locking can be skipped safely. */ has_tasks = !list_empty(&mgctx.preloaded_src_csets); cgroup_attach_lock(has_tasks); /* NULL dst indicates self on default hierarchy */ ret = cgroup_migrate_prepare_dst(&mgctx); if (ret) goto out_finish; spin_lock_irq(&css_set_lock); list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_src_preload_node) { struct task_struct *task, *ntask; /* all tasks in src_csets need to be migrated */ list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list) cgroup_migrate_add_task(task, &mgctx); } spin_unlock_irq(&css_set_lock); ret = cgroup_migrate_execute(&mgctx); out_finish: cgroup_migrate_finish(&mgctx); cgroup_attach_unlock(has_tasks); return ret; } /** * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses * @cgrp: root of the target subtree * * Because css offlining is asynchronous, userland may try to re-enable a * controller while the previous css is still around. This function grabs * cgroup_mutex and drains the previous css instances of @cgrp's subtree. */ void cgroup_lock_and_drain_offline(struct cgroup *cgrp) __acquires(&cgroup_mutex) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; struct cgroup_subsys *ss; int ssid; restart: cgroup_lock(); cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); DEFINE_WAIT(wait); if (!css || !percpu_ref_is_dying(&css->refcnt)) continue; cgroup_get_live(dsct); prepare_to_wait(&dsct->offline_waitq, &wait, TASK_UNINTERRUPTIBLE); cgroup_unlock(); schedule(); finish_wait(&dsct->offline_waitq, &wait); cgroup_put(dsct); goto restart; } } } /** * cgroup_save_control - save control masks and dom_cgrp of a subtree * @cgrp: root of the target subtree * * Save ->subtree_control, ->subtree_ss_mask and ->dom_cgrp to the * respective old_ prefixed fields for @cgrp's subtree including @cgrp * itself. */ static void cgroup_save_control(struct cgroup *cgrp) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { dsct->old_subtree_control = dsct->subtree_control; dsct->old_subtree_ss_mask = dsct->subtree_ss_mask; dsct->old_dom_cgrp = dsct->dom_cgrp; } } /** * cgroup_propagate_control - refresh control masks of a subtree * @cgrp: root of the target subtree * * For @cgrp and its subtree, ensure ->subtree_ss_mask matches * ->subtree_control and propagate controller availability through the * subtree so that descendants don't have unavailable controllers enabled. */ static void cgroup_propagate_control(struct cgroup *cgrp) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { dsct->subtree_control &= cgroup_control(dsct); dsct->subtree_ss_mask = cgroup_calc_subtree_ss_mask(dsct->subtree_control, cgroup_ss_mask(dsct)); } } /** * cgroup_restore_control - restore control masks and dom_cgrp of a subtree * @cgrp: root of the target subtree * * Restore ->subtree_control, ->subtree_ss_mask and ->dom_cgrp from the * respective old_ prefixed fields for @cgrp's subtree including @cgrp * itself. */ static void cgroup_restore_control(struct cgroup *cgrp) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { dsct->subtree_control = dsct->old_subtree_control; dsct->subtree_ss_mask = dsct->old_subtree_ss_mask; dsct->dom_cgrp = dsct->old_dom_cgrp; } } static bool css_visible(struct cgroup_subsys_state *css) { struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; if (cgroup_control(cgrp) & (1 << ss->id)) return true; if (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) return false; return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl; } /** * cgroup_apply_control_enable - enable or show csses according to control * @cgrp: root of the target subtree * * Walk @cgrp's subtree and create new csses or make the existing ones * visible. A css is created invisible if it's being implicitly enabled * through dependency. An invisible css is made visible when the userland * explicitly enables it. * * Returns 0 on success, -errno on failure. On failure, csses which have * been processed already aren't cleaned up. The caller is responsible for * cleaning up with cgroup_apply_control_disable(). */ static int cgroup_apply_control_enable(struct cgroup *cgrp) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; struct cgroup_subsys *ss; int ssid, ret; cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); if (!(cgroup_ss_mask(dsct) & (1 << ss->id))) continue; if (!css) { css = css_create(dsct, ss); if (IS_ERR(css)) return PTR_ERR(css); } WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt)); if (css_visible(css)) { ret = css_populate_dir(css); if (ret) return ret; } } } return 0; } /** * cgroup_apply_control_disable - kill or hide csses according to control * @cgrp: root of the target subtree * * Walk @cgrp's subtree and kill and hide csses so that they match * cgroup_ss_mask() and cgroup_visible_mask(). * * A css is hidden when the userland requests it to be disabled while other * subsystems are still depending on it. The css must not actively control * resources and be in the vanilla state if it's made visible again later. * Controllers which may be depended upon should provide ->css_reset() for * this purpose. */ static void cgroup_apply_control_disable(struct cgroup *cgrp) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; struct cgroup_subsys *ss; int ssid; cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); if (!css) continue; WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt)); if (css->parent && !(cgroup_ss_mask(dsct) & (1 << ss->id))) { kill_css(css); } else if (!css_visible(css)) { css_clear_dir(css); if (ss->css_reset) ss->css_reset(css); } } } } /** * cgroup_apply_control - apply control mask updates to the subtree * @cgrp: root of the target subtree * * subsystems can be enabled and disabled in a subtree using the following * steps. * * 1. Call cgroup_save_control() to stash the current state. * 2. Update ->subtree_control masks in the subtree as desired. * 3. Call cgroup_apply_control() to apply the changes. * 4. Optionally perform other related operations. * 5. Call cgroup_finalize_control() to finish up. * * This function implements step 3 and propagates the mask changes * throughout @cgrp's subtree, updates csses accordingly and perform * process migrations. */ static int cgroup_apply_control(struct cgroup *cgrp) { int ret; cgroup_propagate_control(cgrp); ret = cgroup_apply_control_enable(cgrp); if (ret) return ret; /* * At this point, cgroup_e_css_by_mask() results reflect the new csses * making the following cgroup_update_dfl_csses() properly update * css associations of all tasks in the subtree. */ return cgroup_update_dfl_csses(cgrp); } /** * cgroup_finalize_control - finalize control mask update * @cgrp: root of the target subtree * @ret: the result of the update * * Finalize control mask update. See cgroup_apply_control() for more info. */ static void cgroup_finalize_control(struct cgroup *cgrp, int ret) { if (ret) { cgroup_restore_control(cgrp); cgroup_propagate_control(cgrp); } cgroup_apply_control_disable(cgrp); } static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable) { u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask; /* if nothing is getting enabled, nothing to worry about */ if (!enable) return 0; /* can @cgrp host any resources? */ if (!cgroup_is_valid_domain(cgrp->dom_cgrp)) return -EOPNOTSUPP; /* mixables don't care */ if (cgroup_is_mixable(cgrp)) return 0; if (domain_enable) { /* can't enable domain controllers inside a thread subtree */ if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp)) return -EOPNOTSUPP; } else { /* * Threaded controllers can handle internal competitions * and are always allowed inside a (prospective) thread * subtree. */ if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp)) return 0; } /* * Controllers can't be enabled for a cgroup with tasks to avoid * child cgroups competing against tasks. */ if (cgroup_has_tasks(cgrp)) return -EBUSY; return 0; } /* change the enabled child controllers for a cgroup in the default hierarchy */ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { u16 enable = 0, disable = 0; struct cgroup *cgrp, *child; struct cgroup_subsys *ss; char *tok; int ssid, ret; /* * Parse input - space separated list of subsystem names prefixed * with either + or -. */ buf = strstrip(buf); while ((tok = strsep(&buf, " "))) { if (tok[0] == '\0') continue; do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) { if (!cgroup_ssid_enabled(ssid) || strcmp(tok + 1, ss->name)) continue; if (*tok == '+') { enable |= 1 << ssid; disable &= ~(1 << ssid); } else if (*tok == '-') { disable |= 1 << ssid; enable &= ~(1 << ssid); } else { return -EINVAL; } break; } while_each_subsys_mask(); if (ssid == CGROUP_SUBSYS_COUNT) return -EINVAL; } cgrp = cgroup_kn_lock_live(of->kn, true); if (!cgrp) return -ENODEV; for_each_subsys(ss, ssid) { if (enable & (1 << ssid)) { if (cgrp->subtree_control & (1 << ssid)) { enable &= ~(1 << ssid); continue; } if (!(cgroup_control(cgrp) & (1 << ssid))) { ret = -ENOENT; goto out_unlock; } } else if (disable & (1 << ssid)) { if (!(cgrp->subtree_control & (1 << ssid))) { disable &= ~(1 << ssid); continue; } /* a child has it enabled? */ cgroup_for_each_live_child(child, cgrp) { if (child->subtree_control & (1 << ssid)) { ret = -EBUSY; goto out_unlock; } } } } if (!enable && !disable) { ret = 0; goto out_unlock; } ret = cgroup_vet_subtree_control_enable(cgrp, enable); if (ret) goto out_unlock; /* save and update control masks and prepare csses */ cgroup_save_control(cgrp); cgrp->subtree_control |= enable; cgrp->subtree_control &= ~disable; ret = cgroup_apply_control(cgrp); cgroup_finalize_control(cgrp, ret); if (ret) goto out_unlock; kernfs_activate(cgrp->kn); out_unlock: cgroup_kn_unlock(of->kn); return ret ?: nbytes; } /** * cgroup_enable_threaded - make @cgrp threaded * @cgrp: the target cgroup * * Called when "threaded" is written to the cgroup.type interface file and * tries to make @cgrp threaded and join the parent's resource domain. * This function is never called on the root cgroup as cgroup.type doesn't * exist on it. */ static int cgroup_enable_threaded(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); struct cgroup *dom_cgrp = parent->dom_cgrp; struct cgroup *dsct; struct cgroup_subsys_state *d_css; int ret; lockdep_assert_held(&cgroup_mutex); /* noop if already threaded */ if (cgroup_is_threaded(cgrp)) return 0; /* * If @cgroup is populated or has domain controllers enabled, it * can't be switched. While the below cgroup_can_be_thread_root() * test can catch the same conditions, that's only when @parent is * not mixable, so let's check it explicitly. */ if (cgroup_is_populated(cgrp) || cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask) return -EOPNOTSUPP; /* we're joining the parent's domain, ensure its validity */ if (!cgroup_is_valid_domain(dom_cgrp) || !cgroup_can_be_thread_root(dom_cgrp)) return -EOPNOTSUPP; /* * The following shouldn't cause actual migrations and should * always succeed. */ cgroup_save_control(cgrp); cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) if (dsct == cgrp || cgroup_is_threaded(dsct)) dsct->dom_cgrp = dom_cgrp; ret = cgroup_apply_control(cgrp); if (!ret) parent->nr_threaded_children++; cgroup_finalize_control(cgrp, ret); return ret; } static int cgroup_type_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; if (cgroup_is_threaded(cgrp)) seq_puts(seq, "threaded\n"); else if (!cgroup_is_valid_domain(cgrp)) seq_puts(seq, "domain invalid\n"); else if (cgroup_is_thread_root(cgrp)) seq_puts(seq, "domain threaded\n"); else seq_puts(seq, "domain\n"); return 0; } static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; int ret; /* only switching to threaded mode is supported */ if (strcmp(strstrip(buf), "threaded")) return -EINVAL; /* drain dying csses before we re-apply (threaded) subtree control */ cgrp = cgroup_kn_lock_live(of->kn, true); if (!cgrp) return -ENOENT; /* threaded can only be enabled */ ret = cgroup_enable_threaded(cgrp); cgroup_kn_unlock(of->kn); return ret ?: nbytes; } static int cgroup_max_descendants_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; int descendants = READ_ONCE(cgrp->max_descendants); if (descendants == INT_MAX) seq_puts(seq, "max\n"); else seq_printf(seq, "%d\n", descendants); return 0; } static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; int descendants; ssize_t ret; buf = strstrip(buf); if (!strcmp(buf, "max")) { descendants = INT_MAX; } else { ret = kstrtoint(buf, 0, &descendants); if (ret) return ret; } if (descendants < 0) return -ERANGE; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENOENT; cgrp->max_descendants = descendants; cgroup_kn_unlock(of->kn); return nbytes; } static int cgroup_max_depth_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; int depth = READ_ONCE(cgrp->max_depth); if (depth == INT_MAX) seq_puts(seq, "max\n"); else seq_printf(seq, "%d\n", depth); return 0; } static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; ssize_t ret; int depth; buf = strstrip(buf); if (!strcmp(buf, "max")) { depth = INT_MAX; } else { ret = kstrtoint(buf, 0, &depth); if (ret) return ret; } if (depth < 0) return -ERANGE; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENOENT; cgrp->max_depth = depth; cgroup_kn_unlock(of->kn); return nbytes; } static int cgroup_events_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp)); seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags)); return 0; } static int cgroup_stat_show(struct seq_file *seq, void *v) { struct cgroup *cgroup = seq_css(seq)->cgroup; seq_printf(seq, "nr_descendants %d\n", cgroup->nr_descendants); seq_printf(seq, "nr_dying_descendants %d\n", cgroup->nr_dying_descendants); return 0; } #ifdef CONFIG_CGROUP_SCHED /** * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest * * Find and get @cgrp's css associated with @ss. If the css doesn't exist * or is offline, %NULL is returned. */ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; rcu_read_lock(); css = cgroup_css(cgrp, ss); if (css && !css_tryget_online(css)) css = NULL; rcu_read_unlock(); return css; } static int cgroup_extra_stat_show(struct seq_file *seq, int ssid) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct cgroup_subsys *ss = cgroup_subsys[ssid]; struct cgroup_subsys_state *css; int ret; if (!ss->css_extra_stat_show) return 0; css = cgroup_tryget_css(cgrp, ss); if (!css) return 0; ret = ss->css_extra_stat_show(seq, css); css_put(css); return ret; } static int cgroup_local_stat_show(struct seq_file *seq, struct cgroup *cgrp, int ssid) { struct cgroup_subsys *ss = cgroup_subsys[ssid]; struct cgroup_subsys_state *css; int ret; if (!ss->css_local_stat_show) return 0; css = cgroup_tryget_css(cgrp, ss); if (!css) return 0; ret = ss->css_local_stat_show(seq, css); css_put(css); return ret; } #endif static int cpu_stat_show(struct seq_file *seq, void *v) { int ret = 0; cgroup_base_stat_cputime_show(seq); #ifdef CONFIG_CGROUP_SCHED ret = cgroup_extra_stat_show(seq, cpu_cgrp_id); #endif return ret; } static int cpu_local_stat_show(struct seq_file *seq, void *v) { struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup; int ret = 0; #ifdef CONFIG_CGROUP_SCHED ret = cgroup_local_stat_show(seq, cgrp, cpu_cgrp_id); #endif return ret; } #ifdef CONFIG_PSI static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_IO); } static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_MEM); } static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_CPU); } static ssize_t pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, enum psi_res res) { struct cgroup_file_ctx *ctx = of->priv; struct psi_trigger *new; struct cgroup *cgrp; struct psi_group *psi; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; cgroup_get(cgrp); cgroup_kn_unlock(of->kn); /* Allow only one trigger per file descriptor */ if (ctx->psi.trigger) { cgroup_put(cgrp); return -EBUSY; } psi = cgroup_psi(cgrp); new = psi_trigger_create(psi, buf, res, of->file, of); if (IS_ERR(new)) { cgroup_put(cgrp); return PTR_ERR(new); } smp_store_release(&ctx->psi.trigger, new); cgroup_put(cgrp); return nbytes; } static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return pressure_write(of, buf, nbytes, PSI_IO); } static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return pressure_write(of, buf, nbytes, PSI_MEM); } static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return pressure_write(of, buf, nbytes, PSI_CPU); } #ifdef CONFIG_IRQ_TIME_ACCOUNTING static int cgroup_irq_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_IRQ); } static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return pressure_write(of, buf, nbytes, PSI_IRQ); } #endif static int cgroup_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; struct psi_group *psi = cgroup_psi(cgrp); seq_printf(seq, "%d\n", psi->enabled); return 0; } static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { ssize_t ret; int enable; struct cgroup *cgrp; struct psi_group *psi; ret = kstrtoint(strstrip(buf), 0, &enable); if (ret) return ret; if (enable < 0 || enable > 1) return -ERANGE; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENOENT; psi = cgroup_psi(cgrp); if (psi->enabled != enable) { int i; /* show or hide {cpu,memory,io,irq}.pressure files */ for (i = 0; i < NR_PSI_RESOURCES; i++) cgroup_file_show(&cgrp->psi_files[i], enable); psi->enabled = enable; if (enable) psi_cgroup_restart(psi); } cgroup_kn_unlock(of->kn); return nbytes; } static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, poll_table *pt) { struct cgroup_file_ctx *ctx = of->priv; return psi_trigger_poll(&ctx->psi.trigger, of->file, pt); } static int cgroup_pressure_open(struct kernfs_open_file *of) { if (of->file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE)) return -EPERM; return 0; } static void cgroup_pressure_release(struct kernfs_open_file *of) { struct cgroup_file_ctx *ctx = of->priv; psi_trigger_destroy(ctx->psi.trigger); } bool cgroup_psi_enabled(void) { if (static_branch_likely(&psi_disabled)) return false; return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0; } #else /* CONFIG_PSI */ bool cgroup_psi_enabled(void) { return false; } #endif /* CONFIG_PSI */ static int cgroup_freeze_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; seq_printf(seq, "%d\n", cgrp->freezer.freeze); return 0; } static ssize_t cgroup_freeze_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; ssize_t ret; int freeze; ret = kstrtoint(strstrip(buf), 0, &freeze); if (ret) return ret; if (freeze < 0 || freeze > 1) return -ERANGE; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENOENT; cgroup_freeze(cgrp, freeze); cgroup_kn_unlock(of->kn); return nbytes; } static void __cgroup_kill(struct cgroup *cgrp) { struct css_task_iter it; struct task_struct *task; lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); set_bit(CGRP_KILL, &cgrp->flags); spin_unlock_irq(&css_set_lock); css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it); while ((task = css_task_iter_next(&it))) { /* Ignore kernel threads here. */ if (task->flags & PF_KTHREAD) continue; /* Skip tasks that are already dying. */ if (__fatal_signal_pending(task)) continue; send_sig(SIGKILL, task, 0); } css_task_iter_end(&it); spin_lock_irq(&css_set_lock); clear_bit(CGRP_KILL, &cgrp->flags); spin_unlock_irq(&css_set_lock); } static void cgroup_kill(struct cgroup *cgrp) { struct cgroup_subsys_state *css; struct cgroup *dsct; lockdep_assert_held(&cgroup_mutex); cgroup_for_each_live_descendant_pre(dsct, css, cgrp) __cgroup_kill(dsct); } static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { ssize_t ret = 0; int kill; struct cgroup *cgrp; ret = kstrtoint(strstrip(buf), 0, &kill); if (ret) return ret; if (kill != 1) return -ERANGE; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENOENT; /* * Killing is a process directed operation, i.e. the whole thread-group * is taken down so act like we do for cgroup.procs and only make this * writable in non-threaded cgroups. */ if (cgroup_is_threaded(cgrp)) ret = -EOPNOTSUPP; else cgroup_kill(cgrp); cgroup_kn_unlock(of->kn); return ret ?: nbytes; } static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of_cft(of); struct cgroup_file_ctx *ctx; int ret; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->ns = current->nsproxy->cgroup_ns; get_cgroup_ns(ctx->ns); of->priv = ctx; if (!cft->open) return 0; ret = cft->open(of); if (ret) { put_cgroup_ns(ctx->ns); kfree(ctx); } return ret; } static void cgroup_file_release(struct kernfs_open_file *of) { struct cftype *cft = of_cft(of); struct cgroup_file_ctx *ctx = of->priv; if (cft->release) cft->release(of); put_cgroup_ns(ctx->ns); kfree(ctx); } static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup_file_ctx *ctx = of->priv; struct cgroup *cgrp = of->kn->parent->priv; struct cftype *cft = of_cft(of); struct cgroup_subsys_state *css; int ret; if (!nbytes) return 0; /* * If namespaces are delegation boundaries, disallow writes to * files in an non-init namespace root from inside the namespace * except for the files explicitly marked delegatable - * cgroup.procs and cgroup.subtree_control. */ if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) && !(cft->flags & CFTYPE_NS_DELEGATABLE) && ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp) return -EPERM; if (cft->write) return cft->write(of, buf, nbytes, off); /* * kernfs guarantees that a file isn't deleted with operations in * flight, which means that the matching css is and stays alive and * doesn't need to be pinned. The RCU locking is not necessary * either. It's just for the convenience of using cgroup_css(). */ rcu_read_lock(); css = cgroup_css(cgrp, cft->ss); rcu_read_unlock(); if (cft->write_u64) { unsigned long long v; ret = kstrtoull(buf, 0, &v); if (!ret) ret = cft->write_u64(css, cft, v); } else if (cft->write_s64) { long long v; ret = kstrtoll(buf, 0, &v); if (!ret) ret = cft->write_s64(css, cft, v); } else { ret = -EINVAL; } return ret ?: nbytes; } static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt) { struct cftype *cft = of_cft(of); if (cft->poll) return cft->poll(of, pt); return kernfs_generic_poll(of, pt); } static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) { return seq_cft(seq)->seq_start(seq, ppos); } static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) { return seq_cft(seq)->seq_next(seq, v, ppos); } static void cgroup_seqfile_stop(struct seq_file *seq, void *v) { if (seq_cft(seq)->seq_stop) seq_cft(seq)->seq_stop(seq, v); } static int cgroup_seqfile_show(struct seq_file *m, void *arg) { struct cftype *cft = seq_cft(m); struct cgroup_subsys_state *css = seq_css(m); if (cft->seq_show) return cft->seq_show(m, arg); if (cft->read_u64) seq_printf(m, "%llu\n", cft->read_u64(css, cft)); else if (cft->read_s64) seq_printf(m, "%lld\n", cft->read_s64(css, cft)); else return -EINVAL; return 0; } static struct kernfs_ops cgroup_kf_single_ops = { .atomic_write_len = PAGE_SIZE, .open = cgroup_file_open, .release = cgroup_file_release, .write = cgroup_file_write, .poll = cgroup_file_poll, .seq_show = cgroup_seqfile_show, }; static struct kernfs_ops cgroup_kf_ops = { .atomic_write_len = PAGE_SIZE, .open = cgroup_file_open, .release = cgroup_file_release, .write = cgroup_file_write, .poll = cgroup_file_poll, .seq_start = cgroup_seqfile_start, .seq_next = cgroup_seqfile_next, .seq_stop = cgroup_seqfile_stop, .seq_show = cgroup_seqfile_show, }; /* set uid and gid of cgroup dirs and files to that of the creator */ static int cgroup_kn_set_ugid(struct kernfs_node *kn) { struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, .ia_uid = current_fsuid(), .ia_gid = current_fsgid(), }; if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) return 0; return kernfs_setattr(kn, &iattr); } static void cgroup_file_notify_timer(struct timer_list *timer) { cgroup_file_notify(container_of(timer, struct cgroup_file, notify_timer)); } static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype *cft) { char name[CGROUP_FILE_NAME_MAX]; struct kernfs_node *kn; struct lock_class_key *key = NULL; int ret; #ifdef CONFIG_DEBUG_LOCK_ALLOC key = &cft->lockdep_key; #endif kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), cgroup_file_mode(cft), GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, cft->kf_ops, cft, NULL, key); if (IS_ERR(kn)) return PTR_ERR(kn); ret = cgroup_kn_set_ugid(kn); if (ret) { kernfs_remove(kn); return ret; } if (cft->file_offset) { struct cgroup_file *cfile = (void *)css + cft->file_offset; timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0); spin_lock_irq(&cgroup_file_kn_lock); cfile->kn = kn; spin_unlock_irq(&cgroup_file_kn_lock); } return 0; } /** * cgroup_addrm_files - add or remove files to a cgroup directory * @css: the target css * @cgrp: the target cgroup (usually css->cgroup) * @cfts: array of cftypes to be added * @is_add: whether to add or remove * * Depending on @is_add, add or remove files defined by @cfts on @cgrp. * For removals, this function never fails. */ static int cgroup_addrm_files(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype cfts[], bool is_add) { struct cftype *cft, *cft_end = NULL; int ret = 0; lockdep_assert_held(&cgroup_mutex); restart: for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? */ if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) continue; if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp)) continue; if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp)) continue; if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp)) continue; if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug) continue; if (is_add) { ret = cgroup_add_file(css, cgrp, cft); if (ret) { pr_warn("%s: failed to add %s, err=%d\n", __func__, cft->name, ret); cft_end = cft; is_add = false; goto restart; } } else { cgroup_rm_file(cgrp, cft); } } return ret; } static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) { struct cgroup_subsys *ss = cfts[0].ss; struct cgroup *root = &ss->root->cgrp; struct cgroup_subsys_state *css; int ret = 0; lockdep_assert_held(&cgroup_mutex); /* add/rm files for all cgroups created before */ css_for_each_descendant_pre(css, cgroup_css(root, ss)) { struct cgroup *cgrp = css->cgroup; if (!(css->flags & CSS_VISIBLE)) continue; ret = cgroup_addrm_files(css, cgrp, cfts, is_add); if (ret) break; } if (is_add && !ret) kernfs_activate(root->kn); return ret; } static void cgroup_exit_cftypes(struct cftype *cfts) { struct cftype *cft; for (cft = cfts; cft->name[0] != '\0'; cft++) { /* free copy for custom atomic_write_len, see init_cftypes() */ if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) kfree(cft->kf_ops); cft->kf_ops = NULL; cft->ss = NULL; /* revert flags set by cgroup core while adding @cfts */ cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL | __CFTYPE_ADDED); } } static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype *cft; int ret = 0; for (cft = cfts; cft->name[0] != '\0'; cft++) { struct kernfs_ops *kf_ops; WARN_ON(cft->ss || cft->kf_ops); if (cft->flags & __CFTYPE_ADDED) { ret = -EBUSY; break; } if (cft->seq_start) kf_ops = &cgroup_kf_ops; else kf_ops = &cgroup_kf_single_ops; /* * Ugh... if @cft wants a custom max_write_len, we need to * make a copy of kf_ops to set its atomic_write_len. */ if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) { kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL); if (!kf_ops) { ret = -ENOMEM; break; } kf_ops->atomic_write_len = cft->max_write_len; } cft->kf_ops = kf_ops; cft->ss = ss; cft->flags |= __CFTYPE_ADDED; } if (ret) cgroup_exit_cftypes(cfts); return ret; } static void cgroup_rm_cftypes_locked(struct cftype *cfts) { lockdep_assert_held(&cgroup_mutex); list_del(&cfts->node); cgroup_apply_cftypes(cfts, false); cgroup_exit_cftypes(cfts); } /** * cgroup_rm_cftypes - remove an array of cftypes from a subsystem * @cfts: zero-length name terminated array of cftypes * * Unregister @cfts. Files described by @cfts are removed from all * existing cgroups and all future cgroups won't have them either. This * function can be called anytime whether @cfts' subsys is attached or not. * * Returns 0 on successful unregistration, -ENOENT if @cfts is not * registered. */ int cgroup_rm_cftypes(struct cftype *cfts) { if (!cfts || cfts[0].name[0] == '\0') return 0; if (!(cfts[0].flags & __CFTYPE_ADDED)) return -ENOENT; cgroup_lock(); cgroup_rm_cftypes_locked(cfts); cgroup_unlock(); return 0; } /** * cgroup_add_cftypes - add an array of cftypes to a subsystem * @ss: target cgroup subsystem * @cfts: zero-length name terminated array of cftypes * * Register @cfts to @ss. Files described by @cfts are created for all * existing cgroups to which @ss is attached and all future cgroups will * have them too. This function can be called anytime whether @ss is * attached or not. * * Returns 0 on successful registration, -errno on failure. Note that this * function currently returns 0 as long as @cfts registration is successful * even if some file creation attempts on existing cgroups fail. */ static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { int ret; if (!cgroup_ssid_enabled(ss->id)) return 0; if (!cfts || cfts[0].name[0] == '\0') return 0; ret = cgroup_init_cftypes(ss, cfts); if (ret) return ret; cgroup_lock(); list_add_tail(&cfts->node, &ss->cfts); ret = cgroup_apply_cftypes(cfts, true); if (ret) cgroup_rm_cftypes_locked(cfts); cgroup_unlock(); return ret; } /** * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy * @ss: target cgroup subsystem * @cfts: zero-length name terminated array of cftypes * * Similar to cgroup_add_cftypes() but the added files are only used for * the default hierarchy. */ int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype *cft; for (cft = cfts; cft && cft->name[0] != '\0'; cft++) cft->flags |= __CFTYPE_ONLY_ON_DFL; return cgroup_add_cftypes(ss, cfts); } /** * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies * @ss: target cgroup subsystem * @cfts: zero-length name terminated array of cftypes * * Similar to cgroup_add_cftypes() but the added files are only used for * the legacy hierarchies. */ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype *cft; for (cft = cfts; cft && cft->name[0] != '\0'; cft++) cft->flags |= __CFTYPE_NOT_ON_DFL; return cgroup_add_cftypes(ss, cfts); } /** * cgroup_file_notify - generate a file modified event for a cgroup_file * @cfile: target cgroup_file * * @cfile must have been obtained by setting cftype->file_offset. */ void cgroup_file_notify(struct cgroup_file *cfile) { unsigned long flags; spin_lock_irqsave(&cgroup_file_kn_lock, flags); if (cfile->kn) { unsigned long last = cfile->notified_at; unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV; if (time_in_range(jiffies, last, next)) { timer_reduce(&cfile->notify_timer, next); } else { kernfs_notify(cfile->kn); cfile->notified_at = jiffies; } } spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); } /** * cgroup_file_show - show or hide a hidden cgroup file * @cfile: target cgroup_file obtained by setting cftype->file_offset * @show: whether to show or hide */ void cgroup_file_show(struct cgroup_file *cfile, bool show) { struct kernfs_node *kn; spin_lock_irq(&cgroup_file_kn_lock); kn = cfile->kn; kernfs_get(kn); spin_unlock_irq(&cgroup_file_kn_lock); if (kn) kernfs_show(kn, show); kernfs_put(kn); } /** * css_next_child - find the next child of a given css * @pos: the current position (%NULL to initiate traversal) * @parent: css whose children to walk * * This function returns the next child of @parent and should be called * under either cgroup_mutex or RCU read lock. The only requirement is * that @parent and @pos are accessible. The next sibling is guaranteed to * be returned regardless of their states. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. */ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *parent) { struct cgroup_subsys_state *next; cgroup_assert_mutex_or_rcu_locked(); /* * @pos could already have been unlinked from the sibling list. * Once a cgroup is removed, its ->sibling.next is no longer * updated when its next sibling changes. CSS_RELEASED is set when * @pos is taken off list, at which time its next pointer is valid, * and, as releases are serialized, the one pointed to by the next * pointer is guaranteed to not have started release yet. This * implies that if we observe !CSS_RELEASED on @pos in this RCU * critical section, the one pointed to by its next pointer is * guaranteed to not have finished its RCU grace period even if we * have dropped rcu_read_lock() in-between iterations. * * If @pos has CSS_RELEASED set, its next pointer can't be * dereferenced; however, as each css is given a monotonically * increasing unique serial number and always appended to the * sibling list, the next one can be found by walking the parent's * children until the first css with higher serial number than * @pos's. While this path can be slower, it happens iff iteration * races against release and the race window is very small. */ if (!pos) { next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling); } else if (likely(!(pos->flags & CSS_RELEASED))) { next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling); } else { list_for_each_entry_rcu(next, &parent->children, sibling, lockdep_is_held(&cgroup_mutex)) if (next->serial_nr > pos->serial_nr) break; } /* * @next, if not pointing to the head, can be dereferenced and is * the next sibling. */ if (&next->sibling != &parent->children) return next; return NULL; } /** * css_next_descendant_pre - find the next descendant for pre-order walk * @pos: the current position (%NULL to initiate traversal) * @root: css whose descendants to walk * * To be used by css_for_each_descendant_pre(). Find the next descendant * to visit for pre-order traversal of @root's descendants. @root is * included in the iteration and the first node to be visited. * * While this function requires cgroup_mutex or RCU read locking, it * doesn't require the whole traversal to be contained in a single critical * section. This function will return the correct next descendant as long * as both @pos and @root are accessible and @pos is a descendant of @root. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. */ struct cgroup_subsys_state * css_next_descendant_pre(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *root) { struct cgroup_subsys_state *next; cgroup_assert_mutex_or_rcu_locked(); /* if first iteration, visit @root */ if (!pos) return root; /* visit the first child if exists */ next = css_next_child(NULL, pos); if (next) return next; /* no child, visit my or the closest ancestor's next sibling */ while (pos != root) { next = css_next_child(pos, pos->parent); if (next) return next; pos = pos->parent; } return NULL; } EXPORT_SYMBOL_GPL(css_next_descendant_pre); /** * css_rightmost_descendant - return the rightmost descendant of a css * @pos: css of interest * * Return the rightmost descendant of @pos. If there's no descendant, @pos * is returned. This can be used during pre-order traversal to skip * subtree of @pos. * * While this function requires cgroup_mutex or RCU read locking, it * doesn't require the whole traversal to be contained in a single critical * section. This function will return the correct rightmost descendant as * long as @pos is accessible. */ struct cgroup_subsys_state * css_rightmost_descendant(struct cgroup_subsys_state *pos) { struct cgroup_subsys_state *last, *tmp; cgroup_assert_mutex_or_rcu_locked(); do { last = pos; /* ->prev isn't RCU safe, walk ->next till the end */ pos = NULL; css_for_each_child(tmp, last) pos = tmp; } while (pos); return last; } static struct cgroup_subsys_state * css_leftmost_descendant(struct cgroup_subsys_state *pos) { struct cgroup_subsys_state *last; do { last = pos; pos = css_next_child(NULL, pos); } while (pos); return last; } /** * css_next_descendant_post - find the next descendant for post-order walk * @pos: the current position (%NULL to initiate traversal) * @root: css whose descendants to walk * * To be used by css_for_each_descendant_post(). Find the next descendant * to visit for post-order traversal of @root's descendants. @root is * included in the iteration and the last node to be visited. * * While this function requires cgroup_mutex or RCU read locking, it * doesn't require the whole traversal to be contained in a single critical * section. This function will return the correct next descendant as long * as both @pos and @cgroup are accessible and @pos is a descendant of * @cgroup. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. */ struct cgroup_subsys_state * css_next_descendant_post(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *root) { struct cgroup_subsys_state *next; cgroup_assert_mutex_or_rcu_locked(); /* if first iteration, visit leftmost descendant which may be @root */ if (!pos) return css_leftmost_descendant(root); /* if we visited @root, we're done */ if (pos == root) return NULL; /* if there's an unvisited sibling, visit its leftmost descendant */ next = css_next_child(pos, pos->parent); if (next) return css_leftmost_descendant(next); /* no sibling left, visit parent */ return pos->parent; } /** * css_has_online_children - does a css have online children * @css: the target css * * Returns %true if @css has any online children; otherwise, %false. This * function can be called from any context but the caller is responsible * for synchronizing against on/offlining as necessary. */ bool css_has_online_children(struct cgroup_subsys_state *css) { struct cgroup_subsys_state *child; bool ret = false; rcu_read_lock(); css_for_each_child(child, css) { if (child->flags & CSS_ONLINE) { ret = true; break; } } rcu_read_unlock(); return ret; } static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it) { struct list_head *l; struct cgrp_cset_link *link; struct css_set *cset; lockdep_assert_held(&css_set_lock); /* find the next threaded cset */ if (it->tcset_pos) { l = it->tcset_pos->next; if (l != it->tcset_head) { it->tcset_pos = l; return container_of(l, struct css_set, threaded_csets_node); } it->tcset_pos = NULL; } /* find the next cset */ l = it->cset_pos; l = l->next; if (l == it->cset_head) { it->cset_pos = NULL; return NULL; } if (it->ss) { cset = container_of(l, struct css_set, e_cset_node[it->ss->id]); } else { link = list_entry(l, struct cgrp_cset_link, cset_link); cset = link->cset; } it->cset_pos = l; /* initialize threaded css_set walking */ if (it->flags & CSS_TASK_ITER_THREADED) { if (it->cur_dcset) put_css_set_locked(it->cur_dcset); it->cur_dcset = cset; get_css_set(cset); it->tcset_head = &cset->threaded_csets; it->tcset_pos = &cset->threaded_csets; } return cset; } /** * css_task_iter_advance_css_set - advance a task iterator to the next css_set * @it: the iterator to advance * * Advance @it to the next css_set to walk. */ static void css_task_iter_advance_css_set(struct css_task_iter *it) { struct css_set *cset; lockdep_assert_held(&css_set_lock); /* Advance to the next non-empty css_set and find first non-empty tasks list*/ while ((cset = css_task_iter_next_css_set(it))) { if (!list_empty(&cset->tasks)) { it->cur_tasks_head = &cset->tasks; break; } else if (!list_empty(&cset->mg_tasks)) { it->cur_tasks_head = &cset->mg_tasks; break; } else if (!list_empty(&cset->dying_tasks)) { it->cur_tasks_head = &cset->dying_tasks; break; } } if (!cset) { it->task_pos = NULL; return; } it->task_pos = it->cur_tasks_head->next; /* * We don't keep css_sets locked across iteration steps and thus * need to take steps to ensure that iteration can be resumed after * the lock is re-acquired. Iteration is performed at two levels - * css_sets and tasks in them. * * Once created, a css_set never leaves its cgroup lists, so a * pinned css_set is guaranteed to stay put and we can resume * iteration afterwards. * * Tasks may leave @cset across iteration steps. This is resolved * by registering each iterator with the css_set currently being * walked and making css_set_move_task() advance iterators whose * next task is leaving. */ if (it->cur_cset) { list_del(&it->iters_node); put_css_set_locked(it->cur_cset); } get_css_set(cset); it->cur_cset = cset; list_add(&it->iters_node, &cset->task_iters); } static void css_task_iter_skip(struct css_task_iter *it, struct task_struct *task) { lockdep_assert_held(&css_set_lock); if (it->task_pos == &task->cg_list) { it->task_pos = it->task_pos->next; it->flags |= CSS_TASK_ITER_SKIPPED; } } static void css_task_iter_advance(struct css_task_iter *it) { struct task_struct *task; lockdep_assert_held(&css_set_lock); repeat: if (it->task_pos) { /* * Advance iterator to find next entry. We go through cset * tasks, mg_tasks and dying_tasks, when consumed we move onto * the next cset. */ if (it->flags & CSS_TASK_ITER_SKIPPED) it->flags &= ~CSS_TASK_ITER_SKIPPED; else it->task_pos = it->task_pos->next; if (it->task_pos == &it->cur_cset->tasks) { it->cur_tasks_head = &it->cur_cset->mg_tasks; it->task_pos = it->cur_tasks_head->next; } if (it->task_pos == &it->cur_cset->mg_tasks) { it->cur_tasks_head = &it->cur_cset->dying_tasks; it->task_pos = it->cur_tasks_head->next; } if (it->task_pos == &it->cur_cset->dying_tasks) css_task_iter_advance_css_set(it); } else { /* called from start, proceed to the first cset */ css_task_iter_advance_css_set(it); } if (!it->task_pos) return; task = list_entry(it->task_pos, struct task_struct, cg_list); if (it->flags & CSS_TASK_ITER_PROCS) { /* if PROCS, skip over tasks which aren't group leaders */ if (!thread_group_leader(task)) goto repeat; /* and dying leaders w/o live member threads */ if (it->cur_tasks_head == &it->cur_cset->dying_tasks && !atomic_read(&task->signal->live)) goto repeat; } else { /* skip all dying ones */ if (it->cur_tasks_head == &it->cur_cset->dying_tasks) goto repeat; } } /** * css_task_iter_start - initiate task iteration * @css: the css to walk tasks of * @flags: CSS_TASK_ITER_* flags * @it: the task iterator to use * * Initiate iteration through the tasks of @css. The caller can call * css_task_iter_next() to walk through the tasks until the function * returns NULL. On completion of iteration, css_task_iter_end() must be * called. */ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it) { memset(it, 0, sizeof(*it)); spin_lock_irq(&css_set_lock); it->ss = css->ss; it->flags = flags; if (CGROUP_HAS_SUBSYS_CONFIG && it->ss) it->cset_pos = &css->cgroup->e_csets[css->ss->id]; else it->cset_pos = &css->cgroup->cset_links; it->cset_head = it->cset_pos; css_task_iter_advance(it); spin_unlock_irq(&css_set_lock); } /** * css_task_iter_next - return the next task for the iterator * @it: the task iterator being iterated * * The "next" function for task iteration. @it should have been * initialized via css_task_iter_start(). Returns NULL when the iteration * reaches the end. */ struct task_struct *css_task_iter_next(struct css_task_iter *it) { if (it->cur_task) { put_task_struct(it->cur_task); it->cur_task = NULL; } spin_lock_irq(&css_set_lock); /* @it may be half-advanced by skips, finish advancing */ if (it->flags & CSS_TASK_ITER_SKIPPED) css_task_iter_advance(it); if (it->task_pos) { it->cur_task = list_entry(it->task_pos, struct task_struct, cg_list); get_task_struct(it->cur_task); css_task_iter_advance(it); } spin_unlock_irq(&css_set_lock); return it->cur_task; } /** * css_task_iter_end - finish task iteration * @it: the task iterator to finish * * Finish task iteration started by css_task_iter_start(). */ void css_task_iter_end(struct css_task_iter *it) { if (it->cur_cset) { spin_lock_irq(&css_set_lock); list_del(&it->iters_node); put_css_set_locked(it->cur_cset); spin_unlock_irq(&css_set_lock); } if (it->cur_dcset) put_css_set(it->cur_dcset); if (it->cur_task) put_task_struct(it->cur_task); } static void cgroup_procs_release(struct kernfs_open_file *of) { struct cgroup_file_ctx *ctx = of->priv; if (ctx->procs.started) css_task_iter_end(&ctx->procs.iter); } static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) { struct kernfs_open_file *of = s->private; struct cgroup_file_ctx *ctx = of->priv; if (pos) (*pos)++; return css_task_iter_next(&ctx->procs.iter); } static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, unsigned int iter_flags) { struct kernfs_open_file *of = s->private; struct cgroup *cgrp = seq_css(s)->cgroup; struct cgroup_file_ctx *ctx = of->priv; struct css_task_iter *it = &ctx->procs.iter; /* * When a seq_file is seeked, it's always traversed sequentially * from position 0, so we can simply keep iterating on !0 *pos. */ if (!ctx->procs.started) { if (WARN_ON_ONCE((*pos))) return ERR_PTR(-EINVAL); css_task_iter_start(&cgrp->self, iter_flags, it); ctx->procs.started = true; } else if (!(*pos)) { css_task_iter_end(it); css_task_iter_start(&cgrp->self, iter_flags, it); } else return it->cur_task; return cgroup_procs_next(s, NULL, NULL); } static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) { struct cgroup *cgrp = seq_css(s)->cgroup; /* * All processes of a threaded subtree belong to the domain cgroup * of the subtree. Only threads can be distributed across the * subtree. Reject reads on cgroup.procs in the subtree proper. * They're always empty anyway. */ if (cgroup_is_threaded(cgrp)) return ERR_PTR(-EOPNOTSUPP); return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED); } static int cgroup_procs_show(struct seq_file *s, void *v) { seq_printf(s, "%d\n", task_pid_vnr(v)); return 0; } static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb) { int ret; struct inode *inode; lockdep_assert_held(&cgroup_mutex); inode = kernfs_get_inode(sb, cgrp->procs_file.kn); if (!inode) return -ENOMEM; ret = inode_permission(&nop_mnt_idmap, inode, MAY_WRITE); iput(inode); return ret; } static int cgroup_procs_write_permission(struct cgroup *src_cgrp, struct cgroup *dst_cgrp, struct super_block *sb, struct cgroup_namespace *ns) { struct cgroup *com_cgrp = src_cgrp; int ret; lockdep_assert_held(&cgroup_mutex); /* find the common ancestor */ while (!cgroup_is_descendant(dst_cgrp, com_cgrp)) com_cgrp = cgroup_parent(com_cgrp); /* %current should be authorized to migrate to the common ancestor */ ret = cgroup_may_write(com_cgrp, sb); if (ret) return ret; /* * If namespaces are delegation boundaries, %current must be able * to see both source and destination cgroups from its namespace. */ if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) && (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) || !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp))) return -ENOENT; return 0; } static int cgroup_attach_permissions(struct cgroup *src_cgrp, struct cgroup *dst_cgrp, struct super_block *sb, bool threadgroup, struct cgroup_namespace *ns) { int ret = 0; ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns); if (ret) return ret; ret = cgroup_migrate_vet_dst(dst_cgrp); if (ret) return ret; if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)) ret = -EOPNOTSUPP; return ret; } static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, bool threadgroup) { struct cgroup_file_ctx *ctx = of->priv; struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; const struct cred *saved_cred; ssize_t ret; bool threadgroup_locked; dst_cgrp = cgroup_kn_lock_live(of->kn, false); if (!dst_cgrp) return -ENODEV; task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; /* find the source cgroup */ spin_lock_irq(&css_set_lock); src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); spin_unlock_irq(&css_set_lock); /* * Process and thread migrations follow same delegation rule. Check * permissions using the credentials from file open to protect against * inherited fd attacks. */ saved_cred = override_creds(of->file->f_cred); ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, of->file->f_path.dentry->d_sb, threadgroup, ctx->ns); revert_creds(saved_cred); if (ret) goto out_finish; ret = cgroup_attach_task(dst_cgrp, task, threadgroup); out_finish: cgroup_procs_write_finish(task, threadgroup_locked); out_unlock: cgroup_kn_unlock(of->kn); return ret; } static ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return __cgroup_procs_write(of, buf, true) ?: nbytes; } static void *cgroup_threads_start(struct seq_file *s, loff_t *pos) { return __cgroup_procs_start(s, pos, 0); } static ssize_t cgroup_threads_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return __cgroup_procs_write(of, buf, false) ?: nbytes; } /* cgroup core interface files for the default hierarchy */ static struct cftype cgroup_base_files[] = { { .name = "cgroup.type", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_type_show, .write = cgroup_type_write, }, { .name = "cgroup.procs", .flags = CFTYPE_NS_DELEGATABLE, .file_offset = offsetof(struct cgroup, procs_file), .release = cgroup_procs_release, .seq_start = cgroup_procs_start, .seq_next = cgroup_procs_next, .seq_show = cgroup_procs_show, .write = cgroup_procs_write, }, { .name = "cgroup.threads", .flags = CFTYPE_NS_DELEGATABLE, .release = cgroup_procs_release, .seq_start = cgroup_threads_start, .seq_next = cgroup_procs_next, .seq_show = cgroup_procs_show, .write = cgroup_threads_write, }, { .name = "cgroup.controllers", .seq_show = cgroup_controllers_show, }, { .name = "cgroup.subtree_control", .flags = CFTYPE_NS_DELEGATABLE, .seq_show = cgroup_subtree_control_show, .write = cgroup_subtree_control_write, }, { .name = "cgroup.events", .flags = CFTYPE_NOT_ON_ROOT, .file_offset = offsetof(struct cgroup, events_file), .seq_show = cgroup_events_show, }, { .name = "cgroup.max.descendants", .seq_show = cgroup_max_descendants_show, .write = cgroup_max_descendants_write, }, { .name = "cgroup.max.depth", .seq_show = cgroup_max_depth_show, .write = cgroup_max_depth_write, }, { .name = "cgroup.stat", .seq_show = cgroup_stat_show, }, { .name = "cgroup.freeze", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_freeze_show, .write = cgroup_freeze_write, }, { .name = "cgroup.kill", .flags = CFTYPE_NOT_ON_ROOT, .write = cgroup_kill_write, }, { .name = "cpu.stat", .seq_show = cpu_stat_show, }, { .name = "cpu.stat.local", .seq_show = cpu_local_stat_show, }, { } /* terminate */ }; static struct cftype cgroup_psi_files[] = { #ifdef CONFIG_PSI { .name = "io.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]), .open = cgroup_pressure_open, .seq_show = cgroup_io_pressure_show, .write = cgroup_io_pressure_write, .poll = cgroup_pressure_poll, .release = cgroup_pressure_release, }, { .name = "memory.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]), .open = cgroup_pressure_open, .seq_show = cgroup_memory_pressure_show, .write = cgroup_memory_pressure_write, .poll = cgroup_pressure_poll, .release = cgroup_pressure_release, }, { .name = "cpu.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]), .open = cgroup_pressure_open, .seq_show = cgroup_cpu_pressure_show, .write = cgroup_cpu_pressure_write, .poll = cgroup_pressure_poll, .release = cgroup_pressure_release, }, #ifdef CONFIG_IRQ_TIME_ACCOUNTING { .name = "irq.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]), .open = cgroup_pressure_open, .seq_show = cgroup_irq_pressure_show, .write = cgroup_irq_pressure_write, .poll = cgroup_pressure_poll, .release = cgroup_pressure_release, }, #endif { .name = "cgroup.pressure", .seq_show = cgroup_pressure_show, .write = cgroup_pressure_write, }, #endif /* CONFIG_PSI */ { } /* terminate */ }; /* * css destruction is four-stage process. * * 1. Destruction starts. Killing of the percpu_ref is initiated. * Implemented in kill_css(). * * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs * and thus css_tryget_online() is guaranteed to fail, the css can be * offlined by invoking offline_css(). After offlining, the base ref is * put. Implemented in css_killed_work_fn(). * * 3. When the percpu_ref reaches zero, the only possible remaining * accessors are inside RCU read sections. css_release() schedules the * RCU callback. * * 4. After the grace period, the css can be freed. Implemented in * css_free_rwork_fn(). * * It is actually hairier because both step 2 and 4 require process context * and thus involve punting to css->destroy_work adding two additional * steps to the already complex sequence. */ static void css_free_rwork_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(to_rcu_work(work), struct cgroup_subsys_state, destroy_rwork); struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; percpu_ref_exit(&css->refcnt); if (ss) { /* css free path */ struct cgroup_subsys_state *parent = css->parent; int id = css->id; ss->css_free(css); cgroup_idr_remove(&ss->css_idr, id); cgroup_put(cgrp); if (parent) css_put(parent); } else { /* cgroup free path */ atomic_dec(&cgrp->root->nr_cgrps); cgroup1_pidlist_destroy_all(cgrp); cancel_work_sync(&cgrp->release_agent_work); bpf_cgrp_storage_free(cgrp); if (cgroup_parent(cgrp)) { /* * We get a ref to the parent, and put the ref when * this cgroup is being freed, so it's guaranteed * that the parent won't be destroyed before its * children. */ cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); psi_cgroup_free(cgrp); cgroup_rstat_exit(cgrp); kfree(cgrp); } else { /* * This is root cgroup's refcnt reaching zero, * which indicates that the root should be * released. */ cgroup_destroy_root(cgrp->root); } } } static void css_release_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; cgroup_lock(); css->flags |= CSS_RELEASED; list_del_rcu(&css->sibling); if (ss) { /* css release path */ if (!list_empty(&css->rstat_css_node)) { cgroup_rstat_flush(cgrp); list_del_rcu(&css->rstat_css_node); } cgroup_idr_replace(&ss->css_idr, NULL, css->id); if (ss->css_released) ss->css_released(css); } else { struct cgroup *tcgrp; /* cgroup release path */ TRACE_CGROUP_PATH(release, cgrp); cgroup_rstat_flush(cgrp); spin_lock_irq(&css_set_lock); for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) tcgrp->nr_dying_descendants--; spin_unlock_irq(&css_set_lock); /* * There are two control paths which try to determine * cgroup from dentry without going through kernfs - * cgroupstats_build() and css_tryget_online_from_dir(). * Those are supported by RCU protecting clearing of * cgrp->kn->priv backpointer. */ if (cgrp->kn) RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); } cgroup_unlock(); INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); } static void css_release(struct percpu_ref *ref) { struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); INIT_WORK(&css->destroy_work, css_release_work_fn); queue_work(cgroup_destroy_wq, &css->destroy_work); } static void init_and_link_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, struct cgroup *cgrp) { lockdep_assert_held(&cgroup_mutex); cgroup_get_live(cgrp); memset(css, 0, sizeof(*css)); css->cgroup = cgrp; css->ss = ss; css->id = -1; INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); INIT_LIST_HEAD(&css->rstat_css_node); css->serial_nr = css_serial_nr_next++; atomic_set(&css->online_cnt, 0); if (cgroup_parent(cgrp)) { css->parent = cgroup_css(cgroup_parent(cgrp), ss); css_get(css->parent); } if (ss->css_rstat_flush) list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list); BUG_ON(cgroup_css(cgrp, ss)); } /* invoke ->css_online() on a new CSS and mark it online if successful */ static int online_css(struct cgroup_subsys_state *css) { struct cgroup_subsys *ss = css->ss; int ret = 0; lockdep_assert_held(&cgroup_mutex); if (ss->css_online) ret = ss->css_online(css); if (!ret) { css->flags |= CSS_ONLINE; rcu_assign_pointer(css->cgroup->subsys[ss->id], css); atomic_inc(&css->online_cnt); if (css->parent) atomic_inc(&css->parent->online_cnt); } return ret; } /* if the CSS is online, invoke ->css_offline() on it and mark it offline */ static void offline_css(struct cgroup_subsys_state *css) { struct cgroup_subsys *ss = css->ss; lockdep_assert_held(&cgroup_mutex); if (!(css->flags & CSS_ONLINE)) return; if (ss->css_offline) ss->css_offline(css); css->flags &= ~CSS_ONLINE; RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); wake_up_all(&css->cgroup->offline_waitq); } /** * css_create - create a cgroup_subsys_state * @cgrp: the cgroup new css will be associated with * @ss: the subsys of new css * * Create a new css associated with @cgrp - @ss pair. On success, the new * css is online and installed in @cgrp. This function doesn't create the * interface files. Returns 0 on success, -errno on failure. */ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, struct cgroup_subsys *ss) { struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); struct cgroup_subsys_state *css; int err; lockdep_assert_held(&cgroup_mutex); css = ss->css_alloc(parent_css); if (!css) css = ERR_PTR(-ENOMEM); if (IS_ERR(css)) return css; init_and_link_css(css, ss, cgrp); err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL); if (err) goto err_free_css; err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL); if (err < 0) goto err_free_css; css->id = err; /* @css is ready to be brought online now, make it visible */ list_add_tail_rcu(&css->sibling, &parent_css->children); cgroup_idr_replace(&ss->css_idr, css, css->id); err = online_css(css); if (err) goto err_list_del; return css; err_list_del: list_del_rcu(&css->sibling); err_free_css: list_del_rcu(&css->rstat_css_node); INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); return ERR_PTR(err); } /* * The returned cgroup is fully initialized including its control mask, but * it doesn't have the control mask applied. */ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, umode_t mode) { struct cgroup_root *root = parent->root; struct cgroup *cgrp, *tcgrp; struct kernfs_node *kn; int level = parent->level + 1; int ret; /* allocate the cgroup and its ID, 0 is reserved for the root */ cgrp = kzalloc(struct_size(cgrp, ancestors, (level + 1)), GFP_KERNEL); if (!cgrp) return ERR_PTR(-ENOMEM); ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL); if (ret) goto out_free_cgrp; ret = cgroup_rstat_init(cgrp); if (ret) goto out_cancel_ref; /* create the directory */ kn = kernfs_create_dir(parent->kn, name, mode, cgrp); if (IS_ERR(kn)) { ret = PTR_ERR(kn); goto out_stat_exit; } cgrp->kn = kn; init_cgroup_housekeeping(cgrp); cgrp->self.parent = &parent->self; cgrp->root = root; cgrp->level = level; ret = psi_cgroup_alloc(cgrp); if (ret) goto out_kernfs_remove; ret = cgroup_bpf_inherit(cgrp); if (ret) goto out_psi_free; /* * New cgroup inherits effective freeze counter, and * if the parent has to be frozen, the child has too. */ cgrp->freezer.e_freeze = parent->freezer.e_freeze; if (cgrp->freezer.e_freeze) { /* * Set the CGRP_FREEZE flag, so when a process will be * attached to the child cgroup, it will become frozen. * At this point the new cgroup is unpopulated, so we can * consider it frozen immediately. */ set_bit(CGRP_FREEZE, &cgrp->flags); set_bit(CGRP_FROZEN, &cgrp->flags); } spin_lock_irq(&css_set_lock); for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { cgrp->ancestors[tcgrp->level] = tcgrp; if (tcgrp != cgrp) { tcgrp->nr_descendants++; /* * If the new cgroup is frozen, all ancestor cgroups * get a new frozen descendant, but their state can't * change because of this. */ if (cgrp->freezer.e_freeze) tcgrp->freezer.nr_frozen_descendants++; } } spin_unlock_irq(&css_set_lock); if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); cgrp->self.serial_nr = css_serial_nr_next++; /* allocation complete, commit to creation */ list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children); atomic_inc(&root->nr_cgrps); cgroup_get_live(parent); /* * On the default hierarchy, a child doesn't automatically inherit * subtree_control from the parent. Each is configured manually. */ if (!cgroup_on_dfl(cgrp)) cgrp->subtree_control = cgroup_control(cgrp); cgroup_propagate_control(cgrp); return cgrp; out_psi_free: psi_cgroup_free(cgrp); out_kernfs_remove: kernfs_remove(cgrp->kn); out_stat_exit: cgroup_rstat_exit(cgrp); out_cancel_ref: percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: kfree(cgrp); return ERR_PTR(ret); } static bool cgroup_check_hierarchy_limits(struct cgroup *parent) { struct cgroup *cgroup; int ret = false; int level = 1; lockdep_assert_held(&cgroup_mutex); for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) { if (cgroup->nr_descendants >= cgroup->max_descendants) goto fail; if (level > cgroup->max_depth) goto fail; level++; } ret = true; fail: return ret; } int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { struct cgroup *parent, *cgrp; int ret; /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */ if (strchr(name, '\n')) return -EINVAL; parent = cgroup_kn_lock_live(parent_kn, false); if (!parent) return -ENODEV; if (!cgroup_check_hierarchy_limits(parent)) { ret = -EAGAIN; goto out_unlock; } cgrp = cgroup_create(parent, name, mode); if (IS_ERR(cgrp)) { ret = PTR_ERR(cgrp); goto out_unlock; } /* * This extra ref will be put in cgroup_free_fn() and guarantees * that @cgrp->kn is always accessible. */ kernfs_get(cgrp->kn); ret = cgroup_kn_set_ugid(cgrp->kn); if (ret) goto out_destroy; ret = css_populate_dir(&cgrp->self); if (ret) goto out_destroy; ret = cgroup_apply_control_enable(cgrp); if (ret) goto out_destroy; TRACE_CGROUP_PATH(mkdir, cgrp); /* let's create and online css's */ kernfs_activate(cgrp->kn); ret = 0; goto out_unlock; out_destroy: cgroup_destroy_locked(cgrp); out_unlock: cgroup_kn_unlock(parent_kn); return ret; } /* * This is called when the refcnt of a css is confirmed to be killed. * css_tryget_online() is now guaranteed to fail. Tell the subsystem to * initiate destruction and put the css ref from kill_css(). */ static void css_killed_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, destroy_work); cgroup_lock(); do { offline_css(css); css_put(css); /* @css can't go away while we're holding cgroup_mutex */ css = css->parent; } while (css && atomic_dec_and_test(&css->online_cnt)); cgroup_unlock(); } /* css kill confirmation processing requires process context, bounce */ static void css_killed_ref_fn(struct percpu_ref *ref) { struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); if (atomic_dec_and_test(&css->online_cnt)) { INIT_WORK(&css->destroy_work, css_killed_work_fn); queue_work(cgroup_destroy_wq, &css->destroy_work); } } /** * kill_css - destroy a css * @css: css to destroy * * This function initiates destruction of @css by removing cgroup interface * files and putting its base reference. ->css_offline() will be invoked * asynchronously once css_tryget_online() is guaranteed to fail and when * the reference count reaches zero, @css will be released. */ static void kill_css(struct cgroup_subsys_state *css) { lockdep_assert_held(&cgroup_mutex); if (css->flags & CSS_DYING) return; css->flags |= CSS_DYING; /* * This must happen before css is disassociated with its cgroup. * See seq_css() for details. */ css_clear_dir(css); /* * Killing would put the base ref, but we need to keep it alive * until after ->css_offline(). */ css_get(css); /* * cgroup core guarantees that, by the time ->css_offline() is * invoked, no new css reference will be given out via * css_tryget_online(). We can't simply call percpu_ref_kill() and * proceed to offlining css's because percpu_ref_kill() doesn't * guarantee that the ref is seen as killed on all CPUs on return. * * Use percpu_ref_kill_and_confirm() to get notifications as each * css is confirmed to be seen as killed on all CPUs. */ percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); } /** * cgroup_destroy_locked - the first stage of cgroup destruction * @cgrp: cgroup to be destroyed * * css's make use of percpu refcnts whose killing latency shouldn't be * exposed to userland and are RCU protected. Also, cgroup core needs to * guarantee that css_tryget_online() won't succeed by the time * ->css_offline() is invoked. To satisfy all the requirements, * destruction is implemented in the following two steps. * * s1. Verify @cgrp can be destroyed and mark it dying. Remove all * userland visible parts and start killing the percpu refcnts of * css's. Set up so that the next stage will be kicked off once all * the percpu refcnts are confirmed to be killed. * * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the * rest of destruction. Once all cgroup references are gone, the * cgroup is RCU-freed. * * This function implements s1. After this step, @cgrp is gone as far as * the userland is concerned and a new cgroup with the same name may be * created. As cgroup doesn't care about the names internally, this * doesn't cause any problem. */ static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct cgroup *tcgrp, *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *css; struct cgrp_cset_link *link; int ssid; lockdep_assert_held(&cgroup_mutex); /* * Only migration can raise populated from zero and we're already * holding cgroup_mutex. */ if (cgroup_is_populated(cgrp)) return -EBUSY; /* * Make sure there's no live children. We can't test emptiness of * ->self.children as dead children linger on it while being * drained; otherwise, "rmdir parent/child parent" may fail. */ if (css_has_online_children(&cgrp->self)) return -EBUSY; /* * Mark @cgrp and the associated csets dead. The former prevents * further task migration and child creation by disabling * cgroup_kn_lock_live(). The latter makes the csets ignored by * the migration path. */ cgrp->self.flags &= ~CSS_ONLINE; spin_lock_irq(&css_set_lock); list_for_each_entry(link, &cgrp->cset_links, cset_link) link->cset->dead = true; spin_unlock_irq(&css_set_lock); /* initiate massacre of all css's */ for_each_css(css, ssid, cgrp) kill_css(css); /* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */ css_clear_dir(&cgrp->self); kernfs_remove(cgrp->kn); if (cgroup_is_threaded(cgrp)) parent->nr_threaded_children--; spin_lock_irq(&css_set_lock); for (tcgrp = parent; tcgrp; tcgrp = cgroup_parent(tcgrp)) { tcgrp->nr_descendants--; tcgrp->nr_dying_descendants++; /* * If the dying cgroup is frozen, decrease frozen descendants * counters of ancestor cgroups. */ if (test_bit(CGRP_FROZEN, &cgrp->flags)) tcgrp->freezer.nr_frozen_descendants--; } spin_unlock_irq(&css_set_lock); cgroup1_check_for_release(parent); cgroup_bpf_offline(cgrp); /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); return 0; }; int cgroup_rmdir(struct kernfs_node *kn) { struct cgroup *cgrp; int ret = 0; cgrp = cgroup_kn_lock_live(kn, false); if (!cgrp) return 0; ret = cgroup_destroy_locked(cgrp); if (!ret) TRACE_CGROUP_PATH(rmdir, cgrp); cgroup_kn_unlock(kn); return ret; } static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { .show_options = cgroup_show_options, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .show_path = cgroup_show_path, }; static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) { struct cgroup_subsys_state *css; pr_debug("Initializing cgroup subsys %s\n", ss->name); cgroup_lock(); idr_init(&ss->css_idr); INIT_LIST_HEAD(&ss->cfts); /* Create the root cgroup state for this subsystem */ ss->root = &cgrp_dfl_root; css = ss->css_alloc(NULL); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_and_link_css(css, ss, &cgrp_dfl_root.cgrp); /* * Root csses are never destroyed and we can't initialize * percpu_ref during early init. Disable refcnting. */ css->flags |= CSS_NO_REF; if (early) { /* allocation can't be done safely during early init */ css->id = 1; } else { css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); BUG_ON(css->id < 0); } /* Update the init_css_set to contain a subsys * pointer to this state - since the subsystem is * newly registered, all tasks and hence the * init_css_set is in the subsystem's root cgroup. */ init_css_set.subsys[ss->id] = css; have_fork_callback |= (bool)ss->fork << ss->id; have_exit_callback |= (bool)ss->exit << ss->id; have_release_callback |= (bool)ss->release << ss->id; have_canfork_callback |= (bool)ss->can_fork << ss->id; /* At system boot, before all subsystems have been * registered, no tasks have been forked, so we don't * need to invoke fork callbacks here. */ BUG_ON(!list_empty(&init_task.tasks)); BUG_ON(online_css(css)); cgroup_unlock(); } /** * cgroup_init_early - cgroup initialization at system boot * * Initialize cgroups at system boot, and initialize any * subsystems that request early init. */ int __init cgroup_init_early(void) { static struct cgroup_fs_context __initdata ctx; struct cgroup_subsys *ss; int i; ctx.root = &cgrp_dfl_root; init_cgroup_root(&ctx); cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF; RCU_INIT_POINTER(init_task.cgroups, &init_css_set); for_each_subsys(ss, i) { WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id, "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n", i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free, ss->id, ss->name); WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN, "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]); ss->id = i; ss->name = cgroup_subsys_name[i]; if (!ss->legacy_name) ss->legacy_name = cgroup_subsys_name[i]; if (ss->early_init) cgroup_init_subsys(ss, true); } return 0; } /** * cgroup_init - cgroup initialization * * Register cgroup filesystem and /proc file, and initialize * any subsystems that didn't request early init. */ int __init cgroup_init(void) { struct cgroup_subsys *ss; int ssid; BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); cgroup_rstat_boot(); get_user_ns(init_cgroup_ns.user_ns); cgroup_lock(); /* * Add init_css_set to the hash table so that dfl_root can link to * it during init. */ hash_add(css_set_table, &init_css_set.hlist, css_set_hash(init_css_set.subsys)); BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); cgroup_unlock(); for_each_subsys(ss, ssid) { if (ss->early_init) { struct cgroup_subsys_state *css = init_css_set.subsys[ss->id]; css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); BUG_ON(css->id < 0); } else { cgroup_init_subsys(ss, false); } list_add_tail(&init_css_set.e_cset_node[ssid], &cgrp_dfl_root.cgrp.e_csets[ssid]); /* * Setting dfl_root subsys_mask needs to consider the * disabled flag and cftype registration needs kmalloc, * both of which aren't available during early_init. */ if (!cgroup_ssid_enabled(ssid)) continue; if (cgroup1_ssid_disabled(ssid)) pr_info("Disabling %s control group subsystem in v1 mounts\n", ss->name); cgrp_dfl_root.subsys_mask |= 1 << ss->id; /* implicit controllers must be threaded too */ WARN_ON(ss->implicit_on_dfl && !ss->threaded); if (ss->implicit_on_dfl) cgrp_dfl_implicit_ss_mask |= 1 << ss->id; else if (!ss->dfl_cftypes) cgrp_dfl_inhibit_ss_mask |= 1 << ss->id; if (ss->threaded) cgrp_dfl_threaded_ss_mask |= 1 << ss->id; if (ss->dfl_cftypes == ss->legacy_cftypes) { WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes)); } else { WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes)); WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes)); } if (ss->bind) ss->bind(init_css_set.subsys[ssid]); cgroup_lock(); css_populate_dir(init_css_set.subsys[ssid]); cgroup_unlock(); } /* init_css_set.subsys[] has been updated, re-hash */ hash_del(&init_css_set.hlist); hash_add(css_set_table, &init_css_set.hlist, css_set_hash(init_css_set.subsys)); WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup")); WARN_ON(register_filesystem(&cgroup_fs_type)); WARN_ON(register_filesystem(&cgroup2_fs_type)); WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show)); #ifdef CONFIG_CPUSETS WARN_ON(register_filesystem(&cpuset_fs_type)); #endif return 0; } static int __init cgroup_wq_init(void) { /* * There isn't much point in executing destruction path in * parallel. Good chunk is serialized with cgroup_mutex anyway. * Use 1 for @max_active. * * We would prefer to do this in cgroup_init() above, but that * is called before init_workqueues(): so leave this until after. */ cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); BUG_ON(!cgroup_destroy_wq); return 0; } core_initcall(cgroup_wq_init); void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) { struct kernfs_node *kn; kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id); if (!kn) return; kernfs_path(kn, buf, buflen); kernfs_put(kn); } /* * cgroup_get_from_id : get the cgroup associated with cgroup id * @id: cgroup id * On success return the cgrp or ERR_PTR on failure * Only cgroups within current task's cgroup NS are valid. */ struct cgroup *cgroup_get_from_id(u64 id) { struct kernfs_node *kn; struct cgroup *cgrp, *root_cgrp; kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id); if (!kn) return ERR_PTR(-ENOENT); if (kernfs_type(kn) != KERNFS_DIR) { kernfs_put(kn); return ERR_PTR(-ENOENT); } rcu_read_lock(); cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); if (cgrp && !cgroup_tryget(cgrp)) cgrp = NULL; rcu_read_unlock(); kernfs_put(kn); if (!cgrp) return ERR_PTR(-ENOENT); root_cgrp = current_cgns_cgroup_dfl(); if (!cgroup_is_descendant(cgrp, root_cgrp)) { cgroup_put(cgrp); return ERR_PTR(-ENOENT); } return cgrp; } EXPORT_SYMBOL_GPL(cgroup_get_from_id); /* * proc_cgroup_show() * - Print task's cgroup paths into seq_file, one line for each hierarchy * - Used for /proc/<pid>/cgroup. */ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk) { char *buf; int retval; struct cgroup_root *root; retval = -ENOMEM; buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) goto out; cgroup_lock(); spin_lock_irq(&css_set_lock); for_each_root(root) { struct cgroup_subsys *ss; struct cgroup *cgrp; int ssid, count = 0; if (root == &cgrp_dfl_root && !READ_ONCE(cgrp_dfl_visible)) continue; seq_printf(m, "%d:", root->hierarchy_id); if (root != &cgrp_dfl_root) for_each_subsys(ss, ssid) if (root->subsys_mask & (1 << ssid)) seq_printf(m, "%s%s", count++ ? "," : "", ss->legacy_name); if (strlen(root->name)) seq_printf(m, "%sname=%s", count ? "," : "", root->name); seq_putc(m, ':'); cgrp = task_cgroup_from_root(tsk, root); /* * On traditional hierarchies, all zombie tasks show up as * belonging to the root cgroup. On the default hierarchy, * while a zombie doesn't show up in "cgroup.procs" and * thus can't be migrated, its /proc/PID/cgroup keeps * reporting the cgroup it belonged to before exiting. If * the cgroup is removed before the zombie is reaped, * " (deleted)" is appended to the cgroup path. */ if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, current->nsproxy->cgroup_ns); if (retval >= PATH_MAX) retval = -ENAMETOOLONG; if (retval < 0) goto out_unlock; seq_puts(m, buf); } else { seq_puts(m, "/"); } if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp)) seq_puts(m, " (deleted)\n"); else seq_putc(m, '\n'); } retval = 0; out_unlock: spin_unlock_irq(&css_set_lock); cgroup_unlock(); kfree(buf); out: return retval; } /** * cgroup_fork - initialize cgroup related fields during copy_process() * @child: pointer to task_struct of forking parent process. * * A task is associated with the init_css_set until cgroup_post_fork() * attaches it to the target css_set. */ void cgroup_fork(struct task_struct *child) { RCU_INIT_POINTER(child->cgroups, &init_css_set); INIT_LIST_HEAD(&child->cg_list); } /** * cgroup_v1v2_get_from_file - get a cgroup pointer from a file pointer * @f: file corresponding to cgroup_dir * * Find the cgroup from a file pointer associated with a cgroup directory. * Returns a pointer to the cgroup on success. ERR_PTR is returned if the * cgroup cannot be found. */ static struct cgroup *cgroup_v1v2_get_from_file(struct file *f) { struct cgroup_subsys_state *css; css = css_tryget_online_from_dir(f->f_path.dentry, NULL); if (IS_ERR(css)) return ERR_CAST(css); return css->cgroup; } /** * cgroup_get_from_file - same as cgroup_v1v2_get_from_file, but only supports * cgroup2. * @f: file corresponding to cgroup2_dir */ static struct cgroup *cgroup_get_from_file(struct file *f) { struct cgroup *cgrp = cgroup_v1v2_get_from_file(f); if (IS_ERR(cgrp)) return ERR_CAST(cgrp); if (!cgroup_on_dfl(cgrp)) { cgroup_put(cgrp); return ERR_PTR(-EBADF); } return cgrp; } /** * cgroup_css_set_fork - find or create a css_set for a child process * @kargs: the arguments passed to create the child process * * This functions finds or creates a new css_set which the child * process will be attached to in cgroup_post_fork(). By default, * the child process will be given the same css_set as its parent. * * If CLONE_INTO_CGROUP is specified this function will try to find an * existing css_set which includes the requested cgroup and if not create * a new css_set that the child will be attached to later. If this function * succeeds it will hold cgroup_threadgroup_rwsem on return. If * CLONE_INTO_CGROUP is requested this function will grab cgroup mutex * before grabbing cgroup_threadgroup_rwsem and will hold a reference * to the target cgroup. */ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) __acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem) { int ret; struct cgroup *dst_cgrp = NULL; struct css_set *cset; struct super_block *sb; struct file *f; if (kargs->flags & CLONE_INTO_CGROUP) cgroup_lock(); cgroup_threadgroup_change_begin(current); spin_lock_irq(&css_set_lock); cset = task_css_set(current); get_css_set(cset); spin_unlock_irq(&css_set_lock); if (!(kargs->flags & CLONE_INTO_CGROUP)) { kargs->cset = cset; return 0; } f = fget_raw(kargs->cgroup); if (!f) { ret = -EBADF; goto err; } sb = f->f_path.dentry->d_sb; dst_cgrp = cgroup_get_from_file(f); if (IS_ERR(dst_cgrp)) { ret = PTR_ERR(dst_cgrp); dst_cgrp = NULL; goto err; } if (cgroup_is_dead(dst_cgrp)) { ret = -ENODEV; goto err; } /* * Verify that we the target cgroup is writable for us. This is * usually done by the vfs layer but since we're not going through * the vfs layer here we need to do it "manually". */ ret = cgroup_may_write(dst_cgrp, sb); if (ret) goto err; /* * Spawning a task directly into a cgroup works by passing a file * descriptor to the target cgroup directory. This can even be an O_PATH * file descriptor. But it can never be a cgroup.procs file descriptor. * This was done on purpose so spawning into a cgroup could be * conceptualized as an atomic * * fd = openat(dfd_cgroup, "cgroup.procs", ...); * write(fd, <child-pid>, ...); * * sequence, i.e. it's a shorthand for the caller opening and writing * cgroup.procs of the cgroup indicated by @dfd_cgroup. This allows us * to always use the caller's credentials. */ ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb, !(kargs->flags & CLONE_THREAD), current->nsproxy->cgroup_ns); if (ret) goto err; kargs->cset = find_css_set(cset, dst_cgrp); if (!kargs->cset) { ret = -ENOMEM; goto err; } put_css_set(cset); fput(f); kargs->cgrp = dst_cgrp; return ret; err: cgroup_threadgroup_change_end(current); cgroup_unlock(); if (f) fput(f); if (dst_cgrp) cgroup_put(dst_cgrp); put_css_set(cset); if (kargs->cset) put_css_set(kargs->cset); return ret; } /** * cgroup_css_set_put_fork - drop references we took during fork * @kargs: the arguments passed to create the child process * * Drop references to the prepared css_set and target cgroup if * CLONE_INTO_CGROUP was requested. */ static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs) __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) { struct cgroup *cgrp = kargs->cgrp; struct css_set *cset = kargs->cset; cgroup_threadgroup_change_end(current); if (cset) { put_css_set(cset); kargs->cset = NULL; } if (kargs->flags & CLONE_INTO_CGROUP) { cgroup_unlock(); if (cgrp) { cgroup_put(cgrp); kargs->cgrp = NULL; } } } /** * cgroup_can_fork - called on a new task before the process is exposed * @child: the child process * @kargs: the arguments passed to create the child process * * This prepares a new css_set for the child process which the child will * be attached to in cgroup_post_fork(). * This calls the subsystem can_fork() callbacks. If the cgroup_can_fork() * callback returns an error, the fork aborts with that error code. This * allows for a cgroup subsystem to conditionally allow or deny new forks. */ int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs) { struct cgroup_subsys *ss; int i, j, ret; ret = cgroup_css_set_fork(kargs); if (ret) return ret; do_each_subsys_mask(ss, i, have_canfork_callback) { ret = ss->can_fork(child, kargs->cset); if (ret) goto out_revert; } while_each_subsys_mask(); return 0; out_revert: for_each_subsys(ss, j) { if (j >= i) break; if (ss->cancel_fork) ss->cancel_fork(child, kargs->cset); } cgroup_css_set_put_fork(kargs); return ret; } /** * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork() * @child: the child process * @kargs: the arguments passed to create the child process * * This calls the cancel_fork() callbacks if a fork failed *after* * cgroup_can_fork() succeeded and cleans up references we took to * prepare a new css_set for the child process in cgroup_can_fork(). */ void cgroup_cancel_fork(struct task_struct *child, struct kernel_clone_args *kargs) { struct cgroup_subsys *ss; int i; for_each_subsys(ss, i) if (ss->cancel_fork) ss->cancel_fork(child, kargs->cset); cgroup_css_set_put_fork(kargs); } /** * cgroup_post_fork - finalize cgroup setup for the child process * @child: the child process * @kargs: the arguments passed to create the child process * * Attach the child process to its css_set calling the subsystem fork() * callbacks. */ void cgroup_post_fork(struct task_struct *child, struct kernel_clone_args *kargs) __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) { unsigned long cgrp_flags = 0; bool kill = false; struct cgroup_subsys *ss; struct css_set *cset; int i; cset = kargs->cset; kargs->cset = NULL; spin_lock_irq(&css_set_lock); /* init tasks are special, only link regular threads */ if (likely(child->pid)) { if (kargs->cgrp) cgrp_flags = kargs->cgrp->flags; else cgrp_flags = cset->dfl_cgrp->flags; WARN_ON_ONCE(!list_empty(&child->cg_list)); cset->nr_tasks++; css_set_move_task(child, NULL, cset, false); } else { put_css_set(cset); cset = NULL; } if (!(child->flags & PF_KTHREAD)) { if (unlikely(test_bit(CGRP_FREEZE, &cgrp_flags))) { /* * If the cgroup has to be frozen, the new task has * too. Let's set the JOBCTL_TRAP_FREEZE jobctl bit to * get the task into the frozen state. */ spin_lock(&child->sighand->siglock); WARN_ON_ONCE(child->frozen); child->jobctl |= JOBCTL_TRAP_FREEZE; spin_unlock(&child->sighand->siglock); /* * Calling cgroup_update_frozen() isn't required here, * because it will be called anyway a bit later from * do_freezer_trap(). So we avoid cgroup's transient * switch from the frozen state and back. */ } /* * If the cgroup is to be killed notice it now and take the * child down right after we finished preparing it for * userspace. */ kill = test_bit(CGRP_KILL, &cgrp_flags); } spin_unlock_irq(&css_set_lock); /* * Call ss->fork(). This must happen after @child is linked on * css_set; otherwise, @child might change state between ->fork() * and addition to css_set. */ do_each_subsys_mask(ss, i, have_fork_callback) { ss->fork(child); } while_each_subsys_mask(); /* Make the new cset the root_cset of the new cgroup namespace. */ if (kargs->flags & CLONE_NEWCGROUP) { struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset; get_css_set(cset); child->nsproxy->cgroup_ns->root_cset = cset; put_css_set(rcset); } /* Cgroup has to be killed so take down child immediately. */ if (unlikely(kill)) do_send_sig_info(SIGKILL, SEND_SIG_NOINFO, child, PIDTYPE_TGID); cgroup_css_set_put_fork(kargs); } /** * cgroup_exit - detach cgroup from exiting task * @tsk: pointer to task_struct of exiting process * * Description: Detach cgroup from @tsk. * */ void cgroup_exit(struct task_struct *tsk) { struct cgroup_subsys *ss; struct css_set *cset; int i; spin_lock_irq(&css_set_lock); WARN_ON_ONCE(list_empty(&tsk->cg_list)); cset = task_css_set(tsk); css_set_move_task(tsk, cset, NULL, false); list_add_tail(&tsk->cg_list, &cset->dying_tasks); cset->nr_tasks--; if (dl_task(tsk)) dec_dl_tasks_cs(tsk); WARN_ON_ONCE(cgroup_task_frozen(tsk)); if (unlikely(!(tsk->flags & PF_KTHREAD) && test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags))) cgroup_update_frozen(task_dfl_cgroup(tsk)); spin_unlock_irq(&css_set_lock); /* see cgroup_post_fork() for details */ do_each_subsys_mask(ss, i, have_exit_callback) { ss->exit(tsk); } while_each_subsys_mask(); } void cgroup_release(struct task_struct *task) { struct cgroup_subsys *ss; int ssid; do_each_subsys_mask(ss, ssid, have_release_callback) { ss->release(task); } while_each_subsys_mask(); spin_lock_irq(&css_set_lock); css_set_skip_task_iters(task_css_set(task), task); list_del_init(&task->cg_list); spin_unlock_irq(&css_set_lock); } void cgroup_free(struct task_struct *task) { struct css_set *cset = task_css_set(task); put_css_set(cset); } static int __init cgroup_disable(char *str) { struct cgroup_subsys *ss; char *token; int i; while ((token = strsep(&str, ",")) != NULL) { if (!*token) continue; for_each_subsys(ss, i) { if (strcmp(token, ss->name) && strcmp(token, ss->legacy_name)) continue; static_branch_disable(cgroup_subsys_enabled_key[i]); pr_info("Disabling %s control group subsystem\n", ss->name); } for (i = 0; i < OPT_FEATURE_COUNT; i++) { if (strcmp(token, cgroup_opt_feature_names[i])) continue; cgroup_feature_disable_mask |= 1 << i; pr_info("Disabling %s control group feature\n", cgroup_opt_feature_names[i]); break; } } return 1; } __setup("cgroup_disable=", cgroup_disable); void __init __weak enable_debug_cgroup(void) { } static int __init enable_cgroup_debug(char *str) { cgroup_debug = true; enable_debug_cgroup(); return 1; } __setup("cgroup_debug", enable_cgroup_debug); /** * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest * @ss: subsystem of interest * * If @dentry is a directory for a cgroup which has @ss enabled on it, try * to get the corresponding css and return it. If such css doesn't exist * or can't be pinned, an ERR_PTR value is returned. */ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys *ss) { struct kernfs_node *kn = kernfs_node_from_dentry(dentry); struct file_system_type *s_type = dentry->d_sb->s_type; struct cgroup_subsys_state *css = NULL; struct cgroup *cgrp; /* is @dentry a cgroup dir? */ if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) || !kn || kernfs_type(kn) != KERNFS_DIR) return ERR_PTR(-EBADF); rcu_read_lock(); /* * This path doesn't originate from kernfs and @kn could already * have been or be removed at any point. @kn->priv is RCU * protected for this access. See css_release_work_fn() for details. */ cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); if (cgrp) css = cgroup_css(cgrp, ss); if (!css || !css_tryget_online(css)) css = ERR_PTR(-ENOENT); rcu_read_unlock(); return css; } /** * css_from_id - lookup css by id * @id: the cgroup id * @ss: cgroup subsys to be looked into * * Returns the css if there's valid one with @id, otherwise returns NULL. * Should be called under rcu_read_lock(). */ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) { WARN_ON_ONCE(!rcu_read_lock_held()); return idr_find(&ss->css_idr, id); } /** * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path * @path: path on the default hierarchy * * Find the cgroup at @path on the default hierarchy, increment its * reference count and return it. Returns pointer to the found cgroup on * success, ERR_PTR(-ENOENT) if @path doesn't exist or if the cgroup has already * been released and ERR_PTR(-ENOTDIR) if @path points to a non-directory. */ struct cgroup *cgroup_get_from_path(const char *path) { struct kernfs_node *kn; struct cgroup *cgrp = ERR_PTR(-ENOENT); struct cgroup *root_cgrp; root_cgrp = current_cgns_cgroup_dfl(); kn = kernfs_walk_and_get(root_cgrp->kn, path); if (!kn) goto out; if (kernfs_type(kn) != KERNFS_DIR) { cgrp = ERR_PTR(-ENOTDIR); goto out_kernfs; } rcu_read_lock(); cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); if (!cgrp || !cgroup_tryget(cgrp)) cgrp = ERR_PTR(-ENOENT); rcu_read_unlock(); out_kernfs: kernfs_put(kn); out: return cgrp; } EXPORT_SYMBOL_GPL(cgroup_get_from_path); /** * cgroup_v1v2_get_from_fd - get a cgroup pointer from a fd * @fd: fd obtained by open(cgroup_dir) * * Find the cgroup from a fd which should be obtained * by opening a cgroup directory. Returns a pointer to the * cgroup on success. ERR_PTR is returned if the cgroup * cannot be found. */ struct cgroup *cgroup_v1v2_get_from_fd(int fd) { struct cgroup *cgrp; struct fd f = fdget_raw(fd); if (!f.file) return ERR_PTR(-EBADF); cgrp = cgroup_v1v2_get_from_file(f.file); fdput(f); return cgrp; } /** * cgroup_get_from_fd - same as cgroup_v1v2_get_from_fd, but only supports * cgroup2. * @fd: fd obtained by open(cgroup2_dir) */ struct cgroup *cgroup_get_from_fd(int fd) { struct cgroup *cgrp = cgroup_v1v2_get_from_fd(fd); if (IS_ERR(cgrp)) return ERR_CAST(cgrp); if (!cgroup_on_dfl(cgrp)) { cgroup_put(cgrp); return ERR_PTR(-EBADF); } return cgrp; } EXPORT_SYMBOL_GPL(cgroup_get_from_fd); static u64 power_of_ten(int power) { u64 v = 1; while (power--) v *= 10; return v; } /** * cgroup_parse_float - parse a floating number * @input: input string * @dec_shift: number of decimal digits to shift * @v: output * * Parse a decimal floating point number in @input and store the result in * @v with decimal point right shifted @dec_shift times. For example, if * @input is "12.3456" and @dec_shift is 3, *@v will be set to 12345. * Returns 0 on success, -errno otherwise. * * There's nothing cgroup specific about this function except that it's * currently the only user. */ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v) { s64 whole, frac = 0; int fstart = 0, fend = 0, flen; if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend)) return -EINVAL; if (frac < 0) return -EINVAL; flen = fend > fstart ? fend - fstart : 0; if (flen < dec_shift) frac *= power_of_ten(dec_shift - flen); else frac = DIV_ROUND_CLOSEST_ULL(frac, power_of_ten(flen - dec_shift)); *v = whole * power_of_ten(dec_shift) + frac; return 0; } /* * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data * definition in cgroup-defs.h. */ #ifdef CONFIG_SOCK_CGROUP_DATA void cgroup_sk_alloc(struct sock_cgroup_data *skcd) { struct cgroup *cgroup; rcu_read_lock(); /* Don't associate the sock with unrelated interrupted task's cgroup. */ if (in_interrupt()) { cgroup = &cgrp_dfl_root.cgrp; cgroup_get(cgroup); goto out; } while (true) { struct css_set *cset; cset = task_css_set(current); if (likely(cgroup_tryget(cset->dfl_cgrp))) { cgroup = cset->dfl_cgrp; break; } cpu_relax(); } out: skcd->cgroup = cgroup; cgroup_bpf_get(cgroup); rcu_read_unlock(); } void cgroup_sk_clone(struct sock_cgroup_data *skcd) { struct cgroup *cgrp = sock_cgroup_ptr(skcd); /* * We might be cloning a socket which is left in an empty * cgroup and the cgroup might have already been rmdir'd. * Don't use cgroup_get_live(). */ cgroup_get(cgrp); cgroup_bpf_get(cgrp); } void cgroup_sk_free(struct sock_cgroup_data *skcd) { struct cgroup *cgrp = sock_cgroup_ptr(skcd); cgroup_bpf_put(cgrp); cgroup_put(cgrp); } #endif /* CONFIG_SOCK_CGROUP_DATA */ #ifdef CONFIG_SYSFS static ssize_t show_delegatable_files(struct cftype *files, char *buf, ssize_t size, const char *prefix) { struct cftype *cft; ssize_t ret = 0; for (cft = files; cft && cft->name[0] != '\0'; cft++) { if (!(cft->flags & CFTYPE_NS_DELEGATABLE)) continue; if (prefix) ret += snprintf(buf + ret, size - ret, "%s.", prefix); ret += snprintf(buf + ret, size - ret, "%s\n", cft->name); if (WARN_ON(ret >= size)) break; } return ret; } static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct cgroup_subsys *ss; int ssid; ssize_t ret = 0; ret = show_delegatable_files(cgroup_base_files, buf + ret, PAGE_SIZE - ret, NULL); if (cgroup_psi_enabled()) ret += show_delegatable_files(cgroup_psi_files, buf + ret, PAGE_SIZE - ret, NULL); for_each_subsys(ss, ssid) ret += show_delegatable_files(ss->dfl_cftypes, buf + ret, PAGE_SIZE - ret, cgroup_subsys_name[ssid]); return ret; } static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate); static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return snprintf(buf, PAGE_SIZE, "nsdelegate\n" "favordynmods\n" "memory_localevents\n" "memory_recursiveprot\n"); } static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); static struct attribute *cgroup_sysfs_attrs[] = { &cgroup_delegate_attr.attr, &cgroup_features_attr.attr, NULL, }; static const struct attribute_group cgroup_sysfs_attr_group = { .attrs = cgroup_sysfs_attrs, .name = "cgroup", }; static int __init cgroup_sysfs_init(void) { return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group); } subsys_initcall(cgroup_sysfs_init); #endif /* CONFIG_SYSFS */ |
| 29 29 29 29 27 28 28 28 6 6 6 6 6 5 6 29 29 29 29 29 29 7 28 23 28 28 28 28 27 1 28 27 6 29 29 6 6 2935 2936 2937 2912 2911 2911 3256 3257 3255 3256 3256 3257 3257 3257 3257 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 | // SPDX-License-Identifier: GPL-2.0-only /* * fs/kernfs/file.c - kernfs file implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> */ #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/pagemap.h> #include <linux/sched/mm.h> #include <linux/fsnotify.h> #include <linux/uio.h> #include "kernfs-internal.h" struct kernfs_open_node { struct rcu_head rcu_head; atomic_t event; wait_queue_head_t poll; struct list_head files; /* goes through kernfs_open_file.list */ unsigned int nr_mmapped; unsigned int nr_to_release; }; /* * kernfs_notify() may be called from any context and bounces notifications * through a work item. To minimize space overhead in kernfs_node, the * pending queue is implemented as a singly linked list of kernfs_nodes. * The list is terminated with the self pointer so that whether a * kernfs_node is on the list or not can be determined by testing the next * pointer for %NULL. */ #define KERNFS_NOTIFY_EOL ((void *)&kernfs_notify_list) static DEFINE_SPINLOCK(kernfs_notify_lock); static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL; static inline struct mutex *kernfs_open_file_mutex_ptr(struct kernfs_node *kn) { int idx = hash_ptr(kn, NR_KERNFS_LOCK_BITS); return &kernfs_locks->open_file_mutex[idx]; } static inline struct mutex *kernfs_open_file_mutex_lock(struct kernfs_node *kn) { struct mutex *lock; lock = kernfs_open_file_mutex_ptr(kn); mutex_lock(lock); return lock; } /** * of_on - Get the kernfs_open_node of the specified kernfs_open_file * @of: target kernfs_open_file * * Return: the kernfs_open_node of the kernfs_open_file */ static struct kernfs_open_node *of_on(struct kernfs_open_file *of) { return rcu_dereference_protected(of->kn->attr.open, !list_empty(&of->list)); } /** * kernfs_deref_open_node_locked - Get kernfs_open_node corresponding to @kn * * @kn: target kernfs_node. * * Fetch and return ->attr.open of @kn when caller holds the * kernfs_open_file_mutex_ptr(kn). * * Update of ->attr.open happens under kernfs_open_file_mutex_ptr(kn). So when * the caller guarantees that this mutex is being held, other updaters can't * change ->attr.open and this means that we can safely deref ->attr.open * outside RCU read-side critical section. * * The caller needs to make sure that kernfs_open_file_mutex is held. * * Return: @kn->attr.open when kernfs_open_file_mutex is held. */ static struct kernfs_open_node * kernfs_deref_open_node_locked(struct kernfs_node *kn) { return rcu_dereference_protected(kn->attr.open, lockdep_is_held(kernfs_open_file_mutex_ptr(kn))); } static struct kernfs_open_file *kernfs_of(struct file *file) { return ((struct seq_file *)file->private_data)->private; } /* * Determine the kernfs_ops for the given kernfs_node. This function must * be called while holding an active reference. */ static const struct kernfs_ops *kernfs_ops(struct kernfs_node *kn) { if (kn->flags & KERNFS_LOCKDEP) lockdep_assert_held(kn); return kn->attr.ops; } /* * As kernfs_seq_stop() is also called after kernfs_seq_start() or * kernfs_seq_next() failure, it needs to distinguish whether it's stopping * a seq_file iteration which is fully initialized with an active reference * or an aborted kernfs_seq_start() due to get_active failure. The * position pointer is the only context for each seq_file iteration and * thus the stop condition should be encoded in it. As the return value is * directly visible to userland, ERR_PTR(-ENODEV) is the only acceptable * choice to indicate get_active failure. * * Unfortunately, this is complicated due to the optional custom seq_file * operations which may return ERR_PTR(-ENODEV) too. kernfs_seq_stop() * can't distinguish whether ERR_PTR(-ENODEV) is from get_active failure or * custom seq_file operations and thus can't decide whether put_active * should be performed or not only on ERR_PTR(-ENODEV). * * This is worked around by factoring out the custom seq_stop() and * put_active part into kernfs_seq_stop_active(), skipping it from * kernfs_seq_stop() if ERR_PTR(-ENODEV) while invoking it directly after * custom seq_file operations fail with ERR_PTR(-ENODEV) - this ensures * that kernfs_seq_stop_active() is skipped only after get_active failure. */ static void kernfs_seq_stop_active(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; const struct kernfs_ops *ops = kernfs_ops(of->kn); if (ops->seq_stop) ops->seq_stop(sf, v); kernfs_put_active(of->kn); } static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) { struct kernfs_open_file *of = sf->private; const struct kernfs_ops *ops; /* * @of->mutex nests outside active ref and is primarily to ensure that * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) return ERR_PTR(-ENODEV); ops = kernfs_ops(of->kn); if (ops->seq_start) { void *next = ops->seq_start(sf, ppos); /* see the comment above kernfs_seq_stop_active() */ if (next == ERR_PTR(-ENODEV)) kernfs_seq_stop_active(sf, next); return next; } return single_start(sf, ppos); } static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos) { struct kernfs_open_file *of = sf->private; const struct kernfs_ops *ops = kernfs_ops(of->kn); if (ops->seq_next) { void *next = ops->seq_next(sf, v, ppos); /* see the comment above kernfs_seq_stop_active() */ if (next == ERR_PTR(-ENODEV)) kernfs_seq_stop_active(sf, next); return next; } else { /* * The same behavior and code as single_open(), always * terminate after the initial read. */ ++*ppos; return NULL; } } static void kernfs_seq_stop(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; if (v != ERR_PTR(-ENODEV)) kernfs_seq_stop_active(sf, v); mutex_unlock(&of->mutex); } static int kernfs_seq_show(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; of->event = atomic_read(&of_on(of)->event); return of->kn->attr.ops->seq_show(sf, v); } static const struct seq_operations kernfs_seq_ops = { .start = kernfs_seq_start, .next = kernfs_seq_next, .stop = kernfs_seq_stop, .show = kernfs_seq_show, }; /* * As reading a bin file can have side-effects, the exact offset and bytes * specified in read(2) call should be passed to the read callback making * it difficult to use seq_file. Implement simplistic custom buffering for * bin files. */ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct kernfs_open_file *of = kernfs_of(iocb->ki_filp); ssize_t len = min_t(size_t, iov_iter_count(iter), PAGE_SIZE); const struct kernfs_ops *ops; char *buf; buf = of->prealloc_buf; if (buf) mutex_lock(&of->prealloc_mutex); else buf = kmalloc(len, GFP_KERNEL); if (!buf) return -ENOMEM; /* * @of->mutex nests outside active ref and is used both to ensure that * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) { len = -ENODEV; mutex_unlock(&of->mutex); goto out_free; } of->event = atomic_read(&of_on(of)->event); ops = kernfs_ops(of->kn); if (ops->read) len = ops->read(of, buf, len, iocb->ki_pos); else len = -EINVAL; kernfs_put_active(of->kn); mutex_unlock(&of->mutex); if (len < 0) goto out_free; if (copy_to_iter(buf, len, iter) != len) { len = -EFAULT; goto out_free; } iocb->ki_pos += len; out_free: if (buf == of->prealloc_buf) mutex_unlock(&of->prealloc_mutex); else kfree(buf); return len; } static ssize_t kernfs_fop_read_iter(struct kiocb *iocb, struct iov_iter *iter) { if (kernfs_of(iocb->ki_filp)->kn->flags & KERNFS_HAS_SEQ_SHOW) return seq_read_iter(iocb, iter); return kernfs_file_read_iter(iocb, iter); } /* * Copy data in from userland and pass it to the matching kernfs write * operation. * * There is no easy way for us to know if userspace is only doing a partial * write, so we don't support them. We expect the entire buffer to come on * the first write. Hint: if you're writing a value, first read the file, * modify only the value you're changing, then write entire buffer * back. */ static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter) { struct kernfs_open_file *of = kernfs_of(iocb->ki_filp); ssize_t len = iov_iter_count(iter); const struct kernfs_ops *ops; char *buf; if (of->atomic_write_len) { if (len > of->atomic_write_len) return -E2BIG; } else { len = min_t(size_t, len, PAGE_SIZE); } buf = of->prealloc_buf; if (buf) mutex_lock(&of->prealloc_mutex); else buf = kmalloc(len + 1, GFP_KERNEL); if (!buf) return -ENOMEM; if (copy_from_iter(buf, len, iter) != len) { len = -EFAULT; goto out_free; } buf[len] = '\0'; /* guarantee string termination */ /* * @of->mutex nests outside active ref and is used both to ensure that * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) { mutex_unlock(&of->mutex); len = -ENODEV; goto out_free; } ops = kernfs_ops(of->kn); if (ops->write) len = ops->write(of, buf, len, iocb->ki_pos); else len = -EINVAL; kernfs_put_active(of->kn); mutex_unlock(&of->mutex); if (len > 0) iocb->ki_pos += len; out_free: if (buf == of->prealloc_buf) mutex_unlock(&of->prealloc_mutex); else kfree(buf); return len; } static void kernfs_vma_open(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); if (!of->vm_ops) return; if (!kernfs_get_active(of->kn)) return; if (of->vm_ops->open) of->vm_ops->open(vma); kernfs_put_active(of->kn); } static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); vm_fault_t ret; if (!of->vm_ops) return VM_FAULT_SIGBUS; if (!kernfs_get_active(of->kn)) return VM_FAULT_SIGBUS; ret = VM_FAULT_SIGBUS; if (of->vm_ops->fault) ret = of->vm_ops->fault(vmf); kernfs_put_active(of->kn); return ret; } static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); vm_fault_t ret; if (!of->vm_ops) return VM_FAULT_SIGBUS; if (!kernfs_get_active(of->kn)) return VM_FAULT_SIGBUS; ret = 0; if (of->vm_ops->page_mkwrite) ret = of->vm_ops->page_mkwrite(vmf); else file_update_time(file); kernfs_put_active(of->kn); return ret; } static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write) { struct file *file = vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); int ret; if (!of->vm_ops) return -EINVAL; if (!kernfs_get_active(of->kn)) return -EINVAL; ret = -EINVAL; if (of->vm_ops->access) ret = of->vm_ops->access(vma, addr, buf, len, write); kernfs_put_active(of->kn); return ret; } #ifdef CONFIG_NUMA static int kernfs_vma_set_policy(struct vm_area_struct *vma, struct mempolicy *new) { struct file *file = vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); int ret; if (!of->vm_ops) return 0; if (!kernfs_get_active(of->kn)) return -EINVAL; ret = 0; if (of->vm_ops->set_policy) ret = of->vm_ops->set_policy(vma, new); kernfs_put_active(of->kn); return ret; } static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma, unsigned long addr) { struct file *file = vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); struct mempolicy *pol; if (!of->vm_ops) return vma->vm_policy; if (!kernfs_get_active(of->kn)) return vma->vm_policy; pol = vma->vm_policy; if (of->vm_ops->get_policy) pol = of->vm_ops->get_policy(vma, addr); kernfs_put_active(of->kn); return pol; } #endif static const struct vm_operations_struct kernfs_vm_ops = { .open = kernfs_vma_open, .fault = kernfs_vma_fault, .page_mkwrite = kernfs_vma_page_mkwrite, .access = kernfs_vma_access, #ifdef CONFIG_NUMA .set_policy = kernfs_vma_set_policy, .get_policy = kernfs_vma_get_policy, #endif }; static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) { struct kernfs_open_file *of = kernfs_of(file); const struct kernfs_ops *ops; int rc; /* * mmap path and of->mutex are prone to triggering spurious lockdep * warnings and we don't want to add spurious locking dependency * between the two. Check whether mmap is actually implemented * without grabbing @of->mutex by testing HAS_MMAP flag. See the * comment in kernfs_file_open() for more details. */ if (!(of->kn->flags & KERNFS_HAS_MMAP)) return -ENODEV; mutex_lock(&of->mutex); rc = -ENODEV; if (!kernfs_get_active(of->kn)) goto out_unlock; ops = kernfs_ops(of->kn); rc = ops->mmap(of, vma); if (rc) goto out_put; /* * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup() * to satisfy versions of X which crash if the mmap fails: that * substitutes a new vm_file, and we don't then want bin_vm_ops. */ if (vma->vm_file != file) goto out_put; rc = -EINVAL; if (of->mmapped && of->vm_ops != vma->vm_ops) goto out_put; /* * It is not possible to successfully wrap close. * So error if someone is trying to use close. */ if (vma->vm_ops && vma->vm_ops->close) goto out_put; rc = 0; of->mmapped = true; of_on(of)->nr_mmapped++; of->vm_ops = vma->vm_ops; vma->vm_ops = &kernfs_vm_ops; out_put: kernfs_put_active(of->kn); out_unlock: mutex_unlock(&of->mutex); return rc; } /** * kernfs_get_open_node - get or create kernfs_open_node * @kn: target kernfs_node * @of: kernfs_open_file for this instance of open * * If @kn->attr.open exists, increment its reference count; otherwise, * create one. @of is chained to the files list. * * Locking: * Kernel thread context (may sleep). * * Return: * %0 on success, -errno on failure. */ static int kernfs_get_open_node(struct kernfs_node *kn, struct kernfs_open_file *of) { struct kernfs_open_node *on; struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); on = kernfs_deref_open_node_locked(kn); if (!on) { /* not there, initialize a new one */ on = kzalloc(sizeof(*on), GFP_KERNEL); if (!on) { mutex_unlock(mutex); return -ENOMEM; } atomic_set(&on->event, 1); init_waitqueue_head(&on->poll); INIT_LIST_HEAD(&on->files); rcu_assign_pointer(kn->attr.open, on); } list_add_tail(&of->list, &on->files); if (kn->flags & KERNFS_HAS_RELEASE) on->nr_to_release++; mutex_unlock(mutex); return 0; } /** * kernfs_unlink_open_file - Unlink @of from @kn. * * @kn: target kernfs_node * @of: associated kernfs_open_file * @open_failed: ->open() failed, cancel ->release() * * Unlink @of from list of @kn's associated open files. If list of * associated open files becomes empty, disassociate and free * kernfs_open_node. * * LOCKING: * None. */ static void kernfs_unlink_open_file(struct kernfs_node *kn, struct kernfs_open_file *of, bool open_failed) { struct kernfs_open_node *on; struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); on = kernfs_deref_open_node_locked(kn); if (!on) { mutex_unlock(mutex); return; } if (of) { if (kn->flags & KERNFS_HAS_RELEASE) { WARN_ON_ONCE(of->released == open_failed); if (open_failed) on->nr_to_release--; } if (of->mmapped) on->nr_mmapped--; list_del(&of->list); } if (list_empty(&on->files)) { rcu_assign_pointer(kn->attr.open, NULL); kfree_rcu(on, rcu_head); } mutex_unlock(mutex); } static int kernfs_fop_open(struct inode *inode, struct file *file) { struct kernfs_node *kn = inode->i_private; struct kernfs_root *root = kernfs_root(kn); const struct kernfs_ops *ops; struct kernfs_open_file *of; bool has_read, has_write, has_mmap; int error = -EACCES; if (!kernfs_get_active(kn)) return -ENODEV; ops = kernfs_ops(kn); has_read = ops->seq_show || ops->read || ops->mmap; has_write = ops->write || ops->mmap; has_mmap = ops->mmap; /* see the flag definition for details */ if (root->flags & KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK) { if ((file->f_mode & FMODE_WRITE) && (!(inode->i_mode & S_IWUGO) || !has_write)) goto err_out; if ((file->f_mode & FMODE_READ) && (!(inode->i_mode & S_IRUGO) || !has_read)) goto err_out; } /* allocate a kernfs_open_file for the file */ error = -ENOMEM; of = kzalloc(sizeof(struct kernfs_open_file), GFP_KERNEL); if (!of) goto err_out; /* * The following is done to give a different lockdep key to * @of->mutex for files which implement mmap. This is a rather * crude way to avoid false positive lockdep warning around * mm->mmap_lock - mmap nests @of->mutex under mm->mmap_lock and * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under * which mm->mmap_lock nests, while holding @of->mutex. As each * open file has a separate mutex, it's okay as long as those don't * happen on the same file. At this point, we can't easily give * each file a separate locking class. Let's differentiate on * whether the file has mmap or not for now. * * Both paths of the branch look the same. They're supposed to * look that way and give @of->mutex different static lockdep keys. */ if (has_mmap) mutex_init(&of->mutex); else mutex_init(&of->mutex); of->kn = kn; of->file = file; /* * Write path needs to atomic_write_len outside active reference. * Cache it in open_file. See kernfs_fop_write_iter() for details. */ of->atomic_write_len = ops->atomic_write_len; error = -EINVAL; /* * ->seq_show is incompatible with ->prealloc, * as seq_read does its own allocation. * ->read must be used instead. */ if (ops->prealloc && ops->seq_show) goto err_free; if (ops->prealloc) { int len = of->atomic_write_len ?: PAGE_SIZE; of->prealloc_buf = kmalloc(len + 1, GFP_KERNEL); error = -ENOMEM; if (!of->prealloc_buf) goto err_free; mutex_init(&of->prealloc_mutex); } /* * Always instantiate seq_file even if read access doesn't use * seq_file or is not requested. This unifies private data access * and readable regular files are the vast majority anyway. */ if (ops->seq_show) error = seq_open(file, &kernfs_seq_ops); else error = seq_open(file, NULL); if (error) goto err_free; of->seq_file = file->private_data; of->seq_file->private = of; /* seq_file clears PWRITE unconditionally, restore it if WRITE */ if (file->f_mode & FMODE_WRITE) file->f_mode |= FMODE_PWRITE; /* make sure we have open node struct */ error = kernfs_get_open_node(kn, of); if (error) goto err_seq_release; if (ops->open) { /* nobody has access to @of yet, skip @of->mutex */ error = ops->open(of); if (error) goto err_put_node; } /* open succeeded, put active references */ kernfs_put_active(kn); return 0; err_put_node: kernfs_unlink_open_file(kn, of, true); err_seq_release: seq_release(inode, file); err_free: kfree(of->prealloc_buf); kfree(of); err_out: kernfs_put_active(kn); return error; } /* used from release/drain to ensure that ->release() is called exactly once */ static void kernfs_release_file(struct kernfs_node *kn, struct kernfs_open_file *of) { /* * @of is guaranteed to have no other file operations in flight and * we just want to synchronize release and drain paths. * @kernfs_open_file_mutex_ptr(kn) is enough. @of->mutex can't be used * here because drain path may be called from places which can * cause circular dependency. */ lockdep_assert_held(kernfs_open_file_mutex_ptr(kn)); if (!of->released) { /* * A file is never detached without being released and we * need to be able to release files which are deactivated * and being drained. Don't use kernfs_ops(). */ kn->attr.ops->release(of); of->released = true; of_on(of)->nr_to_release--; } } static int kernfs_fop_release(struct inode *inode, struct file *filp) { struct kernfs_node *kn = inode->i_private; struct kernfs_open_file *of = kernfs_of(filp); if (kn->flags & KERNFS_HAS_RELEASE) { struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); kernfs_release_file(kn, of); mutex_unlock(mutex); } kernfs_unlink_open_file(kn, of, false); seq_release(inode, filp); kfree(of->prealloc_buf); kfree(of); return 0; } bool kernfs_should_drain_open_files(struct kernfs_node *kn) { struct kernfs_open_node *on; bool ret; /* * @kn being deactivated guarantees that @kn->attr.open can't change * beneath us making the lockless test below safe. */ WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS); rcu_read_lock(); on = rcu_dereference(kn->attr.open); ret = on && (on->nr_mmapped || on->nr_to_release); rcu_read_unlock(); return ret; } void kernfs_drain_open_files(struct kernfs_node *kn) { struct kernfs_open_node *on; struct kernfs_open_file *of; struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); on = kernfs_deref_open_node_locked(kn); if (!on) { mutex_unlock(mutex); return; } list_for_each_entry(of, &on->files, list) { struct inode *inode = file_inode(of->file); if (of->mmapped) { unmap_mapping_range(inode->i_mapping, 0, 0, 1); of->mmapped = false; on->nr_mmapped--; } if (kn->flags & KERNFS_HAS_RELEASE) kernfs_release_file(kn, of); } WARN_ON_ONCE(on->nr_mmapped || on->nr_to_release); mutex_unlock(mutex); } /* * Kernfs attribute files are pollable. The idea is that you read * the content and then you use 'poll' or 'select' to wait for * the content to change. When the content changes (assuming the * manager for the kobject supports notification), poll will * return EPOLLERR|EPOLLPRI, and select will return the fd whether * it is waiting for read, write, or exceptions. * Once poll/select indicates that the value has changed, you * need to close and re-open the file, or seek to 0 and read again. * Reminder: this only works for attributes which actively support * it, and it is not possible to test an attribute from userspace * to see if it supports poll (Neither 'poll' nor 'select' return * an appropriate error code). When in doubt, set a suitable timeout value. */ __poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait) { struct kernfs_open_node *on = of_on(of); poll_wait(of->file, &on->poll, wait); if (of->event != atomic_read(&on->event)) return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; return DEFAULT_POLLMASK; } static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait) { struct kernfs_open_file *of = kernfs_of(filp); struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry); __poll_t ret; if (!kernfs_get_active(kn)) return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; if (kn->attr.ops->poll) ret = kn->attr.ops->poll(of, wait); else ret = kernfs_generic_poll(of, wait); kernfs_put_active(kn); return ret; } static void kernfs_notify_workfn(struct work_struct *work) { struct kernfs_node *kn; struct kernfs_super_info *info; struct kernfs_root *root; repeat: /* pop one off the notify_list */ spin_lock_irq(&kernfs_notify_lock); kn = kernfs_notify_list; if (kn == KERNFS_NOTIFY_EOL) { spin_unlock_irq(&kernfs_notify_lock); return; } kernfs_notify_list = kn->attr.notify_next; kn->attr.notify_next = NULL; spin_unlock_irq(&kernfs_notify_lock); root = kernfs_root(kn); /* kick fsnotify */ down_read(&root->kernfs_supers_rwsem); list_for_each_entry(info, &kernfs_root(kn)->supers, node) { struct kernfs_node *parent; struct inode *p_inode = NULL; struct inode *inode; struct qstr name; /* * We want fsnotify_modify() on @kn but as the * modifications aren't originating from userland don't * have the matching @file available. Look up the inodes * and generate the events manually. */ inode = ilookup(info->sb, kernfs_ino(kn)); if (!inode) continue; name = (struct qstr)QSTR_INIT(kn->name, strlen(kn->name)); parent = kernfs_get_parent(kn); if (parent) { p_inode = ilookup(info->sb, kernfs_ino(parent)); if (p_inode) { fsnotify(FS_MODIFY | FS_EVENT_ON_CHILD, inode, FSNOTIFY_EVENT_INODE, p_inode, &name, inode, 0); iput(p_inode); } kernfs_put(parent); } if (!p_inode) fsnotify_inode(inode, FS_MODIFY); iput(inode); } up_read(&root->kernfs_supers_rwsem); kernfs_put(kn); goto repeat; } /** * kernfs_notify - notify a kernfs file * @kn: file to notify * * Notify @kn such that poll(2) on @kn wakes up. Maybe be called from any * context. */ void kernfs_notify(struct kernfs_node *kn) { static DECLARE_WORK(kernfs_notify_work, kernfs_notify_workfn); unsigned long flags; struct kernfs_open_node *on; if (WARN_ON(kernfs_type(kn) != KERNFS_FILE)) return; /* kick poll immediately */ rcu_read_lock(); on = rcu_dereference(kn->attr.open); if (on) { atomic_inc(&on->event); wake_up_interruptible(&on->poll); } rcu_read_unlock(); /* schedule work to kick fsnotify */ spin_lock_irqsave(&kernfs_notify_lock, flags); if (!kn->attr.notify_next) { kernfs_get(kn); kn->attr.notify_next = kernfs_notify_list; kernfs_notify_list = kn; schedule_work(&kernfs_notify_work); } spin_unlock_irqrestore(&kernfs_notify_lock, flags); } EXPORT_SYMBOL_GPL(kernfs_notify); const struct file_operations kernfs_file_fops = { .read_iter = kernfs_fop_read_iter, .write_iter = kernfs_fop_write_iter, .llseek = generic_file_llseek, .mmap = kernfs_fop_mmap, .open = kernfs_fop_open, .release = kernfs_fop_release, .poll = kernfs_fop_poll, .fsync = noop_fsync, .splice_read = copy_splice_read, .splice_write = iter_file_splice_write, }; /** * __kernfs_create_file - kernfs internal function to create a file * @parent: directory to create the file in * @name: name of the file * @mode: mode of the file * @uid: uid of the file * @gid: gid of the file * @size: size of the file * @ops: kernfs operations for the file * @priv: private data for the file * @ns: optional namespace tag of the file * @key: lockdep key for the file's active_ref, %NULL to disable lockdep * * Return: the created node on success, ERR_PTR() value on error. */ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns, struct lock_class_key *key) { struct kernfs_node *kn; unsigned flags; int rc; flags = KERNFS_FILE; kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, uid, gid, flags); if (!kn) return ERR_PTR(-ENOMEM); kn->attr.ops = ops; kn->attr.size = size; kn->ns = ns; kn->priv = priv; #ifdef CONFIG_DEBUG_LOCK_ALLOC if (key) { lockdep_init_map(&kn->dep_map, "kn->active", key, 0); kn->flags |= KERNFS_LOCKDEP; } #endif /* * kn->attr.ops is accessible only while holding active ref. We * need to know whether some ops are implemented outside active * ref. Cache their existence in flags. */ if (ops->seq_show) kn->flags |= KERNFS_HAS_SEQ_SHOW; if (ops->mmap) kn->flags |= KERNFS_HAS_MMAP; if (ops->release) kn->flags |= KERNFS_HAS_RELEASE; rc = kernfs_add_one(kn); if (rc) { kernfs_put(kn); return ERR_PTR(rc); } return kn; } |
| 235 235 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 | /* Copyright (c) 2018, Mellanox Technologies All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include <crypto/aead.h> #include <linux/highmem.h> #include <linux/module.h> #include <linux/netdevice.h> #include <net/dst.h> #include <net/inet_connection_sock.h> #include <net/tcp.h> #include <net/tls.h> #include "tls.h" #include "trace.h" /* device_offload_lock is used to synchronize tls_dev_add * against NETDEV_DOWN notifications. */ static DECLARE_RWSEM(device_offload_lock); static struct workqueue_struct *destruct_wq __read_mostly; static LIST_HEAD(tls_device_list); static LIST_HEAD(tls_device_down_list); static DEFINE_SPINLOCK(tls_device_lock); static struct page *dummy_page; static void tls_device_free_ctx(struct tls_context *ctx) { if (ctx->tx_conf == TLS_HW) { kfree(tls_offload_ctx_tx(ctx)); kfree(ctx->tx.rec_seq); kfree(ctx->tx.iv); } if (ctx->rx_conf == TLS_HW) kfree(tls_offload_ctx_rx(ctx)); tls_ctx_free(NULL, ctx); } static void tls_device_tx_del_task(struct work_struct *work) { struct tls_offload_context_tx *offload_ctx = container_of(work, struct tls_offload_context_tx, destruct_work); struct tls_context *ctx = offload_ctx->ctx; struct net_device *netdev; /* Safe, because this is the destroy flow, refcount is 0, so * tls_device_down can't store this field in parallel. */ netdev = rcu_dereference_protected(ctx->netdev, !refcount_read(&ctx->refcount)); netdev->tlsdev_ops->tls_dev_del(netdev, ctx, TLS_OFFLOAD_CTX_DIR_TX); dev_put(netdev); ctx->netdev = NULL; tls_device_free_ctx(ctx); } static void tls_device_queue_ctx_destruction(struct tls_context *ctx) { struct net_device *netdev; unsigned long flags; bool async_cleanup; spin_lock_irqsave(&tls_device_lock, flags); if (unlikely(!refcount_dec_and_test(&ctx->refcount))) { spin_unlock_irqrestore(&tls_device_lock, flags); return; } list_del(&ctx->list); /* Remove from tls_device_list / tls_device_down_list */ /* Safe, because this is the destroy flow, refcount is 0, so * tls_device_down can't store this field in parallel. */ netdev = rcu_dereference_protected(ctx->netdev, !refcount_read(&ctx->refcount)); async_cleanup = netdev && ctx->tx_conf == TLS_HW; if (async_cleanup) { struct tls_offload_context_tx *offload_ctx = tls_offload_ctx_tx(ctx); /* queue_work inside the spinlock * to make sure tls_device_down waits for that work. */ queue_work(destruct_wq, &offload_ctx->destruct_work); } spin_unlock_irqrestore(&tls_device_lock, flags); if (!async_cleanup) tls_device_free_ctx(ctx); } /* We assume that the socket is already connected */ static struct net_device *get_netdev_for_sock(struct sock *sk) { struct dst_entry *dst = sk_dst_get(sk); struct net_device *netdev = NULL; if (likely(dst)) { netdev = netdev_sk_get_lowest_dev(dst->dev, sk); dev_hold(netdev); } dst_release(dst); return netdev; } static void destroy_record(struct tls_record_info *record) { int i; for (i = 0; i < record->num_frags; i++) __skb_frag_unref(&record->frags[i], false); kfree(record); } static void delete_all_records(struct tls_offload_context_tx *offload_ctx) { struct tls_record_info *info, *temp; list_for_each_entry_safe(info, temp, &offload_ctx->records_list, list) { list_del(&info->list); destroy_record(info); } offload_ctx->retransmit_hint = NULL; } static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_record_info *info, *temp; struct tls_offload_context_tx *ctx; u64 deleted_records = 0; unsigned long flags; if (!tls_ctx) return; ctx = tls_offload_ctx_tx(tls_ctx); spin_lock_irqsave(&ctx->lock, flags); info = ctx->retransmit_hint; if (info && !before(acked_seq, info->end_seq)) ctx->retransmit_hint = NULL; list_for_each_entry_safe(info, temp, &ctx->records_list, list) { if (before(acked_seq, info->end_seq)) break; list_del(&info->list); destroy_record(info); deleted_records++; } ctx->unacked_record_sn += deleted_records; spin_unlock_irqrestore(&ctx->lock, flags); } /* At this point, there should be no references on this * socket and no in-flight SKBs associated with this * socket, so it is safe to free all the resources. */ void tls_device_sk_destruct(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); tls_ctx->sk_destruct(sk); if (tls_ctx->tx_conf == TLS_HW) { if (ctx->open_record) destroy_record(ctx->open_record); delete_all_records(ctx); crypto_free_aead(ctx->aead_send); clean_acked_data_disable(inet_csk(sk)); } tls_device_queue_ctx_destruction(tls_ctx); } EXPORT_SYMBOL_GPL(tls_device_sk_destruct); void tls_device_free_resources_tx(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); tls_free_partial_record(sk, tls_ctx); } void tls_offload_tx_resync_request(struct sock *sk, u32 got_seq, u32 exp_seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); trace_tls_device_tx_resync_req(sk, got_seq, exp_seq); WARN_ON(test_and_set_bit(TLS_TX_SYNC_SCHED, &tls_ctx->flags)); } EXPORT_SYMBOL_GPL(tls_offload_tx_resync_request); static void tls_device_resync_tx(struct sock *sk, struct tls_context *tls_ctx, u32 seq) { struct net_device *netdev; struct sk_buff *skb; int err = 0; u8 *rcd_sn; skb = tcp_write_queue_tail(sk); if (skb) TCP_SKB_CB(skb)->eor = 1; rcd_sn = tls_ctx->tx.rec_seq; trace_tls_device_tx_resync_send(sk, seq, rcd_sn); down_read(&device_offload_lock); netdev = rcu_dereference_protected(tls_ctx->netdev, lockdep_is_held(&device_offload_lock)); if (netdev) err = netdev->tlsdev_ops->tls_dev_resync(netdev, sk, seq, rcd_sn, TLS_OFFLOAD_CTX_DIR_TX); up_read(&device_offload_lock); if (err) return; clear_bit_unlock(TLS_TX_SYNC_SCHED, &tls_ctx->flags); } static void tls_append_frag(struct tls_record_info *record, struct page_frag *pfrag, int size) { skb_frag_t *frag; frag = &record->frags[record->num_frags - 1]; if (skb_frag_page(frag) == pfrag->page && skb_frag_off(frag) + skb_frag_size(frag) == pfrag->offset) { skb_frag_size_add(frag, size); } else { ++frag; skb_frag_fill_page_desc(frag, pfrag->page, pfrag->offset, size); ++record->num_frags; get_page(pfrag->page); } pfrag->offset += size; record->len += size; } static int tls_push_record(struct sock *sk, struct tls_context *ctx, struct tls_offload_context_tx *offload_ctx, struct tls_record_info *record, int flags) { struct tls_prot_info *prot = &ctx->prot_info; struct tcp_sock *tp = tcp_sk(sk); skb_frag_t *frag; int i; record->end_seq = tp->write_seq + record->len; list_add_tail_rcu(&record->list, &offload_ctx->records_list); offload_ctx->open_record = NULL; if (test_bit(TLS_TX_SYNC_SCHED, &ctx->flags)) tls_device_resync_tx(sk, ctx, tp->write_seq); tls_advance_record_sn(sk, prot, &ctx->tx); for (i = 0; i < record->num_frags; i++) { frag = &record->frags[i]; sg_unmark_end(&offload_ctx->sg_tx_data[i]); sg_set_page(&offload_ctx->sg_tx_data[i], skb_frag_page(frag), skb_frag_size(frag), skb_frag_off(frag)); sk_mem_charge(sk, skb_frag_size(frag)); get_page(skb_frag_page(frag)); } sg_mark_end(&offload_ctx->sg_tx_data[record->num_frags - 1]); /* all ready, send */ return tls_push_sg(sk, ctx, offload_ctx->sg_tx_data, 0, flags); } static void tls_device_record_close(struct sock *sk, struct tls_context *ctx, struct tls_record_info *record, struct page_frag *pfrag, unsigned char record_type) { struct tls_prot_info *prot = &ctx->prot_info; struct page_frag dummy_tag_frag; /* append tag * device will fill in the tag, we just need to append a placeholder * use socket memory to improve coalescing (re-using a single buffer * increases frag count) * if we can't allocate memory now use the dummy page */ if (unlikely(pfrag->size - pfrag->offset < prot->tag_size) && !skb_page_frag_refill(prot->tag_size, pfrag, sk->sk_allocation)) { dummy_tag_frag.page = dummy_page; dummy_tag_frag.offset = 0; pfrag = &dummy_tag_frag; } tls_append_frag(record, pfrag, prot->tag_size); /* fill prepend */ tls_fill_prepend(ctx, skb_frag_address(&record->frags[0]), record->len - prot->overhead_size, record_type); } static int tls_create_new_record(struct tls_offload_context_tx *offload_ctx, struct page_frag *pfrag, size_t prepend_size) { struct tls_record_info *record; skb_frag_t *frag; record = kmalloc(sizeof(*record), GFP_KERNEL); if (!record) return -ENOMEM; frag = &record->frags[0]; skb_frag_fill_page_desc(frag, pfrag->page, pfrag->offset, prepend_size); get_page(pfrag->page); pfrag->offset += prepend_size; record->num_frags = 1; record->len = prepend_size; offload_ctx->open_record = record; return 0; } static int tls_do_allocation(struct sock *sk, struct tls_offload_context_tx *offload_ctx, struct page_frag *pfrag, size_t prepend_size) { int ret; if (!offload_ctx->open_record) { if (unlikely(!skb_page_frag_refill(prepend_size, pfrag, sk->sk_allocation))) { READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); return -ENOMEM; } ret = tls_create_new_record(offload_ctx, pfrag, prepend_size); if (ret) return ret; if (pfrag->size > pfrag->offset) return 0; } if (!sk_page_frag_refill(sk, pfrag)) return -ENOMEM; return 0; } static int tls_device_copy_data(void *addr, size_t bytes, struct iov_iter *i) { size_t pre_copy, nocache; pre_copy = ~((unsigned long)addr - 1) & (SMP_CACHE_BYTES - 1); if (pre_copy) { pre_copy = min(pre_copy, bytes); if (copy_from_iter(addr, pre_copy, i) != pre_copy) return -EFAULT; bytes -= pre_copy; addr += pre_copy; } nocache = round_down(bytes, SMP_CACHE_BYTES); if (copy_from_iter_nocache(addr, nocache, i) != nocache) return -EFAULT; bytes -= nocache; addr += nocache; if (bytes && copy_from_iter(addr, bytes, i) != bytes) return -EFAULT; return 0; } static int tls_push_data(struct sock *sk, struct iov_iter *iter, size_t size, int flags, unsigned char record_type) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_prot_info *prot = &tls_ctx->prot_info; struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); struct tls_record_info *record; int tls_push_record_flags; struct page_frag *pfrag; size_t orig_size = size; u32 max_open_record_len; bool more = false; bool done = false; int copy, rc = 0; long timeo; if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SPLICE_PAGES | MSG_EOR)) return -EOPNOTSUPP; if ((flags & (MSG_MORE | MSG_EOR)) == (MSG_MORE | MSG_EOR)) return -EINVAL; if (unlikely(sk->sk_err)) return -sk->sk_err; flags |= MSG_SENDPAGE_DECRYPTED; tls_push_record_flags = flags | MSG_MORE; timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); if (tls_is_partially_sent_record(tls_ctx)) { rc = tls_push_partial_record(sk, tls_ctx, flags); if (rc < 0) return rc; } pfrag = sk_page_frag(sk); /* TLS_HEADER_SIZE is not counted as part of the TLS record, and * we need to leave room for an authentication tag. */ max_open_record_len = TLS_MAX_PAYLOAD_SIZE + prot->prepend_size; do { rc = tls_do_allocation(sk, ctx, pfrag, prot->prepend_size); if (unlikely(rc)) { rc = sk_stream_wait_memory(sk, &timeo); if (!rc) continue; record = ctx->open_record; if (!record) break; handle_error: if (record_type != TLS_RECORD_TYPE_DATA) { /* avoid sending partial * record with type != * application_data */ size = orig_size; destroy_record(record); ctx->open_record = NULL; } else if (record->len > prot->prepend_size) { goto last_record; } break; } record = ctx->open_record; copy = min_t(size_t, size, max_open_record_len - record->len); if (copy && (flags & MSG_SPLICE_PAGES)) { struct page_frag zc_pfrag; struct page **pages = &zc_pfrag.page; size_t off; rc = iov_iter_extract_pages(iter, &pages, copy, 1, 0, &off); if (rc <= 0) { if (rc == 0) rc = -EIO; goto handle_error; } copy = rc; if (WARN_ON_ONCE(!sendpage_ok(zc_pfrag.page))) { iov_iter_revert(iter, copy); rc = -EIO; goto handle_error; } zc_pfrag.offset = off; zc_pfrag.size = copy; tls_append_frag(record, &zc_pfrag, copy); } else if (copy) { copy = min_t(size_t, copy, pfrag->size - pfrag->offset); rc = tls_device_copy_data(page_address(pfrag->page) + pfrag->offset, copy, iter); if (rc) goto handle_error; tls_append_frag(record, pfrag, copy); } size -= copy; if (!size) { last_record: tls_push_record_flags = flags; if (flags & MSG_MORE) { more = true; break; } done = true; } if (done || record->len >= max_open_record_len || (record->num_frags >= MAX_SKB_FRAGS - 1)) { tls_device_record_close(sk, tls_ctx, record, pfrag, record_type); rc = tls_push_record(sk, tls_ctx, ctx, record, tls_push_record_flags); if (rc < 0) break; } } while (!done); tls_ctx->pending_open_record_frags = more; if (orig_size - size > 0) rc = orig_size - size; return rc; } int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { unsigned char record_type = TLS_RECORD_TYPE_DATA; struct tls_context *tls_ctx = tls_get_ctx(sk); int rc; if (!tls_ctx->zerocopy_sendfile) msg->msg_flags &= ~MSG_SPLICE_PAGES; mutex_lock(&tls_ctx->tx_lock); lock_sock(sk); if (unlikely(msg->msg_controllen)) { rc = tls_process_cmsg(sk, msg, &record_type); if (rc) goto out; } rc = tls_push_data(sk, &msg->msg_iter, size, msg->msg_flags, record_type); out: release_sock(sk); mutex_unlock(&tls_ctx->tx_lock); return rc; } void tls_device_splice_eof(struct socket *sock) { struct sock *sk = sock->sk; struct tls_context *tls_ctx = tls_get_ctx(sk); struct iov_iter iter = {}; if (!tls_is_partially_sent_record(tls_ctx)) return; mutex_lock(&tls_ctx->tx_lock); lock_sock(sk); if (tls_is_partially_sent_record(tls_ctx)) { iov_iter_bvec(&iter, ITER_SOURCE, NULL, 0, 0); tls_push_data(sk, &iter, 0, 0, TLS_RECORD_TYPE_DATA); } release_sock(sk); mutex_unlock(&tls_ctx->tx_lock); } struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context, u32 seq, u64 *p_record_sn) { u64 record_sn = context->hint_record_sn; struct tls_record_info *info, *last; info = context->retransmit_hint; if (!info || before(seq, info->end_seq - info->len)) { /* if retransmit_hint is irrelevant start * from the beginning of the list */ info = list_first_entry_or_null(&context->records_list, struct tls_record_info, list); if (!info) return NULL; /* send the start_marker record if seq number is before the * tls offload start marker sequence number. This record is * required to handle TCP packets which are before TLS offload * started. * And if it's not start marker, look if this seq number * belongs to the list. */ if (likely(!tls_record_is_start_marker(info))) { /* we have the first record, get the last record to see * if this seq number belongs to the list. */ last = list_last_entry(&context->records_list, struct tls_record_info, list); if (!between(seq, tls_record_start_seq(info), last->end_seq)) return NULL; } record_sn = context->unacked_record_sn; } /* We just need the _rcu for the READ_ONCE() */ rcu_read_lock(); list_for_each_entry_from_rcu(info, &context->records_list, list) { if (before(seq, info->end_seq)) { if (!context->retransmit_hint || after(info->end_seq, context->retransmit_hint->end_seq)) { context->hint_record_sn = record_sn; context->retransmit_hint = info; } *p_record_sn = record_sn; goto exit_rcu_unlock; } record_sn++; } info = NULL; exit_rcu_unlock: rcu_read_unlock(); return info; } EXPORT_SYMBOL(tls_get_record); static int tls_device_push_pending_record(struct sock *sk, int flags) { struct iov_iter iter; iov_iter_kvec(&iter, ITER_SOURCE, NULL, 0, 0); return tls_push_data(sk, &iter, 0, flags, TLS_RECORD_TYPE_DATA); } void tls_device_write_space(struct sock *sk, struct tls_context *ctx) { if (tls_is_partially_sent_record(ctx)) { gfp_t sk_allocation = sk->sk_allocation; WARN_ON_ONCE(sk->sk_write_pending); sk->sk_allocation = GFP_ATOMIC; tls_push_partial_record(sk, ctx, MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SENDPAGE_DECRYPTED); sk->sk_allocation = sk_allocation; } } static void tls_device_resync_rx(struct tls_context *tls_ctx, struct sock *sk, u32 seq, u8 *rcd_sn) { struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx); struct net_device *netdev; trace_tls_device_rx_resync_send(sk, seq, rcd_sn, rx_ctx->resync_type); rcu_read_lock(); netdev = rcu_dereference(tls_ctx->netdev); if (netdev) netdev->tlsdev_ops->tls_dev_resync(netdev, sk, seq, rcd_sn, TLS_OFFLOAD_CTX_DIR_RX); rcu_read_unlock(); TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXDEVICERESYNC); } static bool tls_device_rx_resync_async(struct tls_offload_resync_async *resync_async, s64 resync_req, u32 *seq, u16 *rcd_delta) { u32 is_async = resync_req & RESYNC_REQ_ASYNC; u32 req_seq = resync_req >> 32; u32 req_end = req_seq + ((resync_req >> 16) & 0xffff); u16 i; *rcd_delta = 0; if (is_async) { /* shouldn't get to wraparound: * too long in async stage, something bad happened */ if (WARN_ON_ONCE(resync_async->rcd_delta == USHRT_MAX)) return false; /* asynchronous stage: log all headers seq such that * req_seq <= seq <= end_seq, and wait for real resync request */ if (before(*seq, req_seq)) return false; if (!after(*seq, req_end) && resync_async->loglen < TLS_DEVICE_RESYNC_ASYNC_LOGMAX) resync_async->log[resync_async->loglen++] = *seq; resync_async->rcd_delta++; return false; } /* synchronous stage: check against the logged entries and * proceed to check the next entries if no match was found */ for (i = 0; i < resync_async->loglen; i++) if (req_seq == resync_async->log[i] && atomic64_try_cmpxchg(&resync_async->req, &resync_req, 0)) { *rcd_delta = resync_async->rcd_delta - i; *seq = req_seq; resync_async->loglen = 0; resync_async->rcd_delta = 0; return true; } resync_async->loglen = 0; resync_async->rcd_delta = 0; if (req_seq == *seq && atomic64_try_cmpxchg(&resync_async->req, &resync_req, 0)) return true; return false; } void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_rx *rx_ctx; u8 rcd_sn[TLS_MAX_REC_SEQ_SIZE]; u32 sock_data, is_req_pending; struct tls_prot_info *prot; s64 resync_req; u16 rcd_delta; u32 req_seq; if (tls_ctx->rx_conf != TLS_HW) return; if (unlikely(test_bit(TLS_RX_DEV_DEGRADED, &tls_ctx->flags))) return; prot = &tls_ctx->prot_info; rx_ctx = tls_offload_ctx_rx(tls_ctx); memcpy(rcd_sn, tls_ctx->rx.rec_seq, prot->rec_seq_size); switch (rx_ctx->resync_type) { case TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ: resync_req = atomic64_read(&rx_ctx->resync_req); req_seq = resync_req >> 32; seq += TLS_HEADER_SIZE - 1; is_req_pending = resync_req; if (likely(!is_req_pending) || req_seq != seq || !atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) return; break; case TLS_OFFLOAD_SYNC_TYPE_CORE_NEXT_HINT: if (likely(!rx_ctx->resync_nh_do_now)) return; /* head of next rec is already in, note that the sock_inq will * include the currently parsed message when called from parser */ sock_data = tcp_inq(sk); if (sock_data > rcd_len) { trace_tls_device_rx_resync_nh_delay(sk, sock_data, rcd_len); return; } rx_ctx->resync_nh_do_now = 0; seq += rcd_len; tls_bigint_increment(rcd_sn, prot->rec_seq_size); break; case TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ_ASYNC: resync_req = atomic64_read(&rx_ctx->resync_async->req); is_req_pending = resync_req; if (likely(!is_req_pending)) return; if (!tls_device_rx_resync_async(rx_ctx->resync_async, resync_req, &seq, &rcd_delta)) return; tls_bigint_subtract(rcd_sn, rcd_delta); break; } tls_device_resync_rx(tls_ctx, sk, seq, rcd_sn); } static void tls_device_core_ctrl_rx_resync(struct tls_context *tls_ctx, struct tls_offload_context_rx *ctx, struct sock *sk, struct sk_buff *skb) { struct strp_msg *rxm; /* device will request resyncs by itself based on stream scan */ if (ctx->resync_type != TLS_OFFLOAD_SYNC_TYPE_CORE_NEXT_HINT) return; /* already scheduled */ if (ctx->resync_nh_do_now) return; /* seen decrypted fragments since last fully-failed record */ if (ctx->resync_nh_reset) { ctx->resync_nh_reset = 0; ctx->resync_nh.decrypted_failed = 1; ctx->resync_nh.decrypted_tgt = TLS_DEVICE_RESYNC_NH_START_IVAL; return; } if (++ctx->resync_nh.decrypted_failed <= ctx->resync_nh.decrypted_tgt) return; /* doing resync, bump the next target in case it fails */ if (ctx->resync_nh.decrypted_tgt < TLS_DEVICE_RESYNC_NH_MAX_IVAL) ctx->resync_nh.decrypted_tgt *= 2; else ctx->resync_nh.decrypted_tgt += TLS_DEVICE_RESYNC_NH_MAX_IVAL; rxm = strp_msg(skb); /* head of next rec is already in, parser will sync for us */ if (tcp_inq(sk) > rxm->full_len) { trace_tls_device_rx_resync_nh_schedule(sk); ctx->resync_nh_do_now = 1; } else { struct tls_prot_info *prot = &tls_ctx->prot_info; u8 rcd_sn[TLS_MAX_REC_SEQ_SIZE]; memcpy(rcd_sn, tls_ctx->rx.rec_seq, prot->rec_seq_size); tls_bigint_increment(rcd_sn, prot->rec_seq_size); tls_device_resync_rx(tls_ctx, sk, tcp_sk(sk)->copied_seq, rcd_sn); } } static int tls_device_reencrypt(struct sock *sk, struct tls_context *tls_ctx) { struct tls_sw_context_rx *sw_ctx = tls_sw_ctx_rx(tls_ctx); const struct tls_cipher_desc *cipher_desc; int err, offset, copy, data_len, pos; struct sk_buff *skb, *skb_iter; struct scatterlist sg[1]; struct strp_msg *rxm; char *orig_buf, *buf; switch (tls_ctx->crypto_recv.info.cipher_type) { case TLS_CIPHER_AES_GCM_128: case TLS_CIPHER_AES_GCM_256: break; default: return -EINVAL; } cipher_desc = get_cipher_desc(tls_ctx->crypto_recv.info.cipher_type); rxm = strp_msg(tls_strp_msg(sw_ctx)); orig_buf = kmalloc(rxm->full_len + TLS_HEADER_SIZE + cipher_desc->iv, sk->sk_allocation); if (!orig_buf) return -ENOMEM; buf = orig_buf; err = tls_strp_msg_cow(sw_ctx); if (unlikely(err)) goto free_buf; skb = tls_strp_msg(sw_ctx); rxm = strp_msg(skb); offset = rxm->offset; sg_init_table(sg, 1); sg_set_buf(&sg[0], buf, rxm->full_len + TLS_HEADER_SIZE + cipher_desc->iv); err = skb_copy_bits(skb, offset, buf, TLS_HEADER_SIZE + cipher_desc->iv); if (err) goto free_buf; /* We are interested only in the decrypted data not the auth */ err = decrypt_skb(sk, sg); if (err != -EBADMSG) goto free_buf; else err = 0; data_len = rxm->full_len - cipher_desc->tag; if (skb_pagelen(skb) > offset) { copy = min_t(int, skb_pagelen(skb) - offset, data_len); if (skb->decrypted) { err = skb_store_bits(skb, offset, buf, copy); if (err) goto free_buf; } offset += copy; buf += copy; } pos = skb_pagelen(skb); skb_walk_frags(skb, skb_iter) { int frag_pos; /* Practically all frags must belong to msg if reencrypt * is needed with current strparser and coalescing logic, * but strparser may "get optimized", so let's be safe. */ if (pos + skb_iter->len <= offset) goto done_with_frag; if (pos >= data_len + rxm->offset) break; frag_pos = offset - pos; copy = min_t(int, skb_iter->len - frag_pos, data_len + rxm->offset - offset); if (skb_iter->decrypted) { err = skb_store_bits(skb_iter, frag_pos, buf, copy); if (err) goto free_buf; } offset += copy; buf += copy; done_with_frag: pos += skb_iter->len; } free_buf: kfree(orig_buf); return err; } int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx) { struct tls_offload_context_rx *ctx = tls_offload_ctx_rx(tls_ctx); struct tls_sw_context_rx *sw_ctx = tls_sw_ctx_rx(tls_ctx); struct sk_buff *skb = tls_strp_msg(sw_ctx); struct strp_msg *rxm = strp_msg(skb); int is_decrypted, is_encrypted; if (!tls_strp_msg_mixed_decrypted(sw_ctx)) { is_decrypted = skb->decrypted; is_encrypted = !is_decrypted; } else { is_decrypted = 0; is_encrypted = 0; } trace_tls_device_decrypted(sk, tcp_sk(sk)->copied_seq - rxm->full_len, tls_ctx->rx.rec_seq, rxm->full_len, is_encrypted, is_decrypted); if (unlikely(test_bit(TLS_RX_DEV_DEGRADED, &tls_ctx->flags))) { if (likely(is_encrypted || is_decrypted)) return is_decrypted; /* After tls_device_down disables the offload, the next SKB will * likely have initial fragments decrypted, and final ones not * decrypted. We need to reencrypt that single SKB. */ return tls_device_reencrypt(sk, tls_ctx); } /* Return immediately if the record is either entirely plaintext or * entirely ciphertext. Otherwise handle reencrypt partially decrypted * record. */ if (is_decrypted) { ctx->resync_nh_reset = 1; return is_decrypted; } if (is_encrypted) { tls_device_core_ctrl_rx_resync(tls_ctx, ctx, sk, skb); return 0; } ctx->resync_nh_reset = 1; return tls_device_reencrypt(sk, tls_ctx); } static void tls_device_attach(struct tls_context *ctx, struct sock *sk, struct net_device *netdev) { if (sk->sk_destruct != tls_device_sk_destruct) { refcount_set(&ctx->refcount, 1); dev_hold(netdev); RCU_INIT_POINTER(ctx->netdev, netdev); spin_lock_irq(&tls_device_lock); list_add_tail(&ctx->list, &tls_device_list); spin_unlock_irq(&tls_device_lock); ctx->sk_destruct = sk->sk_destruct; smp_store_release(&sk->sk_destruct, tls_device_sk_destruct); } } int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_prot_info *prot = &tls_ctx->prot_info; const struct tls_cipher_desc *cipher_desc; struct tls_record_info *start_marker_record; struct tls_offload_context_tx *offload_ctx; struct tls_crypto_info *crypto_info; struct net_device *netdev; char *iv, *rec_seq; struct sk_buff *skb; __be64 rcd_sn; int rc; if (!ctx) return -EINVAL; if (ctx->priv_ctx_tx) return -EEXIST; netdev = get_netdev_for_sock(sk); if (!netdev) { pr_err_ratelimited("%s: netdev not found\n", __func__); return -EINVAL; } if (!(netdev->features & NETIF_F_HW_TLS_TX)) { rc = -EOPNOTSUPP; goto release_netdev; } crypto_info = &ctx->crypto_send.info; if (crypto_info->version != TLS_1_2_VERSION) { rc = -EOPNOTSUPP; goto release_netdev; } cipher_desc = get_cipher_desc(crypto_info->cipher_type); if (!cipher_desc || !cipher_desc->offloadable) { rc = -EINVAL; goto release_netdev; } iv = crypto_info_iv(crypto_info, cipher_desc); rec_seq = crypto_info_rec_seq(crypto_info, cipher_desc); prot->version = crypto_info->version; prot->cipher_type = crypto_info->cipher_type; prot->prepend_size = TLS_HEADER_SIZE + cipher_desc->iv; prot->tag_size = cipher_desc->tag; prot->overhead_size = prot->prepend_size + prot->tag_size; prot->iv_size = cipher_desc->iv; prot->salt_size = cipher_desc->salt; ctx->tx.iv = kmalloc(cipher_desc->iv + cipher_desc->salt, GFP_KERNEL); if (!ctx->tx.iv) { rc = -ENOMEM; goto release_netdev; } memcpy(ctx->tx.iv + cipher_desc->salt, iv, cipher_desc->iv); prot->rec_seq_size = cipher_desc->rec_seq; ctx->tx.rec_seq = kmemdup(rec_seq, cipher_desc->rec_seq, GFP_KERNEL); if (!ctx->tx.rec_seq) { rc = -ENOMEM; goto free_iv; } start_marker_record = kmalloc(sizeof(*start_marker_record), GFP_KERNEL); if (!start_marker_record) { rc = -ENOMEM; goto free_rec_seq; } offload_ctx = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE_TX, GFP_KERNEL); if (!offload_ctx) { rc = -ENOMEM; goto free_marker_record; } rc = tls_sw_fallback_init(sk, offload_ctx, crypto_info); if (rc) goto free_offload_ctx; /* start at rec_seq - 1 to account for the start marker record */ memcpy(&rcd_sn, ctx->tx.rec_seq, sizeof(rcd_sn)); offload_ctx->unacked_record_sn = be64_to_cpu(rcd_sn) - 1; start_marker_record->end_seq = tcp_sk(sk)->write_seq; start_marker_record->len = 0; start_marker_record->num_frags = 0; INIT_WORK(&offload_ctx->destruct_work, tls_device_tx_del_task); offload_ctx->ctx = ctx; INIT_LIST_HEAD(&offload_ctx->records_list); list_add_tail(&start_marker_record->list, &offload_ctx->records_list); spin_lock_init(&offload_ctx->lock); sg_init_table(offload_ctx->sg_tx_data, ARRAY_SIZE(offload_ctx->sg_tx_data)); clean_acked_data_enable(inet_csk(sk), &tls_icsk_clean_acked); ctx->push_pending_record = tls_device_push_pending_record; /* TLS offload is greatly simplified if we don't send * SKBs where only part of the payload needs to be encrypted. * So mark the last skb in the write queue as end of record. */ skb = tcp_write_queue_tail(sk); if (skb) TCP_SKB_CB(skb)->eor = 1; /* Avoid offloading if the device is down * We don't want to offload new flows after * the NETDEV_DOWN event * * device_offload_lock is taken in tls_devices's NETDEV_DOWN * handler thus protecting from the device going down before * ctx was added to tls_device_list. */ down_read(&device_offload_lock); if (!(netdev->flags & IFF_UP)) { rc = -EINVAL; goto release_lock; } ctx->priv_ctx_tx = offload_ctx; rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_TX, &ctx->crypto_send.info, tcp_sk(sk)->write_seq); trace_tls_device_offload_set(sk, TLS_OFFLOAD_CTX_DIR_TX, tcp_sk(sk)->write_seq, rec_seq, rc); if (rc) goto release_lock; tls_device_attach(ctx, sk, netdev); up_read(&device_offload_lock); /* following this assignment tls_is_skb_tx_device_offloaded * will return true and the context might be accessed * by the netdev's xmit function. */ smp_store_release(&sk->sk_validate_xmit_skb, tls_validate_xmit_skb); dev_put(netdev); return 0; release_lock: up_read(&device_offload_lock); clean_acked_data_disable(inet_csk(sk)); crypto_free_aead(offload_ctx->aead_send); free_offload_ctx: kfree(offload_ctx); ctx->priv_ctx_tx = NULL; free_marker_record: kfree(start_marker_record); free_rec_seq: kfree(ctx->tx.rec_seq); free_iv: kfree(ctx->tx.iv); release_netdev: dev_put(netdev); return rc; } int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) { struct tls12_crypto_info_aes_gcm_128 *info; struct tls_offload_context_rx *context; struct net_device *netdev; int rc = 0; if (ctx->crypto_recv.info.version != TLS_1_2_VERSION) return -EOPNOTSUPP; netdev = get_netdev_for_sock(sk); if (!netdev) { pr_err_ratelimited("%s: netdev not found\n", __func__); return -EINVAL; } if (!(netdev->features & NETIF_F_HW_TLS_RX)) { rc = -EOPNOTSUPP; goto release_netdev; } /* Avoid offloading if the device is down * We don't want to offload new flows after * the NETDEV_DOWN event * * device_offload_lock is taken in tls_devices's NETDEV_DOWN * handler thus protecting from the device going down before * ctx was added to tls_device_list. */ down_read(&device_offload_lock); if (!(netdev->flags & IFF_UP)) { rc = -EINVAL; goto release_lock; } context = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE_RX, GFP_KERNEL); if (!context) { rc = -ENOMEM; goto release_lock; } context->resync_nh_reset = 1; ctx->priv_ctx_rx = context; rc = tls_set_sw_offload(sk, ctx, 0); if (rc) goto release_ctx; rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_RX, &ctx->crypto_recv.info, tcp_sk(sk)->copied_seq); info = (void *)&ctx->crypto_recv.info; trace_tls_device_offload_set(sk, TLS_OFFLOAD_CTX_DIR_RX, tcp_sk(sk)->copied_seq, info->rec_seq, rc); if (rc) goto free_sw_resources; tls_device_attach(ctx, sk, netdev); up_read(&device_offload_lock); dev_put(netdev); return 0; free_sw_resources: up_read(&device_offload_lock); tls_sw_free_resources_rx(sk); down_read(&device_offload_lock); release_ctx: ctx->priv_ctx_rx = NULL; release_lock: up_read(&device_offload_lock); release_netdev: dev_put(netdev); return rc; } void tls_device_offload_cleanup_rx(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct net_device *netdev; down_read(&device_offload_lock); netdev = rcu_dereference_protected(tls_ctx->netdev, lockdep_is_held(&device_offload_lock)); if (!netdev) goto out; netdev->tlsdev_ops->tls_dev_del(netdev, tls_ctx, TLS_OFFLOAD_CTX_DIR_RX); if (tls_ctx->tx_conf != TLS_HW) { dev_put(netdev); rcu_assign_pointer(tls_ctx->netdev, NULL); } else { set_bit(TLS_RX_DEV_CLOSED, &tls_ctx->flags); } out: up_read(&device_offload_lock); tls_sw_release_resources_rx(sk); } static int tls_device_down(struct net_device *netdev) { struct tls_context *ctx, *tmp; unsigned long flags; LIST_HEAD(list); /* Request a write lock to block new offload attempts */ down_write(&device_offload_lock); spin_lock_irqsave(&tls_device_lock, flags); list_for_each_entry_safe(ctx, tmp, &tls_device_list, list) { struct net_device *ctx_netdev = rcu_dereference_protected(ctx->netdev, lockdep_is_held(&device_offload_lock)); if (ctx_netdev != netdev || !refcount_inc_not_zero(&ctx->refcount)) continue; list_move(&ctx->list, &list); } spin_unlock_irqrestore(&tls_device_lock, flags); list_for_each_entry_safe(ctx, tmp, &list, list) { /* Stop offloaded TX and switch to the fallback. * tls_is_skb_tx_device_offloaded will return false. */ WRITE_ONCE(ctx->sk->sk_validate_xmit_skb, tls_validate_xmit_skb_sw); /* Stop the RX and TX resync. * tls_dev_resync must not be called after tls_dev_del. */ rcu_assign_pointer(ctx->netdev, NULL); /* Start skipping the RX resync logic completely. */ set_bit(TLS_RX_DEV_DEGRADED, &ctx->flags); /* Sync with inflight packets. After this point: * TX: no non-encrypted packets will be passed to the driver. * RX: resync requests from the driver will be ignored. */ synchronize_net(); /* Release the offload context on the driver side. */ if (ctx->tx_conf == TLS_HW) netdev->tlsdev_ops->tls_dev_del(netdev, ctx, TLS_OFFLOAD_CTX_DIR_TX); if (ctx->rx_conf == TLS_HW && !test_bit(TLS_RX_DEV_CLOSED, &ctx->flags)) netdev->tlsdev_ops->tls_dev_del(netdev, ctx, TLS_OFFLOAD_CTX_DIR_RX); dev_put(netdev); /* Move the context to a separate list for two reasons: * 1. When the context is deallocated, list_del is called. * 2. It's no longer an offloaded context, so we don't want to * run offload-specific code on this context. */ spin_lock_irqsave(&tls_device_lock, flags); list_move_tail(&ctx->list, &tls_device_down_list); spin_unlock_irqrestore(&tls_device_lock, flags); /* Device contexts for RX and TX will be freed in on sk_destruct * by tls_device_free_ctx. rx_conf and tx_conf stay in TLS_HW. * Now release the ref taken above. */ if (refcount_dec_and_test(&ctx->refcount)) { /* sk_destruct ran after tls_device_down took a ref, and * it returned early. Complete the destruction here. */ list_del(&ctx->list); tls_device_free_ctx(ctx); } } up_write(&device_offload_lock); flush_workqueue(destruct_wq); return NOTIFY_DONE; } static int tls_dev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (!dev->tlsdev_ops && !(dev->features & (NETIF_F_HW_TLS_RX | NETIF_F_HW_TLS_TX))) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: case NETDEV_FEAT_CHANGE: if (netif_is_bond_master(dev)) return NOTIFY_DONE; if ((dev->features & NETIF_F_HW_TLS_RX) && !dev->tlsdev_ops->tls_dev_resync) return NOTIFY_BAD; if (dev->tlsdev_ops && dev->tlsdev_ops->tls_dev_add && dev->tlsdev_ops->tls_dev_del) return NOTIFY_DONE; else return NOTIFY_BAD; case NETDEV_DOWN: return tls_device_down(dev); } return NOTIFY_DONE; } static struct notifier_block tls_dev_notifier = { .notifier_call = tls_dev_event, }; int __init tls_device_init(void) { int err; dummy_page = alloc_page(GFP_KERNEL); if (!dummy_page) return -ENOMEM; destruct_wq = alloc_workqueue("ktls_device_destruct", 0, 0); if (!destruct_wq) { err = -ENOMEM; goto err_free_dummy; } err = register_netdevice_notifier(&tls_dev_notifier); if (err) goto err_destroy_wq; return 0; err_destroy_wq: destroy_workqueue(destruct_wq); err_free_dummy: put_page(dummy_page); return err; } void __exit tls_device_cleanup(void) { unregister_netdevice_notifier(&tls_dev_notifier); destroy_workqueue(destruct_wq); clean_acked_data_flush(); put_page(dummy_page); } |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 | /* SPDX-License-Identifier: GPL-2.0 */ /* File: linux/posix_acl.h (C) 2002 Andreas Gruenbacher, <a.gruenbacher@computer.org> */ #ifndef __LINUX_POSIX_ACL_H #define __LINUX_POSIX_ACL_H #include <linux/bug.h> #include <linux/slab.h> #include <linux/rcupdate.h> #include <linux/refcount.h> #include <uapi/linux/posix_acl.h> struct user_namespace; struct posix_acl_entry { short e_tag; unsigned short e_perm; union { kuid_t e_uid; kgid_t e_gid; }; }; struct posix_acl { refcount_t a_refcount; struct rcu_head a_rcu; unsigned int a_count; struct posix_acl_entry a_entries[]; }; #define FOREACH_ACL_ENTRY(pa, acl, pe) \ for(pa=(acl)->a_entries, pe=pa+(acl)->a_count; pa<pe; pa++) /* * Duplicate an ACL handle. */ static inline struct posix_acl * posix_acl_dup(struct posix_acl *acl) { if (acl) refcount_inc(&acl->a_refcount); return acl; } /* * Free an ACL handle. */ static inline void posix_acl_release(struct posix_acl *acl) { if (acl && refcount_dec_and_test(&acl->a_refcount)) kfree_rcu(acl, a_rcu); } /* posix_acl.c */ extern void posix_acl_init(struct posix_acl *, int); extern struct posix_acl *posix_acl_alloc(int, gfp_t); extern struct posix_acl *posix_acl_from_mode(umode_t, gfp_t); extern int posix_acl_equiv_mode(const struct posix_acl *, umode_t *); extern int __posix_acl_create(struct posix_acl **, gfp_t, umode_t *); extern int __posix_acl_chmod(struct posix_acl **, gfp_t, umode_t); extern struct posix_acl *get_posix_acl(struct inode *, int); int set_posix_acl(struct mnt_idmap *, struct dentry *, int, struct posix_acl *); struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type); struct posix_acl *posix_acl_clone(const struct posix_acl *acl, gfp_t flags); #ifdef CONFIG_FS_POSIX_ACL int posix_acl_chmod(struct mnt_idmap *, struct dentry *, umode_t); extern int posix_acl_create(struct inode *, umode_t *, struct posix_acl **, struct posix_acl **); int posix_acl_update_mode(struct mnt_idmap *, struct inode *, umode_t *, struct posix_acl **); int simple_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); extern int simple_acl_create(struct inode *, struct inode *); struct posix_acl *get_cached_acl(struct inode *inode, int type); void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl); void forget_cached_acl(struct inode *inode, int type); void forget_all_cached_acls(struct inode *inode); int posix_acl_valid(struct user_namespace *, const struct posix_acl *); int posix_acl_permission(struct mnt_idmap *, struct inode *, const struct posix_acl *, int); static inline void cache_no_acl(struct inode *inode) { inode->i_acl = NULL; inode->i_default_acl = NULL; } int vfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl); struct posix_acl *vfs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name); int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name); int posix_acl_listxattr(struct inode *inode, char **buffer, ssize_t *remaining_size); #else static inline int posix_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode) { return 0; } #define simple_set_acl NULL static inline int simple_acl_create(struct inode *dir, struct inode *inode) { return 0; } static inline void cache_no_acl(struct inode *inode) { } static inline int posix_acl_create(struct inode *inode, umode_t *mode, struct posix_acl **default_acl, struct posix_acl **acl) { *default_acl = *acl = NULL; return 0; } static inline void forget_all_cached_acls(struct inode *inode) { } static inline int vfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, struct posix_acl *acl) { return -EOPNOTSUPP; } static inline struct posix_acl *vfs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return ERR_PTR(-EOPNOTSUPP); } static inline int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return -EOPNOTSUPP; } static inline int posix_acl_listxattr(struct inode *inode, char **buffer, ssize_t *remaining_size) { return 0; } #endif /* CONFIG_FS_POSIX_ACL */ struct posix_acl *get_inode_acl(struct inode *inode, int type); #endif /* __LINUX_POSIX_ACL_H */ |
| 6 6 6 6 5 5 5 5 5 2 2 2 2 2 2 2 1 6 6 5 5 6 5 3 3 1 6 6 6 6 6 6 6 6 9 9 8 6 6 6 6 6 4 4 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 | // SPDX-License-Identifier: GPL-2.0-only /* * Driver for the VoIP USB phones with CM109 chipsets. * * Copyright (C) 2007 - 2008 Alfred E. Heggestad <aeh@db.org> */ /* * Tested devices: * - Komunikate KIP1000 * - Genius G-talk * - Allied-Telesis Corega USBPH01 * - ... * * This driver is based on the yealink.c driver * * Thanks to: * - Authors of yealink.c * - Thomas Reitmayr * - Oliver Neukum for good review comments and code * - Shaun Jackman <sjackman@gmail.com> for Genius G-talk keymap * - Dmitry Torokhov for valuable input and review * * Todo: * - Read/write EEPROM */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/rwsem.h> #include <linux/usb/input.h> #define DRIVER_VERSION "20080805" #define DRIVER_AUTHOR "Alfred E. Heggestad" #define DRIVER_DESC "CM109 phone driver" static char *phone = "kip1000"; module_param(phone, charp, S_IRUSR); MODULE_PARM_DESC(phone, "Phone name {kip1000, gtalk, usbph01, atcom}"); enum { /* HID Registers */ HID_IR0 = 0x00, /* Record/Playback-mute button, Volume up/down */ HID_IR1 = 0x01, /* GPI, generic registers or EEPROM_DATA0 */ HID_IR2 = 0x02, /* Generic registers or EEPROM_DATA1 */ HID_IR3 = 0x03, /* Generic registers or EEPROM_CTRL */ HID_OR0 = 0x00, /* Mapping control, buzzer, SPDIF (offset 0x04) */ HID_OR1 = 0x01, /* GPO - General Purpose Output */ HID_OR2 = 0x02, /* Set GPIO to input/output mode */ HID_OR3 = 0x03, /* SPDIF status channel or EEPROM_CTRL */ /* HID_IR0 */ RECORD_MUTE = 1 << 3, PLAYBACK_MUTE = 1 << 2, VOLUME_DOWN = 1 << 1, VOLUME_UP = 1 << 0, /* HID_OR0 */ /* bits 7-6 0: HID_OR1-2 are used for GPO; HID_OR0, 3 are used for buzzer and SPDIF 1: HID_OR0-3 are used as generic HID registers 2: Values written to HID_OR0-3 are also mapped to MCU_CTRL, EEPROM_DATA0-1, EEPROM_CTRL (see Note) 3: Reserved */ HID_OR_GPO_BUZ_SPDIF = 0 << 6, HID_OR_GENERIC_HID_REG = 1 << 6, HID_OR_MAP_MCU_EEPROM = 2 << 6, BUZZER_ON = 1 << 5, /* up to 256 normal keys, up to 15 special key combinations */ KEYMAP_SIZE = 256 + 15, }; /* CM109 protocol packet */ struct cm109_ctl_packet { u8 byte[4]; } __attribute__ ((packed)); enum { USB_PKT_LEN = sizeof(struct cm109_ctl_packet) }; /* CM109 device structure */ struct cm109_dev { struct input_dev *idev; /* input device */ struct usb_device *udev; /* usb device */ struct usb_interface *intf; /* irq input channel */ struct cm109_ctl_packet *irq_data; dma_addr_t irq_dma; struct urb *urb_irq; /* control output channel */ struct cm109_ctl_packet *ctl_data; dma_addr_t ctl_dma; struct usb_ctrlrequest *ctl_req; struct urb *urb_ctl; /* * The 3 bitfields below are protected by ctl_submit_lock. * They have to be separate since they are accessed from IRQ * context. */ unsigned irq_urb_pending:1; /* irq_urb is in flight */ unsigned ctl_urb_pending:1; /* ctl_urb is in flight */ unsigned buzzer_pending:1; /* need to issue buzz command */ spinlock_t ctl_submit_lock; unsigned char buzzer_state; /* on/off */ /* flags */ unsigned open:1; unsigned resetting:1; unsigned shutdown:1; /* This mutex protects writes to the above flags */ struct mutex pm_mutex; unsigned short keymap[KEYMAP_SIZE]; char phys[64]; /* physical device path */ int key_code; /* last reported key */ int keybit; /* 0=new scan 1,2,4,8=scan columns */ u8 gpi; /* Cached value of GPI (high nibble) */ }; /****************************************************************************** * CM109 key interface *****************************************************************************/ static unsigned short special_keymap(int code) { if (code > 0xff) { switch (code - 0xff) { case RECORD_MUTE: return KEY_MICMUTE; case PLAYBACK_MUTE: return KEY_MUTE; case VOLUME_DOWN: return KEY_VOLUMEDOWN; case VOLUME_UP: return KEY_VOLUMEUP; } } return KEY_RESERVED; } /* Map device buttons to internal key events. * * The "up" and "down" keys, are symbolised by arrows on the button. * The "pickup" and "hangup" keys are symbolised by a green and red phone * on the button. Komunikate KIP1000 Keyboard Matrix -> -- 1 -- 2 -- 3 --> GPI pin 4 (0x10) | | | | <- -- 4 -- 5 -- 6 --> GPI pin 5 (0x20) | | | | END - 7 -- 8 -- 9 --> GPI pin 6 (0x40) | | | | OK -- * -- 0 -- # --> GPI pin 7 (0x80) | | | | /|\ /|\ /|\ /|\ | | | | GPO pin: 3 2 1 0 0x8 0x4 0x2 0x1 */ static unsigned short keymap_kip1000(int scancode) { switch (scancode) { /* phone key: */ case 0x82: return KEY_NUMERIC_0; /* 0 */ case 0x14: return KEY_NUMERIC_1; /* 1 */ case 0x12: return KEY_NUMERIC_2; /* 2 */ case 0x11: return KEY_NUMERIC_3; /* 3 */ case 0x24: return KEY_NUMERIC_4; /* 4 */ case 0x22: return KEY_NUMERIC_5; /* 5 */ case 0x21: return KEY_NUMERIC_6; /* 6 */ case 0x44: return KEY_NUMERIC_7; /* 7 */ case 0x42: return KEY_NUMERIC_8; /* 8 */ case 0x41: return KEY_NUMERIC_9; /* 9 */ case 0x81: return KEY_NUMERIC_POUND; /* # */ case 0x84: return KEY_NUMERIC_STAR; /* * */ case 0x88: return KEY_ENTER; /* pickup */ case 0x48: return KEY_ESC; /* hangup */ case 0x28: return KEY_LEFT; /* IN */ case 0x18: return KEY_RIGHT; /* OUT */ default: return special_keymap(scancode); } } /* Contributed by Shaun Jackman <sjackman@gmail.com> Genius G-Talk keyboard matrix 0 1 2 3 4: 0 4 8 Talk 5: 1 5 9 End 6: 2 6 # Up 7: 3 7 * Down */ static unsigned short keymap_gtalk(int scancode) { switch (scancode) { case 0x11: return KEY_NUMERIC_0; case 0x21: return KEY_NUMERIC_1; case 0x41: return KEY_NUMERIC_2; case 0x81: return KEY_NUMERIC_3; case 0x12: return KEY_NUMERIC_4; case 0x22: return KEY_NUMERIC_5; case 0x42: return KEY_NUMERIC_6; case 0x82: return KEY_NUMERIC_7; case 0x14: return KEY_NUMERIC_8; case 0x24: return KEY_NUMERIC_9; case 0x44: return KEY_NUMERIC_POUND; /* # */ case 0x84: return KEY_NUMERIC_STAR; /* * */ case 0x18: return KEY_ENTER; /* Talk (green handset) */ case 0x28: return KEY_ESC; /* End (red handset) */ case 0x48: return KEY_UP; /* Menu up (rocker switch) */ case 0x88: return KEY_DOWN; /* Menu down (rocker switch) */ default: return special_keymap(scancode); } } /* * Keymap for Allied-Telesis Corega USBPH01 * http://www.alliedtelesis-corega.com/2/1344/1437/1360/chprd.html * * Contributed by july@nat.bg */ static unsigned short keymap_usbph01(int scancode) { switch (scancode) { case 0x11: return KEY_NUMERIC_0; /* 0 */ case 0x21: return KEY_NUMERIC_1; /* 1 */ case 0x41: return KEY_NUMERIC_2; /* 2 */ case 0x81: return KEY_NUMERIC_3; /* 3 */ case 0x12: return KEY_NUMERIC_4; /* 4 */ case 0x22: return KEY_NUMERIC_5; /* 5 */ case 0x42: return KEY_NUMERIC_6; /* 6 */ case 0x82: return KEY_NUMERIC_7; /* 7 */ case 0x14: return KEY_NUMERIC_8; /* 8 */ case 0x24: return KEY_NUMERIC_9; /* 9 */ case 0x44: return KEY_NUMERIC_POUND; /* # */ case 0x84: return KEY_NUMERIC_STAR; /* * */ case 0x18: return KEY_ENTER; /* pickup */ case 0x28: return KEY_ESC; /* hangup */ case 0x48: return KEY_LEFT; /* IN */ case 0x88: return KEY_RIGHT; /* OUT */ default: return special_keymap(scancode); } } /* * Keymap for ATCom AU-100 * http://www.atcom.cn/products.html * http://www.packetizer.com/products/au100/ * http://www.voip-info.org/wiki/view/AU-100 * * Contributed by daniel@gimpelevich.san-francisco.ca.us */ static unsigned short keymap_atcom(int scancode) { switch (scancode) { /* phone key: */ case 0x82: return KEY_NUMERIC_0; /* 0 */ case 0x11: return KEY_NUMERIC_1; /* 1 */ case 0x12: return KEY_NUMERIC_2; /* 2 */ case 0x14: return KEY_NUMERIC_3; /* 3 */ case 0x21: return KEY_NUMERIC_4; /* 4 */ case 0x22: return KEY_NUMERIC_5; /* 5 */ case 0x24: return KEY_NUMERIC_6; /* 6 */ case 0x41: return KEY_NUMERIC_7; /* 7 */ case 0x42: return KEY_NUMERIC_8; /* 8 */ case 0x44: return KEY_NUMERIC_9; /* 9 */ case 0x84: return KEY_NUMERIC_POUND; /* # */ case 0x81: return KEY_NUMERIC_STAR; /* * */ case 0x18: return KEY_ENTER; /* pickup */ case 0x28: return KEY_ESC; /* hangup */ case 0x48: return KEY_LEFT; /* left arrow */ case 0x88: return KEY_RIGHT; /* right arrow */ default: return special_keymap(scancode); } } static unsigned short (*keymap)(int) = keymap_kip1000; /* * Completes a request by converting the data into events for the * input subsystem. */ static void report_key(struct cm109_dev *dev, int key) { struct input_dev *idev = dev->idev; if (dev->key_code >= 0) { /* old key up */ input_report_key(idev, dev->key_code, 0); } dev->key_code = key; if (key >= 0) { /* new valid key */ input_report_key(idev, key, 1); } input_sync(idev); } /* * Converts data of special key presses (volume, mute) into events * for the input subsystem, sends press-n-release for mute keys. */ static void cm109_report_special(struct cm109_dev *dev) { static const u8 autorelease = RECORD_MUTE | PLAYBACK_MUTE; struct input_dev *idev = dev->idev; u8 data = dev->irq_data->byte[HID_IR0]; unsigned short keycode; int i; for (i = 0; i < 4; i++) { keycode = dev->keymap[0xff + BIT(i)]; if (keycode == KEY_RESERVED) continue; input_report_key(idev, keycode, data & BIT(i)); if (data & autorelease & BIT(i)) { input_sync(idev); input_report_key(idev, keycode, 0); } } input_sync(idev); } /****************************************************************************** * CM109 usb communication interface *****************************************************************************/ static void cm109_submit_buzz_toggle(struct cm109_dev *dev) { int error; if (dev->buzzer_state) dev->ctl_data->byte[HID_OR0] |= BUZZER_ON; else dev->ctl_data->byte[HID_OR0] &= ~BUZZER_ON; error = usb_submit_urb(dev->urb_ctl, GFP_ATOMIC); if (error) dev_err(&dev->intf->dev, "%s: usb_submit_urb (urb_ctl) failed %d\n", __func__, error); } /* * IRQ handler */ static void cm109_urb_irq_callback(struct urb *urb) { struct cm109_dev *dev = urb->context; const int status = urb->status; int error; unsigned long flags; dev_dbg(&dev->intf->dev, "### URB IRQ: [0x%02x 0x%02x 0x%02x 0x%02x] keybit=0x%02x\n", dev->irq_data->byte[0], dev->irq_data->byte[1], dev->irq_data->byte[2], dev->irq_data->byte[3], dev->keybit); if (status) { if (status == -ESHUTDOWN) return; dev_err_ratelimited(&dev->intf->dev, "%s: urb status %d\n", __func__, status); goto out; } /* Special keys */ cm109_report_special(dev); /* Scan key column */ if (dev->keybit == 0xf) { /* Any changes ? */ if ((dev->gpi & 0xf0) == (dev->irq_data->byte[HID_IR1] & 0xf0)) goto out; dev->gpi = dev->irq_data->byte[HID_IR1] & 0xf0; dev->keybit = 0x1; } else { report_key(dev, dev->keymap[dev->irq_data->byte[HID_IR1]]); dev->keybit <<= 1; if (dev->keybit > 0x8) dev->keybit = 0xf; } out: spin_lock_irqsave(&dev->ctl_submit_lock, flags); dev->irq_urb_pending = 0; if (likely(!dev->shutdown)) { if (dev->buzzer_state) dev->ctl_data->byte[HID_OR0] |= BUZZER_ON; else dev->ctl_data->byte[HID_OR0] &= ~BUZZER_ON; dev->ctl_data->byte[HID_OR1] = dev->keybit; dev->ctl_data->byte[HID_OR2] = dev->keybit; dev->buzzer_pending = 0; dev->ctl_urb_pending = 1; error = usb_submit_urb(dev->urb_ctl, GFP_ATOMIC); if (error) dev_err(&dev->intf->dev, "%s: usb_submit_urb (urb_ctl) failed %d\n", __func__, error); } spin_unlock_irqrestore(&dev->ctl_submit_lock, flags); } static void cm109_urb_ctl_callback(struct urb *urb) { struct cm109_dev *dev = urb->context; const int status = urb->status; int error; unsigned long flags; dev_dbg(&dev->intf->dev, "### URB CTL: [0x%02x 0x%02x 0x%02x 0x%02x]\n", dev->ctl_data->byte[0], dev->ctl_data->byte[1], dev->ctl_data->byte[2], dev->ctl_data->byte[3]); if (status) { if (status == -ESHUTDOWN) return; dev_err_ratelimited(&dev->intf->dev, "%s: urb status %d\n", __func__, status); } spin_lock_irqsave(&dev->ctl_submit_lock, flags); dev->ctl_urb_pending = 0; if (likely(!dev->shutdown)) { if (dev->buzzer_pending || status) { dev->buzzer_pending = 0; dev->ctl_urb_pending = 1; cm109_submit_buzz_toggle(dev); } else if (likely(!dev->irq_urb_pending)) { /* ask for key data */ dev->irq_urb_pending = 1; error = usb_submit_urb(dev->urb_irq, GFP_ATOMIC); if (error) dev_err(&dev->intf->dev, "%s: usb_submit_urb (urb_irq) failed %d\n", __func__, error); } } spin_unlock_irqrestore(&dev->ctl_submit_lock, flags); } static void cm109_toggle_buzzer_async(struct cm109_dev *dev) { unsigned long flags; spin_lock_irqsave(&dev->ctl_submit_lock, flags); if (dev->ctl_urb_pending) { /* URB completion will resubmit */ dev->buzzer_pending = 1; } else { dev->ctl_urb_pending = 1; cm109_submit_buzz_toggle(dev); } spin_unlock_irqrestore(&dev->ctl_submit_lock, flags); } static void cm109_toggle_buzzer_sync(struct cm109_dev *dev, int on) { int error; if (on) dev->ctl_data->byte[HID_OR0] |= BUZZER_ON; else dev->ctl_data->byte[HID_OR0] &= ~BUZZER_ON; error = usb_control_msg(dev->udev, usb_sndctrlpipe(dev->udev, 0), dev->ctl_req->bRequest, dev->ctl_req->bRequestType, le16_to_cpu(dev->ctl_req->wValue), le16_to_cpu(dev->ctl_req->wIndex), dev->ctl_data, USB_PKT_LEN, USB_CTRL_SET_TIMEOUT); if (error < 0 && error != -EINTR) dev_err(&dev->intf->dev, "%s: usb_control_msg() failed %d\n", __func__, error); } static void cm109_stop_traffic(struct cm109_dev *dev) { dev->shutdown = 1; /* * Make sure other CPUs see this */ smp_wmb(); usb_kill_urb(dev->urb_ctl); usb_kill_urb(dev->urb_irq); cm109_toggle_buzzer_sync(dev, 0); dev->shutdown = 0; smp_wmb(); } static void cm109_restore_state(struct cm109_dev *dev) { if (dev->open) { /* * Restore buzzer state. * This will also kick regular URB submission */ cm109_toggle_buzzer_async(dev); } } /****************************************************************************** * input event interface *****************************************************************************/ static int cm109_input_open(struct input_dev *idev) { struct cm109_dev *dev = input_get_drvdata(idev); int error; error = usb_autopm_get_interface(dev->intf); if (error < 0) { dev_err(&idev->dev, "%s - cannot autoresume, result %d\n", __func__, error); return error; } mutex_lock(&dev->pm_mutex); dev->buzzer_state = 0; dev->key_code = -1; /* no keys pressed */ dev->keybit = 0xf; /* issue INIT */ dev->ctl_data->byte[HID_OR0] = HID_OR_GPO_BUZ_SPDIF; dev->ctl_data->byte[HID_OR1] = dev->keybit; dev->ctl_data->byte[HID_OR2] = dev->keybit; dev->ctl_data->byte[HID_OR3] = 0x00; dev->ctl_urb_pending = 1; error = usb_submit_urb(dev->urb_ctl, GFP_KERNEL); if (error) { dev->ctl_urb_pending = 0; dev_err(&dev->intf->dev, "%s: usb_submit_urb (urb_ctl) failed %d\n", __func__, error); } else { dev->open = 1; } mutex_unlock(&dev->pm_mutex); if (error) usb_autopm_put_interface(dev->intf); return error; } static void cm109_input_close(struct input_dev *idev) { struct cm109_dev *dev = input_get_drvdata(idev); mutex_lock(&dev->pm_mutex); /* * Once we are here event delivery is stopped so we * don't need to worry about someone starting buzzer * again */ cm109_stop_traffic(dev); dev->open = 0; mutex_unlock(&dev->pm_mutex); usb_autopm_put_interface(dev->intf); } static int cm109_input_ev(struct input_dev *idev, unsigned int type, unsigned int code, int value) { struct cm109_dev *dev = input_get_drvdata(idev); dev_dbg(&dev->intf->dev, "input_ev: type=%u code=%u value=%d\n", type, code, value); if (type != EV_SND) return -EINVAL; switch (code) { case SND_TONE: case SND_BELL: dev->buzzer_state = !!value; if (!dev->resetting) cm109_toggle_buzzer_async(dev); return 0; default: return -EINVAL; } } /****************************************************************************** * Linux interface and usb initialisation *****************************************************************************/ struct driver_info { char *name; }; static const struct driver_info info_cm109 = { .name = "CM109 USB driver", }; enum { VENDOR_ID = 0x0d8c, /* C-Media Electronics */ PRODUCT_ID_CM109 = 0x000e, /* CM109 defines range 0x0008 - 0x000f */ }; /* table of devices that work with this driver */ static const struct usb_device_id cm109_usb_table[] = { { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = VENDOR_ID, .idProduct = PRODUCT_ID_CM109, .bInterfaceClass = USB_CLASS_HID, .bInterfaceSubClass = 0, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t) &info_cm109 }, /* you can add more devices here with product ID 0x0008 - 0x000f */ { } }; static void cm109_usb_cleanup(struct cm109_dev *dev) { kfree(dev->ctl_req); usb_free_coherent(dev->udev, USB_PKT_LEN, dev->ctl_data, dev->ctl_dma); usb_free_coherent(dev->udev, USB_PKT_LEN, dev->irq_data, dev->irq_dma); usb_free_urb(dev->urb_irq); /* parameter validation in core/urb */ usb_free_urb(dev->urb_ctl); /* parameter validation in core/urb */ kfree(dev); } static void cm109_usb_disconnect(struct usb_interface *interface) { struct cm109_dev *dev = usb_get_intfdata(interface); usb_set_intfdata(interface, NULL); input_unregister_device(dev->idev); cm109_usb_cleanup(dev); } static int cm109_usb_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usb_device *udev = interface_to_usbdev(intf); struct driver_info *nfo = (struct driver_info *)id->driver_info; struct usb_host_interface *interface; struct usb_endpoint_descriptor *endpoint; struct cm109_dev *dev; struct input_dev *input_dev = NULL; int ret, pipe, i; int error = -ENOMEM; interface = intf->cur_altsetting; if (interface->desc.bNumEndpoints < 1) return -ENODEV; endpoint = &interface->endpoint[0].desc; if (!usb_endpoint_is_int_in(endpoint)) return -ENODEV; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return -ENOMEM; spin_lock_init(&dev->ctl_submit_lock); mutex_init(&dev->pm_mutex); dev->udev = udev; dev->intf = intf; dev->idev = input_dev = input_allocate_device(); if (!input_dev) goto err_out; /* allocate usb buffers */ dev->irq_data = usb_alloc_coherent(udev, USB_PKT_LEN, GFP_KERNEL, &dev->irq_dma); if (!dev->irq_data) goto err_out; dev->ctl_data = usb_alloc_coherent(udev, USB_PKT_LEN, GFP_KERNEL, &dev->ctl_dma); if (!dev->ctl_data) goto err_out; dev->ctl_req = kmalloc(sizeof(*(dev->ctl_req)), GFP_KERNEL); if (!dev->ctl_req) goto err_out; /* allocate urb structures */ dev->urb_irq = usb_alloc_urb(0, GFP_KERNEL); if (!dev->urb_irq) goto err_out; dev->urb_ctl = usb_alloc_urb(0, GFP_KERNEL); if (!dev->urb_ctl) goto err_out; /* get a handle to the interrupt data pipe */ pipe = usb_rcvintpipe(udev, endpoint->bEndpointAddress); ret = usb_maxpacket(udev, pipe); if (ret != USB_PKT_LEN) dev_err(&intf->dev, "invalid payload size %d, expected %d\n", ret, USB_PKT_LEN); /* initialise irq urb */ usb_fill_int_urb(dev->urb_irq, udev, pipe, dev->irq_data, USB_PKT_LEN, cm109_urb_irq_callback, dev, endpoint->bInterval); dev->urb_irq->transfer_dma = dev->irq_dma; dev->urb_irq->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; dev->urb_irq->dev = udev; /* initialise ctl urb */ dev->ctl_req->bRequestType = USB_TYPE_CLASS | USB_RECIP_INTERFACE | USB_DIR_OUT; dev->ctl_req->bRequest = USB_REQ_SET_CONFIGURATION; dev->ctl_req->wValue = cpu_to_le16(0x200); dev->ctl_req->wIndex = cpu_to_le16(interface->desc.bInterfaceNumber); dev->ctl_req->wLength = cpu_to_le16(USB_PKT_LEN); usb_fill_control_urb(dev->urb_ctl, udev, usb_sndctrlpipe(udev, 0), (void *)dev->ctl_req, dev->ctl_data, USB_PKT_LEN, cm109_urb_ctl_callback, dev); dev->urb_ctl->transfer_dma = dev->ctl_dma; dev->urb_ctl->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; dev->urb_ctl->dev = udev; /* find out the physical bus location */ usb_make_path(udev, dev->phys, sizeof(dev->phys)); strlcat(dev->phys, "/input0", sizeof(dev->phys)); /* register settings for the input device */ input_dev->name = nfo->name; input_dev->phys = dev->phys; usb_to_input_id(udev, &input_dev->id); input_dev->dev.parent = &intf->dev; input_set_drvdata(input_dev, dev); input_dev->open = cm109_input_open; input_dev->close = cm109_input_close; input_dev->event = cm109_input_ev; input_dev->keycode = dev->keymap; input_dev->keycodesize = sizeof(unsigned char); input_dev->keycodemax = ARRAY_SIZE(dev->keymap); input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_SND); input_dev->sndbit[0] = BIT_MASK(SND_BELL) | BIT_MASK(SND_TONE); /* register available key events */ for (i = 0; i < KEYMAP_SIZE; i++) { unsigned short k = keymap(i); dev->keymap[i] = k; __set_bit(k, input_dev->keybit); } __clear_bit(KEY_RESERVED, input_dev->keybit); error = input_register_device(dev->idev); if (error) goto err_out; usb_set_intfdata(intf, dev); return 0; err_out: input_free_device(input_dev); cm109_usb_cleanup(dev); return error; } static int cm109_usb_suspend(struct usb_interface *intf, pm_message_t message) { struct cm109_dev *dev = usb_get_intfdata(intf); dev_info(&intf->dev, "cm109: usb_suspend (event=%d)\n", message.event); mutex_lock(&dev->pm_mutex); cm109_stop_traffic(dev); mutex_unlock(&dev->pm_mutex); return 0; } static int cm109_usb_resume(struct usb_interface *intf) { struct cm109_dev *dev = usb_get_intfdata(intf); dev_info(&intf->dev, "cm109: usb_resume\n"); mutex_lock(&dev->pm_mutex); cm109_restore_state(dev); mutex_unlock(&dev->pm_mutex); return 0; } static int cm109_usb_pre_reset(struct usb_interface *intf) { struct cm109_dev *dev = usb_get_intfdata(intf); mutex_lock(&dev->pm_mutex); /* * Make sure input events don't try to toggle buzzer * while we are resetting */ dev->resetting = 1; smp_wmb(); cm109_stop_traffic(dev); return 0; } static int cm109_usb_post_reset(struct usb_interface *intf) { struct cm109_dev *dev = usb_get_intfdata(intf); dev->resetting = 0; smp_wmb(); cm109_restore_state(dev); mutex_unlock(&dev->pm_mutex); return 0; } static struct usb_driver cm109_driver = { .name = "cm109", .probe = cm109_usb_probe, .disconnect = cm109_usb_disconnect, .suspend = cm109_usb_suspend, .resume = cm109_usb_resume, .reset_resume = cm109_usb_resume, .pre_reset = cm109_usb_pre_reset, .post_reset = cm109_usb_post_reset, .id_table = cm109_usb_table, .supports_autosuspend = 1, }; static int __init cm109_select_keymap(void) { /* Load the phone keymap */ if (!strcasecmp(phone, "kip1000")) { keymap = keymap_kip1000; printk(KERN_INFO KBUILD_MODNAME ": " "Keymap for Komunikate KIP1000 phone loaded\n"); } else if (!strcasecmp(phone, "gtalk")) { keymap = keymap_gtalk; printk(KERN_INFO KBUILD_MODNAME ": " "Keymap for Genius G-talk phone loaded\n"); } else if (!strcasecmp(phone, "usbph01")) { keymap = keymap_usbph01; printk(KERN_INFO KBUILD_MODNAME ": " "Keymap for Allied-Telesis Corega USBPH01 phone loaded\n"); } else if (!strcasecmp(phone, "atcom")) { keymap = keymap_atcom; printk(KERN_INFO KBUILD_MODNAME ": " "Keymap for ATCom AU-100 phone loaded\n"); } else { printk(KERN_ERR KBUILD_MODNAME ": " "Unsupported phone: %s\n", phone); return -EINVAL; } return 0; } static int __init cm109_init(void) { int err; err = cm109_select_keymap(); if (err) return err; err = usb_register(&cm109_driver); if (err) return err; printk(KERN_INFO KBUILD_MODNAME ": " DRIVER_DESC ": " DRIVER_VERSION " (C) " DRIVER_AUTHOR "\n"); return 0; } static void __exit cm109_exit(void) { usb_deregister(&cm109_driver); } module_init(cm109_init); module_exit(cm109_exit); MODULE_DEVICE_TABLE(usb, cm109_usb_table); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL"); |
| 17 17 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved. * Authors: David Chinner and Glauber Costa * * Generic LRU infrastructure */ #ifndef _LRU_LIST_H #define _LRU_LIST_H #include <linux/list.h> #include <linux/nodemask.h> #include <linux/shrinker.h> #include <linux/xarray.h> struct mem_cgroup; /* list_lru_walk_cb has to always return one of those */ enum lru_status { LRU_REMOVED, /* item removed from list */ LRU_REMOVED_RETRY, /* item removed, but lock has been dropped and reacquired */ LRU_ROTATE, /* item referenced, give another pass */ LRU_SKIP, /* item cannot be locked, skip */ LRU_RETRY, /* item not freeable. May drop the lock internally, but has to return locked. */ }; struct list_lru_one { struct list_head list; /* may become negative during memcg reparenting */ long nr_items; }; struct list_lru_memcg { struct rcu_head rcu; /* array of per cgroup per node lists, indexed by node id */ struct list_lru_one node[]; }; struct list_lru_node { /* protects all lists on the node, including per cgroup */ spinlock_t lock; /* global list, used for the root cgroup in cgroup aware lrus */ struct list_lru_one lru; long nr_items; } ____cacheline_aligned_in_smp; struct list_lru { struct list_lru_node *node; #ifdef CONFIG_MEMCG_KMEM struct list_head list; int shrinker_id; bool memcg_aware; struct xarray xa; #endif }; void list_lru_destroy(struct list_lru *lru); int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct lock_class_key *key, struct shrinker *shrinker); #define list_lru_init(lru) \ __list_lru_init((lru), false, NULL, NULL) #define list_lru_init_key(lru, key) \ __list_lru_init((lru), false, (key), NULL) #define list_lru_init_memcg(lru, shrinker) \ __list_lru_init((lru), true, NULL, shrinker) int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, gfp_t gfp); void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent); /** * list_lru_add: add an element to the lru list's tail * @list_lru: the lru pointer * @item: the item to be added. * * If the element is already part of a list, this function returns doing * nothing. Therefore the caller does not need to keep state about whether or * not the element already belongs in the list and is allowed to lazy update * it. Note however that this is valid for *a* list, not *this* list. If * the caller organize itself in a way that elements can be in more than * one type of list, it is up to the caller to fully remove the item from * the previous list (with list_lru_del() for instance) before moving it * to @list_lru * * Return value: true if the list was updated, false otherwise */ bool list_lru_add(struct list_lru *lru, struct list_head *item); /** * list_lru_del: delete an element to the lru list * @list_lru: the lru pointer * @item: the item to be deleted. * * This function works analogously as list_lru_add in terms of list * manipulation. The comments about an element already pertaining to * a list are also valid for list_lru_del. * * Return value: true if the list was updated, false otherwise */ bool list_lru_del(struct list_lru *lru, struct list_head *item); /** * list_lru_count_one: return the number of objects currently held by @lru * @lru: the lru pointer. * @nid: the node id to count from. * @memcg: the cgroup to count from. * * Always return a non-negative number, 0 for empty lists. There is no * guarantee that the list is not updated while the count is being computed. * Callers that want such a guarantee need to provide an outer lock. */ unsigned long list_lru_count_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg); unsigned long list_lru_count_node(struct list_lru *lru, int nid); static inline unsigned long list_lru_shrink_count(struct list_lru *lru, struct shrink_control *sc) { return list_lru_count_one(lru, sc->nid, sc->memcg); } static inline unsigned long list_lru_count(struct list_lru *lru) { long count = 0; int nid; for_each_node_state(nid, N_NORMAL_MEMORY) count += list_lru_count_node(lru, nid); return count; } void list_lru_isolate(struct list_lru_one *list, struct list_head *item); void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, struct list_head *head); typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item, struct list_lru_one *list, spinlock_t *lock, void *cb_arg); /** * list_lru_walk_one: walk a list_lru, isolating and disposing freeable items. * @lru: the lru pointer. * @nid: the node id to scan from. * @memcg: the cgroup to scan from. * @isolate: callback function that is responsible for deciding what to do with * the item currently being scanned * @cb_arg: opaque type that will be passed to @isolate * @nr_to_walk: how many items to scan. * * This function will scan all elements in a particular list_lru, calling the * @isolate callback for each of those items, along with the current list * spinlock and a caller-provided opaque. The @isolate callback can choose to * drop the lock internally, but *must* return with the lock held. The callback * will return an enum lru_status telling the list_lru infrastructure what to * do with the object being scanned. * * Please note that nr_to_walk does not mean how many objects will be freed, * just how many objects will be scanned. * * Return value: the number of objects effectively removed from the LRU. */ unsigned long list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk); /** * list_lru_walk_one_irq: walk a list_lru, isolating and disposing freeable items. * @lru: the lru pointer. * @nid: the node id to scan from. * @memcg: the cgroup to scan from. * @isolate: callback function that is responsible for deciding what to do with * the item currently being scanned * @cb_arg: opaque type that will be passed to @isolate * @nr_to_walk: how many items to scan. * * Same as @list_lru_walk_one except that the spinlock is acquired with * spin_lock_irq(). */ unsigned long list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk); unsigned long list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk); static inline unsigned long list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc, list_lru_walk_cb isolate, void *cb_arg) { return list_lru_walk_one(lru, sc->nid, sc->memcg, isolate, cb_arg, &sc->nr_to_scan); } static inline unsigned long list_lru_shrink_walk_irq(struct list_lru *lru, struct shrink_control *sc, list_lru_walk_cb isolate, void *cb_arg) { return list_lru_walk_one_irq(lru, sc->nid, sc->memcg, isolate, cb_arg, &sc->nr_to_scan); } static inline unsigned long list_lru_walk(struct list_lru *lru, list_lru_walk_cb isolate, void *cb_arg, unsigned long nr_to_walk) { long isolated = 0; int nid; for_each_node_state(nid, N_NORMAL_MEMORY) { isolated += list_lru_walk_node(lru, nid, isolate, cb_arg, &nr_to_walk); if (nr_to_walk <= 0) break; } return isolated; } #endif /* _LRU_LIST_H */ |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_SMP_H #define __LINUX_SMP_H /* * Generic SMP support * Alan Cox. <alan@redhat.com> */ #include <linux/errno.h> #include <linux/types.h> #include <linux/list.h> #include <linux/cpumask.h> #include <linux/init.h> #include <linux/smp_types.h> typedef void (*smp_call_func_t)(void *info); typedef bool (*smp_cond_func_t)(int cpu, void *info); /* * structure shares (partial) layout with struct irq_work */ struct __call_single_data { struct __call_single_node node; smp_call_func_t func; void *info; }; #define CSD_INIT(_func, _info) \ (struct __call_single_data){ .func = (_func), .info = (_info), } /* Use __aligned() to avoid to use 2 cache lines for 1 csd */ typedef struct __call_single_data call_single_data_t __aligned(sizeof(struct __call_single_data)); #define INIT_CSD(_csd, _func, _info) \ do { \ *(_csd) = CSD_INIT((_func), (_info)); \ } while (0) /* * Enqueue a llist_node on the call_single_queue; be very careful, read * flush_smp_call_function_queue() in detail. */ extern void __smp_call_single_queue(int cpu, struct llist_node *node); /* total number of cpus in this system (may exceed NR_CPUS) */ extern unsigned int total_cpus; int smp_call_function_single(int cpuid, smp_call_func_t func, void *info, int wait); void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, void *info, bool wait, const struct cpumask *mask); int smp_call_function_single_async(int cpu, struct __call_single_data *csd); /* * Cpus stopping functions in panic. All have default weak definitions. * Architecture-dependent code may override them. */ void __noreturn panic_smp_self_stop(void); void __noreturn nmi_panic_self_stop(struct pt_regs *regs); void crash_smp_send_stop(void); /* * Call a function on all processors */ static inline void on_each_cpu(smp_call_func_t func, void *info, int wait) { on_each_cpu_cond_mask(NULL, func, info, wait, cpu_online_mask); } /** * on_each_cpu_mask(): Run a function on processors specified by * cpumask, which may include the local processor. * @mask: The set of cpus to run on (only runs on online subset). * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait (atomically) until function has completed * on other CPUs. * * If @wait is true, then returns once @func has returned. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. The * exception is that it may be used during early boot while * early_boot_irqs_disabled is set. */ static inline void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait) { on_each_cpu_cond_mask(NULL, func, info, wait, mask); } /* * Call a function on each processor for which the supplied function * cond_func returns a positive value. This may include the local * processor. May be used during early boot while early_boot_irqs_disabled is * set. Use local_irq_save/restore() instead of local_irq_disable/enable(). */ static inline void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func, void *info, bool wait) { on_each_cpu_cond_mask(cond_func, func, info, wait, cpu_online_mask); } #ifdef CONFIG_SMP #include <linux/preempt.h> #include <linux/compiler.h> #include <linux/thread_info.h> #include <asm/smp.h> /* * main cross-CPU interfaces, handles INIT, TLB flush, STOP, etc. * (defined in asm header): */ /* * stops all CPUs but the current one: */ extern void smp_send_stop(void); /* * sends a 'reschedule' event to another CPU: */ extern void arch_smp_send_reschedule(int cpu); /* * scheduler_ipi() is inline so can't be passed as callback reason, but the * callsite IP should be sufficient for root-causing IPIs sent from here. */ #define smp_send_reschedule(cpu) ({ \ trace_ipi_send_cpu(cpu, _RET_IP_, NULL); \ arch_smp_send_reschedule(cpu); \ }) /* * Prepare machine for booting other CPUs. */ extern void smp_prepare_cpus(unsigned int max_cpus); /* * Bring a CPU up */ extern int __cpu_up(unsigned int cpunum, struct task_struct *tidle); /* * Final polishing of CPUs */ extern void smp_cpus_done(unsigned int max_cpus); /* * Call a function on all other processors */ void smp_call_function(smp_call_func_t func, void *info, int wait); void smp_call_function_many(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait); int smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, void *info, int wait); void kick_all_cpus_sync(void); void wake_up_all_idle_cpus(void); /* * Generic and arch helpers */ void __init call_function_init(void); void generic_smp_call_function_single_interrupt(void); #define generic_smp_call_function_interrupt \ generic_smp_call_function_single_interrupt /* * Mark the boot cpu "online" so that it can call console drivers in * printk() and can access its per-cpu storage. */ void smp_prepare_boot_cpu(void); extern unsigned int setup_max_cpus; extern void __init setup_nr_cpu_ids(void); extern void __init smp_init(void); extern int __boot_cpu_id; static inline int get_boot_cpu_id(void) { return __boot_cpu_id; } #else /* !SMP */ static inline void smp_send_stop(void) { } /* * These macros fold the SMP functionality into a single CPU system */ #define raw_smp_processor_id() 0 static inline void up_smp_call_function(smp_call_func_t func, void *info) { } #define smp_call_function(func, info, wait) \ (up_smp_call_function(func, info)) static inline void smp_send_reschedule(int cpu) { } #define smp_prepare_boot_cpu() do {} while (0) #define smp_call_function_many(mask, func, info, wait) \ (up_smp_call_function(func, info)) static inline void call_function_init(void) { } static inline int smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, void *info, int wait) { return smp_call_function_single(0, func, info, wait); } static inline void kick_all_cpus_sync(void) { } static inline void wake_up_all_idle_cpus(void) { } #ifdef CONFIG_UP_LATE_INIT extern void __init up_late_init(void); static inline void smp_init(void) { up_late_init(); } #else static inline void smp_init(void) { } #endif static inline int get_boot_cpu_id(void) { return 0; } #endif /* !SMP */ /** * raw_processor_id() - get the current (unstable) CPU id * * For then you know what you are doing and need an unstable * CPU id. */ /** * smp_processor_id() - get the current (stable) CPU id * * This is the normal accessor to the CPU id and should be used * whenever possible. * * The CPU id is stable when: * * - IRQs are disabled; * - preemption is disabled; * - the task is CPU affine. * * When CONFIG_DEBUG_PREEMPT; we verify these assumption and WARN * when smp_processor_id() is used when the CPU id is not stable. */ /* * Allow the architecture to differentiate between a stable and unstable read. * For example, x86 uses an IRQ-safe asm-volatile read for the unstable but a * regular asm read for the stable. */ #ifndef __smp_processor_id #define __smp_processor_id(x) raw_smp_processor_id(x) #endif #ifdef CONFIG_DEBUG_PREEMPT extern unsigned int debug_smp_processor_id(void); # define smp_processor_id() debug_smp_processor_id() #else # define smp_processor_id() __smp_processor_id() #endif #define get_cpu() ({ preempt_disable(); __smp_processor_id(); }) #define put_cpu() preempt_enable() /* * Callback to arch code if there's nosmp or maxcpus=0 on the * boot command line: */ extern void arch_disable_smp_support(void); extern void arch_thaw_secondary_cpus_begin(void); extern void arch_thaw_secondary_cpus_end(void); void smp_setup_processor_id(void); int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys); /* SMP core functions */ int smpcfd_prepare_cpu(unsigned int cpu); int smpcfd_dead_cpu(unsigned int cpu); int smpcfd_dying_cpu(unsigned int cpu); #endif /* __LINUX_SMP_H */ |
| 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Force feedback support for Logitech RumblePad and Rumblepad 2 * * Copyright (c) 2008 Anssi Hannula <anssi.hannula@gmail.com> */ /* */ #include <linux/input.h> #include <linux/slab.h> #include <linux/hid.h> #include "hid-lg.h" struct lg2ff_device { struct hid_report *report; }; static int play_effect(struct input_dev *dev, void *data, struct ff_effect *effect) { struct hid_device *hid = input_get_drvdata(dev); struct lg2ff_device *lg2ff = data; int weak, strong; strong = effect->u.rumble.strong_magnitude; weak = effect->u.rumble.weak_magnitude; if (weak || strong) { weak = weak * 0xff / 0xffff; strong = strong * 0xff / 0xffff; lg2ff->report->field[0]->value[0] = 0x51; lg2ff->report->field[0]->value[2] = weak; lg2ff->report->field[0]->value[4] = strong; } else { lg2ff->report->field[0]->value[0] = 0xf3; lg2ff->report->field[0]->value[2] = 0x00; lg2ff->report->field[0]->value[4] = 0x00; } hid_hw_request(hid, lg2ff->report, HID_REQ_SET_REPORT); return 0; } int lg2ff_init(struct hid_device *hid) { struct lg2ff_device *lg2ff; struct hid_report *report; struct hid_input *hidinput; struct input_dev *dev; int error; if (list_empty(&hid->inputs)) { hid_err(hid, "no inputs found\n"); return -ENODEV; } hidinput = list_entry(hid->inputs.next, struct hid_input, list); dev = hidinput->input; /* Check that the report looks ok */ report = hid_validate_values(hid, HID_OUTPUT_REPORT, 0, 0, 7); if (!report) return -ENODEV; lg2ff = kmalloc(sizeof(struct lg2ff_device), GFP_KERNEL); if (!lg2ff) return -ENOMEM; set_bit(FF_RUMBLE, dev->ffbit); error = input_ff_create_memless(dev, lg2ff, play_effect); if (error) { kfree(lg2ff); return error; } lg2ff->report = report; report->field[0]->value[0] = 0xf3; report->field[0]->value[1] = 0x00; report->field[0]->value[2] = 0x00; report->field[0]->value[3] = 0x00; report->field[0]->value[4] = 0x00; report->field[0]->value[5] = 0x00; report->field[0]->value[6] = 0x00; hid_hw_request(hid, report, HID_REQ_SET_REPORT); hid_info(hid, "Force feedback for Logitech variant 2 rumble devices by Anssi Hannula <anssi.hannula@gmail.com>\n"); return 0; } |
| 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2009 Sunplus Core Technology Co., Ltd. * Chen Liqin <liqin.chen@sunplusct.com> * Lennox Wu <lennox.wu@sunplusct.com> * Copyright (C) 2012 Regents of the University of California * Copyright (C) 2017 SiFive */ #include <linux/cpu.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/sched/debug.h> #include <linux/sched/task_stack.h> #include <linux/tick.h> #include <linux/ptrace.h> #include <linux/uaccess.h> #include <asm/unistd.h> #include <asm/processor.h> #include <asm/csr.h> #include <asm/stacktrace.h> #include <asm/string.h> #include <asm/switch_to.h> #include <asm/thread_info.h> #include <asm/cpuidle.h> #include <asm/vector.h> register unsigned long gp_in_global __asm__("gp"); #if defined(CONFIG_STACKPROTECTOR) && !defined(CONFIG_STACKPROTECTOR_PER_TASK) #include <linux/stackprotector.h> unsigned long __stack_chk_guard __read_mostly; EXPORT_SYMBOL(__stack_chk_guard); #endif extern asmlinkage void ret_from_fork(void); void arch_cpu_idle(void) { cpu_do_idle(); } void __show_regs(struct pt_regs *regs) { show_regs_print_info(KERN_DEFAULT); if (!user_mode(regs)) { pr_cont("epc : %pS\n", (void *)regs->epc); pr_cont(" ra : %pS\n", (void *)regs->ra); } pr_cont("epc : " REG_FMT " ra : " REG_FMT " sp : " REG_FMT "\n", regs->epc, regs->ra, regs->sp); pr_cont(" gp : " REG_FMT " tp : " REG_FMT " t0 : " REG_FMT "\n", regs->gp, regs->tp, regs->t0); pr_cont(" t1 : " REG_FMT " t2 : " REG_FMT " s0 : " REG_FMT "\n", regs->t1, regs->t2, regs->s0); pr_cont(" s1 : " REG_FMT " a0 : " REG_FMT " a1 : " REG_FMT "\n", regs->s1, regs->a0, regs->a1); pr_cont(" a2 : " REG_FMT " a3 : " REG_FMT " a4 : " REG_FMT "\n", regs->a2, regs->a3, regs->a4); pr_cont(" a5 : " REG_FMT " a6 : " REG_FMT " a7 : " REG_FMT "\n", regs->a5, regs->a6, regs->a7); pr_cont(" s2 : " REG_FMT " s3 : " REG_FMT " s4 : " REG_FMT "\n", regs->s2, regs->s3, regs->s4); pr_cont(" s5 : " REG_FMT " s6 : " REG_FMT " s7 : " REG_FMT "\n", regs->s5, regs->s6, regs->s7); pr_cont(" s8 : " REG_FMT " s9 : " REG_FMT " s10: " REG_FMT "\n", regs->s8, regs->s9, regs->s10); pr_cont(" s11: " REG_FMT " t3 : " REG_FMT " t4 : " REG_FMT "\n", regs->s11, regs->t3, regs->t4); pr_cont(" t5 : " REG_FMT " t6 : " REG_FMT "\n", regs->t5, regs->t6); pr_cont("status: " REG_FMT " badaddr: " REG_FMT " cause: " REG_FMT "\n", regs->status, regs->badaddr, regs->cause); } void show_regs(struct pt_regs *regs) { __show_regs(regs); if (!user_mode(regs)) dump_backtrace(regs, NULL, KERN_DEFAULT); } #ifdef CONFIG_COMPAT static bool compat_mode_supported __read_mostly; bool compat_elf_check_arch(Elf32_Ehdr *hdr) { return compat_mode_supported && hdr->e_machine == EM_RISCV && hdr->e_ident[EI_CLASS] == ELFCLASS32; } static int __init compat_mode_detect(void) { unsigned long tmp = csr_read(CSR_STATUS); csr_write(CSR_STATUS, (tmp & ~SR_UXL) | SR_UXL_32); compat_mode_supported = (csr_read(CSR_STATUS) & SR_UXL) == SR_UXL_32; csr_write(CSR_STATUS, tmp); pr_info("riscv: ELF compat mode %s", compat_mode_supported ? "supported" : "unsupported"); return 0; } early_initcall(compat_mode_detect); #endif void start_thread(struct pt_regs *regs, unsigned long pc, unsigned long sp) { regs->status = SR_PIE; if (has_fpu()) { regs->status |= SR_FS_INITIAL; /* * Restore the initial value to the FP register * before starting the user program. */ fstate_restore(current, regs); } regs->epc = pc; regs->sp = sp; #ifdef CONFIG_64BIT regs->status &= ~SR_UXL; if (is_compat_task()) regs->status |= SR_UXL_32; else regs->status |= SR_UXL_64; #endif } void flush_thread(void) { #ifdef CONFIG_FPU /* * Reset FPU state and context * frm: round to nearest, ties to even (IEEE default) * fflags: accrued exceptions cleared */ fstate_off(current, task_pt_regs(current)); memset(¤t->thread.fstate, 0, sizeof(current->thread.fstate)); #endif #ifdef CONFIG_RISCV_ISA_V /* Reset vector state */ riscv_v_vstate_ctrl_init(current); riscv_v_vstate_off(task_pt_regs(current)); kfree(current->thread.vstate.datap); memset(¤t->thread.vstate, 0, sizeof(struct __riscv_v_ext_state)); #endif } void arch_release_task_struct(struct task_struct *tsk) { /* Free the vector context of datap. */ if (has_vector()) kfree(tsk->thread.vstate.datap); } int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { fstate_save(src, task_pt_regs(src)); *dst = *src; /* clear entire V context, including datap for a new task */ memset(&dst->thread.vstate, 0, sizeof(struct __riscv_v_ext_state)); return 0; } int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long usp = args->stack; unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); memset(&p->thread.s, 0, sizeof(p->thread.s)); /* p->thread holds context to be restored by __switch_to() */ if (unlikely(args->fn)) { /* Kernel thread */ memset(childregs, 0, sizeof(struct pt_regs)); childregs->gp = gp_in_global; /* Supervisor/Machine, irqs on: */ childregs->status = SR_PP | SR_PIE; p->thread.s[0] = (unsigned long)args->fn; p->thread.s[1] = (unsigned long)args->fn_arg; } else { *childregs = *(current_pt_regs()); /* Turn off status.VS */ riscv_v_vstate_off(childregs); if (usp) /* User fork */ childregs->sp = usp; if (clone_flags & CLONE_SETTLS) childregs->tp = tls; childregs->a0 = 0; /* Return value of fork() */ p->thread.s[0] = 0; } p->thread.ra = (unsigned long)ret_from_fork; p->thread.sp = (unsigned long)childregs; /* kernel sp */ return 0; } |
| 3894 135 4286 646 4275 4287 646 136 1 3892 2098 3427 3951 3950 3951 2653 1565 3894 3893 3893 3328 247 125 3894 3892 3894 3894 3542 3427 3427 3894 2912 2912 2910 2912 2912 2912 2912 2913 2913 2913 2909 2912 145 138 145 92 2911 3894 3894 3894 232 232 97 97 136 136 136 2662 2662 2172 2662 137 137 136 135 2 2 3890 3542 3894 3894 3893 3890 3893 3893 3336 3314 3314 72 3314 3880 3894 3893 3894 3894 3894 3892 3894 3886 3886 3886 997 996 997 997 224 915 997 225 987 225 102 102 102 102 102 102 225 226 76 76 64 75 108 108 108 108 108 1 108 108 108 108 107 108 108 108 108 108 108 108 108 108 1 1 108 1 1 1 1 1 108 97 97 97 97 97 96 97 1 1 97 97 97 1649 316 59 311 55 59 59 12 12 302 1647 1649 1649 1649 1649 196 1566 1565 1 1 1 1566 1566 1566 1566 1374 15 15 15 2098 2098 2098 2098 2098 2098 318 229 229 229 229 229 229 229 229 229 229 229 229 229 229 229 229 229 229 229 229 229 229 229 229 229 229 229 229 229 229 229 229 229 229 229 229 228 229 229 229 229 229 229 229 229 229 229 229 229 229 229 229 229 226 226 226 229 229 229 229 229 229 229 229 229 229 229 229 229 229 97 97 97 97 97 94 97 97 97 97 97 97 97 97 97 |