| 1 9 9 8 1 5 4 2 2 2 1 1 2 2 2 2 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 | // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:ip,mark type */ #include <linux/jhash.h> #include <linux/module.h> #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/random.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/netlink.h> #include <net/tcp.h> #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 /* 1 Forceadd support */ /* 2 skbinfo support */ #define IPSET_TYPE_REV_MAX 3 /* bucketsize, initval support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Vytas Dauksa <vytas.dauksa@smoothwall.net>"); IP_SET_MODULE_DESC("hash:ip,mark", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip,mark"); /* Type specific function prefix */ #define HTYPE hash_ipmark #define IP_SET_HASH_WITH_MARKMASK /* IPv4 variant */ /* Member elements */ struct hash_ipmark4_elem { __be32 ip; __u32 mark; }; /* Common functions */ static bool hash_ipmark4_data_equal(const struct hash_ipmark4_elem *ip1, const struct hash_ipmark4_elem *ip2, u32 *multi) { return ip1->ip == ip2->ip && ip1->mark == ip2->mark; } static bool hash_ipmark4_data_list(struct sk_buff *skb, const struct hash_ipmark4_elem *data) { if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark))) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ipmark4_data_next(struct hash_ipmark4_elem *next, const struct hash_ipmark4_elem *d) { next->ip = d->ip; } #define MTYPE hash_ipmark4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" static int hash_ipmark4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_ipmark4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipmark4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); e.mark = skb->mark; e.mark &= h->markmask; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { struct hash_ipmark4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipmark4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip, ip_to = 0, i = 0; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_MARK))) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; e.mark = ntohl(nla_get_be32(tb[IPSET_ATTR_MARK])); e.mark &= h->markmask; if (e.mark == 0 && e.ip == 0) return -IPSET_ERR_HASH_ELEM; if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) { ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } ip_to = ip = ntohl(e.ip); if (tb[IPSET_ATTR_IP_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; if (ip > ip_to) { if (e.mark == 0 && ip_to == 0) return -IPSET_ERR_HASH_ELEM; swap(ip, ip_to); } } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } if (retried) ip = ntohl(h->next.ip); for (; ip <= ip_to; ip++, i++) { e.ip = htonl(ip); if (i > IPSET_MAX_RANGE) { hash_ipmark4_data_next(&h->next, &e); return -ERANGE; } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } return ret; } /* IPv6 variant */ struct hash_ipmark6_elem { union nf_inet_addr ip; __u32 mark; }; /* Common functions */ static bool hash_ipmark6_data_equal(const struct hash_ipmark6_elem *ip1, const struct hash_ipmark6_elem *ip2, u32 *multi) { return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && ip1->mark == ip2->mark; } static bool hash_ipmark6_data_list(struct sk_buff *skb, const struct hash_ipmark6_elem *data) { if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark))) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ipmark6_data_next(struct hash_ipmark6_elem *next, const struct hash_ipmark6_elem *d) { } #undef MTYPE #undef HOST_MASK #define MTYPE hash_ipmark6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" static int hash_ipmark6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_ipmark6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipmark6_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); e.mark = skb->mark; e.mark &= h->markmask; ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct hash_ipmark6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipmark6_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_MARK))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; if (unlikely(tb[IPSET_ATTR_CIDR])) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (cidr != HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; e.mark = ntohl(nla_get_be32(tb[IPSET_ATTR_MARK])); e.mark &= h->markmask; if (adt == IPSET_TEST) { ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; return 0; } static struct ip_set_type hash_ipmark_type __read_mostly = { .name = "hash:ip,mark", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_IP | IPSET_TYPE_MARK, .dimension = IPSET_DIM_TWO, .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ipmark_create, .create_policy = { [IPSET_ATTR_MARKMASK] = { .type = NLA_U32 }, [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_MARK] = { .type = NLA_U32 }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; static int __init hash_ipmark_init(void) { return ip_set_type_register(&hash_ipmark_type); } static void __exit hash_ipmark_fini(void) { rcu_barrier(); ip_set_type_unregister(&hash_ipmark_type); } module_init(hash_ipmark_init); module_exit(hash_ipmark_fini); |
| 11 11 8 11 1 5 5 3 3 1 1 1 1 1 1 1 1 1 308 8 2 11 6 6 4 8 45 16 8 17 72 4 3 39 84 28 60 14 8 22 2 3 2 3 43 1 1 1 1 6 6 6 6 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 | // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */ #include <linux/bpf.h> #include <linux/btf.h> #include <linux/bpf-cgroup.h> #include <linux/cgroup.h> #include <linux/rcupdate.h> #include <linux/random.h> #include <linux/smp.h> #include <linux/topology.h> #include <linux/ktime.h> #include <linux/sched.h> #include <linux/uidgid.h> #include <linux/filter.h> #include <linux/ctype.h> #include <linux/jiffies.h> #include <linux/pid_namespace.h> #include <linux/poison.h> #include <linux/proc_ns.h> #include <linux/sched/task.h> #include <linux/security.h> #include <linux/btf_ids.h> #include <linux/bpf_mem_alloc.h> #include <linux/kasan.h> #include <linux/bpf_verifier.h> #include <linux/uaccess.h> #include <linux/verification.h> #include <linux/task_work.h> #include <linux/irq_work.h> #include "../../lib/kstrtox.h" /* If kernel subsystem is allowing eBPF programs to call this function, * inside its own verifier_ops->get_func_proto() callback it should return * bpf_map_lookup_elem_proto, so that verifier can properly check the arguments * * Different map implementations will rely on rcu in map methods * lookup/update/delete, therefore eBPF programs must run under rcu lock * if program is allowed to access maps, so check rcu_read_lock_held() or * rcu_read_lock_trace_held() in all three functions. */ BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key) { WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && !rcu_read_lock_bh_held()); return (unsigned long) map->ops->map_lookup_elem(map, key); } const struct bpf_func_proto bpf_map_lookup_elem_proto = { .func = bpf_map_lookup_elem, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, }; BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, void *, value, u64, flags) { WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && !rcu_read_lock_bh_held()); return map->ops->map_update_elem(map, key, value, flags); } const struct bpf_func_proto bpf_map_update_elem_proto = { .func = bpf_map_update_elem, .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, .arg3_type = ARG_PTR_TO_MAP_VALUE, .arg4_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key) { WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && !rcu_read_lock_bh_held()); return map->ops->map_delete_elem(map, key); } const struct bpf_func_proto bpf_map_delete_elem_proto = { .func = bpf_map_delete_elem, .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, }; BPF_CALL_3(bpf_map_push_elem, struct bpf_map *, map, void *, value, u64, flags) { return map->ops->map_push_elem(map, value, flags); } const struct bpf_func_proto bpf_map_push_elem_proto = { .func = bpf_map_push_elem, .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_VALUE, .arg3_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_map_pop_elem, struct bpf_map *, map, void *, value) { return map->ops->map_pop_elem(map, value); } const struct bpf_func_proto bpf_map_pop_elem_proto = { .func = bpf_map_pop_elem, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE, }; BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value) { return map->ops->map_peek_elem(map, value); } const struct bpf_func_proto bpf_map_peek_elem_proto = { .func = bpf_map_peek_elem, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE, }; BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu) { WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && !rcu_read_lock_bh_held()); return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu); } const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto = { .func = bpf_map_lookup_percpu_elem, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, .arg3_type = ARG_ANYTHING, }; const struct bpf_func_proto bpf_get_prandom_u32_proto = { .func = bpf_user_rnd_u32, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_get_smp_processor_id) { return smp_processor_id(); } const struct bpf_func_proto bpf_get_smp_processor_id_proto = { .func = bpf_get_smp_processor_id, .gpl_only = false, .ret_type = RET_INTEGER, .allow_fastcall = true, }; BPF_CALL_0(bpf_get_numa_node_id) { return numa_node_id(); } const struct bpf_func_proto bpf_get_numa_node_id_proto = { .func = bpf_get_numa_node_id, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_ktime_get_ns) { /* NMI safe access to clock monotonic */ return ktime_get_mono_fast_ns(); } const struct bpf_func_proto bpf_ktime_get_ns_proto = { .func = bpf_ktime_get_ns, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_ktime_get_boot_ns) { /* NMI safe access to clock boottime */ return ktime_get_boot_fast_ns(); } const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = { .func = bpf_ktime_get_boot_ns, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_ktime_get_coarse_ns) { return ktime_get_coarse_ns(); } const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto = { .func = bpf_ktime_get_coarse_ns, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_ktime_get_tai_ns) { /* NMI safe access to clock tai */ return ktime_get_tai_fast_ns(); } const struct bpf_func_proto bpf_ktime_get_tai_ns_proto = { .func = bpf_ktime_get_tai_ns, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_get_current_pid_tgid) { struct task_struct *task = current; if (unlikely(!task)) return -EINVAL; return (u64) task->tgid << 32 | task->pid; } const struct bpf_func_proto bpf_get_current_pid_tgid_proto = { .func = bpf_get_current_pid_tgid, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_get_current_uid_gid) { struct task_struct *task = current; kuid_t uid; kgid_t gid; if (unlikely(!task)) return -EINVAL; current_uid_gid(&uid, &gid); return (u64) from_kgid(&init_user_ns, gid) << 32 | from_kuid(&init_user_ns, uid); } const struct bpf_func_proto bpf_get_current_uid_gid_proto = { .func = bpf_get_current_uid_gid, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size) { struct task_struct *task = current; if (unlikely(!task)) goto err_clear; /* Verifier guarantees that size > 0 */ strscpy_pad(buf, task->comm, size); return 0; err_clear: memset(buf, 0, size); return -EINVAL; } const struct bpf_func_proto bpf_get_current_comm_proto = { .func = bpf_get_current_comm, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE, }; #if defined(CONFIG_QUEUED_SPINLOCKS) || defined(CONFIG_BPF_ARCH_SPINLOCK) static inline void __bpf_spin_lock(struct bpf_spin_lock *lock) { arch_spinlock_t *l = (void *)lock; union { __u32 val; arch_spinlock_t lock; } u = { .lock = __ARCH_SPIN_LOCK_UNLOCKED }; compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0"); BUILD_BUG_ON(sizeof(*l) != sizeof(__u32)); BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32)); preempt_disable(); arch_spin_lock(l); } static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock) { arch_spinlock_t *l = (void *)lock; arch_spin_unlock(l); preempt_enable(); } #else static inline void __bpf_spin_lock(struct bpf_spin_lock *lock) { atomic_t *l = (void *)lock; BUILD_BUG_ON(sizeof(*l) != sizeof(*lock)); do { atomic_cond_read_relaxed(l, !VAL); } while (atomic_xchg(l, 1)); } static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock) { atomic_t *l = (void *)lock; atomic_set_release(l, 0); } #endif static DEFINE_PER_CPU(unsigned long, irqsave_flags); static inline void __bpf_spin_lock_irqsave(struct bpf_spin_lock *lock) { unsigned long flags; local_irq_save(flags); __bpf_spin_lock(lock); __this_cpu_write(irqsave_flags, flags); } NOTRACE_BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock) { __bpf_spin_lock_irqsave(lock); return 0; } const struct bpf_func_proto bpf_spin_lock_proto = { .func = bpf_spin_lock, .gpl_only = false, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_SPIN_LOCK, .arg1_btf_id = BPF_PTR_POISON, }; static inline void __bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock) { unsigned long flags; flags = __this_cpu_read(irqsave_flags); __bpf_spin_unlock(lock); local_irq_restore(flags); } NOTRACE_BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock) { __bpf_spin_unlock_irqrestore(lock); return 0; } const struct bpf_func_proto bpf_spin_unlock_proto = { .func = bpf_spin_unlock, .gpl_only = false, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_SPIN_LOCK, .arg1_btf_id = BPF_PTR_POISON, }; void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src) { struct bpf_spin_lock *lock; if (lock_src) lock = src + map->record->spin_lock_off; else lock = dst + map->record->spin_lock_off; preempt_disable(); __bpf_spin_lock_irqsave(lock); copy_map_value(map, dst, src); __bpf_spin_unlock_irqrestore(lock); preempt_enable(); } BPF_CALL_0(bpf_jiffies64) { return get_jiffies_64(); } const struct bpf_func_proto bpf_jiffies64_proto = { .func = bpf_jiffies64, .gpl_only = false, .ret_type = RET_INTEGER, }; #ifdef CONFIG_CGROUPS BPF_CALL_0(bpf_get_current_cgroup_id) { struct cgroup *cgrp; u64 cgrp_id; rcu_read_lock(); cgrp = task_dfl_cgroup(current); cgrp_id = cgroup_id(cgrp); rcu_read_unlock(); return cgrp_id; } const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { .func = bpf_get_current_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_1(bpf_get_current_ancestor_cgroup_id, int, ancestor_level) { struct cgroup *cgrp; struct cgroup *ancestor; u64 cgrp_id; rcu_read_lock(); cgrp = task_dfl_cgroup(current); ancestor = cgroup_ancestor(cgrp, ancestor_level); cgrp_id = ancestor ? cgroup_id(ancestor) : 0; rcu_read_unlock(); return cgrp_id; } const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = { .func = bpf_get_current_ancestor_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, }; #endif /* CONFIG_CGROUPS */ #define BPF_STRTOX_BASE_MASK 0x1F static int __bpf_strtoull(const char *buf, size_t buf_len, u64 flags, unsigned long long *res, bool *is_negative) { unsigned int base = flags & BPF_STRTOX_BASE_MASK; const char *cur_buf = buf; size_t cur_len = buf_len; unsigned int consumed; size_t val_len; char str[64]; if (!buf || !buf_len || !res || !is_negative) return -EINVAL; if (base != 0 && base != 8 && base != 10 && base != 16) return -EINVAL; if (flags & ~BPF_STRTOX_BASE_MASK) return -EINVAL; while (cur_buf < buf + buf_len && isspace(*cur_buf)) ++cur_buf; *is_negative = (cur_buf < buf + buf_len && *cur_buf == '-'); if (*is_negative) ++cur_buf; consumed = cur_buf - buf; cur_len -= consumed; if (!cur_len) return -EINVAL; cur_len = min(cur_len, sizeof(str) - 1); memcpy(str, cur_buf, cur_len); str[cur_len] = '\0'; cur_buf = str; cur_buf = _parse_integer_fixup_radix(cur_buf, &base); val_len = _parse_integer(cur_buf, base, res); if (val_len & KSTRTOX_OVERFLOW) return -ERANGE; if (val_len == 0) return -EINVAL; cur_buf += val_len; consumed += cur_buf - str; return consumed; } static int __bpf_strtoll(const char *buf, size_t buf_len, u64 flags, long long *res) { unsigned long long _res; bool is_negative; int err; err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative); if (err < 0) return err; if (is_negative) { if ((long long)-_res > 0) return -ERANGE; *res = -_res; } else { if ((long long)_res < 0) return -ERANGE; *res = _res; } return err; } BPF_CALL_4(bpf_strtol, const char *, buf, size_t, buf_len, u64, flags, s64 *, res) { long long _res; int err; *res = 0; err = __bpf_strtoll(buf, buf_len, flags, &_res); if (err < 0) return err; *res = _res; return err; } const struct bpf_func_proto bpf_strtol_proto = { .func = bpf_strtol, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg4_size = sizeof(s64), }; BPF_CALL_4(bpf_strtoul, const char *, buf, size_t, buf_len, u64, flags, u64 *, res) { unsigned long long _res; bool is_negative; int err; *res = 0; err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative); if (err < 0) return err; if (is_negative) return -EINVAL; *res = _res; return err; } const struct bpf_func_proto bpf_strtoul_proto = { .func = bpf_strtoul, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg4_size = sizeof(u64), }; BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2) { return strncmp(s1, s2, s1_sz); } static const struct bpf_func_proto bpf_strncmp_proto = { .func = bpf_strncmp, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_PTR_TO_CONST_STR, }; BPF_CALL_4(bpf_get_ns_current_pid_tgid, u64, dev, u64, ino, struct bpf_pidns_info *, nsdata, u32, size) { struct task_struct *task = current; struct pid_namespace *pidns; int err = -EINVAL; if (unlikely(size != sizeof(struct bpf_pidns_info))) goto clear; if (unlikely((u64)(dev_t)dev != dev)) goto clear; if (unlikely(!task)) goto clear; pidns = task_active_pid_ns(task); if (unlikely(!pidns)) { err = -ENOENT; goto clear; } if (!ns_match(&pidns->ns, (dev_t)dev, ino)) goto clear; nsdata->pid = task_pid_nr_ns(task, pidns); nsdata->tgid = task_tgid_nr_ns(task, pidns); return 0; clear: memset((void *)nsdata, 0, (size_t) size); return err; } const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto = { .func = bpf_get_ns_current_pid_tgid, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { .func = bpf_get_raw_cpu_id, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map, u64, flags, void *, data, u64, size) { if (unlikely(flags & ~(BPF_F_INDEX_MASK))) return -EINVAL; return bpf_event_output(map, flags, data, size, NULL, 0, NULL); } const struct bpf_func_proto bpf_event_output_data_proto = { .func = bpf_event_output_data, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_3(bpf_copy_from_user, void *, dst, u32, size, const void __user *, user_ptr) { int ret = copy_from_user(dst, user_ptr, size); if (unlikely(ret)) { memset(dst, 0, size); ret = -EFAULT; } return ret; } const struct bpf_func_proto bpf_copy_from_user_proto = { .func = bpf_copy_from_user, .gpl_only = false, .might_sleep = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size, const void __user *, user_ptr, struct task_struct *, tsk, u64, flags) { int ret; /* flags is not used yet */ if (unlikely(flags)) return -EINVAL; if (unlikely(!size)) return 0; ret = access_process_vm(tsk, (unsigned long)user_ptr, dst, size, 0); if (ret == size) return 0; memset(dst, 0, size); /* Return -EFAULT for partial read */ return ret < 0 ? ret : -EFAULT; } const struct bpf_func_proto bpf_copy_from_user_task_proto = { .func = bpf_copy_from_user_task, .gpl_only = true, .might_sleep = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_BTF_ID, .arg4_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], .arg5_type = ARG_ANYTHING }; BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu) { if (cpu >= nr_cpu_ids) return (unsigned long)NULL; return (unsigned long)per_cpu_ptr((const void __percpu *)(const uintptr_t)ptr, cpu); } const struct bpf_func_proto bpf_per_cpu_ptr_proto = { .func = bpf_per_cpu_ptr, .gpl_only = false, .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL | MEM_RDONLY, .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, .arg2_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr) { return (unsigned long)this_cpu_ptr((const void __percpu *)(const uintptr_t)percpu_ptr); } const struct bpf_func_proto bpf_this_cpu_ptr_proto = { .func = bpf_this_cpu_ptr, .gpl_only = false, .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | MEM_RDONLY, .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, }; static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, size_t bufsz) { void __user *user_ptr = (__force void __user *)unsafe_ptr; buf[0] = 0; switch (fmt_ptype) { case 's': #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE if ((unsigned long)unsafe_ptr < TASK_SIZE) return strncpy_from_user_nofault(buf, user_ptr, bufsz); fallthrough; #endif case 'k': return strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz); case 'u': return strncpy_from_user_nofault(buf, user_ptr, bufsz); } return -EINVAL; } /* Support executing three nested bprintf helper calls on a given CPU */ #define MAX_BPRINTF_NEST_LEVEL 3 static DEFINE_PER_CPU(struct bpf_bprintf_buffers[MAX_BPRINTF_NEST_LEVEL], bpf_bprintf_bufs); static DEFINE_PER_CPU(int, bpf_bprintf_nest_level); int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs) { int nest_level; nest_level = this_cpu_inc_return(bpf_bprintf_nest_level); if (WARN_ON_ONCE(nest_level > MAX_BPRINTF_NEST_LEVEL)) { this_cpu_dec(bpf_bprintf_nest_level); return -EBUSY; } *bufs = this_cpu_ptr(&bpf_bprintf_bufs[nest_level - 1]); return 0; } void bpf_put_buffers(void) { if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0)) return; this_cpu_dec(bpf_bprintf_nest_level); } void bpf_bprintf_cleanup(struct bpf_bprintf_data *data) { if (!data->bin_args && !data->buf) return; bpf_put_buffers(); } /* * bpf_bprintf_prepare - Generic pass on format strings for bprintf-like helpers * * Returns a negative value if fmt is an invalid format string or 0 otherwise. * * This can be used in two ways: * - Format string verification only: when data->get_bin_args is false * - Arguments preparation: in addition to the above verification, it writes in * data->bin_args a binary representation of arguments usable by bstr_printf * where pointers from BPF have been sanitized. * * In argument preparation mode, if 0 is returned, safe temporary buffers are * allocated and bpf_bprintf_cleanup should be called to free them after use. */ int bpf_bprintf_prepare(const char *fmt, u32 fmt_size, const u64 *raw_args, u32 num_args, struct bpf_bprintf_data *data) { bool get_buffers = (data->get_bin_args && num_args) || data->get_buf; char *unsafe_ptr = NULL, *tmp_buf = NULL, *tmp_buf_end, *fmt_end; struct bpf_bprintf_buffers *buffers = NULL; size_t sizeof_cur_arg, sizeof_cur_ip; int err, i, num_spec = 0; u64 cur_arg; char fmt_ptype, cur_ip[16], ip_spec[] = "%pXX"; fmt_end = strnchr(fmt, fmt_size, 0); if (!fmt_end) return -EINVAL; fmt_size = fmt_end - fmt; if (get_buffers && bpf_try_get_buffers(&buffers)) return -EBUSY; if (data->get_bin_args) { if (num_args) tmp_buf = buffers->bin_args; tmp_buf_end = tmp_buf + MAX_BPRINTF_BIN_ARGS; data->bin_args = (u32 *)tmp_buf; } if (data->get_buf) data->buf = buffers->buf; for (i = 0; i < fmt_size; i++) { if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) { err = -EINVAL; goto out; } if (fmt[i] != '%') continue; if (fmt[i + 1] == '%') { i++; continue; } if (num_spec >= num_args) { err = -EINVAL; goto out; } /* The string is zero-terminated so if fmt[i] != 0, we can * always access fmt[i + 1], in the worst case it will be a 0 */ i++; /* skip optional "[0 +-][num]" width formatting field */ while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' || fmt[i] == ' ') i++; if (fmt[i] >= '1' && fmt[i] <= '9') { i++; while (fmt[i] >= '0' && fmt[i] <= '9') i++; } if (fmt[i] == 'p') { sizeof_cur_arg = sizeof(long); if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) || ispunct(fmt[i + 1])) { if (tmp_buf) cur_arg = raw_args[num_spec]; goto nocopy_fmt; } if ((fmt[i + 1] == 'k' || fmt[i + 1] == 'u') && fmt[i + 2] == 's') { fmt_ptype = fmt[i + 1]; i += 2; goto fmt_str; } if (fmt[i + 1] == 'K' || fmt[i + 1] == 'x' || fmt[i + 1] == 's' || fmt[i + 1] == 'S') { if (tmp_buf) cur_arg = raw_args[num_spec]; i++; goto nocopy_fmt; } if (fmt[i + 1] == 'B') { if (tmp_buf) { err = snprintf(tmp_buf, (tmp_buf_end - tmp_buf), "%pB", (void *)(long)raw_args[num_spec]); tmp_buf += (err + 1); } i++; num_spec++; continue; } /* only support "%pI4", "%pi4", "%pI6" and "%pi6". */ if ((fmt[i + 1] != 'i' && fmt[i + 1] != 'I') || (fmt[i + 2] != '4' && fmt[i + 2] != '6')) { err = -EINVAL; goto out; } i += 2; if (!tmp_buf) goto nocopy_fmt; sizeof_cur_ip = (fmt[i] == '4') ? 4 : 16; if (tmp_buf_end - tmp_buf < sizeof_cur_ip) { err = -ENOSPC; goto out; } unsafe_ptr = (char *)(long)raw_args[num_spec]; err = copy_from_kernel_nofault(cur_ip, unsafe_ptr, sizeof_cur_ip); if (err < 0) memset(cur_ip, 0, sizeof_cur_ip); /* hack: bstr_printf expects IP addresses to be * pre-formatted as strings, ironically, the easiest way * to do that is to call snprintf. */ ip_spec[2] = fmt[i - 1]; ip_spec[3] = fmt[i]; err = snprintf(tmp_buf, tmp_buf_end - tmp_buf, ip_spec, &cur_ip); tmp_buf += err + 1; num_spec++; continue; } else if (fmt[i] == 's') { fmt_ptype = fmt[i]; fmt_str: if (fmt[i + 1] != 0 && !isspace(fmt[i + 1]) && !ispunct(fmt[i + 1])) { err = -EINVAL; goto out; } if (!tmp_buf) goto nocopy_fmt; if (tmp_buf_end == tmp_buf) { err = -ENOSPC; goto out; } unsafe_ptr = (char *)(long)raw_args[num_spec]; err = bpf_trace_copy_string(tmp_buf, unsafe_ptr, fmt_ptype, tmp_buf_end - tmp_buf); if (err < 0) { tmp_buf[0] = '\0'; err = 1; } tmp_buf += err; num_spec++; continue; } else if (fmt[i] == 'c') { if (!tmp_buf) goto nocopy_fmt; if (tmp_buf_end == tmp_buf) { err = -ENOSPC; goto out; } *tmp_buf = raw_args[num_spec]; tmp_buf++; num_spec++; continue; } sizeof_cur_arg = sizeof(int); if (fmt[i] == 'l') { sizeof_cur_arg = sizeof(long); i++; } if (fmt[i] == 'l') { sizeof_cur_arg = sizeof(long long); i++; } if (fmt[i] != 'i' && fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x' && fmt[i] != 'X') { err = -EINVAL; goto out; } if (tmp_buf) cur_arg = raw_args[num_spec]; nocopy_fmt: if (tmp_buf) { tmp_buf = PTR_ALIGN(tmp_buf, sizeof(u32)); if (tmp_buf_end - tmp_buf < sizeof_cur_arg) { err = -ENOSPC; goto out; } if (sizeof_cur_arg == 8) { *(u32 *)tmp_buf = *(u32 *)&cur_arg; *(u32 *)(tmp_buf + 4) = *((u32 *)&cur_arg + 1); } else { *(u32 *)tmp_buf = (u32)(long)cur_arg; } tmp_buf += sizeof_cur_arg; } num_spec++; } err = 0; out: if (err) bpf_bprintf_cleanup(data); return err; } BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt, const void *, args, u32, data_len) { struct bpf_bprintf_data data = { .get_bin_args = true, }; int err, num_args; if (data_len % 8 || data_len > MAX_BPRINTF_VARARGS * 8 || (data_len && !args)) return -EINVAL; num_args = data_len / 8; /* ARG_PTR_TO_CONST_STR guarantees that fmt is zero-terminated so we * can safely give an unbounded size. */ err = bpf_bprintf_prepare(fmt, UINT_MAX, args, num_args, &data); if (err < 0) return err; err = bstr_printf(str, str_size, fmt, data.bin_args); bpf_bprintf_cleanup(&data); return err + 1; } const struct bpf_func_proto bpf_snprintf_proto = { .func = bpf_snprintf, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM_OR_NULL, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_PTR_TO_CONST_STR, .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; static void *map_key_from_value(struct bpf_map *map, void *value, u32 *arr_idx) { if (map->map_type == BPF_MAP_TYPE_ARRAY) { struct bpf_array *array = container_of(map, struct bpf_array, map); *arr_idx = ((char *)value - array->value) / array->elem_size; return arr_idx; } return (void *)value - round_up(map->key_size, 8); } struct bpf_async_cb { struct bpf_map *map; struct bpf_prog *prog; void __rcu *callback_fn; void *value; union { struct rcu_head rcu; struct work_struct delete_work; }; u64 flags; }; /* BPF map elements can contain 'struct bpf_timer'. * Such map owns all of its BPF timers. * 'struct bpf_timer' is allocated as part of map element allocation * and it's zero initialized. * That space is used to keep 'struct bpf_async_kern'. * bpf_timer_init() allocates 'struct bpf_hrtimer', inits hrtimer, and * remembers 'struct bpf_map *' pointer it's part of. * bpf_timer_set_callback() increments prog refcnt and assign bpf callback_fn. * bpf_timer_start() arms the timer. * If user space reference to a map goes to zero at this point * ops->map_release_uref callback is responsible for cancelling the timers, * freeing their memory, and decrementing prog's refcnts. * bpf_timer_cancel() cancels the timer and decrements prog's refcnt. * Inner maps can contain bpf timers as well. ops->map_release_uref is * freeing the timers when inner map is replaced or deleted by user space. */ struct bpf_hrtimer { struct bpf_async_cb cb; struct hrtimer timer; atomic_t cancelling; }; struct bpf_work { struct bpf_async_cb cb; struct work_struct work; struct work_struct delete_work; }; /* the actual struct hidden inside uapi struct bpf_timer and bpf_wq */ struct bpf_async_kern { union { struct bpf_async_cb *cb; struct bpf_hrtimer *timer; struct bpf_work *work; }; /* bpf_spin_lock is used here instead of spinlock_t to make * sure that it always fits into space reserved by struct bpf_timer * regardless of LOCKDEP and spinlock debug flags. */ struct bpf_spin_lock lock; } __attribute__((aligned(8))); enum bpf_async_type { BPF_ASYNC_TYPE_TIMER = 0, BPF_ASYNC_TYPE_WQ, }; static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running); static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer) { struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer); struct bpf_map *map = t->cb.map; void *value = t->cb.value; bpf_callback_t callback_fn; void *key; u32 idx; BTF_TYPE_EMIT(struct bpf_timer); callback_fn = rcu_dereference_check(t->cb.callback_fn, rcu_read_lock_bh_held()); if (!callback_fn) goto out; /* bpf_timer_cb() runs in hrtimer_run_softirq. It doesn't migrate and * cannot be preempted by another bpf_timer_cb() on the same cpu. * Remember the timer this callback is servicing to prevent * deadlock if callback_fn() calls bpf_timer_cancel() or * bpf_map_delete_elem() on the same timer. */ this_cpu_write(hrtimer_running, t); key = map_key_from_value(map, value, &idx); callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0); /* The verifier checked that return value is zero. */ this_cpu_write(hrtimer_running, NULL); out: return HRTIMER_NORESTART; } static void bpf_wq_work(struct work_struct *work) { struct bpf_work *w = container_of(work, struct bpf_work, work); struct bpf_async_cb *cb = &w->cb; struct bpf_map *map = cb->map; bpf_callback_t callback_fn; void *value = cb->value; void *key; u32 idx; BTF_TYPE_EMIT(struct bpf_wq); callback_fn = READ_ONCE(cb->callback_fn); if (!callback_fn) return; key = map_key_from_value(map, value, &idx); rcu_read_lock_trace(); migrate_disable(); callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0); migrate_enable(); rcu_read_unlock_trace(); } static void bpf_wq_delete_work(struct work_struct *work) { struct bpf_work *w = container_of(work, struct bpf_work, delete_work); cancel_work_sync(&w->work); kfree_rcu(w, cb.rcu); } static void bpf_timer_delete_work(struct work_struct *work) { struct bpf_hrtimer *t = container_of(work, struct bpf_hrtimer, cb.delete_work); /* Cancel the timer and wait for callback to complete if it was running. * If hrtimer_cancel() can be safely called it's safe to call * kfree_rcu(t) right after for both preallocated and non-preallocated * maps. The async->cb = NULL was already done and no code path can see * address 't' anymore. Timer if armed for existing bpf_hrtimer before * bpf_timer_cancel_and_free will have been cancelled. */ hrtimer_cancel(&t->timer); kfree_rcu(t, cb.rcu); } static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags, enum bpf_async_type type) { struct bpf_async_cb *cb; struct bpf_hrtimer *t; struct bpf_work *w; clockid_t clockid; size_t size; int ret = 0; if (in_nmi()) return -EOPNOTSUPP; switch (type) { case BPF_ASYNC_TYPE_TIMER: size = sizeof(struct bpf_hrtimer); break; case BPF_ASYNC_TYPE_WQ: size = sizeof(struct bpf_work); break; default: return -EINVAL; } __bpf_spin_lock_irqsave(&async->lock); t = async->timer; if (t) { ret = -EBUSY; goto out; } /* Allocate via bpf_map_kmalloc_node() for memcg accounting. Until * kmalloc_nolock() is available, avoid locking issues by using * __GFP_HIGH (GFP_ATOMIC & ~__GFP_RECLAIM). */ cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node); if (!cb) { ret = -ENOMEM; goto out; } switch (type) { case BPF_ASYNC_TYPE_TIMER: clockid = flags & (MAX_CLOCKS - 1); t = (struct bpf_hrtimer *)cb; atomic_set(&t->cancelling, 0); INIT_WORK(&t->cb.delete_work, bpf_timer_delete_work); hrtimer_setup(&t->timer, bpf_timer_cb, clockid, HRTIMER_MODE_REL_SOFT); cb->value = (void *)async - map->record->timer_off; break; case BPF_ASYNC_TYPE_WQ: w = (struct bpf_work *)cb; INIT_WORK(&w->work, bpf_wq_work); INIT_WORK(&w->delete_work, bpf_wq_delete_work); cb->value = (void *)async - map->record->wq_off; break; } cb->map = map; cb->prog = NULL; cb->flags = flags; rcu_assign_pointer(cb->callback_fn, NULL); WRITE_ONCE(async->cb, cb); /* Guarantee the order between async->cb and map->usercnt. So * when there are concurrent uref release and bpf timer init, either * bpf_timer_cancel_and_free() called by uref release reads a no-NULL * timer or atomic64_read() below returns a zero usercnt. */ smp_mb(); if (!atomic64_read(&map->usercnt)) { /* maps with timers must be either held by user space * or pinned in bpffs. */ WRITE_ONCE(async->cb, NULL); kfree(cb); ret = -EPERM; } out: __bpf_spin_unlock_irqrestore(&async->lock); return ret; } BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map, u64, flags) { clock_t clockid = flags & (MAX_CLOCKS - 1); BUILD_BUG_ON(MAX_CLOCKS != 16); BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_timer)); BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_timer)); if (flags >= MAX_CLOCKS || /* similar to timerfd except _ALARM variants are not supported */ (clockid != CLOCK_MONOTONIC && clockid != CLOCK_REALTIME && clockid != CLOCK_BOOTTIME)) return -EINVAL; return __bpf_async_init(timer, map, flags, BPF_ASYNC_TYPE_TIMER); } static const struct bpf_func_proto bpf_timer_init_proto = { .func = bpf_timer_init, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_TIMER, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, }; static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn, struct bpf_prog_aux *aux, unsigned int flags, enum bpf_async_type type) { struct bpf_prog *prev, *prog = aux->prog; struct bpf_async_cb *cb; int ret = 0; if (in_nmi()) return -EOPNOTSUPP; __bpf_spin_lock_irqsave(&async->lock); cb = async->cb; if (!cb) { ret = -EINVAL; goto out; } if (!atomic64_read(&cb->map->usercnt)) { /* maps with timers must be either held by user space * or pinned in bpffs. Otherwise timer might still be * running even when bpf prog is detached and user space * is gone, since map_release_uref won't ever be called. */ ret = -EPERM; goto out; } prev = cb->prog; if (prev != prog) { /* Bump prog refcnt once. Every bpf_timer_set_callback() * can pick different callback_fn-s within the same prog. */ prog = bpf_prog_inc_not_zero(prog); if (IS_ERR(prog)) { ret = PTR_ERR(prog); goto out; } if (prev) /* Drop prev prog refcnt when swapping with new prog */ bpf_prog_put(prev); cb->prog = prog; } rcu_assign_pointer(cb->callback_fn, callback_fn); out: __bpf_spin_unlock_irqrestore(&async->lock); return ret; } BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn, struct bpf_prog_aux *, aux) { return __bpf_async_set_callback(timer, callback_fn, aux, 0, BPF_ASYNC_TYPE_TIMER); } static const struct bpf_func_proto bpf_timer_set_callback_proto = { .func = bpf_timer_set_callback, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_TIMER, .arg2_type = ARG_PTR_TO_FUNC, }; BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, flags) { struct bpf_hrtimer *t; int ret = 0; enum hrtimer_mode mode; if (in_nmi()) return -EOPNOTSUPP; if (flags & ~(BPF_F_TIMER_ABS | BPF_F_TIMER_CPU_PIN)) return -EINVAL; __bpf_spin_lock_irqsave(&timer->lock); t = timer->timer; if (!t || !t->cb.prog) { ret = -EINVAL; goto out; } if (flags & BPF_F_TIMER_ABS) mode = HRTIMER_MODE_ABS_SOFT; else mode = HRTIMER_MODE_REL_SOFT; if (flags & BPF_F_TIMER_CPU_PIN) mode |= HRTIMER_MODE_PINNED; hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode); out: __bpf_spin_unlock_irqrestore(&timer->lock); return ret; } static const struct bpf_func_proto bpf_timer_start_proto = { .func = bpf_timer_start, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_TIMER, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; static void drop_prog_refcnt(struct bpf_async_cb *async) { struct bpf_prog *prog = async->prog; if (prog) { bpf_prog_put(prog); async->prog = NULL; rcu_assign_pointer(async->callback_fn, NULL); } } BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) { struct bpf_hrtimer *t, *cur_t; bool inc = false; int ret = 0; if (in_nmi()) return -EOPNOTSUPP; rcu_read_lock(); __bpf_spin_lock_irqsave(&timer->lock); t = timer->timer; if (!t) { ret = -EINVAL; goto out; } cur_t = this_cpu_read(hrtimer_running); if (cur_t == t) { /* If bpf callback_fn is trying to bpf_timer_cancel() * its own timer the hrtimer_cancel() will deadlock * since it waits for callback_fn to finish. */ ret = -EDEADLK; goto out; } /* Only account in-flight cancellations when invoked from a timer * callback, since we want to avoid waiting only if other _callbacks_ * are waiting on us, to avoid introducing lockups. Non-callback paths * are ok, since nobody would synchronously wait for their completion. */ if (!cur_t) goto drop; atomic_inc(&t->cancelling); /* Need full barrier after relaxed atomic_inc */ smp_mb__after_atomic(); inc = true; if (atomic_read(&cur_t->cancelling)) { /* We're cancelling timer t, while some other timer callback is * attempting to cancel us. In such a case, it might be possible * that timer t belongs to the other callback, or some other * callback waiting upon it (creating transitive dependencies * upon us), and we will enter a deadlock if we continue * cancelling and waiting for it synchronously, since it might * do the same. Bail! */ ret = -EDEADLK; goto out; } drop: drop_prog_refcnt(&t->cb); out: __bpf_spin_unlock_irqrestore(&timer->lock); /* Cancel the timer and wait for associated callback to finish * if it was running. */ ret = ret ?: hrtimer_cancel(&t->timer); if (inc) atomic_dec(&t->cancelling); rcu_read_unlock(); return ret; } static const struct bpf_func_proto bpf_timer_cancel_proto = { .func = bpf_timer_cancel, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_TIMER, }; static struct bpf_async_cb *__bpf_async_cancel_and_free(struct bpf_async_kern *async) { struct bpf_async_cb *cb; /* Performance optimization: read async->cb without lock first. */ if (!READ_ONCE(async->cb)) return NULL; __bpf_spin_lock_irqsave(&async->lock); /* re-read it under lock */ cb = async->cb; if (!cb) goto out; drop_prog_refcnt(cb); /* The subsequent bpf_timer_start/cancel() helpers won't be able to use * this timer, since it won't be initialized. */ WRITE_ONCE(async->cb, NULL); out: __bpf_spin_unlock_irqrestore(&async->lock); return cb; } /* This function is called by map_delete/update_elem for individual element and * by ops->map_release_uref when the user space reference to a map reaches zero. */ void bpf_timer_cancel_and_free(void *val) { struct bpf_hrtimer *t; t = (struct bpf_hrtimer *)__bpf_async_cancel_and_free(val); if (!t) return; /* We check that bpf_map_delete/update_elem() was called from timer * callback_fn. In such case we don't call hrtimer_cancel() (since it * will deadlock) and don't call hrtimer_try_to_cancel() (since it will * just return -1). Though callback_fn is still running on this cpu it's * safe to do kfree(t) because bpf_timer_cb() read everything it needed * from 't'. The bpf subprog callback_fn won't be able to access 't', * since async->cb = NULL was already done. The timer will be * effectively cancelled because bpf_timer_cb() will return * HRTIMER_NORESTART. * * However, it is possible the timer callback_fn calling us armed the * timer _before_ calling us, such that failing to cancel it here will * cause it to possibly use struct hrtimer after freeing bpf_hrtimer. * Therefore, we _need_ to cancel any outstanding timers before we do * kfree_rcu, even though no more timers can be armed. * * Moreover, we need to schedule work even if timer does not belong to * the calling callback_fn, as on two different CPUs, we can end up in a * situation where both sides run in parallel, try to cancel one * another, and we end up waiting on both sides in hrtimer_cancel * without making forward progress, since timer1 depends on time2 * callback to finish, and vice versa. * * CPU 1 (timer1_cb) CPU 2 (timer2_cb) * bpf_timer_cancel_and_free(timer2) bpf_timer_cancel_and_free(timer1) * * To avoid these issues, punt to workqueue context when we are in a * timer callback. */ if (this_cpu_read(hrtimer_running)) { queue_work(system_dfl_wq, &t->cb.delete_work); return; } if (IS_ENABLED(CONFIG_PREEMPT_RT)) { /* If the timer is running on other CPU, also use a kworker to * wait for the completion of the timer instead of trying to * acquire a sleepable lock in hrtimer_cancel() to wait for its * completion. */ if (hrtimer_try_to_cancel(&t->timer) >= 0) kfree_rcu(t, cb.rcu); else queue_work(system_dfl_wq, &t->cb.delete_work); } else { bpf_timer_delete_work(&t->cb.delete_work); } } /* This function is called by map_delete/update_elem for individual element and * by ops->map_release_uref when the user space reference to a map reaches zero. */ void bpf_wq_cancel_and_free(void *val) { struct bpf_work *work; BTF_TYPE_EMIT(struct bpf_wq); work = (struct bpf_work *)__bpf_async_cancel_and_free(val); if (!work) return; /* Trigger cancel of the sleepable work, but *do not* wait for * it to finish if it was running as we might not be in a * sleepable context. * kfree will be called once the work has finished. */ schedule_work(&work->delete_work); } BPF_CALL_2(bpf_kptr_xchg, void *, dst, void *, ptr) { unsigned long *kptr = dst; /* This helper may be inlined by verifier. */ return xchg(kptr, (unsigned long)ptr); } /* Unlike other PTR_TO_BTF_ID helpers the btf_id in bpf_kptr_xchg() * helper is determined dynamically by the verifier. Use BPF_PTR_POISON to * denote type that verifier will determine. */ static const struct bpf_func_proto bpf_kptr_xchg_proto = { .func = bpf_kptr_xchg, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .ret_btf_id = BPF_PTR_POISON, .arg1_type = ARG_KPTR_XCHG_DEST, .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL | OBJ_RELEASE, .arg2_btf_id = BPF_PTR_POISON, }; /* Since the upper 8 bits of dynptr->size is reserved, the * maximum supported size is 2^24 - 1. */ #define DYNPTR_MAX_SIZE ((1UL << 24) - 1) #define DYNPTR_TYPE_SHIFT 28 #define DYNPTR_SIZE_MASK 0xFFFFFF #define DYNPTR_RDONLY_BIT BIT(31) bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_RDONLY_BIT; } void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr) { ptr->size |= DYNPTR_RDONLY_BIT; } static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type) { ptr->size |= type << DYNPTR_TYPE_SHIFT; } static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *ptr) { return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT; } u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_SIZE_MASK; } static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u32 new_size) { u32 metadata = ptr->size & ~DYNPTR_SIZE_MASK; ptr->size = new_size | metadata; } int bpf_dynptr_check_size(u32 size) { return size > DYNPTR_MAX_SIZE ? -E2BIG : 0; } void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type, u32 offset, u32 size) { ptr->data = data; ptr->offset = offset; ptr->size = size; bpf_dynptr_set_type(ptr, type); } void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) { memset(ptr, 0, sizeof(*ptr)); } BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr) { int err; BTF_TYPE_EMIT(struct bpf_dynptr); err = bpf_dynptr_check_size(size); if (err) goto error; /* flags is currently unsupported */ if (flags) { err = -EINVAL; goto error; } bpf_dynptr_init(ptr, data, BPF_DYNPTR_TYPE_LOCAL, 0, size); return 0; error: bpf_dynptr_set_null(ptr); return err; } static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .func = bpf_dynptr_from_mem, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT | MEM_WRITE, }; static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *src, u32 offset, u64 flags) { enum bpf_dynptr_type type; int err; if (!src->data || flags) return -EINVAL; err = bpf_dynptr_check_off_len(src, offset, len); if (err) return err; type = bpf_dynptr_get_type(src); switch (type) { case BPF_DYNPTR_TYPE_LOCAL: case BPF_DYNPTR_TYPE_RINGBUF: /* Source and destination may possibly overlap, hence use memmove to * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr * pointing to overlapping PTR_TO_MAP_VALUE regions. */ memmove(dst, src->data + src->offset + offset, len); return 0; case BPF_DYNPTR_TYPE_SKB: return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len); case BPF_DYNPTR_TYPE_XDP: return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len); case BPF_DYNPTR_TYPE_SKB_META: memmove(dst, bpf_skb_meta_pointer(src->data, src->offset + offset), len); return 0; default: WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type); return -EFAULT; } } BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, u32, offset, u64, flags) { return __bpf_dynptr_read(dst, len, src, offset, flags); } static const struct bpf_func_proto bpf_dynptr_read_proto = { .func = bpf_dynptr_read, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src, u32 len, u64 flags) { enum bpf_dynptr_type type; int err; if (!dst->data || __bpf_dynptr_is_rdonly(dst)) return -EINVAL; err = bpf_dynptr_check_off_len(dst, offset, len); if (err) return err; type = bpf_dynptr_get_type(dst); switch (type) { case BPF_DYNPTR_TYPE_LOCAL: case BPF_DYNPTR_TYPE_RINGBUF: if (flags) return -EINVAL; /* Source and destination may possibly overlap, hence use memmove to * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr * pointing to overlapping PTR_TO_MAP_VALUE regions. */ memmove(dst->data + dst->offset + offset, src, len); return 0; case BPF_DYNPTR_TYPE_SKB: return __bpf_skb_store_bytes(dst->data, dst->offset + offset, src, len, flags); case BPF_DYNPTR_TYPE_XDP: if (flags) return -EINVAL; return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len); case BPF_DYNPTR_TYPE_SKB_META: if (flags) return -EINVAL; memmove(bpf_skb_meta_pointer(dst->data, dst->offset + offset), src, len); return 0; default: WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type); return -EFAULT; } } BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src, u32, len, u64, flags) { return __bpf_dynptr_write(dst, offset, src, len, flags); } static const struct bpf_func_proto bpf_dynptr_write_proto = { .func = bpf_dynptr_write, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, .arg5_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) { enum bpf_dynptr_type type; int err; if (!ptr->data) return 0; err = bpf_dynptr_check_off_len(ptr, offset, len); if (err) return 0; if (__bpf_dynptr_is_rdonly(ptr)) return 0; type = bpf_dynptr_get_type(ptr); switch (type) { case BPF_DYNPTR_TYPE_LOCAL: case BPF_DYNPTR_TYPE_RINGBUF: return (unsigned long)(ptr->data + ptr->offset + offset); case BPF_DYNPTR_TYPE_SKB: case BPF_DYNPTR_TYPE_XDP: case BPF_DYNPTR_TYPE_SKB_META: /* skb and xdp dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */ return 0; default: WARN_ONCE(true, "bpf_dynptr_data: unknown dynptr type %d\n", type); return 0; } } static const struct bpf_func_proto bpf_dynptr_data_proto = { .func = bpf_dynptr_data, .gpl_only = false, .ret_type = RET_PTR_TO_DYNPTR_MEM_OR_NULL, .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, }; const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_get_current_task_btf_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak; const struct bpf_func_proto bpf_task_pt_regs_proto __weak; const struct bpf_func_proto bpf_perf_event_read_proto __weak; const struct bpf_func_proto bpf_send_signal_proto __weak; const struct bpf_func_proto bpf_send_signal_thread_proto __weak; const struct bpf_func_proto bpf_get_task_stack_sleepable_proto __weak; const struct bpf_func_proto bpf_get_task_stack_proto __weak; const struct bpf_func_proto bpf_get_branch_snapshot_proto __weak; const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_map_lookup_elem: return &bpf_map_lookup_elem_proto; case BPF_FUNC_map_update_elem: return &bpf_map_update_elem_proto; case BPF_FUNC_map_delete_elem: return &bpf_map_delete_elem_proto; case BPF_FUNC_map_push_elem: return &bpf_map_push_elem_proto; case BPF_FUNC_map_pop_elem: return &bpf_map_pop_elem_proto; case BPF_FUNC_map_peek_elem: return &bpf_map_peek_elem_proto; case BPF_FUNC_map_lookup_percpu_elem: return &bpf_map_lookup_percpu_elem_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_raw_smp_processor_id_proto; case BPF_FUNC_get_numa_node_id: return &bpf_get_numa_node_id_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; case BPF_FUNC_ktime_get_tai_ns: return &bpf_ktime_get_tai_ns_proto; case BPF_FUNC_ringbuf_output: return &bpf_ringbuf_output_proto; case BPF_FUNC_ringbuf_reserve: return &bpf_ringbuf_reserve_proto; case BPF_FUNC_ringbuf_submit: return &bpf_ringbuf_submit_proto; case BPF_FUNC_ringbuf_discard: return &bpf_ringbuf_discard_proto; case BPF_FUNC_ringbuf_query: return &bpf_ringbuf_query_proto; case BPF_FUNC_strncmp: return &bpf_strncmp_proto; case BPF_FUNC_strtol: return &bpf_strtol_proto; case BPF_FUNC_strtoul: return &bpf_strtoul_proto; case BPF_FUNC_get_current_pid_tgid: return &bpf_get_current_pid_tgid_proto; case BPF_FUNC_get_ns_current_pid_tgid: return &bpf_get_ns_current_pid_tgid_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; default: break; } if (!bpf_token_capable(prog->aux->token, CAP_BPF)) return NULL; switch (func_id) { case BPF_FUNC_spin_lock: return &bpf_spin_lock_proto; case BPF_FUNC_spin_unlock: return &bpf_spin_unlock_proto; case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; case BPF_FUNC_per_cpu_ptr: return &bpf_per_cpu_ptr_proto; case BPF_FUNC_this_cpu_ptr: return &bpf_this_cpu_ptr_proto; case BPF_FUNC_timer_init: return &bpf_timer_init_proto; case BPF_FUNC_timer_set_callback: return &bpf_timer_set_callback_proto; case BPF_FUNC_timer_start: return &bpf_timer_start_proto; case BPF_FUNC_timer_cancel: return &bpf_timer_cancel_proto; case BPF_FUNC_kptr_xchg: return &bpf_kptr_xchg_proto; case BPF_FUNC_for_each_map_elem: return &bpf_for_each_map_elem_proto; case BPF_FUNC_loop: return &bpf_loop_proto; case BPF_FUNC_user_ringbuf_drain: return &bpf_user_ringbuf_drain_proto; case BPF_FUNC_ringbuf_reserve_dynptr: return &bpf_ringbuf_reserve_dynptr_proto; case BPF_FUNC_ringbuf_submit_dynptr: return &bpf_ringbuf_submit_dynptr_proto; case BPF_FUNC_ringbuf_discard_dynptr: return &bpf_ringbuf_discard_dynptr_proto; case BPF_FUNC_dynptr_from_mem: return &bpf_dynptr_from_mem_proto; case BPF_FUNC_dynptr_read: return &bpf_dynptr_read_proto; case BPF_FUNC_dynptr_write: return &bpf_dynptr_write_proto; case BPF_FUNC_dynptr_data: return &bpf_dynptr_data_proto; #ifdef CONFIG_CGROUPS case BPF_FUNC_cgrp_storage_get: return &bpf_cgrp_storage_get_proto; case BPF_FUNC_cgrp_storage_delete: return &bpf_cgrp_storage_delete_proto; case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; case BPF_FUNC_get_current_ancestor_cgroup_id: return &bpf_get_current_ancestor_cgroup_id_proto; case BPF_FUNC_current_task_under_cgroup: return &bpf_current_task_under_cgroup_proto; #endif #ifdef CONFIG_CGROUP_NET_CLASSID case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_curr_proto; #endif case BPF_FUNC_task_storage_get: if (bpf_prog_check_recur(prog)) return &bpf_task_storage_get_recur_proto; return &bpf_task_storage_get_proto; case BPF_FUNC_task_storage_delete: if (bpf_prog_check_recur(prog)) return &bpf_task_storage_delete_recur_proto; return &bpf_task_storage_delete_proto; default: break; } if (!bpf_token_capable(prog->aux->token, CAP_PERFMON)) return NULL; switch (func_id) { case BPF_FUNC_trace_printk: return bpf_get_trace_printk_proto(); case BPF_FUNC_get_current_task: return &bpf_get_current_task_proto; case BPF_FUNC_get_current_task_btf: return &bpf_get_current_task_btf_proto; case BPF_FUNC_get_current_comm: return &bpf_get_current_comm_proto; case BPF_FUNC_probe_read_user: return &bpf_probe_read_user_proto; case BPF_FUNC_probe_read_kernel: return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_kernel_proto; case BPF_FUNC_probe_read_user_str: return &bpf_probe_read_user_str_proto; case BPF_FUNC_probe_read_kernel_str: return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_kernel_str_proto; case BPF_FUNC_copy_from_user: return &bpf_copy_from_user_proto; case BPF_FUNC_copy_from_user_task: return &bpf_copy_from_user_task_proto; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; case BPF_FUNC_snprintf: return &bpf_snprintf_proto; case BPF_FUNC_task_pt_regs: return &bpf_task_pt_regs_proto; case BPF_FUNC_trace_vprintk: return bpf_get_trace_vprintk_proto(); case BPF_FUNC_perf_event_read_value: return bpf_get_perf_event_read_value_proto(); case BPF_FUNC_perf_event_read: return &bpf_perf_event_read_proto; case BPF_FUNC_send_signal: return &bpf_send_signal_proto; case BPF_FUNC_send_signal_thread: return &bpf_send_signal_thread_proto; case BPF_FUNC_get_task_stack: return prog->sleepable ? &bpf_get_task_stack_sleepable_proto : &bpf_get_task_stack_proto; case BPF_FUNC_get_branch_snapshot: return &bpf_get_branch_snapshot_proto; case BPF_FUNC_find_vma: return &bpf_find_vma_proto; default: return NULL; } } EXPORT_SYMBOL_GPL(bpf_base_func_proto); void bpf_list_head_free(const struct btf_field *field, void *list_head, struct bpf_spin_lock *spin_lock) { struct list_head *head = list_head, *orig_head = list_head; BUILD_BUG_ON(sizeof(struct list_head) > sizeof(struct bpf_list_head)); BUILD_BUG_ON(__alignof__(struct list_head) > __alignof__(struct bpf_list_head)); /* Do the actual list draining outside the lock to not hold the lock for * too long, and also prevent deadlocks if tracing programs end up * executing on entry/exit of functions called inside the critical * section, and end up doing map ops that call bpf_list_head_free for * the same map value again. */ __bpf_spin_lock_irqsave(spin_lock); if (!head->next || list_empty(head)) goto unlock; head = head->next; unlock: INIT_LIST_HEAD(orig_head); __bpf_spin_unlock_irqrestore(spin_lock); while (head != orig_head) { void *obj = head; obj -= field->graph_root.node_offset; head = head->next; /* The contained type can also have resources, including a * bpf_list_head which needs to be freed. */ __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false); } } /* Like rbtree_postorder_for_each_entry_safe, but 'pos' and 'n' are * 'rb_node *', so field name of rb_node within containing struct is not * needed. * * Since bpf_rb_tree's node type has a corresponding struct btf_field with * graph_root.node_offset, it's not necessary to know field name * or type of node struct */ #define bpf_rbtree_postorder_for_each_entry_safe(pos, n, root) \ for (pos = rb_first_postorder(root); \ pos && ({ n = rb_next_postorder(pos); 1; }); \ pos = n) void bpf_rb_root_free(const struct btf_field *field, void *rb_root, struct bpf_spin_lock *spin_lock) { struct rb_root_cached orig_root, *root = rb_root; struct rb_node *pos, *n; void *obj; BUILD_BUG_ON(sizeof(struct rb_root_cached) > sizeof(struct bpf_rb_root)); BUILD_BUG_ON(__alignof__(struct rb_root_cached) > __alignof__(struct bpf_rb_root)); __bpf_spin_lock_irqsave(spin_lock); orig_root = *root; *root = RB_ROOT_CACHED; __bpf_spin_unlock_irqrestore(spin_lock); bpf_rbtree_postorder_for_each_entry_safe(pos, n, &orig_root.rb_root) { obj = pos; obj -= field->graph_root.node_offset; __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false); } } __bpf_kfunc_start_defs(); __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign) { struct btf_struct_meta *meta = meta__ign; u64 size = local_type_id__k; void *p; p = bpf_mem_alloc(&bpf_global_ma, size); if (!p) return NULL; if (meta) bpf_obj_init(meta->record, p); return p; } __bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign) { u64 size = local_type_id__k; /* The verifier has ensured that meta__ign must be NULL */ return bpf_mem_alloc(&bpf_global_percpu_ma, size); } /* Must be called under migrate_disable(), as required by bpf_mem_free */ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu) { struct bpf_mem_alloc *ma; if (rec && rec->refcount_off >= 0 && !refcount_dec_and_test((refcount_t *)(p + rec->refcount_off))) { /* Object is refcounted and refcount_dec didn't result in 0 * refcount. Return without freeing the object */ return; } if (rec) bpf_obj_free_fields(rec, p); if (percpu) ma = &bpf_global_percpu_ma; else ma = &bpf_global_ma; bpf_mem_free_rcu(ma, p); } __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign) { struct btf_struct_meta *meta = meta__ign; void *p = p__alloc; __bpf_obj_drop_impl(p, meta ? meta->record : NULL, false); } __bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign) { /* The verifier has ensured that meta__ign must be NULL */ bpf_mem_free_rcu(&bpf_global_percpu_ma, p__alloc); } __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign) { struct btf_struct_meta *meta = meta__ign; struct bpf_refcount *ref; /* Could just cast directly to refcount_t *, but need some code using * bpf_refcount type so that it is emitted in vmlinux BTF */ ref = (struct bpf_refcount *)(p__refcounted_kptr + meta->record->refcount_off); if (!refcount_inc_not_zero((refcount_t *)ref)) return NULL; /* Verifier strips KF_RET_NULL if input is owned ref, see is_kfunc_ret_null * in verifier.c */ return (void *)p__refcounted_kptr; } static int __bpf_list_add(struct bpf_list_node_kern *node, struct bpf_list_head *head, bool tail, struct btf_record *rec, u64 off) { struct list_head *n = &node->list_head, *h = (void *)head; /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't * called on its fields, so init here */ if (unlikely(!h->next)) INIT_LIST_HEAD(h); /* node->owner != NULL implies !list_empty(n), no need to separately * check the latter */ if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) { /* Only called from BPF prog, no need to migrate_disable */ __bpf_obj_drop_impl((void *)n - off, rec, false); return -EINVAL; } tail ? list_add_tail(n, h) : list_add(n, h); WRITE_ONCE(node->owner, head); return 0; } __bpf_kfunc int bpf_list_push_front_impl(struct bpf_list_head *head, struct bpf_list_node *node, void *meta__ign, u64 off) { struct bpf_list_node_kern *n = (void *)node; struct btf_struct_meta *meta = meta__ign; return __bpf_list_add(n, head, false, meta ? meta->record : NULL, off); } __bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head, struct bpf_list_node *node, void *meta__ign, u64 off) { struct bpf_list_node_kern *n = (void *)node; struct btf_struct_meta *meta = meta__ign; return __bpf_list_add(n, head, true, meta ? meta->record : NULL, off); } static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail) { struct list_head *n, *h = (void *)head; struct bpf_list_node_kern *node; /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't * called on its fields, so init here */ if (unlikely(!h->next)) INIT_LIST_HEAD(h); if (list_empty(h)) return NULL; n = tail ? h->prev : h->next; node = container_of(n, struct bpf_list_node_kern, list_head); if (WARN_ON_ONCE(READ_ONCE(node->owner) != head)) return NULL; list_del_init(n); WRITE_ONCE(node->owner, NULL); return (struct bpf_list_node *)n; } __bpf_kfunc struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) { return __bpf_list_del(head, false); } __bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) { return __bpf_list_del(head, true); } __bpf_kfunc struct bpf_list_node *bpf_list_front(struct bpf_list_head *head) { struct list_head *h = (struct list_head *)head; if (list_empty(h) || unlikely(!h->next)) return NULL; return (struct bpf_list_node *)h->next; } __bpf_kfunc struct bpf_list_node *bpf_list_back(struct bpf_list_head *head) { struct list_head *h = (struct list_head *)head; if (list_empty(h) || unlikely(!h->next)) return NULL; return (struct bpf_list_node *)h->prev; } __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, struct bpf_rb_node *node) { struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node; struct rb_root_cached *r = (struct rb_root_cached *)root; struct rb_node *n = &node_internal->rb_node; /* node_internal->owner != root implies either RB_EMPTY_NODE(n) or * n is owned by some other tree. No need to check RB_EMPTY_NODE(n) */ if (READ_ONCE(node_internal->owner) != root) return NULL; rb_erase_cached(n, r); RB_CLEAR_NODE(n); WRITE_ONCE(node_internal->owner, NULL); return (struct bpf_rb_node *)n; } /* Need to copy rbtree_add_cached's logic here because our 'less' is a BPF * program */ static int __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node_kern *node, void *less, struct btf_record *rec, u64 off) { struct rb_node **link = &((struct rb_root_cached *)root)->rb_root.rb_node; struct rb_node *parent = NULL, *n = &node->rb_node; bpf_callback_t cb = (bpf_callback_t)less; bool leftmost = true; /* node->owner != NULL implies !RB_EMPTY_NODE(n), no need to separately * check the latter */ if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) { /* Only called from BPF prog, no need to migrate_disable */ __bpf_obj_drop_impl((void *)n - off, rec, false); return -EINVAL; } while (*link) { parent = *link; if (cb((uintptr_t)node, (uintptr_t)parent, 0, 0, 0)) { link = &parent->rb_left; } else { link = &parent->rb_right; leftmost = false; } } rb_link_node(n, parent, link); rb_insert_color_cached(n, (struct rb_root_cached *)root, leftmost); WRITE_ONCE(node->owner, root); return 0; } __bpf_kfunc int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node, bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b), void *meta__ign, u64 off) { struct btf_struct_meta *meta = meta__ign; struct bpf_rb_node_kern *n = (void *)node; return __bpf_rbtree_add(root, n, (void *)less, meta ? meta->record : NULL, off); } __bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) { struct rb_root_cached *r = (struct rb_root_cached *)root; return (struct bpf_rb_node *)rb_first_cached(r); } __bpf_kfunc struct bpf_rb_node *bpf_rbtree_root(struct bpf_rb_root *root) { struct rb_root_cached *r = (struct rb_root_cached *)root; return (struct bpf_rb_node *)r->rb_root.rb_node; } __bpf_kfunc struct bpf_rb_node *bpf_rbtree_left(struct bpf_rb_root *root, struct bpf_rb_node *node) { struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node; if (READ_ONCE(node_internal->owner) != root) return NULL; return (struct bpf_rb_node *)node_internal->rb_node.rb_left; } __bpf_kfunc struct bpf_rb_node *bpf_rbtree_right(struct bpf_rb_root *root, struct bpf_rb_node *node) { struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node; if (READ_ONCE(node_internal->owner) != root) return NULL; return (struct bpf_rb_node *)node_internal->rb_node.rb_right; } /** * bpf_task_acquire - Acquire a reference to a task. A task acquired by this * kfunc which is not stored in a map as a kptr, must be released by calling * bpf_task_release(). * @p: The task on which a reference is being acquired. */ __bpf_kfunc struct task_struct *bpf_task_acquire(struct task_struct *p) { if (refcount_inc_not_zero(&p->rcu_users)) return p; return NULL; } /** * bpf_task_release - Release the reference acquired on a task. * @p: The task on which a reference is being released. */ __bpf_kfunc void bpf_task_release(struct task_struct *p) { put_task_struct_rcu_user(p); } __bpf_kfunc void bpf_task_release_dtor(void *p) { put_task_struct_rcu_user(p); } CFI_NOSEAL(bpf_task_release_dtor); #ifdef CONFIG_CGROUPS /** * bpf_cgroup_acquire - Acquire a reference to a cgroup. A cgroup acquired by * this kfunc which is not stored in a map as a kptr, must be released by * calling bpf_cgroup_release(). * @cgrp: The cgroup on which a reference is being acquired. */ __bpf_kfunc struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp) { return cgroup_tryget(cgrp) ? cgrp : NULL; } /** * bpf_cgroup_release - Release the reference acquired on a cgroup. * If this kfunc is invoked in an RCU read region, the cgroup is guaranteed to * not be freed until the current grace period has ended, even if its refcount * drops to 0. * @cgrp: The cgroup on which a reference is being released. */ __bpf_kfunc void bpf_cgroup_release(struct cgroup *cgrp) { cgroup_put(cgrp); } __bpf_kfunc void bpf_cgroup_release_dtor(void *cgrp) { cgroup_put(cgrp); } CFI_NOSEAL(bpf_cgroup_release_dtor); /** * bpf_cgroup_ancestor - Perform a lookup on an entry in a cgroup's ancestor * array. A cgroup returned by this kfunc which is not subsequently stored in a * map, must be released by calling bpf_cgroup_release(). * @cgrp: The cgroup for which we're performing a lookup. * @level: The level of ancestor to look up. */ __bpf_kfunc struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) { struct cgroup *ancestor; if (level > cgrp->level || level < 0) return NULL; /* cgrp's refcnt could be 0 here, but ancestors can still be accessed */ ancestor = cgrp->ancestors[level]; if (!cgroup_tryget(ancestor)) return NULL; return ancestor; } /** * bpf_cgroup_from_id - Find a cgroup from its ID. A cgroup returned by this * kfunc which is not subsequently stored in a map, must be released by calling * bpf_cgroup_release(). * @cgid: cgroup id. */ __bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid) { struct cgroup *cgrp; cgrp = __cgroup_get_from_id(cgid); if (IS_ERR(cgrp)) return NULL; return cgrp; } /** * bpf_task_under_cgroup - wrap task_under_cgroup_hierarchy() as a kfunc, test * task's membership of cgroup ancestry. * @task: the task to be tested * @ancestor: possible ancestor of @task's cgroup * * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor. * It follows all the same rules as cgroup_is_descendant, and only applies * to the default hierarchy. */ __bpf_kfunc long bpf_task_under_cgroup(struct task_struct *task, struct cgroup *ancestor) { long ret; rcu_read_lock(); ret = task_under_cgroup_hierarchy(task, ancestor); rcu_read_unlock(); return ret; } BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct cgroup *cgrp; if (unlikely(idx >= array->map.max_entries)) return -E2BIG; cgrp = READ_ONCE(array->ptrs[idx]); if (unlikely(!cgrp)) return -EAGAIN; return task_under_cgroup_hierarchy(current, cgrp); } const struct bpf_func_proto bpf_current_task_under_cgroup_proto = { .func = bpf_current_task_under_cgroup, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_ANYTHING, }; /** * bpf_task_get_cgroup1 - Acquires the associated cgroup of a task within a * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its * hierarchy ID. * @task: The target task * @hierarchy_id: The ID of a cgroup1 hierarchy * * On success, the cgroup is returen. On failure, NULL is returned. */ __bpf_kfunc struct cgroup * bpf_task_get_cgroup1(struct task_struct *task, int hierarchy_id) { struct cgroup *cgrp = task_get_cgroup1(task, hierarchy_id); if (IS_ERR(cgrp)) return NULL; return cgrp; } #endif /* CONFIG_CGROUPS */ /** * bpf_task_from_pid - Find a struct task_struct from its pid by looking it up * in the root pid namespace idr. If a task is returned, it must either be * stored in a map, or released with bpf_task_release(). * @pid: The pid of the task being looked up. */ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid) { struct task_struct *p; rcu_read_lock(); p = find_task_by_pid_ns(pid, &init_pid_ns); if (p) p = bpf_task_acquire(p); rcu_read_unlock(); return p; } /** * bpf_task_from_vpid - Find a struct task_struct from its vpid by looking it up * in the pid namespace of the current task. If a task is returned, it must * either be stored in a map, or released with bpf_task_release(). * @vpid: The vpid of the task being looked up. */ __bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid) { struct task_struct *p; rcu_read_lock(); p = find_task_by_vpid(vpid); if (p) p = bpf_task_acquire(p); rcu_read_unlock(); return p; } /** * bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data. * @p: The dynptr whose data slice to retrieve * @offset: Offset into the dynptr * @buffer__opt: User-provided buffer to copy contents into. May be NULL * @buffer__szk: Size (in bytes) of the buffer if present. This is the * length of the requested slice. This must be a constant. * * For non-skb and non-xdp type dynptrs, there is no difference between * bpf_dynptr_slice and bpf_dynptr_data. * * If buffer__opt is NULL, the call will fail if buffer_opt was needed. * * If the intention is to write to the data slice, please use * bpf_dynptr_slice_rdwr. * * The user must check that the returned pointer is not null before using it. * * Please note that in the case of skb and xdp dynptrs, bpf_dynptr_slice * does not change the underlying packet data pointers, so a call to * bpf_dynptr_slice will not invalidate any ctx->data/data_end pointers in * the bpf program. * * Return: NULL if the call failed (eg invalid dynptr), pointer to a read-only * data slice (can be either direct pointer to the data or a pointer to the user * provided buffer, with its contents containing the data, if unable to obtain * direct pointer) */ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset, void *buffer__opt, u32 buffer__szk) { const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; enum bpf_dynptr_type type; u32 len = buffer__szk; int err; if (!ptr->data) return NULL; err = bpf_dynptr_check_off_len(ptr, offset, len); if (err) return NULL; type = bpf_dynptr_get_type(ptr); switch (type) { case BPF_DYNPTR_TYPE_LOCAL: case BPF_DYNPTR_TYPE_RINGBUF: return ptr->data + ptr->offset + offset; case BPF_DYNPTR_TYPE_SKB: if (buffer__opt) return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer__opt); else return skb_pointer_if_linear(ptr->data, ptr->offset + offset, len); case BPF_DYNPTR_TYPE_XDP: { void *xdp_ptr = bpf_xdp_pointer(ptr->data, ptr->offset + offset, len); if (!IS_ERR_OR_NULL(xdp_ptr)) return xdp_ptr; if (!buffer__opt) return NULL; bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__opt, len, false); return buffer__opt; } case BPF_DYNPTR_TYPE_SKB_META: return bpf_skb_meta_pointer(ptr->data, ptr->offset + offset); default: WARN_ONCE(true, "unknown dynptr type %d\n", type); return NULL; } } /** * bpf_dynptr_slice_rdwr() - Obtain a writable pointer to the dynptr data. * @p: The dynptr whose data slice to retrieve * @offset: Offset into the dynptr * @buffer__opt: User-provided buffer to copy contents into. May be NULL * @buffer__szk: Size (in bytes) of the buffer if present. This is the * length of the requested slice. This must be a constant. * * For non-skb and non-xdp type dynptrs, there is no difference between * bpf_dynptr_slice and bpf_dynptr_data. * * If buffer__opt is NULL, the call will fail if buffer_opt was needed. * * The returned pointer is writable and may point to either directly the dynptr * data at the requested offset or to the buffer if unable to obtain a direct * data pointer to (example: the requested slice is to the paged area of an skb * packet). In the case where the returned pointer is to the buffer, the user * is responsible for persisting writes through calling bpf_dynptr_write(). This * usually looks something like this pattern: * * struct eth_hdr *eth = bpf_dynptr_slice_rdwr(&dynptr, 0, buffer, sizeof(buffer)); * if (!eth) * return TC_ACT_SHOT; * * // mutate eth header // * * if (eth == buffer) * bpf_dynptr_write(&ptr, 0, buffer, sizeof(buffer), 0); * * Please note that, as in the example above, the user must check that the * returned pointer is not null before using it. * * Please also note that in the case of skb and xdp dynptrs, bpf_dynptr_slice_rdwr * does not change the underlying packet data pointers, so a call to * bpf_dynptr_slice_rdwr will not invalidate any ctx->data/data_end pointers in * the bpf program. * * Return: NULL if the call failed (eg invalid dynptr), pointer to a * data slice (can be either direct pointer to the data or a pointer to the user * provided buffer, with its contents containing the data, if unable to obtain * direct pointer) */ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset, void *buffer__opt, u32 buffer__szk) { const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; if (!ptr->data || __bpf_dynptr_is_rdonly(ptr)) return NULL; /* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice. * * For skb-type dynptrs, it is safe to write into the returned pointer * if the bpf program allows skb data writes. There are two possibilities * that may occur when calling bpf_dynptr_slice_rdwr: * * 1) The requested slice is in the head of the skb. In this case, the * returned pointer is directly to skb data, and if the skb is cloned, the * verifier will have uncloned it (see bpf_unclone_prologue()) already. * The pointer can be directly written into. * * 2) Some portion of the requested slice is in the paged buffer area. * In this case, the requested data will be copied out into the buffer * and the returned pointer will be a pointer to the buffer. The skb * will not be pulled. To persist the write, the user will need to call * bpf_dynptr_write(), which will pull the skb and commit the write. * * Similarly for xdp programs, if the requested slice is not across xdp * fragments, then a direct pointer will be returned, otherwise the data * will be copied out into the buffer and the user will need to call * bpf_dynptr_write() to commit changes. */ return bpf_dynptr_slice(p, offset, buffer__opt, buffer__szk); } __bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u32 start, u32 end) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; u32 size; if (!ptr->data || start > end) return -EINVAL; size = __bpf_dynptr_size(ptr); if (start > size || end > size) return -ERANGE; ptr->offset += start; bpf_dynptr_set_size(ptr, end - start); return 0; } __bpf_kfunc bool bpf_dynptr_is_null(const struct bpf_dynptr *p) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; return !ptr->data; } __bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; if (!ptr->data) return false; return __bpf_dynptr_is_rdonly(ptr); } __bpf_kfunc __u32 bpf_dynptr_size(const struct bpf_dynptr *p) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; if (!ptr->data) return -EINVAL; return __bpf_dynptr_size(ptr); } __bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p, struct bpf_dynptr *clone__uninit) { struct bpf_dynptr_kern *clone = (struct bpf_dynptr_kern *)clone__uninit; struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; if (!ptr->data) { bpf_dynptr_set_null(clone); return -EINVAL; } *clone = *ptr; return 0; } /** * bpf_dynptr_copy() - Copy data from one dynptr to another. * @dst_ptr: Destination dynptr - where data should be copied to * @dst_off: Offset into the destination dynptr * @src_ptr: Source dynptr - where data should be copied from * @src_off: Offset into the source dynptr * @size: Length of the data to copy from source to destination * * Copies data from source dynptr to destination dynptr. * Returns 0 on success; negative error, otherwise. */ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off, struct bpf_dynptr *src_ptr, u32 src_off, u32 size) { struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr; struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr; void *src_slice, *dst_slice; char buf[256]; u32 off; src_slice = bpf_dynptr_slice(src_ptr, src_off, NULL, size); dst_slice = bpf_dynptr_slice_rdwr(dst_ptr, dst_off, NULL, size); if (src_slice && dst_slice) { memmove(dst_slice, src_slice, size); return 0; } if (src_slice) return __bpf_dynptr_write(dst, dst_off, src_slice, size, 0); if (dst_slice) return __bpf_dynptr_read(dst_slice, size, src, src_off, 0); if (bpf_dynptr_check_off_len(dst, dst_off, size) || bpf_dynptr_check_off_len(src, src_off, size)) return -E2BIG; off = 0; while (off < size) { u32 chunk_sz = min_t(u32, sizeof(buf), size - off); int err; err = __bpf_dynptr_read(buf, chunk_sz, src, src_off + off, 0); if (err) return err; err = __bpf_dynptr_write(dst, dst_off + off, buf, chunk_sz, 0); if (err) return err; off += chunk_sz; } return 0; } /** * bpf_dynptr_memset() - Fill dynptr memory with a constant byte. * @p: Destination dynptr - where data will be filled * @offset: Offset into the dynptr to start filling from * @size: Number of bytes to fill * @val: Constant byte to fill the memory with * * Fills the @size bytes of the memory area pointed to by @p * at @offset with the constant byte @val. * Returns 0 on success; negative error, otherwise. */ __bpf_kfunc int bpf_dynptr_memset(struct bpf_dynptr *p, u32 offset, u32 size, u8 val) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; u32 chunk_sz, write_off; char buf[256]; void* slice; int err; slice = bpf_dynptr_slice_rdwr(p, offset, NULL, size); if (likely(slice)) { memset(slice, val, size); return 0; } if (__bpf_dynptr_is_rdonly(ptr)) return -EINVAL; err = bpf_dynptr_check_off_len(ptr, offset, size); if (err) return err; /* Non-linear data under the dynptr, write from a local buffer */ chunk_sz = min_t(u32, sizeof(buf), size); memset(buf, val, chunk_sz); for (write_off = 0; write_off < size; write_off += chunk_sz) { chunk_sz = min_t(u32, sizeof(buf), size - write_off); err = __bpf_dynptr_write(ptr, offset + write_off, buf, chunk_sz, 0); if (err) return err; } return 0; } __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj) { return obj; } __bpf_kfunc void *bpf_rdonly_cast(const void *obj__ign, u32 btf_id__k) { return (void *)obj__ign; } __bpf_kfunc void bpf_rcu_read_lock(void) { rcu_read_lock(); } __bpf_kfunc void bpf_rcu_read_unlock(void) { rcu_read_unlock(); } struct bpf_throw_ctx { struct bpf_prog_aux *aux; u64 sp; u64 bp; int cnt; }; static bool bpf_stack_walker(void *cookie, u64 ip, u64 sp, u64 bp) { struct bpf_throw_ctx *ctx = cookie; struct bpf_prog *prog; /* * The RCU read lock is held to safely traverse the latch tree, but we * don't need its protection when accessing the prog, since it has an * active stack frame on the current stack trace, and won't disappear. */ rcu_read_lock(); prog = bpf_prog_ksym_find(ip); rcu_read_unlock(); if (!prog) return !ctx->cnt; ctx->cnt++; if (bpf_is_subprog(prog)) return true; ctx->aux = prog->aux; ctx->sp = sp; ctx->bp = bp; return false; } __bpf_kfunc void bpf_throw(u64 cookie) { struct bpf_throw_ctx ctx = {}; arch_bpf_stack_walk(bpf_stack_walker, &ctx); WARN_ON_ONCE(!ctx.aux); if (ctx.aux) WARN_ON_ONCE(!ctx.aux->exception_boundary); WARN_ON_ONCE(!ctx.bp); WARN_ON_ONCE(!ctx.cnt); /* Prevent KASAN false positives for CONFIG_KASAN_STACK by unpoisoning * deeper stack depths than ctx.sp as we do not return from bpf_throw, * which skips compiler generated instrumentation to do the same. */ kasan_unpoison_task_stack_below((void *)(long)ctx.sp); ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp, 0, 0); WARN(1, "A call to BPF exception callback should never return\n"); } __bpf_kfunc int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) { struct bpf_async_kern *async = (struct bpf_async_kern *)wq; struct bpf_map *map = p__map; BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_wq)); BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_wq)); if (flags) return -EINVAL; return __bpf_async_init(async, map, flags, BPF_ASYNC_TYPE_WQ); } __bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) { struct bpf_async_kern *async = (struct bpf_async_kern *)wq; struct bpf_work *w; if (in_nmi()) return -EOPNOTSUPP; if (flags) return -EINVAL; w = READ_ONCE(async->work); if (!w || !READ_ONCE(w->cb.prog)) return -EINVAL; schedule_work(&w->work); return 0; } __bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq, int (callback_fn)(void *map, int *key, void *value), unsigned int flags, void *aux__prog) { struct bpf_prog_aux *aux = (struct bpf_prog_aux *)aux__prog; struct bpf_async_kern *async = (struct bpf_async_kern *)wq; if (flags) return -EINVAL; return __bpf_async_set_callback(async, callback_fn, aux, flags, BPF_ASYNC_TYPE_WQ); } __bpf_kfunc void bpf_preempt_disable(void) { preempt_disable(); } __bpf_kfunc void bpf_preempt_enable(void) { preempt_enable(); } struct bpf_iter_bits { __u64 __opaque[2]; } __aligned(8); #define BITS_ITER_NR_WORDS_MAX 511 struct bpf_iter_bits_kern { union { __u64 *bits; __u64 bits_copy; }; int nr_bits; int bit; } __aligned(8); /* On 64-bit hosts, unsigned long and u64 have the same size, so passing * a u64 pointer and an unsigned long pointer to find_next_bit() will * return the same result, as both point to the same 8-byte area. * * For 32-bit little-endian hosts, using a u64 pointer or unsigned long * pointer also makes no difference. This is because the first iterated * unsigned long is composed of bits 0-31 of the u64 and the second unsigned * long is composed of bits 32-63 of the u64. * * However, for 32-bit big-endian hosts, this is not the case. The first * iterated unsigned long will be bits 32-63 of the u64, so swap these two * ulong values within the u64. */ static void swap_ulong_in_u64(u64 *bits, unsigned int nr) { #if (BITS_PER_LONG == 32) && defined(__BIG_ENDIAN) unsigned int i; for (i = 0; i < nr; i++) bits[i] = (bits[i] >> 32) | ((u64)(u32)bits[i] << 32); #endif } /** * bpf_iter_bits_new() - Initialize a new bits iterator for a given memory area * @it: The new bpf_iter_bits to be created * @unsafe_ptr__ign: A pointer pointing to a memory area to be iterated over * @nr_words: The size of the specified memory area, measured in 8-byte units. * The maximum value of @nr_words is @BITS_ITER_NR_WORDS_MAX. This limit may be * further reduced by the BPF memory allocator implementation. * * This function initializes a new bpf_iter_bits structure for iterating over * a memory area which is specified by the @unsafe_ptr__ign and @nr_words. It * copies the data of the memory area to the newly created bpf_iter_bits @it for * subsequent iteration operations. * * On success, 0 is returned. On failure, ERR is returned. */ __bpf_kfunc int bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_words) { struct bpf_iter_bits_kern *kit = (void *)it; u32 nr_bytes = nr_words * sizeof(u64); u32 nr_bits = BYTES_TO_BITS(nr_bytes); int err; BUILD_BUG_ON(sizeof(struct bpf_iter_bits_kern) != sizeof(struct bpf_iter_bits)); BUILD_BUG_ON(__alignof__(struct bpf_iter_bits_kern) != __alignof__(struct bpf_iter_bits)); kit->nr_bits = 0; kit->bits_copy = 0; kit->bit = -1; if (!unsafe_ptr__ign || !nr_words) return -EINVAL; if (nr_words > BITS_ITER_NR_WORDS_MAX) return -E2BIG; /* Optimization for u64 mask */ if (nr_bits == 64) { err = bpf_probe_read_kernel_common(&kit->bits_copy, nr_bytes, unsafe_ptr__ign); if (err) return -EFAULT; swap_ulong_in_u64(&kit->bits_copy, nr_words); kit->nr_bits = nr_bits; return 0; } if (bpf_mem_alloc_check_size(false, nr_bytes)) return -E2BIG; /* Fallback to memalloc */ kit->bits = bpf_mem_alloc(&bpf_global_ma, nr_bytes); if (!kit->bits) return -ENOMEM; err = bpf_probe_read_kernel_common(kit->bits, nr_bytes, unsafe_ptr__ign); if (err) { bpf_mem_free(&bpf_global_ma, kit->bits); return err; } swap_ulong_in_u64(kit->bits, nr_words); kit->nr_bits = nr_bits; return 0; } /** * bpf_iter_bits_next() - Get the next bit in a bpf_iter_bits * @it: The bpf_iter_bits to be checked * * This function returns a pointer to a number representing the value of the * next bit in the bits. * * If there are no further bits available, it returns NULL. */ __bpf_kfunc int *bpf_iter_bits_next(struct bpf_iter_bits *it) { struct bpf_iter_bits_kern *kit = (void *)it; int bit = kit->bit, nr_bits = kit->nr_bits; const void *bits; if (!nr_bits || bit >= nr_bits) return NULL; bits = nr_bits == 64 ? &kit->bits_copy : kit->bits; bit = find_next_bit(bits, nr_bits, bit + 1); if (bit >= nr_bits) { kit->bit = bit; return NULL; } kit->bit = bit; return &kit->bit; } /** * bpf_iter_bits_destroy() - Destroy a bpf_iter_bits * @it: The bpf_iter_bits to be destroyed * * Destroy the resource associated with the bpf_iter_bits. */ __bpf_kfunc void bpf_iter_bits_destroy(struct bpf_iter_bits *it) { struct bpf_iter_bits_kern *kit = (void *)it; if (kit->nr_bits <= 64) return; bpf_mem_free(&bpf_global_ma, kit->bits); } /** * bpf_copy_from_user_str() - Copy a string from an unsafe user address * @dst: Destination address, in kernel space. This buffer must be * at least @dst__sz bytes long. * @dst__sz: Maximum number of bytes to copy, includes the trailing NUL. * @unsafe_ptr__ign: Source address, in user space. * @flags: The only supported flag is BPF_F_PAD_ZEROS * * Copies a NUL-terminated string from userspace to BPF space. If user string is * too long this will still ensure zero termination in the dst buffer unless * buffer size is 0. * * If BPF_F_PAD_ZEROS flag is set, memset the tail of @dst to 0 on success and * memset all of @dst on failure. */ __bpf_kfunc int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void __user *unsafe_ptr__ign, u64 flags) { int ret; if (unlikely(flags & ~BPF_F_PAD_ZEROS)) return -EINVAL; if (unlikely(!dst__sz)) return 0; ret = strncpy_from_user(dst, unsafe_ptr__ign, dst__sz - 1); if (ret < 0) { if (flags & BPF_F_PAD_ZEROS) memset((char *)dst, 0, dst__sz); return ret; } if (flags & BPF_F_PAD_ZEROS) memset((char *)dst + ret, 0, dst__sz - ret); else ((char *)dst)[ret] = '\0'; return ret + 1; } /** * bpf_copy_from_user_task_str() - Copy a string from an task's address space * @dst: Destination address, in kernel space. This buffer must be * at least @dst__sz bytes long. * @dst__sz: Maximum number of bytes to copy, includes the trailing NUL. * @unsafe_ptr__ign: Source address in the task's address space. * @tsk: The task whose address space will be used * @flags: The only supported flag is BPF_F_PAD_ZEROS * * Copies a NUL terminated string from a task's address space to @dst__sz * buffer. If user string is too long this will still ensure zero termination * in the @dst__sz buffer unless buffer size is 0. * * If BPF_F_PAD_ZEROS flag is set, memset the tail of @dst__sz to 0 on success * and memset all of @dst__sz on failure. * * Return: The number of copied bytes on success including the NUL terminator. * A negative error code on failure. */ __bpf_kfunc int bpf_copy_from_user_task_str(void *dst, u32 dst__sz, const void __user *unsafe_ptr__ign, struct task_struct *tsk, u64 flags) { int ret; if (unlikely(flags & ~BPF_F_PAD_ZEROS)) return -EINVAL; if (unlikely(dst__sz == 0)) return 0; ret = copy_remote_vm_str(tsk, (unsigned long)unsafe_ptr__ign, dst, dst__sz, 0); if (ret < 0) { if (flags & BPF_F_PAD_ZEROS) memset(dst, 0, dst__sz); return ret; } if (flags & BPF_F_PAD_ZEROS) memset(dst + ret, 0, dst__sz - ret); return ret + 1; } /* Keep unsinged long in prototype so that kfunc is usable when emitted to * vmlinux.h in BPF programs directly, but note that while in BPF prog, the * unsigned long always points to 8-byte region on stack, the kernel may only * read and write the 4-bytes on 32-bit. */ __bpf_kfunc void bpf_local_irq_save(unsigned long *flags__irq_flag) { local_irq_save(*flags__irq_flag); } __bpf_kfunc void bpf_local_irq_restore(unsigned long *flags__irq_flag) { local_irq_restore(*flags__irq_flag); } __bpf_kfunc void __bpf_trap(void) { } /* * Kfuncs for string operations. * * Since strings are not necessarily %NUL-terminated, we cannot directly call * in-kernel implementations. Instead, we open-code the implementations using * __get_kernel_nofault instead of plain dereference to make them safe. */ static int __bpf_strcasecmp(const char *s1, const char *s2, bool ignore_case) { char c1, c2; int i; if (!copy_from_kernel_nofault_allowed(s1, 1) || !copy_from_kernel_nofault_allowed(s2, 1)) { return -ERANGE; } guard(pagefault)(); for (i = 0; i < XATTR_SIZE_MAX; i++) { __get_kernel_nofault(&c1, s1, char, err_out); __get_kernel_nofault(&c2, s2, char, err_out); if (ignore_case) { c1 = tolower(c1); c2 = tolower(c2); } if (c1 != c2) return c1 < c2 ? -1 : 1; if (c1 == '\0') return 0; s1++; s2++; } return -E2BIG; err_out: return -EFAULT; } /** * bpf_strcmp - Compare two strings * @s1__ign: One string * @s2__ign: Another string * * Return: * * %0 - Strings are equal * * %-1 - @s1__ign is smaller * * %1 - @s2__ign is smaller * * %-EFAULT - Cannot read one of the strings * * %-E2BIG - One of strings is too large * * %-ERANGE - One of strings is outside of kernel address space */ __bpf_kfunc int bpf_strcmp(const char *s1__ign, const char *s2__ign) { return __bpf_strcasecmp(s1__ign, s2__ign, false); } /** * bpf_strcasecmp - Compare two strings, ignoring the case of the characters * @s1__ign: One string * @s2__ign: Another string * * Return: * * %0 - Strings are equal * * %-1 - @s1__ign is smaller * * %1 - @s2__ign is smaller * * %-EFAULT - Cannot read one of the strings * * %-E2BIG - One of strings is too large * * %-ERANGE - One of strings is outside of kernel address space */ __bpf_kfunc int bpf_strcasecmp(const char *s1__ign, const char *s2__ign) { return __bpf_strcasecmp(s1__ign, s2__ign, true); } /** * bpf_strnchr - Find a character in a length limited string * @s__ign: The string to be searched * @count: The number of characters to be searched * @c: The character to search for * * Note that the %NUL-terminator is considered part of the string, and can * be searched for. * * Return: * * >=0 - Index of the first occurrence of @c within @s__ign * * %-ENOENT - @c not found in the first @count characters of @s__ign * * %-EFAULT - Cannot read @s__ign * * %-E2BIG - @s__ign is too large * * %-ERANGE - @s__ign is outside of kernel address space */ __bpf_kfunc int bpf_strnchr(const char *s__ign, size_t count, char c) { char sc; int i; if (!copy_from_kernel_nofault_allowed(s__ign, 1)) return -ERANGE; guard(pagefault)(); for (i = 0; i < count && i < XATTR_SIZE_MAX; i++) { __get_kernel_nofault(&sc, s__ign, char, err_out); if (sc == c) return i; if (sc == '\0') return -ENOENT; s__ign++; } return i == XATTR_SIZE_MAX ? -E2BIG : -ENOENT; err_out: return -EFAULT; } /** * bpf_strchr - Find the first occurrence of a character in a string * @s__ign: The string to be searched * @c: The character to search for * * Note that the %NUL-terminator is considered part of the string, and can * be searched for. * * Return: * * >=0 - The index of the first occurrence of @c within @s__ign * * %-ENOENT - @c not found in @s__ign * * %-EFAULT - Cannot read @s__ign * * %-E2BIG - @s__ign is too large * * %-ERANGE - @s__ign is outside of kernel address space */ __bpf_kfunc int bpf_strchr(const char *s__ign, char c) { return bpf_strnchr(s__ign, XATTR_SIZE_MAX, c); } /** * bpf_strchrnul - Find and return a character in a string, or end of string * @s__ign: The string to be searched * @c: The character to search for * * Return: * * >=0 - Index of the first occurrence of @c within @s__ign or index of * the null byte at the end of @s__ign when @c is not found * * %-EFAULT - Cannot read @s__ign * * %-E2BIG - @s__ign is too large * * %-ERANGE - @s__ign is outside of kernel address space */ __bpf_kfunc int bpf_strchrnul(const char *s__ign, char c) { char sc; int i; if (!copy_from_kernel_nofault_allowed(s__ign, 1)) return -ERANGE; guard(pagefault)(); for (i = 0; i < XATTR_SIZE_MAX; i++) { __get_kernel_nofault(&sc, s__ign, char, err_out); if (sc == '\0' || sc == c) return i; s__ign++; } return -E2BIG; err_out: return -EFAULT; } /** * bpf_strrchr - Find the last occurrence of a character in a string * @s__ign: The string to be searched * @c: The character to search for * * Return: * * >=0 - Index of the last occurrence of @c within @s__ign * * %-ENOENT - @c not found in @s__ign * * %-EFAULT - Cannot read @s__ign * * %-E2BIG - @s__ign is too large * * %-ERANGE - @s__ign is outside of kernel address space */ __bpf_kfunc int bpf_strrchr(const char *s__ign, int c) { char sc; int i, last = -ENOENT; if (!copy_from_kernel_nofault_allowed(s__ign, 1)) return -ERANGE; guard(pagefault)(); for (i = 0; i < XATTR_SIZE_MAX; i++) { __get_kernel_nofault(&sc, s__ign, char, err_out); if (sc == c) last = i; if (sc == '\0') return last; s__ign++; } return -E2BIG; err_out: return -EFAULT; } /** * bpf_strnlen - Calculate the length of a length-limited string * @s__ign: The string * @count: The maximum number of characters to count * * Return: * * >=0 - The length of @s__ign * * %-EFAULT - Cannot read @s__ign * * %-E2BIG - @s__ign is too large * * %-ERANGE - @s__ign is outside of kernel address space */ __bpf_kfunc int bpf_strnlen(const char *s__ign, size_t count) { char c; int i; if (!copy_from_kernel_nofault_allowed(s__ign, 1)) return -ERANGE; guard(pagefault)(); for (i = 0; i < count && i < XATTR_SIZE_MAX; i++) { __get_kernel_nofault(&c, s__ign, char, err_out); if (c == '\0') return i; s__ign++; } return i == XATTR_SIZE_MAX ? -E2BIG : i; err_out: return -EFAULT; } /** * bpf_strlen - Calculate the length of a string * @s__ign: The string * * Return: * * >=0 - The length of @s__ign * * %-EFAULT - Cannot read @s__ign * * %-E2BIG - @s__ign is too large * * %-ERANGE - @s__ign is outside of kernel address space */ __bpf_kfunc int bpf_strlen(const char *s__ign) { return bpf_strnlen(s__ign, XATTR_SIZE_MAX); } /** * bpf_strspn - Calculate the length of the initial substring of @s__ign which * only contains letters in @accept__ign * @s__ign: The string to be searched * @accept__ign: The string to search for * * Return: * * >=0 - The length of the initial substring of @s__ign which only * contains letters from @accept__ign * * %-EFAULT - Cannot read one of the strings * * %-E2BIG - One of the strings is too large * * %-ERANGE - One of the strings is outside of kernel address space */ __bpf_kfunc int bpf_strspn(const char *s__ign, const char *accept__ign) { char cs, ca; int i, j; if (!copy_from_kernel_nofault_allowed(s__ign, 1) || !copy_from_kernel_nofault_allowed(accept__ign, 1)) { return -ERANGE; } guard(pagefault)(); for (i = 0; i < XATTR_SIZE_MAX; i++) { __get_kernel_nofault(&cs, s__ign, char, err_out); if (cs == '\0') return i; for (j = 0; j < XATTR_SIZE_MAX; j++) { __get_kernel_nofault(&ca, accept__ign + j, char, err_out); if (cs == ca || ca == '\0') break; } if (j == XATTR_SIZE_MAX) return -E2BIG; if (ca == '\0') return i; s__ign++; } return -E2BIG; err_out: return -EFAULT; } /** * bpf_strcspn - Calculate the length of the initial substring of @s__ign which * does not contain letters in @reject__ign * @s__ign: The string to be searched * @reject__ign: The string to search for * * Return: * * >=0 - The length of the initial substring of @s__ign which does not * contain letters from @reject__ign * * %-EFAULT - Cannot read one of the strings * * %-E2BIG - One of the strings is too large * * %-ERANGE - One of the strings is outside of kernel address space */ __bpf_kfunc int bpf_strcspn(const char *s__ign, const char *reject__ign) { char cs, cr; int i, j; if (!copy_from_kernel_nofault_allowed(s__ign, 1) || !copy_from_kernel_nofault_allowed(reject__ign, 1)) { return -ERANGE; } guard(pagefault)(); for (i = 0; i < XATTR_SIZE_MAX; i++) { __get_kernel_nofault(&cs, s__ign, char, err_out); if (cs == '\0') return i; for (j = 0; j < XATTR_SIZE_MAX; j++) { __get_kernel_nofault(&cr, reject__ign + j, char, err_out); if (cs == cr || cr == '\0') break; } if (j == XATTR_SIZE_MAX) return -E2BIG; if (cr != '\0') return i; s__ign++; } return -E2BIG; err_out: return -EFAULT; } /** * bpf_strnstr - Find the first substring in a length-limited string * @s1__ign: The string to be searched * @s2__ign: The string to search for * @len: the maximum number of characters to search * * Return: * * >=0 - Index of the first character of the first occurrence of @s2__ign * within the first @len characters of @s1__ign * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign * * %-EFAULT - Cannot read one of the strings * * %-E2BIG - One of the strings is too large * * %-ERANGE - One of the strings is outside of kernel address space */ __bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len) { char c1, c2; int i, j; if (!copy_from_kernel_nofault_allowed(s1__ign, 1) || !copy_from_kernel_nofault_allowed(s2__ign, 1)) { return -ERANGE; } guard(pagefault)(); for (i = 0; i < XATTR_SIZE_MAX; i++) { for (j = 0; i + j <= len && j < XATTR_SIZE_MAX; j++) { __get_kernel_nofault(&c2, s2__ign + j, char, err_out); if (c2 == '\0') return i; /* * We allow reading an extra byte from s2 (note the * `i + j <= len` above) to cover the case when s2 is * a suffix of the first len chars of s1. */ if (i + j == len) break; __get_kernel_nofault(&c1, s1__ign + j, char, err_out); if (c1 == '\0') return -ENOENT; if (c1 != c2) break; } if (j == XATTR_SIZE_MAX) return -E2BIG; if (i + j == len) return -ENOENT; s1__ign++; } return -E2BIG; err_out: return -EFAULT; } /** * bpf_strstr - Find the first substring in a string * @s1__ign: The string to be searched * @s2__ign: The string to search for * * Return: * * >=0 - Index of the first character of the first occurrence of @s2__ign * within @s1__ign * * %-ENOENT - @s2__ign is not a substring of @s1__ign * * %-EFAULT - Cannot read one of the strings * * %-E2BIG - One of the strings is too large * * %-ERANGE - One of the strings is outside of kernel address space */ __bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign) { return bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX); } #ifdef CONFIG_KEYS /** * bpf_lookup_user_key - lookup a key by its serial * @serial: key handle serial number * @flags: lookup-specific flags * * Search a key with a given *serial* and the provided *flags*. * If found, increment the reference count of the key by one, and * return it in the bpf_key structure. * * The bpf_key structure must be passed to bpf_key_put() when done * with it, so that the key reference count is decremented and the * bpf_key structure is freed. * * Permission checks are deferred to the time the key is used by * one of the available key-specific kfuncs. * * Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested * special keyring (e.g. session keyring), if it doesn't yet exist. * Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting * for the key construction, and to retrieve uninstantiated keys (keys * without data attached to them). * * Return: a bpf_key pointer with a valid key pointer if the key is found, a * NULL pointer otherwise. */ __bpf_kfunc struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags) { key_ref_t key_ref; struct bpf_key *bkey; if (flags & ~KEY_LOOKUP_ALL) return NULL; /* * Permission check is deferred until the key is used, as the * intent of the caller is unknown here. */ key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK); if (IS_ERR(key_ref)) return NULL; bkey = kmalloc(sizeof(*bkey), GFP_KERNEL); if (!bkey) { key_put(key_ref_to_ptr(key_ref)); return NULL; } bkey->key = key_ref_to_ptr(key_ref); bkey->has_ref = true; return bkey; } /** * bpf_lookup_system_key - lookup a key by a system-defined ID * @id: key ID * * Obtain a bpf_key structure with a key pointer set to the passed key ID. * The key pointer is marked as invalid, to prevent bpf_key_put() from * attempting to decrement the key reference count on that pointer. The key * pointer set in such way is currently understood only by * verify_pkcs7_signature(). * * Set *id* to one of the values defined in include/linux/verification.h: * 0 for the primary keyring (immutable keyring of system keys); * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring * (where keys can be added only if they are vouched for by existing keys * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform * keyring (primarily used by the integrity subsystem to verify a kexec'ed * kerned image and, possibly, the initramfs signature). * * Return: a bpf_key pointer with an invalid key pointer set from the * pre-determined ID on success, a NULL pointer otherwise */ __bpf_kfunc struct bpf_key *bpf_lookup_system_key(u64 id) { struct bpf_key *bkey; if (system_keyring_id_check(id) < 0) return NULL; bkey = kmalloc(sizeof(*bkey), GFP_ATOMIC); if (!bkey) return NULL; bkey->key = (struct key *)(unsigned long)id; bkey->has_ref = false; return bkey; } /** * bpf_key_put - decrement key reference count if key is valid and free bpf_key * @bkey: bpf_key structure * * Decrement the reference count of the key inside *bkey*, if the pointer * is valid, and free *bkey*. */ __bpf_kfunc void bpf_key_put(struct bpf_key *bkey) { if (bkey->has_ref) key_put(bkey->key); kfree(bkey); } /** * bpf_verify_pkcs7_signature - verify a PKCS#7 signature * @data_p: data to verify * @sig_p: signature of the data * @trusted_keyring: keyring with keys trusted for signature verification * * Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr* * with keys in a keyring referenced by *trusted_keyring*. * * Return: 0 on success, a negative value on error. */ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, struct bpf_dynptr *sig_p, struct bpf_key *trusted_keyring) { #ifdef CONFIG_SYSTEM_DATA_VERIFICATION struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p; struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p; const void *data, *sig; u32 data_len, sig_len; int ret; if (trusted_keyring->has_ref) { /* * Do the permission check deferred in bpf_lookup_user_key(). * See bpf_lookup_user_key() for more details. * * A call to key_task_permission() here would be redundant, as * it is already done by keyring_search() called by * find_asymmetric_key(). */ ret = key_validate(trusted_keyring->key); if (ret < 0) return ret; } data_len = __bpf_dynptr_size(data_ptr); data = __bpf_dynptr_data(data_ptr, data_len); sig_len = __bpf_dynptr_size(sig_ptr); sig = __bpf_dynptr_data(sig_ptr, sig_len); return verify_pkcs7_signature(data, data_len, sig, sig_len, trusted_keyring->key, VERIFYING_BPF_SIGNATURE, NULL, NULL); #else return -EOPNOTSUPP; #endif /* CONFIG_SYSTEM_DATA_VERIFICATION */ } #endif /* CONFIG_KEYS */ typedef int (*bpf_task_work_callback_t)(struct bpf_map *map, void *key, void *value); enum bpf_task_work_state { /* bpf_task_work is ready to be used */ BPF_TW_STANDBY = 0, /* irq work scheduling in progress */ BPF_TW_PENDING, /* task work scheduling in progress */ BPF_TW_SCHEDULING, /* task work is scheduled successfully */ BPF_TW_SCHEDULED, /* callback is running */ BPF_TW_RUNNING, /* associated BPF map value is deleted */ BPF_TW_FREED, }; struct bpf_task_work_ctx { enum bpf_task_work_state state; refcount_t refcnt; struct callback_head work; struct irq_work irq_work; /* bpf_prog that schedules task work */ struct bpf_prog *prog; /* task for which callback is scheduled */ struct task_struct *task; /* the map and map value associated with this context */ struct bpf_map *map; void *map_val; enum task_work_notify_mode mode; bpf_task_work_callback_t callback_fn; struct rcu_head rcu; } __aligned(8); /* Actual type for struct bpf_task_work */ struct bpf_task_work_kern { struct bpf_task_work_ctx *ctx; }; static void bpf_task_work_ctx_reset(struct bpf_task_work_ctx *ctx) { if (ctx->prog) { bpf_prog_put(ctx->prog); ctx->prog = NULL; } if (ctx->task) { bpf_task_release(ctx->task); ctx->task = NULL; } } static bool bpf_task_work_ctx_tryget(struct bpf_task_work_ctx *ctx) { return refcount_inc_not_zero(&ctx->refcnt); } static void bpf_task_work_ctx_put(struct bpf_task_work_ctx *ctx) { if (!refcount_dec_and_test(&ctx->refcnt)) return; bpf_task_work_ctx_reset(ctx); /* bpf_mem_free expects migration to be disabled */ migrate_disable(); bpf_mem_free(&bpf_global_ma, ctx); migrate_enable(); } static void bpf_task_work_cancel(struct bpf_task_work_ctx *ctx) { /* * Scheduled task_work callback holds ctx ref, so if we successfully * cancelled, we put that ref on callback's behalf. If we couldn't * cancel, callback will inevitably run or has already completed * running, and it would have taken care of its ctx ref itself. */ if (task_work_cancel(ctx->task, &ctx->work)) bpf_task_work_ctx_put(ctx); } static void bpf_task_work_callback(struct callback_head *cb) { struct bpf_task_work_ctx *ctx = container_of(cb, struct bpf_task_work_ctx, work); enum bpf_task_work_state state; u32 idx; void *key; /* Read lock is needed to protect ctx and map key/value access */ guard(rcu_tasks_trace)(); /* * This callback may start running before bpf_task_work_irq() switched to * SCHEDULED state, so handle both transition variants SCHEDULING|SCHEDULED -> RUNNING. */ state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_RUNNING); if (state == BPF_TW_SCHEDULED) state = cmpxchg(&ctx->state, BPF_TW_SCHEDULED, BPF_TW_RUNNING); if (state == BPF_TW_FREED) { bpf_task_work_ctx_put(ctx); return; } key = (void *)map_key_from_value(ctx->map, ctx->map_val, &idx); migrate_disable(); ctx->callback_fn(ctx->map, key, ctx->map_val); migrate_enable(); bpf_task_work_ctx_reset(ctx); (void)cmpxchg(&ctx->state, BPF_TW_RUNNING, BPF_TW_STANDBY); bpf_task_work_ctx_put(ctx); } static void bpf_task_work_irq(struct irq_work *irq_work) { struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work); enum bpf_task_work_state state; int err; guard(rcu_tasks_trace)(); if (cmpxchg(&ctx->state, BPF_TW_PENDING, BPF_TW_SCHEDULING) != BPF_TW_PENDING) { bpf_task_work_ctx_put(ctx); return; } err = task_work_add(ctx->task, &ctx->work, ctx->mode); if (err) { bpf_task_work_ctx_reset(ctx); /* * try to switch back to STANDBY for another task_work reuse, but we might have * gone to FREED already, which is fine as we already cleaned up after ourselves */ (void)cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_STANDBY); bpf_task_work_ctx_put(ctx); return; } /* * It's technically possible for just scheduled task_work callback to * complete running by now, going SCHEDULING -> RUNNING and then * dropping its ctx refcount. Instead of capturing extra ref just to * protected below ctx->state access, we rely on RCU protection to * perform below SCHEDULING -> SCHEDULED attempt. */ state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_SCHEDULED); if (state == BPF_TW_FREED) bpf_task_work_cancel(ctx); /* clean up if we switched into FREED state */ } static struct bpf_task_work_ctx *bpf_task_work_fetch_ctx(struct bpf_task_work *tw, struct bpf_map *map) { struct bpf_task_work_kern *twk = (void *)tw; struct bpf_task_work_ctx *ctx, *old_ctx; ctx = READ_ONCE(twk->ctx); if (ctx) return ctx; ctx = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_task_work_ctx)); if (!ctx) return ERR_PTR(-ENOMEM); memset(ctx, 0, sizeof(*ctx)); refcount_set(&ctx->refcnt, 1); /* map's own ref */ ctx->state = BPF_TW_STANDBY; old_ctx = cmpxchg(&twk->ctx, NULL, ctx); if (old_ctx) { /* * tw->ctx is set by concurrent BPF program, release allocated * memory and try to reuse already set context. */ bpf_mem_free(&bpf_global_ma, ctx); return old_ctx; } return ctx; /* Success */ } static struct bpf_task_work_ctx *bpf_task_work_acquire_ctx(struct bpf_task_work *tw, struct bpf_map *map) { struct bpf_task_work_ctx *ctx; ctx = bpf_task_work_fetch_ctx(tw, map); if (IS_ERR(ctx)) return ctx; /* try to get ref for task_work callback to hold */ if (!bpf_task_work_ctx_tryget(ctx)) return ERR_PTR(-EBUSY); if (cmpxchg(&ctx->state, BPF_TW_STANDBY, BPF_TW_PENDING) != BPF_TW_STANDBY) { /* lost acquiring race or map_release_uref() stole it from us, put ref and bail */ bpf_task_work_ctx_put(ctx); return ERR_PTR(-EBUSY); } /* * If no process or bpffs is holding a reference to the map, no new callbacks should be * scheduled. This does not address any race or correctness issue, but rather is a policy * choice: dropping user references should stop everything. */ if (!atomic64_read(&map->usercnt)) { /* drop ref we just got for task_work callback itself */ bpf_task_work_ctx_put(ctx); /* transfer map's ref into cancel_and_free() */ bpf_task_work_cancel_and_free(tw); return ERR_PTR(-EBUSY); } return ctx; } static int bpf_task_work_schedule(struct task_struct *task, struct bpf_task_work *tw, struct bpf_map *map, bpf_task_work_callback_t callback_fn, struct bpf_prog_aux *aux, enum task_work_notify_mode mode) { struct bpf_prog *prog; struct bpf_task_work_ctx *ctx; int err; BTF_TYPE_EMIT(struct bpf_task_work); prog = bpf_prog_inc_not_zero(aux->prog); if (IS_ERR(prog)) return -EBADF; task = bpf_task_acquire(task); if (!task) { err = -EBADF; goto release_prog; } ctx = bpf_task_work_acquire_ctx(tw, map); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto release_all; } ctx->task = task; ctx->callback_fn = callback_fn; ctx->prog = prog; ctx->mode = mode; ctx->map = map; ctx->map_val = (void *)tw - map->record->task_work_off; init_task_work(&ctx->work, bpf_task_work_callback); init_irq_work(&ctx->irq_work, bpf_task_work_irq); irq_work_queue(&ctx->irq_work); return 0; release_all: bpf_task_release(task); release_prog: bpf_prog_put(prog); return err; } /** * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL mode * @task: Task struct for which callback should be scheduled * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping * @map__map: bpf_map that embeds struct bpf_task_work in the values * @callback: pointer to BPF subprogram to call * @aux__prog: user should pass NULL * * Return: 0 if task work has been scheduled successfully, negative error code otherwise */ __bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw, void *map__map, bpf_task_work_callback_t callback, void *aux__prog) { return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_SIGNAL); } /** * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME mode * @task: Task struct for which callback should be scheduled * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping * @map__map: bpf_map that embeds struct bpf_task_work in the values * @callback: pointer to BPF subprogram to call * @aux__prog: user should pass NULL * * Return: 0 if task work has been scheduled successfully, negative error code otherwise */ __bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw, void *map__map, bpf_task_work_callback_t callback, void *aux__prog) { return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME); } __bpf_kfunc_end_defs(); static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work) { struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work); bpf_task_work_cancel(ctx); /* this might put task_work callback's ref */ bpf_task_work_ctx_put(ctx); /* and here we put map's own ref that was transferred to us */ } void bpf_task_work_cancel_and_free(void *val) { struct bpf_task_work_kern *twk = val; struct bpf_task_work_ctx *ctx; enum bpf_task_work_state state; ctx = xchg(&twk->ctx, NULL); if (!ctx) return; state = xchg(&ctx->state, BPF_TW_FREED); if (state == BPF_TW_SCHEDULED) { /* run in irq_work to avoid locks in NMI */ init_irq_work(&ctx->irq_work, bpf_task_work_cancel_scheduled); irq_work_queue(&ctx->irq_work); return; } bpf_task_work_ctx_put(ctx); /* put bpf map's ref */ } BTF_KFUNCS_START(generic_btf_ids) #ifdef CONFIG_CRASH_DUMP BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) #endif BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE) BTF_ID_FLAGS(func, bpf_percpu_obj_drop_impl, KF_RELEASE) BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL | KF_RCU) BTF_ID_FLAGS(func, bpf_list_push_front_impl) BTF_ID_FLAGS(func, bpf_list_push_back_impl) BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_front, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_back, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_rbtree_add_impl) BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_rbtree_root, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_rbtree_left, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_rbtree_right, KF_RET_NULL) #ifdef CONFIG_CGROUPS BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU) BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL) #endif BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_from_vpid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_throw) #ifdef CONFIG_BPF_EVENTS BTF_ID_FLAGS(func, bpf_send_signal_task, KF_TRUSTED_ARGS) #endif #ifdef CONFIG_KEYS BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE) #ifdef CONFIG_SYSTEM_DATA_VERIFICATION BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE) #endif #endif BTF_KFUNCS_END(generic_btf_ids) static const struct btf_kfunc_id_set generic_kfunc_set = { .owner = THIS_MODULE, .set = &generic_btf_ids, }; BTF_ID_LIST(generic_dtor_ids) BTF_ID(struct, task_struct) BTF_ID(func, bpf_task_release_dtor) #ifdef CONFIG_CGROUPS BTF_ID(struct, cgroup) BTF_ID(func, bpf_cgroup_release_dtor) #endif BTF_KFUNCS_START(common_btf_ids) BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx, KF_FASTCALL) BTF_ID_FLAGS(func, bpf_rdonly_cast, KF_FASTCALL) BTF_ID_FLAGS(func, bpf_rcu_read_lock) BTF_ID_FLAGS(func, bpf_rcu_read_unlock) BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW | KF_RCU) BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY) #ifdef CONFIG_CGROUPS BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_iter_css_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY) #endif BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_dynptr_adjust) BTF_ID_FLAGS(func, bpf_dynptr_is_null) BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) BTF_ID_FLAGS(func, bpf_dynptr_size) BTF_ID_FLAGS(func, bpf_dynptr_clone) BTF_ID_FLAGS(func, bpf_dynptr_copy) BTF_ID_FLAGS(func, bpf_dynptr_memset) #ifdef CONFIG_NET BTF_ID_FLAGS(func, bpf_modify_return_test_tp) #endif BTF_ID_FLAGS(func, bpf_wq_init) BTF_ID_FLAGS(func, bpf_wq_set_callback_impl) BTF_ID_FLAGS(func, bpf_wq_start) BTF_ID_FLAGS(func, bpf_preempt_disable) BTF_ID_FLAGS(func, bpf_preempt_enable) BTF_ID_FLAGS(func, bpf_iter_bits_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_bits_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_bits_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_copy_from_user_str, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_copy_from_user_task_str, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_get_kmem_cache) BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_local_irq_save) BTF_ID_FLAGS(func, bpf_local_irq_restore) BTF_ID_FLAGS(func, bpf_probe_read_user_dynptr) BTF_ID_FLAGS(func, bpf_probe_read_kernel_dynptr) BTF_ID_FLAGS(func, bpf_probe_read_user_str_dynptr) BTF_ID_FLAGS(func, bpf_probe_read_kernel_str_dynptr) BTF_ID_FLAGS(func, bpf_copy_from_user_dynptr, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_copy_from_user_str_dynptr, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS) #ifdef CONFIG_DMA_SHARED_BUFFER BTF_ID_FLAGS(func, bpf_iter_dmabuf_new, KF_ITER_NEW | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_iter_dmabuf_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_iter_dmabuf_destroy, KF_ITER_DESTROY | KF_SLEEPABLE) #endif BTF_ID_FLAGS(func, __bpf_trap) BTF_ID_FLAGS(func, bpf_strcmp); BTF_ID_FLAGS(func, bpf_strcasecmp); BTF_ID_FLAGS(func, bpf_strchr); BTF_ID_FLAGS(func, bpf_strchrnul); BTF_ID_FLAGS(func, bpf_strnchr); BTF_ID_FLAGS(func, bpf_strrchr); BTF_ID_FLAGS(func, bpf_strlen); BTF_ID_FLAGS(func, bpf_strnlen); BTF_ID_FLAGS(func, bpf_strspn); BTF_ID_FLAGS(func, bpf_strcspn); BTF_ID_FLAGS(func, bpf_strstr); BTF_ID_FLAGS(func, bpf_strnstr); #if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS) BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_TRUSTED_ARGS) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { .owner = THIS_MODULE, .set = &common_btf_ids, }; static int __init kfunc_init(void) { int ret; const struct btf_id_dtor_kfunc generic_dtors[] = { { .btf_id = generic_dtor_ids[0], .kfunc_btf_id = generic_dtor_ids[1] }, #ifdef CONFIG_CGROUPS { .btf_id = generic_dtor_ids[2], .kfunc_btf_id = generic_dtor_ids[3] }, #endif }; ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SKB, &generic_kfunc_set); ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors, ARRAY_SIZE(generic_dtors), THIS_MODULE); return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set); } late_initcall(kfunc_init); /* Get a pointer to dynptr data up to len bytes for read only access. If * the dynptr doesn't have continuous data up to len bytes, return NULL. */ const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len) { const struct bpf_dynptr *p = (struct bpf_dynptr *)ptr; return bpf_dynptr_slice(p, 0, NULL, len); } /* Get a pointer to dynptr data up to len bytes for read write access. If * the dynptr doesn't have continuous data up to len bytes, or the dynptr * is read only, return NULL. */ void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len) { if (__bpf_dynptr_is_rdonly(ptr)) return NULL; return (void *)__bpf_dynptr_data(ptr, len); } |
| 50 50 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _SCSI_DISK_H #define _SCSI_DISK_H /* * More than enough for everybody ;) The huge number of majors * is a leftover from 16bit dev_t days, we don't really need that * much numberspace. */ #define SD_MAJORS 16 /* * Time out in seconds for disks and Magneto-opticals (which are slower). */ #define SD_TIMEOUT (30 * HZ) #define SD_MOD_TIMEOUT (75 * HZ) /* * Flush timeout is a multiplier over the standard device timeout which is * user modifiable via sysfs but initially set to SD_TIMEOUT */ #define SD_FLUSH_TIMEOUT_MULTIPLIER 2 #define SD_WRITE_SAME_TIMEOUT (120 * HZ) /* * Number of allowed retries */ #define SD_MAX_RETRIES 5 #define SD_PASSTHROUGH_RETRIES 1 #define SD_MAX_MEDIUM_TIMEOUTS 2 /* * Size of the initial data buffer for mode and read capacity data */ #define SD_BUF_SIZE 512 /* * Number of sectors at the end of the device to avoid multi-sector * accesses to in the case of last_sector_bug */ #define SD_LAST_BUGGY_SECTORS 8 enum { SD_EXT_CDB_SIZE = 32, /* Extended CDB size */ SD_MEMPOOL_SIZE = 2, /* CDB pool size */ }; enum { SD_DEF_XFER_BLOCKS = 0xffff, SD_MAX_XFER_BLOCKS = 0xffffffff, SD_MAX_WS10_BLOCKS = 0xffff, SD_MAX_WS16_BLOCKS = 0x7fffff, }; enum { SD_LBP_FULL = 0, /* Full logical block provisioning */ SD_LBP_UNMAP, /* Use UNMAP command */ SD_LBP_WS16, /* Use WRITE SAME(16) with UNMAP bit */ SD_LBP_WS10, /* Use WRITE SAME(10) with UNMAP bit */ SD_LBP_ZERO, /* Use WRITE SAME(10) with zero payload */ SD_LBP_DISABLE, /* Discard disabled due to failed cmd */ }; enum { SD_ZERO_WRITE = 0, /* Use WRITE(10/16) command */ SD_ZERO_WS, /* Use WRITE SAME(10/16) command */ SD_ZERO_WS16_UNMAP, /* Use WRITE SAME(16) with UNMAP */ SD_ZERO_WS10_UNMAP, /* Use WRITE SAME(10) with UNMAP */ }; /** * struct zoned_disk_info - Specific properties of a ZBC SCSI device. * @nr_zones: number of zones. * @zone_blocks: number of logical blocks per zone. * * This data structure holds the ZBC SCSI device properties that are retrieved * twice: a first time before the gendisk capacity is known and a second time * after the gendisk capacity is known. */ struct zoned_disk_info { u32 nr_zones; u32 zone_blocks; }; struct scsi_disk { struct scsi_device *device; /* * disk_dev is used to show attributes in /sys/class/scsi_disk/, * but otherwise not really needed. Do not use for refcounting. */ struct device disk_dev; struct gendisk *disk; struct opal_dev *opal_dev; #ifdef CONFIG_BLK_DEV_ZONED /* Updated during revalidation before the gendisk capacity is known. */ struct zoned_disk_info early_zone_info; /* Updated during revalidation after the gendisk capacity is known. */ struct zoned_disk_info zone_info; u32 zones_optimal_open; u32 zones_optimal_nonseq; u32 zones_max_open; /* * Either zero or a power of two. If not zero it means that the offset * between zone starting LBAs is constant. */ u32 zone_starting_lba_gran; #endif atomic_t openers; sector_t capacity; /* size in logical blocks */ int max_retries; u32 min_xfer_blocks; u32 max_xfer_blocks; u32 opt_xfer_blocks; u32 max_ws_blocks; u32 max_unmap_blocks; u32 unmap_granularity; u32 unmap_alignment; u32 max_atomic; u32 atomic_alignment; u32 atomic_granularity; u32 max_atomic_with_boundary; u32 max_atomic_boundary; u32 index; unsigned int physical_block_size; unsigned int max_medium_access_timeouts; unsigned int medium_access_timed_out; /* number of permanent streams */ u16 permanent_stream_count; u8 media_present; u8 write_prot; u8 protection_type;/* Data Integrity Field */ u8 provisioning_mode; u8 zeroing_mode; u8 nr_actuators; /* Number of actuators */ bool suspended; /* Disk is suspended (stopped) */ unsigned ATO : 1; /* state of disk ATO bit */ unsigned cache_override : 1; /* temp override of WCE,RCD */ unsigned WCE : 1; /* state of disk WCE bit */ unsigned RCD : 1; /* state of disk RCD bit, unused */ unsigned DPOFUA : 1; /* state of disk DPOFUA bit */ unsigned first_scan : 1; unsigned lbpme : 1; unsigned lbprz : 1; unsigned lbpu : 1; unsigned lbpws : 1; unsigned lbpws10 : 1; unsigned lbpvpd : 1; unsigned ws10 : 1; unsigned ws16 : 1; unsigned rc_basis: 2; unsigned zoned: 2; unsigned urswrz : 1; unsigned security : 1; unsigned ignore_medium_access_errors : 1; unsigned rscs : 1; /* reduced stream control support */ unsigned use_atomic_write_boundary : 1; }; #define to_scsi_disk(obj) container_of(obj, struct scsi_disk, disk_dev) static inline struct scsi_disk *scsi_disk(struct gendisk *disk) { return disk->private_data; } #define sd_printk(prefix, sdsk, fmt, a...) \ (sdsk)->disk ? \ sdev_prefix_printk(prefix, (sdsk)->device, \ (sdsk)->disk->disk_name, fmt, ##a) : \ sdev_printk(prefix, (sdsk)->device, fmt, ##a) #define sd_first_printk(prefix, sdsk, fmt, a...) \ do { \ if ((sdsk)->first_scan) \ sd_printk(prefix, sdsk, fmt, ##a); \ } while (0) static inline int scsi_medium_access_command(struct scsi_cmnd *scmd) { switch (scmd->cmnd[0]) { case READ_6: case READ_10: case READ_12: case READ_16: case SYNCHRONIZE_CACHE: case VERIFY: case VERIFY_12: case VERIFY_16: case WRITE_6: case WRITE_10: case WRITE_12: case WRITE_16: case WRITE_SAME: case WRITE_SAME_16: case UNMAP: return 1; case VARIABLE_LENGTH_CMD: switch (scmd->cmnd[9]) { case READ_32: case VERIFY_32: case WRITE_32: case WRITE_SAME_32: return 1; } } return 0; } static inline sector_t logical_to_sectors(struct scsi_device *sdev, sector_t blocks) { return blocks << (ilog2(sdev->sector_size) - 9); } static inline unsigned int logical_to_bytes(struct scsi_device *sdev, sector_t blocks) { return blocks * sdev->sector_size; } static inline sector_t bytes_to_logical(struct scsi_device *sdev, unsigned int bytes) { return bytes >> ilog2(sdev->sector_size); } static inline sector_t sectors_to_logical(struct scsi_device *sdev, sector_t sector) { return sector >> (ilog2(sdev->sector_size) - 9); } void sd_dif_config_host(struct scsi_disk *sdkp, struct queue_limits *lim); #ifdef CONFIG_BLK_DEV_ZONED int sd_zbc_read_zones(struct scsi_disk *sdkp, struct queue_limits *lim, u8 buf[SD_BUF_SIZE]); int sd_zbc_revalidate_zones(struct scsi_disk *sdkp); blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd, unsigned char op, bool all); unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes, struct scsi_sense_hdr *sshdr); int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); #else /* CONFIG_BLK_DEV_ZONED */ static inline int sd_zbc_read_zones(struct scsi_disk *sdkp, struct queue_limits *lim, u8 buf[SD_BUF_SIZE]) { return 0; } static inline int sd_zbc_revalidate_zones(struct scsi_disk *sdkp) { return 0; } static inline blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd, unsigned char op, bool all) { return BLK_STS_TARGET; } static inline unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes, struct scsi_sense_hdr *sshdr) { return good_bytes; } #define sd_zbc_report_zones NULL #endif /* CONFIG_BLK_DEV_ZONED */ void sd_print_sense_hdr(struct scsi_disk *sdkp, struct scsi_sense_hdr *sshdr); void sd_print_result(const struct scsi_disk *sdkp, const char *msg, int result); #endif /* _SCSI_DISK_H */ |
| 1 1 1 1 5 3 1 1 1 46 43 5 43 3 41 2 3 1 31 6 4 25 18 4 3 20 5 2 2 20 13 6 6 25 12 14 14 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 | // SPDX-License-Identifier: GPL-2.0 #include <linux/types.h> #include <net/ip.h> #include <net/tcp.h> #include <net/netlink.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_synproxy.h> #include <net/netfilter/nf_synproxy.h> #include <linux/netfilter/nf_tables.h> #include <linux/netfilter/nf_synproxy.h> struct nft_synproxy { struct nf_synproxy_info info; }; static const struct nla_policy nft_synproxy_policy[NFTA_SYNPROXY_MAX + 1] = { [NFTA_SYNPROXY_MSS] = { .type = NLA_U16 }, [NFTA_SYNPROXY_WSCALE] = { .type = NLA_U8 }, [NFTA_SYNPROXY_FLAGS] = { .type = NLA_U32 }, }; static void nft_synproxy_tcp_options(struct synproxy_options *opts, const struct tcphdr *tcp, struct synproxy_net *snet, struct nf_synproxy_info *info, const struct nft_synproxy *priv) { this_cpu_inc(snet->stats->syn_received); if (tcp->ece && tcp->cwr) opts->options |= NF_SYNPROXY_OPT_ECN; opts->options &= priv->info.options; opts->mss_encode = opts->mss_option; opts->mss_option = info->mss; if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) synproxy_init_timestamp_cookie(info, opts); else opts->options &= ~(NF_SYNPROXY_OPT_WSCALE | NF_SYNPROXY_OPT_SACK_PERM | NF_SYNPROXY_OPT_ECN); } static void nft_synproxy_eval_v4(const struct nft_synproxy *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt, const struct tcphdr *tcp, struct tcphdr *_tcph, struct synproxy_options *opts) { struct nf_synproxy_info info = priv->info; struct net *net = nft_net(pkt); struct synproxy_net *snet = synproxy_pernet(net); struct sk_buff *skb = pkt->skb; if (tcp->syn) { /* Initial SYN from client */ nft_synproxy_tcp_options(opts, tcp, snet, &info, priv); synproxy_send_client_synack(net, skb, tcp, opts); consume_skb(skb); regs->verdict.code = NF_STOLEN; } else if (tcp->ack) { /* ACK from client */ if (synproxy_recv_client_ack(net, skb, tcp, opts, ntohl(tcp->seq))) { consume_skb(skb); regs->verdict.code = NF_STOLEN; } else { regs->verdict.code = NF_DROP; } } } #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) static void nft_synproxy_eval_v6(const struct nft_synproxy *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt, const struct tcphdr *tcp, struct tcphdr *_tcph, struct synproxy_options *opts) { struct nf_synproxy_info info = priv->info; struct net *net = nft_net(pkt); struct synproxy_net *snet = synproxy_pernet(net); struct sk_buff *skb = pkt->skb; if (tcp->syn) { /* Initial SYN from client */ nft_synproxy_tcp_options(opts, tcp, snet, &info, priv); synproxy_send_client_synack_ipv6(net, skb, tcp, opts); consume_skb(skb); regs->verdict.code = NF_STOLEN; } else if (tcp->ack) { /* ACK from client */ if (synproxy_recv_client_ack_ipv6(net, skb, tcp, opts, ntohl(tcp->seq))) { consume_skb(skb); regs->verdict.code = NF_STOLEN; } else { regs->verdict.code = NF_DROP; } } } #endif /* CONFIG_NF_TABLES_IPV6*/ static void nft_synproxy_do_eval(const struct nft_synproxy *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct synproxy_options opts = {}; struct sk_buff *skb = pkt->skb; int thoff = nft_thoff(pkt); const struct tcphdr *tcp; struct tcphdr _tcph; if (pkt->tprot != IPPROTO_TCP) { regs->verdict.code = NFT_BREAK; return; } if (nf_ip_checksum(skb, nft_hook(pkt), thoff, IPPROTO_TCP)) { regs->verdict.code = NF_DROP; return; } tcp = skb_header_pointer(skb, thoff, sizeof(struct tcphdr), &_tcph); if (!tcp) { regs->verdict.code = NF_DROP; return; } if (!synproxy_parse_options(skb, thoff, tcp, &opts)) { regs->verdict.code = NF_DROP; return; } switch (skb->protocol) { case htons(ETH_P_IP): nft_synproxy_eval_v4(priv, regs, pkt, tcp, &_tcph, &opts); return; #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) case htons(ETH_P_IPV6): nft_synproxy_eval_v6(priv, regs, pkt, tcp, &_tcph, &opts); return; #endif } regs->verdict.code = NFT_BREAK; } static int nft_synproxy_do_init(const struct nft_ctx *ctx, const struct nlattr * const tb[], struct nft_synproxy *priv) { struct synproxy_net *snet = synproxy_pernet(ctx->net); u32 flags; int err; if (tb[NFTA_SYNPROXY_MSS]) priv->info.mss = ntohs(nla_get_be16(tb[NFTA_SYNPROXY_MSS])); if (tb[NFTA_SYNPROXY_WSCALE]) priv->info.wscale = nla_get_u8(tb[NFTA_SYNPROXY_WSCALE]); if (tb[NFTA_SYNPROXY_FLAGS]) { flags = ntohl(nla_get_be32(tb[NFTA_SYNPROXY_FLAGS])); if (flags & ~NF_SYNPROXY_OPT_MASK) return -EOPNOTSUPP; priv->info.options = flags; } err = nf_ct_netns_get(ctx->net, ctx->family); if (err) return err; switch (ctx->family) { case NFPROTO_IPV4: err = nf_synproxy_ipv4_init(snet, ctx->net); if (err) goto nf_ct_failure; break; #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) case NFPROTO_IPV6: err = nf_synproxy_ipv6_init(snet, ctx->net); if (err) goto nf_ct_failure; break; #endif case NFPROTO_INET: err = nf_synproxy_ipv4_init(snet, ctx->net); if (err) goto nf_ct_failure; err = nf_synproxy_ipv6_init(snet, ctx->net); if (err) { nf_synproxy_ipv4_fini(snet, ctx->net); goto nf_ct_failure; } break; } return 0; nf_ct_failure: nf_ct_netns_put(ctx->net, ctx->family); return err; } static void nft_synproxy_do_destroy(const struct nft_ctx *ctx) { struct synproxy_net *snet = synproxy_pernet(ctx->net); switch (ctx->family) { case NFPROTO_IPV4: nf_synproxy_ipv4_fini(snet, ctx->net); break; #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) case NFPROTO_IPV6: nf_synproxy_ipv6_fini(snet, ctx->net); break; #endif case NFPROTO_INET: nf_synproxy_ipv4_fini(snet, ctx->net); nf_synproxy_ipv6_fini(snet, ctx->net); break; } nf_ct_netns_put(ctx->net, ctx->family); } static int nft_synproxy_do_dump(struct sk_buff *skb, struct nft_synproxy *priv) { if (nla_put_be16(skb, NFTA_SYNPROXY_MSS, htons(priv->info.mss)) || nla_put_u8(skb, NFTA_SYNPROXY_WSCALE, priv->info.wscale) || nla_put_be32(skb, NFTA_SYNPROXY_FLAGS, htonl(priv->info.options))) goto nla_put_failure; return 0; nla_put_failure: return -1; } static void nft_synproxy_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_synproxy *priv = nft_expr_priv(expr); nft_synproxy_do_eval(priv, regs, pkt); } static int nft_synproxy_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { if (ctx->family != NFPROTO_IPV4 && ctx->family != NFPROTO_IPV6 && ctx->family != NFPROTO_INET) return -EOPNOTSUPP; return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD)); } static int nft_synproxy_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_synproxy *priv = nft_expr_priv(expr); return nft_synproxy_do_init(ctx, tb, priv); } static void nft_synproxy_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { nft_synproxy_do_destroy(ctx); } static int nft_synproxy_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { struct nft_synproxy *priv = nft_expr_priv(expr); return nft_synproxy_do_dump(skb, priv); } static struct nft_expr_type nft_synproxy_type; static const struct nft_expr_ops nft_synproxy_ops = { .eval = nft_synproxy_eval, .size = NFT_EXPR_SIZE(sizeof(struct nft_synproxy)), .init = nft_synproxy_init, .destroy = nft_synproxy_destroy, .dump = nft_synproxy_dump, .type = &nft_synproxy_type, .validate = nft_synproxy_validate, .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_synproxy_type __read_mostly = { .ops = &nft_synproxy_ops, .name = "synproxy", .owner = THIS_MODULE, .policy = nft_synproxy_policy, .maxattr = NFTA_SYNPROXY_MAX, }; static int nft_synproxy_obj_init(const struct nft_ctx *ctx, const struct nlattr * const tb[], struct nft_object *obj) { struct nft_synproxy *priv = nft_obj_data(obj); return nft_synproxy_do_init(ctx, tb, priv); } static void nft_synproxy_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) { nft_synproxy_do_destroy(ctx); } static int nft_synproxy_obj_dump(struct sk_buff *skb, struct nft_object *obj, bool reset) { struct nft_synproxy *priv = nft_obj_data(obj); return nft_synproxy_do_dump(skb, priv); } static void nft_synproxy_obj_eval(struct nft_object *obj, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_synproxy *priv = nft_obj_data(obj); nft_synproxy_do_eval(priv, regs, pkt); } static void nft_synproxy_obj_update(struct nft_object *obj, struct nft_object *newobj) { struct nft_synproxy *newpriv = nft_obj_data(newobj); struct nft_synproxy *priv = nft_obj_data(obj); priv->info = newpriv->info; } static struct nft_object_type nft_synproxy_obj_type; static const struct nft_object_ops nft_synproxy_obj_ops = { .type = &nft_synproxy_obj_type, .size = sizeof(struct nft_synproxy), .init = nft_synproxy_obj_init, .destroy = nft_synproxy_obj_destroy, .dump = nft_synproxy_obj_dump, .eval = nft_synproxy_obj_eval, .update = nft_synproxy_obj_update, }; static struct nft_object_type nft_synproxy_obj_type __read_mostly = { .type = NFT_OBJECT_SYNPROXY, .ops = &nft_synproxy_obj_ops, .maxattr = NFTA_SYNPROXY_MAX, .policy = nft_synproxy_policy, .owner = THIS_MODULE, }; static int __init nft_synproxy_module_init(void) { int err; err = nft_register_obj(&nft_synproxy_obj_type); if (err < 0) return err; err = nft_register_expr(&nft_synproxy_type); if (err < 0) goto err; return 0; err: nft_unregister_obj(&nft_synproxy_obj_type); return err; } static void __exit nft_synproxy_module_exit(void) { nft_unregister_expr(&nft_synproxy_type); nft_unregister_obj(&nft_synproxy_obj_type); } module_init(nft_synproxy_module_init); module_exit(nft_synproxy_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Fernando Fernandez <ffmancera@riseup.net>"); MODULE_ALIAS_NFT_EXPR("synproxy"); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_SYNPROXY); MODULE_DESCRIPTION("nftables SYNPROXY expression support"); |
| 8 10 7 8 8 2 6 39 1 1 37 1 1 2 2 2 2 2 2 4 1 1 2 1 1 83 1 1 1 2 2 1 1 1 1 2 7 2 1 1 1 1 1 4 4 1 1 4 4 4 4 4 4 4 4 138 138 102 48 4 6 5 1 1 1 1 1 1 1 1 1 1 1 1 4 4 1 1 1 1 1 4 2 33 1 16 8 7 1 1 5 133 134 40 17 7 77 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1992 obz under the linux copyright * * Dynamic diacritical handling - aeb@cwi.nl - Dec 1993 * Dynamic keymap and string allocation - aeb@cwi.nl - May 1994 * Restrict VT switching via ioctl() - grif@cs.ucr.edu - Dec 1995 * Some code moved for less code duplication - Andi Kleen - Mar 1997 * Check put/get_user, cleanups - acme@conectiva.com.br - Jun 2001 */ #include <linux/types.h> #include <linux/errno.h> #include <linux/sched/signal.h> #include <linux/tty.h> #include <linux/timer.h> #include <linux/kernel.h> #include <linux/compat.h> #include <linux/module.h> #include <linux/kd.h> #include <linux/vt.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/major.h> #include <linux/fs.h> #include <linux/console.h> #include <linux/consolemap.h> #include <linux/signal.h> #include <linux/suspend.h> #include <linux/timex.h> #include <asm/io.h> #include <linux/uaccess.h> #include <linux/nospec.h> #include <linux/kbd_kern.h> #include <linux/vt_kern.h> #include <linux/kbd_diacr.h> #include <linux/selection.h> bool vt_dont_switch; static inline bool vt_in_use(unsigned int i) { const struct vc_data *vc = vc_cons[i].d; /* * console_lock must be held to prevent the vc from being deallocated * while we're checking whether it's in-use. */ WARN_CONSOLE_UNLOCKED(); return vc && kref_read(&vc->port.kref) > 1; } static inline bool vt_busy(int i) { if (vt_in_use(i)) return true; if (i == fg_console) return true; if (vc_is_sel(vc_cons[i].d)) return true; return false; } /* * Console (vt and kd) routines, as defined by USL SVR4 manual, and by * experimentation and study of X386 SYSV handling. * * One point of difference: SYSV vt's are /dev/vtX, which X >= 0, and * /dev/console is a separate ttyp. Under Linux, /dev/tty0 is /dev/console, * and the vc start at /dev/ttyX, X >= 1. We maintain that here, so we will * always treat our set of vt as numbered 1..MAX_NR_CONSOLES (corresponding to * ttys 0..MAX_NR_CONSOLES-1). Explicitly naming VT 0 is illegal, but using * /dev/tty0 (fg_console) as a target is legal, since an implicit aliasing * to the current console is done by the main ioctl code. */ #ifdef CONFIG_X86 #include <asm/syscalls.h> #endif static void complete_change_console(struct vc_data *vc); /* * User space VT_EVENT handlers */ struct vt_event_wait { struct list_head list; struct vt_event event; int done; }; static LIST_HEAD(vt_events); static DEFINE_SPINLOCK(vt_event_lock); static DECLARE_WAIT_QUEUE_HEAD(vt_event_waitqueue); /** * vt_event_post * @event: the event that occurred * @old: old console * @new: new console * * Post an VT event to interested VT handlers */ void vt_event_post(unsigned int event, unsigned int old, unsigned int new) { struct list_head *pos, *head; unsigned long flags; int wake = 0; spin_lock_irqsave(&vt_event_lock, flags); head = &vt_events; list_for_each(pos, head) { struct vt_event_wait *ve = list_entry(pos, struct vt_event_wait, list); if (!(ve->event.event & event)) continue; ve->event.event = event; /* kernel view is consoles 0..n-1, user space view is console 1..n with 0 meaning current, so we must bias */ ve->event.oldev = old + 1; ve->event.newev = new + 1; wake = 1; ve->done = 1; } spin_unlock_irqrestore(&vt_event_lock, flags); if (wake) wake_up_interruptible(&vt_event_waitqueue); } static void __vt_event_queue(struct vt_event_wait *vw) { unsigned long flags; /* Prepare the event */ INIT_LIST_HEAD(&vw->list); vw->done = 0; /* Queue our event */ spin_lock_irqsave(&vt_event_lock, flags); list_add(&vw->list, &vt_events); spin_unlock_irqrestore(&vt_event_lock, flags); } static void __vt_event_wait(struct vt_event_wait *vw) { /* Wait for it to pass */ wait_event_interruptible(vt_event_waitqueue, vw->done); } static void __vt_event_dequeue(struct vt_event_wait *vw) { unsigned long flags; /* Dequeue it */ spin_lock_irqsave(&vt_event_lock, flags); list_del(&vw->list); spin_unlock_irqrestore(&vt_event_lock, flags); } /** * vt_event_wait - wait for an event * @vw: our event * * Waits for an event to occur which completes our vt_event_wait * structure. On return the structure has wv->done set to 1 for success * or 0 if some event such as a signal ended the wait. */ static void vt_event_wait(struct vt_event_wait *vw) { __vt_event_queue(vw); __vt_event_wait(vw); __vt_event_dequeue(vw); } /** * vt_event_wait_ioctl - event ioctl handler * @event: argument to ioctl (the event) * * Implement the VT_WAITEVENT ioctl using the VT event interface */ static int vt_event_wait_ioctl(struct vt_event __user *event) { struct vt_event_wait vw; if (copy_from_user(&vw.event, event, sizeof(struct vt_event))) return -EFAULT; /* Highest supported event for now */ if (vw.event.event & ~VT_MAX_EVENT) return -EINVAL; vt_event_wait(&vw); /* If it occurred report it */ if (vw.done) { if (copy_to_user(event, &vw.event, sizeof(struct vt_event))) return -EFAULT; return 0; } return -EINTR; } /** * vt_waitactive - active console wait * @n: new console * * Helper for event waits. Used to implement the legacy * event waiting ioctls in terms of events */ int vt_waitactive(int n) { struct vt_event_wait vw; do { vw.event.event = VT_EVENT_SWITCH; __vt_event_queue(&vw); if (n == fg_console + 1) { __vt_event_dequeue(&vw); break; } __vt_event_wait(&vw); __vt_event_dequeue(&vw); if (vw.done == 0) return -EINTR; } while (vw.event.newev != n); return 0; } /* * these are the valid i/o ports we're allowed to change. they map all the * video ports */ #define GPFIRST 0x3b4 #define GPLAST 0x3df #define GPNUM (GPLAST - GPFIRST + 1) /* * currently, setting the mode from KD_TEXT to KD_GRAPHICS doesn't do a whole * lot. i'm not sure if it should do any restoration of modes or what... * * XXX It should at least call into the driver, fbdev's definitely need to * restore their engine state. --BenH * * Called with the console lock held. */ static int vt_kdsetmode(struct vc_data *vc, unsigned long mode) { switch (mode) { case KD_GRAPHICS: break; case KD_TEXT0: case KD_TEXT1: mode = KD_TEXT; fallthrough; case KD_TEXT: break; default: return -EINVAL; } if (vc->vc_mode == mode) return 0; vc->vc_mode = mode; if (vc->vc_num != fg_console) return 0; /* explicitly blank/unblank the screen if switching modes */ if (mode == KD_TEXT) do_unblank_screen(1); else do_blank_screen(1); return 0; } static int vt_k_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg, bool perm) { struct vc_data *vc = tty->driver_data; void __user *up = (void __user *)arg; unsigned int console = vc->vc_num; int ret; switch (cmd) { case KIOCSOUND: if (!perm) return -EPERM; /* * The use of PIT_TICK_RATE is historic, it used to be * the platform-dependent CLOCK_TICK_RATE between 2.6.12 * and 2.6.36, which was a minor but unfortunate ABI * change. kd_mksound is locked by the input layer. */ if (arg) arg = PIT_TICK_RATE / arg; kd_mksound(arg, 0); break; case KDMKTONE: if (!perm) return -EPERM; { unsigned int ticks, count; /* * Generate the tone for the appropriate number of ticks. * If the time is zero, turn off sound ourselves. */ ticks = msecs_to_jiffies((arg >> 16) & 0xffff); count = ticks ? (arg & 0xffff) : 0; if (count) count = PIT_TICK_RATE / count; kd_mksound(count, ticks); break; } case KDGKBTYPE: /* * this is naïve. */ return put_user(KB_101, (char __user *)arg); /* * These cannot be implemented on any machine that implements * ioperm() in user level (such as Alpha PCs) or not at all. * * XXX: you should never use these, just call ioperm directly.. */ #ifdef CONFIG_X86 case KDADDIO: case KDDELIO: /* * KDADDIO and KDDELIO may be able to add ports beyond what * we reject here, but to be safe... * * These are locked internally via sys_ioperm */ if (arg < GPFIRST || arg > GPLAST) return -EINVAL; return ksys_ioperm(arg, 1, (cmd == KDADDIO)) ? -ENXIO : 0; case KDENABIO: case KDDISABIO: return ksys_ioperm(GPFIRST, GPNUM, (cmd == KDENABIO)) ? -ENXIO : 0; #endif /* Linux m68k/i386 interface for setting the keyboard delay/repeat rate */ case KDKBDREP: { struct kbd_repeat kbrep; if (!capable(CAP_SYS_TTY_CONFIG)) return -EPERM; if (copy_from_user(&kbrep, up, sizeof(struct kbd_repeat))) return -EFAULT; ret = kbd_rate(&kbrep); if (ret) return ret; if (copy_to_user(up, &kbrep, sizeof(struct kbd_repeat))) return -EFAULT; break; } case KDSETMODE: { if (!perm) return -EPERM; guard(console_lock)(); return vt_kdsetmode(vc, arg); } case KDGETMODE: return put_user(vc->vc_mode, (int __user *)arg); case KDMAPDISP: case KDUNMAPDISP: /* * these work like a combination of mmap and KDENABIO. * this could be easily finished. */ return -EINVAL; case KDSKBMODE: if (!perm) return -EPERM; ret = vt_do_kdskbmode(console, arg); if (ret) return ret; tty_ldisc_flush(tty); break; case KDGKBMODE: return put_user(vt_do_kdgkbmode(console), (int __user *)arg); /* this could be folded into KDSKBMODE, but for compatibility reasons it is not so easy to fold KDGKBMETA into KDGKBMODE */ case KDSKBMETA: return vt_do_kdskbmeta(console, arg); case KDGKBMETA: /* FIXME: should review whether this is worth locking */ return put_user(vt_do_kdgkbmeta(console), (int __user *)arg); case KDGETKEYCODE: case KDSETKEYCODE: if(!capable(CAP_SYS_TTY_CONFIG)) perm = 0; return vt_do_kbkeycode_ioctl(cmd, up, perm); case KDGKBENT: case KDSKBENT: return vt_do_kdsk_ioctl(cmd, up, perm, console); case KDGKBSENT: case KDSKBSENT: return vt_do_kdgkb_ioctl(cmd, up, perm); /* Diacritical processing. Handled in keyboard.c as it has to operate on the keyboard locks and structures */ case KDGKBDIACR: case KDGKBDIACRUC: case KDSKBDIACR: case KDSKBDIACRUC: return vt_do_diacrit(cmd, up, perm); /* the ioctls below read/set the flags usually shown in the leds */ /* don't use them - they will go away without warning */ case KDGKBLED: case KDSKBLED: case KDGETLED: case KDSETLED: return vt_do_kdskled(console, cmd, arg, perm); /* * A process can indicate its willingness to accept signals * generated by pressing an appropriate key combination. * Thus, one can have a daemon that e.g. spawns a new console * upon a keypress and then changes to it. * See also the kbrequest field of inittab(5). */ case KDSIGACCEPT: if (!perm || !capable(CAP_KILL)) return -EPERM; if (!valid_signal(arg) || arg < 1 || arg == SIGKILL) return -EINVAL; spin_lock_irq(&vt_spawn_con.lock); put_pid(vt_spawn_con.pid); vt_spawn_con.pid = get_pid(task_pid(current)); vt_spawn_con.sig = arg; spin_unlock_irq(&vt_spawn_con.lock); break; case KDFONTOP: { struct console_font_op op; if (copy_from_user(&op, up, sizeof(op))) return -EFAULT; if (!perm && op.op != KD_FONT_OP_GET) return -EPERM; ret = con_font_op(vc, &op); if (ret) return ret; if (copy_to_user(up, &op, sizeof(op))) return -EFAULT; break; } default: return -ENOIOCTLCMD; } return 0; } static inline int do_unimap_ioctl(int cmd, struct unimapdesc __user *user_ud, bool perm, struct vc_data *vc) { struct unimapdesc tmp; if (copy_from_user(&tmp, user_ud, sizeof tmp)) return -EFAULT; switch (cmd) { case PIO_UNIMAP: if (!perm) return -EPERM; return con_set_unimap(vc, tmp.entry_ct, tmp.entries); case GIO_UNIMAP: if (!perm && fg_console != vc->vc_num) return -EPERM; return con_get_unimap(vc, tmp.entry_ct, &(user_ud->entry_ct), tmp.entries); } return 0; } static int vt_io_ioctl(struct vc_data *vc, unsigned int cmd, void __user *up, bool perm) { switch (cmd) { case PIO_CMAP: if (!perm) return -EPERM; return con_set_cmap(up); case GIO_CMAP: return con_get_cmap(up); case PIO_SCRNMAP: if (!perm) return -EPERM; return con_set_trans_old(up); case GIO_SCRNMAP: return con_get_trans_old(up); case PIO_UNISCRNMAP: if (!perm) return -EPERM; return con_set_trans_new(up); case GIO_UNISCRNMAP: return con_get_trans_new(up); case PIO_UNIMAPCLR: if (!perm) return -EPERM; con_clear_unimap(vc); break; case PIO_UNIMAP: case GIO_UNIMAP: return do_unimap_ioctl(cmd, up, perm, vc); default: return -ENOIOCTLCMD; } return 0; } static int vt_reldisp(struct vc_data *vc, unsigned int swtch) { int newvt, ret; if (vc->vt_mode.mode != VT_PROCESS) return -EINVAL; /* Switched-to response */ if (vc->vt_newvt < 0) { /* If it's just an ACK, ignore it */ return swtch == VT_ACKACQ ? 0 : -EINVAL; } /* Switching-from response */ if (swtch == 0) { /* Switch disallowed, so forget we were trying to do it. */ vc->vt_newvt = -1; return 0; } /* The current vt has been released, so complete the switch. */ newvt = vc->vt_newvt; vc->vt_newvt = -1; ret = vc_allocate(newvt); if (ret) return ret; /* * When we actually do the console switch, make sure we are atomic with * respect to other console switches.. */ complete_change_console(vc_cons[newvt].d); return 0; } static int vt_setactivate(struct vt_setactivate __user *sa) { struct vt_setactivate vsa; struct vc_data *nvc; int ret; if (copy_from_user(&vsa, sa, sizeof(vsa))) return -EFAULT; if (vsa.console == 0 || vsa.console > MAX_NR_CONSOLES) return -ENXIO; vsa.console--; vsa.console = array_index_nospec(vsa.console, MAX_NR_CONSOLES); scoped_guard(console_lock) { ret = vc_allocate(vsa.console); if (ret) return ret; /* * This is safe providing we don't drop the console sem between * vc_allocate and finishing referencing nvc. */ nvc = vc_cons[vsa.console].d; nvc->vt_mode = vsa.mode; nvc->vt_mode.frsig = 0; put_pid(nvc->vt_pid); nvc->vt_pid = get_pid(task_pid(current)); } /* Commence switch and lock */ /* Review set_console locks */ set_console(vsa.console); return 0; } /* deallocate a single console, if possible (leave 0) */ static int vt_disallocate(unsigned int vc_num) { struct vc_data *vc = NULL; scoped_guard(console_lock) { if (vt_busy(vc_num)) return -EBUSY; if (vc_num) vc = vc_deallocate(vc_num); } if (vc && vc_num >= MIN_NR_CONSOLES) tty_port_put(&vc->port); return 0; } /* deallocate all unused consoles, but leave 0 */ static void vt_disallocate_all(void) { struct vc_data *vc[MAX_NR_CONSOLES]; int i; scoped_guard(console_lock) for (i = 1; i < MAX_NR_CONSOLES; i++) if (!vt_busy(i)) vc[i] = vc_deallocate(i); else vc[i] = NULL; for (i = 1; i < MAX_NR_CONSOLES; i++) { if (vc[i] && i >= MIN_NR_CONSOLES) tty_port_put(&vc[i]->port); } } static int vt_resizex(struct vc_data *vc, struct vt_consize __user *cs) { struct vt_consize v; int i; if (copy_from_user(&v, cs, sizeof(struct vt_consize))) return -EFAULT; /* FIXME: Should check the copies properly */ if (!v.v_vlin) v.v_vlin = vc->vc_scan_lines; if (v.v_clin) { int rows = v.v_vlin / v.v_clin; if (v.v_rows != rows) { if (v.v_rows) /* Parameters don't add up */ return -EINVAL; v.v_rows = rows; } } if (v.v_vcol && v.v_ccol) { int cols = v.v_vcol / v.v_ccol; if (v.v_cols != cols) { if (v.v_cols) return -EINVAL; v.v_cols = cols; } } if (v.v_clin > 32) return -EINVAL; for (i = 0; i < MAX_NR_CONSOLES; i++) { struct vc_data *vcp; if (!vc_cons[i].d) continue; guard(console_lock)(); vcp = vc_cons[i].d; if (vcp) { int ret; int save_scan_lines = vcp->vc_scan_lines; int save_cell_height = vcp->vc_cell_height; if (v.v_vlin) vcp->vc_scan_lines = v.v_vlin; if (v.v_clin) vcp->vc_cell_height = v.v_clin; ret = __vc_resize(vcp, v.v_cols, v.v_rows, true); if (ret) { vcp->vc_scan_lines = save_scan_lines; vcp->vc_cell_height = save_cell_height; return ret; } } } return 0; } /* * We handle the console-specific ioctl's here. We allow the * capability to modify any console, not just the fg_console. */ int vt_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { struct vc_data *vc = tty->driver_data; void __user *up = (void __user *)arg; int i, perm; int ret; /* * To have permissions to do most of the vt ioctls, we either have * to be the owner of the tty, or have CAP_SYS_TTY_CONFIG. */ perm = 0; if (current->signal->tty == tty || capable(CAP_SYS_TTY_CONFIG)) perm = 1; ret = vt_k_ioctl(tty, cmd, arg, perm); if (ret != -ENOIOCTLCMD) return ret; ret = vt_io_ioctl(vc, cmd, up, perm); if (ret != -ENOIOCTLCMD) return ret; switch (cmd) { case TIOCLINUX: return tioclinux(tty, arg); case VT_SETMODE: { struct vt_mode tmp; if (!perm) return -EPERM; if (copy_from_user(&tmp, up, sizeof(struct vt_mode))) return -EFAULT; if (tmp.mode != VT_AUTO && tmp.mode != VT_PROCESS) return -EINVAL; guard(console_lock)(); vc->vt_mode = tmp; /* the frsig is ignored, so we set it to 0 */ vc->vt_mode.frsig = 0; put_pid(vc->vt_pid); vc->vt_pid = get_pid(task_pid(current)); /* no switch is required -- saw@shade.msu.ru */ vc->vt_newvt = -1; break; } case VT_GETMODE: { struct vt_mode tmp; int rc; scoped_guard(console_lock) memcpy(&tmp, &vc->vt_mode, sizeof(struct vt_mode)); rc = copy_to_user(up, &tmp, sizeof(struct vt_mode)); if (rc) return -EFAULT; break; } /* * Returns global vt state. Note that VT 0 is always open, since * it's an alias for the current VT, and people can't use it here. * We cannot return state for more than 16 VTs, since v_state is short. */ case VT_GETSTATE: { struct vt_stat __user *vtstat = up; unsigned short state, mask; if (put_user(fg_console + 1, &vtstat->v_active)) return -EFAULT; state = 1; /* /dev/tty0 is always open */ scoped_guard(console_lock) /* required by vt_in_use() */ for (i = 0, mask = 2; i < MAX_NR_CONSOLES && mask; ++i, mask <<= 1) if (vt_in_use(i)) state |= mask; return put_user(state, &vtstat->v_state); } /* * Returns the first available (non-opened) console. */ case VT_OPENQRY: scoped_guard(console_lock) /* required by vt_in_use() */ for (i = 0; i < MAX_NR_CONSOLES; ++i) if (!vt_in_use(i)) break; i = i < MAX_NR_CONSOLES ? (i+1) : -1; return put_user(i, (int __user *)arg); /* * ioctl(fd, VT_ACTIVATE, num) will cause us to switch to vt # num, * with num >= 1 (switches to vt 0, our console, are not allowed, just * to preserve sanity). */ case VT_ACTIVATE: if (!perm) return -EPERM; if (arg == 0 || arg > MAX_NR_CONSOLES) return -ENXIO; arg--; arg = array_index_nospec(arg, MAX_NR_CONSOLES); scoped_guard(console_lock) { ret = vc_allocate(arg); if (ret) return ret; } set_console(arg); break; case VT_SETACTIVATE: if (!perm) return -EPERM; return vt_setactivate(up); /* * wait until the specified VT has been activated */ case VT_WAITACTIVE: if (!perm) return -EPERM; if (arg == 0 || arg > MAX_NR_CONSOLES) return -ENXIO; return vt_waitactive(arg); /* * If a vt is under process control, the kernel will not switch to it * immediately, but postpone the operation until the process calls this * ioctl, allowing the switch to complete. * * According to the X sources this is the behavior: * 0: pending switch-from not OK * 1: pending switch-from OK * 2: completed switch-to OK */ case VT_RELDISP: { if (!perm) return -EPERM; guard(console_lock)(); return vt_reldisp(vc, arg); } /* * Disallocate memory associated to VT (but leave VT1) */ case VT_DISALLOCATE: if (arg > MAX_NR_CONSOLES) return -ENXIO; if (arg == 0) { vt_disallocate_all(); break; } arg = array_index_nospec(arg - 1, MAX_NR_CONSOLES); return vt_disallocate(arg); case VT_RESIZE: { struct vt_sizes __user *vtsizes = up; struct vc_data *vc; ushort ll,cc; if (!perm) return -EPERM; if (get_user(ll, &vtsizes->v_rows) || get_user(cc, &vtsizes->v_cols)) return -EFAULT; guard(console_lock)(); for (i = 0; i < MAX_NR_CONSOLES; i++) { vc = vc_cons[i].d; if (vc) { /* FIXME: review v tty lock */ ret = __vc_resize(vc_cons[i].d, cc, ll, true); if (ret) return ret; } } break; } case VT_RESIZEX: if (!perm) return -EPERM; return vt_resizex(vc, up); case VT_LOCKSWITCH: if (!capable(CAP_SYS_TTY_CONFIG)) return -EPERM; vt_dont_switch = true; break; case VT_UNLOCKSWITCH: if (!capable(CAP_SYS_TTY_CONFIG)) return -EPERM; vt_dont_switch = false; break; case VT_GETHIFONTMASK: return put_user(vc->vc_hi_font_mask, (unsigned short __user *)arg); case VT_WAITEVENT: return vt_event_wait_ioctl((struct vt_event __user *)arg); case VT_GETCONSIZECSRPOS: { struct vt_consizecsrpos concsr; console_lock(); concsr.con_cols = vc->vc_cols; concsr.con_rows = vc->vc_rows; concsr.csr_col = vc->state.x; concsr.csr_row = vc->state.y; console_unlock(); if (copy_to_user(up, &concsr, sizeof(concsr))) return -EFAULT; return 0; } default: return -ENOIOCTLCMD; } return 0; } void reset_vc(struct vc_data *vc) { vc->vc_mode = KD_TEXT; vt_reset_unicode(vc->vc_num); vc->vt_mode.mode = VT_AUTO; vc->vt_mode.waitv = 0; vc->vt_mode.relsig = 0; vc->vt_mode.acqsig = 0; vc->vt_mode.frsig = 0; put_pid(vc->vt_pid); vc->vt_pid = NULL; vc->vt_newvt = -1; reset_palette(vc); } void vc_SAK(struct work_struct *work) { struct vc *vc_con = container_of(work, struct vc, SAK_work); struct vc_data *vc; struct tty_struct *tty; guard(console_lock)(); vc = vc_con->d; if (!vc) return; /* FIXME: review tty ref counting */ tty = vc->port.tty; /* SAK should also work in all raw modes and reset them properly. */ if (tty) __do_SAK(tty); reset_vc(vc); } #ifdef CONFIG_COMPAT struct compat_console_font_op { compat_uint_t op; /* operation code KD_FONT_OP_* */ compat_uint_t flags; /* KD_FONT_FLAG_* */ compat_uint_t width, height; /* font size */ compat_uint_t charcount; compat_caddr_t data; /* font data with height fixed to 32 */ }; static inline int compat_kdfontop_ioctl(struct compat_console_font_op __user *fontop, int perm, struct console_font_op *op, struct vc_data *vc) { int i; if (copy_from_user(op, fontop, sizeof(struct compat_console_font_op))) return -EFAULT; if (!perm && op->op != KD_FONT_OP_GET) return -EPERM; op->data = compat_ptr(((struct compat_console_font_op *)op)->data); i = con_font_op(vc, op); if (i) return i; ((struct compat_console_font_op *)op)->data = (unsigned long)op->data; if (copy_to_user(fontop, op, sizeof(struct compat_console_font_op))) return -EFAULT; return 0; } struct compat_unimapdesc { unsigned short entry_ct; compat_caddr_t entries; }; static inline int compat_unimap_ioctl(unsigned int cmd, struct compat_unimapdesc __user *user_ud, int perm, struct vc_data *vc) { struct compat_unimapdesc tmp; struct unipair __user *tmp_entries; if (copy_from_user(&tmp, user_ud, sizeof tmp)) return -EFAULT; tmp_entries = compat_ptr(tmp.entries); switch (cmd) { case PIO_UNIMAP: if (!perm) return -EPERM; return con_set_unimap(vc, tmp.entry_ct, tmp_entries); case GIO_UNIMAP: if (!perm && fg_console != vc->vc_num) return -EPERM; return con_get_unimap(vc, tmp.entry_ct, &(user_ud->entry_ct), tmp_entries); } return 0; } long vt_compat_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { struct vc_data *vc = tty->driver_data; struct console_font_op op; /* used in multiple places here */ void __user *up = compat_ptr(arg); int perm; /* * To have permissions to do most of the vt ioctls, we either have * to be the owner of the tty, or have CAP_SYS_TTY_CONFIG. */ perm = 0; if (current->signal->tty == tty || capable(CAP_SYS_TTY_CONFIG)) perm = 1; switch (cmd) { /* * these need special handlers for incompatible data structures */ case KDFONTOP: return compat_kdfontop_ioctl(up, perm, &op, vc); case PIO_UNIMAP: case GIO_UNIMAP: return compat_unimap_ioctl(cmd, up, perm, vc); /* * all these treat 'arg' as an integer */ case KIOCSOUND: case KDMKTONE: #ifdef CONFIG_X86 case KDADDIO: case KDDELIO: #endif case KDSETMODE: case KDMAPDISP: case KDUNMAPDISP: case KDSKBMODE: case KDSKBMETA: case KDSKBLED: case KDSETLED: case KDSIGACCEPT: case VT_ACTIVATE: case VT_WAITACTIVE: case VT_RELDISP: case VT_DISALLOCATE: return vt_ioctl(tty, cmd, arg); /* * the rest has a compatible data structure behind arg, * but we have to convert it to a proper 64 bit pointer. */ default: return vt_ioctl(tty, cmd, (unsigned long)up); } } #endif /* CONFIG_COMPAT */ /* * Performs the back end of a vt switch. Called under the console * semaphore. */ static void complete_change_console(struct vc_data *vc) { unsigned char old_vc_mode; int old = fg_console; last_console = fg_console; /* * If we're switching, we could be going from KD_GRAPHICS to * KD_TEXT mode or vice versa, which means we need to blank or * unblank the screen later. */ old_vc_mode = vc_cons[fg_console].d->vc_mode; switch_screen(vc); /* * This can't appear below a successful kill_pid(). If it did, * then the *blank_screen operation could occur while X, having * received acqsig, is waking up on another processor. This * condition can lead to overlapping accesses to the VGA range * and the framebuffer (causing system lockups). * * To account for this we duplicate this code below only if the * controlling process is gone and we've called reset_vc. */ if (old_vc_mode != vc->vc_mode) { if (vc->vc_mode == KD_TEXT) do_unblank_screen(1); else do_blank_screen(1); } /* * If this new console is under process control, send it a signal * telling it that it has acquired. Also check if it has died and * clean up (similar to logic employed in change_console()) */ if (vc->vt_mode.mode == VT_PROCESS) { /* * Send the signal as privileged - kill_pid() will * tell us if the process has gone or something else * is awry */ if (kill_pid(vc->vt_pid, vc->vt_mode.acqsig, 1) != 0) { /* * The controlling process has died, so we revert back to * normal operation. In this case, we'll also change back * to KD_TEXT mode. I'm not sure if this is strictly correct * but it saves the agony when the X server dies and the screen * remains blanked due to KD_GRAPHICS! It would be nice to do * this outside of VT_PROCESS but there is no single process * to account for and tracking tty count may be undesirable. */ reset_vc(vc); if (old_vc_mode != vc->vc_mode) { if (vc->vc_mode == KD_TEXT) do_unblank_screen(1); else do_blank_screen(1); } } } /* * Wake anyone waiting for their VT to activate */ vt_event_post(VT_EVENT_SWITCH, old, vc->vc_num); return; } /* * Performs the front-end of a vt switch */ void change_console(struct vc_data *new_vc) { struct vc_data *vc; if (!new_vc || new_vc->vc_num == fg_console || vt_dont_switch) return; /* * If this vt is in process mode, then we need to handshake with * that process before switching. Essentially, we store where that * vt wants to switch to and wait for it to tell us when it's done * (via VT_RELDISP ioctl). * * We also check to see if the controlling process still exists. * If it doesn't, we reset this vt to auto mode and continue. * This is a cheap way to track process control. The worst thing * that can happen is: we send a signal to a process, it dies, and * the switch gets "lost" waiting for a response; hopefully, the * user will try again, we'll detect the process is gone (unless * the user waits just the right amount of time :-) and revert the * vt to auto control. */ vc = vc_cons[fg_console].d; if (vc->vt_mode.mode == VT_PROCESS) { /* * Send the signal as privileged - kill_pid() will * tell us if the process has gone or something else * is awry. * * We need to set vt_newvt *before* sending the signal or we * have a race. */ vc->vt_newvt = new_vc->vc_num; if (kill_pid(vc->vt_pid, vc->vt_mode.relsig, 1) == 0) { /* * It worked. Mark the vt to switch to and * return. The process needs to send us a * VT_RELDISP ioctl to complete the switch. */ return; } /* * The controlling process has died, so we revert back to * normal operation. In this case, we'll also change back * to KD_TEXT mode. I'm not sure if this is strictly correct * but it saves the agony when the X server dies and the screen * remains blanked due to KD_GRAPHICS! It would be nice to do * this outside of VT_PROCESS but there is no single process * to account for and tracking tty count may be undesirable. */ reset_vc(vc); /* * Fall through to normal (VT_AUTO) handling of the switch... */ } /* * Ignore all switches in KD_GRAPHICS+VT_AUTO mode */ if (vc->vc_mode == KD_GRAPHICS) return; complete_change_console(new_vc); } /* Perform a kernel triggered VT switch for suspend/resume */ static int disable_vt_switch; int vt_move_to_console(unsigned int vt, int alloc) { int prev; scoped_guard(console_lock) { /* Graphics mode - up to X */ if (disable_vt_switch) return 0; prev = fg_console; if (alloc && vc_allocate(vt)) { /* * We can't have a free VC for now. Too bad, we don't want to mess the * screen for now. */ return -ENOSPC; } if (set_console(vt)) { /* * We're unable to switch to the SUSPEND_CONSOLE. Let the calling function * know so it can decide what to do. */ return -EIO; } } if (vt_waitactive(vt + 1)) { pr_debug("Suspend: Can't switch VCs."); return -EINTR; } return prev; } /* * Normally during a suspend, we allocate a new console and switch to it. * When we resume, we switch back to the original console. This switch * can be slow, so on systems where the framebuffer can handle restoration * of video registers anyways, there's little point in doing the console * switch. This function allows you to disable it by passing it '0'. */ void pm_set_vt_switch(int do_switch) { guard(console_lock)(); disable_vt_switch = !do_switch; } EXPORT_SYMBOL(pm_set_vt_switch); |
| 11 1 5 3 2 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 | // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ #include <linux/bpf.h> #include <linux/btf_ids.h> #include <linux/filter.h> #include <linux/errno.h> #include <linux/file.h> #include <linux/net.h> #include <linux/workqueue.h> #include <linux/skmsg.h> #include <linux/list.h> #include <linux/jhash.h> #include <linux/sock_diag.h> #include <net/udp.h> struct bpf_stab { struct bpf_map map; struct sock **sks; struct sk_psock_progs progs; spinlock_t lock; }; #define SOCK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) /* This mutex is used to * - protect race between prog/link attach/detach and link prog update, and * - protect race between releasing and accessing map in bpf_link. * A single global mutex lock is used since it is expected contention is low. */ static DEFINE_MUTEX(sockmap_mutex); static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, struct bpf_prog *old, struct bpf_link *link, u32 which); static struct sk_psock_progs *sock_map_progs(struct bpf_map *map); static struct bpf_map *sock_map_alloc(union bpf_attr *attr) { struct bpf_stab *stab; if (attr->max_entries == 0 || attr->key_size != 4 || (attr->value_size != sizeof(u32) && attr->value_size != sizeof(u64)) || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); stab = bpf_map_area_alloc(sizeof(*stab), NUMA_NO_NODE); if (!stab) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&stab->map, attr); spin_lock_init(&stab->lock); stab->sks = bpf_map_area_alloc((u64) stab->map.max_entries * sizeof(struct sock *), stab->map.numa_node); if (!stab->sks) { bpf_map_area_free(stab); return ERR_PTR(-ENOMEM); } return &stab->map; } int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_map *map; int ret; if (attr->attach_flags || attr->replace_bpf_fd) return -EINVAL; CLASS(fd, f)(attr->target_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); mutex_lock(&sockmap_mutex); ret = sock_map_prog_update(map, prog, NULL, NULL, attr->attach_type); mutex_unlock(&sockmap_mutex); return ret; } int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) { struct bpf_prog *prog; struct bpf_map *map; int ret; if (attr->attach_flags || attr->replace_bpf_fd) return -EINVAL; CLASS(fd, f)(attr->target_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); prog = bpf_prog_get(attr->attach_bpf_fd); if (IS_ERR(prog)) return PTR_ERR(prog); if (prog->type != ptype) { ret = -EINVAL; goto put_prog; } mutex_lock(&sockmap_mutex); ret = sock_map_prog_update(map, NULL, prog, NULL, attr->attach_type); mutex_unlock(&sockmap_mutex); put_prog: bpf_prog_put(prog); return ret; } static void sock_map_sk_acquire(struct sock *sk) __acquires(&sk->sk_lock.slock) { lock_sock(sk); rcu_read_lock(); } static void sock_map_sk_release(struct sock *sk) __releases(&sk->sk_lock.slock) { rcu_read_unlock(); release_sock(sk); } static void sock_map_add_link(struct sk_psock *psock, struct sk_psock_link *link, struct bpf_map *map, void *link_raw) { link->link_raw = link_raw; link->map = map; spin_lock_bh(&psock->link_lock); list_add_tail(&link->list, &psock->link); spin_unlock_bh(&psock->link_lock); } static void sock_map_del_link(struct sock *sk, struct sk_psock *psock, void *link_raw) { bool strp_stop = false, verdict_stop = false; struct sk_psock_link *link, *tmp; spin_lock_bh(&psock->link_lock); list_for_each_entry_safe(link, tmp, &psock->link, list) { if (link->link_raw == link_raw) { struct bpf_map *map = link->map; struct sk_psock_progs *progs = sock_map_progs(map); if (psock->saved_data_ready && progs->stream_parser) strp_stop = true; if (psock->saved_data_ready && progs->stream_verdict) verdict_stop = true; if (psock->saved_data_ready && progs->skb_verdict) verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); break; } } spin_unlock_bh(&psock->link_lock); if (strp_stop || verdict_stop) { write_lock_bh(&sk->sk_callback_lock); if (strp_stop) sk_psock_stop_strp(sk, psock); if (verdict_stop) sk_psock_stop_verdict(sk, psock); if (psock->psock_update_sk_prot) psock->psock_update_sk_prot(sk, psock, false); write_unlock_bh(&sk->sk_callback_lock); } } static void sock_map_unref(struct sock *sk, void *link_raw) { struct sk_psock *psock = sk_psock(sk); if (likely(psock)) { sock_map_del_link(sk, psock, link_raw); sk_psock_put(sk, psock); } } static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock) { if (!sk->sk_prot->psock_update_sk_prot) return -EINVAL; psock->psock_update_sk_prot = sk->sk_prot->psock_update_sk_prot; return sk->sk_prot->psock_update_sk_prot(sk, psock, false); } static struct sk_psock *sock_map_psock_get_checked(struct sock *sk) { struct sk_psock *psock; rcu_read_lock(); psock = sk_psock(sk); if (psock) { if (sk->sk_prot->close != sock_map_close) { psock = ERR_PTR(-EBUSY); goto out; } if (!refcount_inc_not_zero(&psock->refcnt)) psock = ERR_PTR(-EBUSY); } out: rcu_read_unlock(); return psock; } static int sock_map_link(struct bpf_map *map, struct sock *sk) { struct sk_psock_progs *progs = sock_map_progs(map); struct bpf_prog *stream_verdict = NULL; struct bpf_prog *stream_parser = NULL; struct bpf_prog *skb_verdict = NULL; struct bpf_prog *msg_parser = NULL; struct sk_psock *psock; int ret; stream_verdict = READ_ONCE(progs->stream_verdict); if (stream_verdict) { stream_verdict = bpf_prog_inc_not_zero(stream_verdict); if (IS_ERR(stream_verdict)) return PTR_ERR(stream_verdict); } stream_parser = READ_ONCE(progs->stream_parser); if (stream_parser) { stream_parser = bpf_prog_inc_not_zero(stream_parser); if (IS_ERR(stream_parser)) { ret = PTR_ERR(stream_parser); goto out_put_stream_verdict; } } msg_parser = READ_ONCE(progs->msg_parser); if (msg_parser) { msg_parser = bpf_prog_inc_not_zero(msg_parser); if (IS_ERR(msg_parser)) { ret = PTR_ERR(msg_parser); goto out_put_stream_parser; } } skb_verdict = READ_ONCE(progs->skb_verdict); if (skb_verdict) { skb_verdict = bpf_prog_inc_not_zero(skb_verdict); if (IS_ERR(skb_verdict)) { ret = PTR_ERR(skb_verdict); goto out_put_msg_parser; } } psock = sock_map_psock_get_checked(sk); if (IS_ERR(psock)) { ret = PTR_ERR(psock); goto out_progs; } if (psock) { if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || (stream_parser && READ_ONCE(psock->progs.stream_parser)) || (skb_verdict && READ_ONCE(psock->progs.skb_verdict)) || (skb_verdict && READ_ONCE(psock->progs.stream_verdict)) || (stream_verdict && READ_ONCE(psock->progs.skb_verdict)) || (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) { sk_psock_put(sk, psock); ret = -EBUSY; goto out_progs; } } else { psock = sk_psock_init(sk, map->numa_node); if (IS_ERR(psock)) { ret = PTR_ERR(psock); goto out_progs; } } if (msg_parser) psock_set_prog(&psock->progs.msg_parser, msg_parser); if (stream_parser) psock_set_prog(&psock->progs.stream_parser, stream_parser); if (stream_verdict) psock_set_prog(&psock->progs.stream_verdict, stream_verdict); if (skb_verdict) psock_set_prog(&psock->progs.skb_verdict, skb_verdict); /* msg_* and stream_* programs references tracked in psock after this * point. Reference dec and cleanup will occur through psock destructor */ ret = sock_map_init_proto(sk, psock); if (ret < 0) { sk_psock_put(sk, psock); goto out; } write_lock_bh(&sk->sk_callback_lock); if (stream_parser && stream_verdict && !psock->saved_data_ready) { if (sk_is_tcp(sk)) ret = sk_psock_init_strp(sk, psock); else ret = -EOPNOTSUPP; if (ret) { write_unlock_bh(&sk->sk_callback_lock); sk_psock_put(sk, psock); goto out; } sk_psock_start_strp(sk, psock); } else if (!stream_parser && stream_verdict && !psock->saved_data_ready) { sk_psock_start_verdict(sk,psock); } else if (!stream_verdict && skb_verdict && !psock->saved_data_ready) { sk_psock_start_verdict(sk, psock); } write_unlock_bh(&sk->sk_callback_lock); return 0; out_progs: if (skb_verdict) bpf_prog_put(skb_verdict); out_put_msg_parser: if (msg_parser) bpf_prog_put(msg_parser); out_put_stream_parser: if (stream_parser) bpf_prog_put(stream_parser); out_put_stream_verdict: if (stream_verdict) bpf_prog_put(stream_verdict); out: return ret; } static void sock_map_free(struct bpf_map *map) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); int i; /* After the sync no updates or deletes will be in-flight so it * is safe to walk map and remove entries without risking a race * in EEXIST update case. */ synchronize_rcu(); for (i = 0; i < stab->map.max_entries; i++) { struct sock **psk = &stab->sks[i]; struct sock *sk; sk = xchg(psk, NULL); if (sk) { sock_hold(sk); lock_sock(sk); rcu_read_lock(); sock_map_unref(sk, psk); rcu_read_unlock(); release_sock(sk); sock_put(sk); } } /* wait for psock readers accessing its map link */ synchronize_rcu(); bpf_map_area_free(stab->sks); bpf_map_area_free(stab); } static void sock_map_release_progs(struct bpf_map *map) { psock_progs_drop(&container_of(map, struct bpf_stab, map)->progs); } static struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); WARN_ON_ONCE(!rcu_read_lock_held()); if (unlikely(key >= map->max_entries)) return NULL; return READ_ONCE(stab->sks[key]); } static void *sock_map_lookup(struct bpf_map *map, void *key) { struct sock *sk; sk = __sock_map_lookup_elem(map, *(u32 *)key); if (!sk) return NULL; if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) return NULL; return sk; } static void *sock_map_lookup_sys(struct bpf_map *map, void *key) { struct sock *sk; if (map->value_size != sizeof(u64)) return ERR_PTR(-ENOSPC); sk = __sock_map_lookup_elem(map, *(u32 *)key); if (!sk) return ERR_PTR(-ENOENT); __sock_gen_cookie(sk); return &sk->sk_cookie; } static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, struct sock **psk) { struct sock *sk = NULL; int err = 0; spin_lock_bh(&stab->lock); if (!sk_test || sk_test == *psk) sk = xchg(psk, NULL); if (likely(sk)) sock_map_unref(sk, psk); else err = -EINVAL; spin_unlock_bh(&stab->lock); return err; } static void sock_map_delete_from_link(struct bpf_map *map, struct sock *sk, void *link_raw) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); __sock_map_delete(stab, sk, link_raw); } static long sock_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); u32 i = *(u32 *)key; struct sock **psk; if (unlikely(i >= map->max_entries)) return -EINVAL; psk = &stab->sks[i]; return __sock_map_delete(stab, NULL, psk); } static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); u32 i = key ? *(u32 *)key : U32_MAX; u32 *key_next = next; if (i == stab->map.max_entries - 1) return -ENOENT; if (i >= stab->map.max_entries) *key_next = 0; else *key_next = i + 1; return 0; } static int sock_map_update_common(struct bpf_map *map, u32 idx, struct sock *sk, u64 flags) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); struct sk_psock_link *link; struct sk_psock *psock; struct sock *osk; int ret; WARN_ON_ONCE(!rcu_read_lock_held()); if (unlikely(flags > BPF_EXIST)) return -EINVAL; if (unlikely(idx >= map->max_entries)) return -E2BIG; link = sk_psock_init_link(); if (!link) return -ENOMEM; ret = sock_map_link(map, sk); if (ret < 0) goto out_free; psock = sk_psock(sk); WARN_ON_ONCE(!psock); spin_lock_bh(&stab->lock); osk = stab->sks[idx]; if (osk && flags == BPF_NOEXIST) { ret = -EEXIST; goto out_unlock; } else if (!osk && flags == BPF_EXIST) { ret = -ENOENT; goto out_unlock; } sock_map_add_link(psock, link, map, &stab->sks[idx]); stab->sks[idx] = sk; if (osk) sock_map_unref(osk, &stab->sks[idx]); spin_unlock_bh(&stab->lock); return 0; out_unlock: spin_unlock_bh(&stab->lock); if (psock) sk_psock_put(sk, psock); out_free: sk_psock_free_link(link); return ret; } static bool sock_map_op_okay(const struct bpf_sock_ops_kern *ops) { return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB || ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB || ops->op == BPF_SOCK_OPS_TCP_LISTEN_CB; } static bool sock_map_redirect_allowed(const struct sock *sk) { if (sk_is_tcp(sk)) return sk->sk_state != TCP_LISTEN; else return sk->sk_state == TCP_ESTABLISHED; } static bool sock_map_sk_is_suitable(const struct sock *sk) { return !!sk->sk_prot->psock_update_sk_prot; } static bool sock_map_sk_state_allowed(const struct sock *sk) { if (sk_is_tcp(sk)) return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN); if (sk_is_stream_unix(sk)) return (1 << sk->sk_state) & TCPF_ESTABLISHED; if (sk_is_vsock(sk) && (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) return (1 << sk->sk_state) & TCPF_ESTABLISHED; return true; } static int sock_hash_update_common(struct bpf_map *map, void *key, struct sock *sk, u64 flags); int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags) { struct socket *sock; struct sock *sk; int ret; u64 ufd; if (map->value_size == sizeof(u64)) ufd = *(u64 *)value; else ufd = *(u32 *)value; if (ufd > S32_MAX) return -EINVAL; sock = sockfd_lookup(ufd, &ret); if (!sock) return ret; sk = sock->sk; if (!sk) { ret = -EINVAL; goto out; } if (!sock_map_sk_is_suitable(sk)) { ret = -EOPNOTSUPP; goto out; } sock_map_sk_acquire(sk); if (!sock_map_sk_state_allowed(sk)) ret = -EOPNOTSUPP; else if (map->map_type == BPF_MAP_TYPE_SOCKMAP) ret = sock_map_update_common(map, *(u32 *)key, sk, flags); else ret = sock_hash_update_common(map, key, sk, flags); sock_map_sk_release(sk); out: sockfd_put(sock); return ret; } static long sock_map_update_elem(struct bpf_map *map, void *key, void *value, u64 flags) { struct sock *sk = (struct sock *)value; int ret; if (unlikely(!sk || !sk_fullsock(sk))) return -EINVAL; if (!sock_map_sk_is_suitable(sk)) return -EOPNOTSUPP; local_bh_disable(); bh_lock_sock(sk); if (!sock_map_sk_state_allowed(sk)) ret = -EOPNOTSUPP; else if (map->map_type == BPF_MAP_TYPE_SOCKMAP) ret = sock_map_update_common(map, *(u32 *)key, sk, flags); else ret = sock_hash_update_common(map, key, sk, flags); bh_unlock_sock(sk); local_bh_enable(); return ret; } BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, sops, struct bpf_map *, map, void *, key, u64, flags) { WARN_ON_ONCE(!rcu_read_lock_held()); if (likely(sock_map_sk_is_suitable(sops->sk) && sock_map_op_okay(sops))) return sock_map_update_common(map, *(u32 *)key, sops->sk, flags); return -EOPNOTSUPP; } const struct bpf_func_proto bpf_sock_map_update_proto = { .func = bpf_sock_map_update, .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, struct bpf_map *, map, u32, key, u64, flags) { struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; sk = __sock_map_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; if ((flags & BPF_F_INGRESS) && sk_is_vsock(sk)) return SK_DROP; skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; } const struct bpf_func_proto bpf_sk_redirect_map_proto = { .func = bpf_sk_redirect_map, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg, struct bpf_map *, map, u32, key, u64, flags) { struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; sk = __sock_map_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk)) return SK_DROP; if (sk_is_vsock(sk)) return SK_DROP; msg->flags = flags; msg->sk_redir = sk; return SK_PASS; } const struct bpf_func_proto bpf_msg_redirect_map_proto = { .func = bpf_msg_redirect_map, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; struct sock_map_seq_info { struct bpf_map *map; struct sock *sk; u32 index; }; struct bpf_iter__sockmap { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct bpf_map *, map); __bpf_md_ptr(void *, key); __bpf_md_ptr(struct sock *, sk); }; DEFINE_BPF_ITER_FUNC(sockmap, struct bpf_iter_meta *meta, struct bpf_map *map, void *key, struct sock *sk) static void *sock_map_seq_lookup_elem(struct sock_map_seq_info *info) { if (unlikely(info->index >= info->map->max_entries)) return NULL; info->sk = __sock_map_lookup_elem(info->map, info->index); /* can't return sk directly, since that might be NULL */ return info; } static void *sock_map_seq_start(struct seq_file *seq, loff_t *pos) __acquires(rcu) { struct sock_map_seq_info *info = seq->private; if (*pos == 0) ++*pos; /* pairs with sock_map_seq_stop */ rcu_read_lock(); return sock_map_seq_lookup_elem(info); } static void *sock_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) __must_hold(rcu) { struct sock_map_seq_info *info = seq->private; ++*pos; ++info->index; return sock_map_seq_lookup_elem(info); } static int sock_map_seq_show(struct seq_file *seq, void *v) __must_hold(rcu) { struct sock_map_seq_info *info = seq->private; struct bpf_iter__sockmap ctx = {}; struct bpf_iter_meta meta; struct bpf_prog *prog; meta.seq = seq; prog = bpf_iter_get_info(&meta, !v); if (!prog) return 0; ctx.meta = &meta; ctx.map = info->map; if (v) { ctx.key = &info->index; ctx.sk = info->sk; } return bpf_iter_run_prog(prog, &ctx); } static void sock_map_seq_stop(struct seq_file *seq, void *v) __releases(rcu) { if (!v) (void)sock_map_seq_show(seq, NULL); /* pairs with sock_map_seq_start */ rcu_read_unlock(); } static const struct seq_operations sock_map_seq_ops = { .start = sock_map_seq_start, .next = sock_map_seq_next, .stop = sock_map_seq_stop, .show = sock_map_seq_show, }; static int sock_map_init_seq_private(void *priv_data, struct bpf_iter_aux_info *aux) { struct sock_map_seq_info *info = priv_data; bpf_map_inc_with_uref(aux->map); info->map = aux->map; return 0; } static void sock_map_fini_seq_private(void *priv_data) { struct sock_map_seq_info *info = priv_data; bpf_map_put_with_uref(info->map); } static u64 sock_map_mem_usage(const struct bpf_map *map) { u64 usage = sizeof(struct bpf_stab); usage += (u64)map->max_entries * sizeof(struct sock *); return usage; } static const struct bpf_iter_seq_info sock_map_iter_seq_info = { .seq_ops = &sock_map_seq_ops, .init_seq_private = sock_map_init_seq_private, .fini_seq_private = sock_map_fini_seq_private, .seq_priv_size = sizeof(struct sock_map_seq_info), }; BTF_ID_LIST_SINGLE(sock_map_btf_ids, struct, bpf_stab) const struct bpf_map_ops sock_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = sock_map_alloc, .map_free = sock_map_free, .map_get_next_key = sock_map_get_next_key, .map_lookup_elem_sys_only = sock_map_lookup_sys, .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_map_delete_elem, .map_lookup_elem = sock_map_lookup, .map_release_uref = sock_map_release_progs, .map_check_btf = map_check_no_btf, .map_mem_usage = sock_map_mem_usage, .map_btf_id = &sock_map_btf_ids[0], .iter_seq_info = &sock_map_iter_seq_info, }; struct bpf_shtab_elem { struct rcu_head rcu; u32 hash; struct sock *sk; struct hlist_node node; u8 key[]; }; struct bpf_shtab_bucket { struct hlist_head head; spinlock_t lock; }; struct bpf_shtab { struct bpf_map map; struct bpf_shtab_bucket *buckets; u32 buckets_num; u32 elem_size; struct sk_psock_progs progs; atomic_t count; }; static inline u32 sock_hash_bucket_hash(const void *key, u32 len) { return jhash(key, len, 0); } static struct bpf_shtab_bucket *sock_hash_select_bucket(struct bpf_shtab *htab, u32 hash) { return &htab->buckets[hash & (htab->buckets_num - 1)]; } static struct bpf_shtab_elem * sock_hash_lookup_elem_raw(struct hlist_head *head, u32 hash, void *key, u32 key_size) { struct bpf_shtab_elem *elem; hlist_for_each_entry_rcu(elem, head, node) { if (elem->hash == hash && !memcmp(&elem->key, key, key_size)) return elem; } return NULL; } static struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u32 key_size = map->key_size, hash; struct bpf_shtab_bucket *bucket; struct bpf_shtab_elem *elem; WARN_ON_ONCE(!rcu_read_lock_held()); hash = sock_hash_bucket_hash(key, key_size); bucket = sock_hash_select_bucket(htab, hash); elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); return elem ? elem->sk : NULL; } static void sock_hash_free_elem(struct bpf_shtab *htab, struct bpf_shtab_elem *elem) { atomic_dec(&htab->count); kfree_rcu(elem, rcu); } static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk, void *link_raw) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); struct bpf_shtab_elem *elem_probe, *elem = link_raw; struct bpf_shtab_bucket *bucket; WARN_ON_ONCE(!rcu_read_lock_held()); bucket = sock_hash_select_bucket(htab, elem->hash); /* elem may be deleted in parallel from the map, but access here * is okay since it's going away only after RCU grace period. * However, we need to check whether it's still present. */ spin_lock_bh(&bucket->lock); elem_probe = sock_hash_lookup_elem_raw(&bucket->head, elem->hash, elem->key, map->key_size); if (elem_probe && elem_probe == elem) { hlist_del_rcu(&elem->node); sock_map_unref(elem->sk, elem); sock_hash_free_elem(htab, elem); } spin_unlock_bh(&bucket->lock); } static long sock_hash_delete_elem(struct bpf_map *map, void *key) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u32 hash, key_size = map->key_size; struct bpf_shtab_bucket *bucket; struct bpf_shtab_elem *elem; int ret = -ENOENT; hash = sock_hash_bucket_hash(key, key_size); bucket = sock_hash_select_bucket(htab, hash); spin_lock_bh(&bucket->lock); elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); if (elem) { hlist_del_rcu(&elem->node); sock_map_unref(elem->sk, elem); sock_hash_free_elem(htab, elem); ret = 0; } spin_unlock_bh(&bucket->lock); return ret; } static struct bpf_shtab_elem *sock_hash_alloc_elem(struct bpf_shtab *htab, void *key, u32 key_size, u32 hash, struct sock *sk, struct bpf_shtab_elem *old) { struct bpf_shtab_elem *new; if (atomic_inc_return(&htab->count) > htab->map.max_entries) { if (!old) { atomic_dec(&htab->count); return ERR_PTR(-E2BIG); } } new = bpf_map_kmalloc_node(&htab->map, htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, htab->map.numa_node); if (!new) { atomic_dec(&htab->count); return ERR_PTR(-ENOMEM); } memcpy(new->key, key, key_size); new->sk = sk; new->hash = hash; return new; } static int sock_hash_update_common(struct bpf_map *map, void *key, struct sock *sk, u64 flags) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u32 key_size = map->key_size, hash; struct bpf_shtab_elem *elem, *elem_new; struct bpf_shtab_bucket *bucket; struct sk_psock_link *link; struct sk_psock *psock; int ret; WARN_ON_ONCE(!rcu_read_lock_held()); if (unlikely(flags > BPF_EXIST)) return -EINVAL; link = sk_psock_init_link(); if (!link) return -ENOMEM; ret = sock_map_link(map, sk); if (ret < 0) goto out_free; psock = sk_psock(sk); WARN_ON_ONCE(!psock); hash = sock_hash_bucket_hash(key, key_size); bucket = sock_hash_select_bucket(htab, hash); spin_lock_bh(&bucket->lock); elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); if (elem && flags == BPF_NOEXIST) { ret = -EEXIST; goto out_unlock; } else if (!elem && flags == BPF_EXIST) { ret = -ENOENT; goto out_unlock; } elem_new = sock_hash_alloc_elem(htab, key, key_size, hash, sk, elem); if (IS_ERR(elem_new)) { ret = PTR_ERR(elem_new); goto out_unlock; } sock_map_add_link(psock, link, map, elem_new); /* Add new element to the head of the list, so that * concurrent search will find it before old elem. */ hlist_add_head_rcu(&elem_new->node, &bucket->head); if (elem) { hlist_del_rcu(&elem->node); sock_map_unref(elem->sk, elem); sock_hash_free_elem(htab, elem); } spin_unlock_bh(&bucket->lock); return 0; out_unlock: spin_unlock_bh(&bucket->lock); sk_psock_put(sk, psock); out_free: sk_psock_free_link(link); return ret; } static int sock_hash_get_next_key(struct bpf_map *map, void *key, void *key_next) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); struct bpf_shtab_elem *elem, *elem_next; u32 hash, key_size = map->key_size; struct hlist_head *head; int i = 0; if (!key) goto find_first_elem; hash = sock_hash_bucket_hash(key, key_size); head = &sock_hash_select_bucket(htab, hash)->head; elem = sock_hash_lookup_elem_raw(head, hash, key, key_size); if (!elem) goto find_first_elem; elem_next = hlist_entry_safe(rcu_dereference(hlist_next_rcu(&elem->node)), struct bpf_shtab_elem, node); if (elem_next) { memcpy(key_next, elem_next->key, key_size); return 0; } i = hash & (htab->buckets_num - 1); i++; find_first_elem: for (; i < htab->buckets_num; i++) { head = &sock_hash_select_bucket(htab, i)->head; elem_next = hlist_entry_safe(rcu_dereference(hlist_first_rcu(head)), struct bpf_shtab_elem, node); if (elem_next) { memcpy(key_next, elem_next->key, key_size); return 0; } } return -ENOENT; } static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) { struct bpf_shtab *htab; int i, err; if (attr->max_entries == 0 || attr->key_size == 0 || (attr->value_size != sizeof(u32) && attr->value_size != sizeof(u64)) || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); if (attr->key_size > MAX_BPF_STACK) return ERR_PTR(-E2BIG); htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE); if (!htab) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&htab->map, attr); htab->buckets_num = roundup_pow_of_two(htab->map.max_entries); htab->elem_size = sizeof(struct bpf_shtab_elem) + round_up(htab->map.key_size, 8); if (htab->buckets_num == 0 || htab->buckets_num > U32_MAX / sizeof(struct bpf_shtab_bucket)) { err = -EINVAL; goto free_htab; } htab->buckets = bpf_map_area_alloc(htab->buckets_num * sizeof(struct bpf_shtab_bucket), htab->map.numa_node); if (!htab->buckets) { err = -ENOMEM; goto free_htab; } for (i = 0; i < htab->buckets_num; i++) { INIT_HLIST_HEAD(&htab->buckets[i].head); spin_lock_init(&htab->buckets[i].lock); } return &htab->map; free_htab: bpf_map_area_free(htab); return ERR_PTR(err); } static void sock_hash_free(struct bpf_map *map) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); struct bpf_shtab_bucket *bucket; struct hlist_head unlink_list; struct bpf_shtab_elem *elem; struct hlist_node *node; int i; /* After the sync no updates or deletes will be in-flight so it * is safe to walk map and remove entries without risking a race * in EEXIST update case. */ synchronize_rcu(); for (i = 0; i < htab->buckets_num; i++) { bucket = sock_hash_select_bucket(htab, i); /* We are racing with sock_hash_delete_from_link to * enter the spin-lock critical section. Every socket on * the list is still linked to sockhash. Since link * exists, psock exists and holds a ref to socket. That * lets us to grab a socket ref too. */ spin_lock_bh(&bucket->lock); hlist_for_each_entry(elem, &bucket->head, node) sock_hold(elem->sk); hlist_move_list(&bucket->head, &unlink_list); spin_unlock_bh(&bucket->lock); /* Process removed entries out of atomic context to * block for socket lock before deleting the psock's * link to sockhash. */ hlist_for_each_entry_safe(elem, node, &unlink_list, node) { hlist_del(&elem->node); lock_sock(elem->sk); rcu_read_lock(); sock_map_unref(elem->sk, elem); rcu_read_unlock(); release_sock(elem->sk); sock_put(elem->sk); sock_hash_free_elem(htab, elem); } cond_resched(); } /* wait for psock readers accessing its map link */ synchronize_rcu(); bpf_map_area_free(htab->buckets); bpf_map_area_free(htab); } static void *sock_hash_lookup_sys(struct bpf_map *map, void *key) { struct sock *sk; if (map->value_size != sizeof(u64)) return ERR_PTR(-ENOSPC); sk = __sock_hash_lookup_elem(map, key); if (!sk) return ERR_PTR(-ENOENT); __sock_gen_cookie(sk); return &sk->sk_cookie; } static void *sock_hash_lookup(struct bpf_map *map, void *key) { struct sock *sk; sk = __sock_hash_lookup_elem(map, key); if (!sk) return NULL; if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) return NULL; return sk; } static void sock_hash_release_progs(struct bpf_map *map) { psock_progs_drop(&container_of(map, struct bpf_shtab, map)->progs); } BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, sops, struct bpf_map *, map, void *, key, u64, flags) { WARN_ON_ONCE(!rcu_read_lock_held()); if (likely(sock_map_sk_is_suitable(sops->sk) && sock_map_op_okay(sops))) return sock_hash_update_common(map, key, sops->sk, flags); return -EOPNOTSUPP; } const struct bpf_func_proto bpf_sock_hash_update_proto = { .func = bpf_sock_hash_update, .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, struct bpf_map *, map, void *, key, u64, flags) { struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; sk = __sock_hash_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; if ((flags & BPF_F_INGRESS) && sk_is_vsock(sk)) return SK_DROP; skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; } const struct bpf_func_proto bpf_sk_redirect_hash_proto = { .func = bpf_sk_redirect_hash, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg, struct bpf_map *, map, void *, key, u64, flags) { struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; sk = __sock_hash_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk)) return SK_DROP; if (sk_is_vsock(sk)) return SK_DROP; msg->flags = flags; msg->sk_redir = sk; return SK_PASS; } const struct bpf_func_proto bpf_msg_redirect_hash_proto = { .func = bpf_msg_redirect_hash, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; struct sock_hash_seq_info { struct bpf_map *map; struct bpf_shtab *htab; u32 bucket_id; }; static void *sock_hash_seq_find_next(struct sock_hash_seq_info *info, struct bpf_shtab_elem *prev_elem) { const struct bpf_shtab *htab = info->htab; struct bpf_shtab_bucket *bucket; struct bpf_shtab_elem *elem; struct hlist_node *node; /* try to find next elem in the same bucket */ if (prev_elem) { node = rcu_dereference(hlist_next_rcu(&prev_elem->node)); elem = hlist_entry_safe(node, struct bpf_shtab_elem, node); if (elem) return elem; /* no more elements, continue in the next bucket */ info->bucket_id++; } for (; info->bucket_id < htab->buckets_num; info->bucket_id++) { bucket = &htab->buckets[info->bucket_id]; node = rcu_dereference(hlist_first_rcu(&bucket->head)); elem = hlist_entry_safe(node, struct bpf_shtab_elem, node); if (elem) return elem; } return NULL; } static void *sock_hash_seq_start(struct seq_file *seq, loff_t *pos) __acquires(rcu) { struct sock_hash_seq_info *info = seq->private; if (*pos == 0) ++*pos; /* pairs with sock_hash_seq_stop */ rcu_read_lock(); return sock_hash_seq_find_next(info, NULL); } static void *sock_hash_seq_next(struct seq_file *seq, void *v, loff_t *pos) __must_hold(rcu) { struct sock_hash_seq_info *info = seq->private; ++*pos; return sock_hash_seq_find_next(info, v); } static int sock_hash_seq_show(struct seq_file *seq, void *v) __must_hold(rcu) { struct sock_hash_seq_info *info = seq->private; struct bpf_iter__sockmap ctx = {}; struct bpf_shtab_elem *elem = v; struct bpf_iter_meta meta; struct bpf_prog *prog; meta.seq = seq; prog = bpf_iter_get_info(&meta, !elem); if (!prog) return 0; ctx.meta = &meta; ctx.map = info->map; if (elem) { ctx.key = elem->key; ctx.sk = elem->sk; } return bpf_iter_run_prog(prog, &ctx); } static void sock_hash_seq_stop(struct seq_file *seq, void *v) __releases(rcu) { if (!v) (void)sock_hash_seq_show(seq, NULL); /* pairs with sock_hash_seq_start */ rcu_read_unlock(); } static const struct seq_operations sock_hash_seq_ops = { .start = sock_hash_seq_start, .next = sock_hash_seq_next, .stop = sock_hash_seq_stop, .show = sock_hash_seq_show, }; static int sock_hash_init_seq_private(void *priv_data, struct bpf_iter_aux_info *aux) { struct sock_hash_seq_info *info = priv_data; bpf_map_inc_with_uref(aux->map); info->map = aux->map; info->htab = container_of(aux->map, struct bpf_shtab, map); return 0; } static void sock_hash_fini_seq_private(void *priv_data) { struct sock_hash_seq_info *info = priv_data; bpf_map_put_with_uref(info->map); } static u64 sock_hash_mem_usage(const struct bpf_map *map) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u64 usage = sizeof(*htab); usage += htab->buckets_num * sizeof(struct bpf_shtab_bucket); usage += atomic_read(&htab->count) * (u64)htab->elem_size; return usage; } static const struct bpf_iter_seq_info sock_hash_iter_seq_info = { .seq_ops = &sock_hash_seq_ops, .init_seq_private = sock_hash_init_seq_private, .fini_seq_private = sock_hash_fini_seq_private, .seq_priv_size = sizeof(struct sock_hash_seq_info), }; BTF_ID_LIST_SINGLE(sock_hash_map_btf_ids, struct, bpf_shtab) const struct bpf_map_ops sock_hash_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = sock_hash_alloc, .map_free = sock_hash_free, .map_get_next_key = sock_hash_get_next_key, .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_hash_delete_elem, .map_lookup_elem = sock_hash_lookup, .map_lookup_elem_sys_only = sock_hash_lookup_sys, .map_release_uref = sock_hash_release_progs, .map_check_btf = map_check_no_btf, .map_mem_usage = sock_hash_mem_usage, .map_btf_id = &sock_hash_map_btf_ids[0], .iter_seq_info = &sock_hash_iter_seq_info, }; static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) { switch (map->map_type) { case BPF_MAP_TYPE_SOCKMAP: return &container_of(map, struct bpf_stab, map)->progs; case BPF_MAP_TYPE_SOCKHASH: return &container_of(map, struct bpf_shtab, map)->progs; default: break; } return NULL; } static int sock_map_prog_link_lookup(struct bpf_map *map, struct bpf_prog ***pprog, struct bpf_link ***plink, u32 which) { struct sk_psock_progs *progs = sock_map_progs(map); struct bpf_prog **cur_pprog; struct bpf_link **cur_plink; if (!progs) return -EOPNOTSUPP; switch (which) { case BPF_SK_MSG_VERDICT: cur_pprog = &progs->msg_parser; cur_plink = &progs->msg_parser_link; break; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) case BPF_SK_SKB_STREAM_PARSER: cur_pprog = &progs->stream_parser; cur_plink = &progs->stream_parser_link; break; #endif case BPF_SK_SKB_STREAM_VERDICT: if (progs->skb_verdict) return -EBUSY; cur_pprog = &progs->stream_verdict; cur_plink = &progs->stream_verdict_link; break; case BPF_SK_SKB_VERDICT: if (progs->stream_verdict) return -EBUSY; cur_pprog = &progs->skb_verdict; cur_plink = &progs->skb_verdict_link; break; default: return -EOPNOTSUPP; } *pprog = cur_pprog; if (plink) *plink = cur_plink; return 0; } /* Handle the following four cases: * prog_attach: prog != NULL, old == NULL, link == NULL * prog_detach: prog == NULL, old != NULL, link == NULL * link_attach: prog != NULL, old == NULL, link != NULL * link_detach: prog == NULL, old != NULL, link != NULL */ static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, struct bpf_prog *old, struct bpf_link *link, u32 which) { struct bpf_prog **pprog; struct bpf_link **plink; int ret; ret = sock_map_prog_link_lookup(map, &pprog, &plink, which); if (ret) return ret; /* for prog_attach/prog_detach/link_attach, return error if a bpf_link * exists for that prog. */ if ((!link || prog) && *plink) return -EBUSY; if (old) { ret = psock_replace_prog(pprog, prog, old); if (!ret) *plink = NULL; } else { psock_set_prog(pprog, prog); if (link) *plink = link; } return ret; } int sock_map_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); u32 prog_cnt = 0, flags = 0; struct bpf_prog **pprog; struct bpf_prog *prog; struct bpf_map *map; u32 id = 0; int ret; if (attr->query.query_flags) return -EINVAL; CLASS(fd, f)(attr->target_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); rcu_read_lock(); ret = sock_map_prog_link_lookup(map, &pprog, NULL, attr->query.attach_type); if (ret) goto end; prog = *pprog; prog_cnt = !prog ? 0 : 1; if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) goto end; /* we do not hold the refcnt, the bpf prog may be released * asynchronously and the id would be set to 0. */ id = data_race(prog->aux->id); if (id == 0) prog_cnt = 0; end: rcu_read_unlock(); if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)) || (id != 0 && copy_to_user(prog_ids, &id, sizeof(u32))) || copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) ret = -EFAULT; return ret; } static void sock_map_unlink(struct sock *sk, struct sk_psock_link *link) { switch (link->map->map_type) { case BPF_MAP_TYPE_SOCKMAP: return sock_map_delete_from_link(link->map, sk, link->link_raw); case BPF_MAP_TYPE_SOCKHASH: return sock_hash_delete_from_link(link->map, sk, link->link_raw); default: break; } } static void sock_map_remove_links(struct sock *sk, struct sk_psock *psock) { struct sk_psock_link *link; while ((link = sk_psock_link_pop(psock))) { sock_map_unlink(sk, link); sk_psock_free_link(link); } } void sock_map_unhash(struct sock *sk) { void (*saved_unhash)(struct sock *sk); struct sk_psock *psock; rcu_read_lock(); psock = sk_psock(sk); if (unlikely(!psock)) { rcu_read_unlock(); saved_unhash = READ_ONCE(sk->sk_prot)->unhash; } else { saved_unhash = psock->saved_unhash; sock_map_remove_links(sk, psock); rcu_read_unlock(); } if (WARN_ON_ONCE(saved_unhash == sock_map_unhash)) return; if (saved_unhash) saved_unhash(sk); } EXPORT_SYMBOL_GPL(sock_map_unhash); void sock_map_destroy(struct sock *sk) { void (*saved_destroy)(struct sock *sk); struct sk_psock *psock; rcu_read_lock(); psock = sk_psock_get(sk); if (unlikely(!psock)) { rcu_read_unlock(); saved_destroy = READ_ONCE(sk->sk_prot)->destroy; } else { saved_destroy = psock->saved_destroy; sock_map_remove_links(sk, psock); rcu_read_unlock(); sk_psock_stop(psock); sk_psock_put(sk, psock); } if (WARN_ON_ONCE(saved_destroy == sock_map_destroy)) return; if (saved_destroy) saved_destroy(sk); } EXPORT_SYMBOL_GPL(sock_map_destroy); void sock_map_close(struct sock *sk, long timeout) { void (*saved_close)(struct sock *sk, long timeout); struct sk_psock *psock; lock_sock(sk); rcu_read_lock(); psock = sk_psock(sk); if (likely(psock)) { saved_close = psock->saved_close; sock_map_remove_links(sk, psock); psock = sk_psock_get(sk); if (unlikely(!psock)) goto no_psock; rcu_read_unlock(); sk_psock_stop(psock); release_sock(sk); cancel_delayed_work_sync(&psock->work); sk_psock_put(sk, psock); } else { saved_close = READ_ONCE(sk->sk_prot)->close; no_psock: rcu_read_unlock(); release_sock(sk); } /* Make sure we do not recurse. This is a bug. * Leak the socket instead of crashing on a stack overflow. */ if (WARN_ON_ONCE(saved_close == sock_map_close)) return; saved_close(sk, timeout); } EXPORT_SYMBOL_GPL(sock_map_close); struct sockmap_link { struct bpf_link link; struct bpf_map *map; }; static void sock_map_link_release(struct bpf_link *link) { struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); mutex_lock(&sockmap_mutex); if (!sockmap_link->map) goto out; WARN_ON_ONCE(sock_map_prog_update(sockmap_link->map, NULL, link->prog, link, link->attach_type)); bpf_map_put_with_uref(sockmap_link->map); sockmap_link->map = NULL; out: mutex_unlock(&sockmap_mutex); } static int sock_map_link_detach(struct bpf_link *link) { sock_map_link_release(link); return 0; } static void sock_map_link_dealloc(struct bpf_link *link) { kfree(link); } /* Handle the following two cases: * case 1: link != NULL, prog != NULL, old != NULL * case 2: link != NULL, prog != NULL, old == NULL */ static int sock_map_link_update_prog(struct bpf_link *link, struct bpf_prog *prog, struct bpf_prog *old) { const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); struct bpf_prog **pprog, *old_link_prog; struct bpf_link **plink; int ret = 0; mutex_lock(&sockmap_mutex); /* If old prog is not NULL, ensure old prog is the same as link->prog. */ if (old && link->prog != old) { ret = -EPERM; goto out; } /* Ensure link->prog has the same type/attach_type as the new prog. */ if (link->prog->type != prog->type || link->prog->expected_attach_type != prog->expected_attach_type) { ret = -EINVAL; goto out; } if (!sockmap_link->map) { ret = -ENOLINK; goto out; } ret = sock_map_prog_link_lookup(sockmap_link->map, &pprog, &plink, link->attach_type); if (ret) goto out; /* return error if the stored bpf_link does not match the incoming bpf_link. */ if (link != *plink) { ret = -EBUSY; goto out; } if (old) { ret = psock_replace_prog(pprog, prog, old); if (ret) goto out; } else { psock_set_prog(pprog, prog); } bpf_prog_inc(prog); old_link_prog = xchg(&link->prog, prog); bpf_prog_put(old_link_prog); out: mutex_unlock(&sockmap_mutex); return ret; } static u32 sock_map_link_get_map_id(const struct sockmap_link *sockmap_link) { u32 map_id = 0; mutex_lock(&sockmap_mutex); if (sockmap_link->map) map_id = sockmap_link->map->id; mutex_unlock(&sockmap_mutex); return map_id; } static int sock_map_link_fill_info(const struct bpf_link *link, struct bpf_link_info *info) { const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); u32 map_id = sock_map_link_get_map_id(sockmap_link); info->sockmap.map_id = map_id; info->sockmap.attach_type = link->attach_type; return 0; } static void sock_map_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); u32 map_id = sock_map_link_get_map_id(sockmap_link); seq_printf(seq, "map_id:\t%u\n", map_id); seq_printf(seq, "attach_type:\t%u\n", link->attach_type); } static const struct bpf_link_ops sock_map_link_ops = { .release = sock_map_link_release, .dealloc = sock_map_link_dealloc, .detach = sock_map_link_detach, .update_prog = sock_map_link_update_prog, .fill_link_info = sock_map_link_fill_info, .show_fdinfo = sock_map_link_show_fdinfo, }; int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_link_primer link_primer; struct sockmap_link *sockmap_link; enum bpf_attach_type attach_type; struct bpf_map *map; int ret; if (attr->link_create.flags) return -EINVAL; map = bpf_map_get_with_uref(attr->link_create.target_fd); if (IS_ERR(map)) return PTR_ERR(map); if (map->map_type != BPF_MAP_TYPE_SOCKMAP && map->map_type != BPF_MAP_TYPE_SOCKHASH) { ret = -EINVAL; goto out; } sockmap_link = kzalloc(sizeof(*sockmap_link), GFP_USER); if (!sockmap_link) { ret = -ENOMEM; goto out; } attach_type = attr->link_create.attach_type; bpf_link_init(&sockmap_link->link, BPF_LINK_TYPE_SOCKMAP, &sock_map_link_ops, prog, attach_type); sockmap_link->map = map; ret = bpf_link_prime(&sockmap_link->link, &link_primer); if (ret) { kfree(sockmap_link); goto out; } mutex_lock(&sockmap_mutex); ret = sock_map_prog_update(map, prog, NULL, &sockmap_link->link, attach_type); mutex_unlock(&sockmap_mutex); if (ret) { bpf_link_cleanup(&link_primer); goto out; } /* Increase refcnt for the prog since when old prog is replaced with * psock_replace_prog() and psock_set_prog() its refcnt will be decreased. * * Actually, we do not need to increase refcnt for the prog since bpf_link * will hold a reference. But in order to have less complexity w.r.t. * replacing/setting prog, let us increase the refcnt to make things simpler. */ bpf_prog_inc(prog); return bpf_link_settle(&link_primer); out: bpf_map_put_with_uref(map); return ret; } static int sock_map_iter_attach_target(struct bpf_prog *prog, union bpf_iter_link_info *linfo, struct bpf_iter_aux_info *aux) { struct bpf_map *map; int err = -EINVAL; if (!linfo->map.map_fd) return -EBADF; map = bpf_map_get_with_uref(linfo->map.map_fd); if (IS_ERR(map)) return PTR_ERR(map); if (map->map_type != BPF_MAP_TYPE_SOCKMAP && map->map_type != BPF_MAP_TYPE_SOCKHASH) goto put_map; if (prog->aux->max_rdonly_access > map->key_size) { err = -EACCES; goto put_map; } aux->map = map; return 0; put_map: bpf_map_put_with_uref(map); return err; } static void sock_map_iter_detach_target(struct bpf_iter_aux_info *aux) { bpf_map_put_with_uref(aux->map); } static struct bpf_iter_reg sock_map_iter_reg = { .target = "sockmap", .attach_target = sock_map_iter_attach_target, .detach_target = sock_map_iter_detach_target, .show_fdinfo = bpf_iter_map_show_fdinfo, .fill_link_info = bpf_iter_map_fill_link_info, .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__sockmap, key), PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY }, { offsetof(struct bpf_iter__sockmap, sk), PTR_TO_BTF_ID_OR_NULL }, }, }; static int __init bpf_sockmap_iter_init(void) { sock_map_iter_reg.ctx_arg_info[1].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK]; return bpf_iter_reg_target(&sock_map_iter_reg); } late_initcall(bpf_sockmap_iter_init); |
| 44 45 45 45 44 5703 5704 43 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 | // SPDX-License-Identifier: GPL-2.0 /* * Wakeup statistics in sysfs * * Copyright (c) 2019 Linux Foundation * Copyright (c) 2019 Greg Kroah-Hartman <gregkh@linuxfoundation.org> * Copyright (c) 2019 Google Inc. */ #include <linux/device.h> #include <linux/idr.h> #include <linux/init.h> #include <linux/kdev_t.h> #include <linux/kernel.h> #include <linux/kobject.h> #include <linux/slab.h> #include <linux/timekeeping.h> #include "power.h" static struct class *wakeup_class; #define wakeup_attr(_name) \ static ssize_t _name##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct wakeup_source *ws = dev_get_drvdata(dev); \ \ return sysfs_emit(buf, "%lu\n", ws->_name); \ } \ static DEVICE_ATTR_RO(_name) wakeup_attr(active_count); wakeup_attr(event_count); wakeup_attr(wakeup_count); wakeup_attr(expire_count); wakeup_attr(relax_count); static ssize_t active_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); ktime_t active_time = ws->active ? ktime_sub(ktime_get(), ws->last_time) : 0; return sysfs_emit(buf, "%lld\n", ktime_to_ms(active_time)); } static DEVICE_ATTR_RO(active_time_ms); static ssize_t total_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); ktime_t active_time; ktime_t total_time = ws->total_time; if (ws->active) { active_time = ktime_sub(ktime_get(), ws->last_time); total_time = ktime_add(total_time, active_time); } return sysfs_emit(buf, "%lld\n", ktime_to_ms(total_time)); } static DEVICE_ATTR_RO(total_time_ms); static ssize_t max_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); ktime_t active_time; ktime_t max_time = ws->max_time; if (ws->active) { active_time = ktime_sub(ktime_get(), ws->last_time); if (active_time > max_time) max_time = active_time; } return sysfs_emit(buf, "%lld\n", ktime_to_ms(max_time)); } static DEVICE_ATTR_RO(max_time_ms); static ssize_t last_change_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); return sysfs_emit(buf, "%lld\n", ktime_to_ms(ws->last_time)); } static DEVICE_ATTR_RO(last_change_ms); static ssize_t name_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); return sysfs_emit(buf, "%s\n", ws->name); } static DEVICE_ATTR_RO(name); static ssize_t prevent_suspend_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); ktime_t prevent_sleep_time = ws->prevent_sleep_time; if (ws->active && ws->autosleep_enabled) { prevent_sleep_time = ktime_add(prevent_sleep_time, ktime_sub(ktime_get(), ws->start_prevent_time)); } return sysfs_emit(buf, "%lld\n", ktime_to_ms(prevent_sleep_time)); } static DEVICE_ATTR_RO(prevent_suspend_time_ms); static struct attribute *wakeup_source_attrs[] = { &dev_attr_name.attr, &dev_attr_active_count.attr, &dev_attr_event_count.attr, &dev_attr_wakeup_count.attr, &dev_attr_expire_count.attr, &dev_attr_relax_count.attr, &dev_attr_active_time_ms.attr, &dev_attr_total_time_ms.attr, &dev_attr_max_time_ms.attr, &dev_attr_last_change_ms.attr, &dev_attr_prevent_suspend_time_ms.attr, NULL, }; ATTRIBUTE_GROUPS(wakeup_source); static void device_create_release(struct device *dev) { kfree(dev); } static struct device *wakeup_source_device_create(struct device *parent, struct wakeup_source *ws) { struct device *dev = NULL; int retval; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) { retval = -ENOMEM; goto error; } device_initialize(dev); dev->devt = MKDEV(0, 0); dev->class = wakeup_class; dev->parent = parent; dev->groups = wakeup_source_groups; dev->release = device_create_release; dev_set_drvdata(dev, ws); device_set_pm_not_required(dev); retval = dev_set_name(dev, "wakeup%d", ws->id); if (retval) goto error; retval = device_add(dev); if (retval) goto error; return dev; error: put_device(dev); return ERR_PTR(retval); } /** * wakeup_source_sysfs_add - Add wakeup_source attributes to sysfs. * @parent: Device given wakeup source is associated with (or NULL if virtual). * @ws: Wakeup source to be added in sysfs. */ int wakeup_source_sysfs_add(struct device *parent, struct wakeup_source *ws) { struct device *dev; dev = wakeup_source_device_create(parent, ws); if (IS_ERR(dev)) return PTR_ERR(dev); ws->dev = dev; return 0; } /** * pm_wakeup_source_sysfs_add - Add wakeup_source attributes to sysfs * for a device if they're missing. * @parent: Device given wakeup source is associated with */ int pm_wakeup_source_sysfs_add(struct device *parent) { if (!parent->power.wakeup || parent->power.wakeup->dev) return 0; return wakeup_source_sysfs_add(parent, parent->power.wakeup); } /** * wakeup_source_sysfs_remove - Remove wakeup_source attributes from sysfs. * @ws: Wakeup source to be removed from sysfs. */ void wakeup_source_sysfs_remove(struct wakeup_source *ws) { device_unregister(ws->dev); } static int __init wakeup_sources_sysfs_init(void) { wakeup_class = class_create("wakeup"); return PTR_ERR_OR_ZERO(wakeup_class); } postcore_initcall(wakeup_sources_sysfs_init); |
| 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 | /* * Copyright (c) 2007 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include <linux/kernel.h> #include <linux/ethtool.h> #include <linux/netdevice.h> #include "ipoib.h" struct ipoib_stats { char stat_string[ETH_GSTRING_LEN]; int stat_offset; }; #define IPOIB_NETDEV_STAT(m) { \ .stat_string = #m, \ .stat_offset = offsetof(struct rtnl_link_stats64, m) } static const struct ipoib_stats ipoib_gstrings_stats[] = { IPOIB_NETDEV_STAT(rx_packets), IPOIB_NETDEV_STAT(tx_packets), IPOIB_NETDEV_STAT(rx_bytes), IPOIB_NETDEV_STAT(tx_bytes), IPOIB_NETDEV_STAT(tx_errors), IPOIB_NETDEV_STAT(rx_dropped), IPOIB_NETDEV_STAT(tx_dropped), IPOIB_NETDEV_STAT(multicast), }; #define IPOIB_GLOBAL_STATS_LEN ARRAY_SIZE(ipoib_gstrings_stats) static void ipoib_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo) { struct ipoib_dev_priv *priv = ipoib_priv(netdev); ib_get_device_fw_str(priv->ca, drvinfo->fw_version); strscpy(drvinfo->bus_info, dev_name(priv->ca->dev.parent), sizeof(drvinfo->bus_info)); strscpy(drvinfo->driver, "ib_ipoib", sizeof(drvinfo->driver)); } static int ipoib_get_coalesce(struct net_device *dev, struct ethtool_coalesce *coal, struct kernel_ethtool_coalesce *kernel_coal, struct netlink_ext_ack *extack) { struct ipoib_dev_priv *priv = ipoib_priv(dev); coal->rx_coalesce_usecs = priv->ethtool.coalesce_usecs; coal->rx_max_coalesced_frames = priv->ethtool.max_coalesced_frames; return 0; } static int ipoib_set_coalesce(struct net_device *dev, struct ethtool_coalesce *coal, struct kernel_ethtool_coalesce *kernel_coal, struct netlink_ext_ack *extack) { struct ipoib_dev_priv *priv = ipoib_priv(dev); int ret; /* * These values are saved in the private data and returned * when ipoib_get_coalesce() is called */ if (coal->rx_coalesce_usecs > 0xffff || coal->rx_max_coalesced_frames > 0xffff) return -EINVAL; ret = rdma_set_cq_moderation(priv->recv_cq, coal->rx_max_coalesced_frames, coal->rx_coalesce_usecs); if (ret && ret != -EOPNOTSUPP) { ipoib_warn(priv, "failed modifying CQ (%d)\n", ret); return ret; } priv->ethtool.coalesce_usecs = coal->rx_coalesce_usecs; priv->ethtool.max_coalesced_frames = coal->rx_max_coalesced_frames; return 0; } static void ipoib_get_ethtool_stats(struct net_device *dev, struct ethtool_stats __always_unused *stats, u64 *data) { int i; struct net_device_stats *net_stats = &dev->stats; u8 *p = (u8 *)net_stats; for (i = 0; i < IPOIB_GLOBAL_STATS_LEN; i++) data[i] = *(u64 *)(p + ipoib_gstrings_stats[i].stat_offset); } static void ipoib_get_strings(struct net_device __always_unused *dev, u32 stringset, u8 *data) { int i; switch (stringset) { case ETH_SS_STATS: for (i = 0; i < IPOIB_GLOBAL_STATS_LEN; i++) ethtool_puts(&data, ipoib_gstrings_stats[i].stat_string); break; default: break; } } static int ipoib_get_sset_count(struct net_device __always_unused *dev, int sset) { switch (sset) { case ETH_SS_STATS: return IPOIB_GLOBAL_STATS_LEN; default: break; } return -EOPNOTSUPP; } /* Return lane speed in unit of 1e6 bit/sec */ static inline int ib_speed_enum_to_int(int speed) { switch (speed) { case IB_SPEED_SDR: return SPEED_2500; case IB_SPEED_DDR: return SPEED_5000; case IB_SPEED_QDR: case IB_SPEED_FDR10: return SPEED_10000; case IB_SPEED_FDR: return SPEED_14000; case IB_SPEED_EDR: return SPEED_25000; case IB_SPEED_HDR: return SPEED_50000; case IB_SPEED_NDR: return SPEED_100000; case IB_SPEED_XDR: return SPEED_200000; } return SPEED_UNKNOWN; } static int ipoib_get_link_ksettings(struct net_device *netdev, struct ethtool_link_ksettings *cmd) { struct ipoib_dev_priv *priv = ipoib_priv(netdev); struct ib_port_attr attr; int ret, speed, width; if (!netif_carrier_ok(netdev)) { cmd->base.speed = SPEED_UNKNOWN; cmd->base.duplex = DUPLEX_UNKNOWN; return 0; } ret = ib_query_port(priv->ca, priv->port, &attr); if (ret < 0) return -EINVAL; speed = ib_speed_enum_to_int(attr.active_speed); width = ib_width_enum_to_int(attr.active_width); if (speed < 0 || width < 0) return -EINVAL; /* Except the following are set, the other members of * the struct ethtool_link_settings are initialized to * zero in the function __ethtool_get_link_ksettings. */ cmd->base.speed = speed * width; cmd->base.duplex = DUPLEX_FULL; cmd->base.phy_address = 0xFF; cmd->base.autoneg = AUTONEG_ENABLE; cmd->base.port = PORT_OTHER; return 0; } static const struct ethtool_ops ipoib_ethtool_ops = { .supported_coalesce_params = ETHTOOL_COALESCE_RX_USECS | ETHTOOL_COALESCE_RX_MAX_FRAMES, .get_link_ksettings = ipoib_get_link_ksettings, .get_drvinfo = ipoib_get_drvinfo, .get_coalesce = ipoib_get_coalesce, .set_coalesce = ipoib_set_coalesce, .get_strings = ipoib_get_strings, .get_ethtool_stats = ipoib_get_ethtool_stats, .get_sset_count = ipoib_get_sset_count, .get_link = ethtool_op_get_link, }; void ipoib_set_ethtool_ops(struct net_device *dev) { dev->ethtool_ops = &ipoib_ethtool_ops; } |
| 180 10 10 3 7 52 52 1 52 1 9 3 6 9 9 39 1 38 38 11 8 3 11 9 9 9 4 8 9 9 40 21 21 21 19 21 18 4 145 145 2 2 5 7 6 1 5 2 3 2 1 1 5 5 31 1 30 30 17 1 1 35 45 2 1 18 24 1 15 32 1 47 4 4 4 2 3 12 12 3 10 1 1 5 7 7 4 4 4 9 9 6 4 2 2 2 1 5 2 3 9 13 13 2 11 8 4 3 3 5 2 5 2 2 12 5 2 7 4 5 4 45 1 30 7 4 5 5 4 47 38 1 37 37 11 1 30 3 30 5 30 4 33 12 38 18 30 15 15 8 8 11 2 4 6 15 15 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 | // SPDX-License-Identifier: GPL-2.0 /* * linux/ipc/shm.c * Copyright (C) 1992, 1993 Krishna Balasubramanian * Many improvements/fixes by Bruno Haible. * Replaced `struct shm_desc' by `struct vm_area_struct', July 1994. * Fixed the shm swap deallocation (shm_unuse()), August 1998 Andrea Arcangeli. * * /proc/sysvipc/shm support (c) 1999 Dragos Acostachioaie <dragos@iname.com> * BIGMEM support, Andrea Arcangeli <andrea@suse.de> * SMP thread shm, Jean-Luc Boyard <jean-luc.boyard@siemens.fr> * HIGHMEM support, Ingo Molnar <mingo@redhat.com> * Make shmmax, shmall, shmmni sysctl'able, Christoph Rohland <cr@sap.com> * Shared /dev/zero support, Kanoj Sarcar <kanoj@sgi.com> * Move the mm functionality over to mm/shmem.c, Christoph Rohland <cr@sap.com> * * support for audit of ipc object properties and permission changes * Dustin Kirkland <dustin.kirkland@us.ibm.com> * * namespaces support * OpenVZ, SWsoft Inc. * Pavel Emelianov <xemul@openvz.org> * * Better ipc lock (kern_ipc_perm.lock) handling * Davidlohr Bueso <davidlohr.bueso@hp.com>, June 2013. */ #include <linux/slab.h> #include <linux/mm.h> #include <linux/hugetlb.h> #include <linux/shm.h> #include <uapi/linux/shm.h> #include <linux/init.h> #include <linux/file.h> #include <linux/mman.h> #include <linux/shmem_fs.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/audit.h> #include <linux/capability.h> #include <linux/ptrace.h> #include <linux/seq_file.h> #include <linux/rwsem.h> #include <linux/nsproxy.h> #include <linux/mount.h> #include <linux/ipc_namespace.h> #include <linux/rhashtable.h> #include <linux/nstree.h> #include <linux/uaccess.h> #include "util.h" struct shmid_kernel /* private to the kernel */ { struct kern_ipc_perm shm_perm; struct file *shm_file; unsigned long shm_nattch; unsigned long shm_segsz; time64_t shm_atim; time64_t shm_dtim; time64_t shm_ctim; struct pid *shm_cprid; struct pid *shm_lprid; struct ucounts *mlock_ucounts; /* * The task created the shm object, for * task_lock(shp->shm_creator) */ struct task_struct *shm_creator; /* * List by creator. task_lock(->shm_creator) required for read/write. * If list_empty(), then the creator is dead already. */ struct list_head shm_clist; struct ipc_namespace *ns; } __randomize_layout; /* shm_mode upper byte flags */ #define SHM_DEST 01000 /* segment will be destroyed on last detach */ #define SHM_LOCKED 02000 /* segment will not be swapped */ struct shm_file_data { int id; struct ipc_namespace *ns; struct file *file; const struct vm_operations_struct *vm_ops; }; #define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data)) static const struct file_operations shm_file_operations; static const struct vm_operations_struct shm_vm_ops; #define shm_ids(ns) ((ns)->ids[IPC_SHM_IDS]) #define shm_unlock(shp) \ ipc_unlock(&(shp)->shm_perm) static int newseg(struct ipc_namespace *, struct ipc_params *); static void shm_open(struct vm_area_struct *vma); static void shm_close(struct vm_area_struct *vma); static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp); #ifdef CONFIG_PROC_FS static int sysvipc_shm_proc_show(struct seq_file *s, void *it); #endif void shm_init_ns(struct ipc_namespace *ns) { ns->shm_ctlmax = SHMMAX; ns->shm_ctlall = SHMALL; ns->shm_ctlmni = SHMMNI; ns->shm_rmid_forced = 0; ns->shm_tot = 0; ipc_init_ids(&shm_ids(ns)); } /* * Called with shm_ids.rwsem (writer) and the shp structure locked. * Only shm_ids.rwsem remains locked on exit. */ static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) { struct shmid_kernel *shp; shp = container_of(ipcp, struct shmid_kernel, shm_perm); WARN_ON(ns != shp->ns); if (shp->shm_nattch) { shp->shm_perm.mode |= SHM_DEST; /* Do not find it any more */ ipc_set_key_private(&shm_ids(ns), &shp->shm_perm); shm_unlock(shp); } else shm_destroy(ns, shp); } #ifdef CONFIG_IPC_NS void shm_exit_ns(struct ipc_namespace *ns) { free_ipcs(ns, &shm_ids(ns), do_shm_rmid); idr_destroy(&ns->ids[IPC_SHM_IDS].ipcs_idr); rhashtable_destroy(&ns->ids[IPC_SHM_IDS].key_ht); } #endif static int __init ipc_ns_init(void) { shm_init_ns(&init_ipc_ns); ns_tree_add(&init_ipc_ns); return 0; } pure_initcall(ipc_ns_init); void __init shm_init(void) { ipc_init_proc_interface("sysvipc/shm", #if BITS_PER_LONG <= 32 " key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime rss swap\n", #else " key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime rss swap\n", #endif IPC_SHM_IDS, sysvipc_shm_proc_show); } static inline struct shmid_kernel *shm_obtain_object(struct ipc_namespace *ns, int id) { struct kern_ipc_perm *ipcp = ipc_obtain_object_idr(&shm_ids(ns), id); if (IS_ERR(ipcp)) return ERR_CAST(ipcp); return container_of(ipcp, struct shmid_kernel, shm_perm); } static inline struct shmid_kernel *shm_obtain_object_check(struct ipc_namespace *ns, int id) { struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&shm_ids(ns), id); if (IS_ERR(ipcp)) return ERR_CAST(ipcp); return container_of(ipcp, struct shmid_kernel, shm_perm); } /* * shm_lock_(check_) routines are called in the paths where the rwsem * is not necessarily held. */ static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id) { struct kern_ipc_perm *ipcp; rcu_read_lock(); ipcp = ipc_obtain_object_idr(&shm_ids(ns), id); if (IS_ERR(ipcp)) goto err; ipc_lock_object(ipcp); /* * ipc_rmid() may have already freed the ID while ipc_lock_object() * was spinning: here verify that the structure is still valid. * Upon races with RMID, return -EIDRM, thus indicating that * the ID points to a removed identifier. */ if (ipc_valid_object(ipcp)) { /* return a locked ipc object upon success */ return container_of(ipcp, struct shmid_kernel, shm_perm); } ipc_unlock_object(ipcp); ipcp = ERR_PTR(-EIDRM); err: rcu_read_unlock(); /* * Callers of shm_lock() must validate the status of the returned ipc * object pointer and error out as appropriate. */ return ERR_CAST(ipcp); } static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp) { rcu_read_lock(); ipc_lock_object(&ipcp->shm_perm); } static void shm_rcu_free(struct rcu_head *head) { struct kern_ipc_perm *ptr = container_of(head, struct kern_ipc_perm, rcu); struct shmid_kernel *shp = container_of(ptr, struct shmid_kernel, shm_perm); security_shm_free(&shp->shm_perm); kfree(shp); } /* * It has to be called with shp locked. * It must be called before ipc_rmid() */ static inline void shm_clist_rm(struct shmid_kernel *shp) { struct task_struct *creator; /* ensure that shm_creator does not disappear */ rcu_read_lock(); /* * A concurrent exit_shm may do a list_del_init() as well. * Just do nothing if exit_shm already did the work */ if (!list_empty(&shp->shm_clist)) { /* * shp->shm_creator is guaranteed to be valid *only* * if shp->shm_clist is not empty. */ creator = shp->shm_creator; task_lock(creator); /* * list_del_init() is a nop if the entry was already removed * from the list. */ list_del_init(&shp->shm_clist); task_unlock(creator); } rcu_read_unlock(); } static inline void shm_rmid(struct shmid_kernel *s) { shm_clist_rm(s); ipc_rmid(&shm_ids(s->ns), &s->shm_perm); } static int __shm_open(struct shm_file_data *sfd) { struct shmid_kernel *shp; shp = shm_lock(sfd->ns, sfd->id); if (IS_ERR(shp)) return PTR_ERR(shp); if (shp->shm_file != sfd->file) { /* ID was reused */ shm_unlock(shp); return -EINVAL; } shp->shm_atim = ktime_get_real_seconds(); ipc_update_pid(&shp->shm_lprid, task_tgid(current)); shp->shm_nattch++; shm_unlock(shp); return 0; } /* This is called by fork, once for every shm attach. */ static void shm_open(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct shm_file_data *sfd = shm_file_data(file); int err; /* Always call underlying open if present */ if (sfd->vm_ops->open) sfd->vm_ops->open(vma); err = __shm_open(sfd); /* * We raced in the idr lookup or with shm_destroy(). * Either way, the ID is busted. */ WARN_ON_ONCE(err); } /* * shm_destroy - free the struct shmid_kernel * * @ns: namespace * @shp: struct to free * * It has to be called with shp and shm_ids.rwsem (writer) locked, * but returns with shp unlocked and freed. */ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) { struct file *shm_file; shm_file = shp->shm_file; shp->shm_file = NULL; ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; shm_rmid(shp); shm_unlock(shp); if (!is_file_hugepages(shm_file)) shmem_lock(shm_file, 0, shp->mlock_ucounts); fput(shm_file); ipc_update_pid(&shp->shm_cprid, NULL); ipc_update_pid(&shp->shm_lprid, NULL); ipc_rcu_putref(&shp->shm_perm, shm_rcu_free); } /* * shm_may_destroy - identifies whether shm segment should be destroyed now * * Returns true if and only if there are no active users of the segment and * one of the following is true: * * 1) shmctl(id, IPC_RMID, NULL) was called for this shp * * 2) sysctl kernel.shm_rmid_forced is set to 1. */ static bool shm_may_destroy(struct shmid_kernel *shp) { return (shp->shm_nattch == 0) && (shp->ns->shm_rmid_forced || (shp->shm_perm.mode & SHM_DEST)); } /* * remove the attach descriptor vma. * free memory for segment if it is marked destroyed. * The descriptor has already been removed from the current->mm->mmap list * and will later be kfree()d. */ static void __shm_close(struct shm_file_data *sfd) { struct shmid_kernel *shp; struct ipc_namespace *ns = sfd->ns; down_write(&shm_ids(ns).rwsem); /* remove from the list of attaches of the shm segment */ shp = shm_lock(ns, sfd->id); /* * We raced in the idr lookup or with shm_destroy(). * Either way, the ID is busted. */ if (WARN_ON_ONCE(IS_ERR(shp))) goto done; /* no-op */ ipc_update_pid(&shp->shm_lprid, task_tgid(current)); shp->shm_dtim = ktime_get_real_seconds(); shp->shm_nattch--; if (shm_may_destroy(shp)) shm_destroy(ns, shp); else shm_unlock(shp); done: up_write(&shm_ids(ns).rwsem); } static void shm_close(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct shm_file_data *sfd = shm_file_data(file); /* Always call underlying close if present */ if (sfd->vm_ops->close) sfd->vm_ops->close(vma); __shm_close(sfd); } /* Called with ns->shm_ids(ns).rwsem locked */ static int shm_try_destroy_orphaned(int id, void *p, void *data) { struct ipc_namespace *ns = data; struct kern_ipc_perm *ipcp = p; struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm); /* * We want to destroy segments without users and with already * exit'ed originating process. * * As shp->* are changed under rwsem, it's safe to skip shp locking. */ if (!list_empty(&shp->shm_clist)) return 0; if (shm_may_destroy(shp)) { shm_lock_by_ptr(shp); shm_destroy(ns, shp); } return 0; } void shm_destroy_orphaned(struct ipc_namespace *ns) { down_write(&shm_ids(ns).rwsem); if (shm_ids(ns).in_use) { rcu_read_lock(); idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns); rcu_read_unlock(); } up_write(&shm_ids(ns).rwsem); } /* Locking assumes this will only be called with task == current */ void exit_shm(struct task_struct *task) { for (;;) { struct shmid_kernel *shp; struct ipc_namespace *ns; task_lock(task); if (list_empty(&task->sysvshm.shm_clist)) { task_unlock(task); break; } shp = list_first_entry(&task->sysvshm.shm_clist, struct shmid_kernel, shm_clist); /* * 1) Get pointer to the ipc namespace. It is worth to say * that this pointer is guaranteed to be valid because * shp lifetime is always shorter than namespace lifetime * in which shp lives. * We taken task_lock it means that shp won't be freed. */ ns = shp->ns; /* * 2) If kernel.shm_rmid_forced is not set then only keep track of * which shmids are orphaned, so that a later set of the sysctl * can clean them up. */ if (!ns->shm_rmid_forced) goto unlink_continue; /* * 3) get a reference to the namespace. * The refcount could be already 0. If it is 0, then * the shm objects will be free by free_ipc_work(). */ ns = get_ipc_ns_not_zero(ns); if (!ns) { unlink_continue: list_del_init(&shp->shm_clist); task_unlock(task); continue; } /* * 4) get a reference to shp. * This cannot fail: shm_clist_rm() is called before * ipc_rmid(), thus the refcount cannot be 0. */ WARN_ON(!ipc_rcu_getref(&shp->shm_perm)); /* * 5) unlink the shm segment from the list of segments * created by current. * This must be done last. After unlinking, * only the refcounts obtained above prevent IPC_RMID * from destroying the segment or the namespace. */ list_del_init(&shp->shm_clist); task_unlock(task); /* * 6) we have all references * Thus lock & if needed destroy shp. */ down_write(&shm_ids(ns).rwsem); shm_lock_by_ptr(shp); /* * rcu_read_lock was implicitly taken in shm_lock_by_ptr, it's * safe to call ipc_rcu_putref here */ ipc_rcu_putref(&shp->shm_perm, shm_rcu_free); if (ipc_valid_object(&shp->shm_perm)) { if (shm_may_destroy(shp)) shm_destroy(ns, shp); else shm_unlock(shp); } else { /* * Someone else deleted the shp from namespace * idr/kht while we have waited. * Just unlock and continue. */ shm_unlock(shp); } up_write(&shm_ids(ns).rwsem); put_ipc_ns(ns); /* paired with get_ipc_ns_not_zero */ } } static vm_fault_t shm_fault(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct shm_file_data *sfd = shm_file_data(file); return sfd->vm_ops->fault(vmf); } static int shm_may_split(struct vm_area_struct *vma, unsigned long addr) { struct file *file = vma->vm_file; struct shm_file_data *sfd = shm_file_data(file); if (sfd->vm_ops->may_split) return sfd->vm_ops->may_split(vma, addr); return 0; } static unsigned long shm_pagesize(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct shm_file_data *sfd = shm_file_data(file); if (sfd->vm_ops->pagesize) return sfd->vm_ops->pagesize(vma); return PAGE_SIZE; } #ifdef CONFIG_NUMA static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) { struct shm_file_data *sfd = shm_file_data(vma->vm_file); int err = 0; if (sfd->vm_ops->set_policy) err = sfd->vm_ops->set_policy(vma, mpol); return err; } static struct mempolicy *shm_get_policy(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx) { struct shm_file_data *sfd = shm_file_data(vma->vm_file); struct mempolicy *mpol = vma->vm_policy; if (sfd->vm_ops->get_policy) mpol = sfd->vm_ops->get_policy(vma, addr, ilx); return mpol; } #endif static int shm_mmap(struct file *file, struct vm_area_struct *vma) { struct shm_file_data *sfd = shm_file_data(file); int ret; /* * In case of remap_file_pages() emulation, the file can represent an * IPC ID that was removed, and possibly even reused by another shm * segment already. Propagate this case as an error to caller. */ ret = __shm_open(sfd); if (ret) return ret; ret = vfs_mmap(sfd->file, vma); if (ret) { __shm_close(sfd); return ret; } sfd->vm_ops = vma->vm_ops; #ifdef CONFIG_MMU WARN_ON(!sfd->vm_ops->fault); #endif vma->vm_ops = &shm_vm_ops; return 0; } static int shm_release(struct inode *ino, struct file *file) { struct shm_file_data *sfd = shm_file_data(file); put_ipc_ns(sfd->ns); fput(sfd->file); shm_file_data(file) = NULL; kfree(sfd); return 0; } static int shm_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct shm_file_data *sfd = shm_file_data(file); if (!sfd->file->f_op->fsync) return -EINVAL; return sfd->file->f_op->fsync(sfd->file, start, end, datasync); } static long shm_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct shm_file_data *sfd = shm_file_data(file); if (!sfd->file->f_op->fallocate) return -EOPNOTSUPP; return sfd->file->f_op->fallocate(file, mode, offset, len); } static unsigned long shm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct shm_file_data *sfd = shm_file_data(file); return sfd->file->f_op->get_unmapped_area(sfd->file, addr, len, pgoff, flags); } static const struct file_operations shm_file_operations = { .mmap = shm_mmap, .fsync = shm_fsync, .release = shm_release, .get_unmapped_area = shm_get_unmapped_area, .llseek = noop_llseek, .fallocate = shm_fallocate, }; /* * shm_file_operations_huge is now identical to shm_file_operations * except for fop_flags */ static const struct file_operations shm_file_operations_huge = { .mmap = shm_mmap, .fsync = shm_fsync, .release = shm_release, .get_unmapped_area = shm_get_unmapped_area, .llseek = noop_llseek, .fallocate = shm_fallocate, .fop_flags = FOP_HUGE_PAGES, }; static const struct vm_operations_struct shm_vm_ops = { .open = shm_open, /* callback for a new vm-area open */ .close = shm_close, /* callback for when the vm-area is released */ .fault = shm_fault, .may_split = shm_may_split, .pagesize = shm_pagesize, #if defined(CONFIG_NUMA) .set_policy = shm_set_policy, .get_policy = shm_get_policy, #endif }; /** * newseg - Create a new shared memory segment * @ns: namespace * @params: ptr to the structure that contains key, size and shmflg * * Called with shm_ids.rwsem held as a writer. */ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) { key_t key = params->key; int shmflg = params->flg; size_t size = params->u.size; int error; struct shmid_kernel *shp; size_t numpages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; struct file *file; char name[13]; vm_flags_t acctflag = 0; if (size < SHMMIN || size > ns->shm_ctlmax) return -EINVAL; if (numpages << PAGE_SHIFT < size) return -ENOSPC; if (ns->shm_tot + numpages < ns->shm_tot || ns->shm_tot + numpages > ns->shm_ctlall) return -ENOSPC; shp = kmalloc(sizeof(*shp), GFP_KERNEL_ACCOUNT); if (unlikely(!shp)) return -ENOMEM; shp->shm_perm.key = key; shp->shm_perm.mode = (shmflg & S_IRWXUGO); shp->mlock_ucounts = NULL; shp->shm_perm.security = NULL; error = security_shm_alloc(&shp->shm_perm); if (error) { kfree(shp); return error; } sprintf(name, "SYSV%08x", key); if (shmflg & SHM_HUGETLB) { struct hstate *hs; size_t hugesize; hs = hstate_sizelog((shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); if (!hs) { error = -EINVAL; goto no_file; } hugesize = ALIGN(size, huge_page_size(hs)); /* hugetlb_file_setup applies strict accounting */ if (shmflg & SHM_NORESERVE) acctflag = VM_NORESERVE; file = hugetlb_file_setup(name, hugesize, acctflag, HUGETLB_SHMFS_INODE, (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); } else { /* * Do not allow no accounting for OVERCOMMIT_NEVER, even * if it's asked for. */ if ((shmflg & SHM_NORESERVE) && sysctl_overcommit_memory != OVERCOMMIT_NEVER) acctflag = VM_NORESERVE; file = shmem_kernel_file_setup(name, size, acctflag); } error = PTR_ERR(file); if (IS_ERR(file)) goto no_file; shp->shm_cprid = get_pid(task_tgid(current)); shp->shm_lprid = NULL; shp->shm_atim = shp->shm_dtim = 0; shp->shm_ctim = ktime_get_real_seconds(); shp->shm_segsz = size; shp->shm_nattch = 0; shp->shm_file = file; shp->shm_creator = current; /* ipc_addid() locks shp upon success. */ error = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni); if (error < 0) goto no_id; shp->ns = ns; task_lock(current); list_add(&shp->shm_clist, ¤t->sysvshm.shm_clist); task_unlock(current); /* * shmid gets reported as "inode#" in /proc/pid/maps. * proc-ps tools use this. Changing this will break them. */ file_inode(file)->i_ino = shp->shm_perm.id; ns->shm_tot += numpages; error = shp->shm_perm.id; ipc_unlock_object(&shp->shm_perm); rcu_read_unlock(); return error; no_id: ipc_update_pid(&shp->shm_cprid, NULL); ipc_update_pid(&shp->shm_lprid, NULL); fput(file); ipc_rcu_putref(&shp->shm_perm, shm_rcu_free); return error; no_file: call_rcu(&shp->shm_perm.rcu, shm_rcu_free); return error; } /* * Called with shm_ids.rwsem and ipcp locked. */ static int shm_more_checks(struct kern_ipc_perm *ipcp, struct ipc_params *params) { struct shmid_kernel *shp; shp = container_of(ipcp, struct shmid_kernel, shm_perm); if (shp->shm_segsz < params->u.size) return -EINVAL; return 0; } long ksys_shmget(key_t key, size_t size, int shmflg) { struct ipc_namespace *ns; static const struct ipc_ops shm_ops = { .getnew = newseg, .associate = security_shm_associate, .more_checks = shm_more_checks, }; struct ipc_params shm_params; ns = current->nsproxy->ipc_ns; shm_params.key = key; shm_params.flg = shmflg; shm_params.u.size = size; return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params); } SYSCALL_DEFINE3(shmget, key_t, key, size_t, size, int, shmflg) { return ksys_shmget(key, size, shmflg); } static inline unsigned long copy_shmid_to_user(void __user *buf, struct shmid64_ds *in, int version) { switch (version) { case IPC_64: return copy_to_user(buf, in, sizeof(*in)); case IPC_OLD: { struct shmid_ds out; memset(&out, 0, sizeof(out)); ipc64_perm_to_ipc_perm(&in->shm_perm, &out.shm_perm); out.shm_segsz = in->shm_segsz; out.shm_atime = in->shm_atime; out.shm_dtime = in->shm_dtime; out.shm_ctime = in->shm_ctime; out.shm_cpid = in->shm_cpid; out.shm_lpid = in->shm_lpid; out.shm_nattch = in->shm_nattch; return copy_to_user(buf, &out, sizeof(out)); } default: return -EINVAL; } } static inline unsigned long copy_shmid_from_user(struct shmid64_ds *out, void __user *buf, int version) { switch (version) { case IPC_64: if (copy_from_user(out, buf, sizeof(*out))) return -EFAULT; return 0; case IPC_OLD: { struct shmid_ds tbuf_old; if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old))) return -EFAULT; out->shm_perm.uid = tbuf_old.shm_perm.uid; out->shm_perm.gid = tbuf_old.shm_perm.gid; out->shm_perm.mode = tbuf_old.shm_perm.mode; return 0; } default: return -EINVAL; } } static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminfo64 *in, int version) { switch (version) { case IPC_64: return copy_to_user(buf, in, sizeof(*in)); case IPC_OLD: { struct shminfo out; if (in->shmmax > INT_MAX) out.shmmax = INT_MAX; else out.shmmax = (int)in->shmmax; out.shmmin = in->shmmin; out.shmmni = in->shmmni; out.shmseg = in->shmseg; out.shmall = in->shmall; return copy_to_user(buf, &out, sizeof(out)); } default: return -EINVAL; } } /* * Calculate and add used RSS and swap pages of a shm. * Called with shm_ids.rwsem held as a reader */ static void shm_add_rss_swap(struct shmid_kernel *shp, unsigned long *rss_add, unsigned long *swp_add) { struct inode *inode; inode = file_inode(shp->shm_file); if (is_file_hugepages(shp->shm_file)) { struct address_space *mapping = inode->i_mapping; struct hstate *h = hstate_file(shp->shm_file); *rss_add += pages_per_huge_page(h) * mapping->nrpages; } else { #ifdef CONFIG_SHMEM struct shmem_inode_info *info = SHMEM_I(inode); spin_lock_irq(&info->lock); *rss_add += inode->i_mapping->nrpages; *swp_add += info->swapped; spin_unlock_irq(&info->lock); #else *rss_add += inode->i_mapping->nrpages; #endif } } /* * Called with shm_ids.rwsem held as a reader */ static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss, unsigned long *swp) { int next_id; int total, in_use; *rss = 0; *swp = 0; in_use = shm_ids(ns).in_use; for (total = 0, next_id = 0; total < in_use; next_id++) { struct kern_ipc_perm *ipc; struct shmid_kernel *shp; ipc = idr_find(&shm_ids(ns).ipcs_idr, next_id); if (ipc == NULL) continue; shp = container_of(ipc, struct shmid_kernel, shm_perm); shm_add_rss_swap(shp, rss, swp); total++; } } /* * This function handles some shmctl commands which require the rwsem * to be held in write mode. * NOTE: no locks must be held, the rwsem is taken inside this function. */ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, struct shmid64_ds *shmid64) { struct kern_ipc_perm *ipcp; struct shmid_kernel *shp; int err; down_write(&shm_ids(ns).rwsem); rcu_read_lock(); ipcp = ipcctl_obtain_check(ns, &shm_ids(ns), shmid, cmd, &shmid64->shm_perm, 0); if (IS_ERR(ipcp)) { err = PTR_ERR(ipcp); goto out_unlock1; } shp = container_of(ipcp, struct shmid_kernel, shm_perm); err = security_shm_shmctl(&shp->shm_perm, cmd); if (err) goto out_unlock1; switch (cmd) { case IPC_RMID: ipc_lock_object(&shp->shm_perm); /* do_shm_rmid unlocks the ipc object and rcu */ do_shm_rmid(ns, ipcp); goto out_up; case IPC_SET: ipc_lock_object(&shp->shm_perm); err = ipc_update_perm(&shmid64->shm_perm, ipcp); if (err) goto out_unlock0; shp->shm_ctim = ktime_get_real_seconds(); break; default: err = -EINVAL; goto out_unlock1; } out_unlock0: ipc_unlock_object(&shp->shm_perm); out_unlock1: rcu_read_unlock(); out_up: up_write(&shm_ids(ns).rwsem); return err; } static int shmctl_ipc_info(struct ipc_namespace *ns, struct shminfo64 *shminfo) { int err = security_shm_shmctl(NULL, IPC_INFO); if (!err) { memset(shminfo, 0, sizeof(*shminfo)); shminfo->shmmni = shminfo->shmseg = ns->shm_ctlmni; shminfo->shmmax = ns->shm_ctlmax; shminfo->shmall = ns->shm_ctlall; shminfo->shmmin = SHMMIN; down_read(&shm_ids(ns).rwsem); err = ipc_get_maxidx(&shm_ids(ns)); up_read(&shm_ids(ns).rwsem); if (err < 0) err = 0; } return err; } static int shmctl_shm_info(struct ipc_namespace *ns, struct shm_info *shm_info) { int err = security_shm_shmctl(NULL, SHM_INFO); if (!err) { memset(shm_info, 0, sizeof(*shm_info)); down_read(&shm_ids(ns).rwsem); shm_info->used_ids = shm_ids(ns).in_use; shm_get_stat(ns, &shm_info->shm_rss, &shm_info->shm_swp); shm_info->shm_tot = ns->shm_tot; shm_info->swap_attempts = 0; shm_info->swap_successes = 0; err = ipc_get_maxidx(&shm_ids(ns)); up_read(&shm_ids(ns).rwsem); if (err < 0) err = 0; } return err; } static int shmctl_stat(struct ipc_namespace *ns, int shmid, int cmd, struct shmid64_ds *tbuf) { struct shmid_kernel *shp; int err; memset(tbuf, 0, sizeof(*tbuf)); rcu_read_lock(); if (cmd == SHM_STAT || cmd == SHM_STAT_ANY) { shp = shm_obtain_object(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); goto out_unlock; } } else { /* IPC_STAT */ shp = shm_obtain_object_check(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); goto out_unlock; } } /* * Semantically SHM_STAT_ANY ought to be identical to * that functionality provided by the /proc/sysvipc/ * interface. As such, only audit these calls and * do not do traditional S_IRUGO permission checks on * the ipc object. */ if (cmd == SHM_STAT_ANY) audit_ipc_obj(&shp->shm_perm); else { err = -EACCES; if (ipcperms(ns, &shp->shm_perm, S_IRUGO)) goto out_unlock; } err = security_shm_shmctl(&shp->shm_perm, cmd); if (err) goto out_unlock; ipc_lock_object(&shp->shm_perm); if (!ipc_valid_object(&shp->shm_perm)) { ipc_unlock_object(&shp->shm_perm); err = -EIDRM; goto out_unlock; } kernel_to_ipc64_perm(&shp->shm_perm, &tbuf->shm_perm); tbuf->shm_segsz = shp->shm_segsz; tbuf->shm_atime = shp->shm_atim; tbuf->shm_dtime = shp->shm_dtim; tbuf->shm_ctime = shp->shm_ctim; #ifndef CONFIG_64BIT tbuf->shm_atime_high = shp->shm_atim >> 32; tbuf->shm_dtime_high = shp->shm_dtim >> 32; tbuf->shm_ctime_high = shp->shm_ctim >> 32; #endif tbuf->shm_cpid = pid_vnr(shp->shm_cprid); tbuf->shm_lpid = pid_vnr(shp->shm_lprid); tbuf->shm_nattch = shp->shm_nattch; if (cmd == IPC_STAT) { /* * As defined in SUS: * Return 0 on success */ err = 0; } else { /* * SHM_STAT and SHM_STAT_ANY (both Linux specific) * Return the full id, including the sequence number */ err = shp->shm_perm.id; } ipc_unlock_object(&shp->shm_perm); out_unlock: rcu_read_unlock(); return err; } static int shmctl_do_lock(struct ipc_namespace *ns, int shmid, int cmd) { struct shmid_kernel *shp; struct file *shm_file; int err; rcu_read_lock(); shp = shm_obtain_object_check(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); goto out_unlock1; } audit_ipc_obj(&(shp->shm_perm)); err = security_shm_shmctl(&shp->shm_perm, cmd); if (err) goto out_unlock1; ipc_lock_object(&shp->shm_perm); /* check if shm_destroy() is tearing down shp */ if (!ipc_valid_object(&shp->shm_perm)) { err = -EIDRM; goto out_unlock0; } if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) { kuid_t euid = current_euid(); if (!uid_eq(euid, shp->shm_perm.uid) && !uid_eq(euid, shp->shm_perm.cuid)) { err = -EPERM; goto out_unlock0; } if (cmd == SHM_LOCK && !rlimit(RLIMIT_MEMLOCK)) { err = -EPERM; goto out_unlock0; } } shm_file = shp->shm_file; if (is_file_hugepages(shm_file)) goto out_unlock0; if (cmd == SHM_LOCK) { struct ucounts *ucounts = current_ucounts(); err = shmem_lock(shm_file, 1, ucounts); if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) { shp->shm_perm.mode |= SHM_LOCKED; shp->mlock_ucounts = ucounts; } goto out_unlock0; } /* SHM_UNLOCK */ if (!(shp->shm_perm.mode & SHM_LOCKED)) goto out_unlock0; shmem_lock(shm_file, 0, shp->mlock_ucounts); shp->shm_perm.mode &= ~SHM_LOCKED; shp->mlock_ucounts = NULL; get_file(shm_file); ipc_unlock_object(&shp->shm_perm); rcu_read_unlock(); shmem_unlock_mapping(shm_file->f_mapping); fput(shm_file); return err; out_unlock0: ipc_unlock_object(&shp->shm_perm); out_unlock1: rcu_read_unlock(); return err; } static long ksys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf, int version) { int err; struct ipc_namespace *ns; struct shmid64_ds sem64; if (cmd < 0 || shmid < 0) return -EINVAL; ns = current->nsproxy->ipc_ns; switch (cmd) { case IPC_INFO: { struct shminfo64 shminfo; err = shmctl_ipc_info(ns, &shminfo); if (err < 0) return err; if (copy_shminfo_to_user(buf, &shminfo, version)) err = -EFAULT; return err; } case SHM_INFO: { struct shm_info shm_info; err = shmctl_shm_info(ns, &shm_info); if (err < 0) return err; if (copy_to_user(buf, &shm_info, sizeof(shm_info))) err = -EFAULT; return err; } case SHM_STAT: case SHM_STAT_ANY: case IPC_STAT: { err = shmctl_stat(ns, shmid, cmd, &sem64); if (err < 0) return err; if (copy_shmid_to_user(buf, &sem64, version)) err = -EFAULT; return err; } case IPC_SET: if (copy_shmid_from_user(&sem64, buf, version)) return -EFAULT; fallthrough; case IPC_RMID: return shmctl_down(ns, shmid, cmd, &sem64); case SHM_LOCK: case SHM_UNLOCK: return shmctl_do_lock(ns, shmid, cmd); default: return -EINVAL; } } SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) { return ksys_shmctl(shmid, cmd, buf, IPC_64); } #ifdef CONFIG_ARCH_WANT_IPC_PARSE_VERSION long ksys_old_shmctl(int shmid, int cmd, struct shmid_ds __user *buf) { int version = ipc_parse_version(&cmd); return ksys_shmctl(shmid, cmd, buf, version); } SYSCALL_DEFINE3(old_shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) { return ksys_old_shmctl(shmid, cmd, buf); } #endif #ifdef CONFIG_COMPAT struct compat_shmid_ds { struct compat_ipc_perm shm_perm; int shm_segsz; old_time32_t shm_atime; old_time32_t shm_dtime; old_time32_t shm_ctime; compat_ipc_pid_t shm_cpid; compat_ipc_pid_t shm_lpid; unsigned short shm_nattch; unsigned short shm_unused; compat_uptr_t shm_unused2; compat_uptr_t shm_unused3; }; struct compat_shminfo64 { compat_ulong_t shmmax; compat_ulong_t shmmin; compat_ulong_t shmmni; compat_ulong_t shmseg; compat_ulong_t shmall; compat_ulong_t __unused1; compat_ulong_t __unused2; compat_ulong_t __unused3; compat_ulong_t __unused4; }; struct compat_shm_info { compat_int_t used_ids; compat_ulong_t shm_tot, shm_rss, shm_swp; compat_ulong_t swap_attempts, swap_successes; }; static int copy_compat_shminfo_to_user(void __user *buf, struct shminfo64 *in, int version) { if (in->shmmax > INT_MAX) in->shmmax = INT_MAX; if (version == IPC_64) { struct compat_shminfo64 info; memset(&info, 0, sizeof(info)); info.shmmax = in->shmmax; info.shmmin = in->shmmin; info.shmmni = in->shmmni; info.shmseg = in->shmseg; info.shmall = in->shmall; return copy_to_user(buf, &info, sizeof(info)); } else { struct shminfo info; memset(&info, 0, sizeof(info)); info.shmmax = in->shmmax; info.shmmin = in->shmmin; info.shmmni = in->shmmni; info.shmseg = in->shmseg; info.shmall = in->shmall; return copy_to_user(buf, &info, sizeof(info)); } } static int put_compat_shm_info(struct shm_info *ip, struct compat_shm_info __user *uip) { struct compat_shm_info info; memset(&info, 0, sizeof(info)); info.used_ids = ip->used_ids; info.shm_tot = ip->shm_tot; info.shm_rss = ip->shm_rss; info.shm_swp = ip->shm_swp; info.swap_attempts = ip->swap_attempts; info.swap_successes = ip->swap_successes; return copy_to_user(uip, &info, sizeof(info)); } static int copy_compat_shmid_to_user(void __user *buf, struct shmid64_ds *in, int version) { if (version == IPC_64) { struct compat_shmid64_ds v; memset(&v, 0, sizeof(v)); to_compat_ipc64_perm(&v.shm_perm, &in->shm_perm); v.shm_atime = lower_32_bits(in->shm_atime); v.shm_atime_high = upper_32_bits(in->shm_atime); v.shm_dtime = lower_32_bits(in->shm_dtime); v.shm_dtime_high = upper_32_bits(in->shm_dtime); v.shm_ctime = lower_32_bits(in->shm_ctime); v.shm_ctime_high = upper_32_bits(in->shm_ctime); v.shm_segsz = in->shm_segsz; v.shm_nattch = in->shm_nattch; v.shm_cpid = in->shm_cpid; v.shm_lpid = in->shm_lpid; return copy_to_user(buf, &v, sizeof(v)); } else { struct compat_shmid_ds v; memset(&v, 0, sizeof(v)); to_compat_ipc_perm(&v.shm_perm, &in->shm_perm); v.shm_perm.key = in->shm_perm.key; v.shm_atime = in->shm_atime; v.shm_dtime = in->shm_dtime; v.shm_ctime = in->shm_ctime; v.shm_segsz = in->shm_segsz; v.shm_nattch = in->shm_nattch; v.shm_cpid = in->shm_cpid; v.shm_lpid = in->shm_lpid; return copy_to_user(buf, &v, sizeof(v)); } } static int copy_compat_shmid_from_user(struct shmid64_ds *out, void __user *buf, int version) { memset(out, 0, sizeof(*out)); if (version == IPC_64) { struct compat_shmid64_ds __user *p = buf; return get_compat_ipc64_perm(&out->shm_perm, &p->shm_perm); } else { struct compat_shmid_ds __user *p = buf; return get_compat_ipc_perm(&out->shm_perm, &p->shm_perm); } } static long compat_ksys_shmctl(int shmid, int cmd, void __user *uptr, int version) { struct ipc_namespace *ns; struct shmid64_ds sem64; int err; ns = current->nsproxy->ipc_ns; if (cmd < 0 || shmid < 0) return -EINVAL; switch (cmd) { case IPC_INFO: { struct shminfo64 shminfo; err = shmctl_ipc_info(ns, &shminfo); if (err < 0) return err; if (copy_compat_shminfo_to_user(uptr, &shminfo, version)) err = -EFAULT; return err; } case SHM_INFO: { struct shm_info shm_info; err = shmctl_shm_info(ns, &shm_info); if (err < 0) return err; if (put_compat_shm_info(&shm_info, uptr)) err = -EFAULT; return err; } case IPC_STAT: case SHM_STAT_ANY: case SHM_STAT: err = shmctl_stat(ns, shmid, cmd, &sem64); if (err < 0) return err; if (copy_compat_shmid_to_user(uptr, &sem64, version)) err = -EFAULT; return err; case IPC_SET: if (copy_compat_shmid_from_user(&sem64, uptr, version)) return -EFAULT; fallthrough; case IPC_RMID: return shmctl_down(ns, shmid, cmd, &sem64); case SHM_LOCK: case SHM_UNLOCK: return shmctl_do_lock(ns, shmid, cmd); default: return -EINVAL; } return err; } COMPAT_SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, void __user *, uptr) { return compat_ksys_shmctl(shmid, cmd, uptr, IPC_64); } #ifdef CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION long compat_ksys_old_shmctl(int shmid, int cmd, void __user *uptr) { int version = compat_ipc_parse_version(&cmd); return compat_ksys_shmctl(shmid, cmd, uptr, version); } COMPAT_SYSCALL_DEFINE3(old_shmctl, int, shmid, int, cmd, void __user *, uptr) { return compat_ksys_old_shmctl(shmid, cmd, uptr); } #endif #endif /* * Fix shmaddr, allocate descriptor, map shm, add attach descriptor to lists. * * NOTE! Despite the name, this is NOT a direct system call entrypoint. The * "raddr" thing points to kernel space, and there has to be a wrapper around * this. */ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, unsigned long shmlba) { struct shmid_kernel *shp; unsigned long addr = (unsigned long)shmaddr; unsigned long size; struct file *file, *base; int err; unsigned long flags = MAP_SHARED; unsigned long prot; int acc_mode; struct ipc_namespace *ns; struct shm_file_data *sfd; int f_flags; unsigned long populate = 0; err = -EINVAL; if (shmid < 0) goto out; if (addr) { if (addr & (shmlba - 1)) { if (shmflg & SHM_RND) { addr &= ~(shmlba - 1); /* round down */ /* * Ensure that the round-down is non-nil * when remapping. This can happen for * cases when addr < shmlba. */ if (!addr && (shmflg & SHM_REMAP)) goto out; } else #ifndef __ARCH_FORCE_SHMLBA if (addr & ~PAGE_MASK) #endif goto out; } flags |= MAP_FIXED; } else if ((shmflg & SHM_REMAP)) goto out; if (shmflg & SHM_RDONLY) { prot = PROT_READ; acc_mode = S_IRUGO; f_flags = O_RDONLY; } else { prot = PROT_READ | PROT_WRITE; acc_mode = S_IRUGO | S_IWUGO; f_flags = O_RDWR; } if (shmflg & SHM_EXEC) { prot |= PROT_EXEC; acc_mode |= S_IXUGO; } /* * We cannot rely on the fs check since SYSV IPC does have an * additional creator id... */ ns = current->nsproxy->ipc_ns; rcu_read_lock(); shp = shm_obtain_object_check(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); goto out_unlock; } err = -EACCES; if (ipcperms(ns, &shp->shm_perm, acc_mode)) goto out_unlock; err = security_shm_shmat(&shp->shm_perm, shmaddr, shmflg); if (err) goto out_unlock; ipc_lock_object(&shp->shm_perm); /* check if shm_destroy() is tearing down shp */ if (!ipc_valid_object(&shp->shm_perm)) { ipc_unlock_object(&shp->shm_perm); err = -EIDRM; goto out_unlock; } /* * We need to take a reference to the real shm file to prevent the * pointer from becoming stale in cases where the lifetime of the outer * file extends beyond that of the shm segment. It's not usually * possible, but it can happen during remap_file_pages() emulation as * that unmaps the memory, then does ->mmap() via file reference only. * We'll deny the ->mmap() if the shm segment was since removed, but to * detect shm ID reuse we need to compare the file pointers. */ base = get_file(shp->shm_file); shp->shm_nattch++; size = i_size_read(file_inode(base)); ipc_unlock_object(&shp->shm_perm); rcu_read_unlock(); err = -ENOMEM; sfd = kzalloc(sizeof(*sfd), GFP_KERNEL); if (!sfd) { fput(base); goto out_nattch; } file = alloc_file_clone(base, f_flags, is_file_hugepages(base) ? &shm_file_operations_huge : &shm_file_operations); err = PTR_ERR(file); if (IS_ERR(file)) { kfree(sfd); fput(base); goto out_nattch; } sfd->id = shp->shm_perm.id; sfd->ns = get_ipc_ns(ns); sfd->file = base; sfd->vm_ops = NULL; file->private_data = sfd; err = security_mmap_file(file, prot, flags); if (err) goto out_fput; if (mmap_write_lock_killable(current->mm)) { err = -EINTR; goto out_fput; } if (addr && !(shmflg & SHM_REMAP)) { err = -EINVAL; if (addr + size < addr) goto invalid; if (find_vma_intersection(current->mm, addr, addr + size)) goto invalid; } addr = do_mmap(file, addr, size, prot, flags, 0, 0, &populate, NULL); *raddr = addr; err = 0; if (IS_ERR_VALUE(addr)) err = (long)addr; invalid: mmap_write_unlock(current->mm); if (populate) mm_populate(addr, populate); out_fput: fput(file); out_nattch: down_write(&shm_ids(ns).rwsem); shp = shm_lock(ns, shmid); shp->shm_nattch--; if (shm_may_destroy(shp)) shm_destroy(ns, shp); else shm_unlock(shp); up_write(&shm_ids(ns).rwsem); return err; out_unlock: rcu_read_unlock(); out: return err; } SYSCALL_DEFINE3(shmat, int, shmid, char __user *, shmaddr, int, shmflg) { unsigned long ret; long err; err = do_shmat(shmid, shmaddr, shmflg, &ret, SHMLBA); if (err) return err; force_successful_syscall_return(); return (long)ret; } #ifdef CONFIG_COMPAT #ifndef COMPAT_SHMLBA #define COMPAT_SHMLBA SHMLBA #endif COMPAT_SYSCALL_DEFINE3(shmat, int, shmid, compat_uptr_t, shmaddr, int, shmflg) { unsigned long ret; long err; err = do_shmat(shmid, compat_ptr(shmaddr), shmflg, &ret, COMPAT_SHMLBA); if (err) return err; force_successful_syscall_return(); return (long)ret; } #endif /* * detach and kill segment if marked destroyed. * The work is done in shm_close. */ long ksys_shmdt(char __user *shmaddr) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long addr = (unsigned long)shmaddr; int retval = -EINVAL; #ifdef CONFIG_MMU loff_t size = 0; struct file *file; VMA_ITERATOR(vmi, mm, addr); #endif if (addr & ~PAGE_MASK) return retval; if (mmap_write_lock_killable(mm)) return -EINTR; /* * This function tries to be smart and unmap shm segments that * were modified by partial mlock or munmap calls: * - It first determines the size of the shm segment that should be * unmapped: It searches for a vma that is backed by shm and that * started at address shmaddr. It records it's size and then unmaps * it. * - Then it unmaps all shm vmas that started at shmaddr and that * are within the initially determined size and that are from the * same shm segment from which we determined the size. * Errors from do_munmap are ignored: the function only fails if * it's called with invalid parameters or if it's called to unmap * a part of a vma. Both calls in this function are for full vmas, * the parameters are directly copied from the vma itself and always * valid - therefore do_munmap cannot fail. (famous last words?) */ /* * If it had been mremap()'d, the starting address would not * match the usual checks anyway. So assume all vma's are * above the starting address given. */ #ifdef CONFIG_MMU for_each_vma(vmi, vma) { /* * Check if the starting address would match, i.e. it's * a fragment created by mprotect() and/or munmap(), or it * otherwise it starts at this address with no hassles. */ if ((vma->vm_ops == &shm_vm_ops) && (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) { /* * Record the file of the shm segment being * unmapped. With mremap(), someone could place * page from another segment but with equal offsets * in the range we are unmapping. */ file = vma->vm_file; size = i_size_read(file_inode(vma->vm_file)); do_vmi_align_munmap(&vmi, vma, mm, vma->vm_start, vma->vm_end, NULL, false); /* * We discovered the size of the shm segment, so * break out of here and fall through to the next * loop that uses the size information to stop * searching for matching vma's. */ retval = 0; vma = vma_next(&vmi); break; } } /* * We need look no further than the maximum address a fragment * could possibly have landed at. Also cast things to loff_t to * prevent overflows and make comparisons vs. equal-width types. */ size = PAGE_ALIGN(size); while (vma && (loff_t)(vma->vm_end - addr) <= size) { /* finding a matching vma now does not alter retval */ if ((vma->vm_ops == &shm_vm_ops) && ((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) && (vma->vm_file == file)) { do_vmi_align_munmap(&vmi, vma, mm, vma->vm_start, vma->vm_end, NULL, false); } vma = vma_next(&vmi); } #else /* CONFIG_MMU */ vma = vma_lookup(mm, addr); /* under NOMMU conditions, the exact address to be destroyed must be * given */ if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) { do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL); retval = 0; } #endif mmap_write_unlock(mm); return retval; } SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) { return ksys_shmdt(shmaddr); } #ifdef CONFIG_PROC_FS static int sysvipc_shm_proc_show(struct seq_file *s, void *it) { struct pid_namespace *pid_ns = ipc_seq_pid_ns(s); struct user_namespace *user_ns = seq_user_ns(s); struct kern_ipc_perm *ipcp = it; struct shmid_kernel *shp; unsigned long rss = 0, swp = 0; shp = container_of(ipcp, struct shmid_kernel, shm_perm); shm_add_rss_swap(shp, &rss, &swp); #if BITS_PER_LONG <= 32 #define SIZE_SPEC "%10lu" #else #define SIZE_SPEC "%21lu" #endif seq_printf(s, "%10d %10d %4o " SIZE_SPEC " %5u %5u " "%5lu %5u %5u %5u %5u %10llu %10llu %10llu " SIZE_SPEC " " SIZE_SPEC "\n", shp->shm_perm.key, shp->shm_perm.id, shp->shm_perm.mode, shp->shm_segsz, pid_nr_ns(shp->shm_cprid, pid_ns), pid_nr_ns(shp->shm_lprid, pid_ns), shp->shm_nattch, from_kuid_munged(user_ns, shp->shm_perm.uid), from_kgid_munged(user_ns, shp->shm_perm.gid), from_kuid_munged(user_ns, shp->shm_perm.cuid), from_kgid_munged(user_ns, shp->shm_perm.cgid), shp->shm_atim, shp->shm_dtim, shp->shm_ctim, rss * PAGE_SIZE, swp * PAGE_SIZE); return 0; } #endif |
| 42 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Cryptographic API. * * Deflate algorithm (RFC 1951), implemented here primarily for use * by IPCOMP (RFC 3173 & RFC 2394). * * Copyright (c) 2003 James Morris <jmorris@intercode.com.au> * Copyright (c) 2023 Google, LLC. <ardb@kernel.org> * Copyright (c) 2025 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/internal/acompress.h> #include <crypto/scatterwalk.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/percpu.h> #include <linux/scatterlist.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/zlib.h> #define DEFLATE_DEF_LEVEL Z_DEFAULT_COMPRESSION #define DEFLATE_DEF_WINBITS 11 #define DEFLATE_DEF_MEMLEVEL MAX_MEM_LEVEL struct deflate_stream { struct z_stream_s stream; u8 workspace[]; }; static DEFINE_MUTEX(deflate_stream_lock); static void *deflate_alloc_stream(void) { size_t size = max(zlib_inflate_workspacesize(), zlib_deflate_workspacesize(-DEFLATE_DEF_WINBITS, DEFLATE_DEF_MEMLEVEL)); struct deflate_stream *ctx; ctx = kvmalloc(sizeof(*ctx) + size, GFP_KERNEL); if (!ctx) return ERR_PTR(-ENOMEM); ctx->stream.workspace = ctx->workspace; return ctx; } static void deflate_free_stream(void *ctx) { kvfree(ctx); } static struct crypto_acomp_streams deflate_streams = { .alloc_ctx = deflate_alloc_stream, .free_ctx = deflate_free_stream, }; static int deflate_compress_one(struct acomp_req *req, struct deflate_stream *ds) { struct z_stream_s *stream = &ds->stream; struct acomp_walk walk; int ret; ret = acomp_walk_virt(&walk, req, true); if (ret) return ret; do { unsigned int dcur; dcur = acomp_walk_next_dst(&walk); if (!dcur) return -ENOSPC; stream->avail_out = dcur; stream->next_out = walk.dst.virt.addr; do { int flush = Z_FINISH; unsigned int scur; stream->avail_in = 0; stream->next_in = NULL; scur = acomp_walk_next_src(&walk); if (scur) { if (acomp_walk_more_src(&walk, scur)) flush = Z_NO_FLUSH; stream->avail_in = scur; stream->next_in = walk.src.virt.addr; } ret = zlib_deflate(stream, flush); if (scur) { scur -= stream->avail_in; acomp_walk_done_src(&walk, scur); } } while (ret == Z_OK && stream->avail_out); acomp_walk_done_dst(&walk, dcur); } while (ret == Z_OK); if (ret != Z_STREAM_END) return -EINVAL; req->dlen = stream->total_out; return 0; } static int deflate_compress(struct acomp_req *req) { struct crypto_acomp_stream *s; struct deflate_stream *ds; int err; s = crypto_acomp_lock_stream_bh(&deflate_streams); ds = s->ctx; err = zlib_deflateInit2(&ds->stream, DEFLATE_DEF_LEVEL, Z_DEFLATED, -DEFLATE_DEF_WINBITS, DEFLATE_DEF_MEMLEVEL, Z_DEFAULT_STRATEGY); if (err != Z_OK) { err = -EINVAL; goto out; } err = deflate_compress_one(req, ds); out: crypto_acomp_unlock_stream_bh(s); return err; } static int deflate_decompress_one(struct acomp_req *req, struct deflate_stream *ds) { struct z_stream_s *stream = &ds->stream; bool out_of_space = false; struct acomp_walk walk; int ret; ret = acomp_walk_virt(&walk, req, true); if (ret) return ret; do { unsigned int scur; stream->avail_in = 0; stream->next_in = NULL; scur = acomp_walk_next_src(&walk); if (scur) { stream->avail_in = scur; stream->next_in = walk.src.virt.addr; } do { unsigned int dcur; dcur = acomp_walk_next_dst(&walk); if (!dcur) { out_of_space = true; break; } stream->avail_out = dcur; stream->next_out = walk.dst.virt.addr; ret = zlib_inflate(stream, Z_NO_FLUSH); dcur -= stream->avail_out; acomp_walk_done_dst(&walk, dcur); } while (ret == Z_OK && stream->avail_in); if (scur) acomp_walk_done_src(&walk, scur); if (out_of_space) return -ENOSPC; } while (ret == Z_OK); if (ret != Z_STREAM_END) return -EINVAL; req->dlen = stream->total_out; return 0; } static int deflate_decompress(struct acomp_req *req) { struct crypto_acomp_stream *s; struct deflate_stream *ds; int err; s = crypto_acomp_lock_stream_bh(&deflate_streams); ds = s->ctx; err = zlib_inflateInit2(&ds->stream, -DEFLATE_DEF_WINBITS); if (err != Z_OK) { err = -EINVAL; goto out; } err = deflate_decompress_one(req, ds); out: crypto_acomp_unlock_stream_bh(s); return err; } static int deflate_init(struct crypto_acomp *tfm) { int ret; mutex_lock(&deflate_stream_lock); ret = crypto_acomp_alloc_streams(&deflate_streams); mutex_unlock(&deflate_stream_lock); return ret; } static struct acomp_alg acomp = { .compress = deflate_compress, .decompress = deflate_decompress, .init = deflate_init, .base.cra_name = "deflate", .base.cra_driver_name = "deflate-generic", .base.cra_flags = CRYPTO_ALG_REQ_VIRT, .base.cra_module = THIS_MODULE, }; static int __init deflate_mod_init(void) { return crypto_register_acomp(&acomp); } static void __exit deflate_mod_fini(void) { crypto_unregister_acomp(&acomp); crypto_acomp_free_streams(&deflate_streams); } module_init(deflate_mod_init); module_exit(deflate_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Deflate Compression Algorithm for IPCOMP"); MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); MODULE_AUTHOR("Ard Biesheuvel <ardb@kernel.org>"); MODULE_AUTHOR("Herbert Xu <herbert@gondor.apana.org.au>"); MODULE_ALIAS_CRYPTO("deflate"); MODULE_ALIAS_CRYPTO("deflate-generic"); |
| 4 3 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2013 Patrick McHardy <kaber@trash.net> */ #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_SYNPROXY.h> #include <net/netfilter/nf_synproxy.h> static unsigned int synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_synproxy_info *info = par->targinfo; struct net *net = xt_net(par); struct synproxy_net *snet = synproxy_pernet(net); struct synproxy_options opts = {}; struct tcphdr *th, _th; if (nf_ip6_checksum(skb, xt_hooknum(par), par->thoff, IPPROTO_TCP)) return NF_DROP; th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th); if (th == NULL) return NF_DROP; if (!synproxy_parse_options(skb, par->thoff, th, &opts)) return NF_DROP; if (th->syn && !(th->ack || th->fin || th->rst)) { /* Initial SYN from client */ this_cpu_inc(snet->stats->syn_received); if (th->ece && th->cwr) opts.options |= XT_SYNPROXY_OPT_ECN; opts.options &= info->options; opts.mss_encode = opts.mss_option; opts.mss_option = info->mss; if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) synproxy_init_timestamp_cookie(info, &opts); else opts.options &= ~(XT_SYNPROXY_OPT_WSCALE | XT_SYNPROXY_OPT_SACK_PERM | XT_SYNPROXY_OPT_ECN); synproxy_send_client_synack_ipv6(net, skb, th, &opts); consume_skb(skb); return NF_STOLEN; } else if (th->ack && !(th->fin || th->rst || th->syn)) { /* ACK from client */ if (synproxy_recv_client_ack_ipv6(net, skb, th, &opts, ntohl(th->seq))) { consume_skb(skb); return NF_STOLEN; } else { return NF_DROP; } } return XT_CONTINUE; } static int synproxy_tg6_check(const struct xt_tgchk_param *par) { struct synproxy_net *snet = synproxy_pernet(par->net); const struct ip6t_entry *e = par->entryinfo; int err; if (!(e->ipv6.flags & IP6T_F_PROTO) || e->ipv6.proto != IPPROTO_TCP || e->ipv6.invflags & XT_INV_PROTO) return -EINVAL; err = nf_ct_netns_get(par->net, par->family); if (err) return err; err = nf_synproxy_ipv6_init(snet, par->net); if (err) { nf_ct_netns_put(par->net, par->family); return err; } return err; } static void synproxy_tg6_destroy(const struct xt_tgdtor_param *par) { struct synproxy_net *snet = synproxy_pernet(par->net); nf_synproxy_ipv6_fini(snet, par->net); nf_ct_netns_put(par->net, par->family); } static struct xt_target synproxy_tg6_reg __read_mostly = { .name = "SYNPROXY", .family = NFPROTO_IPV6, .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD), .target = synproxy_tg6, .targetsize = sizeof(struct xt_synproxy_info), .checkentry = synproxy_tg6_check, .destroy = synproxy_tg6_destroy, .me = THIS_MODULE, }; static int __init synproxy_tg6_init(void) { return xt_register_target(&synproxy_tg6_reg); } static void __exit synproxy_tg6_exit(void) { xt_unregister_target(&synproxy_tg6_reg); } module_init(synproxy_tg6_init); module_exit(synproxy_tg6_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("Intercept IPv6 TCP connections and establish them using syncookies"); |
| 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 | // SPDX-License-Identifier: GPL-2.0-or-later /* * HID driver for some sunplus "special" devices * * Copyright (c) 1999 Andreas Gal * Copyright (c) 2000-2005 Vojtech Pavlik <vojtech@suse.cz> * Copyright (c) 2005 Michael Haboustak <mike-@cinci.rr.com> for Concept2, Inc * Copyright (c) 2006-2007 Jiri Kosina * Copyright (c) 2008 Jiri Slaby */ /* */ #include <linux/device.h> #include <linux/hid.h> #include <linux/module.h> #include "hid-ids.h" static const __u8 *sp_report_fixup(struct hid_device *hdev, __u8 *rdesc, unsigned int *rsize) { if (*rsize >= 112 && rdesc[104] == 0x26 && rdesc[105] == 0x80 && rdesc[106] == 0x03) { hid_info(hdev, "fixing up Sunplus Wireless Desktop report descriptor\n"); rdesc[105] = rdesc[110] = 0x03; rdesc[106] = rdesc[111] = 0x21; } return rdesc; } #define sp_map_key_clear(c) hid_map_usage_clear(hi, usage, bit, max, \ EV_KEY, (c)) static int sp_input_mapping(struct hid_device *hdev, struct hid_input *hi, struct hid_field *field, struct hid_usage *usage, unsigned long **bit, int *max) { if ((usage->hid & HID_USAGE_PAGE) != HID_UP_CONSUMER) return 0; switch (usage->hid & HID_USAGE) { case 0x2003: sp_map_key_clear(KEY_ZOOMIN); break; case 0x2103: sp_map_key_clear(KEY_ZOOMOUT); break; default: return 0; } return 1; } static const struct hid_device_id sp_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_SUNPLUS, USB_DEVICE_ID_SUNPLUS_WDESKTOP) }, { } }; MODULE_DEVICE_TABLE(hid, sp_devices); static struct hid_driver sp_driver = { .name = "sunplus", .id_table = sp_devices, .report_fixup = sp_report_fixup, .input_mapping = sp_input_mapping, }; module_hid_driver(sp_driver); MODULE_DESCRIPTION("HID driver for some sunplus \"special\" devices"); MODULE_LICENSE("GPL"); |
| 1 220 220 206 200 4 4 301 301 301 301 301 301 301 301 208 197 259 259 259 259 259 259 259 259 259 259 119 120 120 4 4 4 4 4 4 4 4 4 128 129 129 129 15 129 2356 2355 20 101 103 103 1120 1120 1120 1120 1101 65 1120 363 37 1120 89 89 89 19 85 197 197 180 130 1 197 2 197 197 3 13 13 10 13 13 170 99 121 186 186 186 186 186 109 172 129 43 172 118 172 172 172 109 143 143 143 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 | // SPDX-License-Identifier: GPL-2.0 /* * linux/mm/swap_state.c * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds * Swap reorganised 29.12.95, Stephen Tweedie * * Rewritten to use page cache, (C) 1998 Stephen Tweedie */ #include <linux/mm.h> #include <linux/gfp.h> #include <linux/kernel_stat.h> #include <linux/mempolicy.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/init.h> #include <linux/pagemap.h> #include <linux/pagevec.h> #include <linux/backing-dev.h> #include <linux/blkdev.h> #include <linux/migrate.h> #include <linux/vmalloc.h> #include <linux/huge_mm.h> #include <linux/shmem_fs.h> #include "internal.h" #include "swap_table.h" #include "swap.h" /* * swapper_space is a fiction, retained to simplify the path through * vmscan's shrink_folio_list. */ static const struct address_space_operations swap_aops = { .dirty_folio = noop_dirty_folio, #ifdef CONFIG_MIGRATION .migrate_folio = migrate_folio, #endif }; /* Set swap_space as read only as swap cache is handled by swap table */ struct address_space swap_space __ro_after_init = { .a_ops = &swap_aops, }; static bool enable_vma_readahead __read_mostly = true; #define SWAP_RA_ORDER_CEILING 5 #define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2) #define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1) #define SWAP_RA_HITS_MAX SWAP_RA_HITS_MASK #define SWAP_RA_WIN_MASK (~PAGE_MASK & ~SWAP_RA_HITS_MASK) #define SWAP_RA_HITS(v) ((v) & SWAP_RA_HITS_MASK) #define SWAP_RA_WIN(v) (((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT) #define SWAP_RA_ADDR(v) ((v) & PAGE_MASK) #define SWAP_RA_VAL(addr, win, hits) \ (((addr) & PAGE_MASK) | \ (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) | \ ((hits) & SWAP_RA_HITS_MASK)) /* Initial readahead hits is 4 to start up with a small window */ #define GET_SWAP_RA_VAL(vma) \ (atomic_long_read(&(vma)->swap_readahead_info) ? : 4) static atomic_t swapin_readahead_hits = ATOMIC_INIT(4); void show_swap_cache_info(void) { printk("%lu pages in swap cache\n", total_swapcache_pages()); printk("Free swap = %ldkB\n", K(get_nr_swap_pages())); printk("Total swap = %lukB\n", K(total_swap_pages)); } /** * swap_cache_get_folio - Looks up a folio in the swap cache. * @entry: swap entry used for the lookup. * * A found folio will be returned unlocked and with its refcount increased. * * Context: Caller must ensure @entry is valid and protect the swap device * with reference count or locks. * Return: Returns the found folio on success, NULL otherwise. The caller * must lock nd check if the folio still matches the swap entry before * use (e.g., folio_matches_swap_entry). */ struct folio *swap_cache_get_folio(swp_entry_t entry) { unsigned long swp_tb; struct folio *folio; for (;;) { swp_tb = swap_table_get(__swap_entry_to_cluster(entry), swp_cluster_offset(entry)); if (!swp_tb_is_folio(swp_tb)) return NULL; folio = swp_tb_to_folio(swp_tb); if (likely(folio_try_get(folio))) return folio; } return NULL; } /** * swap_cache_get_shadow - Looks up a shadow in the swap cache. * @entry: swap entry used for the lookup. * * Context: Caller must ensure @entry is valid and protect the swap device * with reference count or locks. * Return: Returns either NULL or an XA_VALUE (shadow). */ void *swap_cache_get_shadow(swp_entry_t entry) { unsigned long swp_tb; swp_tb = swap_table_get(__swap_entry_to_cluster(entry), swp_cluster_offset(entry)); if (swp_tb_is_shadow(swp_tb)) return swp_tb_to_shadow(swp_tb); return NULL; } /** * swap_cache_add_folio - Add a folio into the swap cache. * @folio: The folio to be added. * @entry: The swap entry corresponding to the folio. * @gfp: gfp_mask for XArray node allocation. * @shadowp: If a shadow is found, return the shadow. * * Context: Caller must ensure @entry is valid and protect the swap device * with reference count or locks. * The caller also needs to update the corresponding swap_map slots with * SWAP_HAS_CACHE bit to avoid race or conflict. */ void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadowp) { void *shadow = NULL; unsigned long old_tb, new_tb; struct swap_cluster_info *ci; unsigned int ci_start, ci_off, ci_end; unsigned long nr_pages = folio_nr_pages(folio); VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio); VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio); new_tb = folio_to_swp_tb(folio); ci_start = swp_cluster_offset(entry); ci_end = ci_start + nr_pages; ci_off = ci_start; ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry)); do { old_tb = __swap_table_xchg(ci, ci_off, new_tb); WARN_ON_ONCE(swp_tb_is_folio(old_tb)); if (swp_tb_is_shadow(old_tb)) shadow = swp_tb_to_shadow(old_tb); } while (++ci_off < ci_end); folio_ref_add(folio, nr_pages); folio_set_swapcache(folio); folio->swap = entry; swap_cluster_unlock(ci); node_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages); if (shadowp) *shadowp = shadow; } /** * __swap_cache_del_folio - Removes a folio from the swap cache. * @ci: The locked swap cluster. * @folio: The folio. * @entry: The first swap entry that the folio corresponds to. * @shadow: shadow value to be filled in the swap cache. * * Removes a folio from the swap cache and fills a shadow in place. * This won't put the folio's refcount. The caller has to do that. * * Context: Caller must ensure the folio is locked and in the swap cache * using the index of @entry, and lock the cluster that holds the entries. */ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry, void *shadow) { unsigned long old_tb, new_tb; unsigned int ci_start, ci_off, ci_end; unsigned long nr_pages = folio_nr_pages(folio); VM_WARN_ON_ONCE(__swap_entry_to_cluster(entry) != ci); VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_writeback(folio), folio); new_tb = shadow_swp_to_tb(shadow); ci_start = swp_cluster_offset(entry); ci_end = ci_start + nr_pages; ci_off = ci_start; do { /* If shadow is NULL, we sets an empty shadow */ old_tb = __swap_table_xchg(ci, ci_off, new_tb); WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != folio); } while (++ci_off < ci_end); folio->swap.val = 0; folio_clear_swapcache(folio); node_stat_mod_folio(folio, NR_FILE_PAGES, -nr_pages); lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages); } /** * swap_cache_del_folio - Removes a folio from the swap cache. * @folio: The folio. * * Same as __swap_cache_del_folio, but handles lock and refcount. The * caller must ensure the folio is either clean or has a swap count * equal to zero, or it may cause data loss. * * Context: Caller must ensure the folio is locked and in the swap cache. */ void swap_cache_del_folio(struct folio *folio) { struct swap_cluster_info *ci; swp_entry_t entry = folio->swap; ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry)); __swap_cache_del_folio(ci, folio, entry, NULL); swap_cluster_unlock(ci); put_swap_folio(folio, entry); folio_ref_sub(folio, folio_nr_pages(folio)); } /** * __swap_cache_replace_folio - Replace a folio in the swap cache. * @ci: The locked swap cluster. * @old: The old folio to be replaced. * @new: The new folio. * * Replace an existing folio in the swap cache with a new folio. The * caller is responsible for setting up the new folio's flag and swap * entries. Replacement will take the new folio's swap entry value as * the starting offset to override all slots covered by the new folio. * * Context: Caller must ensure both folios are locked, and lock the * cluster that holds the old folio to be replaced. */ void __swap_cache_replace_folio(struct swap_cluster_info *ci, struct folio *old, struct folio *new) { swp_entry_t entry = new->swap; unsigned long nr_pages = folio_nr_pages(new); unsigned int ci_off = swp_cluster_offset(entry); unsigned int ci_end = ci_off + nr_pages; unsigned long old_tb, new_tb; VM_WARN_ON_ONCE(!folio_test_swapcache(old) || !folio_test_swapcache(new)); VM_WARN_ON_ONCE(!folio_test_locked(old) || !folio_test_locked(new)); VM_WARN_ON_ONCE(!entry.val); /* Swap cache still stores N entries instead of a high-order entry */ new_tb = folio_to_swp_tb(new); do { old_tb = __swap_table_xchg(ci, ci_off, new_tb); WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != old); } while (++ci_off < ci_end); /* * If the old folio is partially replaced (e.g., splitting a large * folio, the old folio is shrunk, and new split sub folios replace * the shrunk part), ensure the new folio doesn't overlap it. */ if (IS_ENABLED(CONFIG_DEBUG_VM) && folio_order(old) != folio_order(new)) { ci_off = swp_cluster_offset(old->swap); ci_end = ci_off + folio_nr_pages(old); while (ci_off++ < ci_end) WARN_ON_ONCE(swp_tb_to_folio(__swap_table_get(ci, ci_off)) != old); } } /** * swap_cache_clear_shadow - Clears a set of shadows in the swap cache. * @entry: The starting index entry. * @nr_ents: How many slots need to be cleared. * * Context: Caller must ensure the range is valid, all in one single cluster, * not occupied by any folio, and lock the cluster. */ void __swap_cache_clear_shadow(swp_entry_t entry, int nr_ents) { struct swap_cluster_info *ci = __swap_entry_to_cluster(entry); unsigned int ci_off = swp_cluster_offset(entry), ci_end; unsigned long old; ci_end = ci_off + nr_ents; do { old = __swap_table_xchg(ci, ci_off, null_to_swp_tb()); WARN_ON_ONCE(swp_tb_is_folio(old)); } while (++ci_off < ci_end); } /* * If we are the only user, then try to free up the swap cache. * * Its ok to check the swapcache flag without the folio lock * here because we are going to recheck again inside * folio_free_swap() _with_ the lock. * - Marcelo */ void free_swap_cache(struct folio *folio) { if (folio_test_swapcache(folio) && !folio_mapped(folio) && folio_trylock(folio)) { folio_free_swap(folio); folio_unlock(folio); } } /* * Freeing a folio and also freeing any swap cache associated with * this folio if it is the last user. */ void free_folio_and_swap_cache(struct folio *folio) { free_swap_cache(folio); if (!is_huge_zero_folio(folio)) folio_put(folio); } /* * Passed an array of pages, drop them all from swapcache and then release * them. They are removed from the LRU and freed if this is their last use. */ void free_pages_and_swap_cache(struct encoded_page **pages, int nr) { struct folio_batch folios; unsigned int refs[PAGEVEC_SIZE]; folio_batch_init(&folios); for (int i = 0; i < nr; i++) { struct folio *folio = page_folio(encoded_page_ptr(pages[i])); free_swap_cache(folio); refs[folios.nr] = 1; if (unlikely(encoded_page_flags(pages[i]) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)) refs[folios.nr] = encoded_nr_pages(pages[++i]); if (folio_batch_add(&folios, folio) == 0) folios_put_refs(&folios, refs); } if (folios.nr) folios_put_refs(&folios, refs); } static inline bool swap_use_vma_readahead(void) { return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap); } /** * swap_update_readahead - Update the readahead statistics of VMA or globally. * @folio: the swap cache folio that just got hit. * @vma: the VMA that should be updated, could be NULL for global update. * @addr: the addr that triggered the swapin, ignored if @vma is NULL. */ void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma, unsigned long addr) { bool readahead, vma_ra = swap_use_vma_readahead(); /* * At the moment, we don't support PG_readahead for anon THP * so let's bail out rather than confusing the readahead stat. */ if (unlikely(folio_test_large(folio))) return; readahead = folio_test_clear_readahead(folio); if (vma && vma_ra) { unsigned long ra_val; int win, hits; ra_val = GET_SWAP_RA_VAL(vma); win = SWAP_RA_WIN(ra_val); hits = SWAP_RA_HITS(ra_val); if (readahead) hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX); atomic_long_set(&vma->swap_readahead_info, SWAP_RA_VAL(addr, win, hits)); } if (readahead) { count_vm_event(SWAP_RA_HIT); if (!vma || !vma_ra) atomic_inc(&swapin_readahead_hits); } } struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated, bool skip_if_exists) { struct swap_info_struct *si = __swap_entry_to_info(entry); struct folio *folio; struct folio *new_folio = NULL; struct folio *result = NULL; void *shadow = NULL; *new_page_allocated = false; for (;;) { int err; /* * Check the swap cache first, if a cached folio is found, * return it unlocked. The caller will lock and check it. */ folio = swap_cache_get_folio(entry); if (folio) goto got_folio; /* * Just skip read ahead for unused swap slot. */ if (!swap_entry_swapped(si, entry)) goto put_and_return; /* * Get a new folio to read into from swap. Allocate it now if * new_folio not exist, before marking swap_map SWAP_HAS_CACHE, * when -EEXIST will cause any racers to loop around until we * add it to cache. */ if (!new_folio) { new_folio = folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id()); if (!new_folio) goto put_and_return; } /* * Swap entry may have been freed since our caller observed it. */ err = swapcache_prepare(entry, 1); if (!err) break; else if (err != -EEXIST) goto put_and_return; /* * Protect against a recursive call to __read_swap_cache_async() * on the same entry waiting forever here because SWAP_HAS_CACHE * is set but the folio is not the swap cache yet. This can * happen today if mem_cgroup_swapin_charge_folio() below * triggers reclaim through zswap, which may call * __read_swap_cache_async() in the writeback path. */ if (skip_if_exists) goto put_and_return; /* * We might race against __swap_cache_del_folio(), and * stumble across a swap_map entry whose SWAP_HAS_CACHE * has not yet been cleared. Or race against another * __read_swap_cache_async(), which has set SWAP_HAS_CACHE * in swap_map, but not yet added its folio to swap cache. */ schedule_timeout_uninterruptible(1); } /* * The swap entry is ours to swap in. Prepare the new folio. */ __folio_set_locked(new_folio); __folio_set_swapbacked(new_folio); if (mem_cgroup_swapin_charge_folio(new_folio, NULL, gfp_mask, entry)) goto fail_unlock; swap_cache_add_folio(new_folio, entry, &shadow); memcg1_swapin(entry, 1); if (shadow) workingset_refault(new_folio, shadow); /* Caller will initiate read into locked new_folio */ folio_add_lru(new_folio); *new_page_allocated = true; folio = new_folio; got_folio: result = folio; goto put_and_return; fail_unlock: put_swap_folio(new_folio, entry); folio_unlock(new_folio); put_and_return: if (!(*new_page_allocated) && new_folio) folio_put(new_folio); return result; } /* * Locate a page of swap in physical memory, reserving swap cache space * and reading the disk if it is not already cached. * A failure return means that either the page allocation failed or that * the swap entry is no longer in use. * * get/put_swap_device() aren't needed to call this function, because * __read_swap_cache_async() call them and swap_read_folio() holds the * swap cache folio lock. */ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, struct swap_iocb **plug) { struct swap_info_struct *si; bool page_allocated; struct mempolicy *mpol; pgoff_t ilx; struct folio *folio; si = get_swap_device(entry); if (!si) return NULL; mpol = get_vma_policy(vma, addr, 0, &ilx); folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, &page_allocated, false); mpol_cond_put(mpol); if (page_allocated) swap_read_folio(folio, plug); put_swap_device(si); return folio; } static unsigned int __swapin_nr_pages(unsigned long prev_offset, unsigned long offset, int hits, int max_pages, int prev_win) { unsigned int pages, last_ra; /* * This heuristic has been found to work well on both sequential and * random loads, swapping to hard disk or to SSD: please don't ask * what the "+ 2" means, it just happens to work well, that's all. */ pages = hits + 2; if (pages == 2) { /* * We can have no readahead hits to judge by: but must not get * stuck here forever, so check for an adjacent offset instead * (and don't even bother to check whether swap type is same). */ if (offset != prev_offset + 1 && offset != prev_offset - 1) pages = 1; } else { unsigned int roundup = 4; while (roundup < pages) roundup <<= 1; pages = roundup; } if (pages > max_pages) pages = max_pages; /* Don't shrink readahead too fast */ last_ra = prev_win / 2; if (pages < last_ra) pages = last_ra; return pages; } static unsigned long swapin_nr_pages(unsigned long offset) { static unsigned long prev_offset; unsigned int hits, pages, max_pages; static atomic_t last_readahead_pages; max_pages = 1 << READ_ONCE(page_cluster); if (max_pages <= 1) return 1; hits = atomic_xchg(&swapin_readahead_hits, 0); pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits, max_pages, atomic_read(&last_readahead_pages)); if (!hits) WRITE_ONCE(prev_offset, offset); atomic_set(&last_readahead_pages, pages); return pages; } /** * swap_cluster_readahead - swap in pages in hope we need them soon * @entry: swap entry of this memory * @gfp_mask: memory allocation flags * @mpol: NUMA memory allocation policy to be applied * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE * * Returns the struct folio for entry and addr, after queueing swapin. * * Primitive swap readahead code. We simply read an aligned block of * (1 << page_cluster) entries in the swap area. This method is chosen * because it doesn't cost us any seek time. We also make sure to queue * the 'original' request together with the readahead ones... * * Note: it is intentional that the same NUMA policy and interleave index * are used for every page of the readahead: neighbouring pages on swap * are fairly likely to have been swapped out from the same node. */ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx) { struct folio *folio; unsigned long entry_offset = swp_offset(entry); unsigned long offset = entry_offset; unsigned long start_offset, end_offset; unsigned long mask; struct swap_info_struct *si = __swap_entry_to_info(entry); struct blk_plug plug; struct swap_iocb *splug = NULL; bool page_allocated; mask = swapin_nr_pages(offset) - 1; if (!mask) goto skip; /* Read a page_cluster sized and aligned cluster around offset. */ start_offset = offset & ~mask; end_offset = offset | mask; if (!start_offset) /* First page is swap header. */ start_offset++; if (end_offset >= si->max) end_offset = si->max - 1; blk_start_plug(&plug); for (offset = start_offset; offset <= end_offset ; offset++) { /* Ok, do the async read-ahead now */ folio = __read_swap_cache_async( swp_entry(swp_type(entry), offset), gfp_mask, mpol, ilx, &page_allocated, false); if (!folio) continue; if (page_allocated) { swap_read_folio(folio, &splug); if (offset != entry_offset) { folio_set_readahead(folio); count_vm_event(SWAP_RA); } } folio_put(folio); } blk_finish_plug(&plug); swap_read_unplug(splug); lru_add_drain(); /* Push any new pages onto the LRU now */ skip: /* The page was likely read above, so no need for plugging here */ folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, &page_allocated, false); if (unlikely(page_allocated)) swap_read_folio(folio, NULL); return folio; } static int swap_vma_ra_win(struct vm_fault *vmf, unsigned long *start, unsigned long *end) { struct vm_area_struct *vma = vmf->vma; unsigned long ra_val; unsigned long faddr, prev_faddr, left, right; unsigned int max_win, hits, prev_win, win; max_win = 1 << min(READ_ONCE(page_cluster), SWAP_RA_ORDER_CEILING); if (max_win == 1) return 1; faddr = vmf->address; ra_val = GET_SWAP_RA_VAL(vma); prev_faddr = SWAP_RA_ADDR(ra_val); prev_win = SWAP_RA_WIN(ra_val); hits = SWAP_RA_HITS(ra_val); win = __swapin_nr_pages(PFN_DOWN(prev_faddr), PFN_DOWN(faddr), hits, max_win, prev_win); atomic_long_set(&vma->swap_readahead_info, SWAP_RA_VAL(faddr, win, 0)); if (win == 1) return 1; if (faddr == prev_faddr + PAGE_SIZE) left = faddr; else if (prev_faddr == faddr + PAGE_SIZE) left = faddr - (win << PAGE_SHIFT) + PAGE_SIZE; else left = faddr - (((win - 1) / 2) << PAGE_SHIFT); right = left + (win << PAGE_SHIFT); if ((long)left < 0) left = 0; *start = max3(left, vma->vm_start, faddr & PMD_MASK); *end = min3(right, vma->vm_end, (faddr & PMD_MASK) + PMD_SIZE); return win; } /** * swap_vma_readahead - swap in pages in hope we need them soon * @targ_entry: swap entry of the targeted memory * @gfp_mask: memory allocation flags * @mpol: NUMA memory allocation policy to be applied * @targ_ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE * @vmf: fault information * * Returns the struct folio for entry and addr, after queueing swapin. * * Primitive swap readahead code. We simply read in a few pages whose * virtual addresses are around the fault address in the same vma. * * Caller must hold read mmap_lock if vmf->vma is not NULL. * */ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t targ_ilx, struct vm_fault *vmf) { struct blk_plug plug; struct swap_iocb *splug = NULL; struct folio *folio; pte_t *pte = NULL, pentry; int win; unsigned long start, end, addr; swp_entry_t entry; pgoff_t ilx; bool page_allocated; win = swap_vma_ra_win(vmf, &start, &end); if (win == 1) goto skip; ilx = targ_ilx - PFN_DOWN(vmf->address - start); blk_start_plug(&plug); for (addr = start; addr < end; ilx++, addr += PAGE_SIZE) { if (!pte++) { pte = pte_offset_map(vmf->pmd, addr); if (!pte) break; } pentry = ptep_get_lockless(pte); if (!is_swap_pte(pentry)) continue; entry = pte_to_swp_entry(pentry); if (unlikely(non_swap_entry(entry))) continue; pte_unmap(pte); pte = NULL; folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, &page_allocated, false); if (!folio) continue; if (page_allocated) { swap_read_folio(folio, &splug); if (addr != vmf->address) { folio_set_readahead(folio); count_vm_event(SWAP_RA); } } folio_put(folio); } if (pte) pte_unmap(pte); blk_finish_plug(&plug); swap_read_unplug(splug); lru_add_drain(); skip: /* The folio was likely read above, so no need for plugging here */ folio = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx, &page_allocated, false); if (unlikely(page_allocated)) swap_read_folio(folio, NULL); return folio; } /** * swapin_readahead - swap in pages in hope we need them soon * @entry: swap entry of this memory * @gfp_mask: memory allocation flags * @vmf: fault information * * Returns the struct folio for entry and addr, after queueing swapin. * * It's a main entry function for swap readahead. By the configuration, * it will read ahead blocks by cluster-based(ie, physical disk based) * or vma-based(ie, virtual address based on faulty address) readahead. */ struct folio *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, struct vm_fault *vmf) { struct mempolicy *mpol; pgoff_t ilx; struct folio *folio; mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx); folio = swap_use_vma_readahead() ? swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf) : swap_cluster_readahead(entry, gfp_mask, mpol, ilx); mpol_cond_put(mpol); return folio; } #ifdef CONFIG_SYSFS static ssize_t vma_ra_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", str_true_false(enable_vma_readahead)); } static ssize_t vma_ra_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { ssize_t ret; ret = kstrtobool(buf, &enable_vma_readahead); if (ret) return ret; return count; } static struct kobj_attribute vma_ra_enabled_attr = __ATTR_RW(vma_ra_enabled); static struct attribute *swap_attrs[] = { &vma_ra_enabled_attr.attr, NULL, }; static const struct attribute_group swap_attr_group = { .attrs = swap_attrs, }; static int __init swap_init(void) { int err; struct kobject *swap_kobj; swap_kobj = kobject_create_and_add("swap", mm_kobj); if (!swap_kobj) { pr_err("failed to create swap kobject\n"); return -ENOMEM; } err = sysfs_create_group(swap_kobj, &swap_attr_group); if (err) { pr_err("failed to register swap group\n"); goto delete_obj; } /* Swap cache writeback is LRU based, no tags for it */ mapping_set_no_writeback_tags(&swap_space); return 0; delete_obj: kobject_put(swap_kobj); return err; } subsys_initcall(swap_init); #endif |
| 1 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Force feedback support for various HID compliant devices by ThrustMaster: * ThrustMaster FireStorm Dual Power 2 * and possibly others whose device ids haven't been added. * * Modified to support ThrustMaster devices by Zinx Verituse * on 2003-01-25 from the Logitech force feedback driver, * which is by Johann Deneux. * * Copyright (c) 2003 Zinx Verituse <zinx@epicsol.org> * Copyright (c) 2002 Johann Deneux */ /* */ #include <linux/hid.h> #include <linux/input.h> #include <linux/slab.h> #include <linux/module.h> #include "hid-ids.h" #define THRUSTMASTER_DEVICE_ID_2_IN_1_DT 0xb320 static const signed short ff_rumble[] = { FF_RUMBLE, -1 }; static const signed short ff_joystick[] = { FF_CONSTANT, -1 }; #ifdef CONFIG_THRUSTMASTER_FF /* Usages for thrustmaster devices I know about */ #define THRUSTMASTER_USAGE_FF (HID_UP_GENDESK | 0xbb) struct tmff_device { struct hid_report *report; struct hid_field *ff_field; }; /* Changes values from 0 to 0xffff into values from minimum to maximum */ static inline int tmff_scale_u16(unsigned int in, int minimum, int maximum) { int ret; ret = (in * (maximum - minimum) / 0xffff) + minimum; if (ret < minimum) return minimum; if (ret > maximum) return maximum; return ret; } /* Changes values from -0x80 to 0x7f into values from minimum to maximum */ static inline int tmff_scale_s8(int in, int minimum, int maximum) { int ret; ret = (((in + 0x80) * (maximum - minimum)) / 0xff) + minimum; if (ret < minimum) return minimum; if (ret > maximum) return maximum; return ret; } static int tmff_play(struct input_dev *dev, void *data, struct ff_effect *effect) { struct hid_device *hid = input_get_drvdata(dev); struct tmff_device *tmff = data; struct hid_field *ff_field = tmff->ff_field; int x, y; int left, right; /* Rumbling */ switch (effect->type) { case FF_CONSTANT: x = tmff_scale_s8(effect->u.ramp.start_level, ff_field->logical_minimum, ff_field->logical_maximum); y = tmff_scale_s8(effect->u.ramp.end_level, ff_field->logical_minimum, ff_field->logical_maximum); dbg_hid("(x, y)=(%04x, %04x)\n", x, y); ff_field->value[0] = x; ff_field->value[1] = y; hid_hw_request(hid, tmff->report, HID_REQ_SET_REPORT); break; case FF_RUMBLE: left = tmff_scale_u16(effect->u.rumble.weak_magnitude, ff_field->logical_minimum, ff_field->logical_maximum); right = tmff_scale_u16(effect->u.rumble.strong_magnitude, ff_field->logical_minimum, ff_field->logical_maximum); /* 2-in-1 strong motor is left */ if (hid->product == THRUSTMASTER_DEVICE_ID_2_IN_1_DT) swap(left, right); dbg_hid("(left,right)=(%08x, %08x)\n", left, right); ff_field->value[0] = left; ff_field->value[1] = right; hid_hw_request(hid, tmff->report, HID_REQ_SET_REPORT); break; } return 0; } static int tmff_init(struct hid_device *hid, const signed short *ff_bits) { struct tmff_device *tmff; struct hid_report *report; struct list_head *report_list; struct hid_input *hidinput; struct input_dev *input_dev; int error; int i; if (list_empty(&hid->inputs)) { hid_err(hid, "no inputs found\n"); return -ENODEV; } hidinput = list_entry(hid->inputs.next, struct hid_input, list); input_dev = hidinput->input; tmff = kzalloc(sizeof(struct tmff_device), GFP_KERNEL); if (!tmff) return -ENOMEM; /* Find the report to use */ report_list = &hid->report_enum[HID_OUTPUT_REPORT].report_list; list_for_each_entry(report, report_list, list) { int fieldnum; for (fieldnum = 0; fieldnum < report->maxfield; ++fieldnum) { struct hid_field *field = report->field[fieldnum]; if (field->maxusage <= 0) continue; switch (field->usage[0].hid) { case THRUSTMASTER_USAGE_FF: if (field->report_count < 2) { hid_warn(hid, "ignoring FF field with report_count < 2\n"); continue; } if (field->logical_maximum == field->logical_minimum) { hid_warn(hid, "ignoring FF field with logical_maximum == logical_minimum\n"); continue; } if (tmff->report && tmff->report != report) { hid_warn(hid, "ignoring FF field in other report\n"); continue; } if (tmff->ff_field && tmff->ff_field != field) { hid_warn(hid, "ignoring duplicate FF field\n"); continue; } tmff->report = report; tmff->ff_field = field; for (i = 0; ff_bits[i] >= 0; i++) set_bit(ff_bits[i], input_dev->ffbit); break; default: hid_warn(hid, "ignoring unknown output usage %08x\n", field->usage[0].hid); continue; } } } if (!tmff->report) { hid_err(hid, "can't find FF field in output reports\n"); error = -ENODEV; goto fail; } error = input_ff_create_memless(input_dev, tmff, tmff_play); if (error) goto fail; hid_info(hid, "force feedback for ThrustMaster devices by Zinx Verituse <zinx@epicsol.org>\n"); return 0; fail: kfree(tmff); return error; } #else static inline int tmff_init(struct hid_device *hid, const signed short *ff_bits) { return 0; } #endif static int tm_probe(struct hid_device *hdev, const struct hid_device_id *id) { int ret; ret = hid_parse(hdev); if (ret) { hid_err(hdev, "parse failed\n"); goto err; } ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT & ~HID_CONNECT_FF); if (ret) { hid_err(hdev, "hw start failed\n"); goto err; } tmff_init(hdev, (void *)id->driver_data); return 0; err: return ret; } static const struct hid_device_id tm_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_THRUSTMASTER, 0xb300), .driver_data = (unsigned long)ff_rumble }, { HID_USB_DEVICE(USB_VENDOR_ID_THRUSTMASTER, 0xb304), /* FireStorm Dual Power 2 (and 3) */ .driver_data = (unsigned long)ff_rumble }, { HID_USB_DEVICE(USB_VENDOR_ID_THRUSTMASTER, THRUSTMASTER_DEVICE_ID_2_IN_1_DT), /* Dual Trigger 2-in-1 */ .driver_data = (unsigned long)ff_rumble }, { HID_USB_DEVICE(USB_VENDOR_ID_THRUSTMASTER, 0xb323), /* Dual Trigger 3-in-1 (PC Mode) */ .driver_data = (unsigned long)ff_rumble }, { HID_USB_DEVICE(USB_VENDOR_ID_THRUSTMASTER, 0xb324), /* Dual Trigger 3-in-1 (PS3 Mode) */ .driver_data = (unsigned long)ff_rumble }, { HID_USB_DEVICE(USB_VENDOR_ID_THRUSTMASTER, 0xb605), /* NASCAR PRO FF2 Wheel */ .driver_data = (unsigned long)ff_joystick }, { HID_USB_DEVICE(USB_VENDOR_ID_THRUSTMASTER, 0xb651), /* FGT Rumble Force Wheel */ .driver_data = (unsigned long)ff_rumble }, { HID_USB_DEVICE(USB_VENDOR_ID_THRUSTMASTER, 0xb653), /* RGT Force Feedback CLUTCH Raging Wheel */ .driver_data = (unsigned long)ff_joystick }, { HID_USB_DEVICE(USB_VENDOR_ID_THRUSTMASTER, 0xb654), /* FGT Force Feedback Wheel */ .driver_data = (unsigned long)ff_joystick }, { HID_USB_DEVICE(USB_VENDOR_ID_THRUSTMASTER, 0xb65a), /* F430 Force Feedback Wheel */ .driver_data = (unsigned long)ff_joystick }, { } }; MODULE_DEVICE_TABLE(hid, tm_devices); static struct hid_driver tm_driver = { .name = "thrustmaster", .id_table = tm_devices, .probe = tm_probe, }; module_hid_driver(tm_driver); MODULE_DESCRIPTION("Force feedback support for various HID compliant devices by ThrustMaster"); MODULE_LICENSE("GPL"); |
| 475 98 3 98 63 22 22 22 68 64 1 115 1 3 1 41 23 3 3 3 3 3 3 3 3 3 51 50 2 33 12 12 79 2 7 1 4 16 2 28 1 183 183 183 183 183 40 2 149 149 113 2 138 115 138 1 47 17 3 4 4 3 77 41 12 36 37 60 23 79 45 37 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 | /* SPDX-License-Identifier: GPL-2.0 */ /* Multipath TCP * * Copyright (c) 2017 - 2019, Intel Corporation. */ #ifndef __MPTCP_PROTOCOL_H #define __MPTCP_PROTOCOL_H #include <linux/random.h> #include <net/tcp.h> #include <net/inet_connection_sock.h> #include <uapi/linux/mptcp.h> #include <net/genetlink.h> #include <net/rstreason.h> #define MPTCP_SUPPORTED_VERSION 1 /* MPTCP option bits */ #define OPTION_MPTCP_MPC_SYN BIT(0) #define OPTION_MPTCP_MPC_SYNACK BIT(1) #define OPTION_MPTCP_MPC_ACK BIT(2) #define OPTION_MPTCP_MPJ_SYN BIT(3) #define OPTION_MPTCP_MPJ_SYNACK BIT(4) #define OPTION_MPTCP_MPJ_ACK BIT(5) #define OPTION_MPTCP_ADD_ADDR BIT(6) #define OPTION_MPTCP_RM_ADDR BIT(7) #define OPTION_MPTCP_FASTCLOSE BIT(8) #define OPTION_MPTCP_PRIO BIT(9) #define OPTION_MPTCP_RST BIT(10) #define OPTION_MPTCP_DSS BIT(11) #define OPTION_MPTCP_FAIL BIT(12) #define OPTION_MPTCP_CSUMREQD BIT(13) #define OPTIONS_MPTCP_MPC (OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK | \ OPTION_MPTCP_MPC_ACK) #define OPTIONS_MPTCP_MPJ (OPTION_MPTCP_MPJ_SYN | OPTION_MPTCP_MPJ_SYNACK | \ OPTION_MPTCP_MPJ_ACK) /* MPTCP option subtypes */ #define MPTCPOPT_MP_CAPABLE 0 #define MPTCPOPT_MP_JOIN 1 #define MPTCPOPT_DSS 2 #define MPTCPOPT_ADD_ADDR 3 #define MPTCPOPT_RM_ADDR 4 #define MPTCPOPT_MP_PRIO 5 #define MPTCPOPT_MP_FAIL 6 #define MPTCPOPT_MP_FASTCLOSE 7 #define MPTCPOPT_RST 8 /* MPTCP suboption lengths */ #define TCPOLEN_MPTCP_MPC_SYN 4 #define TCPOLEN_MPTCP_MPC_SYNACK 12 #define TCPOLEN_MPTCP_MPC_ACK 20 #define TCPOLEN_MPTCP_MPC_ACK_DATA 22 #define TCPOLEN_MPTCP_MPJ_SYN 12 #define TCPOLEN_MPTCP_MPJ_SYNACK 16 #define TCPOLEN_MPTCP_MPJ_ACK 24 #define TCPOLEN_MPTCP_DSS_BASE 4 #define TCPOLEN_MPTCP_DSS_ACK32 4 #define TCPOLEN_MPTCP_DSS_ACK64 8 #define TCPOLEN_MPTCP_DSS_MAP32 10 #define TCPOLEN_MPTCP_DSS_MAP64 14 #define TCPOLEN_MPTCP_DSS_CHECKSUM 2 #define TCPOLEN_MPTCP_ADD_ADDR 16 #define TCPOLEN_MPTCP_ADD_ADDR_PORT 18 #define TCPOLEN_MPTCP_ADD_ADDR_BASE 8 #define TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT 10 #define TCPOLEN_MPTCP_ADD_ADDR6 28 #define TCPOLEN_MPTCP_ADD_ADDR6_PORT 30 #define TCPOLEN_MPTCP_ADD_ADDR6_BASE 20 #define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT 22 #define TCPOLEN_MPTCP_PORT_LEN 2 #define TCPOLEN_MPTCP_PORT_ALIGN 2 #define TCPOLEN_MPTCP_RM_ADDR_BASE 3 #define TCPOLEN_MPTCP_PRIO 3 #define TCPOLEN_MPTCP_PRIO_ALIGN 4 #define TCPOLEN_MPTCP_FASTCLOSE 12 #define TCPOLEN_MPTCP_RST 4 #define TCPOLEN_MPTCP_FAIL 12 #define TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM (TCPOLEN_MPTCP_DSS_CHECKSUM + TCPOLEN_MPTCP_MPC_ACK_DATA) /* MPTCP MP_JOIN flags */ #define MPTCPOPT_BACKUP BIT(0) #define MPTCPOPT_THMAC_LEN 8 /* MPTCP MP_CAPABLE flags */ #define MPTCP_VERSION_MASK (0x0F) #define MPTCP_CAP_CHECKSUM_REQD BIT(7) #define MPTCP_CAP_EXTENSIBILITY BIT(6) #define MPTCP_CAP_DENY_JOIN_ID0 BIT(5) #define MPTCP_CAP_HMAC_SHA256 BIT(0) #define MPTCP_CAP_FLAG_MASK (0x1F) /* MPTCP DSS flags */ #define MPTCP_DSS_DATA_FIN BIT(4) #define MPTCP_DSS_DSN64 BIT(3) #define MPTCP_DSS_HAS_MAP BIT(2) #define MPTCP_DSS_ACK64 BIT(1) #define MPTCP_DSS_HAS_ACK BIT(0) #define MPTCP_DSS_FLAG_MASK (0x1F) /* MPTCP ADD_ADDR flags */ #define MPTCP_ADDR_ECHO BIT(0) /* MPTCP MP_PRIO flags */ #define MPTCP_PRIO_BKUP BIT(0) /* MPTCP TCPRST flags */ #define MPTCP_RST_TRANSIENT BIT(0) /* MPTCP socket atomic flags */ #define MPTCP_WORK_RTX 1 #define MPTCP_FALLBACK_DONE 2 #define MPTCP_WORK_CLOSE_SUBFLOW 3 /* MPTCP socket release cb flags */ #define MPTCP_PUSH_PENDING 1 #define MPTCP_CLEAN_UNA 2 #define MPTCP_ERROR_REPORT 3 #define MPTCP_RETRANSMIT 4 #define MPTCP_FLUSH_JOIN_LIST 5 #define MPTCP_SYNC_STATE 6 #define MPTCP_SYNC_SNDBUF 7 #define MPTCP_DEQUEUE 8 struct mptcp_skb_cb { u64 map_seq; u64 end_seq; u32 offset; u8 has_rxtstamp; u8 cant_coalesce; }; #define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0])) static inline bool before64(__u64 seq1, __u64 seq2) { return (__s64)(seq1 - seq2) < 0; } #define after64(seq2, seq1) before64(seq1, seq2) struct mptcp_options_received { u64 sndr_key; u64 rcvr_key; u64 data_ack; u64 data_seq; u32 subflow_seq; u16 data_len; __sum16 csum; struct_group(status, u16 suboptions; u16 use_map:1, dsn64:1, data_fin:1, use_ack:1, ack64:1, mpc_map:1, reset_reason:4, reset_transient:1, echo:1, backup:1, deny_join_id0:1, __unused:2; ); u8 join_id; u32 token; u32 nonce; u64 thmac; u8 hmac[MPTCPOPT_HMAC_LEN]; struct mptcp_addr_info addr; struct mptcp_rm_list rm_list; u64 ahmac; u64 fail_seq; }; static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field) { return htonl((TCPOPT_MPTCP << 24) | (len << 16) | (subopt << 12) | ((nib & 0xF) << 8) | field); } enum mptcp_pm_status { MPTCP_PM_ADD_ADDR_RECEIVED, MPTCP_PM_ADD_ADDR_SEND_ACK, MPTCP_PM_RM_ADDR_RECEIVED, MPTCP_PM_ESTABLISHED, MPTCP_PM_SUBFLOW_ESTABLISHED, MPTCP_PM_ALREADY_ESTABLISHED, /* persistent status, set after ESTABLISHED event */ MPTCP_PM_MPC_ENDPOINT_ACCOUNTED /* persistent status, set after MPC local address is * accounted int id_avail_bitmap */ }; enum mptcp_pm_type { MPTCP_PM_TYPE_KERNEL = 0, MPTCP_PM_TYPE_USERSPACE, __MPTCP_PM_TYPE_NR, __MPTCP_PM_TYPE_MAX = __MPTCP_PM_TYPE_NR - 1, }; /* Status bits below MPTCP_PM_ALREADY_ESTABLISHED need pm worker actions */ #define MPTCP_PM_WORK_MASK ((1 << MPTCP_PM_ALREADY_ESTABLISHED) - 1) enum mptcp_addr_signal_status { MPTCP_ADD_ADDR_SIGNAL, MPTCP_ADD_ADDR_ECHO, MPTCP_RM_ADDR_SIGNAL, }; /* max value of mptcp_addr_info.id */ #define MPTCP_PM_MAX_ADDR_ID U8_MAX struct mptcp_pm_data { struct mptcp_addr_info local; struct mptcp_addr_info remote; struct list_head anno_list; struct list_head userspace_pm_local_addr_list; spinlock_t lock; /*protects the whole PM data */ struct_group(reset, u8 addr_signal; bool server_side; bool work_pending; bool accept_addr; bool accept_subflow; bool remote_deny_join_id0; u8 add_addr_signaled; u8 add_addr_accepted; u8 local_addr_used; u8 pm_type; u8 extra_subflows; u8 status; ); DECLARE_BITMAP(id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); struct mptcp_rm_list rm_list_tx; struct mptcp_rm_list rm_list_rx; }; struct mptcp_pm_local { struct mptcp_addr_info addr; u8 flags; int ifindex; }; struct mptcp_pm_addr_entry { struct list_head list; struct mptcp_addr_info addr; u8 flags; int ifindex; struct socket *lsk; }; struct mptcp_data_frag { struct list_head list; u64 data_seq; u16 data_len; u16 offset; u16 overhead; u16 already_sent; struct page *page; }; /* MPTCP connection sock */ struct mptcp_sock { /* inet_connection_sock must be the first member */ struct inet_connection_sock sk; u64 local_key; /* protected by the first subflow socket lock * lockless access read */ u64 remote_key; /* same as above */ u64 write_seq; u64 bytes_sent; u64 snd_nxt; u64 bytes_received; u64 ack_seq; atomic64_t rcv_wnd_sent; u64 rcv_data_fin_seq; u64 bytes_retrans; u64 bytes_consumed; int snd_burst; int old_wspace; u64 recovery_snd_nxt; /* in recovery mode accept up to this seq; * recovery related fields are under data_lock * protection */ u64 bytes_acked; u64 snd_una; u64 wnd_end; u32 last_data_sent; u32 last_data_recv; u32 last_ack_recv; unsigned long timer_ival; u32 token; unsigned long flags; unsigned long cb_flags; bool recovery; /* closing subflow write queue reinjected */ bool can_ack; bool fully_established; bool rcv_data_fin; bool snd_data_fin_enable; bool rcv_fastclose; bool use_64bit_ack; /* Set when we received a 64-bit DSN */ bool csum_enabled; bool allow_infinite_fallback; u8 pending_state; /* A subflow asked to set this sk_state, * protected by the msk data lock */ u8 mpc_endpoint_id; u8 recvmsg_inq:1, cork:1, nodelay:1, fastopening:1, in_accept_queue:1, free_first:1, rcvspace_init:1; u32 notsent_lowat; int keepalive_cnt; int keepalive_idle; int keepalive_intvl; int maxseg; struct work_struct work; struct sk_buff *ooo_last_skb; struct rb_root out_of_order_queue; struct list_head conn_list; struct list_head rtx_queue; struct mptcp_data_frag *first_pending; struct list_head join_list; struct sock *first; /* The mptcp ops can safely dereference, using suitable * ONCE annotation, the subflow outside the socket * lock as such sock is freed after close(). */ struct mptcp_pm_data pm; struct mptcp_sched_ops *sched; struct { int space; /* bytes copied in last measurement window */ int copied; /* bytes copied in this measurement window */ u64 time; /* start time of measurement window */ u64 rtt_us; /* last maximum rtt of subflows */ } rcvq_space; u8 scaling_ratio; bool allow_subflows; u32 subflow_id; u32 setsockopt_seq; char ca_name[TCP_CA_NAME_MAX]; spinlock_t fallback_lock; /* protects fallback, * allow_infinite_fallback and * allow_join */ }; #define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock) #define mptcp_data_unlock(sk) spin_unlock_bh(&(sk)->sk_lock.slock) #define mptcp_for_each_subflow(__msk, __subflow) \ list_for_each_entry(__subflow, &((__msk)->conn_list), node) #define mptcp_for_each_subflow_safe(__msk, __subflow, __tmp) \ list_for_each_entry_safe(__subflow, __tmp, &((__msk)->conn_list), node) #define mptcp_next_subflow(__msk, __subflow) \ list_next_entry_circular(__subflow, &((__msk)->conn_list), node) extern struct genl_family mptcp_genl_family; static inline void msk_owned_by_me(const struct mptcp_sock *msk) { sock_owned_by_me((const struct sock *)msk); } #ifdef CONFIG_DEBUG_NET /* MPTCP-specific: we might (indirectly) call this helper with the wrong sk */ #undef tcp_sk #define tcp_sk(ptr) ({ \ typeof(ptr) _ptr = (ptr); \ WARN_ON(_ptr->sk_protocol != IPPROTO_TCP); \ container_of_const(_ptr, struct tcp_sock, inet_conn.icsk_inet.sk); \ }) #define mptcp_sk(ptr) ({ \ typeof(ptr) _ptr = (ptr); \ WARN_ON(_ptr->sk_protocol != IPPROTO_MPTCP); \ container_of_const(_ptr, struct mptcp_sock, sk.icsk_inet.sk); \ }) #else /* !CONFIG_DEBUG_NET */ #define mptcp_sk(ptr) container_of_const(ptr, struct mptcp_sock, sk.icsk_inet.sk) #endif static inline int mptcp_win_from_space(const struct sock *sk, int space) { return __tcp_win_from_space(mptcp_sk(sk)->scaling_ratio, space); } static inline int mptcp_space_from_win(const struct sock *sk, int win) { return __tcp_space_from_win(mptcp_sk(sk)->scaling_ratio, win); } static inline int __mptcp_space(const struct sock *sk) { return mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - sk_rmem_alloc_get(sk)); } static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk) { const struct mptcp_sock *msk = mptcp_sk(sk); return READ_ONCE(msk->first_pending); } static inline struct mptcp_data_frag *mptcp_send_next(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *cur; cur = msk->first_pending; return list_is_last(&cur->list, &msk->rtx_queue) ? NULL : list_next_entry(cur, list); } static inline struct mptcp_data_frag *mptcp_pending_tail(const struct sock *sk) { const struct mptcp_sock *msk = mptcp_sk(sk); if (!msk->first_pending) return NULL; if (WARN_ON_ONCE(list_empty(&msk->rtx_queue))) return NULL; return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list); } static inline struct mptcp_data_frag *mptcp_rtx_head(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); if (msk->snd_una == msk->snd_nxt) return NULL; return list_first_entry_or_null(&msk->rtx_queue, struct mptcp_data_frag, list); } struct csum_pseudo_header { __be64 data_seq; __be32 subflow_seq; __be16 data_len; __sum16 csum; }; struct mptcp_subflow_request_sock { struct tcp_request_sock sk; u16 mp_capable : 1, mp_join : 1, backup : 1, request_bkup : 1, csum_reqd : 1, allow_join_id0 : 1; u8 local_id; u8 remote_id; u64 local_key; u64 idsn; u32 token; u32 ssn_offset; u64 thmac; u32 local_nonce; u32 remote_nonce; struct mptcp_sock *msk; struct hlist_nulls_node token_node; }; static inline struct mptcp_subflow_request_sock * mptcp_subflow_rsk(const struct request_sock *rsk) { return (struct mptcp_subflow_request_sock *)rsk; } struct mptcp_delegated_action { struct napi_struct napi; local_lock_t bh_lock; struct list_head head; }; DECLARE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions); #define MPTCP_DELEGATE_SCHEDULED 0 #define MPTCP_DELEGATE_SEND 1 #define MPTCP_DELEGATE_ACK 2 #define MPTCP_DELEGATE_SNDBUF 3 #define MPTCP_DELEGATE_ACTIONS_MASK (~BIT(MPTCP_DELEGATE_SCHEDULED)) /* MPTCP subflow context */ struct mptcp_subflow_context { struct list_head node;/* conn_list of subflows */ struct_group(reset, unsigned long avg_pacing_rate; /* protected by msk socket lock */ u64 local_key; u64 remote_key; u64 idsn; u64 map_seq; u32 snd_isn; u32 token; u32 rel_write_seq; u32 map_subflow_seq; u32 ssn_offset; u32 map_data_len; __wsum map_data_csum; u32 map_csum_len; u32 request_mptcp : 1, /* send MP_CAPABLE */ request_join : 1, /* send MP_JOIN */ request_bkup : 1, mp_capable : 1, /* remote is MPTCP capable */ mp_join : 1, /* remote is JOINing */ pm_notified : 1, /* PM hook called for established status */ conn_finished : 1, map_valid : 1, map_csum_reqd : 1, map_data_fin : 1, mpc_map : 1, backup : 1, send_mp_prio : 1, send_mp_fail : 1, send_fastclose : 1, send_infinite_map : 1, remote_key_valid : 1, /* received the peer key from */ disposable : 1, /* ctx can be free at ulp release time */ stale : 1, /* unable to snd/rcv data, do not use for xmit */ valid_csum_seen : 1, /* at least one csum validated */ is_mptfo : 1, /* subflow is doing TFO */ close_event_done : 1, /* has done the post-closed part */ mpc_drop : 1, /* the MPC option has been dropped in a rtx */ __unused : 9; bool data_avail; bool scheduled; bool pm_listener; /* a listener managed by the kernel PM? */ bool fully_established; /* path validated */ u32 remote_nonce; u64 thmac; u32 local_nonce; u32 remote_token; union { u8 hmac[MPTCPOPT_HMAC_LEN]; /* MPJ subflow only */ u64 iasn; /* initial ack sequence number, MPC subflows only */ }; s16 local_id; /* if negative not initialized yet */ u8 remote_id; u8 reset_seen:1; u8 reset_transient:1; u8 reset_reason:4; u8 stale_count; u32 subflow_id; long delegated_status; unsigned long fail_tout; ); struct list_head delegated_node; /* link into delegated_action, protected by local BH */ u32 setsockopt_seq; u32 stale_rcv_tstamp; int cached_sndbuf; /* sndbuf size when last synced with the msk sndbuf, * protected by the msk socket lock */ struct sock *tcp_sock; /* tcp sk backpointer */ struct sock *conn; /* parent mptcp_sock */ const struct inet_connection_sock_af_ops *icsk_af_ops; void (*tcp_state_change)(struct sock *sk); void (*tcp_error_report)(struct sock *sk); struct rcu_head rcu; }; static inline struct mptcp_subflow_context * mptcp_subflow_ctx(const struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); /* Use RCU on icsk_ulp_data only for sock diag code */ return (__force struct mptcp_subflow_context *)icsk->icsk_ulp_data; } static inline struct sock * mptcp_subflow_tcp_sock(const struct mptcp_subflow_context *subflow) { return subflow->tcp_sock; } static inline void mptcp_subflow_ctx_reset(struct mptcp_subflow_context *subflow) { memset(&subflow->reset, 0, sizeof(subflow->reset)); subflow->request_mptcp = 1; WRITE_ONCE(subflow->local_id, -1); } /* Convert reset reasons in MPTCP to enum sk_rst_reason type */ static inline enum sk_rst_reason sk_rst_convert_mptcp_reason(u32 reason) { switch (reason) { case MPTCP_RST_EUNSPEC: return SK_RST_REASON_MPTCP_RST_EUNSPEC; case MPTCP_RST_EMPTCP: return SK_RST_REASON_MPTCP_RST_EMPTCP; case MPTCP_RST_ERESOURCE: return SK_RST_REASON_MPTCP_RST_ERESOURCE; case MPTCP_RST_EPROHIBIT: return SK_RST_REASON_MPTCP_RST_EPROHIBIT; case MPTCP_RST_EWQ2BIG: return SK_RST_REASON_MPTCP_RST_EWQ2BIG; case MPTCP_RST_EBADPERF: return SK_RST_REASON_MPTCP_RST_EBADPERF; case MPTCP_RST_EMIDDLEBOX: return SK_RST_REASON_MPTCP_RST_EMIDDLEBOX; default: /* It should not happen, or else errors may occur * in MPTCP layer */ return SK_RST_REASON_ERROR; } } static inline void mptcp_send_active_reset_reason(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); enum sk_rst_reason reason; reason = sk_rst_convert_mptcp_reason(subflow->reset_reason); tcp_send_active_reset(sk, GFP_ATOMIC, reason); } static inline u64 mptcp_subflow_get_map_offset(const struct mptcp_subflow_context *subflow) { return tcp_sk(mptcp_subflow_tcp_sock(subflow))->copied_seq - subflow->ssn_offset - subflow->map_subflow_seq; } static inline u64 mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow) { return subflow->map_seq + mptcp_subflow_get_map_offset(subflow); } void mptcp_subflow_process_delegated(struct sock *ssk, long actions); static inline void mptcp_subflow_delegate(struct mptcp_subflow_context *subflow, int action) { long old, set_bits = BIT(MPTCP_DELEGATE_SCHEDULED) | BIT(action); struct mptcp_delegated_action *delegated; bool schedule; /* the caller held the subflow bh socket lock */ lockdep_assert_in_softirq(); /* The implied barrier pairs with tcp_release_cb_override() * mptcp_napi_poll(), and ensures the below list check sees list * updates done prior to delegated status bits changes */ old = set_mask_bits(&subflow->delegated_status, 0, set_bits); if (!(old & BIT(MPTCP_DELEGATE_SCHEDULED))) { if (WARN_ON_ONCE(!list_empty(&subflow->delegated_node))) return; local_lock_nested_bh(&mptcp_delegated_actions.bh_lock); delegated = this_cpu_ptr(&mptcp_delegated_actions); schedule = list_empty(&delegated->head); list_add_tail(&subflow->delegated_node, &delegated->head); local_unlock_nested_bh(&mptcp_delegated_actions.bh_lock); sock_hold(mptcp_subflow_tcp_sock(subflow)); if (schedule) napi_schedule(&delegated->napi); } } static inline struct mptcp_subflow_context * mptcp_subflow_delegated_next(struct mptcp_delegated_action *delegated) { struct mptcp_subflow_context *ret; local_lock_nested_bh(&mptcp_delegated_actions.bh_lock); if (list_empty(&delegated->head)) { local_unlock_nested_bh(&mptcp_delegated_actions.bh_lock); return NULL; } ret = list_first_entry(&delegated->head, struct mptcp_subflow_context, delegated_node); list_del_init(&ret->delegated_node); local_unlock_nested_bh(&mptcp_delegated_actions.bh_lock); return ret; } int mptcp_is_enabled(const struct net *net); unsigned int mptcp_get_add_addr_timeout(const struct net *net); int mptcp_is_checksum_enabled(const struct net *net); int mptcp_allow_join_id0(const struct net *net); unsigned int mptcp_stale_loss_cnt(const struct net *net); unsigned int mptcp_close_timeout(const struct sock *sk); int mptcp_get_pm_type(const struct net *net); const char *mptcp_get_path_manager(const struct net *net); const char *mptcp_get_scheduler(const struct net *net); void mptcp_active_disable(struct sock *sk); bool mptcp_active_should_disable(struct sock *ssk); void mptcp_active_enable(struct sock *sk); void mptcp_get_available_schedulers(char *buf, size_t maxlen); void __mptcp_subflow_fully_established(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, const struct mptcp_options_received *mp_opt); bool __mptcp_retransmit_pending_data(struct sock *sk); void mptcp_check_and_set_pending(struct sock *sk); void __mptcp_push_pending(struct sock *sk, unsigned int flags); bool mptcp_subflow_data_available(struct sock *sk); void __init mptcp_subflow_init(void); void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how); void mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow); void __mptcp_subflow_send_ack(struct sock *ssk); void mptcp_subflow_reset(struct sock *ssk); void mptcp_subflow_queue_clean(struct sock *sk, struct sock *ssk); void mptcp_sock_graft(struct sock *sk, struct socket *parent); struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk); bool __mptcp_close(struct sock *sk, long timeout); void mptcp_cancel_work(struct sock *sk); void __mptcp_unaccepted_force_close(struct sock *sk); void mptcp_set_state(struct sock *sk, int state); bool mptcp_addresses_equal(const struct mptcp_addr_info *a, const struct mptcp_addr_info *b, bool use_port); void mptcp_local_address(const struct sock_common *skc, struct mptcp_addr_info *addr); void mptcp_remote_address(const struct sock_common *skc, struct mptcp_addr_info *addr); /* called with sk socket lock held */ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_pm_local *local, const struct mptcp_addr_info *remote); int mptcp_subflow_create_socket(struct sock *sk, unsigned short family, struct socket **new_sock); void mptcp_info2sockaddr(const struct mptcp_addr_info *info, struct sockaddr_storage *addr, unsigned short family); struct mptcp_sched_ops *mptcp_sched_find(const char *name); int mptcp_validate_scheduler(struct mptcp_sched_ops *sched); int mptcp_register_scheduler(struct mptcp_sched_ops *sched); void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched); void mptcp_sched_init(void); int mptcp_init_sched(struct mptcp_sock *msk, struct mptcp_sched_ops *sched); void mptcp_release_sched(struct mptcp_sock *msk); void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow, bool scheduled); struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk); struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk); int mptcp_sched_get_send(struct mptcp_sock *msk); int mptcp_sched_get_retrans(struct mptcp_sock *msk); static inline u64 mptcp_data_avail(const struct mptcp_sock *msk) { return READ_ONCE(msk->bytes_received) - READ_ONCE(msk->bytes_consumed); } static inline bool mptcp_epollin_ready(const struct sock *sk) { u64 data_avail = mptcp_data_avail(mptcp_sk(sk)); if (!data_avail) return false; /* mptcp doesn't have to deal with small skbs in the receive queue, * as it can always coalesce them */ return (data_avail >= sk->sk_rcvlowat) || tcp_under_memory_pressure(sk); } int mptcp_set_rcvlowat(struct sock *sk, int val); static inline bool __tcp_can_send(const struct sock *ssk) { /* only send if our side has not closed yet */ return ((1 << inet_sk_state_load(ssk)) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)); } static inline bool __mptcp_subflow_active(struct mptcp_subflow_context *subflow) { /* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */ if (subflow->request_join && !READ_ONCE(subflow->fully_established)) return false; return __tcp_can_send(mptcp_subflow_tcp_sock(subflow)); } void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow); bool mptcp_subflow_active(struct mptcp_subflow_context *subflow); void mptcp_subflow_drop_ctx(struct sock *ssk); static inline void mptcp_subflow_tcp_fallback(struct sock *sk, struct mptcp_subflow_context *ctx) { sk->sk_data_ready = sock_def_readable; sk->sk_state_change = ctx->tcp_state_change; sk->sk_write_space = sk_stream_write_space; sk->sk_error_report = ctx->tcp_error_report; inet_csk(sk)->icsk_af_ops = ctx->icsk_af_ops; } void __init mptcp_proto_init(void); #if IS_ENABLED(CONFIG_MPTCP_IPV6) int __init mptcp_proto_v6_init(void); #endif struct sock *mptcp_sk_clone_init(const struct sock *sk, const struct mptcp_options_received *mp_opt, struct sock *ssk, struct request_sock *req); void mptcp_get_options(const struct sk_buff *skb, struct mptcp_options_received *mp_opt); void mptcp_finish_connect(struct sock *sk); void __mptcp_sync_state(struct sock *sk, int state); void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout); static inline void mptcp_stop_tout_timer(struct sock *sk) { if (!inet_csk(sk)->icsk_mtup.probe_timestamp) return; sk_stop_timer(sk, &sk->sk_timer); inet_csk(sk)->icsk_mtup.probe_timestamp = 0; } static inline void mptcp_set_close_tout(struct sock *sk, unsigned long tout) { /* avoid 0 timestamp, as that means no close timeout */ inet_csk(sk)->icsk_mtup.probe_timestamp = tout ? : 1; } static inline void mptcp_start_tout_timer(struct sock *sk) { mptcp_set_close_tout(sk, tcp_jiffies32); mptcp_reset_tout_timer(mptcp_sk(sk), 0); } static inline bool mptcp_is_fully_established(struct sock *sk) { return inet_sk_state_load(sk) == TCP_ESTABLISHED && READ_ONCE(mptcp_sk(sk)->fully_established); } void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk); void mptcp_data_ready(struct sock *sk, struct sock *ssk); bool mptcp_finish_join(struct sock *sk); bool mptcp_schedule_work(struct sock *sk); int mptcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int mptcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *option); u64 __mptcp_expand_seq(u64 old_seq, u64 cur_seq); static inline u64 mptcp_expand_seq(u64 old_seq, u64 cur_seq, bool use_64bit) { if (use_64bit) return cur_seq; return __mptcp_expand_seq(old_seq, cur_seq); } void __mptcp_check_push(struct sock *sk, struct sock *ssk); void __mptcp_data_acked(struct sock *sk); void __mptcp_error_report(struct sock *sk); bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit); static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk) { return READ_ONCE(msk->snd_data_fin_enable) && READ_ONCE(msk->write_seq) == READ_ONCE(msk->snd_nxt); } static inline u32 mptcp_notsent_lowat(const struct sock *sk) { struct net *net = sock_net(sk); u32 val; val = READ_ONCE(mptcp_sk(sk)->notsent_lowat); return val ?: READ_ONCE(net->ipv4.sysctl_tcp_notsent_lowat); } static inline bool mptcp_stream_memory_free(const struct sock *sk, int wake) { const struct mptcp_sock *msk = mptcp_sk(sk); u32 notsent_bytes; notsent_bytes = READ_ONCE(msk->write_seq) - READ_ONCE(msk->snd_nxt); return (notsent_bytes << wake) < mptcp_notsent_lowat(sk); } static inline bool __mptcp_stream_is_writeable(const struct sock *sk, int wake) { return mptcp_stream_memory_free(sk, wake) && __sk_stream_is_writeable(sk, wake); } static inline void mptcp_write_space(struct sock *sk) { /* pairs with memory barrier in mptcp_poll */ smp_mb(); if (mptcp_stream_memory_free(sk, 1)) sk_stream_write_space(sk); } static inline void __mptcp_sync_sndbuf(struct sock *sk) { struct mptcp_subflow_context *subflow; int ssk_sndbuf, new_sndbuf; if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) return; new_sndbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[0]); mptcp_for_each_subflow(mptcp_sk(sk), subflow) { ssk_sndbuf = READ_ONCE(mptcp_subflow_tcp_sock(subflow)->sk_sndbuf); subflow->cached_sndbuf = ssk_sndbuf; new_sndbuf += ssk_sndbuf; } /* the msk max wmem limit is <nr_subflows> * tcp wmem[2] */ WRITE_ONCE(sk->sk_sndbuf, new_sndbuf); mptcp_write_space(sk); } /* The called held both the msk socket and the subflow socket locks, * possibly under BH */ static inline void __mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); if (READ_ONCE(ssk->sk_sndbuf) != subflow->cached_sndbuf) __mptcp_sync_sndbuf(sk); } /* the caller held only the subflow socket lock, either in process or * BH context. Additionally this can be called under the msk data lock, * so we can't acquire such lock here: let the delegate action acquires * the needed locks in suitable order. */ static inline void mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); if (likely(READ_ONCE(ssk->sk_sndbuf) == subflow->cached_sndbuf)) return; local_bh_disable(); mptcp_subflow_delegate(subflow, MPTCP_DELEGATE_SNDBUF); local_bh_enable(); } void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags); #define MPTCP_TOKEN_MAX_RETRIES 4 void __init mptcp_token_init(void); static inline void mptcp_token_init_request(struct request_sock *req) { mptcp_subflow_rsk(req)->token_node.pprev = NULL; } int mptcp_token_new_request(struct request_sock *req); void mptcp_token_destroy_request(struct request_sock *req); int mptcp_token_new_connect(struct sock *ssk); void mptcp_token_accept(struct mptcp_subflow_request_sock *r, struct mptcp_sock *msk); bool mptcp_token_exists(u32 token); struct mptcp_sock *mptcp_token_get_sock(struct net *net, u32 token); struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot, long *s_num); void mptcp_token_destroy(struct mptcp_sock *msk); void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn); void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac); __sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum); void __init mptcp_pm_init(void); void mptcp_pm_data_init(struct mptcp_sock *msk); void mptcp_pm_data_reset(struct mptcp_sock *msk); void mptcp_pm_destroy(struct mptcp_sock *msk); int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info, struct mptcp_addr_info *addr); int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info, bool require_family, struct mptcp_pm_addr_entry *entry); bool mptcp_pm_addr_families_match(const struct sock *sk, const struct mptcp_addr_info *loc, const struct mptcp_addr_info *rem); void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side); void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk); bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk); void mptcp_pm_connection_closed(struct mptcp_sock *msk); void mptcp_pm_subflow_established(struct mptcp_sock *msk); bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk); void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, const struct mptcp_subflow_context *subflow); void mptcp_pm_add_addr_received(const struct sock *ssk, const struct mptcp_addr_info *addr); void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk, const struct mptcp_addr_info *addr); void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk); void mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, bool prio, bool backup); void mptcp_pm_addr_send_ack(struct mptcp_sock *msk); void mptcp_pm_nl_rm_addr(struct mptcp_sock *msk, u8 rm_id); void mptcp_pm_rm_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup); void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq); int mptcp_pm_mp_prio_send_ack(struct mptcp_sock *msk, struct mptcp_addr_info *addr, struct mptcp_addr_info *rem, u8 bkup); bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, const struct mptcp_addr_info *addr); bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk); struct mptcp_pm_add_entry * mptcp_pm_del_add_timer(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool check_id); bool mptcp_lookup_subflow_by_saddr(const struct list_head *list, const struct mptcp_addr_info *saddr); bool mptcp_remove_anno_list_by_saddr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr); int mptcp_pm_nl_set_flags(struct mptcp_pm_addr_entry *local, struct genl_info *info); int mptcp_userspace_pm_set_flags(struct mptcp_pm_addr_entry *local, struct genl_info *info); int mptcp_pm_announce_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool echo); int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); void mptcp_pm_remove_addr_entry(struct mptcp_sock *msk, struct mptcp_pm_addr_entry *entry); /* the default path manager, used in mptcp_pm_unregister */ extern struct mptcp_pm_ops mptcp_pm_kernel; struct mptcp_pm_ops *mptcp_pm_find(const char *name); int mptcp_pm_register(struct mptcp_pm_ops *pm_ops); void mptcp_pm_unregister(struct mptcp_pm_ops *pm_ops); int mptcp_pm_validate(struct mptcp_pm_ops *pm_ops); void mptcp_pm_get_available(char *buf, size_t maxlen); void mptcp_userspace_pm_free_local_addr_list(struct mptcp_sock *msk); void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp); void mptcp_event_addr_announced(const struct sock *ssk, const struct mptcp_addr_info *info); void mptcp_event_addr_removed(const struct mptcp_sock *msk, u8 id); void mptcp_event_pm_listener(const struct sock *ssk, enum mptcp_event_type event); bool mptcp_userspace_pm_active(const struct mptcp_sock *msk); void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subflow, struct request_sock *req); int mptcp_pm_genl_fill_addr(struct sk_buff *msg, struct netlink_callback *cb, struct mptcp_pm_addr_entry *entry); static inline bool mptcp_pm_should_add_signal(struct mptcp_sock *msk) { return READ_ONCE(msk->pm.addr_signal) & (BIT(MPTCP_ADD_ADDR_SIGNAL) | BIT(MPTCP_ADD_ADDR_ECHO)); } static inline bool mptcp_pm_should_add_signal_addr(struct mptcp_sock *msk) { return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_SIGNAL); } static inline bool mptcp_pm_should_add_signal_echo(struct mptcp_sock *msk) { return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_ECHO); } static inline bool mptcp_pm_should_rm_signal(struct mptcp_sock *msk) { return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_RM_ADDR_SIGNAL); } static inline bool mptcp_pm_is_userspace(const struct mptcp_sock *msk) { return READ_ONCE(msk->pm.pm_type) == MPTCP_PM_TYPE_USERSPACE; } static inline bool mptcp_pm_is_kernel(const struct mptcp_sock *msk) { return READ_ONCE(msk->pm.pm_type) == MPTCP_PM_TYPE_KERNEL; } static inline unsigned int mptcp_add_addr_len(int family, bool echo, bool port) { u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE; if (family == AF_INET6) len = TCPOLEN_MPTCP_ADD_ADDR6_BASE; if (!echo) len += MPTCPOPT_THMAC_LEN; /* account for 2 trailing 'nop' options */ if (port) len += TCPOLEN_MPTCP_PORT_LEN + TCPOLEN_MPTCP_PORT_ALIGN; return len; } static inline int mptcp_rm_addr_len(const struct mptcp_rm_list *rm_list) { if (rm_list->nr == 0 || rm_list->nr > MPTCP_RM_IDS_MAX) return -EINVAL; return TCPOLEN_MPTCP_RM_ADDR_BASE + roundup(rm_list->nr - 1, 4) + 1; } bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb, unsigned int opt_size, unsigned int remaining, struct mptcp_addr_info *addr, bool *echo, bool *drop_other_suboptions); bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, struct mptcp_rm_list *rm_list); int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_pm_addr_entry *skc); int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, struct mptcp_pm_addr_entry *skc); bool mptcp_pm_is_backup(struct mptcp_sock *msk, struct sock_common *skc); bool mptcp_pm_nl_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc); bool mptcp_userspace_pm_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc); int mptcp_pm_nl_dump_addr(struct sk_buff *msg, struct netlink_callback *cb); int mptcp_userspace_pm_dump_addr(struct sk_buff *msg, struct netlink_callback *cb); int mptcp_pm_nl_get_addr(u8 id, struct mptcp_pm_addr_entry *addr, struct genl_info *info); int mptcp_userspace_pm_get_addr(u8 id, struct mptcp_pm_addr_entry *addr, struct genl_info *info); static inline u8 subflow_get_local_id(const struct mptcp_subflow_context *subflow) { int local_id = READ_ONCE(subflow->local_id); if (local_id < 0) return 0; return local_id; } void __init mptcp_pm_kernel_register(void); void __init mptcp_pm_userspace_register(void); void __init mptcp_pm_nl_init(void); void mptcp_pm_worker(struct mptcp_sock *msk); void __mptcp_pm_kernel_worker(struct mptcp_sock *msk); u8 mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk); u8 mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk); u8 mptcp_pm_get_endp_laminar_max(const struct mptcp_sock *msk); u8 mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk); u8 mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk); /* called under PM lock */ static inline void __mptcp_pm_close_subflow(struct mptcp_sock *msk) { if (--msk->pm.extra_subflows < mptcp_pm_get_limit_extra_subflows(msk)) WRITE_ONCE(msk->pm.accept_subflow, true); } static inline void mptcp_pm_close_subflow(struct mptcp_sock *msk) { spin_lock_bh(&msk->pm.lock); __mptcp_pm_close_subflow(msk); spin_unlock_bh(&msk->pm.lock); } static inline bool mptcp_pm_add_addr_c_flag_case(struct mptcp_sock *msk) { return READ_ONCE(msk->pm.remote_deny_join_id0) && msk->pm.local_addr_used == 0 && mptcp_pm_get_limit_add_addr_accepted(msk) == 0 && msk->pm.extra_subflows < mptcp_pm_get_limit_extra_subflows(msk); } void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk); static inline struct mptcp_ext *mptcp_get_ext(const struct sk_buff *skb) { return (struct mptcp_ext *)skb_ext_find(skb, SKB_EXT_MPTCP); } void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops); static inline bool __mptcp_check_fallback(const struct mptcp_sock *msk) { return test_bit(MPTCP_FALLBACK_DONE, &msk->flags); } static inline bool mptcp_check_fallback(const struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); return __mptcp_check_fallback(msk); } static inline bool __mptcp_has_initial_subflow(const struct mptcp_sock *msk) { struct sock *ssk = READ_ONCE(msk->first); return ssk && ((1 << inet_sk_state_load(ssk)) & (TCPF_ESTABLISHED | TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_LISTEN)); } bool __mptcp_try_fallback(struct mptcp_sock *msk, int fb_mib); static inline bool mptcp_try_fallback(struct sock *ssk, int fb_mib) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = subflow->conn; struct mptcp_sock *msk; msk = mptcp_sk(sk); if (!__mptcp_try_fallback(msk, fb_mib)) return false; if (READ_ONCE(msk->snd_data_fin_enable) && !(ssk->sk_shutdown & SEND_SHUTDOWN)) { gfp_t saved_allocation = ssk->sk_allocation; /* we are in a atomic (BH) scope, override ssk default for data * fin allocation */ ssk->sk_allocation = GFP_ATOMIC; ssk->sk_shutdown |= SEND_SHUTDOWN; tcp_shutdown(ssk, SEND_SHUTDOWN); ssk->sk_allocation = saved_allocation; } return true; } static inline void mptcp_early_fallback(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, int fb_mib) { subflow->request_mptcp = 0; WARN_ON_ONCE(!__mptcp_try_fallback(msk, fb_mib)); } static inline bool mptcp_check_infinite_map(struct sk_buff *skb) { struct mptcp_ext *mpext; mpext = skb ? mptcp_get_ext(skb) : NULL; if (mpext && mpext->infinite_map) return true; return false; } static inline bool is_active_ssk(struct mptcp_subflow_context *subflow) { return (subflow->request_mptcp || subflow->request_join); } static inline bool subflow_simultaneous_connect(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING) && is_active_ssk(subflow) && !subflow->conn_finished; } #ifdef CONFIG_SYN_COOKIES void subflow_init_req_cookie_join_save(const struct mptcp_subflow_request_sock *subflow_req, struct sk_buff *skb); bool mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subflow_req, struct sk_buff *skb); void __init mptcp_join_cookie_init(void); #else static inline void subflow_init_req_cookie_join_save(const struct mptcp_subflow_request_sock *subflow_req, struct sk_buff *skb) {} static inline bool mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subflow_req, struct sk_buff *skb) { return false; } static inline void mptcp_join_cookie_init(void) {} #endif #endif /* __MPTCP_PROTOCOL_H */ |
| 202 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 | // SPDX-License-Identifier: BSD-3-Clause /* * linux/net/sunrpc/auth_gss/auth_gss.c * * RPCSEC_GSS client authentication. * * Copyright (c) 2000 The Regents of the University of Michigan. * All rights reserved. * * Dug Song <dugsong@monkey.org> * Andy Adamson <andros@umich.edu> */ #include <linux/module.h> #include <linux/init.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/pagemap.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/auth.h> #include <linux/sunrpc/auth_gss.h> #include <linux/sunrpc/gss_krb5.h> #include <linux/sunrpc/svcauth_gss.h> #include <linux/sunrpc/gss_err.h> #include <linux/workqueue.h> #include <linux/sunrpc/rpc_pipe_fs.h> #include <linux/sunrpc/gss_api.h> #include <linux/uaccess.h> #include <linux/hashtable.h> #include "auth_gss_internal.h" #include "../netns.h" #include <trace/events/rpcgss.h> static const struct rpc_authops authgss_ops; static const struct rpc_credops gss_credops; static const struct rpc_credops gss_nullops; #define GSS_RETRY_EXPIRED 5 static unsigned int gss_expired_cred_retry_delay = GSS_RETRY_EXPIRED; #define GSS_KEY_EXPIRE_TIMEO 240 static unsigned int gss_key_expire_timeo = GSS_KEY_EXPIRE_TIMEO; #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH #endif /* * This compile-time check verifies that we will not exceed the * slack space allotted by the client and server auth_gss code * before they call gss_wrap(). */ #define GSS_KRB5_MAX_SLACK_NEEDED \ (GSS_KRB5_TOK_HDR_LEN /* gss token header */ \ + GSS_KRB5_MAX_CKSUM_LEN /* gss token checksum */ \ + GSS_KRB5_MAX_BLOCKSIZE /* confounder */ \ + GSS_KRB5_MAX_BLOCKSIZE /* possible padding */ \ + GSS_KRB5_TOK_HDR_LEN /* encrypted hdr in v2 token */ \ + GSS_KRB5_MAX_CKSUM_LEN /* encryption hmac */ \ + XDR_UNIT * 2 /* RPC verifier */ \ + GSS_KRB5_TOK_HDR_LEN \ + GSS_KRB5_MAX_CKSUM_LEN) #define GSS_CRED_SLACK (RPC_MAX_AUTH_SIZE * 2) /* length of a krb5 verifier (48), plus data added before arguments when * using integrity (two 4-byte integers): */ #define GSS_VERF_SLACK 100 static DEFINE_HASHTABLE(gss_auth_hash_table, 4); static DEFINE_SPINLOCK(gss_auth_hash_lock); struct gss_pipe { struct rpc_pipe_dir_object pdo; struct rpc_pipe *pipe; struct rpc_clnt *clnt; const char *name; struct kref kref; }; struct gss_auth { struct kref kref; struct hlist_node hash; struct rpc_auth rpc_auth; struct gss_api_mech *mech; enum rpc_gss_svc service; struct rpc_clnt *client; struct net *net; netns_tracker ns_tracker; /* * There are two upcall pipes; dentry[1], named "gssd", is used * for the new text-based upcall; dentry[0] is named after the * mechanism (for example, "krb5") and exists for * backwards-compatibility with older gssd's. */ struct gss_pipe *gss_pipe[2]; const char *target_name; }; /* pipe_version >= 0 if and only if someone has a pipe open. */ static DEFINE_SPINLOCK(pipe_version_lock); static struct rpc_wait_queue pipe_version_rpc_waitqueue; static DECLARE_WAIT_QUEUE_HEAD(pipe_version_waitqueue); static void gss_put_auth(struct gss_auth *gss_auth); static void gss_free_ctx(struct gss_cl_ctx *); static const struct rpc_pipe_ops gss_upcall_ops_v0; static const struct rpc_pipe_ops gss_upcall_ops_v1; static inline struct gss_cl_ctx * gss_get_ctx(struct gss_cl_ctx *ctx) { refcount_inc(&ctx->count); return ctx; } static inline void gss_put_ctx(struct gss_cl_ctx *ctx) { if (refcount_dec_and_test(&ctx->count)) gss_free_ctx(ctx); } /* gss_cred_set_ctx: * called by gss_upcall_callback and gss_create_upcall in order * to set the gss context. The actual exchange of an old context * and a new one is protected by the pipe->lock. */ static void gss_cred_set_ctx(struct rpc_cred *cred, struct gss_cl_ctx *ctx) { struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); if (!test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags)) return; gss_get_ctx(ctx); rcu_assign_pointer(gss_cred->gc_ctx, ctx); set_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); smp_mb__before_atomic(); clear_bit(RPCAUTH_CRED_NEW, &cred->cr_flags); } static struct gss_cl_ctx * gss_cred_get_ctx(struct rpc_cred *cred) { struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_cl_ctx *ctx = NULL; rcu_read_lock(); ctx = rcu_dereference(gss_cred->gc_ctx); if (ctx) gss_get_ctx(ctx); rcu_read_unlock(); return ctx; } static struct gss_cl_ctx * gss_alloc_context(void) { struct gss_cl_ctx *ctx; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (ctx != NULL) { ctx->gc_proc = RPC_GSS_PROC_DATA; ctx->gc_seq = 1; /* NetApp 6.4R1 doesn't accept seq. no. 0 */ spin_lock_init(&ctx->gc_seq_lock); refcount_set(&ctx->count,1); } return ctx; } #define GSSD_MIN_TIMEOUT (60 * 60) static const void * gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct gss_api_mech *gm) { const void *q; unsigned int seclen; unsigned int timeout; unsigned long now = jiffies; u32 window_size; int ret; /* First unsigned int gives the remaining lifetime in seconds of the * credential - e.g. the remaining TGT lifetime for Kerberos or * the -t value passed to GSSD. */ p = simple_get_bytes(p, end, &timeout, sizeof(timeout)); if (IS_ERR(p)) goto err; if (timeout == 0) timeout = GSSD_MIN_TIMEOUT; ctx->gc_expiry = now + ((unsigned long)timeout * HZ); /* Sequence number window. Determines the maximum number of * simultaneous requests */ p = simple_get_bytes(p, end, &window_size, sizeof(window_size)); if (IS_ERR(p)) goto err; ctx->gc_win = window_size; /* gssd signals an error by passing ctx->gc_win = 0: */ if (ctx->gc_win == 0) { /* * in which case, p points to an error code. Anything other * than -EKEYEXPIRED gets converted to -EACCES. */ p = simple_get_bytes(p, end, &ret, sizeof(ret)); if (!IS_ERR(p)) p = (ret == -EKEYEXPIRED) ? ERR_PTR(-EKEYEXPIRED) : ERR_PTR(-EACCES); goto err; } /* copy the opaque wire context */ p = simple_get_netobj(p, end, &ctx->gc_wire_ctx); if (IS_ERR(p)) goto err; /* import the opaque security context */ p = simple_get_bytes(p, end, &seclen, sizeof(seclen)); if (IS_ERR(p)) goto err; q = (const void *)((const char *)p + seclen); if (unlikely(q > end || q < p)) { p = ERR_PTR(-EFAULT); goto err; } ret = gss_import_sec_context(p, seclen, gm, &ctx->gc_gss_ctx, NULL, GFP_KERNEL); if (ret < 0) { trace_rpcgss_import_ctx(ret); p = ERR_PTR(ret); goto err; } /* is there any trailing data? */ if (q == end) { p = q; goto done; } /* pull in acceptor name (if there is one) */ p = simple_get_netobj(q, end, &ctx->gc_acceptor); if (IS_ERR(p)) goto err; done: trace_rpcgss_context(window_size, ctx->gc_expiry, now, timeout, ctx->gc_acceptor.len, ctx->gc_acceptor.data); err: return p; } /* XXX: Need some documentation about why UPCALL_BUF_LEN is so small. * Is user space expecting no more than UPCALL_BUF_LEN bytes? * Note that there are now _two_ NI_MAXHOST sized data items * being passed in this string. */ #define UPCALL_BUF_LEN 256 struct gss_upcall_msg { refcount_t count; kuid_t uid; const char *service_name; struct rpc_pipe_msg msg; struct list_head list; struct gss_auth *auth; struct rpc_pipe *pipe; struct rpc_wait_queue rpc_waitqueue; wait_queue_head_t waitqueue; struct gss_cl_ctx *ctx; char databuf[UPCALL_BUF_LEN]; }; static int get_pipe_version(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); int ret; spin_lock(&pipe_version_lock); if (sn->pipe_version >= 0) { atomic_inc(&sn->pipe_users); ret = sn->pipe_version; } else ret = -EAGAIN; spin_unlock(&pipe_version_lock); return ret; } static void put_pipe_version(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); if (atomic_dec_and_lock(&sn->pipe_users, &pipe_version_lock)) { sn->pipe_version = -1; spin_unlock(&pipe_version_lock); } } static void gss_release_msg(struct gss_upcall_msg *gss_msg) { struct net *net = gss_msg->auth->net; if (!refcount_dec_and_test(&gss_msg->count)) return; put_pipe_version(net); BUG_ON(!list_empty(&gss_msg->list)); if (gss_msg->ctx != NULL) gss_put_ctx(gss_msg->ctx); rpc_destroy_wait_queue(&gss_msg->rpc_waitqueue); gss_put_auth(gss_msg->auth); kfree_const(gss_msg->service_name); kfree(gss_msg); } static struct gss_upcall_msg * __gss_find_upcall(struct rpc_pipe *pipe, kuid_t uid, const struct gss_auth *auth) { struct gss_upcall_msg *pos; list_for_each_entry(pos, &pipe->in_downcall, list) { if (!uid_eq(pos->uid, uid)) continue; if (pos->auth->service != auth->service) continue; refcount_inc(&pos->count); return pos; } return NULL; } /* Try to add an upcall to the pipefs queue. * If an upcall owned by our uid already exists, then we return a reference * to that upcall instead of adding the new upcall. */ static inline struct gss_upcall_msg * gss_add_msg(struct gss_upcall_msg *gss_msg) { struct rpc_pipe *pipe = gss_msg->pipe; struct gss_upcall_msg *old; spin_lock(&pipe->lock); old = __gss_find_upcall(pipe, gss_msg->uid, gss_msg->auth); if (old == NULL) { refcount_inc(&gss_msg->count); list_add(&gss_msg->list, &pipe->in_downcall); } else gss_msg = old; spin_unlock(&pipe->lock); return gss_msg; } static void __gss_unhash_msg(struct gss_upcall_msg *gss_msg) { list_del_init(&gss_msg->list); rpc_wake_up_status(&gss_msg->rpc_waitqueue, gss_msg->msg.errno); wake_up_all(&gss_msg->waitqueue); refcount_dec(&gss_msg->count); } static void gss_unhash_msg(struct gss_upcall_msg *gss_msg) { struct rpc_pipe *pipe = gss_msg->pipe; if (list_empty(&gss_msg->list)) return; spin_lock(&pipe->lock); if (!list_empty(&gss_msg->list)) __gss_unhash_msg(gss_msg); spin_unlock(&pipe->lock); } static void gss_handle_downcall_result(struct gss_cred *gss_cred, struct gss_upcall_msg *gss_msg) { switch (gss_msg->msg.errno) { case 0: if (gss_msg->ctx == NULL) break; clear_bit(RPCAUTH_CRED_NEGATIVE, &gss_cred->gc_base.cr_flags); gss_cred_set_ctx(&gss_cred->gc_base, gss_msg->ctx); break; case -EKEYEXPIRED: set_bit(RPCAUTH_CRED_NEGATIVE, &gss_cred->gc_base.cr_flags); } gss_cred->gc_upcall_timestamp = jiffies; gss_cred->gc_upcall = NULL; rpc_wake_up_status(&gss_msg->rpc_waitqueue, gss_msg->msg.errno); } static void gss_upcall_callback(struct rpc_task *task) { struct gss_cred *gss_cred = container_of(task->tk_rqstp->rq_cred, struct gss_cred, gc_base); struct gss_upcall_msg *gss_msg = gss_cred->gc_upcall; struct rpc_pipe *pipe = gss_msg->pipe; spin_lock(&pipe->lock); gss_handle_downcall_result(gss_cred, gss_msg); spin_unlock(&pipe->lock); task->tk_status = gss_msg->msg.errno; gss_release_msg(gss_msg); } static void gss_encode_v0_msg(struct gss_upcall_msg *gss_msg, const struct cred *cred) { struct user_namespace *userns = cred->user_ns; uid_t uid = from_kuid_munged(userns, gss_msg->uid); memcpy(gss_msg->databuf, &uid, sizeof(uid)); gss_msg->msg.data = gss_msg->databuf; gss_msg->msg.len = sizeof(uid); BUILD_BUG_ON(sizeof(uid) > sizeof(gss_msg->databuf)); } static ssize_t gss_v0_upcall(struct file *file, struct rpc_pipe_msg *msg, char __user *buf, size_t buflen) { struct gss_upcall_msg *gss_msg = container_of(msg, struct gss_upcall_msg, msg); if (msg->copied == 0) gss_encode_v0_msg(gss_msg, file->f_cred); return rpc_pipe_generic_upcall(file, msg, buf, buflen); } static int gss_encode_v1_msg(struct gss_upcall_msg *gss_msg, const char *service_name, const char *target_name, const struct cred *cred) { struct user_namespace *userns = cred->user_ns; struct gss_api_mech *mech = gss_msg->auth->mech; char *p = gss_msg->databuf; size_t buflen = sizeof(gss_msg->databuf); int len; len = scnprintf(p, buflen, "mech=%s uid=%d", mech->gm_name, from_kuid_munged(userns, gss_msg->uid)); buflen -= len; p += len; gss_msg->msg.len = len; /* * target= is a full service principal that names the remote * identity that we are authenticating to. */ if (target_name) { len = scnprintf(p, buflen, " target=%s", target_name); buflen -= len; p += len; gss_msg->msg.len += len; } /* * gssd uses service= and srchost= to select a matching key from * the system's keytab to use as the source principal. * * service= is the service name part of the source principal, * or "*" (meaning choose any). * * srchost= is the hostname part of the source principal. When * not provided, gssd uses the local hostname. */ if (service_name) { char *c = strchr(service_name, '@'); if (!c) len = scnprintf(p, buflen, " service=%s", service_name); else len = scnprintf(p, buflen, " service=%.*s srchost=%s", (int)(c - service_name), service_name, c + 1); buflen -= len; p += len; gss_msg->msg.len += len; } if (mech->gm_upcall_enctypes) { len = scnprintf(p, buflen, " enctypes=%s", mech->gm_upcall_enctypes); buflen -= len; p += len; gss_msg->msg.len += len; } trace_rpcgss_upcall_msg(gss_msg->databuf); len = scnprintf(p, buflen, "\n"); if (len == 0) goto out_overflow; gss_msg->msg.len += len; gss_msg->msg.data = gss_msg->databuf; return 0; out_overflow: WARN_ON_ONCE(1); return -ENOMEM; } static ssize_t gss_v1_upcall(struct file *file, struct rpc_pipe_msg *msg, char __user *buf, size_t buflen) { struct gss_upcall_msg *gss_msg = container_of(msg, struct gss_upcall_msg, msg); int err; if (msg->copied == 0) { err = gss_encode_v1_msg(gss_msg, gss_msg->service_name, gss_msg->auth->target_name, file->f_cred); if (err) return err; } return rpc_pipe_generic_upcall(file, msg, buf, buflen); } static struct gss_upcall_msg * gss_alloc_msg(struct gss_auth *gss_auth, kuid_t uid, const char *service_name) { struct gss_upcall_msg *gss_msg; int vers; int err = -ENOMEM; gss_msg = kzalloc(sizeof(*gss_msg), GFP_KERNEL); if (gss_msg == NULL) goto err; vers = get_pipe_version(gss_auth->net); err = vers; if (err < 0) goto err_free_msg; gss_msg->pipe = gss_auth->gss_pipe[vers]->pipe; INIT_LIST_HEAD(&gss_msg->list); rpc_init_wait_queue(&gss_msg->rpc_waitqueue, "RPCSEC_GSS upcall waitq"); init_waitqueue_head(&gss_msg->waitqueue); refcount_set(&gss_msg->count, 1); gss_msg->uid = uid; gss_msg->auth = gss_auth; kref_get(&gss_auth->kref); if (service_name) { gss_msg->service_name = kstrdup_const(service_name, GFP_KERNEL); if (!gss_msg->service_name) { err = -ENOMEM; goto err_put_pipe_version; } } return gss_msg; err_put_pipe_version: put_pipe_version(gss_auth->net); err_free_msg: kfree(gss_msg); err: return ERR_PTR(err); } static struct gss_upcall_msg * gss_setup_upcall(struct gss_auth *gss_auth, struct rpc_cred *cred) { struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_upcall_msg *gss_new, *gss_msg; kuid_t uid = cred->cr_cred->fsuid; gss_new = gss_alloc_msg(gss_auth, uid, gss_cred->gc_principal); if (IS_ERR(gss_new)) return gss_new; gss_msg = gss_add_msg(gss_new); if (gss_msg == gss_new) { int res; refcount_inc(&gss_msg->count); res = rpc_queue_upcall(gss_new->pipe, &gss_new->msg); if (res) { gss_unhash_msg(gss_new); refcount_dec(&gss_msg->count); gss_release_msg(gss_new); gss_msg = ERR_PTR(res); } } else gss_release_msg(gss_new); return gss_msg; } static void warn_gssd(void) { dprintk("AUTH_GSS upcall failed. Please check user daemon is running.\n"); } static inline int gss_refresh_upcall(struct rpc_task *task) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth); struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_upcall_msg *gss_msg; struct rpc_pipe *pipe; int err = 0; gss_msg = gss_setup_upcall(gss_auth, cred); if (PTR_ERR(gss_msg) == -EAGAIN) { /* XXX: warning on the first, under the assumption we * shouldn't normally hit this case on a refresh. */ warn_gssd(); rpc_sleep_on_timeout(&pipe_version_rpc_waitqueue, task, NULL, jiffies + (15 * HZ)); err = -EAGAIN; goto out; } if (IS_ERR(gss_msg)) { err = PTR_ERR(gss_msg); goto out; } pipe = gss_msg->pipe; spin_lock(&pipe->lock); if (gss_cred->gc_upcall != NULL) rpc_sleep_on(&gss_cred->gc_upcall->rpc_waitqueue, task, NULL); else if (gss_msg->ctx == NULL && gss_msg->msg.errno >= 0) { gss_cred->gc_upcall = gss_msg; /* gss_upcall_callback will release the reference to gss_upcall_msg */ refcount_inc(&gss_msg->count); rpc_sleep_on(&gss_msg->rpc_waitqueue, task, gss_upcall_callback); } else { gss_handle_downcall_result(gss_cred, gss_msg); err = gss_msg->msg.errno; } spin_unlock(&pipe->lock); gss_release_msg(gss_msg); out: trace_rpcgss_upcall_result(from_kuid(&init_user_ns, cred->cr_cred->fsuid), err); return err; } static inline int gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred) { struct net *net = gss_auth->net; struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct rpc_pipe *pipe; struct rpc_cred *cred = &gss_cred->gc_base; struct gss_upcall_msg *gss_msg; DEFINE_WAIT(wait); int err; retry: err = 0; /* if gssd is down, just skip upcalling altogether */ if (!gssd_running(net)) { warn_gssd(); err = -EACCES; goto out; } gss_msg = gss_setup_upcall(gss_auth, cred); if (PTR_ERR(gss_msg) == -EAGAIN) { err = wait_event_interruptible_timeout(pipe_version_waitqueue, sn->pipe_version >= 0, 15 * HZ); if (sn->pipe_version < 0) { warn_gssd(); err = -EACCES; } if (err < 0) goto out; goto retry; } if (IS_ERR(gss_msg)) { err = PTR_ERR(gss_msg); goto out; } pipe = gss_msg->pipe; for (;;) { prepare_to_wait(&gss_msg->waitqueue, &wait, TASK_KILLABLE); spin_lock(&pipe->lock); if (gss_msg->ctx != NULL || gss_msg->msg.errno < 0) { break; } spin_unlock(&pipe->lock); if (fatal_signal_pending(current)) { err = -ERESTARTSYS; goto out_intr; } schedule(); } if (gss_msg->ctx) { trace_rpcgss_ctx_init(gss_cred); gss_cred_set_ctx(cred, gss_msg->ctx); } else { err = gss_msg->msg.errno; } spin_unlock(&pipe->lock); out_intr: finish_wait(&gss_msg->waitqueue, &wait); gss_release_msg(gss_msg); out: trace_rpcgss_upcall_result(from_kuid(&init_user_ns, cred->cr_cred->fsuid), err); return err; } static struct gss_upcall_msg * gss_find_downcall(struct rpc_pipe *pipe, kuid_t uid) { struct gss_upcall_msg *pos; list_for_each_entry(pos, &pipe->in_downcall, list) { if (!uid_eq(pos->uid, uid)) continue; if (!rpc_msg_is_inflight(&pos->msg)) continue; refcount_inc(&pos->count); return pos; } return NULL; } #define MSG_BUF_MAXSIZE 1024 static ssize_t gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { const void *p, *end; void *buf; struct gss_upcall_msg *gss_msg; struct rpc_pipe *pipe = RPC_I(file_inode(filp))->pipe; struct gss_cl_ctx *ctx; uid_t id; kuid_t uid; ssize_t err = -EFBIG; if (mlen > MSG_BUF_MAXSIZE) goto out; err = -ENOMEM; buf = kmalloc(mlen, GFP_KERNEL); if (!buf) goto out; err = -EFAULT; if (copy_from_user(buf, src, mlen)) goto err; end = (const void *)((char *)buf + mlen); p = simple_get_bytes(buf, end, &id, sizeof(id)); if (IS_ERR(p)) { err = PTR_ERR(p); goto err; } uid = make_kuid(current_user_ns(), id); if (!uid_valid(uid)) { err = -EINVAL; goto err; } err = -ENOMEM; ctx = gss_alloc_context(); if (ctx == NULL) goto err; err = -ENOENT; /* Find a matching upcall */ spin_lock(&pipe->lock); gss_msg = gss_find_downcall(pipe, uid); if (gss_msg == NULL) { spin_unlock(&pipe->lock); goto err_put_ctx; } list_del_init(&gss_msg->list); spin_unlock(&pipe->lock); p = gss_fill_context(p, end, ctx, gss_msg->auth->mech); if (IS_ERR(p)) { err = PTR_ERR(p); switch (err) { case -EACCES: case -EKEYEXPIRED: gss_msg->msg.errno = err; err = mlen; break; case -EFAULT: case -ENOMEM: case -EINVAL: case -ENOSYS: gss_msg->msg.errno = -EAGAIN; break; default: printk(KERN_CRIT "%s: bad return from " "gss_fill_context: %zd\n", __func__, err); gss_msg->msg.errno = -EIO; } goto err_release_msg; } gss_msg->ctx = gss_get_ctx(ctx); err = mlen; err_release_msg: spin_lock(&pipe->lock); __gss_unhash_msg(gss_msg); spin_unlock(&pipe->lock); gss_release_msg(gss_msg); err_put_ctx: gss_put_ctx(ctx); err: kfree(buf); out: return err; } static int gss_pipe_open(struct inode *inode, int new_version) { struct net *net = inode->i_sb->s_fs_info; struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); int ret = 0; spin_lock(&pipe_version_lock); if (sn->pipe_version < 0) { /* First open of any gss pipe determines the version: */ sn->pipe_version = new_version; rpc_wake_up(&pipe_version_rpc_waitqueue); wake_up(&pipe_version_waitqueue); } else if (sn->pipe_version != new_version) { /* Trying to open a pipe of a different version */ ret = -EBUSY; goto out; } atomic_inc(&sn->pipe_users); out: spin_unlock(&pipe_version_lock); return ret; } static int gss_pipe_open_v0(struct inode *inode) { return gss_pipe_open(inode, 0); } static int gss_pipe_open_v1(struct inode *inode) { return gss_pipe_open(inode, 1); } static void gss_pipe_release(struct inode *inode) { struct net *net = inode->i_sb->s_fs_info; struct rpc_pipe *pipe = RPC_I(inode)->pipe; struct gss_upcall_msg *gss_msg; restart: spin_lock(&pipe->lock); list_for_each_entry(gss_msg, &pipe->in_downcall, list) { if (!list_empty(&gss_msg->msg.list)) continue; gss_msg->msg.errno = -EPIPE; refcount_inc(&gss_msg->count); __gss_unhash_msg(gss_msg); spin_unlock(&pipe->lock); gss_release_msg(gss_msg); goto restart; } spin_unlock(&pipe->lock); put_pipe_version(net); } static void gss_pipe_destroy_msg(struct rpc_pipe_msg *msg) { struct gss_upcall_msg *gss_msg = container_of(msg, struct gss_upcall_msg, msg); if (msg->errno < 0) { refcount_inc(&gss_msg->count); gss_unhash_msg(gss_msg); if (msg->errno == -ETIMEDOUT) warn_gssd(); gss_release_msg(gss_msg); } gss_release_msg(gss_msg); } static void gss_pipe_dentry_destroy(struct dentry *dir, struct rpc_pipe_dir_object *pdo) { struct gss_pipe *gss_pipe = pdo->pdo_data; rpc_unlink(gss_pipe->pipe); } static int gss_pipe_dentry_create(struct dentry *dir, struct rpc_pipe_dir_object *pdo) { struct gss_pipe *p = pdo->pdo_data; return rpc_mkpipe_dentry(dir, p->name, p->clnt, p->pipe); } static const struct rpc_pipe_dir_object_ops gss_pipe_dir_object_ops = { .create = gss_pipe_dentry_create, .destroy = gss_pipe_dentry_destroy, }; static struct gss_pipe *gss_pipe_alloc(struct rpc_clnt *clnt, const char *name, const struct rpc_pipe_ops *upcall_ops) { struct gss_pipe *p; int err = -ENOMEM; p = kmalloc(sizeof(*p), GFP_KERNEL); if (p == NULL) goto err; p->pipe = rpc_mkpipe_data(upcall_ops, RPC_PIPE_WAIT_FOR_OPEN); if (IS_ERR(p->pipe)) { err = PTR_ERR(p->pipe); goto err_free_gss_pipe; } p->name = name; p->clnt = clnt; kref_init(&p->kref); rpc_init_pipe_dir_object(&p->pdo, &gss_pipe_dir_object_ops, p); return p; err_free_gss_pipe: kfree(p); err: return ERR_PTR(err); } struct gss_alloc_pdo { struct rpc_clnt *clnt; const char *name; const struct rpc_pipe_ops *upcall_ops; }; static int gss_pipe_match_pdo(struct rpc_pipe_dir_object *pdo, void *data) { struct gss_pipe *gss_pipe; struct gss_alloc_pdo *args = data; if (pdo->pdo_ops != &gss_pipe_dir_object_ops) return 0; gss_pipe = container_of(pdo, struct gss_pipe, pdo); if (strcmp(gss_pipe->name, args->name) != 0) return 0; if (!kref_get_unless_zero(&gss_pipe->kref)) return 0; return 1; } static struct rpc_pipe_dir_object *gss_pipe_alloc_pdo(void *data) { struct gss_pipe *gss_pipe; struct gss_alloc_pdo *args = data; gss_pipe = gss_pipe_alloc(args->clnt, args->name, args->upcall_ops); if (!IS_ERR(gss_pipe)) return &gss_pipe->pdo; return NULL; } static struct gss_pipe *gss_pipe_get(struct rpc_clnt *clnt, const char *name, const struct rpc_pipe_ops *upcall_ops) { struct net *net = rpc_net_ns(clnt); struct rpc_pipe_dir_object *pdo; struct gss_alloc_pdo args = { .clnt = clnt, .name = name, .upcall_ops = upcall_ops, }; pdo = rpc_find_or_alloc_pipe_dir_object(net, &clnt->cl_pipedir_objects, gss_pipe_match_pdo, gss_pipe_alloc_pdo, &args); if (pdo != NULL) return container_of(pdo, struct gss_pipe, pdo); return ERR_PTR(-ENOMEM); } static void __gss_pipe_free(struct gss_pipe *p) { struct rpc_clnt *clnt = p->clnt; struct net *net = rpc_net_ns(clnt); rpc_remove_pipe_dir_object(net, &clnt->cl_pipedir_objects, &p->pdo); rpc_destroy_pipe_data(p->pipe); kfree(p); } static void __gss_pipe_release(struct kref *kref) { struct gss_pipe *p = container_of(kref, struct gss_pipe, kref); __gss_pipe_free(p); } static void gss_pipe_free(struct gss_pipe *p) { if (p != NULL) kref_put(&p->kref, __gss_pipe_release); } /* * NOTE: we have the opportunity to use different * parameters based on the input flavor (which must be a pseudoflavor) */ static struct gss_auth * gss_create_new(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) { rpc_authflavor_t flavor = args->pseudoflavor; struct gss_auth *gss_auth; struct gss_pipe *gss_pipe; struct rpc_auth * auth; int err = -ENOMEM; /* XXX? */ if (!try_module_get(THIS_MODULE)) return ERR_PTR(err); if (!(gss_auth = kmalloc(sizeof(*gss_auth), GFP_KERNEL))) goto out_dec; INIT_HLIST_NODE(&gss_auth->hash); gss_auth->target_name = NULL; if (args->target_name) { gss_auth->target_name = kstrdup(args->target_name, GFP_KERNEL); if (gss_auth->target_name == NULL) goto err_free; } gss_auth->client = clnt; gss_auth->net = get_net_track(rpc_net_ns(clnt), &gss_auth->ns_tracker, GFP_KERNEL); err = -EINVAL; gss_auth->mech = gss_mech_get_by_pseudoflavor(flavor); if (!gss_auth->mech) goto err_put_net; gss_auth->service = gss_pseudoflavor_to_service(gss_auth->mech, flavor); if (gss_auth->service == 0) goto err_put_mech; if (!gssd_running(gss_auth->net)) goto err_put_mech; auth = &gss_auth->rpc_auth; auth->au_cslack = GSS_CRED_SLACK >> 2; BUILD_BUG_ON(GSS_KRB5_MAX_SLACK_NEEDED > RPC_MAX_AUTH_SIZE); auth->au_rslack = GSS_KRB5_MAX_SLACK_NEEDED >> 2; auth->au_verfsize = GSS_VERF_SLACK >> 2; auth->au_ralign = GSS_VERF_SLACK >> 2; __set_bit(RPCAUTH_AUTH_UPDATE_SLACK, &auth->au_flags); auth->au_ops = &authgss_ops; auth->au_flavor = flavor; if (gss_pseudoflavor_to_datatouch(gss_auth->mech, flavor)) __set_bit(RPCAUTH_AUTH_DATATOUCH, &auth->au_flags); refcount_set(&auth->au_count, 1); kref_init(&gss_auth->kref); err = rpcauth_init_credcache(auth); if (err) goto err_put_mech; /* * Note: if we created the old pipe first, then someone who * examined the directory at the right moment might conclude * that we supported only the old pipe. So we instead create * the new pipe first. */ gss_pipe = gss_pipe_get(clnt, "gssd", &gss_upcall_ops_v1); if (IS_ERR(gss_pipe)) { err = PTR_ERR(gss_pipe); goto err_destroy_credcache; } gss_auth->gss_pipe[1] = gss_pipe; gss_pipe = gss_pipe_get(clnt, gss_auth->mech->gm_name, &gss_upcall_ops_v0); if (IS_ERR(gss_pipe)) { err = PTR_ERR(gss_pipe); goto err_destroy_pipe_1; } gss_auth->gss_pipe[0] = gss_pipe; return gss_auth; err_destroy_pipe_1: gss_pipe_free(gss_auth->gss_pipe[1]); err_destroy_credcache: rpcauth_destroy_credcache(auth); err_put_mech: gss_mech_put(gss_auth->mech); err_put_net: put_net_track(gss_auth->net, &gss_auth->ns_tracker); err_free: kfree(gss_auth->target_name); kfree(gss_auth); out_dec: module_put(THIS_MODULE); trace_rpcgss_createauth(flavor, err); return ERR_PTR(err); } static void gss_free(struct gss_auth *gss_auth) { gss_pipe_free(gss_auth->gss_pipe[0]); gss_pipe_free(gss_auth->gss_pipe[1]); gss_mech_put(gss_auth->mech); put_net_track(gss_auth->net, &gss_auth->ns_tracker); kfree(gss_auth->target_name); kfree(gss_auth); module_put(THIS_MODULE); } static void gss_free_callback(struct kref *kref) { struct gss_auth *gss_auth = container_of(kref, struct gss_auth, kref); gss_free(gss_auth); } static void gss_put_auth(struct gss_auth *gss_auth) { kref_put(&gss_auth->kref, gss_free_callback); } static void gss_destroy(struct rpc_auth *auth) { struct gss_auth *gss_auth = container_of(auth, struct gss_auth, rpc_auth); if (hash_hashed(&gss_auth->hash)) { spin_lock(&gss_auth_hash_lock); hash_del(&gss_auth->hash); spin_unlock(&gss_auth_hash_lock); } gss_pipe_free(gss_auth->gss_pipe[0]); gss_auth->gss_pipe[0] = NULL; gss_pipe_free(gss_auth->gss_pipe[1]); gss_auth->gss_pipe[1] = NULL; rpcauth_destroy_credcache(auth); gss_put_auth(gss_auth); } /* * Auths may be shared between rpc clients that were cloned from a * common client with the same xprt, if they also share the flavor and * target_name. * * The auth is looked up from the oldest parent sharing the same * cl_xprt, and the auth itself references only that common parent * (which is guaranteed to last as long as any of its descendants). */ static struct gss_auth * gss_auth_find_or_add_hashed(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt, struct gss_auth *new) { struct gss_auth *gss_auth; unsigned long hashval = (unsigned long)clnt; spin_lock(&gss_auth_hash_lock); hash_for_each_possible(gss_auth_hash_table, gss_auth, hash, hashval) { if (gss_auth->client != clnt) continue; if (gss_auth->rpc_auth.au_flavor != args->pseudoflavor) continue; if (gss_auth->target_name != args->target_name) { if (gss_auth->target_name == NULL) continue; if (args->target_name == NULL) continue; if (strcmp(gss_auth->target_name, args->target_name)) continue; } if (!refcount_inc_not_zero(&gss_auth->rpc_auth.au_count)) continue; goto out; } if (new) hash_add(gss_auth_hash_table, &new->hash, hashval); gss_auth = new; out: spin_unlock(&gss_auth_hash_lock); return gss_auth; } static struct gss_auth * gss_create_hashed(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) { struct gss_auth *gss_auth; struct gss_auth *new; gss_auth = gss_auth_find_or_add_hashed(args, clnt, NULL); if (gss_auth != NULL) goto out; new = gss_create_new(args, clnt); if (IS_ERR(new)) return new; gss_auth = gss_auth_find_or_add_hashed(args, clnt, new); if (gss_auth != new) gss_destroy(&new->rpc_auth); out: return gss_auth; } static struct rpc_auth * gss_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) { struct gss_auth *gss_auth; struct rpc_xprt_switch *xps = rcu_access_pointer(clnt->cl_xpi.xpi_xpswitch); while (clnt != clnt->cl_parent) { struct rpc_clnt *parent = clnt->cl_parent; /* Find the original parent for this transport */ if (rcu_access_pointer(parent->cl_xpi.xpi_xpswitch) != xps) break; clnt = parent; } gss_auth = gss_create_hashed(args, clnt); if (IS_ERR(gss_auth)) return ERR_CAST(gss_auth); return &gss_auth->rpc_auth; } static struct gss_cred * gss_dup_cred(struct gss_auth *gss_auth, struct gss_cred *gss_cred) { struct gss_cred *new; /* Make a copy of the cred so that we can reference count it */ new = kzalloc(sizeof(*gss_cred), GFP_KERNEL); if (new) { struct auth_cred acred = { .cred = gss_cred->gc_base.cr_cred, }; struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1); rpcauth_init_cred(&new->gc_base, &acred, &gss_auth->rpc_auth, &gss_nullops); new->gc_base.cr_flags = 1UL << RPCAUTH_CRED_UPTODATE; new->gc_service = gss_cred->gc_service; new->gc_principal = gss_cred->gc_principal; kref_get(&gss_auth->kref); rcu_assign_pointer(new->gc_ctx, ctx); gss_get_ctx(ctx); } return new; } /* * gss_send_destroy_context will cause the RPCSEC_GSS to send a NULL RPC call * to the server with the GSS control procedure field set to * RPC_GSS_PROC_DESTROY. This should normally cause the server to release * all RPCSEC_GSS state associated with that context. */ static void gss_send_destroy_context(struct rpc_cred *cred) { struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth); struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1); struct gss_cred *new; struct rpc_task *task; new = gss_dup_cred(gss_auth, gss_cred); if (new) { ctx->gc_proc = RPC_GSS_PROC_DESTROY; trace_rpcgss_ctx_destroy(gss_cred); task = rpc_call_null(gss_auth->client, &new->gc_base, RPC_TASK_ASYNC); if (!IS_ERR(task)) rpc_put_task(task); put_rpccred(&new->gc_base); } } /* gss_destroy_cred (and gss_free_ctx) are used to clean up after failure * to create a new cred or context, so they check that things have been * allocated before freeing them. */ static void gss_do_free_ctx(struct gss_cl_ctx *ctx) { gss_delete_sec_context(&ctx->gc_gss_ctx); kfree(ctx->gc_wire_ctx.data); kfree(ctx->gc_acceptor.data); kfree(ctx); } static void gss_free_ctx_callback(struct rcu_head *head) { struct gss_cl_ctx *ctx = container_of(head, struct gss_cl_ctx, gc_rcu); gss_do_free_ctx(ctx); } static void gss_free_ctx(struct gss_cl_ctx *ctx) { call_rcu(&ctx->gc_rcu, gss_free_ctx_callback); } static void gss_free_cred(struct gss_cred *gss_cred) { kfree(gss_cred); } static void gss_free_cred_callback(struct rcu_head *head) { struct gss_cred *gss_cred = container_of(head, struct gss_cred, gc_base.cr_rcu); gss_free_cred(gss_cred); } static void gss_destroy_nullcred(struct rpc_cred *cred) { struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth); struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1); RCU_INIT_POINTER(gss_cred->gc_ctx, NULL); put_cred(cred->cr_cred); call_rcu(&cred->cr_rcu, gss_free_cred_callback); if (ctx) gss_put_ctx(ctx); gss_put_auth(gss_auth); } static void gss_destroy_cred(struct rpc_cred *cred) { if (test_and_clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0) gss_send_destroy_context(cred); gss_destroy_nullcred(cred); } static int gss_hash_cred(struct auth_cred *acred, unsigned int hashbits) { return hash_64(from_kuid(&init_user_ns, acred->cred->fsuid), hashbits); } /* * Lookup RPCSEC_GSS cred for the current process */ static struct rpc_cred *gss_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) { return rpcauth_lookup_credcache(auth, acred, flags, rpc_task_gfp_mask()); } static struct rpc_cred * gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp) { struct gss_auth *gss_auth = container_of(auth, struct gss_auth, rpc_auth); struct gss_cred *cred = NULL; int err = -ENOMEM; if (!(cred = kzalloc(sizeof(*cred), gfp))) goto out_err; rpcauth_init_cred(&cred->gc_base, acred, auth, &gss_credops); /* * Note: in order to force a call to call_refresh(), we deliberately * fail to flag the credential as RPCAUTH_CRED_UPTODATE. */ cred->gc_base.cr_flags = 1UL << RPCAUTH_CRED_NEW; cred->gc_service = gss_auth->service; cred->gc_principal = acred->principal; kref_get(&gss_auth->kref); return &cred->gc_base; out_err: return ERR_PTR(err); } static int gss_cred_init(struct rpc_auth *auth, struct rpc_cred *cred) { struct gss_auth *gss_auth = container_of(auth, struct gss_auth, rpc_auth); struct gss_cred *gss_cred = container_of(cred,struct gss_cred, gc_base); int err; do { err = gss_create_upcall(gss_auth, gss_cred); } while (err == -EAGAIN); return err; } static char * gss_stringify_acceptor(struct rpc_cred *cred) { char *string = NULL; struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_cl_ctx *ctx; unsigned int len; struct xdr_netobj *acceptor; rcu_read_lock(); ctx = rcu_dereference(gss_cred->gc_ctx); if (!ctx) goto out; len = ctx->gc_acceptor.len; rcu_read_unlock(); /* no point if there's no string */ if (!len) return NULL; realloc: string = kmalloc(len + 1, GFP_KERNEL); if (!string) return NULL; rcu_read_lock(); ctx = rcu_dereference(gss_cred->gc_ctx); /* did the ctx disappear or was it replaced by one with no acceptor? */ if (!ctx || !ctx->gc_acceptor.len) { kfree(string); string = NULL; goto out; } acceptor = &ctx->gc_acceptor; /* * Did we find a new acceptor that's longer than the original? Allocate * a longer buffer and try again. */ if (len < acceptor->len) { len = acceptor->len; rcu_read_unlock(); kfree(string); goto realloc; } memcpy(string, acceptor->data, acceptor->len); string[acceptor->len] = '\0'; out: rcu_read_unlock(); return string; } /* * Returns -EACCES if GSS context is NULL or will expire within the * timeout (miliseconds) */ static int gss_key_timeout(struct rpc_cred *rc) { struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base); struct gss_cl_ctx *ctx; unsigned long timeout = jiffies + (gss_key_expire_timeo * HZ); int ret = 0; rcu_read_lock(); ctx = rcu_dereference(gss_cred->gc_ctx); if (!ctx || time_after(timeout, ctx->gc_expiry)) ret = -EACCES; rcu_read_unlock(); return ret; } static int gss_match(struct auth_cred *acred, struct rpc_cred *rc, int flags) { struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base); struct gss_cl_ctx *ctx; int ret; if (test_bit(RPCAUTH_CRED_NEW, &rc->cr_flags)) goto out; /* Don't match with creds that have expired. */ rcu_read_lock(); ctx = rcu_dereference(gss_cred->gc_ctx); if (!ctx || time_after(jiffies, ctx->gc_expiry)) { rcu_read_unlock(); return 0; } rcu_read_unlock(); if (!test_bit(RPCAUTH_CRED_UPTODATE, &rc->cr_flags)) return 0; out: if (acred->principal != NULL) { if (gss_cred->gc_principal == NULL) return 0; ret = strcmp(acred->principal, gss_cred->gc_principal) == 0; } else { if (gss_cred->gc_principal != NULL) return 0; ret = uid_eq(rc->cr_cred->fsuid, acred->cred->fsuid); } return ret; } /* * Marshal credentials. * * The expensive part is computing the verifier. We can't cache a * pre-computed version of the verifier because the seqno, which * is different every time, is included in the MIC. */ static int gss_marshal(struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_cred *cred = req->rq_cred; struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); __be32 *p, *cred_len; u32 maj_stat = 0; struct xdr_netobj mic; struct kvec iov; struct xdr_buf verf_buf; int status; u32 seqno; /* Credential */ p = xdr_reserve_space(xdr, 7 * sizeof(*p) + ctx->gc_wire_ctx.len); if (!p) goto marshal_failed; *p++ = rpc_auth_gss; cred_len = p++; spin_lock(&ctx->gc_seq_lock); seqno = (ctx->gc_seq < MAXSEQ) ? ctx->gc_seq++ : MAXSEQ; xprt_rqst_add_seqno(req, seqno); spin_unlock(&ctx->gc_seq_lock); if (*req->rq_seqnos == MAXSEQ) goto expired; trace_rpcgss_seqno(task); *p++ = cpu_to_be32(RPC_GSS_VERSION); *p++ = cpu_to_be32(ctx->gc_proc); *p++ = cpu_to_be32(*req->rq_seqnos); *p++ = cpu_to_be32(gss_cred->gc_service); p = xdr_encode_netobj(p, &ctx->gc_wire_ctx); *cred_len = cpu_to_be32((p - (cred_len + 1)) << 2); /* Verifier */ /* We compute the checksum for the verifier over the xdr-encoded bytes * starting with the xid and ending at the end of the credential: */ iov.iov_base = req->rq_snd_buf.head[0].iov_base; iov.iov_len = (u8 *)p - (u8 *)iov.iov_base; xdr_buf_from_iov(&iov, &verf_buf); p = xdr_reserve_space(xdr, sizeof(*p)); if (!p) goto marshal_failed; *p++ = rpc_auth_gss; mic.data = (u8 *)(p + 1); maj_stat = gss_get_mic(ctx->gc_gss_ctx, &verf_buf, &mic); if (maj_stat == GSS_S_CONTEXT_EXPIRED) goto expired; else if (maj_stat != 0) goto bad_mic; if (xdr_stream_encode_opaque_inline(xdr, (void **)&p, mic.len) < 0) goto marshal_failed; status = 0; out: gss_put_ctx(ctx); return status; expired: clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); status = -EKEYEXPIRED; goto out; marshal_failed: status = -EMSGSIZE; goto out; bad_mic: trace_rpcgss_get_mic(task, maj_stat); status = -EIO; goto out; } static int gss_renew_cred(struct rpc_task *task) { struct rpc_cred *oldcred = task->tk_rqstp->rq_cred; struct gss_cred *gss_cred = container_of(oldcred, struct gss_cred, gc_base); struct rpc_auth *auth = oldcred->cr_auth; struct auth_cred acred = { .cred = oldcred->cr_cred, .principal = gss_cred->gc_principal, }; struct rpc_cred *new; new = gss_lookup_cred(auth, &acred, RPCAUTH_LOOKUP_NEW); if (IS_ERR(new)) return PTR_ERR(new); task->tk_rqstp->rq_cred = new; put_rpccred(oldcred); return 0; } static int gss_cred_is_negative_entry(struct rpc_cred *cred) { if (test_bit(RPCAUTH_CRED_NEGATIVE, &cred->cr_flags)) { unsigned long now = jiffies; unsigned long begin, expire; struct gss_cred *gss_cred; gss_cred = container_of(cred, struct gss_cred, gc_base); begin = gss_cred->gc_upcall_timestamp; expire = begin + gss_expired_cred_retry_delay * HZ; if (time_in_range_open(now, begin, expire)) return 1; } return 0; } /* * Refresh credentials. XXX - finish */ static int gss_refresh(struct rpc_task *task) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; int ret = 0; if (gss_cred_is_negative_entry(cred)) return -EKEYEXPIRED; if (!test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags) && !test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags)) { ret = gss_renew_cred(task); if (ret < 0) goto out; cred = task->tk_rqstp->rq_cred; } if (test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags)) ret = gss_refresh_upcall(task); out: return ret; } /* Dummy refresh routine: used only when destroying the context */ static int gss_refresh_null(struct rpc_task *task) { return 0; } static u32 gss_validate_seqno_mic(struct gss_cl_ctx *ctx, u32 seqno, __be32 *seq, __be32 *p, u32 len) { struct kvec iov; struct xdr_buf verf_buf; struct xdr_netobj mic; *seq = cpu_to_be32(seqno); iov.iov_base = seq; iov.iov_len = 4; xdr_buf_from_iov(&iov, &verf_buf); mic.data = (u8 *)p; mic.len = len; return gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic); } static int gss_validate(struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); __be32 *p, *seq = NULL; u32 len, maj_stat; int status; int i = 1; /* don't recheck the first item */ p = xdr_inline_decode(xdr, 2 * sizeof(*p)); if (!p) goto validate_failed; if (*p++ != rpc_auth_gss) goto validate_failed; len = be32_to_cpup(p); if (len > RPC_MAX_AUTH_SIZE) goto validate_failed; p = xdr_inline_decode(xdr, len); if (!p) goto validate_failed; seq = kmalloc(4, GFP_KERNEL); if (!seq) goto validate_failed; maj_stat = gss_validate_seqno_mic(ctx, task->tk_rqstp->rq_seqnos[0], seq, p, len); /* RFC 2203 5.3.3.1 - compute the checksum of each sequence number in the cache */ while (unlikely(maj_stat == GSS_S_BAD_SIG && i < task->tk_rqstp->rq_seqno_count)) maj_stat = gss_validate_seqno_mic(ctx, task->tk_rqstp->rq_seqnos[i++], seq, p, len); if (maj_stat == GSS_S_CONTEXT_EXPIRED) clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); if (maj_stat) goto bad_mic; /* We leave it to unwrap to calculate au_rslack. For now we just * calculate the length of the verifier: */ if (test_bit(RPCAUTH_AUTH_UPDATE_SLACK, &cred->cr_auth->au_flags)) cred->cr_auth->au_verfsize = XDR_QUADLEN(len) + 2; status = 0; out: gss_put_ctx(ctx); kfree(seq); return status; validate_failed: status = -EIO; goto out; bad_mic: trace_rpcgss_verify_mic(task, maj_stat); status = -EACCES; goto out; } static noinline_for_stack int gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_rqst *rqstp = task->tk_rqstp; struct xdr_buf integ_buf, *snd_buf = &rqstp->rq_snd_buf; struct xdr_netobj mic; __be32 *p, *integ_len; u32 offset, maj_stat; p = xdr_reserve_space(xdr, 2 * sizeof(*p)); if (!p) goto wrap_failed; integ_len = p++; *p = cpu_to_be32(*rqstp->rq_seqnos); if (rpcauth_wrap_req_encode(task, xdr)) goto wrap_failed; offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base; if (xdr_buf_subsegment(snd_buf, &integ_buf, offset, snd_buf->len - offset)) goto wrap_failed; *integ_len = cpu_to_be32(integ_buf.len); p = xdr_reserve_space(xdr, 0); if (!p) goto wrap_failed; mic.data = (u8 *)(p + 1); maj_stat = gss_get_mic(ctx->gc_gss_ctx, &integ_buf, &mic); if (maj_stat == GSS_S_CONTEXT_EXPIRED) clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); else if (maj_stat) goto bad_mic; /* Check that the trailing MIC fit in the buffer, after the fact */ if (xdr_stream_encode_opaque_inline(xdr, (void **)&p, mic.len) < 0) goto wrap_failed; return 0; wrap_failed: return -EMSGSIZE; bad_mic: trace_rpcgss_get_mic(task, maj_stat); return -EIO; } static void priv_release_snd_buf(struct rpc_rqst *rqstp) { int i; for (i=0; i < rqstp->rq_enc_pages_num; i++) __free_page(rqstp->rq_enc_pages[i]); kfree(rqstp->rq_enc_pages); rqstp->rq_release_snd_buf = NULL; } static int alloc_enc_pages(struct rpc_rqst *rqstp) { struct xdr_buf *snd_buf = &rqstp->rq_snd_buf; int first, last, i; if (rqstp->rq_release_snd_buf) rqstp->rq_release_snd_buf(rqstp); if (snd_buf->page_len == 0) { rqstp->rq_enc_pages_num = 0; return 0; } first = snd_buf->page_base >> PAGE_SHIFT; last = (snd_buf->page_base + snd_buf->page_len - 1) >> PAGE_SHIFT; rqstp->rq_enc_pages_num = last - first + 1 + 1; rqstp->rq_enc_pages = kmalloc_array(rqstp->rq_enc_pages_num, sizeof(struct page *), GFP_KERNEL); if (!rqstp->rq_enc_pages) goto out; for (i=0; i < rqstp->rq_enc_pages_num; i++) { rqstp->rq_enc_pages[i] = alloc_page(GFP_KERNEL); if (rqstp->rq_enc_pages[i] == NULL) goto out_free; } rqstp->rq_release_snd_buf = priv_release_snd_buf; return 0; out_free: rqstp->rq_enc_pages_num = i; priv_release_snd_buf(rqstp); out: return -EAGAIN; } static noinline_for_stack int gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_rqst *rqstp = task->tk_rqstp; struct xdr_buf *snd_buf = &rqstp->rq_snd_buf; u32 pad, offset, maj_stat; int status; __be32 *p, *opaque_len; struct page **inpages; int first; struct kvec *iov; status = -EIO; p = xdr_reserve_space(xdr, 2 * sizeof(*p)); if (!p) goto wrap_failed; opaque_len = p++; *p = cpu_to_be32(*rqstp->rq_seqnos); if (rpcauth_wrap_req_encode(task, xdr)) goto wrap_failed; status = alloc_enc_pages(rqstp); if (unlikely(status)) goto wrap_failed; first = snd_buf->page_base >> PAGE_SHIFT; inpages = snd_buf->pages + first; snd_buf->pages = rqstp->rq_enc_pages; snd_buf->page_base -= first << PAGE_SHIFT; /* * Move the tail into its own page, in case gss_wrap needs * more space in the head when wrapping. * * Still... Why can't gss_wrap just slide the tail down? */ if (snd_buf->page_len || snd_buf->tail[0].iov_len) { char *tmp; tmp = page_address(rqstp->rq_enc_pages[rqstp->rq_enc_pages_num - 1]); memcpy(tmp, snd_buf->tail[0].iov_base, snd_buf->tail[0].iov_len); snd_buf->tail[0].iov_base = tmp; } offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base; maj_stat = gss_wrap(ctx->gc_gss_ctx, offset, snd_buf, inpages); /* slack space should prevent this ever happening: */ if (unlikely(snd_buf->len > snd_buf->buflen)) { status = -EIO; goto wrap_failed; } /* We're assuming that when GSS_S_CONTEXT_EXPIRED, the encryption was * done anyway, so it's safe to put the request on the wire: */ if (maj_stat == GSS_S_CONTEXT_EXPIRED) clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); else if (maj_stat) goto bad_wrap; *opaque_len = cpu_to_be32(snd_buf->len - offset); /* guess whether the pad goes into the head or the tail: */ if (snd_buf->page_len || snd_buf->tail[0].iov_len) iov = snd_buf->tail; else iov = snd_buf->head; p = iov->iov_base + iov->iov_len; pad = xdr_pad_size(snd_buf->len - offset); memset(p, 0, pad); iov->iov_len += pad; snd_buf->len += pad; return 0; wrap_failed: return status; bad_wrap: trace_rpcgss_wrap(task, maj_stat); return -EIO; } static int gss_wrap_req(struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); int status; status = -EIO; if (ctx->gc_proc != RPC_GSS_PROC_DATA) { /* The spec seems a little ambiguous here, but I think that not * wrapping context destruction requests makes the most sense. */ status = rpcauth_wrap_req_encode(task, xdr); goto out; } switch (gss_cred->gc_service) { case RPC_GSS_SVC_NONE: status = rpcauth_wrap_req_encode(task, xdr); break; case RPC_GSS_SVC_INTEGRITY: status = gss_wrap_req_integ(cred, ctx, task, xdr); break; case RPC_GSS_SVC_PRIVACY: status = gss_wrap_req_priv(cred, ctx, task, xdr); break; default: status = -EIO; } out: gss_put_ctx(ctx); return status; } /** * gss_update_rslack - Possibly update RPC receive buffer size estimates * @task: rpc_task for incoming RPC Reply being unwrapped * @cred: controlling rpc_cred for @task * @before: XDR words needed before each RPC Reply message * @after: XDR words needed following each RPC Reply message * */ static void gss_update_rslack(struct rpc_task *task, struct rpc_cred *cred, unsigned int before, unsigned int after) { struct rpc_auth *auth = cred->cr_auth; if (test_and_clear_bit(RPCAUTH_AUTH_UPDATE_SLACK, &auth->au_flags)) { auth->au_ralign = auth->au_verfsize + before; auth->au_rslack = auth->au_verfsize + after; trace_rpcgss_update_slack(task, auth); } } static int gss_unwrap_resp_auth(struct rpc_task *task, struct rpc_cred *cred) { gss_update_rslack(task, cred, 0, 0); return 0; } /* * RFC 2203, Section 5.3.2.2 * * struct rpc_gss_integ_data { * opaque databody_integ<>; * opaque checksum<>; * }; * * struct rpc_gss_data_t { * unsigned int seq_num; * proc_req_arg_t arg; * }; */ static noinline_for_stack int gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred, struct gss_cl_ctx *ctx, struct rpc_rqst *rqstp, struct xdr_stream *xdr) { struct xdr_buf gss_data, *rcv_buf = &rqstp->rq_rcv_buf; u32 len, offset, seqno, maj_stat; struct xdr_netobj mic; int ret; ret = -EIO; mic.data = NULL; /* opaque databody_integ<>; */ if (xdr_stream_decode_u32(xdr, &len)) goto unwrap_failed; if (len & 3) goto unwrap_failed; offset = rcv_buf->len - xdr_stream_remaining(xdr); if (xdr_stream_decode_u32(xdr, &seqno)) goto unwrap_failed; if (seqno != *rqstp->rq_seqnos) goto bad_seqno; if (xdr_buf_subsegment(rcv_buf, &gss_data, offset, len)) goto unwrap_failed; /* * The xdr_stream now points to the beginning of the * upper layer payload, to be passed below to * rpcauth_unwrap_resp_decode(). The checksum, which * follows the upper layer payload in @rcv_buf, is * located and parsed without updating the xdr_stream. */ /* opaque checksum<>; */ offset += len; if (xdr_decode_word(rcv_buf, offset, &len)) goto unwrap_failed; offset += sizeof(__be32); if (offset + len > rcv_buf->len) goto unwrap_failed; mic.len = len; mic.data = kmalloc(len, GFP_KERNEL); if (ZERO_OR_NULL_PTR(mic.data)) goto unwrap_failed; if (read_bytes_from_xdr_buf(rcv_buf, offset, mic.data, mic.len)) goto unwrap_failed; maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &gss_data, &mic); if (maj_stat == GSS_S_CONTEXT_EXPIRED) clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); if (maj_stat != GSS_S_COMPLETE) goto bad_mic; gss_update_rslack(task, cred, 2, 2 + 1 + XDR_QUADLEN(mic.len)); ret = 0; out: kfree(mic.data); return ret; unwrap_failed: trace_rpcgss_unwrap_failed(task); goto out; bad_seqno: trace_rpcgss_bad_seqno(task, *rqstp->rq_seqnos, seqno); goto out; bad_mic: trace_rpcgss_verify_mic(task, maj_stat); goto out; } static noinline_for_stack int gss_unwrap_resp_priv(struct rpc_task *task, struct rpc_cred *cred, struct gss_cl_ctx *ctx, struct rpc_rqst *rqstp, struct xdr_stream *xdr) { struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf; struct kvec *head = rqstp->rq_rcv_buf.head; u32 offset, opaque_len, maj_stat; __be32 *p; p = xdr_inline_decode(xdr, 2 * sizeof(*p)); if (unlikely(!p)) goto unwrap_failed; opaque_len = be32_to_cpup(p++); offset = (u8 *)(p) - (u8 *)head->iov_base; if (offset + opaque_len > rcv_buf->len) goto unwrap_failed; maj_stat = gss_unwrap(ctx->gc_gss_ctx, offset, offset + opaque_len, rcv_buf); if (maj_stat == GSS_S_CONTEXT_EXPIRED) clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); if (maj_stat != GSS_S_COMPLETE) goto bad_unwrap; /* gss_unwrap decrypted the sequence number */ if (be32_to_cpup(p++) != *rqstp->rq_seqnos) goto bad_seqno; /* gss_unwrap redacts the opaque blob from the head iovec. * rcv_buf has changed, thus the stream needs to be reset. */ xdr_init_decode(xdr, rcv_buf, p, rqstp); gss_update_rslack(task, cred, 2 + ctx->gc_gss_ctx->align, 2 + ctx->gc_gss_ctx->slack); return 0; unwrap_failed: trace_rpcgss_unwrap_failed(task); return -EIO; bad_seqno: trace_rpcgss_bad_seqno(task, *rqstp->rq_seqnos, be32_to_cpup(--p)); return -EIO; bad_unwrap: trace_rpcgss_unwrap(task, maj_stat); return -EIO; } static bool gss_seq_is_newer(u32 new, u32 old) { return (s32)(new - old) > 0; } static bool gss_xmit_need_reencode(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_cred *cred = req->rq_cred; struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); u32 win, seq_xmit = 0; bool ret = true; if (!ctx) goto out; if (gss_seq_is_newer(*req->rq_seqnos, READ_ONCE(ctx->gc_seq))) goto out_ctx; seq_xmit = READ_ONCE(ctx->gc_seq_xmit); while (gss_seq_is_newer(*req->rq_seqnos, seq_xmit)) { u32 tmp = seq_xmit; seq_xmit = cmpxchg(&ctx->gc_seq_xmit, tmp, *req->rq_seqnos); if (seq_xmit == tmp) { ret = false; goto out_ctx; } } win = ctx->gc_win; if (win > 0) ret = !gss_seq_is_newer(*req->rq_seqnos, seq_xmit - win); out_ctx: gss_put_ctx(ctx); out: trace_rpcgss_need_reencode(task, seq_xmit, ret); return ret; } static int gss_unwrap_resp(struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_rqst *rqstp = task->tk_rqstp; struct rpc_cred *cred = rqstp->rq_cred; struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); int status = -EIO; if (ctx->gc_proc != RPC_GSS_PROC_DATA) goto out_decode; switch (gss_cred->gc_service) { case RPC_GSS_SVC_NONE: status = gss_unwrap_resp_auth(task, cred); break; case RPC_GSS_SVC_INTEGRITY: status = gss_unwrap_resp_integ(task, cred, ctx, rqstp, xdr); break; case RPC_GSS_SVC_PRIVACY: status = gss_unwrap_resp_priv(task, cred, ctx, rqstp, xdr); break; } if (status) goto out; out_decode: status = rpcauth_unwrap_resp_decode(task, xdr); out: gss_put_ctx(ctx); return status; } static const struct rpc_authops authgss_ops = { .owner = THIS_MODULE, .au_flavor = RPC_AUTH_GSS, .au_name = "RPCSEC_GSS", .create = gss_create, .destroy = gss_destroy, .hash_cred = gss_hash_cred, .lookup_cred = gss_lookup_cred, .crcreate = gss_create_cred, .info2flavor = gss_mech_info2flavor, .flavor2info = gss_mech_flavor2info, }; static const struct rpc_credops gss_credops = { .cr_name = "AUTH_GSS", .crdestroy = gss_destroy_cred, .cr_init = gss_cred_init, .crmatch = gss_match, .crmarshal = gss_marshal, .crrefresh = gss_refresh, .crvalidate = gss_validate, .crwrap_req = gss_wrap_req, .crunwrap_resp = gss_unwrap_resp, .crkey_timeout = gss_key_timeout, .crstringify_acceptor = gss_stringify_acceptor, .crneed_reencode = gss_xmit_need_reencode, }; static const struct rpc_credops gss_nullops = { .cr_name = "AUTH_GSS", .crdestroy = gss_destroy_nullcred, .crmatch = gss_match, .crmarshal = gss_marshal, .crrefresh = gss_refresh_null, .crvalidate = gss_validate, .crwrap_req = gss_wrap_req, .crunwrap_resp = gss_unwrap_resp, .crstringify_acceptor = gss_stringify_acceptor, }; static const struct rpc_pipe_ops gss_upcall_ops_v0 = { .upcall = gss_v0_upcall, .downcall = gss_pipe_downcall, .destroy_msg = gss_pipe_destroy_msg, .open_pipe = gss_pipe_open_v0, .release_pipe = gss_pipe_release, }; static const struct rpc_pipe_ops gss_upcall_ops_v1 = { .upcall = gss_v1_upcall, .downcall = gss_pipe_downcall, .destroy_msg = gss_pipe_destroy_msg, .open_pipe = gss_pipe_open_v1, .release_pipe = gss_pipe_release, }; static __net_init int rpcsec_gss_init_net(struct net *net) { return gss_svc_init_net(net); } static __net_exit void rpcsec_gss_exit_net(struct net *net) { gss_svc_shutdown_net(net); } static struct pernet_operations rpcsec_gss_net_ops = { .init = rpcsec_gss_init_net, .exit = rpcsec_gss_exit_net, }; /* * Initialize RPCSEC_GSS module */ static int __init init_rpcsec_gss(void) { int err = 0; err = rpcauth_register(&authgss_ops); if (err) goto out; err = gss_svc_init(); if (err) goto out_unregister; err = register_pernet_subsys(&rpcsec_gss_net_ops); if (err) goto out_svc_exit; rpc_init_wait_queue(&pipe_version_rpc_waitqueue, "gss pipe version"); return 0; out_svc_exit: gss_svc_shutdown(); out_unregister: rpcauth_unregister(&authgss_ops); out: return err; } static void __exit exit_rpcsec_gss(void) { unregister_pernet_subsys(&rpcsec_gss_net_ops); gss_svc_shutdown(); rpcauth_unregister(&authgss_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ } MODULE_ALIAS("rpc-auth-6"); MODULE_DESCRIPTION("Sun RPC Kerberos RPCSEC_GSS client authentication"); MODULE_LICENSE("GPL"); module_param_named(expired_cred_retry_delay, gss_expired_cred_retry_delay, uint, 0644); MODULE_PARM_DESC(expired_cred_retry_delay, "Timeout (in seconds) until " "the RPC engine retries an expired credential"); module_param_named(key_expire_timeo, gss_key_expire_timeo, uint, 0644); MODULE_PARM_DESC(key_expire_timeo, "Time (in seconds) at the end of a " "credential keys lifetime where the NFS layer cleans up " "prior to key expiration"); module_init(init_rpcsec_gss) module_exit(exit_rpcsec_gss) |
| 14 14 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __FIRMWARE_LOADER_H #define __FIRMWARE_LOADER_H #include <linux/bitops.h> #include <linux/firmware.h> #include <linux/types.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/completion.h> /** * enum fw_opt - options to control firmware loading behaviour * * @FW_OPT_UEVENT: Enables the fallback mechanism to send a kobject uevent * when the firmware is not found. Userspace is in charge to load the * firmware using the sysfs loading facility. * @FW_OPT_NOWAIT: Used to describe the firmware request is asynchronous. * @FW_OPT_USERHELPER: Enable the fallback mechanism, in case the direct * filesystem lookup fails at finding the firmware. For details refer to * firmware_fallback_sysfs(). * @FW_OPT_NO_WARN: Quiet, avoid printing warning messages. * @FW_OPT_NOCACHE: Disables firmware caching. Firmware caching is used to * cache the firmware upon suspend, so that upon resume races against the * firmware file lookup on storage is avoided. Used for calls where the * file may be too big, or where the driver takes charge of its own * firmware caching mechanism. * @FW_OPT_NOFALLBACK_SYSFS: Disable the sysfs fallback mechanism. Takes * precedence over &FW_OPT_UEVENT and &FW_OPT_USERHELPER. * @FW_OPT_FALLBACK_PLATFORM: Enable fallback to device fw copy embedded in * the platform's main firmware. If both this fallback and the sysfs * fallback are enabled, then this fallback will be tried first. * @FW_OPT_PARTIAL: Allow partial read of firmware instead of needing to read * entire file. */ enum fw_opt { FW_OPT_UEVENT = BIT(0), FW_OPT_NOWAIT = BIT(1), FW_OPT_USERHELPER = BIT(2), FW_OPT_NO_WARN = BIT(3), FW_OPT_NOCACHE = BIT(4), FW_OPT_NOFALLBACK_SYSFS = BIT(5), FW_OPT_FALLBACK_PLATFORM = BIT(6), FW_OPT_PARTIAL = BIT(7), }; enum fw_status { FW_STATUS_UNKNOWN, FW_STATUS_LOADING, FW_STATUS_DONE, FW_STATUS_ABORTED, }; /* * Concurrent request_firmware() for the same firmware need to be * serialized. struct fw_state is simple state machine which hold the * state of the firmware loading. */ struct fw_state { struct completion completion; enum fw_status status; }; struct fw_priv { struct kref ref; struct list_head list; struct firmware_cache *fwc; struct fw_state fw_st; void *data; size_t size; size_t allocated_size; size_t offset; u32 opt_flags; #ifdef CONFIG_FW_LOADER_PAGED_BUF bool is_paged_buf; struct page **pages; int nr_pages; int page_array_size; #endif #ifdef CONFIG_FW_LOADER_USER_HELPER bool need_uevent; struct list_head pending_list; #endif const char *fw_name; }; extern struct mutex fw_lock; extern struct firmware_cache fw_cache; extern bool fw_load_abort_all; static inline bool __fw_state_check(struct fw_priv *fw_priv, enum fw_status status) { struct fw_state *fw_st = &fw_priv->fw_st; return fw_st->status == status; } static inline int __fw_state_wait_common(struct fw_priv *fw_priv, long timeout) { struct fw_state *fw_st = &fw_priv->fw_st; long ret; ret = wait_for_completion_killable_timeout(&fw_st->completion, timeout); if (ret != 0 && fw_st->status == FW_STATUS_ABORTED) return -ENOENT; if (!ret) return -ETIMEDOUT; return ret < 0 ? ret : 0; } static inline void __fw_state_set(struct fw_priv *fw_priv, enum fw_status status) { struct fw_state *fw_st = &fw_priv->fw_st; WRITE_ONCE(fw_st->status, status); if (status == FW_STATUS_DONE || status == FW_STATUS_ABORTED) { #ifdef CONFIG_FW_LOADER_USER_HELPER /* * Doing this here ensures that the fw_priv is deleted from * the pending list in all abort/done paths. */ list_del_init(&fw_priv->pending_list); #endif complete_all(&fw_st->completion); } } static inline void fw_state_aborted(struct fw_priv *fw_priv) { __fw_state_set(fw_priv, FW_STATUS_ABORTED); } static inline bool fw_state_is_aborted(struct fw_priv *fw_priv) { return __fw_state_check(fw_priv, FW_STATUS_ABORTED); } static inline void fw_state_start(struct fw_priv *fw_priv) { __fw_state_set(fw_priv, FW_STATUS_LOADING); } static inline void fw_state_done(struct fw_priv *fw_priv) { __fw_state_set(fw_priv, FW_STATUS_DONE); } static inline bool fw_state_is_done(struct fw_priv *fw_priv) { return __fw_state_check(fw_priv, FW_STATUS_DONE); } static inline bool fw_state_is_loading(struct fw_priv *fw_priv) { return __fw_state_check(fw_priv, FW_STATUS_LOADING); } int alloc_lookup_fw_priv(const char *fw_name, struct firmware_cache *fwc, struct fw_priv **fw_priv, void *dbuf, size_t size, size_t offset, u32 opt_flags); int assign_fw(struct firmware *fw, struct device *device); void free_fw_priv(struct fw_priv *fw_priv); void fw_state_init(struct fw_priv *fw_priv); #ifdef CONFIG_FW_LOADER bool firmware_is_builtin(const struct firmware *fw); bool firmware_request_builtin_buf(struct firmware *fw, const char *name, void *buf, size_t size); #else /* module case */ static inline bool firmware_is_builtin(const struct firmware *fw) { return false; } static inline bool firmware_request_builtin_buf(struct firmware *fw, const char *name, void *buf, size_t size) { return false; } #endif #ifdef CONFIG_FW_LOADER_PAGED_BUF void fw_free_paged_buf(struct fw_priv *fw_priv); int fw_grow_paged_buf(struct fw_priv *fw_priv, int pages_needed); int fw_map_paged_buf(struct fw_priv *fw_priv); bool fw_is_paged_buf(struct fw_priv *fw_priv); #else static inline void fw_free_paged_buf(struct fw_priv *fw_priv) {} static inline int fw_grow_paged_buf(struct fw_priv *fw_priv, int pages_needed) { return -ENXIO; } static inline int fw_map_paged_buf(struct fw_priv *fw_priv) { return -ENXIO; } static inline bool fw_is_paged_buf(struct fw_priv *fw_priv) { return false; } #endif #endif /* __FIRMWARE_LOADER_H */ |
| 10 7 10 18 3 17 5 2 39 21 19 3 3 3 3 7 5 8 8 5 3 8 13 13 11 2 13 13 1 13 7 7 6 7 6 9 29 2 4 2 1 6 10 4 11 4 13 7 6 14 10 13 11 6 6 3 1 8 20 6 6 5 18 10 7 10 3 18 43 43 3 40 43 7 40 12 12 12 12 5 1 14 17 17 17 12 2 14 3 12 4 1 7 36 2 1 5 33 21 21 36 1 1 1 1 2 5 23 2 12 2 8 2 5 1 23 20 9 1 3 2 5 1 2 4 6 12 6 1 6 7 3 3 1 25 13 2 2 1 1 2 4 14 1 13 3 10 13 7 1 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2015, Sony Mobile Communications Inc. * Copyright (c) 2013, The Linux Foundation. All rights reserved. */ #include <linux/module.h> #include <linux/netlink.h> #include <linux/qrtr.h> #include <linux/termios.h> /* For TIOCINQ/OUTQ */ #include <linux/spinlock.h> #include <linux/wait.h> #include <net/sock.h> #include "qrtr.h" #define QRTR_PROTO_VER_1 1 #define QRTR_PROTO_VER_2 3 /* auto-bind range */ #define QRTR_MIN_EPH_SOCKET 0x4000 #define QRTR_MAX_EPH_SOCKET 0x7fff #define QRTR_EPH_PORT_RANGE \ XA_LIMIT(QRTR_MIN_EPH_SOCKET, QRTR_MAX_EPH_SOCKET) #define QRTR_PORT_CTRL_LEGACY 0xffff /** * struct qrtr_hdr_v1 - (I|R)PCrouter packet header version 1 * @version: protocol version * @type: packet type; one of QRTR_TYPE_* * @src_node_id: source node * @src_port_id: source port * @confirm_rx: boolean; whether a resume-tx packet should be send in reply * @size: length of packet, excluding this header * @dst_node_id: destination node * @dst_port_id: destination port */ struct qrtr_hdr_v1 { __le32 version; __le32 type; __le32 src_node_id; __le32 src_port_id; __le32 confirm_rx; __le32 size; __le32 dst_node_id; __le32 dst_port_id; } __packed; /** * struct qrtr_hdr_v2 - (I|R)PCrouter packet header later versions * @version: protocol version * @type: packet type; one of QRTR_TYPE_* * @flags: bitmask of QRTR_FLAGS_* * @optlen: length of optional header data * @size: length of packet, excluding this header and optlen * @src_node_id: source node * @src_port_id: source port * @dst_node_id: destination node * @dst_port_id: destination port */ struct qrtr_hdr_v2 { u8 version; u8 type; u8 flags; u8 optlen; __le32 size; __le16 src_node_id; __le16 src_port_id; __le16 dst_node_id; __le16 dst_port_id; }; #define QRTR_FLAGS_CONFIRM_RX BIT(0) struct qrtr_cb { u32 src_node; u32 src_port; u32 dst_node; u32 dst_port; u8 type; u8 confirm_rx; }; #define QRTR_HDR_MAX_SIZE max_t(size_t, sizeof(struct qrtr_hdr_v1), \ sizeof(struct qrtr_hdr_v2)) struct qrtr_sock { /* WARNING: sk must be the first member */ struct sock sk; struct sockaddr_qrtr us; struct sockaddr_qrtr peer; }; static inline struct qrtr_sock *qrtr_sk(struct sock *sk) { BUILD_BUG_ON(offsetof(struct qrtr_sock, sk) != 0); return container_of(sk, struct qrtr_sock, sk); } static unsigned int qrtr_local_nid = 1; /* for node ids */ static RADIX_TREE(qrtr_nodes, GFP_ATOMIC); static DEFINE_SPINLOCK(qrtr_nodes_lock); /* broadcast list */ static LIST_HEAD(qrtr_all_nodes); /* lock for qrtr_all_nodes and node reference */ static DEFINE_MUTEX(qrtr_node_lock); /* local port allocation management */ static DEFINE_XARRAY_ALLOC(qrtr_ports); /** * struct qrtr_node - endpoint node * @ep_lock: lock for endpoint management and callbacks * @ep: endpoint * @ref: reference count for node * @nid: node id * @qrtr_tx_flow: tree of qrtr_tx_flow, keyed by node << 32 | port * @qrtr_tx_lock: lock for qrtr_tx_flow inserts * @rx_queue: receive queue * @item: list item for broadcast list */ struct qrtr_node { struct mutex ep_lock; struct qrtr_endpoint *ep; struct kref ref; unsigned int nid; struct radix_tree_root qrtr_tx_flow; struct mutex qrtr_tx_lock; /* for qrtr_tx_flow */ struct sk_buff_head rx_queue; struct list_head item; }; /** * struct qrtr_tx_flow - tx flow control * @resume_tx: waiters for a resume tx from the remote * @pending: number of waiting senders * @tx_failed: indicates that a message with confirm_rx flag was lost */ struct qrtr_tx_flow { struct wait_queue_head resume_tx; int pending; int tx_failed; }; #define QRTR_TX_FLOW_HIGH 10 #define QRTR_TX_FLOW_LOW 5 static int qrtr_local_enqueue(struct qrtr_node *node, struct sk_buff *skb, int type, struct sockaddr_qrtr *from, struct sockaddr_qrtr *to); static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb, int type, struct sockaddr_qrtr *from, struct sockaddr_qrtr *to); static struct qrtr_sock *qrtr_port_lookup(int port); static void qrtr_port_put(struct qrtr_sock *ipc); /* Release node resources and free the node. * * Do not call directly, use qrtr_node_release. To be used with * kref_put_mutex. As such, the node mutex is expected to be locked on call. */ static void __qrtr_node_release(struct kref *kref) { struct qrtr_node *node = container_of(kref, struct qrtr_node, ref); struct radix_tree_iter iter; struct qrtr_tx_flow *flow; unsigned long flags; void __rcu **slot; spin_lock_irqsave(&qrtr_nodes_lock, flags); /* If the node is a bridge for other nodes, there are possibly * multiple entries pointing to our released node, delete them all. */ radix_tree_for_each_slot(slot, &qrtr_nodes, &iter, 0) { if (*slot == node) radix_tree_iter_delete(&qrtr_nodes, &iter, slot); } spin_unlock_irqrestore(&qrtr_nodes_lock, flags); list_del(&node->item); mutex_unlock(&qrtr_node_lock); skb_queue_purge(&node->rx_queue); /* Free tx flow counters */ radix_tree_for_each_slot(slot, &node->qrtr_tx_flow, &iter, 0) { flow = *slot; radix_tree_iter_delete(&node->qrtr_tx_flow, &iter, slot); kfree(flow); } kfree(node); } /* Increment reference to node. */ static struct qrtr_node *qrtr_node_acquire(struct qrtr_node *node) { if (node) kref_get(&node->ref); return node; } /* Decrement reference to node and release as necessary. */ static void qrtr_node_release(struct qrtr_node *node) { if (!node) return; kref_put_mutex(&node->ref, __qrtr_node_release, &qrtr_node_lock); } /** * qrtr_tx_resume() - reset flow control counter * @node: qrtr_node that the QRTR_TYPE_RESUME_TX packet arrived on * @skb: resume_tx packet */ static void qrtr_tx_resume(struct qrtr_node *node, struct sk_buff *skb) { struct qrtr_ctrl_pkt *pkt = (struct qrtr_ctrl_pkt *)skb->data; u64 remote_node = le32_to_cpu(pkt->client.node); u32 remote_port = le32_to_cpu(pkt->client.port); struct qrtr_tx_flow *flow; unsigned long key; key = remote_node << 32 | remote_port; rcu_read_lock(); flow = radix_tree_lookup(&node->qrtr_tx_flow, key); rcu_read_unlock(); if (flow) { spin_lock(&flow->resume_tx.lock); flow->pending = 0; spin_unlock(&flow->resume_tx.lock); wake_up_interruptible_all(&flow->resume_tx); } consume_skb(skb); } /** * qrtr_tx_wait() - flow control for outgoing packets * @node: qrtr_node that the packet is to be send to * @dest_node: node id of the destination * @dest_port: port number of the destination * @type: type of message * * The flow control scheme is based around the low and high "watermarks". When * the low watermark is passed the confirm_rx flag is set on the outgoing * message, which will trigger the remote to send a control message of the type * QRTR_TYPE_RESUME_TX to reset the counter. If the high watermark is hit * further transmision should be paused. * * Return: 1 if confirm_rx should be set, 0 otherwise or errno failure */ static int qrtr_tx_wait(struct qrtr_node *node, int dest_node, int dest_port, int type) { unsigned long key = (u64)dest_node << 32 | dest_port; struct qrtr_tx_flow *flow; int confirm_rx = 0; int ret; /* Never set confirm_rx on non-data packets */ if (type != QRTR_TYPE_DATA) return 0; mutex_lock(&node->qrtr_tx_lock); flow = radix_tree_lookup(&node->qrtr_tx_flow, key); if (!flow) { flow = kzalloc(sizeof(*flow), GFP_KERNEL); if (flow) { init_waitqueue_head(&flow->resume_tx); if (radix_tree_insert(&node->qrtr_tx_flow, key, flow)) { kfree(flow); flow = NULL; } } } mutex_unlock(&node->qrtr_tx_lock); /* Set confirm_rx if we where unable to find and allocate a flow */ if (!flow) return 1; spin_lock_irq(&flow->resume_tx.lock); ret = wait_event_interruptible_locked_irq(flow->resume_tx, flow->pending < QRTR_TX_FLOW_HIGH || flow->tx_failed || !node->ep); if (ret < 0) { confirm_rx = ret; } else if (!node->ep) { confirm_rx = -EPIPE; } else if (flow->tx_failed) { flow->tx_failed = 0; confirm_rx = 1; } else { flow->pending++; confirm_rx = flow->pending == QRTR_TX_FLOW_LOW; } spin_unlock_irq(&flow->resume_tx.lock); return confirm_rx; } /** * qrtr_tx_flow_failed() - flag that tx of confirm_rx flagged messages failed * @node: qrtr_node that the packet is to be send to * @dest_node: node id of the destination * @dest_port: port number of the destination * * Signal that the transmission of a message with confirm_rx flag failed. The * flow's "pending" counter will keep incrementing towards QRTR_TX_FLOW_HIGH, * at which point transmission would stall forever waiting for the resume TX * message associated with the dropped confirm_rx message. * Work around this by marking the flow as having a failed transmission and * cause the next transmission attempt to be sent with the confirm_rx. */ static void qrtr_tx_flow_failed(struct qrtr_node *node, int dest_node, int dest_port) { unsigned long key = (u64)dest_node << 32 | dest_port; struct qrtr_tx_flow *flow; rcu_read_lock(); flow = radix_tree_lookup(&node->qrtr_tx_flow, key); rcu_read_unlock(); if (flow) { spin_lock_irq(&flow->resume_tx.lock); flow->tx_failed = 1; spin_unlock_irq(&flow->resume_tx.lock); } } /* Pass an outgoing packet socket buffer to the endpoint driver. */ static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb, int type, struct sockaddr_qrtr *from, struct sockaddr_qrtr *to) { struct qrtr_hdr_v1 *hdr; size_t len = skb->len; int rc, confirm_rx; confirm_rx = qrtr_tx_wait(node, to->sq_node, to->sq_port, type); if (confirm_rx < 0) { kfree_skb(skb); return confirm_rx; } hdr = skb_push(skb, sizeof(*hdr)); hdr->version = cpu_to_le32(QRTR_PROTO_VER_1); hdr->type = cpu_to_le32(type); hdr->src_node_id = cpu_to_le32(from->sq_node); hdr->src_port_id = cpu_to_le32(from->sq_port); if (to->sq_port == QRTR_PORT_CTRL) { hdr->dst_node_id = cpu_to_le32(node->nid); hdr->dst_port_id = cpu_to_le32(QRTR_PORT_CTRL); } else { hdr->dst_node_id = cpu_to_le32(to->sq_node); hdr->dst_port_id = cpu_to_le32(to->sq_port); } hdr->size = cpu_to_le32(len); hdr->confirm_rx = !!confirm_rx; rc = skb_put_padto(skb, ALIGN(len, 4) + sizeof(*hdr)); if (!rc) { mutex_lock(&node->ep_lock); rc = -ENODEV; if (node->ep) rc = node->ep->xmit(node->ep, skb); else kfree_skb(skb); mutex_unlock(&node->ep_lock); } /* Need to ensure that a subsequent message carries the otherwise lost * confirm_rx flag if we dropped this one */ if (rc && confirm_rx) qrtr_tx_flow_failed(node, to->sq_node, to->sq_port); return rc; } /* Lookup node by id. * * callers must release with qrtr_node_release() */ static struct qrtr_node *qrtr_node_lookup(unsigned int nid) { struct qrtr_node *node; unsigned long flags; mutex_lock(&qrtr_node_lock); spin_lock_irqsave(&qrtr_nodes_lock, flags); node = radix_tree_lookup(&qrtr_nodes, nid); node = qrtr_node_acquire(node); spin_unlock_irqrestore(&qrtr_nodes_lock, flags); mutex_unlock(&qrtr_node_lock); return node; } /* Assign node id to node. * * This is mostly useful for automatic node id assignment, based on * the source id in the incoming packet. */ static void qrtr_node_assign(struct qrtr_node *node, unsigned int nid) { unsigned long flags; if (nid == QRTR_EP_NID_AUTO) return; spin_lock_irqsave(&qrtr_nodes_lock, flags); radix_tree_insert(&qrtr_nodes, nid, node); if (node->nid == QRTR_EP_NID_AUTO) node->nid = nid; spin_unlock_irqrestore(&qrtr_nodes_lock, flags); } /** * qrtr_endpoint_post() - post incoming data * @ep: endpoint handle * @data: data pointer * @len: size of data in bytes * * Return: 0 on success; negative error code on failure */ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) { struct qrtr_node *node = ep->node; const struct qrtr_hdr_v1 *v1; const struct qrtr_hdr_v2 *v2; struct qrtr_sock *ipc; struct sk_buff *skb; struct qrtr_cb *cb; size_t size; unsigned int ver; size_t hdrlen; if (len == 0 || len & 3) return -EINVAL; skb = __netdev_alloc_skb(NULL, len, GFP_ATOMIC | __GFP_NOWARN); if (!skb) return -ENOMEM; cb = (struct qrtr_cb *)skb->cb; /* Version field in v1 is little endian, so this works for both cases */ ver = *(u8*)data; switch (ver) { case QRTR_PROTO_VER_1: if (len < sizeof(*v1)) goto err; v1 = data; hdrlen = sizeof(*v1); cb->type = le32_to_cpu(v1->type); cb->src_node = le32_to_cpu(v1->src_node_id); cb->src_port = le32_to_cpu(v1->src_port_id); cb->confirm_rx = !!v1->confirm_rx; cb->dst_node = le32_to_cpu(v1->dst_node_id); cb->dst_port = le32_to_cpu(v1->dst_port_id); size = le32_to_cpu(v1->size); break; case QRTR_PROTO_VER_2: if (len < sizeof(*v2)) goto err; v2 = data; hdrlen = sizeof(*v2) + v2->optlen; cb->type = v2->type; cb->confirm_rx = !!(v2->flags & QRTR_FLAGS_CONFIRM_RX); cb->src_node = le16_to_cpu(v2->src_node_id); cb->src_port = le16_to_cpu(v2->src_port_id); cb->dst_node = le16_to_cpu(v2->dst_node_id); cb->dst_port = le16_to_cpu(v2->dst_port_id); if (cb->src_port == (u16)QRTR_PORT_CTRL) cb->src_port = QRTR_PORT_CTRL; if (cb->dst_port == (u16)QRTR_PORT_CTRL) cb->dst_port = QRTR_PORT_CTRL; size = le32_to_cpu(v2->size); break; default: pr_err("qrtr: Invalid version %d\n", ver); goto err; } if (cb->dst_port == QRTR_PORT_CTRL_LEGACY) cb->dst_port = QRTR_PORT_CTRL; if (!size || len != ALIGN(size, 4) + hdrlen) goto err; if ((cb->type == QRTR_TYPE_NEW_SERVER || cb->type == QRTR_TYPE_RESUME_TX) && size < sizeof(struct qrtr_ctrl_pkt)) goto err; if (cb->dst_port != QRTR_PORT_CTRL && cb->type != QRTR_TYPE_DATA && cb->type != QRTR_TYPE_RESUME_TX) goto err; skb_put_data(skb, data + hdrlen, size); qrtr_node_assign(node, cb->src_node); if (cb->type == QRTR_TYPE_NEW_SERVER) { /* Remote node endpoint can bridge other distant nodes */ const struct qrtr_ctrl_pkt *pkt; pkt = data + hdrlen; qrtr_node_assign(node, le32_to_cpu(pkt->server.node)); } if (cb->type == QRTR_TYPE_RESUME_TX) { qrtr_tx_resume(node, skb); } else { ipc = qrtr_port_lookup(cb->dst_port); if (!ipc) goto err; if (sock_queue_rcv_skb(&ipc->sk, skb)) { qrtr_port_put(ipc); goto err; } qrtr_port_put(ipc); } return 0; err: kfree_skb(skb); return -EINVAL; } EXPORT_SYMBOL_GPL(qrtr_endpoint_post); /** * qrtr_alloc_ctrl_packet() - allocate control packet skb * @pkt: reference to qrtr_ctrl_pkt pointer * @flags: the type of memory to allocate * * Returns newly allocated sk_buff, or NULL on failure * * This function allocates a sk_buff large enough to carry a qrtr_ctrl_pkt and * on success returns a reference to the control packet in @pkt. */ static struct sk_buff *qrtr_alloc_ctrl_packet(struct qrtr_ctrl_pkt **pkt, gfp_t flags) { const int pkt_len = sizeof(struct qrtr_ctrl_pkt); struct sk_buff *skb; skb = alloc_skb(QRTR_HDR_MAX_SIZE + pkt_len, flags); if (!skb) return NULL; skb_reserve(skb, QRTR_HDR_MAX_SIZE); *pkt = skb_put_zero(skb, pkt_len); return skb; } /** * qrtr_endpoint_register() - register a new endpoint * @ep: endpoint to register * @nid: desired node id; may be QRTR_EP_NID_AUTO for auto-assignment * Return: 0 on success; negative error code on failure * * The specified endpoint must have the xmit function pointer set on call. */ int qrtr_endpoint_register(struct qrtr_endpoint *ep, unsigned int nid) { struct qrtr_node *node; if (!ep || !ep->xmit) return -EINVAL; node = kzalloc(sizeof(*node), GFP_KERNEL); if (!node) return -ENOMEM; kref_init(&node->ref); mutex_init(&node->ep_lock); skb_queue_head_init(&node->rx_queue); node->nid = QRTR_EP_NID_AUTO; node->ep = ep; INIT_RADIX_TREE(&node->qrtr_tx_flow, GFP_KERNEL); mutex_init(&node->qrtr_tx_lock); qrtr_node_assign(node, nid); mutex_lock(&qrtr_node_lock); list_add(&node->item, &qrtr_all_nodes); mutex_unlock(&qrtr_node_lock); ep->node = node; return 0; } EXPORT_SYMBOL_GPL(qrtr_endpoint_register); /** * qrtr_endpoint_unregister - unregister endpoint * @ep: endpoint to unregister */ void qrtr_endpoint_unregister(struct qrtr_endpoint *ep) { struct qrtr_node *node = ep->node; struct sockaddr_qrtr src = {AF_QIPCRTR, node->nid, QRTR_PORT_CTRL}; struct sockaddr_qrtr dst = {AF_QIPCRTR, qrtr_local_nid, QRTR_PORT_CTRL}; struct radix_tree_iter iter; struct qrtr_ctrl_pkt *pkt; struct qrtr_tx_flow *flow; struct sk_buff *skb; unsigned long flags; void __rcu **slot; mutex_lock(&node->ep_lock); node->ep = NULL; mutex_unlock(&node->ep_lock); /* Notify the local controller about the event */ spin_lock_irqsave(&qrtr_nodes_lock, flags); radix_tree_for_each_slot(slot, &qrtr_nodes, &iter, 0) { if (*slot != node) continue; src.sq_node = iter.index; skb = qrtr_alloc_ctrl_packet(&pkt, GFP_ATOMIC); if (skb) { pkt->cmd = cpu_to_le32(QRTR_TYPE_BYE); qrtr_local_enqueue(NULL, skb, QRTR_TYPE_BYE, &src, &dst); } } spin_unlock_irqrestore(&qrtr_nodes_lock, flags); /* Wake up any transmitters waiting for resume-tx from the node */ mutex_lock(&node->qrtr_tx_lock); radix_tree_for_each_slot(slot, &node->qrtr_tx_flow, &iter, 0) { flow = *slot; wake_up_interruptible_all(&flow->resume_tx); } mutex_unlock(&node->qrtr_tx_lock); qrtr_node_release(node); ep->node = NULL; } EXPORT_SYMBOL_GPL(qrtr_endpoint_unregister); /* Lookup socket by port. * * Callers must release with qrtr_port_put() */ static struct qrtr_sock *qrtr_port_lookup(int port) { struct qrtr_sock *ipc; if (port == QRTR_PORT_CTRL) port = 0; rcu_read_lock(); ipc = xa_load(&qrtr_ports, port); if (ipc) sock_hold(&ipc->sk); rcu_read_unlock(); return ipc; } /* Release acquired socket. */ static void qrtr_port_put(struct qrtr_sock *ipc) { sock_put(&ipc->sk); } /* Remove port assignment. */ static void qrtr_port_remove(struct qrtr_sock *ipc) { struct qrtr_ctrl_pkt *pkt; struct sk_buff *skb; int port = ipc->us.sq_port; struct sockaddr_qrtr to; to.sq_family = AF_QIPCRTR; to.sq_node = QRTR_NODE_BCAST; to.sq_port = QRTR_PORT_CTRL; skb = qrtr_alloc_ctrl_packet(&pkt, GFP_KERNEL); if (skb) { pkt->cmd = cpu_to_le32(QRTR_TYPE_DEL_CLIENT); pkt->client.node = cpu_to_le32(ipc->us.sq_node); pkt->client.port = cpu_to_le32(ipc->us.sq_port); skb_set_owner_w(skb, &ipc->sk); qrtr_bcast_enqueue(NULL, skb, QRTR_TYPE_DEL_CLIENT, &ipc->us, &to); } if (port == QRTR_PORT_CTRL) port = 0; __sock_put(&ipc->sk); xa_erase(&qrtr_ports, port); /* Ensure that if qrtr_port_lookup() did enter the RCU read section we * wait for it to up increment the refcount */ synchronize_rcu(); } /* Assign port number to socket. * * Specify port in the integer pointed to by port, and it will be adjusted * on return as necesssary. * * Port may be: * 0: Assign ephemeral port in [QRTR_MIN_EPH_SOCKET, QRTR_MAX_EPH_SOCKET] * <QRTR_MIN_EPH_SOCKET: Specified; requires CAP_NET_ADMIN * >QRTR_MIN_EPH_SOCKET: Specified; available to all */ static int qrtr_port_assign(struct qrtr_sock *ipc, int *port) { int rc; if (!*port) { rc = xa_alloc(&qrtr_ports, port, ipc, QRTR_EPH_PORT_RANGE, GFP_KERNEL); } else if (*port < QRTR_MIN_EPH_SOCKET && !capable(CAP_NET_ADMIN)) { rc = -EACCES; } else if (*port == QRTR_PORT_CTRL) { rc = xa_insert(&qrtr_ports, 0, ipc, GFP_KERNEL); } else { rc = xa_insert(&qrtr_ports, *port, ipc, GFP_KERNEL); } if (rc == -EBUSY) return -EADDRINUSE; else if (rc < 0) return rc; sock_hold(&ipc->sk); return 0; } /* Reset all non-control ports */ static void qrtr_reset_ports(void) { struct qrtr_sock *ipc; unsigned long index; rcu_read_lock(); xa_for_each_start(&qrtr_ports, index, ipc, 1) { sock_hold(&ipc->sk); ipc->sk.sk_err = ENETRESET; sk_error_report(&ipc->sk); sock_put(&ipc->sk); } rcu_read_unlock(); } /* Bind socket to address. * * Socket should be locked upon call. */ static int __qrtr_bind(struct socket *sock, const struct sockaddr_qrtr *addr, int zapped) { struct qrtr_sock *ipc = qrtr_sk(sock->sk); struct sock *sk = sock->sk; int port; int rc; /* rebinding ok */ if (!zapped && addr->sq_port == ipc->us.sq_port) return 0; port = addr->sq_port; rc = qrtr_port_assign(ipc, &port); if (rc) return rc; /* unbind previous, if any */ if (!zapped) qrtr_port_remove(ipc); ipc->us.sq_port = port; sock_reset_flag(sk, SOCK_ZAPPED); /* Notify all open ports about the new controller */ if (port == QRTR_PORT_CTRL) qrtr_reset_ports(); return 0; } /* Auto bind to an ephemeral port. */ static int qrtr_autobind(struct socket *sock) { struct sock *sk = sock->sk; struct sockaddr_qrtr addr; if (!sock_flag(sk, SOCK_ZAPPED)) return 0; addr.sq_family = AF_QIPCRTR; addr.sq_node = qrtr_local_nid; addr.sq_port = 0; return __qrtr_bind(sock, &addr, 1); } /* Bind socket to specified sockaddr. */ static int qrtr_bind(struct socket *sock, struct sockaddr *saddr, int len) { DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, saddr); struct qrtr_sock *ipc = qrtr_sk(sock->sk); struct sock *sk = sock->sk; int rc; if (len < sizeof(*addr) || addr->sq_family != AF_QIPCRTR) return -EINVAL; if (addr->sq_node != ipc->us.sq_node) return -EINVAL; lock_sock(sk); rc = __qrtr_bind(sock, addr, sock_flag(sk, SOCK_ZAPPED)); release_sock(sk); return rc; } /* Queue packet to local peer socket. */ static int qrtr_local_enqueue(struct qrtr_node *node, struct sk_buff *skb, int type, struct sockaddr_qrtr *from, struct sockaddr_qrtr *to) { struct qrtr_sock *ipc; struct qrtr_cb *cb; ipc = qrtr_port_lookup(to->sq_port); if (!ipc || &ipc->sk == skb->sk) { /* do not send to self */ if (ipc) qrtr_port_put(ipc); kfree_skb(skb); return -ENODEV; } cb = (struct qrtr_cb *)skb->cb; cb->src_node = from->sq_node; cb->src_port = from->sq_port; if (sock_queue_rcv_skb(&ipc->sk, skb)) { qrtr_port_put(ipc); kfree_skb(skb); return -ENOSPC; } qrtr_port_put(ipc); return 0; } /* Queue packet for broadcast. */ static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb, int type, struct sockaddr_qrtr *from, struct sockaddr_qrtr *to) { struct sk_buff *skbn; mutex_lock(&qrtr_node_lock); list_for_each_entry(node, &qrtr_all_nodes, item) { skbn = pskb_copy(skb, GFP_KERNEL); if (!skbn) break; skb_set_owner_w(skbn, skb->sk); qrtr_node_enqueue(node, skbn, type, from, to); } mutex_unlock(&qrtr_node_lock); qrtr_local_enqueue(NULL, skb, type, from, to); return 0; } static int qrtr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, msg->msg_name); int (*enqueue_fn)(struct qrtr_node *, struct sk_buff *, int, struct sockaddr_qrtr *, struct sockaddr_qrtr *); __le32 qrtr_type = cpu_to_le32(QRTR_TYPE_DATA); struct qrtr_sock *ipc = qrtr_sk(sock->sk); struct sock *sk = sock->sk; struct qrtr_node *node; struct sk_buff *skb; size_t plen; u32 type; int rc; if (msg->msg_flags & ~(MSG_DONTWAIT)) return -EINVAL; if (len > 65535) return -EMSGSIZE; lock_sock(sk); if (addr) { if (msg->msg_namelen < sizeof(*addr)) { release_sock(sk); return -EINVAL; } if (addr->sq_family != AF_QIPCRTR) { release_sock(sk); return -EINVAL; } rc = qrtr_autobind(sock); if (rc) { release_sock(sk); return rc; } } else if (sk->sk_state == TCP_ESTABLISHED) { addr = &ipc->peer; } else { release_sock(sk); return -ENOTCONN; } node = NULL; if (addr->sq_node == QRTR_NODE_BCAST) { if (addr->sq_port != QRTR_PORT_CTRL && qrtr_local_nid != QRTR_NODE_BCAST) { release_sock(sk); return -ENOTCONN; } enqueue_fn = qrtr_bcast_enqueue; } else if (addr->sq_node == ipc->us.sq_node) { enqueue_fn = qrtr_local_enqueue; } else { node = qrtr_node_lookup(addr->sq_node); if (!node) { release_sock(sk); return -ECONNRESET; } enqueue_fn = qrtr_node_enqueue; } plen = (len + 3) & ~3; skb = sock_alloc_send_skb(sk, plen + QRTR_HDR_MAX_SIZE, msg->msg_flags & MSG_DONTWAIT, &rc); if (!skb) { rc = -ENOMEM; goto out_node; } skb_reserve(skb, QRTR_HDR_MAX_SIZE); rc = memcpy_from_msg(skb_put(skb, len), msg, len); if (rc) { kfree_skb(skb); goto out_node; } if (ipc->us.sq_port == QRTR_PORT_CTRL) { if (len < 4) { rc = -EINVAL; kfree_skb(skb); goto out_node; } /* control messages already require the type as 'command' */ skb_copy_bits(skb, 0, &qrtr_type, 4); } type = le32_to_cpu(qrtr_type); rc = enqueue_fn(node, skb, type, &ipc->us, addr); if (rc >= 0) rc = len; out_node: qrtr_node_release(node); release_sock(sk); return rc; } static int qrtr_send_resume_tx(struct qrtr_cb *cb) { struct sockaddr_qrtr remote = { AF_QIPCRTR, cb->src_node, cb->src_port }; struct sockaddr_qrtr local = { AF_QIPCRTR, cb->dst_node, cb->dst_port }; struct qrtr_ctrl_pkt *pkt; struct qrtr_node *node; struct sk_buff *skb; int ret; node = qrtr_node_lookup(remote.sq_node); if (!node) return -EINVAL; skb = qrtr_alloc_ctrl_packet(&pkt, GFP_KERNEL); if (!skb) return -ENOMEM; pkt->cmd = cpu_to_le32(QRTR_TYPE_RESUME_TX); pkt->client.node = cpu_to_le32(cb->dst_node); pkt->client.port = cpu_to_le32(cb->dst_port); ret = qrtr_node_enqueue(node, skb, QRTR_TYPE_RESUME_TX, &local, &remote); qrtr_node_release(node); return ret; } static int qrtr_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, msg->msg_name); struct sock *sk = sock->sk; struct sk_buff *skb; struct qrtr_cb *cb; int copied, rc; lock_sock(sk); if (sock_flag(sk, SOCK_ZAPPED)) { release_sock(sk); return -EADDRNOTAVAIL; } skb = skb_recv_datagram(sk, flags, &rc); if (!skb) { release_sock(sk); return rc; } cb = (struct qrtr_cb *)skb->cb; copied = skb->len; if (copied > size) { copied = size; msg->msg_flags |= MSG_TRUNC; } rc = skb_copy_datagram_msg(skb, 0, msg, copied); if (rc < 0) goto out; rc = copied; if (addr) { /* There is an anonymous 2-byte hole after sq_family, * make sure to clear it. */ memset(addr, 0, sizeof(*addr)); addr->sq_family = AF_QIPCRTR; addr->sq_node = cb->src_node; addr->sq_port = cb->src_port; msg->msg_namelen = sizeof(*addr); } out: if (cb->confirm_rx) qrtr_send_resume_tx(cb); skb_free_datagram(sk, skb); release_sock(sk); return rc; } static int qrtr_connect(struct socket *sock, struct sockaddr *saddr, int len, int flags) { DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, saddr); struct qrtr_sock *ipc = qrtr_sk(sock->sk); struct sock *sk = sock->sk; int rc; if (len < sizeof(*addr) || addr->sq_family != AF_QIPCRTR) return -EINVAL; lock_sock(sk); sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; rc = qrtr_autobind(sock); if (rc) { release_sock(sk); return rc; } ipc->peer = *addr; sock->state = SS_CONNECTED; sk->sk_state = TCP_ESTABLISHED; release_sock(sk); return 0; } static int qrtr_getname(struct socket *sock, struct sockaddr *saddr, int peer) { struct qrtr_sock *ipc = qrtr_sk(sock->sk); struct sockaddr_qrtr qaddr; struct sock *sk = sock->sk; lock_sock(sk); if (peer) { if (sk->sk_state != TCP_ESTABLISHED) { release_sock(sk); return -ENOTCONN; } qaddr = ipc->peer; } else { qaddr = ipc->us; } release_sock(sk); qaddr.sq_family = AF_QIPCRTR; memcpy(saddr, &qaddr, sizeof(qaddr)); return sizeof(qaddr); } static int qrtr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; struct qrtr_sock *ipc = qrtr_sk(sock->sk); struct sock *sk = sock->sk; struct sockaddr_qrtr *sq; struct sk_buff *skb; struct ifreq ifr; long len = 0; int rc = 0; lock_sock(sk); switch (cmd) { case TIOCOUTQ: len = sk->sk_sndbuf - sk_wmem_alloc_get(sk); if (len < 0) len = 0; rc = put_user(len, (int __user *)argp); break; case TIOCINQ: skb = skb_peek(&sk->sk_receive_queue); if (skb) len = skb->len; rc = put_user(len, (int __user *)argp); break; case SIOCGIFADDR: if (get_user_ifreq(&ifr, NULL, argp)) { rc = -EFAULT; break; } sq = (struct sockaddr_qrtr *)&ifr.ifr_addr; *sq = ipc->us; if (put_user_ifreq(&ifr, argp)) { rc = -EFAULT; break; } break; case SIOCADDRT: case SIOCDELRT: case SIOCSIFADDR: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFNETMASK: case SIOCSIFNETMASK: rc = -EINVAL; break; default: rc = -ENOIOCTLCMD; break; } release_sock(sk); return rc; } static int qrtr_release(struct socket *sock) { struct sock *sk = sock->sk; struct qrtr_sock *ipc; if (!sk) return 0; lock_sock(sk); ipc = qrtr_sk(sk); sk->sk_shutdown = SHUTDOWN_MASK; if (!sock_flag(sk, SOCK_DEAD)) sk->sk_state_change(sk); sock_set_flag(sk, SOCK_DEAD); sock_orphan(sk); sock->sk = NULL; if (!sock_flag(sk, SOCK_ZAPPED)) qrtr_port_remove(ipc); skb_queue_purge(&sk->sk_receive_queue); release_sock(sk); sock_put(sk); return 0; } static const struct proto_ops qrtr_proto_ops = { .owner = THIS_MODULE, .family = AF_QIPCRTR, .bind = qrtr_bind, .connect = qrtr_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .listen = sock_no_listen, .sendmsg = qrtr_sendmsg, .recvmsg = qrtr_recvmsg, .getname = qrtr_getname, .ioctl = qrtr_ioctl, .gettstamp = sock_gettstamp, .poll = datagram_poll, .shutdown = sock_no_shutdown, .release = qrtr_release, .mmap = sock_no_mmap, }; static struct proto qrtr_proto = { .name = "QIPCRTR", .owner = THIS_MODULE, .obj_size = sizeof(struct qrtr_sock), }; static int qrtr_create(struct net *net, struct socket *sock, int protocol, int kern) { struct qrtr_sock *ipc; struct sock *sk; if (sock->type != SOCK_DGRAM) return -EPROTOTYPE; sk = sk_alloc(net, AF_QIPCRTR, GFP_KERNEL, &qrtr_proto, kern); if (!sk) return -ENOMEM; sock_set_flag(sk, SOCK_ZAPPED); sock_init_data(sock, sk); sock->ops = &qrtr_proto_ops; ipc = qrtr_sk(sk); ipc->us.sq_family = AF_QIPCRTR; ipc->us.sq_node = qrtr_local_nid; ipc->us.sq_port = 0; return 0; } static const struct net_proto_family qrtr_family = { .owner = THIS_MODULE, .family = AF_QIPCRTR, .create = qrtr_create, }; static int __init qrtr_proto_init(void) { int rc; rc = proto_register(&qrtr_proto, 1); if (rc) return rc; rc = sock_register(&qrtr_family); if (rc) goto err_proto; rc = qrtr_ns_init(); if (rc) goto err_sock; return 0; err_sock: sock_unregister(qrtr_family.family); err_proto: proto_unregister(&qrtr_proto); return rc; } postcore_initcall(qrtr_proto_init); static void __exit qrtr_proto_fini(void) { qrtr_ns_remove(); sock_unregister(qrtr_family.family); proto_unregister(&qrtr_proto); } module_exit(qrtr_proto_fini); MODULE_DESCRIPTION("Qualcomm IPC-router driver"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_NETPROTO(PF_QIPCRTR); |
| 84 85 5 1 1 1 2 5 10 10 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 | // SPDX-License-Identifier: GPL-2.0 /* * Filesystem-level keyring for fscrypt * * Copyright 2019 Google LLC */ /* * This file implements management of fscrypt master keys in the * filesystem-level keyring, including the ioctls: * * - FS_IOC_ADD_ENCRYPTION_KEY * - FS_IOC_REMOVE_ENCRYPTION_KEY * - FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS * - FS_IOC_GET_ENCRYPTION_KEY_STATUS * * See the "User API" section of Documentation/filesystems/fscrypt.rst for more * information about these ioctls. */ #include <crypto/skcipher.h> #include <linux/export.h> #include <linux/key-type.h> #include <linux/once.h> #include <linux/random.h> #include <linux/seq_file.h> #include <linux/unaligned.h> #include "fscrypt_private.h" /* The master encryption keys for a filesystem (->s_master_keys) */ struct fscrypt_keyring { /* * Lock that protects ->key_hashtable. It does *not* protect the * fscrypt_master_key structs themselves. */ spinlock_t lock; /* Hash table that maps fscrypt_key_specifier to fscrypt_master_key */ struct hlist_head key_hashtable[128]; }; static void wipe_master_key_secret(struct fscrypt_master_key_secret *secret) { memzero_explicit(secret, sizeof(*secret)); } static void move_master_key_secret(struct fscrypt_master_key_secret *dst, struct fscrypt_master_key_secret *src) { memcpy(dst, src, sizeof(*dst)); memzero_explicit(src, sizeof(*src)); } static void fscrypt_free_master_key(struct rcu_head *head) { struct fscrypt_master_key *mk = container_of(head, struct fscrypt_master_key, mk_rcu_head); /* * The master key secret and any embedded subkeys should have already * been wiped when the last active reference to the fscrypt_master_key * struct was dropped; doing it here would be unnecessarily late. * Nevertheless, use kfree_sensitive() in case anything was missed. */ kfree_sensitive(mk); } void fscrypt_put_master_key(struct fscrypt_master_key *mk) { if (!refcount_dec_and_test(&mk->mk_struct_refs)) return; /* * No structural references left, so free ->mk_users, and also free the * fscrypt_master_key struct itself after an RCU grace period ensures * that concurrent keyring lookups can no longer find it. */ WARN_ON_ONCE(refcount_read(&mk->mk_active_refs) != 0); if (mk->mk_users) { /* Clear the keyring so the quota gets released right away. */ keyring_clear(mk->mk_users); key_put(mk->mk_users); mk->mk_users = NULL; } call_rcu(&mk->mk_rcu_head, fscrypt_free_master_key); } void fscrypt_put_master_key_activeref(struct super_block *sb, struct fscrypt_master_key *mk) { size_t i; if (!refcount_dec_and_test(&mk->mk_active_refs)) return; /* * No active references left, so complete the full removal of this * fscrypt_master_key struct by removing it from the keyring and * destroying any subkeys embedded in it. */ if (WARN_ON_ONCE(!sb->s_master_keys)) return; spin_lock(&sb->s_master_keys->lock); hlist_del_rcu(&mk->mk_node); spin_unlock(&sb->s_master_keys->lock); /* * ->mk_active_refs == 0 implies that ->mk_present is false and * ->mk_decrypted_inodes is empty. */ WARN_ON_ONCE(mk->mk_present); WARN_ON_ONCE(!list_empty(&mk->mk_decrypted_inodes)); for (i = 0; i <= FSCRYPT_MODE_MAX; i++) { fscrypt_destroy_prepared_key( sb, &mk->mk_direct_keys[i]); fscrypt_destroy_prepared_key( sb, &mk->mk_iv_ino_lblk_64_keys[i]); fscrypt_destroy_prepared_key( sb, &mk->mk_iv_ino_lblk_32_keys[i]); } memzero_explicit(&mk->mk_ino_hash_key, sizeof(mk->mk_ino_hash_key)); mk->mk_ino_hash_key_initialized = false; /* Drop the structural ref associated with the active refs. */ fscrypt_put_master_key(mk); } /* * This transitions the key state from present to incompletely removed, and then * potentially to absent (depending on whether inodes remain). */ static void fscrypt_initiate_key_removal(struct super_block *sb, struct fscrypt_master_key *mk) { WRITE_ONCE(mk->mk_present, false); wipe_master_key_secret(&mk->mk_secret); fscrypt_put_master_key_activeref(sb, mk); } static inline bool valid_key_spec(const struct fscrypt_key_specifier *spec) { if (spec->__reserved) return false; return master_key_spec_len(spec) != 0; } static int fscrypt_user_key_instantiate(struct key *key, struct key_preparsed_payload *prep) { /* * We just charge FSCRYPT_MAX_RAW_KEY_SIZE bytes to the user's key quota * for each key, regardless of the exact key size. The amount of memory * actually used is greater than the size of the raw key anyway. */ return key_payload_reserve(key, FSCRYPT_MAX_RAW_KEY_SIZE); } static void fscrypt_user_key_describe(const struct key *key, struct seq_file *m) { seq_puts(m, key->description); } /* * Type of key in ->mk_users. Each key of this type represents a particular * user who has added a particular master key. * * Note that the name of this key type really should be something like * ".fscrypt-user" instead of simply ".fscrypt". But the shorter name is chosen * mainly for simplicity of presentation in /proc/keys when read by a non-root * user. And it is expected to be rare that a key is actually added by multiple * users, since users should keep their encryption keys confidential. */ static struct key_type key_type_fscrypt_user = { .name = ".fscrypt", .instantiate = fscrypt_user_key_instantiate, .describe = fscrypt_user_key_describe, }; #define FSCRYPT_MK_USERS_DESCRIPTION_SIZE \ (CONST_STRLEN("fscrypt-") + 2 * FSCRYPT_KEY_IDENTIFIER_SIZE + \ CONST_STRLEN("-users") + 1) #define FSCRYPT_MK_USER_DESCRIPTION_SIZE \ (2 * FSCRYPT_KEY_IDENTIFIER_SIZE + CONST_STRLEN(".uid.") + 10 + 1) static void format_mk_users_keyring_description( char description[FSCRYPT_MK_USERS_DESCRIPTION_SIZE], const u8 mk_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]) { sprintf(description, "fscrypt-%*phN-users", FSCRYPT_KEY_IDENTIFIER_SIZE, mk_identifier); } static void format_mk_user_description( char description[FSCRYPT_MK_USER_DESCRIPTION_SIZE], const u8 mk_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]) { sprintf(description, "%*phN.uid.%u", FSCRYPT_KEY_IDENTIFIER_SIZE, mk_identifier, __kuid_val(current_fsuid())); } /* Create ->s_master_keys if needed. Synchronized by fscrypt_add_key_mutex. */ static int allocate_filesystem_keyring(struct super_block *sb) { struct fscrypt_keyring *keyring; if (sb->s_master_keys) return 0; keyring = kzalloc(sizeof(*keyring), GFP_KERNEL); if (!keyring) return -ENOMEM; spin_lock_init(&keyring->lock); /* * Pairs with the smp_load_acquire() in fscrypt_find_master_key(). * I.e., here we publish ->s_master_keys with a RELEASE barrier so that * concurrent tasks can ACQUIRE it. */ smp_store_release(&sb->s_master_keys, keyring); return 0; } /* * Release all encryption keys that have been added to the filesystem, along * with the keyring that contains them. * * This is called at unmount time, after all potentially-encrypted inodes have * been evicted. The filesystem's underlying block device(s) are still * available at this time; this is important because after user file accesses * have been allowed, this function may need to evict keys from the keyslots of * an inline crypto engine, which requires the block device(s). */ void fscrypt_destroy_keyring(struct super_block *sb) { struct fscrypt_keyring *keyring = sb->s_master_keys; size_t i; if (!keyring) return; for (i = 0; i < ARRAY_SIZE(keyring->key_hashtable); i++) { struct hlist_head *bucket = &keyring->key_hashtable[i]; struct fscrypt_master_key *mk; struct hlist_node *tmp; hlist_for_each_entry_safe(mk, tmp, bucket, mk_node) { /* * Since all potentially-encrypted inodes were already * evicted, every key remaining in the keyring should * have an empty inode list, and should only still be in * the keyring due to the single active ref associated * with ->mk_present. There should be no structural * refs beyond the one associated with the active ref. */ WARN_ON_ONCE(refcount_read(&mk->mk_active_refs) != 1); WARN_ON_ONCE(refcount_read(&mk->mk_struct_refs) != 1); WARN_ON_ONCE(!mk->mk_present); fscrypt_initiate_key_removal(sb, mk); } } kfree_sensitive(keyring); sb->s_master_keys = NULL; } static struct hlist_head * fscrypt_mk_hash_bucket(struct fscrypt_keyring *keyring, const struct fscrypt_key_specifier *mk_spec) { /* * Since key specifiers should be "random" values, it is sufficient to * use a trivial hash function that just takes the first several bits of * the key specifier. */ unsigned long i = get_unaligned((unsigned long *)&mk_spec->u); return &keyring->key_hashtable[i % ARRAY_SIZE(keyring->key_hashtable)]; } /* * Find the specified master key struct in ->s_master_keys and take a structural * ref to it. The structural ref guarantees that the key struct continues to * exist, but it does *not* guarantee that ->s_master_keys continues to contain * the key struct. The structural ref needs to be dropped by * fscrypt_put_master_key(). Returns NULL if the key struct is not found. */ struct fscrypt_master_key * fscrypt_find_master_key(struct super_block *sb, const struct fscrypt_key_specifier *mk_spec) { struct fscrypt_keyring *keyring; struct hlist_head *bucket; struct fscrypt_master_key *mk; /* * Pairs with the smp_store_release() in allocate_filesystem_keyring(). * I.e., another task can publish ->s_master_keys concurrently, * executing a RELEASE barrier. We need to use smp_load_acquire() here * to safely ACQUIRE the memory the other task published. */ keyring = smp_load_acquire(&sb->s_master_keys); if (keyring == NULL) return NULL; /* No keyring yet, so no keys yet. */ bucket = fscrypt_mk_hash_bucket(keyring, mk_spec); rcu_read_lock(); switch (mk_spec->type) { case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR: hlist_for_each_entry_rcu(mk, bucket, mk_node) { if (mk->mk_spec.type == FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR && memcmp(mk->mk_spec.u.descriptor, mk_spec->u.descriptor, FSCRYPT_KEY_DESCRIPTOR_SIZE) == 0 && refcount_inc_not_zero(&mk->mk_struct_refs)) goto out; } break; case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER: hlist_for_each_entry_rcu(mk, bucket, mk_node) { if (mk->mk_spec.type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER && memcmp(mk->mk_spec.u.identifier, mk_spec->u.identifier, FSCRYPT_KEY_IDENTIFIER_SIZE) == 0 && refcount_inc_not_zero(&mk->mk_struct_refs)) goto out; } break; } mk = NULL; out: rcu_read_unlock(); return mk; } static int allocate_master_key_users_keyring(struct fscrypt_master_key *mk) { char description[FSCRYPT_MK_USERS_DESCRIPTION_SIZE]; struct key *keyring; format_mk_users_keyring_description(description, mk->mk_spec.u.identifier); keyring = keyring_alloc(description, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), KEY_POS_SEARCH | KEY_USR_SEARCH | KEY_USR_READ | KEY_USR_VIEW, KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); mk->mk_users = keyring; return 0; } /* * Find the current user's "key" in the master key's ->mk_users. * Returns ERR_PTR(-ENOKEY) if not found. */ static struct key *find_master_key_user(struct fscrypt_master_key *mk) { char description[FSCRYPT_MK_USER_DESCRIPTION_SIZE]; key_ref_t keyref; format_mk_user_description(description, mk->mk_spec.u.identifier); /* * We need to mark the keyring reference as "possessed" so that we * acquire permission to search it, via the KEY_POS_SEARCH permission. */ keyref = keyring_search(make_key_ref(mk->mk_users, true /*possessed*/), &key_type_fscrypt_user, description, false); if (IS_ERR(keyref)) { if (PTR_ERR(keyref) == -EAGAIN || /* not found */ PTR_ERR(keyref) == -EKEYREVOKED) /* recently invalidated */ keyref = ERR_PTR(-ENOKEY); return ERR_CAST(keyref); } return key_ref_to_ptr(keyref); } /* * Give the current user a "key" in ->mk_users. This charges the user's quota * and marks the master key as added by the current user, so that it cannot be * removed by another user with the key. Either ->mk_sem must be held for * write, or the master key must be still undergoing initialization. */ static int add_master_key_user(struct fscrypt_master_key *mk) { char description[FSCRYPT_MK_USER_DESCRIPTION_SIZE]; struct key *mk_user; int err; format_mk_user_description(description, mk->mk_spec.u.identifier); mk_user = key_alloc(&key_type_fscrypt_user, description, current_fsuid(), current_gid(), current_cred(), KEY_POS_SEARCH | KEY_USR_VIEW, 0, NULL); if (IS_ERR(mk_user)) return PTR_ERR(mk_user); err = key_instantiate_and_link(mk_user, NULL, 0, mk->mk_users, NULL); key_put(mk_user); return err; } /* * Remove the current user's "key" from ->mk_users. * ->mk_sem must be held for write. * * Returns 0 if removed, -ENOKEY if not found, or another -errno code. */ static int remove_master_key_user(struct fscrypt_master_key *mk) { struct key *mk_user; int err; mk_user = find_master_key_user(mk); if (IS_ERR(mk_user)) return PTR_ERR(mk_user); err = key_unlink(mk->mk_users, mk_user); key_put(mk_user); return err; } /* * Allocate a new fscrypt_master_key, transfer the given secret over to it, and * insert it into sb->s_master_keys. */ static int add_new_master_key(struct super_block *sb, struct fscrypt_master_key_secret *secret, const struct fscrypt_key_specifier *mk_spec) { struct fscrypt_keyring *keyring = sb->s_master_keys; struct fscrypt_master_key *mk; int err; mk = kzalloc(sizeof(*mk), GFP_KERNEL); if (!mk) return -ENOMEM; init_rwsem(&mk->mk_sem); refcount_set(&mk->mk_struct_refs, 1); mk->mk_spec = *mk_spec; INIT_LIST_HEAD(&mk->mk_decrypted_inodes); spin_lock_init(&mk->mk_decrypted_inodes_lock); if (mk_spec->type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) { err = allocate_master_key_users_keyring(mk); if (err) goto out_put; err = add_master_key_user(mk); if (err) goto out_put; } move_master_key_secret(&mk->mk_secret, secret); mk->mk_present = true; refcount_set(&mk->mk_active_refs, 1); /* ->mk_present is true */ spin_lock(&keyring->lock); hlist_add_head_rcu(&mk->mk_node, fscrypt_mk_hash_bucket(keyring, mk_spec)); spin_unlock(&keyring->lock); return 0; out_put: fscrypt_put_master_key(mk); return err; } #define KEY_DEAD 1 static int add_existing_master_key(struct fscrypt_master_key *mk, struct fscrypt_master_key_secret *secret) { int err; /* * If the current user is already in ->mk_users, then there's nothing to * do. Otherwise, we need to add the user to ->mk_users. (Neither is * applicable for v1 policy keys, which have NULL ->mk_users.) */ if (mk->mk_users) { struct key *mk_user = find_master_key_user(mk); if (mk_user != ERR_PTR(-ENOKEY)) { if (IS_ERR(mk_user)) return PTR_ERR(mk_user); key_put(mk_user); return 0; } err = add_master_key_user(mk); if (err) return err; } /* If the key is incompletely removed, make it present again. */ if (!mk->mk_present) { if (!refcount_inc_not_zero(&mk->mk_active_refs)) { /* * Raced with the last active ref being dropped, so the * key has become, or is about to become, "absent". * Therefore, we need to allocate a new key struct. */ return KEY_DEAD; } move_master_key_secret(&mk->mk_secret, secret); WRITE_ONCE(mk->mk_present, true); } return 0; } static int do_add_master_key(struct super_block *sb, struct fscrypt_master_key_secret *secret, const struct fscrypt_key_specifier *mk_spec) { static DEFINE_MUTEX(fscrypt_add_key_mutex); struct fscrypt_master_key *mk; int err; mutex_lock(&fscrypt_add_key_mutex); /* serialize find + link */ mk = fscrypt_find_master_key(sb, mk_spec); if (!mk) { /* Didn't find the key in ->s_master_keys. Add it. */ err = allocate_filesystem_keyring(sb); if (!err) err = add_new_master_key(sb, secret, mk_spec); } else { /* * Found the key in ->s_master_keys. Add the user to ->mk_users * if needed, and make the key "present" again if possible. */ down_write(&mk->mk_sem); err = add_existing_master_key(mk, secret); up_write(&mk->mk_sem); if (err == KEY_DEAD) { /* * We found a key struct, but it's already been fully * removed. Ignore the old struct and add a new one. * fscrypt_add_key_mutex means we don't need to worry * about concurrent adds. */ err = add_new_master_key(sb, secret, mk_spec); } fscrypt_put_master_key(mk); } mutex_unlock(&fscrypt_add_key_mutex); return err; } static int add_master_key(struct super_block *sb, struct fscrypt_master_key_secret *secret, struct fscrypt_key_specifier *key_spec) { int err; if (key_spec->type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) { u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]; u8 *kdf_key = secret->bytes; unsigned int kdf_key_size = secret->size; u8 keyid_kdf_ctx = HKDF_CONTEXT_KEY_IDENTIFIER_FOR_RAW_KEY; /* * For raw keys, the fscrypt master key is used directly as the * fscrypt KDF key. For hardware-wrapped keys, we have to pass * the master key to the hardware to derive the KDF key, which * is then only used to derive non-file-contents subkeys. */ if (secret->is_hw_wrapped) { err = fscrypt_derive_sw_secret(sb, secret->bytes, secret->size, sw_secret); if (err) return err; kdf_key = sw_secret; kdf_key_size = sizeof(sw_secret); /* * To avoid weird behavior if someone manages to * determine sw_secret and add it as a raw key, ensure * that hardware-wrapped keys and raw keys will have * different key identifiers by deriving their key * identifiers using different KDF contexts. */ keyid_kdf_ctx = HKDF_CONTEXT_KEY_IDENTIFIER_FOR_HW_WRAPPED_KEY; } fscrypt_init_hkdf(&secret->hkdf, kdf_key, kdf_key_size); /* * Now that the KDF context is initialized, the raw KDF key is * no longer needed. */ memzero_explicit(kdf_key, kdf_key_size); /* Calculate the key identifier */ fscrypt_hkdf_expand(&secret->hkdf, keyid_kdf_ctx, NULL, 0, key_spec->u.identifier, FSCRYPT_KEY_IDENTIFIER_SIZE); } return do_add_master_key(sb, secret, key_spec); } /* * Validate the size of an fscrypt master key being added. Note that this is * just an initial check, as we don't know which ciphers will be used yet. * There is a stricter size check later when the key is actually used by a file. */ static inline bool fscrypt_valid_key_size(size_t size, u32 add_key_flags) { u32 max_size = (add_key_flags & FSCRYPT_ADD_KEY_FLAG_HW_WRAPPED) ? FSCRYPT_MAX_HW_WRAPPED_KEY_SIZE : FSCRYPT_MAX_RAW_KEY_SIZE; return size >= FSCRYPT_MIN_KEY_SIZE && size <= max_size; } static int fscrypt_provisioning_key_preparse(struct key_preparsed_payload *prep) { const struct fscrypt_provisioning_key_payload *payload = prep->data; if (prep->datalen < sizeof(*payload)) return -EINVAL; if (!fscrypt_valid_key_size(prep->datalen - sizeof(*payload), payload->flags)) return -EINVAL; if (payload->type != FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR && payload->type != FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) return -EINVAL; if (payload->flags & ~FSCRYPT_ADD_KEY_FLAG_HW_WRAPPED) return -EINVAL; prep->payload.data[0] = kmemdup(payload, prep->datalen, GFP_KERNEL); if (!prep->payload.data[0]) return -ENOMEM; prep->quotalen = prep->datalen; return 0; } static void fscrypt_provisioning_key_free_preparse( struct key_preparsed_payload *prep) { kfree_sensitive(prep->payload.data[0]); } static void fscrypt_provisioning_key_describe(const struct key *key, struct seq_file *m) { seq_puts(m, key->description); if (key_is_positive(key)) { const struct fscrypt_provisioning_key_payload *payload = key->payload.data[0]; seq_printf(m, ": %u [%u]", key->datalen, payload->type); } } static void fscrypt_provisioning_key_destroy(struct key *key) { kfree_sensitive(key->payload.data[0]); } static struct key_type key_type_fscrypt_provisioning = { .name = "fscrypt-provisioning", .preparse = fscrypt_provisioning_key_preparse, .free_preparse = fscrypt_provisioning_key_free_preparse, .instantiate = generic_key_instantiate, .describe = fscrypt_provisioning_key_describe, .destroy = fscrypt_provisioning_key_destroy, }; /* * Retrieve the key from the Linux keyring key specified by 'key_id', and store * it into 'secret'. * * The key must be of type "fscrypt-provisioning" and must have the 'type' and * 'flags' field of the payload set to the given values, indicating that the key * is intended for use for the specified purpose. We don't use the "logon" key * type because there's no way to completely restrict the use of such keys; they * can be used by any kernel API that accepts "logon" keys and doesn't require a * specific service prefix. * * The ability to specify the key via Linux keyring key is intended for cases * where userspace needs to re-add keys after the filesystem is unmounted and * re-mounted. Most users should just provide the key directly instead. */ static int get_keyring_key(u32 key_id, u32 type, u32 flags, struct fscrypt_master_key_secret *secret) { key_ref_t ref; struct key *key; const struct fscrypt_provisioning_key_payload *payload; int err; ref = lookup_user_key(key_id, 0, KEY_NEED_SEARCH); if (IS_ERR(ref)) return PTR_ERR(ref); key = key_ref_to_ptr(ref); if (key->type != &key_type_fscrypt_provisioning) goto bad_key; payload = key->payload.data[0]; /* * Don't allow fscrypt v1 keys to be used as v2 keys and vice versa. * Similarly, don't allow hardware-wrapped keys to be used as * non-hardware-wrapped keys and vice versa. */ if (payload->type != type || payload->flags != flags) goto bad_key; secret->size = key->datalen - sizeof(*payload); memcpy(secret->bytes, payload->raw, secret->size); err = 0; goto out_put; bad_key: err = -EKEYREJECTED; out_put: key_ref_put(ref); return err; } /* * Add a master encryption key to the filesystem, causing all files which were * encrypted with it to appear "unlocked" (decrypted) when accessed. * * When adding a key for use by v1 encryption policies, this ioctl is * privileged, and userspace must provide the 'key_descriptor'. * * When adding a key for use by v2+ encryption policies, this ioctl is * unprivileged. This is needed, in general, to allow non-root users to use * encryption without encountering the visibility problems of process-subscribed * keyrings and the inability to properly remove keys. This works by having * each key identified by its cryptographically secure hash --- the * 'key_identifier'. The cryptographic hash ensures that a malicious user * cannot add the wrong key for a given identifier. Furthermore, each added key * is charged to the appropriate user's quota for the keyrings service, which * prevents a malicious user from adding too many keys. Finally, we forbid a * user from removing a key while other users have added it too, which prevents * a user who knows another user's key from causing a denial-of-service by * removing it at an inopportune time. (We tolerate that a user who knows a key * can prevent other users from removing it.) * * For more details, see the "FS_IOC_ADD_ENCRYPTION_KEY" section of * Documentation/filesystems/fscrypt.rst. */ int fscrypt_ioctl_add_key(struct file *filp, void __user *_uarg) { struct super_block *sb = file_inode(filp)->i_sb; struct fscrypt_add_key_arg __user *uarg = _uarg; struct fscrypt_add_key_arg arg; struct fscrypt_master_key_secret secret; int err; if (copy_from_user(&arg, uarg, sizeof(arg))) return -EFAULT; if (!valid_key_spec(&arg.key_spec)) return -EINVAL; if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved))) return -EINVAL; /* * Only root can add keys that are identified by an arbitrary descriptor * rather than by a cryptographic hash --- since otherwise a malicious * user could add the wrong key. */ if (arg.key_spec.type == FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR && !capable(CAP_SYS_ADMIN)) return -EACCES; memset(&secret, 0, sizeof(secret)); if (arg.flags) { if (arg.flags & ~FSCRYPT_ADD_KEY_FLAG_HW_WRAPPED) return -EINVAL; if (arg.key_spec.type != FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) return -EINVAL; secret.is_hw_wrapped = true; } if (arg.key_id) { if (arg.raw_size != 0) return -EINVAL; err = get_keyring_key(arg.key_id, arg.key_spec.type, arg.flags, &secret); if (err) goto out_wipe_secret; } else { if (!fscrypt_valid_key_size(arg.raw_size, arg.flags)) return -EINVAL; secret.size = arg.raw_size; err = -EFAULT; if (copy_from_user(secret.bytes, uarg->raw, secret.size)) goto out_wipe_secret; } err = add_master_key(sb, &secret, &arg.key_spec); if (err) goto out_wipe_secret; /* Return the key identifier to userspace, if applicable */ err = -EFAULT; if (arg.key_spec.type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER && copy_to_user(uarg->key_spec.u.identifier, arg.key_spec.u.identifier, FSCRYPT_KEY_IDENTIFIER_SIZE)) goto out_wipe_secret; err = 0; out_wipe_secret: wipe_master_key_secret(&secret); return err; } EXPORT_SYMBOL_GPL(fscrypt_ioctl_add_key); static void fscrypt_get_test_dummy_secret(struct fscrypt_master_key_secret *secret) { static u8 test_key[FSCRYPT_MAX_RAW_KEY_SIZE]; get_random_once(test_key, sizeof(test_key)); memset(secret, 0, sizeof(*secret)); secret->size = sizeof(test_key); memcpy(secret->bytes, test_key, sizeof(test_key)); } void fscrypt_get_test_dummy_key_identifier( u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]) { struct fscrypt_master_key_secret secret; fscrypt_get_test_dummy_secret(&secret); fscrypt_init_hkdf(&secret.hkdf, secret.bytes, secret.size); fscrypt_hkdf_expand(&secret.hkdf, HKDF_CONTEXT_KEY_IDENTIFIER_FOR_RAW_KEY, NULL, 0, key_identifier, FSCRYPT_KEY_IDENTIFIER_SIZE); wipe_master_key_secret(&secret); } /** * fscrypt_add_test_dummy_key() - add the test dummy encryption key * @sb: the filesystem instance to add the key to * @key_spec: the key specifier of the test dummy encryption key * * Add the key for the test_dummy_encryption mount option to the filesystem. To * prevent misuse of this mount option, a per-boot random key is used instead of * a hardcoded one. This makes it so that any encrypted files created using * this option won't be accessible after a reboot. * * Return: 0 on success, -errno on failure */ int fscrypt_add_test_dummy_key(struct super_block *sb, struct fscrypt_key_specifier *key_spec) { struct fscrypt_master_key_secret secret; int err; fscrypt_get_test_dummy_secret(&secret); err = add_master_key(sb, &secret, key_spec); wipe_master_key_secret(&secret); return err; } /* * Verify that the current user has added a master key with the given identifier * (returns -ENOKEY if not). This is needed to prevent a user from encrypting * their files using some other user's key which they don't actually know. * Cryptographically this isn't much of a problem, but the semantics of this * would be a bit weird, so it's best to just forbid it. * * The system administrator (CAP_FOWNER) can override this, which should be * enough for any use cases where encryption policies are being set using keys * that were chosen ahead of time but aren't available at the moment. * * Note that the key may have already removed by the time this returns, but * that's okay; we just care whether the key was there at some point. * * Return: 0 if the key is added, -ENOKEY if it isn't, or another -errno code */ int fscrypt_verify_key_added(struct super_block *sb, const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]) { struct fscrypt_key_specifier mk_spec; struct fscrypt_master_key *mk; struct key *mk_user; int err; mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER; memcpy(mk_spec.u.identifier, identifier, FSCRYPT_KEY_IDENTIFIER_SIZE); mk = fscrypt_find_master_key(sb, &mk_spec); if (!mk) { err = -ENOKEY; goto out; } down_read(&mk->mk_sem); mk_user = find_master_key_user(mk); if (IS_ERR(mk_user)) { err = PTR_ERR(mk_user); } else { key_put(mk_user); err = 0; } up_read(&mk->mk_sem); fscrypt_put_master_key(mk); out: if (err == -ENOKEY && capable(CAP_FOWNER)) err = 0; return err; } /* * Try to evict the inode's dentries from the dentry cache. If the inode is a * directory, then it can have at most one dentry; however, that dentry may be * pinned by child dentries, so first try to evict the children too. */ static void shrink_dcache_inode(struct inode *inode) { struct dentry *dentry; if (S_ISDIR(inode->i_mode)) { dentry = d_find_any_alias(inode); if (dentry) { shrink_dcache_parent(dentry); dput(dentry); } } d_prune_aliases(inode); } static void evict_dentries_for_decrypted_inodes(struct fscrypt_master_key *mk) { struct fscrypt_inode_info *ci; struct inode *inode; struct inode *toput_inode = NULL; spin_lock(&mk->mk_decrypted_inodes_lock); list_for_each_entry(ci, &mk->mk_decrypted_inodes, ci_master_key_link) { inode = ci->ci_inode; spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) { spin_unlock(&inode->i_lock); continue; } __iget(inode); spin_unlock(&inode->i_lock); spin_unlock(&mk->mk_decrypted_inodes_lock); shrink_dcache_inode(inode); iput(toput_inode); toput_inode = inode; spin_lock(&mk->mk_decrypted_inodes_lock); } spin_unlock(&mk->mk_decrypted_inodes_lock); iput(toput_inode); } static int check_for_busy_inodes(struct super_block *sb, struct fscrypt_master_key *mk) { struct list_head *pos; size_t busy_count = 0; unsigned long ino; char ino_str[50] = ""; spin_lock(&mk->mk_decrypted_inodes_lock); list_for_each(pos, &mk->mk_decrypted_inodes) busy_count++; if (busy_count == 0) { spin_unlock(&mk->mk_decrypted_inodes_lock); return 0; } { /* select an example file to show for debugging purposes */ struct inode *inode = list_first_entry(&mk->mk_decrypted_inodes, struct fscrypt_inode_info, ci_master_key_link)->ci_inode; ino = inode->i_ino; } spin_unlock(&mk->mk_decrypted_inodes_lock); /* If the inode is currently being created, ino may still be 0. */ if (ino) snprintf(ino_str, sizeof(ino_str), ", including ino %lu", ino); fscrypt_warn(NULL, "%s: %zu inode(s) still busy after removing key with %s %*phN%s", sb->s_id, busy_count, master_key_spec_type(&mk->mk_spec), master_key_spec_len(&mk->mk_spec), (u8 *)&mk->mk_spec.u, ino_str); return -EBUSY; } static int try_to_lock_encrypted_files(struct super_block *sb, struct fscrypt_master_key *mk) { int err1; int err2; /* * An inode can't be evicted while it is dirty or has dirty pages. * Thus, we first have to clean the inodes in ->mk_decrypted_inodes. * * Just do it the easy way: call sync_filesystem(). It's overkill, but * it works, and it's more important to minimize the amount of caches we * drop than the amount of data we sync. Also, unprivileged users can * already call sync_filesystem() via sys_syncfs() or sys_sync(). */ down_read(&sb->s_umount); err1 = sync_filesystem(sb); up_read(&sb->s_umount); /* If a sync error occurs, still try to evict as much as possible. */ /* * Inodes are pinned by their dentries, so we have to evict their * dentries. shrink_dcache_sb() would suffice, but would be overkill * and inappropriate for use by unprivileged users. So instead go * through the inodes' alias lists and try to evict each dentry. */ evict_dentries_for_decrypted_inodes(mk); /* * evict_dentries_for_decrypted_inodes() already iput() each inode in * the list; any inodes for which that dropped the last reference will * have been evicted due to fscrypt_drop_inode() detecting the key * removal and telling the VFS to evict the inode. So to finish, we * just need to check whether any inodes couldn't be evicted. */ err2 = check_for_busy_inodes(sb, mk); return err1 ?: err2; } /* * Try to remove an fscrypt master encryption key. * * FS_IOC_REMOVE_ENCRYPTION_KEY (all_users=false) removes the current user's * claim to the key, then removes the key itself if no other users have claims. * FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS (all_users=true) always removes the * key itself. * * To "remove the key itself", first we transition the key to the "incompletely * removed" state, so that no more inodes can be unlocked with it. Then we try * to evict all cached inodes that had been unlocked with the key. * * If all inodes were evicted, then we unlink the fscrypt_master_key from the * keyring. Otherwise it remains in the keyring in the "incompletely removed" * state where it tracks the list of remaining inodes. Userspace can execute * the ioctl again later to retry eviction, or alternatively can re-add the key. * * For more details, see the "Removing keys" section of * Documentation/filesystems/fscrypt.rst. */ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users) { struct super_block *sb = file_inode(filp)->i_sb; struct fscrypt_remove_key_arg __user *uarg = _uarg; struct fscrypt_remove_key_arg arg; struct fscrypt_master_key *mk; u32 status_flags = 0; int err; bool inodes_remain; if (copy_from_user(&arg, uarg, sizeof(arg))) return -EFAULT; if (!valid_key_spec(&arg.key_spec)) return -EINVAL; if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved))) return -EINVAL; /* * Only root can add and remove keys that are identified by an arbitrary * descriptor rather than by a cryptographic hash. */ if (arg.key_spec.type == FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR && !capable(CAP_SYS_ADMIN)) return -EACCES; /* Find the key being removed. */ mk = fscrypt_find_master_key(sb, &arg.key_spec); if (!mk) return -ENOKEY; down_write(&mk->mk_sem); /* If relevant, remove current user's (or all users) claim to the key */ if (mk->mk_users && mk->mk_users->keys.nr_leaves_on_tree != 0) { if (all_users) err = keyring_clear(mk->mk_users); else err = remove_master_key_user(mk); if (err) { up_write(&mk->mk_sem); goto out_put_key; } if (mk->mk_users->keys.nr_leaves_on_tree != 0) { /* * Other users have still added the key too. We removed * the current user's claim to the key, but we still * can't remove the key itself. */ status_flags |= FSCRYPT_KEY_REMOVAL_STATUS_FLAG_OTHER_USERS; err = 0; up_write(&mk->mk_sem); goto out_put_key; } } /* No user claims remaining. Initiate removal of the key. */ err = -ENOKEY; if (mk->mk_present) { fscrypt_initiate_key_removal(sb, mk); err = 0; } inodes_remain = refcount_read(&mk->mk_active_refs) > 0; up_write(&mk->mk_sem); if (inodes_remain) { /* Some inodes still reference this key; try to evict them. */ err = try_to_lock_encrypted_files(sb, mk); if (err == -EBUSY) { status_flags |= FSCRYPT_KEY_REMOVAL_STATUS_FLAG_FILES_BUSY; err = 0; } } /* * We return 0 if we successfully did something: removed a claim to the * key, initiated removal of the key, or tried locking the files again. * Users need to check the informational status flags if they care * whether the key has been fully removed including all files locked. */ out_put_key: fscrypt_put_master_key(mk); if (err == 0) err = put_user(status_flags, &uarg->removal_status_flags); return err; } int fscrypt_ioctl_remove_key(struct file *filp, void __user *uarg) { return do_remove_key(filp, uarg, false); } EXPORT_SYMBOL_GPL(fscrypt_ioctl_remove_key); int fscrypt_ioctl_remove_key_all_users(struct file *filp, void __user *uarg) { if (!capable(CAP_SYS_ADMIN)) return -EACCES; return do_remove_key(filp, uarg, true); } EXPORT_SYMBOL_GPL(fscrypt_ioctl_remove_key_all_users); /* * Retrieve the status of an fscrypt master encryption key. * * We set ->status to indicate whether the key is absent, present, or * incompletely removed. (For an explanation of what these statuses mean and * how they are represented internally, see struct fscrypt_master_key.) This * field allows applications to easily determine the status of an encrypted * directory without using a hack such as trying to open a regular file in it * (which can confuse the "incompletely removed" status with absent or present). * * In addition, for v2 policy keys we allow applications to determine, via * ->status_flags and ->user_count, whether the key has been added by the * current user, by other users, or by both. Most applications should not need * this, since ordinarily only one user should know a given key. However, if a * secret key is shared by multiple users, applications may wish to add an * already-present key to prevent other users from removing it. This ioctl can * be used to check whether that really is the case before the work is done to * add the key --- which might e.g. require prompting the user for a passphrase. * * For more details, see the "FS_IOC_GET_ENCRYPTION_KEY_STATUS" section of * Documentation/filesystems/fscrypt.rst. */ int fscrypt_ioctl_get_key_status(struct file *filp, void __user *uarg) { struct super_block *sb = file_inode(filp)->i_sb; struct fscrypt_get_key_status_arg arg; struct fscrypt_master_key *mk; int err; if (copy_from_user(&arg, uarg, sizeof(arg))) return -EFAULT; if (!valid_key_spec(&arg.key_spec)) return -EINVAL; if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved))) return -EINVAL; arg.status_flags = 0; arg.user_count = 0; memset(arg.__out_reserved, 0, sizeof(arg.__out_reserved)); mk = fscrypt_find_master_key(sb, &arg.key_spec); if (!mk) { arg.status = FSCRYPT_KEY_STATUS_ABSENT; err = 0; goto out; } down_read(&mk->mk_sem); if (!mk->mk_present) { arg.status = refcount_read(&mk->mk_active_refs) > 0 ? FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED : FSCRYPT_KEY_STATUS_ABSENT /* raced with full removal */; err = 0; goto out_release_key; } arg.status = FSCRYPT_KEY_STATUS_PRESENT; if (mk->mk_users) { struct key *mk_user; arg.user_count = mk->mk_users->keys.nr_leaves_on_tree; mk_user = find_master_key_user(mk); if (!IS_ERR(mk_user)) { arg.status_flags |= FSCRYPT_KEY_STATUS_FLAG_ADDED_BY_SELF; key_put(mk_user); } else if (mk_user != ERR_PTR(-ENOKEY)) { err = PTR_ERR(mk_user); goto out_release_key; } } err = 0; out_release_key: up_read(&mk->mk_sem); fscrypt_put_master_key(mk); out: if (!err && copy_to_user(uarg, &arg, sizeof(arg))) err = -EFAULT; return err; } EXPORT_SYMBOL_GPL(fscrypt_ioctl_get_key_status); int __init fscrypt_init_keyring(void) { int err; err = register_key_type(&key_type_fscrypt_user); if (err) return err; err = register_key_type(&key_type_fscrypt_provisioning); if (err) goto err_unregister_fscrypt_user; return 0; err_unregister_fscrypt_user: unregister_key_type(&key_type_fscrypt_user); return err; } |
| 381 381 13 1 1 1 12 2 11 12 11 12 23 23 11 11 11 11 28 2 26 2 25 412 412 382 1 380 408 408 408 409 406 405 26 1 25 25 23 25 13 7 25 25 18 16 23 19 25 411 411 410 387 388 409 31 409 405 403 383 382 383 383 383 412 35 412 381 412 412 381 412 412 412 382 371 412 373 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Information interface for ALSA driver * Copyright (c) by Jaroslav Kysela <perex@perex.cz> */ #include <linux/init.h> #include <linux/time.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/module.h> #include <sound/core.h> #include <sound/minors.h> #include <sound/info.h> #include <linux/utsname.h> #include <linux/proc_fs.h> #include <linux/mutex.h> int snd_info_check_reserved_words(const char *str) { static const char * const reserved[] = { "version", "meminfo", "memdebug", "detect", "devices", "oss", "cards", "timers", "synth", "pcm", "seq", NULL }; const char * const *xstr = reserved; while (*xstr) { if (!strcmp(*xstr, str)) return 0; xstr++; } if (!strncmp(str, "card", 4)) return 0; return 1; } static DEFINE_MUTEX(info_mutex); struct snd_info_private_data { struct snd_info_buffer *rbuffer; struct snd_info_buffer *wbuffer; struct snd_info_entry *entry; void *file_private_data; }; static int snd_info_version_init(void); static void snd_info_clear_entries(struct snd_info_entry *entry); /* */ static struct snd_info_entry *snd_proc_root; struct snd_info_entry *snd_seq_root; EXPORT_SYMBOL(snd_seq_root); #ifdef CONFIG_SND_OSSEMUL struct snd_info_entry *snd_oss_root; #endif static int alloc_info_private(struct snd_info_entry *entry, struct snd_info_private_data **ret) { struct snd_info_private_data *data; if (!entry || !entry->p) return -ENODEV; if (!try_module_get(entry->module)) return -EFAULT; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) { module_put(entry->module); return -ENOMEM; } data->entry = entry; *ret = data; return 0; } static bool valid_pos(loff_t pos, size_t count) { if (pos < 0 || (long) pos != pos || (ssize_t) count < 0) return false; if ((unsigned long) pos + (unsigned long) count < (unsigned long) pos) return false; return true; } /* * file ops for binary proc files */ static loff_t snd_info_entry_llseek(struct file *file, loff_t offset, int orig) { struct snd_info_private_data *data; struct snd_info_entry *entry; loff_t size; data = file->private_data; entry = data->entry; guard(mutex)(&entry->access); if (entry->c.ops->llseek) return entry->c.ops->llseek(entry, data->file_private_data, file, offset, orig); size = entry->size; switch (orig) { case SEEK_SET: break; case SEEK_CUR: offset += file->f_pos; break; case SEEK_END: if (!size) return -EINVAL; offset += size; break; default: return -EINVAL; } if (offset < 0) return -EINVAL; if (size && offset > size) offset = size; file->f_pos = offset; return offset; } static ssize_t snd_info_entry_read(struct file *file, char __user *buffer, size_t count, loff_t * offset) { struct snd_info_private_data *data = file->private_data; struct snd_info_entry *entry = data->entry; size_t size; loff_t pos; pos = *offset; if (!valid_pos(pos, count)) return -EIO; if (pos >= entry->size) return 0; size = entry->size - pos; size = min(count, size); size = entry->c.ops->read(entry, data->file_private_data, file, buffer, size, pos); if ((ssize_t) size > 0) *offset = pos + size; return size; } static ssize_t snd_info_entry_write(struct file *file, const char __user *buffer, size_t count, loff_t * offset) { struct snd_info_private_data *data = file->private_data; struct snd_info_entry *entry = data->entry; ssize_t size = 0; loff_t pos; pos = *offset; if (!valid_pos(pos, count)) return -EIO; if (count > 0) { size_t maxsize = entry->size - pos; count = min(count, maxsize); size = entry->c.ops->write(entry, data->file_private_data, file, buffer, count, pos); } if (size > 0) *offset = pos + size; return size; } static __poll_t snd_info_entry_poll(struct file *file, poll_table *wait) { struct snd_info_private_data *data = file->private_data; struct snd_info_entry *entry = data->entry; __poll_t mask = 0; if (entry->c.ops->poll) return entry->c.ops->poll(entry, data->file_private_data, file, wait); if (entry->c.ops->read) mask |= EPOLLIN | EPOLLRDNORM; if (entry->c.ops->write) mask |= EPOLLOUT | EPOLLWRNORM; return mask; } static long snd_info_entry_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct snd_info_private_data *data = file->private_data; struct snd_info_entry *entry = data->entry; if (!entry->c.ops->ioctl) return -ENOTTY; return entry->c.ops->ioctl(entry, data->file_private_data, file, cmd, arg); } static int snd_info_entry_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); struct snd_info_private_data *data; struct snd_info_entry *entry; data = file->private_data; if (data == NULL) return 0; entry = data->entry; if (!entry->c.ops->mmap) return -ENXIO; return entry->c.ops->mmap(entry, data->file_private_data, inode, file, vma); } static int snd_info_entry_open(struct inode *inode, struct file *file) { struct snd_info_entry *entry = pde_data(inode); struct snd_info_private_data *data; int mode, err; guard(mutex)(&info_mutex); err = alloc_info_private(entry, &data); if (err < 0) return err; mode = file->f_flags & O_ACCMODE; if (((mode == O_RDONLY || mode == O_RDWR) && !entry->c.ops->read) || ((mode == O_WRONLY || mode == O_RDWR) && !entry->c.ops->write)) { err = -ENODEV; goto error; } if (entry->c.ops->open) { err = entry->c.ops->open(entry, mode, &data->file_private_data); if (err < 0) goto error; } file->private_data = data; return 0; error: kfree(data); module_put(entry->module); return err; } static int snd_info_entry_release(struct inode *inode, struct file *file) { struct snd_info_private_data *data = file->private_data; struct snd_info_entry *entry = data->entry; if (entry->c.ops->release) entry->c.ops->release(entry, file->f_flags & O_ACCMODE, data->file_private_data); module_put(entry->module); kfree(data); return 0; } static const struct proc_ops snd_info_entry_operations = { .proc_lseek = snd_info_entry_llseek, .proc_read = snd_info_entry_read, .proc_write = snd_info_entry_write, .proc_poll = snd_info_entry_poll, .proc_ioctl = snd_info_entry_ioctl, .proc_mmap = snd_info_entry_mmap, .proc_open = snd_info_entry_open, .proc_release = snd_info_entry_release, }; /* * file ops for text proc files */ static ssize_t snd_info_text_entry_write(struct file *file, const char __user *buffer, size_t count, loff_t *offset) { struct seq_file *m = file->private_data; struct snd_info_private_data *data = m->private; struct snd_info_entry *entry = data->entry; struct snd_info_buffer *buf; loff_t pos; size_t next; if (!entry->c.text.write) return -EIO; pos = *offset; if (!valid_pos(pos, count)) return -EIO; next = pos + count; /* don't handle too large text inputs */ if (next > 16 * 1024) return -EIO; guard(mutex)(&entry->access); buf = data->wbuffer; if (!buf) { data->wbuffer = buf = kzalloc(sizeof(*buf), GFP_KERNEL); if (!buf) return -ENOMEM; } if (next > buf->len) { char *nbuf = kvzalloc(PAGE_ALIGN(next), GFP_KERNEL); if (!nbuf) return -ENOMEM; kvfree(buf->buffer); buf->buffer = nbuf; buf->len = PAGE_ALIGN(next); } if (copy_from_user(buf->buffer + pos, buffer, count)) return -EFAULT; buf->size = next; *offset = next; return count; } static int snd_info_seq_show(struct seq_file *seq, void *p) { struct snd_info_private_data *data = seq->private; struct snd_info_entry *entry = data->entry; if (!entry->c.text.read) { return -EIO; } else { data->rbuffer->buffer = (char *)seq; /* XXX hack! */ entry->c.text.read(entry, data->rbuffer); } return 0; } static int snd_info_text_entry_open(struct inode *inode, struct file *file) { struct snd_info_entry *entry = pde_data(inode); struct snd_info_private_data *data; int err; guard(mutex)(&info_mutex); err = alloc_info_private(entry, &data); if (err < 0) return err; data->rbuffer = kzalloc(sizeof(*data->rbuffer), GFP_KERNEL); if (!data->rbuffer) { err = -ENOMEM; goto error; } if (entry->size) err = single_open_size(file, snd_info_seq_show, data, entry->size); else err = single_open(file, snd_info_seq_show, data); if (err < 0) goto error; return 0; error: kfree(data->rbuffer); kfree(data); module_put(entry->module); return err; } static int snd_info_text_entry_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; struct snd_info_private_data *data = m->private; struct snd_info_entry *entry = data->entry; if (data->wbuffer && entry->c.text.write) entry->c.text.write(entry, data->wbuffer); single_release(inode, file); kfree(data->rbuffer); if (data->wbuffer) { kvfree(data->wbuffer->buffer); kfree(data->wbuffer); } module_put(entry->module); kfree(data); return 0; } static const struct proc_ops snd_info_text_entry_ops = { .proc_open = snd_info_text_entry_open, .proc_release = snd_info_text_entry_release, .proc_write = snd_info_text_entry_write, .proc_lseek = seq_lseek, .proc_read = seq_read, }; static struct snd_info_entry *create_subdir(struct module *mod, const char *name) { struct snd_info_entry *entry; entry = snd_info_create_module_entry(mod, name, NULL); if (!entry) return NULL; entry->mode = S_IFDIR | 0555; if (snd_info_register(entry) < 0) { snd_info_free_entry(entry); return NULL; } return entry; } static struct snd_info_entry * snd_info_create_entry(const char *name, struct snd_info_entry *parent, struct module *module); int __init snd_info_init(void) { snd_proc_root = snd_info_create_entry("asound", NULL, THIS_MODULE); if (!snd_proc_root) return -ENOMEM; snd_proc_root->mode = S_IFDIR | 0555; snd_proc_root->p = proc_mkdir("asound", NULL); if (!snd_proc_root->p) goto error; #ifdef CONFIG_SND_OSSEMUL snd_oss_root = create_subdir(THIS_MODULE, "oss"); if (!snd_oss_root) goto error; #endif #if IS_ENABLED(CONFIG_SND_SEQUENCER) snd_seq_root = create_subdir(THIS_MODULE, "seq"); if (!snd_seq_root) goto error; #endif if (snd_info_version_init() < 0 || snd_minor_info_init() < 0 || snd_minor_info_oss_init() < 0 || snd_card_info_init() < 0 || snd_info_minor_register() < 0) goto error; return 0; error: snd_info_free_entry(snd_proc_root); return -ENOMEM; } int __exit snd_info_done(void) { snd_info_free_entry(snd_proc_root); return 0; } static void snd_card_id_read(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { struct snd_card *card = entry->private_data; snd_iprintf(buffer, "%s\n", card->id); } /* * create a card proc file * called from init.c */ int snd_info_card_create(struct snd_card *card) { char str[8]; struct snd_info_entry *entry; if (snd_BUG_ON(!card)) return -ENXIO; sprintf(str, "card%i", card->number); entry = create_subdir(card->module, str); if (!entry) return -ENOMEM; card->proc_root = entry; return snd_card_ro_proc_new(card, "id", card, snd_card_id_read); } /* * register the card proc file * called from init.c * can be called multiple times for reinitialization */ int snd_info_card_register(struct snd_card *card) { struct proc_dir_entry *p; int err; if (snd_BUG_ON(!card)) return -ENXIO; err = snd_info_register(card->proc_root); if (err < 0) return err; if (!strcmp(card->id, card->proc_root->name)) return 0; if (card->proc_root_link) return 0; p = proc_symlink(card->id, snd_proc_root->p, card->proc_root->name); if (!p) return -ENOMEM; card->proc_root_link = p; return 0; } /* * called on card->id change */ void snd_info_card_id_change(struct snd_card *card) { guard(mutex)(&info_mutex); if (card->proc_root_link) { proc_remove(card->proc_root_link); card->proc_root_link = NULL; } if (strcmp(card->id, card->proc_root->name)) card->proc_root_link = proc_symlink(card->id, snd_proc_root->p, card->proc_root->name); } /* * de-register the card proc file * called from init.c */ void snd_info_card_disconnect(struct snd_card *card) { if (!card) return; proc_remove(card->proc_root_link); if (card->proc_root) proc_remove(card->proc_root->p); guard(mutex)(&info_mutex); if (card->proc_root) snd_info_clear_entries(card->proc_root); card->proc_root_link = NULL; card->proc_root = NULL; } /* * release the card proc file resources * called from init.c */ int snd_info_card_free(struct snd_card *card) { if (!card) return 0; snd_info_free_entry(card->proc_root); card->proc_root = NULL; return 0; } /** * snd_info_get_line - read one line from the procfs buffer * @buffer: the procfs buffer * @line: the buffer to store * @len: the max. buffer size * * Reads one line from the buffer and stores the string. * * Return: Zero if successful, or 1 if error or EOF. */ int snd_info_get_line(struct snd_info_buffer *buffer, char *line, int len) { int c; if (snd_BUG_ON(!buffer)) return 1; if (!buffer->buffer) return 1; if (len <= 0 || buffer->stop || buffer->error) return 1; while (!buffer->stop) { c = buffer->buffer[buffer->curr++]; if (buffer->curr >= buffer->size) buffer->stop = 1; if (c == '\n') break; if (len > 1) { len--; *line++ = c; } } *line = '\0'; return 0; } EXPORT_SYMBOL(snd_info_get_line); /** * snd_info_get_str - parse a string token * @dest: the buffer to store the string token * @src: the original string * @len: the max. length of token - 1 * * Parses the original string and copy a token to the given * string buffer. * * Return: The updated pointer of the original string so that * it can be used for the next call. */ const char *snd_info_get_str(char *dest, const char *src, int len) { int c; while (*src == ' ' || *src == '\t') src++; if (*src == '"' || *src == '\'') { c = *src++; while (--len > 0 && *src && *src != c) { *dest++ = *src++; } if (*src == c) src++; } else { while (--len > 0 && *src && *src != ' ' && *src != '\t') { *dest++ = *src++; } } *dest = 0; while (*src == ' ' || *src == '\t') src++; return src; } EXPORT_SYMBOL(snd_info_get_str); /* * snd_info_create_entry - create an info entry * @name: the proc file name * @parent: the parent directory * * Creates an info entry with the given file name and initializes as * the default state. * * Usually called from other functions such as * snd_info_create_card_entry(). * * Return: The pointer of the new instance, or %NULL on failure. */ static struct snd_info_entry * snd_info_create_entry(const char *name, struct snd_info_entry *parent, struct module *module) { struct snd_info_entry *entry; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (entry == NULL) return NULL; entry->name = kstrdup(name, GFP_KERNEL); if (entry->name == NULL) { kfree(entry); return NULL; } entry->mode = S_IFREG | 0444; entry->content = SNDRV_INFO_CONTENT_TEXT; mutex_init(&entry->access); INIT_LIST_HEAD(&entry->children); INIT_LIST_HEAD(&entry->list); entry->parent = parent; entry->module = module; if (parent) { guard(mutex)(&parent->access); list_add_tail(&entry->list, &parent->children); } return entry; } /** * snd_info_create_module_entry - create an info entry for the given module * @module: the module pointer * @name: the file name * @parent: the parent directory * * Creates a new info entry and assigns it to the given module. * * Return: The pointer of the new instance, or %NULL on failure. */ struct snd_info_entry *snd_info_create_module_entry(struct module * module, const char *name, struct snd_info_entry *parent) { if (!parent) parent = snd_proc_root; return snd_info_create_entry(name, parent, module); } EXPORT_SYMBOL(snd_info_create_module_entry); /** * snd_info_create_card_entry - create an info entry for the given card * @card: the card instance * @name: the file name * @parent: the parent directory * * Creates a new info entry and assigns it to the given card. * * Return: The pointer of the new instance, or %NULL on failure. */ struct snd_info_entry *snd_info_create_card_entry(struct snd_card *card, const char *name, struct snd_info_entry * parent) { if (!parent) parent = card->proc_root; return snd_info_create_entry(name, parent, card->module); } EXPORT_SYMBOL(snd_info_create_card_entry); static void snd_info_clear_entries(struct snd_info_entry *entry) { struct snd_info_entry *p; if (!entry->p) return; list_for_each_entry(p, &entry->children, list) snd_info_clear_entries(p); entry->p = NULL; } /** * snd_info_free_entry - release the info entry * @entry: the info entry * * Releases the info entry. */ void snd_info_free_entry(struct snd_info_entry * entry) { struct snd_info_entry *p, *n; if (!entry) return; if (entry->p) { proc_remove(entry->p); guard(mutex)(&info_mutex); snd_info_clear_entries(entry); } /* free all children at first */ list_for_each_entry_safe(p, n, &entry->children, list) snd_info_free_entry(p); p = entry->parent; if (p) { guard(mutex)(&p->access); list_del(&entry->list); } kfree(entry->name); if (entry->private_free) entry->private_free(entry); kfree(entry); } EXPORT_SYMBOL(snd_info_free_entry); static int __snd_info_register(struct snd_info_entry *entry) { struct proc_dir_entry *root, *p = NULL; if (snd_BUG_ON(!entry)) return -ENXIO; root = entry->parent == NULL ? snd_proc_root->p : entry->parent->p; guard(mutex)(&info_mutex); if (entry->p || !root) return 0; if (S_ISDIR(entry->mode)) { p = proc_mkdir_mode(entry->name, entry->mode, root); if (!p) return -ENOMEM; } else { const struct proc_ops *ops; if (entry->content == SNDRV_INFO_CONTENT_DATA) ops = &snd_info_entry_operations; else ops = &snd_info_text_entry_ops; p = proc_create_data(entry->name, entry->mode, root, ops, entry); if (!p) return -ENOMEM; proc_set_size(p, entry->size); } entry->p = p; return 0; } /** * snd_info_register - register the info entry * @entry: the info entry * * Registers the proc info entry. * The all children entries are registered recursively. * * Return: Zero if successful, or a negative error code on failure. */ int snd_info_register(struct snd_info_entry *entry) { struct snd_info_entry *p; int err; if (!entry->p) { err = __snd_info_register(entry); if (err < 0) return err; } list_for_each_entry(p, &entry->children, list) { err = snd_info_register(p); if (err < 0) return err; } return 0; } EXPORT_SYMBOL(snd_info_register); /** * snd_card_rw_proc_new - Create a read/write text proc file entry for the card * @card: the card instance * @name: the file name * @private_data: the arbitrary private data * @read: the read callback * @write: the write callback, NULL for read-only * * This proc file entry will be registered via snd_card_register() call, and * it will be removed automatically at the card removal, too. * * Return: zero if successful, or a negative error code */ int snd_card_rw_proc_new(struct snd_card *card, const char *name, void *private_data, void (*read)(struct snd_info_entry *, struct snd_info_buffer *), void (*write)(struct snd_info_entry *entry, struct snd_info_buffer *buffer)) { struct snd_info_entry *entry; entry = snd_info_create_card_entry(card, name, card->proc_root); if (!entry) return -ENOMEM; snd_info_set_text_ops(entry, private_data, read); if (write) { entry->mode |= 0200; entry->c.text.write = write; } return 0; } EXPORT_SYMBOL_GPL(snd_card_rw_proc_new); /* */ static void snd_info_version_read(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { snd_iprintf(buffer, "Advanced Linux Sound Architecture Driver Version k%s.\n", init_utsname()->release); } static int __init snd_info_version_init(void) { struct snd_info_entry *entry; entry = snd_info_create_module_entry(THIS_MODULE, "version", NULL); if (entry == NULL) return -ENOMEM; entry->c.text.read = snd_info_version_read; return snd_info_register(entry); /* freed in error path */ } |
| 124 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 | /* * videobuf2-v4l2.h - V4L2 driver helper framework * * Copyright (C) 2010 Samsung Electronics * * Author: Pawel Osciak <pawel@osciak.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation. */ #ifndef _MEDIA_VIDEOBUF2_V4L2_H #define _MEDIA_VIDEOBUF2_V4L2_H #include <linux/videodev2.h> #include <media/videobuf2-core.h> #if VB2_MAX_FRAME != VIDEO_MAX_FRAME #error VB2_MAX_FRAME != VIDEO_MAX_FRAME #endif #if VB2_MAX_PLANES != VIDEO_MAX_PLANES #error VB2_MAX_PLANES != VIDEO_MAX_PLANES #endif struct video_device; /** * struct vb2_v4l2_buffer - video buffer information for v4l2. * * @vb2_buf: embedded struct &vb2_buffer. * @flags: buffer informational flags. * @field: field order of the image in the buffer, as defined by * &enum v4l2_field. * @timecode: frame timecode. * @sequence: sequence count of this frame. * @request_fd: the request_fd associated with this buffer * @is_held: if true, then this capture buffer was held * @planes: plane information (userptr/fd, length, bytesused, data_offset). * * Should contain enough information to be able to cover all the fields * of &struct v4l2_buffer at ``videodev2.h``. */ struct vb2_v4l2_buffer { struct vb2_buffer vb2_buf; __u32 flags; __u32 field; struct v4l2_timecode timecode; __u32 sequence; __s32 request_fd; bool is_held; struct vb2_plane planes[VB2_MAX_PLANES]; }; /* VB2 V4L2 flags as set in vb2_queue.subsystem_flags */ #define VB2_V4L2_FL_SUPPORTS_M2M_HOLD_CAPTURE_BUF (1 << 0) /* * to_vb2_v4l2_buffer() - cast struct vb2_buffer * to struct vb2_v4l2_buffer * */ #define to_vb2_v4l2_buffer(vb) \ container_of(vb, struct vb2_v4l2_buffer, vb2_buf) /** * vb2_find_buffer() - Find a buffer with given timestamp * * @q: pointer to &struct vb2_queue with videobuf2 queue. * @timestamp: the timestamp to find. * * Returns the buffer with the given @timestamp, or NULL if not found. */ struct vb2_buffer *vb2_find_buffer(struct vb2_queue *q, u64 timestamp); int vb2_querybuf(struct vb2_queue *q, struct v4l2_buffer *b); /** * vb2_reqbufs() - Wrapper for vb2_core_reqbufs() that also verifies * the memory and type values. * * @q: pointer to &struct vb2_queue with videobuf2 queue. * @req: &struct v4l2_requestbuffers passed from userspace to * &v4l2_ioctl_ops->vidioc_reqbufs handler in driver. */ int vb2_reqbufs(struct vb2_queue *q, struct v4l2_requestbuffers *req); /** * vb2_create_bufs() - Wrapper for vb2_core_create_bufs() that also verifies * the memory and type values. * * @q: pointer to &struct vb2_queue with videobuf2 queue. * @create: creation parameters, passed from userspace to * &v4l2_ioctl_ops->vidioc_create_bufs handler in driver */ int vb2_create_bufs(struct vb2_queue *q, struct v4l2_create_buffers *create); /** * vb2_prepare_buf() - Pass ownership of a buffer from userspace to the kernel * * @q: pointer to &struct vb2_queue with videobuf2 queue. * @mdev: pointer to &struct media_device, may be NULL. * @b: buffer structure passed from userspace to * &v4l2_ioctl_ops->vidioc_prepare_buf handler in driver * * Should be called from &v4l2_ioctl_ops->vidioc_prepare_buf ioctl handler * of a driver. * * This function: * * #) verifies the passed buffer, * #) calls &vb2_ops->buf_prepare callback in the driver (if provided), * in which driver-specific buffer initialization can be performed. * #) if @b->request_fd is non-zero and @mdev->ops->req_queue is set, * then bind the prepared buffer to the request. * * The return values from this function are intended to be directly returned * from &v4l2_ioctl_ops->vidioc_prepare_buf handler in driver. */ int vb2_prepare_buf(struct vb2_queue *q, struct media_device *mdev, struct v4l2_buffer *b); /** * vb2_qbuf() - Queue a buffer from userspace * @q: pointer to &struct vb2_queue with videobuf2 queue. * @mdev: pointer to &struct media_device, may be NULL. * @b: buffer structure passed from userspace to * &v4l2_ioctl_ops->vidioc_qbuf handler in driver * * Should be called from &v4l2_ioctl_ops->vidioc_qbuf handler of a driver. * * This function: * * #) verifies the passed buffer; * #) if @b->request_fd is non-zero and @mdev->ops->req_queue is set, * then bind the buffer to the request. * #) if necessary, calls &vb2_ops->buf_prepare callback in the driver * (if provided), in which driver-specific buffer initialization can * be performed; * #) if streaming is on, queues the buffer in driver by the means of * &vb2_ops->buf_queue callback for processing. * * The return values from this function are intended to be directly returned * from &v4l2_ioctl_ops->vidioc_qbuf handler in driver. */ int vb2_qbuf(struct vb2_queue *q, struct media_device *mdev, struct v4l2_buffer *b); /** * vb2_expbuf() - Export a buffer as a file descriptor * @q: pointer to &struct vb2_queue with videobuf2 queue. * @eb: export buffer structure passed from userspace to * &v4l2_ioctl_ops->vidioc_expbuf handler in driver * * The return values from this function are intended to be directly returned * from &v4l2_ioctl_ops->vidioc_expbuf handler in driver. */ int vb2_expbuf(struct vb2_queue *q, struct v4l2_exportbuffer *eb); /** * vb2_dqbuf() - Dequeue a buffer to the userspace * @q: pointer to &struct vb2_queue with videobuf2 queue. * @b: buffer structure passed from userspace to * &v4l2_ioctl_ops->vidioc_dqbuf handler in driver * @nonblocking: if true, this call will not sleep waiting for a buffer if no * buffers ready for dequeuing are present. Normally the driver * would be passing (&file->f_flags & %O_NONBLOCK) here * * Should be called from &v4l2_ioctl_ops->vidioc_dqbuf ioctl handler * of a driver. * * This function: * * #) verifies the passed buffer; * #) calls &vb2_ops->buf_finish callback in the driver (if provided), in which * driver can perform any additional operations that may be required before * returning the buffer to userspace, such as cache sync; * #) the buffer struct members are filled with relevant information for * the userspace. * * The return values from this function are intended to be directly returned * from &v4l2_ioctl_ops->vidioc_dqbuf handler in driver. */ int vb2_dqbuf(struct vb2_queue *q, struct v4l2_buffer *b, bool nonblocking); /** * vb2_streamon - start streaming * @q: pointer to &struct vb2_queue with videobuf2 queue. * @type: type argument passed from userspace to vidioc_streamon handler, * as defined by &enum v4l2_buf_type. * * Should be called from &v4l2_ioctl_ops->vidioc_streamon handler of a driver. * * This function: * * 1) verifies current state * 2) passes any previously queued buffers to the driver and starts streaming * * The return values from this function are intended to be directly returned * from &v4l2_ioctl_ops->vidioc_streamon handler in the driver. */ int vb2_streamon(struct vb2_queue *q, enum v4l2_buf_type type); /** * vb2_streamoff - stop streaming * @q: pointer to &struct vb2_queue with videobuf2 queue. * @type: type argument passed from userspace to vidioc_streamoff handler * * Should be called from vidioc_streamoff handler of a driver. * * This function: * * #) verifies current state, * #) stop streaming and dequeues any queued buffers, including those previously * passed to the driver (after waiting for the driver to finish). * * This call can be used for pausing playback. * The return values from this function are intended to be directly returned * from vidioc_streamoff handler in the driver */ int vb2_streamoff(struct vb2_queue *q, enum v4l2_buf_type type); /** * vb2_queue_init() - initialize a videobuf2 queue * @q: pointer to &struct vb2_queue with videobuf2 queue. * * The vb2_queue structure should be allocated by the driver. The driver is * responsible of clearing it's content and setting initial values for some * required entries before calling this function. * q->ops, q->mem_ops, q->type and q->io_modes are mandatory. Please refer * to the struct vb2_queue description in include/media/videobuf2-core.h * for more information. */ int __must_check vb2_queue_init(struct vb2_queue *q); /** * vb2_queue_init_name() - initialize a videobuf2 queue with a name * @q: pointer to &struct vb2_queue with videobuf2 queue. * @name: the queue name * * This function initializes the vb2_queue exactly like vb2_queue_init(), * and additionally sets the queue name. The queue name is used for logging * purpose, and should uniquely identify the queue within the context of the * device it belongs to. This is useful to attribute kernel log messages to the * right queue for m2m devices or other devices that handle multiple queues. */ int __must_check vb2_queue_init_name(struct vb2_queue *q, const char *name); /** * vb2_queue_release() - stop streaming, release the queue and free memory * @q: pointer to &struct vb2_queue with videobuf2 queue. * * This function stops streaming and performs necessary clean ups, including * freeing video buffer memory. The driver is responsible for freeing * the vb2_queue structure itself. */ void vb2_queue_release(struct vb2_queue *q); /** * vb2_queue_change_type() - change the type of an inactive vb2_queue * @q: pointer to &struct vb2_queue with videobuf2 queue. * @type: the type to change to (V4L2_BUF_TYPE_VIDEO_*) * * This function changes the type of the vb2_queue. This is only possible * if the queue is not busy (i.e. no buffers have been allocated). * * vb2_queue_change_type() can be used to support multiple buffer types using * the same queue. The driver can implement v4l2_ioctl_ops.vidioc_reqbufs and * v4l2_ioctl_ops.vidioc_create_bufs functions and call vb2_queue_change_type() * before calling vb2_ioctl_reqbufs() or vb2_ioctl_create_bufs(), and thus * "lock" the buffer type until the buffers have been released. */ int vb2_queue_change_type(struct vb2_queue *q, unsigned int type); /** * vb2_poll() - implements poll userspace operation * @q: pointer to &struct vb2_queue with videobuf2 queue. * @file: file argument passed to the poll file operation handler * @wait: wait argument passed to the poll file operation handler * * This function implements poll file operation handler for a driver. * For CAPTURE queues, if a buffer is ready to be dequeued, the userspace will * be informed that the file descriptor of a video device is available for * reading. * For OUTPUT queues, if a buffer is ready to be dequeued, the file descriptor * will be reported as available for writing. * * If the driver uses struct v4l2_fh, then vb2_poll() will also check for any * pending events. * * The return values from this function are intended to be directly returned * from poll handler in driver. */ __poll_t vb2_poll(struct vb2_queue *q, struct file *file, poll_table *wait); /* * The following functions are not part of the vb2 core API, but are simple * helper functions that you can use in your struct v4l2_file_operations, * struct v4l2_ioctl_ops and struct vb2_ops. They will serialize if vb2_queue->lock * or video_device->lock is set, and they will set and test the queue owner * (vb2_queue->owner) to check if the calling filehandle is permitted to do the * queuing operation. */ /** * vb2_queue_is_busy() - check if the queue is busy * @q: pointer to &struct vb2_queue with videobuf2 queue. * @file: file through which the vb2 queue access is performed * * The queue is considered busy if it has an owner and the owner is not the * @file. * * Queue ownership is acquired and checked by some of the v4l2_ioctl_ops helpers * below. Drivers can also use this function directly when they need to * open-code ioctl handlers, for instance to add additional checks between the * queue ownership test and the call to the corresponding vb2 operation. */ static inline bool vb2_queue_is_busy(struct vb2_queue *q, struct file *file) { return q->owner && q->owner != file->private_data; } /* struct v4l2_ioctl_ops helpers */ int vb2_ioctl_reqbufs(struct file *file, void *priv, struct v4l2_requestbuffers *p); int vb2_ioctl_create_bufs(struct file *file, void *priv, struct v4l2_create_buffers *p); int vb2_ioctl_prepare_buf(struct file *file, void *priv, struct v4l2_buffer *p); int vb2_ioctl_querybuf(struct file *file, void *priv, struct v4l2_buffer *p); int vb2_ioctl_qbuf(struct file *file, void *priv, struct v4l2_buffer *p); int vb2_ioctl_dqbuf(struct file *file, void *priv, struct v4l2_buffer *p); int vb2_ioctl_streamon(struct file *file, void *priv, enum v4l2_buf_type i); int vb2_ioctl_streamoff(struct file *file, void *priv, enum v4l2_buf_type i); int vb2_ioctl_expbuf(struct file *file, void *priv, struct v4l2_exportbuffer *p); int vb2_ioctl_remove_bufs(struct file *file, void *priv, struct v4l2_remove_buffers *p); /* struct v4l2_file_operations helpers */ int vb2_fop_mmap(struct file *file, struct vm_area_struct *vma); int vb2_fop_release(struct file *file); int _vb2_fop_release(struct file *file, struct mutex *lock); ssize_t vb2_fop_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos); ssize_t vb2_fop_read(struct file *file, char __user *buf, size_t count, loff_t *ppos); __poll_t vb2_fop_poll(struct file *file, poll_table *wait); #ifndef CONFIG_MMU unsigned long vb2_fop_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); #endif /** * vb2_video_unregister_device - unregister the video device and release queue * * @vdev: pointer to &struct video_device * * If the driver uses vb2_fop_release()/_vb2_fop_release(), then it should use * vb2_video_unregister_device() instead of video_unregister_device(). * * This function will call video_unregister_device() and then release the * vb2_queue if streaming is in progress. This will stop streaming and * this will simplify the unbind sequence since after this call all subdevs * will have stopped streaming as well. */ void vb2_video_unregister_device(struct video_device *vdev); /** * vb2_ops_wait_prepare - helper function to lock a struct &vb2_queue * * @vq: pointer to &struct vb2_queue * * ..note:: only use if vq->lock is non-NULL. */ void vb2_ops_wait_prepare(struct vb2_queue *vq); /** * vb2_ops_wait_finish - helper function to unlock a struct &vb2_queue * * @vq: pointer to &struct vb2_queue * * ..note:: only use if vq->lock is non-NULL. */ void vb2_ops_wait_finish(struct vb2_queue *vq); struct media_request; int vb2_request_validate(struct media_request *req); void vb2_request_queue(struct media_request *req); #endif /* _MEDIA_VIDEOBUF2_V4L2_H */ |
| 429 121 1011 547 3 3 45 45 45 45 45 416 672 799 796 799 699 780 782 730 796 797 740 782 729 781 782 734 484 484 482 484 484 850 851 543 542 543 543 543 543 543 7 851 851 853 853 597 579 854 589 695 854 776 854 853 1 852 1 777 854 854 851 1582 753 752 754 754 753 754 753 1 1 1 1 1 1 1 1 2052 903 903 902 901 2060 2043 962 962 2057 903 901 903 2055 2055 2012 2008 521 942 943 833 832 833 920 922 921 922 880 377 377 16 16 218 17 807 819 210 210 205 207 207 207 3 207 433 434 434 204 609 525 511 22 525 520 8 525 526 543 543 542 542 543 543 543 543 3 540 542 421 421 4 4 3 1 24 8 18 220 199 1 1 1 18 3 1 1 1 1 1 1 1 2 1 6 22 9 1 1 1 1 8 1 184 1 2 4 2 2 4 4 2 2 842 818 6 1 1 1 3 1 1 1 1 2 2 2 1 4 1 2 2 1 1 1 2 4 1 1 1 2 1 1 4 4 14 27 1 6 2 1 1 1 778 3 776 4 720 720 59 703 2 63 23 73 418 418 418 7 2 2 47 47 47 418 418 418 492 492 2 490 2 1 1 1 737 738 60 628 1 629 60 60 630 629 602 6 6 6 6 2 2 2 2 2 2 105 105 502 501 502 502 502 499 2 1 1 502 502 630 630 630 630 630 629 627 5 627 92 92 491 491 491 491 491 491 491 491 490 491 782 782 763 764 706 705 706 107 106 106 107 107 107 107 107 106 107 675 675 38 38 38 83 83 83 44 44 44 83 83 83 425 425 425 425 425 7 418 2 47 578 580 579 580 580 579 580 24 24 24 24 580 577 3 3 3 3 579 578 580 580 580 579 580 579 579 580 580 580 30 30 30 30 30 413 413 413 413 413 413 413 413 413 413 413 413 413 413 413 413 413 413 413 413 464 464 464 464 118 413 459 11 464 462 464 464 463 464 3 461 464 464 464 463 464 414 414 414 413 413 413 413 413 413 413 413 413 413 413 413 413 413 413 413 413 413 413 413 413 413 413 413 413 413 413 413 413 413 413 413 413 413 413 413 413 413 413 118 413 413 413 413 425 18 413 424 425 425 425 425 425 425 425 425 425 424 425 425 425 425 425 425 425 425 425 425 425 425 425 78 79 10 10 1 9 111 111 110 2 111 111 43 43 43 43 104 26 103 46 20 67 44 44 44 44 44 44 661 148 781 780 779 781 781 1 1 1 357 358 355 21 21 1 20 13 4 3 4 404 28 216 217 115 141 1 16 3 21 21 16 3 2 59 59 1 58 57 1 49 8 8 49 49 49 484 485 1 10 10 753 753 507 631 751 753 752 143 33 114 112 3 1 1 870 870 870 55 3 391 391 391 390 2 65 65 849 849 60 6 720 516 792 60 77 9 404 144 766 851 851 851 156 156 156 24 24 24 23 1 24 277 277 7 272 272 272 122 122 358 405 850 850 405 358 589 564 564 829 736 505 42 8 424 1 2 637 638 638 849 851 112 3 21 24 619 556 791 790 77 58 18 4 851 849 851 850 849 850 606 851 770 850 49 850 850 850 832 18 851 851 851 851 851 850 851 851 6 847 847 60 793 793 45 45 45 45 414 414 414 414 414 414 120 413 413 413 2 2 2 318 318 318 318 318 318 509 508 509 464 26 26 26 26 26 26 26 25 26 26 26 26 26 26 25 10 11 11 11 11 11 10 25 464 10 453 464 444 26 463 464 1 1 56 56 53 3 6 51 26 10 32 32 699 700 94 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 | // SPDX-License-Identifier: GPL-2.0-only /* * Kernel-based Virtual Machine driver for Linux * * This module enables machines with Intel VT-x extensions to run virtual * machines without emulation or binary translation. * * Copyright (C) 2006 Qumranet, Inc. * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Avi Kivity <avi@qumranet.com> * Yaniv Kamay <yaniv@qumranet.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/highmem.h> #include <linux/hrtimer.h> #include <linux/kernel.h> #include <linux/kvm_host.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/mod_devicetable.h> #include <linux/mm.h> #include <linux/objtool.h> #include <linux/sched.h> #include <linux/sched/smt.h> #include <linux/slab.h> #include <linux/tboot.h> #include <linux/trace_events.h> #include <asm/apic.h> #include <asm/asm.h> #include <asm/cpu.h> #include <asm/cpu_device_id.h> #include <asm/debugreg.h> #include <asm/desc.h> #include <asm/fpu/api.h> #include <asm/fpu/xstate.h> #include <asm/fred.h> #include <asm/idtentry.h> #include <asm/io.h> #include <asm/irq_remapping.h> #include <asm/reboot.h> #include <asm/perf_event.h> #include <asm/mmu_context.h> #include <asm/mshyperv.h> #include <asm/msr.h> #include <asm/mwait.h> #include <asm/spec-ctrl.h> #include <asm/vmx.h> #include <trace/events/ipi.h> #include "capabilities.h" #include "common.h" #include "cpuid.h" #include "hyperv.h" #include "kvm_onhyperv.h" #include "irq.h" #include "kvm_cache_regs.h" #include "lapic.h" #include "mmu.h" #include "nested.h" #include "pmu.h" #include "sgx.h" #include "trace.h" #include "vmcs.h" #include "vmcs12.h" #include "vmx.h" #include "x86.h" #include "x86_ops.h" #include "smm.h" #include "vmx_onhyperv.h" #include "posted_intr.h" #include "mmu/spte.h" MODULE_AUTHOR("Qumranet"); MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions"); MODULE_LICENSE("GPL"); #ifdef MODULE static const struct x86_cpu_id vmx_cpu_id[] = { X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL), {} }; MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id); #endif bool __read_mostly enable_vpid = 1; module_param_named(vpid, enable_vpid, bool, 0444); static bool __read_mostly enable_vnmi = 1; module_param_named(vnmi, enable_vnmi, bool, 0444); bool __read_mostly flexpriority_enabled = 1; module_param_named(flexpriority, flexpriority_enabled, bool, 0444); bool __read_mostly enable_ept = 1; module_param_named(ept, enable_ept, bool, 0444); bool __read_mostly enable_unrestricted_guest = 1; module_param_named(unrestricted_guest, enable_unrestricted_guest, bool, 0444); bool __read_mostly enable_ept_ad_bits = 1; module_param_named(eptad, enable_ept_ad_bits, bool, 0444); static bool __read_mostly emulate_invalid_guest_state = true; module_param(emulate_invalid_guest_state, bool, 0444); static bool __read_mostly fasteoi = 1; module_param(fasteoi, bool, 0444); module_param(enable_apicv, bool, 0444); module_param(enable_ipiv, bool, 0444); module_param(enable_device_posted_irqs, bool, 0444); /* * If nested=1, nested virtualization is supported, i.e., guests may use * VMX and be a hypervisor for its own guests. If nested=0, guests may not * use VMX instructions. */ static bool __read_mostly nested = 1; module_param(nested, bool, 0444); bool __read_mostly enable_pml = 1; module_param_named(pml, enable_pml, bool, 0444); static bool __read_mostly error_on_inconsistent_vmcs_config = true; module_param(error_on_inconsistent_vmcs_config, bool, 0444); static bool __read_mostly dump_invalid_vmcs = 0; module_param(dump_invalid_vmcs, bool, 0644); #define MSR_BITMAP_MODE_X2APIC 1 #define MSR_BITMAP_MODE_X2APIC_APICV 2 #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL /* Guest_tsc -> host_tsc conversion requires 64-bit division. */ static int __read_mostly cpu_preemption_timer_multi; static bool __read_mostly enable_preemption_timer = 1; #ifdef CONFIG_X86_64 module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); #endif extern bool __read_mostly allow_smaller_maxphyaddr; module_param(allow_smaller_maxphyaddr, bool, S_IRUGO); #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE #define KVM_VM_CR0_ALWAYS_ON \ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) #define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) #define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \ RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \ RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \ RTIT_STATUS_BYTECNT)) /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive * executions of PAUSE in a loop. Also indicate if ple enabled. * According to test, this time is usually smaller than 128 cycles. * ple_window: upper bound on the amount of time a guest is allowed to execute * in a PAUSE loop. Tests indicate that most spinlocks are held for * less than 2^12 cycles * Time is measured based on a counter that runs at the same rate as the TSC, * refer SDM volume 3b section 21.6.13 & 22.1.3. */ static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP; module_param(ple_gap, uint, 0444); static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; module_param(ple_window, uint, 0444); /* Default doubles per-vcpu window every exit. */ static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW; module_param(ple_window_grow, uint, 0444); /* Default resets per-vcpu window every exit to ple_window. */ static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK; module_param(ple_window_shrink, uint, 0444); /* Default is to compute the maximum so we can never overflow. */ static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; module_param(ple_window_max, uint, 0444); /* Default is SYSTEM mode, 1 for host-guest mode (which is BROKEN) */ int __read_mostly pt_mode = PT_MODE_SYSTEM; #ifdef CONFIG_BROKEN module_param(pt_mode, int, S_IRUGO); #endif struct x86_pmu_lbr __ro_after_init vmx_lbr_caps; static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); static DEFINE_MUTEX(vmx_l1d_flush_mutex); /* Storage for pre module init parameter parsing */ static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO; static const struct { const char *option; bool for_parse; } vmentry_l1d_param[] = { [VMENTER_L1D_FLUSH_AUTO] = {"auto", true}, [VMENTER_L1D_FLUSH_NEVER] = {"never", true}, [VMENTER_L1D_FLUSH_COND] = {"cond", true}, [VMENTER_L1D_FLUSH_ALWAYS] = {"always", true}, [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false}, [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false}, }; #define L1D_CACHE_ORDER 4 static void *vmx_l1d_flush_pages; static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) { struct page *page; unsigned int i; if (!boot_cpu_has_bug(X86_BUG_L1TF)) { l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; return 0; } if (!enable_ept) { l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED; return 0; } if (kvm_host.arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; return 0; } /* If set to auto use the default l1tf mitigation method */ if (l1tf == VMENTER_L1D_FLUSH_AUTO) { switch (l1tf_mitigation) { case L1TF_MITIGATION_OFF: l1tf = VMENTER_L1D_FLUSH_NEVER; break; case L1TF_MITIGATION_AUTO: case L1TF_MITIGATION_FLUSH_NOWARN: case L1TF_MITIGATION_FLUSH: case L1TF_MITIGATION_FLUSH_NOSMT: l1tf = VMENTER_L1D_FLUSH_COND; break; case L1TF_MITIGATION_FULL: case L1TF_MITIGATION_FULL_FORCE: l1tf = VMENTER_L1D_FLUSH_ALWAYS; break; } } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) { l1tf = VMENTER_L1D_FLUSH_ALWAYS; } if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages && !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) { /* * This allocation for vmx_l1d_flush_pages is not tied to a VM * lifetime and so should not be charged to a memcg. */ page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER); if (!page) return -ENOMEM; vmx_l1d_flush_pages = page_address(page); /* * Initialize each page with a different pattern in * order to protect against KSM in the nested * virtualization case. */ for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) { memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1, PAGE_SIZE); } } l1tf_vmx_mitigation = l1tf; if (l1tf != VMENTER_L1D_FLUSH_NEVER) static_branch_enable(&vmx_l1d_should_flush); else static_branch_disable(&vmx_l1d_should_flush); if (l1tf == VMENTER_L1D_FLUSH_COND) static_branch_enable(&vmx_l1d_flush_cond); else static_branch_disable(&vmx_l1d_flush_cond); return 0; } static int vmentry_l1d_flush_parse(const char *s) { unsigned int i; if (s) { for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) { if (vmentry_l1d_param[i].for_parse && sysfs_streq(s, vmentry_l1d_param[i].option)) return i; } } return -EINVAL; } static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) { int l1tf, ret; l1tf = vmentry_l1d_flush_parse(s); if (l1tf < 0) return l1tf; if (!boot_cpu_has(X86_BUG_L1TF)) return 0; /* * Has vmx_init() run already? If not then this is the pre init * parameter parsing. In that case just store the value and let * vmx_init() do the proper setup after enable_ept has been * established. */ if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) { vmentry_l1d_flush_param = l1tf; return 0; } mutex_lock(&vmx_l1d_flush_mutex); ret = vmx_setup_l1d_flush(l1tf); mutex_unlock(&vmx_l1d_flush_mutex); return ret; } static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) { if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param))) return sysfs_emit(s, "???\n"); return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); } static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx) { u64 msr; if (!vmx->disable_fb_clear) return; msr = native_rdmsrq(MSR_IA32_MCU_OPT_CTRL); msr |= FB_CLEAR_DIS; native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, msr); /* Cache the MSR value to avoid reading it later */ vmx->msr_ia32_mcu_opt_ctrl = msr; } static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx) { if (!vmx->disable_fb_clear) return; vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS; native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); } static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) { /* * Disable VERW's behavior of clearing CPU buffers for the guest if the * CPU isn't affected by MDS/TAA, and the host hasn't forcefully enabled * the mitigation. Disabling the clearing behavior provides a * performance boost for guests that aren't aware that manually clearing * CPU buffers is unnecessary, at the cost of MSR accesses on VM-Entry * and VM-Exit. */ vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) && (kvm_host.arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) && !boot_cpu_has_bug(X86_BUG_MDS) && !boot_cpu_has_bug(X86_BUG_TAA); /* * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS * at VMEntry. Skip the MSR read/write when a guest has no use case to * execute VERW. */ if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) || ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) && (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) && (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) && (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) && (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO))) vmx->disable_fb_clear = false; } static const struct kernel_param_ops vmentry_l1d_flush_ops = { .set = vmentry_l1d_flush_set, .get = vmentry_l1d_flush_get, }; module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); static u32 vmx_segment_access_rights(struct kvm_segment *var); void vmx_vmexit(void); #define vmx_insn_failed(fmt...) \ do { \ WARN_ONCE(1, fmt); \ pr_warn_ratelimited(fmt); \ } while (0) noinline void vmread_error(unsigned long field) { vmx_insn_failed("vmread failed: field=%lx\n", field); } #ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT noinstr void vmread_error_trampoline2(unsigned long field, bool fault) { if (fault) { kvm_spurious_fault(); } else { instrumentation_begin(); vmread_error(field); instrumentation_end(); } } #endif noinline void vmwrite_error(unsigned long field, unsigned long value) { vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n", field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr) { vmx_insn_failed("vmclear failed: %p/%llx err=%u\n", vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr) { vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n", vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva) { vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n", ext, vpid, gva); } noinline void invept_error(unsigned long ext, u64 eptp) { vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx\n", ext, eptp); } static DEFINE_PER_CPU(struct vmcs *, vmxarea); DEFINE_PER_CPU(struct vmcs *, current_vmcs); /* * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. */ static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); static DEFINE_SPINLOCK(vmx_vpid_lock); struct vmcs_config vmcs_config __ro_after_init; struct vmx_capability vmx_capability __ro_after_init; #define VMX_SEGMENT_FIELD(seg) \ [VCPU_SREG_##seg] = { \ .selector = GUEST_##seg##_SELECTOR, \ .base = GUEST_##seg##_BASE, \ .limit = GUEST_##seg##_LIMIT, \ .ar_bytes = GUEST_##seg##_AR_BYTES, \ } static const struct kvm_vmx_segment_field { unsigned selector; unsigned base; unsigned limit; unsigned ar_bytes; } kvm_vmx_segment_fields[] = { VMX_SEGMENT_FIELD(CS), VMX_SEGMENT_FIELD(DS), VMX_SEGMENT_FIELD(ES), VMX_SEGMENT_FIELD(FS), VMX_SEGMENT_FIELD(GS), VMX_SEGMENT_FIELD(SS), VMX_SEGMENT_FIELD(TR), VMX_SEGMENT_FIELD(LDTR), }; static unsigned long host_idt_base; #if IS_ENABLED(CONFIG_HYPERV) static bool __read_mostly enlightened_vmcs = true; module_param(enlightened_vmcs, bool, 0444); static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu) { struct hv_enlightened_vmcs *evmcs; hpa_t partition_assist_page = hv_get_partition_assist_page(vcpu); if (partition_assist_page == INVALID_PAGE) return -ENOMEM; evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs; evmcs->partition_assist_page = partition_assist_page; evmcs->hv_vm_id = (unsigned long)vcpu->kvm; evmcs->hv_enlightenments_control.nested_flush_hypercall = 1; return 0; } static __init void hv_init_evmcs(void) { int cpu; if (!enlightened_vmcs) return; /* * Enlightened VMCS usage should be recommended and the host needs * to support eVMCS v1 or above. */ if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED && (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >= KVM_EVMCS_VERSION) { /* Check that we have assist pages on all online CPUs */ for_each_online_cpu(cpu) { if (!hv_get_vp_assist_page(cpu)) { enlightened_vmcs = false; break; } } if (enlightened_vmcs) { pr_info("Using Hyper-V Enlightened VMCS\n"); static_branch_enable(&__kvm_is_using_evmcs); } if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) vt_x86_ops.enable_l2_tlb_flush = hv_enable_l2_tlb_flush; } else { enlightened_vmcs = false; } } static void hv_reset_evmcs(void) { struct hv_vp_assist_page *vp_ap; if (!kvm_is_using_evmcs()) return; /* * KVM should enable eVMCS if and only if all CPUs have a VP assist * page, and should reject CPU onlining if eVMCS is enabled the CPU * doesn't have a VP assist page allocated. */ vp_ap = hv_get_vp_assist_page(smp_processor_id()); if (WARN_ON_ONCE(!vp_ap)) return; /* * Reset everything to support using non-enlightened VMCS access later * (e.g. when we reload the module with enlightened_vmcs=0) */ vp_ap->nested_control.features.directhypercall = 0; vp_ap->current_nested_vmcs = 0; vp_ap->enlighten_vmentry = 0; } #else /* IS_ENABLED(CONFIG_HYPERV) */ static void hv_init_evmcs(void) {} static void hv_reset_evmcs(void) {} #endif /* IS_ENABLED(CONFIG_HYPERV) */ /* * Comment's format: document - errata name - stepping - processor name. * Refer from * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp */ static u32 vmx_preemption_cpu_tfms[] = { /* 323344.pdf - BA86 - D0 - Xeon 7500 Series */ 0x000206E6, /* 323056.pdf - AAX65 - C2 - Xeon L3406 */ /* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */ /* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */ 0x00020652, /* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */ 0x00020655, /* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */ /* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */ /* * 320767.pdf - AAP86 - B1 - * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile */ 0x000106E5, /* 321333.pdf - AAM126 - C0 - Xeon 3500 */ 0x000106A0, /* 321333.pdf - AAM126 - C1 - Xeon 3500 */ 0x000106A1, /* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */ 0x000106A4, /* 321333.pdf - AAM126 - D0 - Xeon 3500 */ /* 321324.pdf - AAK139 - D0 - Xeon 5500 */ /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */ 0x000106A5, /* Xeon E3-1220 V2 */ 0x000306A8, }; static inline bool cpu_has_broken_vmx_preemption_timer(void) { u32 eax = cpuid_eax(0x00000001), i; /* Clear the reserved bits */ eax &= ~(0x3U << 14 | 0xfU << 28); for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++) if (eax == vmx_preemption_cpu_tfms[i]) return true; return false; } static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) { return flexpriority_enabled && lapic_in_kernel(vcpu); } struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) { int i; i = kvm_find_user_return_msr(msr); if (i >= 0) return &vmx->guest_uret_msrs[i]; return NULL; } static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, struct vmx_uret_msr *msr, u64 data) { unsigned int slot = msr - vmx->guest_uret_msrs; int ret = 0; if (msr->load_into_hardware) { preempt_disable(); ret = kvm_set_user_return_msr(slot, data, msr->mask); preempt_enable(); } if (!ret) msr->data = data; return ret; } /* * Disable VMX and clear CR4.VMXE (even if VMXOFF faults) * * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to * atomically track post-VMXON state, e.g. this may be called in NMI context. * Eat all faults as all other faults on VMXOFF faults are mode related, i.e. * faults are guaranteed to be due to the !post-VMXON check unless the CPU is * magically in RM, VM86, compat mode, or at CPL>0. */ static int kvm_cpu_vmxoff(void) { asm goto("1: vmxoff\n\t" _ASM_EXTABLE(1b, %l[fault]) ::: "cc", "memory" : fault); cr4_clear_bits(X86_CR4_VMXE); return 0; fault: cr4_clear_bits(X86_CR4_VMXE); return -EIO; } void vmx_emergency_disable_virtualization_cpu(void) { int cpu = raw_smp_processor_id(); struct loaded_vmcs *v; kvm_rebooting = true; /* * Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be * set in task context. If this races with VMX is disabled by an NMI, * VMCLEAR and VMXOFF may #UD, but KVM will eat those faults due to * kvm_rebooting set. */ if (!(__read_cr4() & X86_CR4_VMXE)) return; list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), loaded_vmcss_on_cpu_link) { vmcs_clear(v->vmcs); if (v->shadow_vmcs) vmcs_clear(v->shadow_vmcs); } kvm_cpu_vmxoff(); } static void __loaded_vmcs_clear(void *arg) { struct loaded_vmcs *loaded_vmcs = arg; int cpu = raw_smp_processor_id(); if (loaded_vmcs->cpu != cpu) return; /* vcpu migration can race with cpu offline */ if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) per_cpu(current_vmcs, cpu) = NULL; vmcs_clear(loaded_vmcs->vmcs); if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched) vmcs_clear(loaded_vmcs->shadow_vmcs); list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); /* * Ensure all writes to loaded_vmcs, including deleting it from its * current percpu list, complete before setting loaded_vmcs->cpu to * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first * and add loaded_vmcs to its percpu list before it's deleted from this * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs(). */ smp_wmb(); loaded_vmcs->cpu = -1; loaded_vmcs->launched = 0; } void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) { int cpu = loaded_vmcs->cpu; if (cpu != -1) smp_call_function_single(cpu, __loaded_vmcs_clear, loaded_vmcs, 1); } static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg, unsigned field) { bool ret; u32 mask = 1 << (seg * SEG_FIELD_NR + field); if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) { kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS); vmx->segment_cache.bitmask = 0; } ret = vmx->segment_cache.bitmask & mask; vmx->segment_cache.bitmask |= mask; return ret; } static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg) { u16 *p = &vmx->segment_cache.seg[seg].selector; if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL)) *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector); return *p; } static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg) { ulong *p = &vmx->segment_cache.seg[seg].base; if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE)) *p = vmcs_readl(kvm_vmx_segment_fields[seg].base); return *p; } static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg) { u32 *p = &vmx->segment_cache.seg[seg].limit; if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT)) *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit); return *p; } static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg) { u32 *p = &vmx->segment_cache.seg[seg].ar; if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR)) *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes); return *p; } void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu) { u32 eb; eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR); /* * #VE isn't used for VMX. To test against unexpected changes * related to #VE for VMX, intercept unexpected #VE and warn on it. */ if (IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE)) eb |= 1u << VE_VECTOR; /* * Guest access to VMware backdoor ports could legitimately * trigger #GP because of TSS I/O permission bitmap. * We intercept those #GP and allow access to them anyway * as VMware does. */ if (enable_vmware_backdoor) eb |= (1u << GP_VECTOR); if ((vcpu->guest_debug & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) eb |= 1u << BP_VECTOR; if (to_vmx(vcpu)->rmode.vm86_active) eb = ~0; if (!vmx_need_pf_intercept(vcpu)) eb &= ~(1u << PF_VECTOR); /* When we are running a nested L2 guest and L1 specified for it a * certain exception bitmap, we must trap the same exceptions and pass * them to L1. When running L2, we will only handle the exceptions * specified above if L1 did not want them. */ if (is_guest_mode(vcpu)) eb |= get_vmcs12(vcpu)->exception_bitmap; else { int mask = 0, match = 0; if (enable_ept && (eb & (1u << PF_VECTOR))) { /* * If EPT is enabled, #PF is currently only intercepted * if MAXPHYADDR is smaller on the guest than on the * host. In that case we only care about present, * non-reserved faults. For vmcs02, however, PFEC_MASK * and PFEC_MATCH are set in prepare_vmcs02_rare. */ mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK; match = PFERR_PRESENT_MASK; } vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask); vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match); } /* * Disabling xfd interception indicates that dynamic xfeatures * might be used in the guest. Always trap #NM in this case * to save guest xfd_err timely. */ if (vcpu->arch.xfd_no_write_intercept) eb |= (1u << NM_VECTOR); vmcs_write32(EXCEPTION_BITMAP, eb); } /* * Check if MSR is intercepted for currently loaded MSR bitmap. */ static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr) { if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS)) return true; return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr); } unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx) { unsigned int flags = 0; if (vmx->loaded_vmcs->launched) flags |= VMX_RUN_VMRESUME; /* * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free * to change it directly without causing a vmexit. In that case read * it after vmexit and store it in vmx->spec_ctrl. */ if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)) flags |= VMX_RUN_SAVE_SPEC_CTRL; if (static_branch_unlikely(&cpu_buf_vm_clear) && kvm_vcpu_can_access_host_mmio(&vmx->vcpu)) flags |= VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO; return flags; } static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, unsigned long entry, unsigned long exit) { vm_entry_controls_clearbit(vmx, entry); vm_exit_controls_clearbit(vmx, exit); } int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr) { unsigned int i; for (i = 0; i < m->nr; ++i) { if (m->val[i].index == msr) return i; } return -ENOENT; } static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) { int i; struct msr_autoload *m = &vmx->msr_autoload; switch (msr) { case MSR_EFER: if (cpu_has_load_ia32_efer()) { clear_atomic_switch_msr_special(vmx, VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER); return; } break; case MSR_CORE_PERF_GLOBAL_CTRL: if (cpu_has_load_perf_global_ctrl()) { clear_atomic_switch_msr_special(vmx, VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); return; } break; } i = vmx_find_loadstore_msr_slot(&m->guest, msr); if (i < 0) goto skip_guest; --m->guest.nr; m->guest.val[i] = m->guest.val[m->guest.nr]; vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); skip_guest: i = vmx_find_loadstore_msr_slot(&m->host, msr); if (i < 0) return; --m->host.nr; m->host.val[i] = m->host.val[m->host.nr]; vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); } static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, unsigned long entry, unsigned long exit, unsigned long guest_val_vmcs, unsigned long host_val_vmcs, u64 guest_val, u64 host_val) { vmcs_write64(guest_val_vmcs, guest_val); if (host_val_vmcs != HOST_IA32_EFER) vmcs_write64(host_val_vmcs, host_val); vm_entry_controls_setbit(vmx, entry); vm_exit_controls_setbit(vmx, exit); } static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, u64 guest_val, u64 host_val, bool entry_only) { int i, j = 0; struct msr_autoload *m = &vmx->msr_autoload; switch (msr) { case MSR_EFER: if (cpu_has_load_ia32_efer()) { add_atomic_switch_msr_special(vmx, VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER, GUEST_IA32_EFER, HOST_IA32_EFER, guest_val, host_val); return; } break; case MSR_CORE_PERF_GLOBAL_CTRL: if (cpu_has_load_perf_global_ctrl()) { add_atomic_switch_msr_special(vmx, VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL, GUEST_IA32_PERF_GLOBAL_CTRL, HOST_IA32_PERF_GLOBAL_CTRL, guest_val, host_val); return; } break; case MSR_IA32_PEBS_ENABLE: /* PEBS needs a quiescent period after being disabled (to write * a record). Disabling PEBS through VMX MSR swapping doesn't * provide that period, so a CPU could write host's record into * guest's memory. */ wrmsrq(MSR_IA32_PEBS_ENABLE, 0); } i = vmx_find_loadstore_msr_slot(&m->guest, msr); if (!entry_only) j = vmx_find_loadstore_msr_slot(&m->host, msr); if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) || (j < 0 && m->host.nr == MAX_NR_LOADSTORE_MSRS)) { printk_once(KERN_WARNING "Not enough msr switch entries. " "Can't add msr %x\n", msr); return; } if (i < 0) { i = m->guest.nr++; vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); } m->guest.val[i].index = msr; m->guest.val[i].value = guest_val; if (entry_only) return; if (j < 0) { j = m->host.nr++; vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); } m->host.val[j].index = msr; m->host.val[j].value = host_val; } static bool update_transition_efer(struct vcpu_vmx *vmx) { u64 guest_efer = vmx->vcpu.arch.efer; u64 ignore_bits = 0; int i; /* Shadow paging assumes NX to be available. */ if (!enable_ept) guest_efer |= EFER_NX; /* * LMA and LME handled by hardware; SCE meaningless outside long mode. */ ignore_bits |= EFER_SCE; #ifdef CONFIG_X86_64 ignore_bits |= EFER_LMA | EFER_LME; /* SCE is meaningful only in long mode on Intel */ if (guest_efer & EFER_LMA) ignore_bits &= ~(u64)EFER_SCE; #endif /* * On EPT, we can't emulate NX, so we must switch EFER atomically. * On CPUs that support "load IA32_EFER", always switch EFER * atomically, since it's faster than switching it manually. */ if (cpu_has_load_ia32_efer() || (enable_ept && ((vmx->vcpu.arch.efer ^ kvm_host.efer) & EFER_NX))) { if (!(guest_efer & EFER_LMA)) guest_efer &= ~EFER_LME; if (guest_efer != kvm_host.efer) add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, kvm_host.efer, false); else clear_atomic_switch_msr(vmx, MSR_EFER); return false; } i = kvm_find_user_return_msr(MSR_EFER); if (i < 0) return false; clear_atomic_switch_msr(vmx, MSR_EFER); guest_efer &= ~ignore_bits; guest_efer |= kvm_host.efer & ignore_bits; vmx->guest_uret_msrs[i].data = guest_efer; vmx->guest_uret_msrs[i].mask = ~ignore_bits; return true; } #ifdef CONFIG_X86_32 /* * On 32-bit kernels, VM exits still load the FS and GS bases from the * VMCS rather than the segment table. KVM uses this helper to figure * out the current bases to poke them into the VMCS before entry. */ static unsigned long segment_base(u16 selector) { struct desc_struct *table; unsigned long v; if (!(selector & ~SEGMENT_RPL_MASK)) return 0; table = get_current_gdt_ro(); if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) { u16 ldt_selector = kvm_read_ldt(); if (!(ldt_selector & ~SEGMENT_RPL_MASK)) return 0; table = (struct desc_struct *)segment_base(ldt_selector); } v = get_desc_base(&table[selector >> 3]); return v; } #endif static inline bool pt_can_write_msr(struct vcpu_vmx *vmx) { return vmx_pt_mode_is_host_guest() && !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); } static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base) { /* The base must be 128-byte aligned and a legal physical address. */ return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128); } static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) { u32 i; wrmsrq(MSR_IA32_RTIT_STATUS, ctx->status); wrmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); wrmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); wrmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); for (i = 0; i < addr_range; i++) { wrmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); wrmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); } } static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range) { u32 i; rdmsrq(MSR_IA32_RTIT_STATUS, ctx->status); rdmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); rdmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); rdmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); for (i = 0; i < addr_range; i++) { rdmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); rdmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); } } static void pt_guest_enter(struct vcpu_vmx *vmx) { if (vmx_pt_mode_is_system()) return; /* * GUEST_IA32_RTIT_CTL is already set in the VMCS. * Save host state before VM entry. */ rdmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { wrmsrq(MSR_IA32_RTIT_CTL, 0); pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges); pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges); } } static void pt_guest_exit(struct vcpu_vmx *vmx) { if (vmx_pt_mode_is_system()) return; if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges); pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges); } /* * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest, * i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary. */ if (vmx->pt_desc.host.ctl) wrmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); } void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, unsigned long fs_base, unsigned long gs_base) { if (unlikely(fs_sel != host->fs_sel)) { if (!(fs_sel & 7)) vmcs_write16(HOST_FS_SELECTOR, fs_sel); else vmcs_write16(HOST_FS_SELECTOR, 0); host->fs_sel = fs_sel; } if (unlikely(gs_sel != host->gs_sel)) { if (!(gs_sel & 7)) vmcs_write16(HOST_GS_SELECTOR, gs_sel); else vmcs_write16(HOST_GS_SELECTOR, 0); host->gs_sel = gs_sel; } if (unlikely(fs_base != host->fs_base)) { vmcs_writel(HOST_FS_BASE, fs_base); host->fs_base = fs_base; } if (unlikely(gs_base != host->gs_base)) { vmcs_writel(HOST_GS_BASE, gs_base); host->gs_base = gs_base; } } void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vcpu_vt *vt = to_vt(vcpu); struct vmcs_host_state *host_state; #ifdef CONFIG_X86_64 int cpu = raw_smp_processor_id(); #endif unsigned long fs_base, gs_base; u16 fs_sel, gs_sel; int i; /* * Note that guest MSRs to be saved/restored can also be changed * when guest state is loaded. This happens when guest transitions * to/from long-mode by setting MSR_EFER.LMA. */ if (!vmx->guest_uret_msrs_loaded) { vmx->guest_uret_msrs_loaded = true; for (i = 0; i < kvm_nr_uret_msrs; ++i) { if (!vmx->guest_uret_msrs[i].load_into_hardware) continue; kvm_set_user_return_msr(i, vmx->guest_uret_msrs[i].data, vmx->guest_uret_msrs[i].mask); } } if (vmx->nested.need_vmcs12_to_shadow_sync) nested_sync_vmcs12_to_shadow(vcpu); if (vt->guest_state_loaded) return; host_state = &vmx->loaded_vmcs->host_state; /* * Set host fs and gs selectors. Unfortunately, 22.2.3 does not * allow segment selectors with cpl > 0 or ti == 1. */ host_state->ldt_sel = kvm_read_ldt(); #ifdef CONFIG_X86_64 savesegment(ds, host_state->ds_sel); savesegment(es, host_state->es_sel); gs_base = cpu_kernelmode_gs_base(cpu); if (likely(is_64bit_mm(current->mm))) { current_save_fsgs(); fs_sel = current->thread.fsindex; gs_sel = current->thread.gsindex; fs_base = current->thread.fsbase; vt->msr_host_kernel_gs_base = current->thread.gsbase; } else { savesegment(fs, fs_sel); savesegment(gs, gs_sel); fs_base = read_msr(MSR_FS_BASE); vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); } wrmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #else savesegment(fs, fs_sel); savesegment(gs, gs_sel); fs_base = segment_base(fs_sel); gs_base = segment_base(gs_sel); #endif vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base); vt->guest_state_loaded = true; } static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) { struct vmcs_host_state *host_state; if (!vmx->vt.guest_state_loaded) return; host_state = &vmx->loaded_vmcs->host_state; ++vmx->vcpu.stat.host_state_reload; #ifdef CONFIG_X86_64 rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #endif if (host_state->ldt_sel || (host_state->gs_sel & 7)) { kvm_load_ldt(host_state->ldt_sel); #ifdef CONFIG_X86_64 load_gs_index(host_state->gs_sel); #else loadsegment(gs, host_state->gs_sel); #endif } if (host_state->fs_sel & 7) loadsegment(fs, host_state->fs_sel); #ifdef CONFIG_X86_64 if (unlikely(host_state->ds_sel | host_state->es_sel)) { loadsegment(ds, host_state->ds_sel); loadsegment(es, host_state->es_sel); } #endif invalidate_tss_limit(); #ifdef CONFIG_X86_64 wrmsrq(MSR_KERNEL_GS_BASE, vmx->vt.msr_host_kernel_gs_base); #endif load_fixmap_gdt(raw_smp_processor_id()); vmx->vt.guest_state_loaded = false; vmx->guest_uret_msrs_loaded = false; } #ifdef CONFIG_X86_64 static u64 vmx_read_guest_host_msr(struct vcpu_vmx *vmx, u32 msr, u64 *cache) { preempt_disable(); if (vmx->vt.guest_state_loaded) *cache = read_msr(msr); preempt_enable(); return *cache; } static void vmx_write_guest_host_msr(struct vcpu_vmx *vmx, u32 msr, u64 data, u64 *cache) { preempt_disable(); if (vmx->vt.guest_state_loaded) wrmsrns(msr, data); preempt_enable(); *cache = data; } static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) { return vmx_read_guest_host_msr(vmx, MSR_KERNEL_GS_BASE, &vmx->msr_guest_kernel_gs_base); } static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) { vmx_write_guest_host_msr(vmx, MSR_KERNEL_GS_BASE, data, &vmx->msr_guest_kernel_gs_base); } #endif static void grow_ple_window(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned int old = vmx->ple_window; vmx->ple_window = __grow_ple_window(old, ple_window, ple_window_grow, ple_window_max); if (vmx->ple_window != old) { vmx->ple_window_dirty = true; trace_kvm_ple_window_update(vcpu->vcpu_id, vmx->ple_window, old); } } static void shrink_ple_window(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned int old = vmx->ple_window; vmx->ple_window = __shrink_ple_window(old, ple_window, ple_window_shrink, ple_window); if (vmx->ple_window != old) { vmx->ple_window_dirty = true; trace_kvm_ple_window_update(vcpu->vcpu_id, vmx->ple_window, old); } } void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); bool already_loaded = vmx->loaded_vmcs->cpu == cpu; struct vmcs *prev; if (!already_loaded) { loaded_vmcs_clear(vmx->loaded_vmcs); local_irq_disable(); /* * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to * this cpu's percpu list, otherwise it may not yet be deleted * from its previous cpu's percpu list. Pairs with the * smb_wmb() in __loaded_vmcs_clear(). */ smp_rmb(); list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, &per_cpu(loaded_vmcss_on_cpu, cpu)); local_irq_enable(); } prev = per_cpu(current_vmcs, cpu); if (prev != vmx->loaded_vmcs->vmcs) { per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; vmcs_load(vmx->loaded_vmcs->vmcs); } if (!already_loaded) { void *gdt = get_current_gdt_ro(); /* * Flush all EPTP/VPID contexts, the new pCPU may have stale * TLB entries from its previous association with the vCPU. */ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); /* * Linux uses per-cpu TSS and GDT, so set these when switching * processors. See 22.2.4. */ vmcs_writel(HOST_TR_BASE, (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss); vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) { /* 22.2.3 */ vmcs_writel(HOST_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1)); } vmx->loaded_vmcs->cpu = cpu; } } /* * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. */ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm)) shrink_ple_window(vcpu); vmx_vcpu_load_vmcs(vcpu, cpu); vmx_vcpu_pi_load(vcpu, cpu); } void vmx_vcpu_put(struct kvm_vcpu *vcpu) { vmx_vcpu_pi_put(vcpu); vmx_prepare_switch_to_host(to_vmx(vcpu)); } bool vmx_emulation_required(struct kvm_vcpu *vcpu) { return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu); } unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long rflags, save_rflags; if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) { kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); rflags = vmcs_readl(GUEST_RFLAGS); if (vmx->rmode.vm86_active) { rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; save_rflags = vmx->rmode.save_rflags; rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; } vmx->rflags = rflags; } return vmx->rflags; } void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long old_rflags; /* * Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU * is an unrestricted guest in order to mark L2 as needing emulation * if L1 runs L2 as a restricted guest. */ if (is_unrestricted_guest(vcpu)) { kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); vmx->rflags = rflags; vmcs_writel(GUEST_RFLAGS, rflags); return; } old_rflags = vmx_get_rflags(vcpu); vmx->rflags = rflags; if (vmx->rmode.vm86_active) { vmx->rmode.save_rflags = rflags; rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; } vmcs_writel(GUEST_RFLAGS, rflags); if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM) vmx->vt.emulation_required = vmx_emulation_required(vcpu); } bool vmx_get_if_flag(struct kvm_vcpu *vcpu) { return vmx_get_rflags(vcpu) & X86_EFLAGS_IF; } u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) { u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); int ret = 0; if (interruptibility & GUEST_INTR_STATE_STI) ret |= KVM_X86_SHADOW_INT_STI; if (interruptibility & GUEST_INTR_STATE_MOV_SS) ret |= KVM_X86_SHADOW_INT_MOV_SS; return ret; } void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) { u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); u32 interruptibility = interruptibility_old; interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS); if (mask & KVM_X86_SHADOW_INT_MOV_SS) interruptibility |= GUEST_INTR_STATE_MOV_SS; else if (mask & KVM_X86_SHADOW_INT_STI) interruptibility |= GUEST_INTR_STATE_STI; if ((interruptibility != interruptibility_old)) vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); } static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long value; /* * Any MSR write that attempts to change bits marked reserved will * case a #GP fault. */ if (data & vmx->pt_desc.ctl_bitmask) return 1; /* * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will * result in a #GP unless the same write also clears TraceEn. */ if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) && (data & RTIT_CTL_TRACEEN) && data != vmx->pt_desc.guest.ctl) return 1; /* * WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit * and FabricEn would cause #GP, if * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0 */ if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) && !(data & RTIT_CTL_FABRIC_EN) && !intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_single_range_output)) return 1; /* * MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that * utilize encodings marked reserved will cause a #GP fault. */ value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods); if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) && !test_bit((data & RTIT_CTL_MTC_RANGE) >> RTIT_CTL_MTC_RANGE_OFFSET, &value)) return 1; value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cycle_thresholds); if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) && !test_bit((data & RTIT_CTL_CYC_THRESH) >> RTIT_CTL_CYC_THRESH_OFFSET, &value)) return 1; value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods); if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) && !test_bit((data & RTIT_CTL_PSB_FREQ) >> RTIT_CTL_PSB_FREQ_OFFSET, &value)) return 1; /* * If ADDRx_CFG is reserved or the encodings is >2 will * cause a #GP fault. */ value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET; if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2)) return 1; value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET; if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2)) return 1; value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET; if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2)) return 1; value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET; if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2)) return 1; return 0; } int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, void *insn, int insn_len) { /* * Emulation of instructions in SGX enclaves is impossible as RIP does * not point at the failing instruction, and even if it did, the code * stream is inaccessible. Inject #UD instead of exiting to userspace * so that guest userspace can't DoS the guest simply by triggering * emulation (enclaves are CPL3 only). */ if (vmx_get_exit_reason(vcpu).enclave_mode) { kvm_queue_exception(vcpu, UD_VECTOR); return X86EMUL_PROPAGATE_FAULT; } /* Check that emulation is possible during event vectoring */ if ((to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && !kvm_can_emulate_event_vectoring(emul_type)) return X86EMUL_UNHANDLEABLE_VECTORING; return X86EMUL_CONTINUE; } static int skip_emulated_instruction(struct kvm_vcpu *vcpu) { union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu); unsigned long rip, orig_rip; u32 instr_len; /* * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on * undefined behavior: Intel's SDM doesn't mandate the VMCS field be * set when EPT misconfig occurs. In practice, real hardware updates * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors * (namely Hyper-V) don't set it due to it being undefined behavior, * i.e. we end up advancing IP with some random value. */ if (!static_cpu_has(X86_FEATURE_HYPERVISOR) || exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) { instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); /* * Emulating an enclave's instructions isn't supported as KVM * cannot access the enclave's memory or its true RIP, e.g. the * vmcs.GUEST_RIP points at the exit point of the enclave, not * the RIP that actually triggered the VM-Exit. But, because * most instructions that cause VM-Exit will #UD in an enclave, * most instruction-based VM-Exits simply do not occur. * * There are a few exceptions, notably the debug instructions * INT1ICEBRK and INT3, as they are allowed in debug enclaves * and generate #DB/#BP as expected, which KVM might intercept. * But again, the CPU does the dirty work and saves an instr * length of zero so VMMs don't shoot themselves in the foot. * WARN if KVM tries to skip a non-zero length instruction on * a VM-Exit from an enclave. */ if (!instr_len) goto rip_updated; WARN_ONCE(exit_reason.enclave_mode, "skipping instruction after SGX enclave VM-Exit"); orig_rip = kvm_rip_read(vcpu); rip = orig_rip + instr_len; #ifdef CONFIG_X86_64 /* * We need to mask out the high 32 bits of RIP if not in 64-bit * mode, but just finding out that we are in 64-bit mode is * quite expensive. Only do it if there was a carry. */ if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu)) rip = (u32)rip; #endif kvm_rip_write(vcpu, rip); } else { if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP)) return 0; } rip_updated: /* skipping an emulated instruction also counts */ vmx_set_interrupt_shadow(vcpu, 0); return 1; } /* * Recognizes a pending MTF VM-exit and records the nested state for later * delivery. */ void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); if (!is_guest_mode(vcpu)) return; /* * Per the SDM, MTF takes priority over debug-trap exceptions besides * TSS T-bit traps and ICEBP (INT1). KVM doesn't emulate T-bit traps * or ICEBP (in the emulator proper), and skipping of ICEBP after an * intercepted #DB deliberately avoids single-step #DB and MTF updates * as ICEBP is higher priority than both. As instruction emulation is * completed at this point (i.e. KVM is at the instruction boundary), * any #DB exception pending delivery must be a debug-trap of lower * priority than MTF. Record the pending MTF state to be delivered in * vmx_check_nested_events(). */ if (nested_cpu_has_mtf(vmcs12) && (!vcpu->arch.exception.pending || vcpu->arch.exception.vector == DB_VECTOR) && (!vcpu->arch.exception_vmexit.pending || vcpu->arch.exception_vmexit.vector == DB_VECTOR)) { vmx->nested.mtf_pending = true; kvm_make_request(KVM_REQ_EVENT, vcpu); } else { vmx->nested.mtf_pending = false; } } int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu) { vmx_update_emulated_instruction(vcpu); return skip_emulated_instruction(vcpu); } static void vmx_clear_hlt(struct kvm_vcpu *vcpu) { /* * Ensure that we clear the HLT state in the VMCS. We don't need to * explicitly skip the instruction because if the HLT state is set, * then the instruction is already executing and RIP has already been * advanced. */ if (kvm_hlt_in_guest(vcpu->kvm) && vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT) vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); } void vmx_inject_exception(struct kvm_vcpu *vcpu) { struct kvm_queued_exception *ex = &vcpu->arch.exception; u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; struct vcpu_vmx *vmx = to_vmx(vcpu); kvm_deliver_exception_payload(vcpu, ex); if (ex->has_error_code) { /* * Despite the error code being architecturally defined as 32 * bits, and the VMCS field being 32 bits, Intel CPUs and thus * VMX don't actually supporting setting bits 31:16. Hardware * will (should) never provide a bogus error code, but AMD CPUs * do generate error codes with bits 31:16 set, and so KVM's * ABI lets userspace shove in arbitrary 32-bit values. Drop * the upper bits to avoid VM-Fail, losing information that * doesn't really exist is preferable to killing the VM. */ vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code); intr_info |= INTR_INFO_DELIVER_CODE_MASK; } if (vmx->rmode.vm86_active) { int inc_eip = 0; if (kvm_exception_is_soft(ex->vector)) inc_eip = vcpu->arch.event_exit_inst_len; kvm_inject_realmode_interrupt(vcpu, ex->vector, inc_eip); return; } WARN_ON_ONCE(vmx->vt.emulation_required); if (kvm_exception_is_soft(ex->vector)) { vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, vmx->vcpu.arch.event_exit_inst_len); intr_info |= INTR_TYPE_SOFT_EXCEPTION; } else intr_info |= INTR_TYPE_HARD_EXCEPTION; vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); vmx_clear_hlt(vcpu); } static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr, bool load_into_hardware) { struct vmx_uret_msr *uret_msr; uret_msr = vmx_find_uret_msr(vmx, msr); if (!uret_msr) return; uret_msr->load_into_hardware = load_into_hardware; } /* * Configuring user return MSRs to automatically save, load, and restore MSRs * that need to be shoved into hardware when running the guest. Note, omitting * an MSR here does _NOT_ mean it's not emulated, only that it will not be * loaded into hardware when running the guest. */ static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx) { #ifdef CONFIG_X86_64 bool load_syscall_msrs; /* * The SYSCALL MSRs are only needed on long mode guests, and only * when EFER.SCE is set. */ load_syscall_msrs = is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE); vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs); vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs); vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs); #endif vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx)); vmx_setup_uret_msr(vmx, MSR_TSC_AUX, guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDTSCP) || guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDPID)); /* * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new * kernel and old userspace. If those guests run on a tsx=off host, do * allow guests to use TSX_CTRL, but don't change the value in hardware * so that TSX remains always disabled. */ vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM)); /* * The set of MSRs to load may have changed, reload MSRs before the * next VM-Enter. */ vmx->guest_uret_msrs_loaded = false; } u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) return vmcs12->tsc_offset; return 0; } u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) && nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) return vmcs12->tsc_multiplier; return kvm_caps.default_tsc_scaling_ratio; } void vmx_write_tsc_offset(struct kvm_vcpu *vcpu) { vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); } void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu) { vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); } /* * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of * guest CPUID. Note, KVM allows userspace to set "VMX in SMX" to maintain * backwards compatibility even though KVM doesn't support emulating SMX. And * because userspace set "VMX in SMX", the guest must also be allowed to set it, * e.g. if the MSR is left unlocked and the guest does a RMW operation. */ #define KVM_SUPPORTED_FEATURE_CONTROL (FEAT_CTL_LOCKED | \ FEAT_CTL_VMX_ENABLED_INSIDE_SMX | \ FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \ FEAT_CTL_SGX_LC_ENABLED | \ FEAT_CTL_SGX_ENABLED | \ FEAT_CTL_LMCE_ENABLED) static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx, struct msr_data *msr) { uint64_t valid_bits; /* * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are * exposed to the guest. */ WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits & ~KVM_SUPPORTED_FEATURE_CONTROL); if (!msr->host_initiated && (vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED)) return false; if (msr->host_initiated) valid_bits = KVM_SUPPORTED_FEATURE_CONTROL; else valid_bits = vmx->msr_ia32_feature_control_valid_bits; return !(msr->data & ~valid_bits); } int vmx_get_feature_msr(u32 msr, u64 *data) { switch (msr) { case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: if (!nested) return 1; return vmx_get_vmx_msr(&vmcs_config.nested, msr, data); default: return KVM_MSR_RET_UNSUPPORTED; } } /* * Reads an msr value (of 'msr_info->index') into 'msr_info->data'. * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmx_uret_msr *msr; u32 index; switch (msr_info->index) { #ifdef CONFIG_X86_64 case MSR_FS_BASE: msr_info->data = vmcs_readl(GUEST_FS_BASE); break; case MSR_GS_BASE: msr_info->data = vmcs_readl(GUEST_GS_BASE); break; case MSR_KERNEL_GS_BASE: msr_info->data = vmx_read_guest_kernel_gs_base(vmx); break; #endif case MSR_EFER: return kvm_get_msr_common(vcpu, msr_info); case MSR_IA32_TSX_CTRL: if (!msr_info->host_initiated && !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) return 1; goto find_uret_msr; case MSR_IA32_UMWAIT_CONTROL: if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) return 1; msr_info->data = vmx->msr_ia32_umwait_control; break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && !guest_has_spec_ctrl_msr(vcpu)) return 1; msr_info->data = to_vmx(vcpu)->spec_ctrl; break; case MSR_IA32_SYSENTER_CS: msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); break; case MSR_IA32_SYSENTER_EIP: msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP); break; case MSR_IA32_SYSENTER_ESP: msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); break; case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported() || (!msr_info->host_initiated && !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX))) return 1; msr_info->data = vmcs_read64(GUEST_BNDCFGS); break; case MSR_IA32_MCG_EXT_CTL: if (!msr_info->host_initiated && !(vmx->msr_ia32_feature_control & FEAT_CTL_LMCE_ENABLED)) return 1; msr_info->data = vcpu->arch.mcg_ext_ctl; break; case MSR_IA32_FEAT_CTL: msr_info->data = vmx->msr_ia32_feature_control; break; case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: if (!msr_info->host_initiated && !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC)) return 1; msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0]; break; case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) return 1; if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, &msr_info->data)) return 1; #ifdef CONFIG_KVM_HYPERV /* * Enlightened VMCS v1 doesn't have certain VMCS fields but * instead of just ignoring the features, different Hyper-V * versions are either trying to use them and fail or do some * sanity checking and refuse to boot. Filter all unsupported * features out. */ if (!msr_info->host_initiated && guest_cpu_cap_has_evmcs(vcpu)) nested_evmcs_filter_control_msr(vcpu, msr_info->index, &msr_info->data); #endif break; case MSR_IA32_RTIT_CTL: if (!vmx_pt_mode_is_host_guest()) return 1; msr_info->data = vmx->pt_desc.guest.ctl; break; case MSR_IA32_RTIT_STATUS: if (!vmx_pt_mode_is_host_guest()) return 1; msr_info->data = vmx->pt_desc.guest.status; break; case MSR_IA32_RTIT_CR3_MATCH: if (!vmx_pt_mode_is_host_guest() || !intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering)) return 1; msr_info->data = vmx->pt_desc.guest.cr3_match; break; case MSR_IA32_RTIT_OUTPUT_BASE: if (!vmx_pt_mode_is_host_guest() || (!intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output) && !intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_single_range_output))) return 1; msr_info->data = vmx->pt_desc.guest.output_base; break; case MSR_IA32_RTIT_OUTPUT_MASK: if (!vmx_pt_mode_is_host_guest() || (!intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output) && !intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_single_range_output))) return 1; msr_info->data = vmx->pt_desc.guest.output_mask; break; case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; if (!vmx_pt_mode_is_host_guest() || (index >= 2 * vmx->pt_desc.num_address_ranges)) return 1; if (index % 2) msr_info->data = vmx->pt_desc.guest.addr_b[index / 2]; else msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; break; case MSR_IA32_S_CET: msr_info->data = vmcs_readl(GUEST_S_CET); break; case MSR_KVM_INTERNAL_GUEST_SSP: msr_info->data = vmcs_readl(GUEST_SSP); break; case MSR_IA32_INT_SSP_TAB: msr_info->data = vmcs_readl(GUEST_INTR_SSP_TABLE); break; case MSR_IA32_DEBUGCTLMSR: msr_info->data = vmx_guest_debugctl_read(); break; default: find_uret_msr: msr = vmx_find_uret_msr(vmx, msr_info->index); if (msr) { msr_info->data = msr->data; break; } return kvm_get_msr_common(vcpu, msr_info); } return 0; } static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu, u64 data) { #ifdef CONFIG_X86_64 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) return (u32)data; #endif return (unsigned long)data; } u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated) { u64 debugctl = 0; if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) && (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))) debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT; if ((kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT) && (host_initiated || intel_pmu_lbr_is_enabled(vcpu))) debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; if (boot_cpu_has(X86_FEATURE_RTM) && (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_RTM))) debugctl |= DEBUGCTLMSR_RTM_DEBUG; return debugctl; } bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated) { u64 invalid; invalid = data & ~vmx_get_supported_debugctl(vcpu, host_initiated); if (invalid & (DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) { kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data); invalid &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR); } return !invalid; } /* * Writes msr value into the appropriate "register". * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmx_uret_msr *msr; int ret = 0; u32 msr_index = msr_info->index; u64 data = msr_info->data; u32 index; switch (msr_index) { case MSR_EFER: ret = kvm_set_msr_common(vcpu, msr_info); break; #ifdef CONFIG_X86_64 case MSR_FS_BASE: vmx_segment_cache_clear(vmx); vmcs_writel(GUEST_FS_BASE, data); break; case MSR_GS_BASE: vmx_segment_cache_clear(vmx); vmcs_writel(GUEST_GS_BASE, data); break; case MSR_KERNEL_GS_BASE: vmx_write_guest_kernel_gs_base(vmx, data); break; case MSR_IA32_XFD: ret = kvm_set_msr_common(vcpu, msr_info); /* * Always intercepting WRMSR could incur non-negligible * overhead given xfd might be changed frequently in * guest context switch. Disable write interception * upon the first write with a non-zero value (indicating * potential usage on dynamic xfeatures). Also update * exception bitmap to trap #NM for proper virtualization * of guest xfd_err. */ if (!ret && data) { vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, MSR_TYPE_RW); vcpu->arch.xfd_no_write_intercept = true; vmx_update_exception_bitmap(vcpu); } break; #endif case MSR_IA32_SYSENTER_CS: if (is_guest_mode(vcpu)) get_vmcs12(vcpu)->guest_sysenter_cs = data; vmcs_write32(GUEST_SYSENTER_CS, data); break; case MSR_IA32_SYSENTER_EIP: if (is_guest_mode(vcpu)) { data = nested_vmx_truncate_sysenter_addr(vcpu, data); get_vmcs12(vcpu)->guest_sysenter_eip = data; } vmcs_writel(GUEST_SYSENTER_EIP, data); break; case MSR_IA32_SYSENTER_ESP: if (is_guest_mode(vcpu)) { data = nested_vmx_truncate_sysenter_addr(vcpu, data); get_vmcs12(vcpu)->guest_sysenter_esp = data; } vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_DEBUGCTLMSR: if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated)) return 1; data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated); if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) get_vmcs12(vcpu)->guest_ia32_debugctl = data; vmx_guest_debugctl_write(vcpu, data); if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event && (data & DEBUGCTLMSR_LBR)) intel_pmu_create_guest_lbr_event(vcpu); return 0; case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported() || (!msr_info->host_initiated && !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX))) return 1; if (is_noncanonical_msr_address(data & PAGE_MASK, vcpu) || (data & MSR_IA32_BNDCFGS_RSVD)) return 1; if (is_guest_mode(vcpu) && ((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) || (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS))) get_vmcs12(vcpu)->guest_bndcfgs = data; vmcs_write64(GUEST_BNDCFGS, data); break; case MSR_IA32_UMWAIT_CONTROL: if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) return 1; /* The reserved bit 1 and non-32 bit [63:32] should be zero */ if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32))) return 1; vmx->msr_ia32_umwait_control = data; break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && !guest_has_spec_ctrl_msr(vcpu)) return 1; if (kvm_spec_ctrl_test_value(data)) return 1; vmx->spec_ctrl = data; if (!data) break; /* * For non-nested: * When it's written (to non-zero) for the first time, pass * it through. * * For nested: * The handling of the MSR bitmap for L2 guests is done in * nested_vmx_prepare_msr_bitmap. We should not touch the * vmcs02.msr_bitmap here since it gets completely overwritten * in the merging. We update the vmcs01 here for L1 as well * since it will end up touching the MSR anyway now. */ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); break; case MSR_IA32_TSX_CTRL: if (!msr_info->host_initiated && !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) return 1; if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR)) return 1; goto find_uret_msr; case MSR_IA32_CR_PAT: ret = kvm_set_msr_common(vcpu, msr_info); if (ret) break; if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) get_vmcs12(vcpu)->guest_ia32_pat = data; if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) vmcs_write64(GUEST_IA32_PAT, data); break; case MSR_IA32_MCG_EXT_CTL: if ((!msr_info->host_initiated && !(to_vmx(vcpu)->msr_ia32_feature_control & FEAT_CTL_LMCE_ENABLED)) || (data & ~MCG_EXT_CTL_LMCE_EN)) return 1; vcpu->arch.mcg_ext_ctl = data; break; case MSR_IA32_FEAT_CTL: if (!is_vmx_feature_control_msr_valid(vmx, msr_info)) return 1; vmx->msr_ia32_feature_control = data; if (msr_info->host_initiated && data == 0) vmx_leave_nested(vcpu); /* SGX may be enabled/disabled by guest's firmware */ vmx_write_encls_bitmap(vcpu, NULL); break; case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: /* * On real hardware, the LE hash MSRs are writable before * the firmware sets bit 0 in MSR 0x7a ("activating" SGX), * at which point SGX related bits in IA32_FEATURE_CONTROL * become writable. * * KVM does not emulate SGX activation for simplicity, so * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL * is unlocked. This is technically not architectural * behavior, but it's close enough. */ if (!msr_info->host_initiated && (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC) || ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) && !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED)))) return 1; vmx->msr_ia32_sgxlepubkeyhash [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data; break; case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: if (!msr_info->host_initiated) return 1; /* they are read-only */ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) return 1; return vmx_set_vmx_msr(vcpu, msr_index, data); case MSR_IA32_RTIT_CTL: if (!vmx_pt_mode_is_host_guest() || vmx_rtit_ctl_check(vcpu, data) || vmx->nested.vmxon) return 1; vmcs_write64(GUEST_IA32_RTIT_CTL, data); vmx->pt_desc.guest.ctl = data; pt_update_intercept_for_msr(vcpu); break; case MSR_IA32_RTIT_STATUS: if (!pt_can_write_msr(vmx)) return 1; if (data & MSR_IA32_RTIT_STATUS_MASK) return 1; vmx->pt_desc.guest.status = data; break; case MSR_IA32_RTIT_CR3_MATCH: if (!pt_can_write_msr(vmx)) return 1; if (!intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering)) return 1; vmx->pt_desc.guest.cr3_match = data; break; case MSR_IA32_RTIT_OUTPUT_BASE: if (!pt_can_write_msr(vmx)) return 1; if (!intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output) && !intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_single_range_output)) return 1; if (!pt_output_base_valid(vcpu, data)) return 1; vmx->pt_desc.guest.output_base = data; break; case MSR_IA32_RTIT_OUTPUT_MASK: if (!pt_can_write_msr(vmx)) return 1; if (!intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output) && !intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_single_range_output)) return 1; vmx->pt_desc.guest.output_mask = data; break; case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: if (!pt_can_write_msr(vmx)) return 1; index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; if (index >= 2 * vmx->pt_desc.num_address_ranges) return 1; if (is_noncanonical_msr_address(data, vcpu)) return 1; if (index % 2) vmx->pt_desc.guest.addr_b[index / 2] = data; else vmx->pt_desc.guest.addr_a[index / 2] = data; break; case MSR_IA32_S_CET: vmcs_writel(GUEST_S_CET, data); break; case MSR_KVM_INTERNAL_GUEST_SSP: vmcs_writel(GUEST_SSP, data); break; case MSR_IA32_INT_SSP_TAB: vmcs_writel(GUEST_INTR_SSP_TABLE, data); break; case MSR_IA32_PERF_CAPABILITIES: if (data & PERF_CAP_LBR_FMT) { if ((data & PERF_CAP_LBR_FMT) != (kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT)) return 1; if (!cpuid_model_is_consistent(vcpu)) return 1; } if (data & PERF_CAP_PEBS_FORMAT) { if ((data & PERF_CAP_PEBS_MASK) != (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK)) return 1; if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DS)) return 1; if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DTES64)) return 1; if (!cpuid_model_is_consistent(vcpu)) return 1; } ret = kvm_set_msr_common(vcpu, msr_info); break; default: find_uret_msr: msr = vmx_find_uret_msr(vmx, msr_index); if (msr) ret = vmx_set_guest_uret_msr(vmx, msr, data); else ret = kvm_set_msr_common(vcpu, msr_info); } /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */ if (msr_index == MSR_IA32_ARCH_CAPABILITIES) vmx_update_fb_clear_dis(vcpu, vmx); return ret; } void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) { unsigned long guest_owned_bits; kvm_register_mark_available(vcpu, reg); switch (reg) { case VCPU_REGS_RSP: vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); break; case VCPU_REGS_RIP: vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); break; case VCPU_EXREG_PDPTR: if (enable_ept) ept_save_pdptrs(vcpu); break; case VCPU_EXREG_CR0: guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; vcpu->arch.cr0 &= ~guest_owned_bits; vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits; break; case VCPU_EXREG_CR3: /* * When intercepting CR3 loads, e.g. for shadowing paging, KVM's * CR3 is loaded into hardware, not the guest's CR3. */ if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING)) vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); break; case VCPU_EXREG_CR4: guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; vcpu->arch.cr4 &= ~guest_owned_bits; vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits; break; default: KVM_BUG_ON(1, vcpu->kvm); break; } } /* * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID * directly instead of going through cpu_has(), to ensure KVM is trapping * ENCLS whenever it's supported in hardware. It does not matter whether * the host OS supports or has enabled SGX. */ static bool cpu_has_sgx(void) { return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0)); } static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result) { u32 vmx_msr_low, vmx_msr_high; u32 ctl = ctl_min | ctl_opt; rdmsr(msr, vmx_msr_low, vmx_msr_high); ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ /* Ensure minimum (required) set of control bits are supported. */ if (ctl_min & ~ctl) return -EIO; *result = ctl; return 0; } static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) { u64 allowed; rdmsrq(msr, allowed); return ctl_opt & allowed; } #define vmx_check_entry_exit_pairs(pairs, entry_controls, exit_controls) \ ({ \ int i, r = 0; \ \ BUILD_BUG_ON(sizeof(pairs[0].entry_control) != sizeof(entry_controls)); \ BUILD_BUG_ON(sizeof(pairs[0].exit_control) != sizeof(exit_controls)); \ \ for (i = 0; i < ARRAY_SIZE(pairs); i++) { \ typeof(entry_controls) n_ctrl = pairs[i].entry_control; \ typeof(exit_controls) x_ctrl = pairs[i].exit_control; \ \ if (!(entry_controls & n_ctrl) == !(exit_controls & x_ctrl)) \ continue; \ \ pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, " \ "entry = %llx (%llx), exit = %llx (%llx)\n", \ (u64)(entry_controls & n_ctrl), (u64)n_ctrl, \ (u64)(exit_controls & x_ctrl), (u64)x_ctrl); \ \ if (error_on_inconsistent_vmcs_config) \ r = -EIO; \ \ entry_controls &= ~n_ctrl; \ exit_controls &= ~x_ctrl; \ } \ r; \ }) static int setup_vmcs_config(struct vmcs_config *vmcs_conf, struct vmx_capability *vmx_cap) { u32 _pin_based_exec_control = 0; u32 _cpu_based_exec_control = 0; u32 _cpu_based_2nd_exec_control = 0; u64 _cpu_based_3rd_exec_control = 0; u32 _vmexit_control = 0; u32 _vmentry_control = 0; u64 basic_msr; u64 misc_msr; /* * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory. * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always * intercepts writes to PAT and EFER, i.e. never enables those controls. */ struct { u32 entry_control; u32 exit_control; } const vmcs_entry_exit_pairs[] = { { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL }, { VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT }, { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER }, { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS }, { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL }, { VM_ENTRY_LOAD_CET_STATE, VM_EXIT_LOAD_CET_STATE }, }; memset(vmcs_conf, 0, sizeof(*vmcs_conf)); if (adjust_vmx_controls(KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL, KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL, MSR_IA32_VMX_PROCBASED_CTLS, &_cpu_based_exec_control)) return -EIO; if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { if (adjust_vmx_controls(KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL, KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control)) return -EIO; } if (!IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE)) _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE; #ifndef CONFIG_X86_64 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; #endif if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) _cpu_based_2nd_exec_control &= ~( SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, &vmx_cap->ept, &vmx_cap->vpid); if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) && vmx_cap->ept) { pr_warn_once("EPT CAP should not exist if not support " "1-setting enable EPT VM-execution control\n"); if (error_on_inconsistent_vmcs_config) return -EIO; vmx_cap->ept = 0; _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE; } if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && vmx_cap->vpid) { pr_warn_once("VPID CAP should not exist if not support " "1-setting enable VPID VM-execution control\n"); if (error_on_inconsistent_vmcs_config) return -EIO; vmx_cap->vpid = 0; } if (!cpu_has_sgx()) _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING; if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) _cpu_based_3rd_exec_control = adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL, MSR_IA32_VMX_PROCBASED_CTLS3); if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_EXIT_CONTROLS, KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS, MSR_IA32_VMX_EXIT_CTLS, &_vmexit_control)) return -EIO; if (adjust_vmx_controls(KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL, KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL, MSR_IA32_VMX_PINBASED_CTLS, &_pin_based_exec_control)) return -EIO; if (cpu_has_broken_vmx_preemption_timer()) _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)) _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS, KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS, MSR_IA32_VMX_ENTRY_CTLS, &_vmentry_control)) return -EIO; if (vmx_check_entry_exit_pairs(vmcs_entry_exit_pairs, _vmentry_control, _vmexit_control)) return -EIO; /* * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they * can't be used due to an errata where VM Exit may incorrectly clear * IA32_PERF_GLOBAL_CTRL[34:32]. Workaround the errata by using the * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL. */ switch (boot_cpu_data.x86_vfm) { case INTEL_NEHALEM_EP: /* AAK155 */ case INTEL_NEHALEM: /* AAP115 */ case INTEL_WESTMERE: /* AAT100 */ case INTEL_WESTMERE_EP: /* BC86,AAY89,BD102 */ case INTEL_NEHALEM_EX: /* BA97 */ _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " "does not work properly. Using workaround\n"); break; default: break; } rdmsrq(MSR_IA32_VMX_BASIC, basic_msr); /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE) return -EIO; #ifdef CONFIG_X86_64 /* * KVM expects to be able to shove all legal physical addresses into * VMCS fields for 64-bit kernels, and per the SDM, "This bit is always * 0 for processors that support Intel 64 architecture". */ if (basic_msr & VMX_BASIC_32BIT_PHYS_ADDR_ONLY) return -EIO; #endif /* Require Write-Back (WB) memory type for VMCS accesses. */ if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB) return -EIO; rdmsrq(MSR_IA32_VMX_MISC, misc_msr); vmcs_conf->basic = basic_msr; vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control; vmcs_conf->vmexit_ctrl = _vmexit_control; vmcs_conf->vmentry_ctrl = _vmentry_control; vmcs_conf->misc = misc_msr; #if IS_ENABLED(CONFIG_HYPERV) if (enlightened_vmcs) evmcs_sanitize_exec_ctrls(vmcs_conf); #endif return 0; } static bool __kvm_is_vmx_supported(void) { int cpu = smp_processor_id(); if (!(cpuid_ecx(1) & feature_bit(VMX))) { pr_err("VMX not supported by CPU %d\n", cpu); return false; } if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || !this_cpu_has(X86_FEATURE_VMX)) { pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu); return false; } return true; } static bool kvm_is_vmx_supported(void) { bool supported; migrate_disable(); supported = __kvm_is_vmx_supported(); migrate_enable(); return supported; } int vmx_check_processor_compat(void) { int cpu = raw_smp_processor_id(); struct vmcs_config vmcs_conf; struct vmx_capability vmx_cap; if (!__kvm_is_vmx_supported()) return -EIO; if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) { pr_err("Failed to setup VMCS config on CPU %d\n", cpu); return -EIO; } if (nested) nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept); if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) { pr_err("Inconsistent VMCS config on CPU %d\n", cpu); return -EIO; } return 0; } static int kvm_cpu_vmxon(u64 vmxon_pointer) { u64 msr; cr4_set_bits(X86_CR4_VMXE); asm goto("1: vmxon %[vmxon_pointer]\n\t" _ASM_EXTABLE(1b, %l[fault]) : : [vmxon_pointer] "m"(vmxon_pointer) : : fault); return 0; fault: WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr); cr4_clear_bits(X86_CR4_VMXE); return -EFAULT; } int vmx_enable_virtualization_cpu(void) { int cpu = raw_smp_processor_id(); u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); int r; if (cr4_read_shadow() & X86_CR4_VMXE) return -EBUSY; /* * This can happen if we hot-added a CPU but failed to allocate * VP assist page for it. */ if (kvm_is_using_evmcs() && !hv_get_vp_assist_page(cpu)) return -EFAULT; intel_pt_handle_vmx(1); r = kvm_cpu_vmxon(phys_addr); if (r) { intel_pt_handle_vmx(0); return r; } return 0; } static void vmclear_local_loaded_vmcss(void) { int cpu = raw_smp_processor_id(); struct loaded_vmcs *v, *n; list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), loaded_vmcss_on_cpu_link) __loaded_vmcs_clear(v); } void vmx_disable_virtualization_cpu(void) { vmclear_local_loaded_vmcss(); if (kvm_cpu_vmxoff()) kvm_spurious_fault(); hv_reset_evmcs(); intel_pt_handle_vmx(0); } struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) { int node = cpu_to_node(cpu); struct page *pages; struct vmcs *vmcs; pages = __alloc_pages_node(node, flags, 0); if (!pages) return NULL; vmcs = page_address(pages); memset(vmcs, 0, vmx_basic_vmcs_size(vmcs_config.basic)); /* KVM supports Enlightened VMCS v1 only */ if (kvm_is_using_evmcs()) vmcs->hdr.revision_id = KVM_EVMCS_VERSION; else vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic); if (shadow) vmcs->hdr.shadow_vmcs = 1; return vmcs; } void free_vmcs(struct vmcs *vmcs) { free_page((unsigned long)vmcs); } /* * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded */ void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) { if (!loaded_vmcs->vmcs) return; loaded_vmcs_clear(loaded_vmcs); free_vmcs(loaded_vmcs->vmcs); loaded_vmcs->vmcs = NULL; if (loaded_vmcs->msr_bitmap) free_page((unsigned long)loaded_vmcs->msr_bitmap); WARN_ON(loaded_vmcs->shadow_vmcs != NULL); } int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) { loaded_vmcs->vmcs = alloc_vmcs(false); if (!loaded_vmcs->vmcs) return -ENOMEM; vmcs_clear(loaded_vmcs->vmcs); loaded_vmcs->shadow_vmcs = NULL; loaded_vmcs->hv_timer_soft_disabled = false; loaded_vmcs->cpu = -1; loaded_vmcs->launched = 0; if (cpu_has_vmx_msr_bitmap()) { loaded_vmcs->msr_bitmap = (unsigned long *) __get_free_page(GFP_KERNEL_ACCOUNT); if (!loaded_vmcs->msr_bitmap) goto out_vmcs; memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); } memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state)); memset(&loaded_vmcs->controls_shadow, 0, sizeof(struct vmcs_controls_shadow)); return 0; out_vmcs: free_loaded_vmcs(loaded_vmcs); return -ENOMEM; } static void free_kvm_area(void) { int cpu; for_each_possible_cpu(cpu) { free_vmcs(per_cpu(vmxarea, cpu)); per_cpu(vmxarea, cpu) = NULL; } } static __init int alloc_kvm_area(void) { int cpu; for_each_possible_cpu(cpu) { struct vmcs *vmcs; vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL); if (!vmcs) { free_kvm_area(); return -ENOMEM; } /* * When eVMCS is enabled, alloc_vmcs_cpu() sets * vmcs->revision_id to KVM_EVMCS_VERSION instead of * revision_id reported by MSR_IA32_VMX_BASIC. * * However, even though not explicitly documented by * TLFS, VMXArea passed as VMXON argument should * still be marked with revision_id reported by * physical CPU. */ if (kvm_is_using_evmcs()) vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic); per_cpu(vmxarea, cpu) = vmcs; } return 0; } static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment *save) { if (!emulate_invalid_guest_state) { /* * CS and SS RPL should be equal during guest entry according * to VMX spec, but in reality it is not always so. Since vcpu * is in the middle of the transition from real mode to * protected mode it is safe to assume that RPL 0 is a good * default value. */ if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) save->selector &= ~SEGMENT_RPL_MASK; save->dpl = save->selector & SEGMENT_RPL_MASK; save->s = 1; } __vmx_set_segment(vcpu, save, seg); } static void enter_pmode(struct kvm_vcpu *vcpu) { unsigned long flags; struct vcpu_vmx *vmx = to_vmx(vcpu); /* * Update real mode segment cache. It may be not up-to-date if segment * register was written while vcpu was in a guest mode. */ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); vmx->rmode.vm86_active = 0; __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); flags = vmcs_readl(GUEST_RFLAGS); flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; vmcs_writel(GUEST_RFLAGS, flags); vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); vmx_update_exception_bitmap(vcpu); fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); } static void fix_rmode_seg(int seg, struct kvm_segment *save) { const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; struct kvm_segment var = *save; var.dpl = 0x3; if (seg == VCPU_SREG_CS) var.type = 0x3; if (!emulate_invalid_guest_state) { var.selector = var.base >> 4; var.base = var.base & 0xffff0; var.limit = 0xffff; var.g = 0; var.db = 0; var.present = 1; var.s = 1; var.l = 0; var.unusable = 0; var.type = 0x3; var.avl = 0; if (save->base & 0xf) pr_warn_once("segment base is not paragraph aligned " "when entering protected mode (seg=%d)", seg); } vmcs_write16(sf->selector, var.selector); vmcs_writel(sf->base, var.base); vmcs_write32(sf->limit, var.limit); vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var)); } static void enter_rmode(struct kvm_vcpu *vcpu) { unsigned long flags; struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm); /* * KVM should never use VM86 to virtualize Real Mode when L2 is active, * as using VM86 is unnecessary if unrestricted guest is enabled, and * if unrestricted guest is disabled, VM-Enter (from L1) with CR0.PG=0 * should VM-Fail and KVM should reject userspace attempts to stuff * CR0.PG=0 when L2 is active. */ WARN_ON_ONCE(is_guest_mode(vcpu)); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); vmx->rmode.vm86_active = 1; vmx_segment_cache_clear(vmx); vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr); vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); flags = vmcs_readl(GUEST_RFLAGS); vmx->rmode.save_rflags = flags; flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; vmcs_writel(GUEST_RFLAGS, flags); vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); vmx_update_exception_bitmap(vcpu); fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); } int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) { struct vcpu_vmx *vmx = to_vmx(vcpu); /* Nothing to do if hardware doesn't support EFER. */ if (!vmx_find_uret_msr(vmx, MSR_EFER)) return 0; vcpu->arch.efer = efer; #ifdef CONFIG_X86_64 if (efer & EFER_LMA) vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE); else vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE); #else if (KVM_BUG_ON(efer & EFER_LMA, vcpu->kvm)) return 1; #endif vmx_setup_uret_msrs(vmx); return 0; } #ifdef CONFIG_X86_64 static void enter_lmode(struct kvm_vcpu *vcpu) { u32 guest_tr_ar; vmx_segment_cache_clear(to_vmx(vcpu)); guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) { pr_debug_ratelimited("%s: tss fixup for long mode. \n", __func__); vmcs_write32(GUEST_TR_AR_BYTES, (guest_tr_ar & ~VMX_AR_TYPE_MASK) | VMX_AR_TYPE_BUSY_64_TSS); } vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA); } static void exit_lmode(struct kvm_vcpu *vcpu) { vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA); } #endif void vmx_flush_tlb_all(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); /* * INVEPT must be issued when EPT is enabled, irrespective of VPID, as * the CPU is not required to invalidate guest-physical mappings on * VM-Entry, even if VPID is disabled. Guest-physical mappings are * associated with the root EPT structure and not any particular VPID * (INVVPID also isn't required to invalidate guest-physical mappings). */ if (enable_ept) { ept_sync_global(); } else if (enable_vpid) { if (cpu_has_vmx_invvpid_global()) { vpid_sync_vcpu_global(); } else { vpid_sync_vcpu_single(vmx->vpid); vpid_sync_vcpu_single(vmx->nested.vpid02); } } } static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu) && nested_cpu_has_vpid(get_vmcs12(vcpu))) return nested_get_vpid02(vcpu); return to_vmx(vcpu)->vpid; } void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; u64 root_hpa = mmu->root.hpa; /* No flush required if the current context is invalid. */ if (!VALID_PAGE(root_hpa)) return; if (enable_ept) ept_sync_context(construct_eptp(vcpu, root_hpa, mmu->root_role.level)); else vpid_sync_context(vmx_get_current_vpid(vcpu)); } void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) { /* * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in * vmx_flush_tlb_guest() for an explanation of why this is ok. */ vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr); } void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu) { /* * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are * required to flush GVA->{G,H}PA mappings from the TLB if vpid is * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed), * i.e. no explicit INVVPID is necessary. */ vpid_sync_context(vmx_get_current_vpid(vcpu)); } void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR)) return; if (is_pae_paging(vcpu)) { vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]); vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]); vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]); vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]); } } void ept_save_pdptrs(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; if (WARN_ON_ONCE(!is_pae_paging(vcpu))) return; mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR); } #define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \ CPU_BASED_CR3_STORE_EXITING) bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { if (is_guest_mode(vcpu)) return nested_guest_cr0_valid(vcpu, cr0); if (to_vmx(vcpu)->nested.vmxon) return nested_host_cr0_valid(vcpu, cr0); return true; } void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long hw_cr0, old_cr0_pg; u32 tmp; old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG); hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF); if (enable_unrestricted_guest) hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; else { hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; if (!enable_ept) hw_cr0 |= X86_CR0_WP; if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) enter_pmode(vcpu); if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) enter_rmode(vcpu); } vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, hw_cr0); vcpu->arch.cr0 = cr0; kvm_register_mark_available(vcpu, VCPU_EXREG_CR0); #ifdef CONFIG_X86_64 if (vcpu->arch.efer & EFER_LME) { if (!old_cr0_pg && (cr0 & X86_CR0_PG)) enter_lmode(vcpu); else if (old_cr0_pg && !(cr0 & X86_CR0_PG)) exit_lmode(vcpu); } #endif if (enable_ept && !enable_unrestricted_guest) { /* * Ensure KVM has an up-to-date snapshot of the guest's CR3. If * the below code _enables_ CR3 exiting, vmx_cache_reg() will * (correctly) stop reading vmcs.GUEST_CR3 because it thinks * KVM's CR3 is installed. */ if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) vmx_cache_reg(vcpu, VCPU_EXREG_CR3); /* * When running with EPT but not unrestricted guest, KVM must * intercept CR3 accesses when paging is _disabled_. This is * necessary because restricted guests can't actually run with * paging disabled, and so KVM stuffs its own CR3 in order to * run the guest when identity mapped page tables. * * Do _NOT_ check the old CR0.PG, e.g. to optimize away the * update, it may be stale with respect to CR3 interception, * e.g. after nested VM-Enter. * * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or * stores to forward them to L1, even if KVM does not need to * intercept them to preserve its identity mapped page tables. */ if (!(cr0 & X86_CR0_PG)) { exec_controls_setbit(vmx, CR3_EXITING_BITS); } else if (!is_guest_mode(vcpu)) { exec_controls_clearbit(vmx, CR3_EXITING_BITS); } else { tmp = exec_controls_get(vmx); tmp &= ~CR3_EXITING_BITS; tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS; exec_controls_set(vmx, tmp); } /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */ if ((old_cr0_pg ^ cr0) & X86_CR0_PG) vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); /* * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG. */ if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG)) kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); } /* depends on vcpu->arch.cr0 to be set to a new value */ vmx->vt.emulation_required = vmx_emulation_required(vcpu); } static int vmx_get_max_ept_level(void) { if (cpu_has_vmx_ept_5levels()) return 5; return 4; } u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) { u64 eptp = VMX_EPTP_MT_WB; eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; if (enable_ept_ad_bits && (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) eptp |= VMX_EPTP_AD_ENABLE_BIT; eptp |= root_hpa; return eptp; } void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) { struct kvm *kvm = vcpu->kvm; bool update_guest_cr3 = true; unsigned long guest_cr3; u64 eptp; if (enable_ept) { eptp = construct_eptp(vcpu, root_hpa, root_level); vmcs_write64(EPT_POINTER, eptp); hv_track_root_tdp(vcpu, root_hpa); if (!enable_unrestricted_guest && !is_paging(vcpu)) guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3)) guest_cr3 = vcpu->arch.cr3; else /* vmcs.GUEST_CR3 is already up-to-date. */ update_guest_cr3 = false; vmx_ept_load_pdptrs(vcpu); } else { guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu) | kvm_get_active_cr3_lam_bits(vcpu); } if (update_guest_cr3) vmcs_writel(GUEST_CR3, guest_cr3); } bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { /* * We operate under the default treatment of SMM, so VMX cannot be * enabled under SMM. Note, whether or not VMXE is allowed at all, * i.e. is a reserved bit, is handled by common x86 code. */ if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu)) return false; if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4)) return false; return true; } void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long hw_cr4; /* * Pass through host's Machine Check Enable value to hw_cr4, which * is in force while we are in guest mode. Do not let guests control * this bit, even if host CR4.MCE == 0. */ hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE); if (enable_unrestricted_guest) hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST; else if (vmx->rmode.vm86_active) hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON; else hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON; if (vmx_umip_emulated()) { if (cr4 & X86_CR4_UMIP) { secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC); hw_cr4 &= ~X86_CR4_UMIP; } else if (!is_guest_mode(vcpu) || !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) { secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC); } } vcpu->arch.cr4 = cr4; kvm_register_mark_available(vcpu, VCPU_EXREG_CR4); if (!enable_unrestricted_guest) { if (enable_ept) { if (!is_paging(vcpu)) { hw_cr4 &= ~X86_CR4_PAE; hw_cr4 |= X86_CR4_PSE; } else if (!(cr4 & X86_CR4_PAE)) { hw_cr4 &= ~X86_CR4_PAE; } } /* * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in * hardware. To emulate this behavior, SMEP/SMAP/PKU needs * to be manually disabled when guest switches to non-paging * mode. * * If !enable_unrestricted_guest, the CPU is always running * with CR0.PG=1 and CR4 needs to be modified. * If enable_unrestricted_guest, the CPU automatically * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0. */ if (!is_paging(vcpu)) hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); } vmcs_writel(CR4_READ_SHADOW, cr4); vmcs_writel(GUEST_CR4, hw_cr4); if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) vcpu->arch.cpuid_dynamic_bits_dirty = true; } void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 ar; if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { *var = vmx->rmode.segs[seg]; if (seg == VCPU_SREG_TR || var->selector == vmx_read_guest_seg_selector(vmx, seg)) return; var->base = vmx_read_guest_seg_base(vmx, seg); var->selector = vmx_read_guest_seg_selector(vmx, seg); return; } var->base = vmx_read_guest_seg_base(vmx, seg); var->limit = vmx_read_guest_seg_limit(vmx, seg); var->selector = vmx_read_guest_seg_selector(vmx, seg); ar = vmx_read_guest_seg_ar(vmx, seg); var->unusable = (ar >> 16) & 1; var->type = ar & 15; var->s = (ar >> 4) & 1; var->dpl = (ar >> 5) & 3; /* * Some userspaces do not preserve unusable property. Since usable * segment has to be present according to VMX spec we can use present * property to amend userspace bug by making unusable segment always * nonpresent. vmx_segment_access_rights() already marks nonpresent * segment as unusable. */ var->present = !var->unusable; var->avl = (ar >> 12) & 1; var->l = (ar >> 13) & 1; var->db = (ar >> 14) & 1; var->g = (ar >> 15) & 1; } u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) { struct kvm_segment s; if (to_vmx(vcpu)->rmode.vm86_active) { vmx_get_segment(vcpu, &s, seg); return s.base; } return vmx_read_guest_seg_base(to_vmx(vcpu), seg); } static int __vmx_get_cpl(struct kvm_vcpu *vcpu, bool no_cache) { struct vcpu_vmx *vmx = to_vmx(vcpu); int ar; if (unlikely(vmx->rmode.vm86_active)) return 0; if (no_cache) ar = vmcs_read32(GUEST_SS_AR_BYTES); else ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); return VMX_AR_DPL(ar); } int vmx_get_cpl(struct kvm_vcpu *vcpu) { return __vmx_get_cpl(vcpu, false); } int vmx_get_cpl_no_cache(struct kvm_vcpu *vcpu) { return __vmx_get_cpl(vcpu, true); } static u32 vmx_segment_access_rights(struct kvm_segment *var) { u32 ar; ar = var->type & 15; ar |= (var->s & 1) << 4; ar |= (var->dpl & 3) << 5; ar |= (var->present & 1) << 7; ar |= (var->avl & 1) << 12; ar |= (var->l & 1) << 13; ar |= (var->db & 1) << 14; ar |= (var->g & 1) << 15; ar |= (var->unusable || !var->present) << 16; return ar; } void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { struct vcpu_vmx *vmx = to_vmx(vcpu); const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; vmx_segment_cache_clear(vmx); if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { vmx->rmode.segs[seg] = *var; if (seg == VCPU_SREG_TR) vmcs_write16(sf->selector, var->selector); else if (var->s) fix_rmode_seg(seg, &vmx->rmode.segs[seg]); return; } vmcs_writel(sf->base, var->base); vmcs_write32(sf->limit, var->limit); vmcs_write16(sf->selector, var->selector); /* * Fix the "Accessed" bit in AR field of segment registers for older * qemu binaries. * IA32 arch specifies that at the time of processor reset the * "Accessed" bit in the AR field of segment registers is 1. And qemu * is setting it to 0 in the userland code. This causes invalid guest * state vmexit when "unrestricted guest" mode is turned on. * Fix for this setup issue in cpu_reset is being pushed in the qemu * tree. Newer qemu binaries with that qemu fix would not need this * kvm hack. */ if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR)) var->type |= 0x1; /* Accessed */ vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); } void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { __vmx_set_segment(vcpu, var, seg); to_vmx(vcpu)->vt.emulation_required = vmx_emulation_required(vcpu); } void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) { u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS); *db = (ar >> 14) & 1; *l = (ar >> 13) & 1; } void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { dt->size = vmcs_read32(GUEST_IDTR_LIMIT); dt->address = vmcs_readl(GUEST_IDTR_BASE); } void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { vmcs_write32(GUEST_IDTR_LIMIT, dt->size); vmcs_writel(GUEST_IDTR_BASE, dt->address); } void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { dt->size = vmcs_read32(GUEST_GDTR_LIMIT); dt->address = vmcs_readl(GUEST_GDTR_BASE); } void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { vmcs_write32(GUEST_GDTR_LIMIT, dt->size); vmcs_writel(GUEST_GDTR_BASE, dt->address); } static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) { struct kvm_segment var; u32 ar; vmx_get_segment(vcpu, &var, seg); var.dpl = 0x3; if (seg == VCPU_SREG_CS) var.type = 0x3; ar = vmx_segment_access_rights(&var); if (var.base != (var.selector << 4)) return false; if (var.limit != 0xffff) return false; if (ar != 0xf3) return false; return true; } static bool code_segment_valid(struct kvm_vcpu *vcpu) { struct kvm_segment cs; unsigned int cs_rpl; vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); cs_rpl = cs.selector & SEGMENT_RPL_MASK; if (cs.unusable) return false; if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK)) return false; if (!cs.s) return false; if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) { if (cs.dpl > cs_rpl) return false; } else { if (cs.dpl != cs_rpl) return false; } if (!cs.present) return false; /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */ return true; } static bool stack_segment_valid(struct kvm_vcpu *vcpu) { struct kvm_segment ss; unsigned int ss_rpl; vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); ss_rpl = ss.selector & SEGMENT_RPL_MASK; if (ss.unusable) return true; if (ss.type != 3 && ss.type != 7) return false; if (!ss.s) return false; if (ss.dpl != ss_rpl) /* DPL != RPL */ return false; if (!ss.present) return false; return true; } static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) { struct kvm_segment var; unsigned int rpl; vmx_get_segment(vcpu, &var, seg); rpl = var.selector & SEGMENT_RPL_MASK; if (var.unusable) return true; if (!var.s) return false; if (!var.present) return false; if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) { if (var.dpl < rpl) /* DPL < RPL */ return false; } /* TODO: Add other members to kvm_segment_field to allow checking for other access * rights flags */ return true; } static bool tr_valid(struct kvm_vcpu *vcpu) { struct kvm_segment tr; vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); if (tr.unusable) return false; if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */ return false; if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ return false; if (!tr.present) return false; return true; } static bool ldtr_valid(struct kvm_vcpu *vcpu) { struct kvm_segment ldtr; vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); if (ldtr.unusable) return true; if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */ return false; if (ldtr.type != 2) return false; if (!ldtr.present) return false; return true; } static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) { struct kvm_segment cs, ss; vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); return ((cs.selector & SEGMENT_RPL_MASK) == (ss.selector & SEGMENT_RPL_MASK)); } /* * Check if guest state is valid. Returns true if valid, false if * not. * We assume that registers are always usable */ bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu) { /* real mode guest state checks */ if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) return false; if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) return false; if (!rmode_segment_valid(vcpu, VCPU_SREG_DS)) return false; if (!rmode_segment_valid(vcpu, VCPU_SREG_ES)) return false; if (!rmode_segment_valid(vcpu, VCPU_SREG_FS)) return false; if (!rmode_segment_valid(vcpu, VCPU_SREG_GS)) return false; } else { /* protected mode guest state checks */ if (!cs_ss_rpl_check(vcpu)) return false; if (!code_segment_valid(vcpu)) return false; if (!stack_segment_valid(vcpu)) return false; if (!data_segment_valid(vcpu, VCPU_SREG_DS)) return false; if (!data_segment_valid(vcpu, VCPU_SREG_ES)) return false; if (!data_segment_valid(vcpu, VCPU_SREG_FS)) return false; if (!data_segment_valid(vcpu, VCPU_SREG_GS)) return false; if (!tr_valid(vcpu)) return false; if (!ldtr_valid(vcpu)) return false; } /* TODO: * - Add checks on RIP * - Add checks on RFLAGS */ return true; } static int init_rmode_tss(struct kvm *kvm, void __user *ua) { const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); u16 data; int i; for (i = 0; i < 3; i++) { if (__copy_to_user(ua + PAGE_SIZE * i, zero_page, PAGE_SIZE)) return -EFAULT; } data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; if (__copy_to_user(ua + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16))) return -EFAULT; data = ~0; if (__copy_to_user(ua + RMODE_TSS_SIZE - 1, &data, sizeof(u8))) return -EFAULT; return 0; } static int init_rmode_identity_map(struct kvm *kvm) { struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); int i, r = 0; void __user *uaddr; u32 tmp; /* Protect kvm_vmx->ept_identity_pagetable_done. */ mutex_lock(&kvm->slots_lock); if (likely(kvm_vmx->ept_identity_pagetable_done)) goto out; if (!kvm_vmx->ept_identity_map_addr) kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; uaddr = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, kvm_vmx->ept_identity_map_addr, PAGE_SIZE); if (IS_ERR(uaddr)) { r = PTR_ERR(uaddr); goto out; } /* Set up identity-mapping pagetable for EPT in real mode */ for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) { tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) { r = -EFAULT; goto out; } } kvm_vmx->ept_identity_pagetable_done = true; out: mutex_unlock(&kvm->slots_lock); return r; } static void seg_setup(int seg) { const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; unsigned int ar; vmcs_write16(sf->selector, 0); vmcs_writel(sf->base, 0); vmcs_write32(sf->limit, 0xffff); ar = 0x93; if (seg == VCPU_SREG_CS) ar |= 0x08; /* code segment */ vmcs_write32(sf->ar_bytes, ar); } int allocate_vpid(void) { int vpid; if (!enable_vpid) return 0; spin_lock(&vmx_vpid_lock); vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); if (vpid < VMX_NR_VPIDS) __set_bit(vpid, vmx_vpid_bitmap); else vpid = 0; spin_unlock(&vmx_vpid_lock); return vpid; } void free_vpid(int vpid) { if (!enable_vpid || vpid == 0) return; spin_lock(&vmx_vpid_lock); __clear_bit(vpid, vmx_vpid_bitmap); spin_unlock(&vmx_vpid_lock); } static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx) { /* * When KVM is a nested hypervisor on top of Hyper-V and uses * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR * bitmap has changed. */ if (kvm_is_using_evmcs()) { struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs; if (evmcs->hv_enlightenments_control.msr_bitmap) evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; } vmx->nested.force_msr_bitmap_recalc = true; } void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; if (!cpu_has_vmx_msr_bitmap()) return; vmx_msr_bitmap_l01_changed(vmx); if (type & MSR_TYPE_R) { if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) vmx_clear_msr_bitmap_read(msr_bitmap, msr); else vmx_set_msr_bitmap_read(msr_bitmap, msr); } if (type & MSR_TYPE_W) { if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) vmx_clear_msr_bitmap_write(msr_bitmap, msr); else vmx_set_msr_bitmap_write(msr_bitmap, msr); } } static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) { /* * x2APIC indices for 64-bit accesses into the RDMSR and WRMSR halves * of the MSR bitmap. KVM emulates APIC registers up through 0x3f0, * i.e. MSR 0x83f, and so only needs to dynamically manipulate 64 bits. */ const int read_idx = APIC_BASE_MSR / BITS_PER_LONG_LONG; const int write_idx = read_idx + (0x800 / sizeof(u64)); struct vcpu_vmx *vmx = to_vmx(vcpu); u64 *msr_bitmap = (u64 *)vmx->vmcs01.msr_bitmap; u8 mode; if (!cpu_has_vmx_msr_bitmap() || WARN_ON_ONCE(!lapic_in_kernel(vcpu))) return; if (cpu_has_secondary_exec_ctrls() && (secondary_exec_controls_get(vmx) & SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { mode = MSR_BITMAP_MODE_X2APIC; if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) mode |= MSR_BITMAP_MODE_X2APIC_APICV; } else { mode = 0; } if (mode == vmx->x2apic_msr_bitmap_mode) return; vmx->x2apic_msr_bitmap_mode = mode; /* * Reset the bitmap for MSRs 0x800 - 0x83f. Leave AMD's uber-extended * registers (0x840 and above) intercepted, KVM doesn't support them. * Intercept all writes by default and poke holes as needed. Pass * through reads for all valid registers by default in x2APIC+APICv * mode, only the current timer count needs on-demand emulation by KVM. */ if (mode & MSR_BITMAP_MODE_X2APIC_APICV) msr_bitmap[read_idx] = ~kvm_lapic_readable_reg_mask(vcpu->arch.apic); else msr_bitmap[read_idx] = ~0ull; msr_bitmap[write_idx] = ~0ull; /* * TPR reads and writes can be virtualized even if virtual interrupt * delivery is not in use. */ vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW, !(mode & MSR_BITMAP_MODE_X2APIC)); if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW); vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); if (enable_ipiv) vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW); } } void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); u32 i; vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag); vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag); vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag); vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag); for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) { vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag); vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag); } } static void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu) { bool intercept; if (!cpu_has_vmx_msr_bitmap()) return; vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R); #ifdef CONFIG_X86_64 vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW); vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW); vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); #endif vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); if (kvm_cstate_in_guest(vcpu->kvm)) { vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R); vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); } if (kvm_aperfmperf_in_guest(vcpu->kvm)) { vmx_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R); vmx_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R); } /* PT MSRs can be passed through iff PT is exposed to the guest. */ if (vmx_pt_mode_is_host_guest()) pt_update_intercept_for_msr(vcpu); if (vcpu->arch.xfd_no_write_intercept) vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, MSR_TYPE_RW); vmx_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW, !to_vmx(vcpu)->spec_ctrl); if (kvm_cpu_cap_has(X86_FEATURE_XFD)) vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R, !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)); if (cpu_feature_enabled(X86_FEATURE_IBPB)) vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W, !guest_has_pred_cmd_msr(vcpu)); if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D)) vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W, !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) { intercept = !guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK); vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL0_SSP, MSR_TYPE_RW, intercept); vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL1_SSP, MSR_TYPE_RW, intercept); vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL2_SSP, MSR_TYPE_RW, intercept); vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL3_SSP, MSR_TYPE_RW, intercept); } if (kvm_cpu_cap_has(X86_FEATURE_SHSTK) || kvm_cpu_cap_has(X86_FEATURE_IBT)) { intercept = !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) && !guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK); vmx_set_intercept_for_msr(vcpu, MSR_IA32_U_CET, MSR_TYPE_RW, intercept); vmx_set_intercept_for_msr(vcpu, MSR_IA32_S_CET, MSR_TYPE_RW, intercept); } /* * x2APIC and LBR MSR intercepts are modified on-demand and cannot be * filtered by userspace. */ } void vmx_recalc_intercepts(struct kvm_vcpu *vcpu) { vmx_recalc_msr_intercepts(vcpu); } static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, int vector) { struct vcpu_vmx *vmx = to_vmx(vcpu); /* * DO NOT query the vCPU's vmcs12, as vmcs12 is dynamically allocated * and freed, and must not be accessed outside of vcpu->mutex. The * vCPU's cached PI NV is valid if and only if posted interrupts * enabled in its vmcs12, i.e. checking the vector also checks that * L1 has enabled posted interrupts for L2. */ if (is_guest_mode(vcpu) && vector == vmx->nested.posted_intr_nv) { /* * If a posted intr is not recognized by hardware, * we will accomplish it in the next vmentry. */ vmx->nested.pi_pending = true; kvm_make_request(KVM_REQ_EVENT, vcpu); /* * This pairs with the smp_mb_*() after setting vcpu->mode in * vcpu_enter_guest() to guarantee the vCPU sees the event * request if triggering a posted interrupt "fails" because * vcpu->mode != IN_GUEST_MODE. The extra barrier is needed as * the smb_wmb() in kvm_make_request() only ensures everything * done before making the request is visible when the request * is visible, it doesn't ensure ordering between the store to * vcpu->requests and the load from vcpu->mode. */ smp_mb__after_atomic(); /* the PIR and ON have been set by L1. */ kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR); return 0; } return -1; } /* * Send interrupt to vcpu via posted interrupt way. * 1. If target vcpu is running(non-root mode), send posted interrupt * notification to vcpu and hardware will sync PIR to vIRR atomically. * 2. If target vcpu isn't running(root mode), kick it to pick up the * interrupt from PIR in next vmentry. */ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) { struct vcpu_vt *vt = to_vt(vcpu); int r; r = vmx_deliver_nested_posted_interrupt(vcpu, vector); if (!r) return 0; /* Note, this is called iff the local APIC is in-kernel. */ if (!vcpu->arch.apic->apicv_active) return -1; __vmx_deliver_posted_interrupt(vcpu, &vt->pi_desc, vector); return 0; } void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, int trig_mode, int vector) { struct kvm_vcpu *vcpu = apic->vcpu; if (vmx_deliver_posted_interrupt(vcpu, vector)) { kvm_lapic_set_irr(vector, apic); kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); } else { trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector); } } /* * Set up the vmcs's constant host-state fields, i.e., host-state fields that * will not change in the lifetime of the guest. * Note that host-state that does change is set elsewhere. E.g., host-state * that is set differently for each CPU is set in vmx_vcpu_load(), not here. */ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) { u32 low32, high32; unsigned long tmpl; unsigned long cr0, cr3, cr4; cr0 = read_cr0(); WARN_ON(cr0 & X86_CR0_TS); vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */ /* * Save the most likely value for this task's CR3 in the VMCS. * We can't use __get_current_cr3_fast() because we're not atomic. */ cr3 = __read_cr3(); vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ vmx->loaded_vmcs->host_state.cr3 = cr3; /* Save the most likely value for this task's CR4 in the VMCS. */ cr4 = cr4_read_shadow(); vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ vmx->loaded_vmcs->host_state.cr4 = cr4; vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ #ifdef CONFIG_X86_64 /* * Load null selectors, so we can avoid reloading them in * vmx_prepare_switch_to_host(), in case userspace uses * the null selectors too (the expected case). */ vmcs_write16(HOST_DS_SELECTOR, 0); vmcs_write16(HOST_ES_SELECTOR, 0); #else vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ #endif vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ vmcs_writel(HOST_IDTR_BASE, host_idt_base); /* 22.2.4 */ vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */ rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); vmcs_write32(HOST_IA32_SYSENTER_CS, low32); /* * SYSENTER is used for 32-bit system calls on either 32-bit or * 64-bit kernels. It is always zero If neither is allowed, otherwise * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may * have already done so!). */ if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32)) vmcs_writel(HOST_IA32_SYSENTER_ESP, 0); rdmsrq(MSR_IA32_SYSENTER_EIP, tmpl); vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { rdmsr(MSR_IA32_CR_PAT, low32, high32); vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32)); } if (cpu_has_load_ia32_efer()) vmcs_write64(HOST_IA32_EFER, kvm_host.efer); /* * Supervisor shadow stack is not enabled on host side, i.e., * host IA32_S_CET.SHSTK_EN bit is guaranteed to 0 now, per SDM * description(RDSSP instruction), SSP is not readable in CPL0, * so resetting the two registers to 0s at VM-Exit does no harm * to kernel execution. When execution flow exits to userspace, * SSP is reloaded from IA32_PL3_SSP. Check SDM Vol.2A/B Chapter * 3 and 4 for details. */ if (cpu_has_load_cet_ctrl()) { vmcs_writel(HOST_S_CET, kvm_host.s_cet); vmcs_writel(HOST_SSP, 0); vmcs_writel(HOST_INTR_SSP_TABLE, 0); } } void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) { struct kvm_vcpu *vcpu = &vmx->vcpu; vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS & ~vcpu->arch.cr4_guest_rsvd_bits; if (!enable_ept) { vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS; vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS; } if (is_guest_mode(&vmx->vcpu)) vcpu->arch.cr4_guest_owned_bits &= ~get_vmcs12(vcpu)->cr4_guest_host_mask; vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits); } static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) { u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; if (!kvm_vcpu_apicv_active(&vmx->vcpu)) pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; if (!enable_vnmi) pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS; if (!enable_preemption_timer) pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; return pin_based_exec_ctrl; } static u32 vmx_get_initial_vmentry_ctrl(void) { u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; if (vmx_pt_mode_is_system()) vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | VM_ENTRY_LOAD_IA32_RTIT_CTL); /* * IA32e mode, and loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically. */ vmentry_ctrl &= ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER | VM_ENTRY_IA32E_MODE); return vmentry_ctrl; } static u32 vmx_get_initial_vmexit_ctrl(void) { u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; /* * Not used by KVM and never set in vmcs01 or vmcs02, but emulated for * nested virtualization and thus allowed to be set in vmcs12. */ vmexit_ctrl &= ~(VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER); if (vmx_pt_mode_is_system()) vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | VM_EXIT_CLEAR_IA32_RTIT_CTL); /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ return vmexit_ctrl & ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); } void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); if (is_guest_mode(vcpu)) { vmx->nested.update_vmcs01_apicv_status = true; return; } pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); secondary_exec_controls_changebit(vmx, SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY, kvm_vcpu_apicv_active(vcpu)); if (enable_ipiv) tertiary_exec_controls_changebit(vmx, TERTIARY_EXEC_IPI_VIRT, kvm_vcpu_apicv_active(vcpu)); vmx_update_msr_bitmap_x2apic(vcpu); } static u32 vmx_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_exec_ctrl; /* * Not used by KVM, but fully supported for nesting, i.e. are allowed in * vmcs12 and propagated to vmcs02 when set in vmcs12. */ exec_control &= ~(CPU_BASED_RDTSC_EXITING | CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG | CPU_BASED_PAUSE_EXITING); /* INTR_WINDOW_EXITING and NMI_WINDOW_EXITING are toggled dynamically */ exec_control &= ~(CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) exec_control &= ~CPU_BASED_MOV_DR_EXITING; if (!cpu_need_tpr_shadow(&vmx->vcpu)) exec_control &= ~CPU_BASED_TPR_SHADOW; #ifdef CONFIG_X86_64 if (exec_control & CPU_BASED_TPR_SHADOW) exec_control &= ~(CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING); else exec_control |= CPU_BASED_CR8_STORE_EXITING | CPU_BASED_CR8_LOAD_EXITING; #endif /* No need to intercept CR3 access or INVPLG when using EPT. */ if (enable_ept) exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING | CPU_BASED_INVLPG_EXITING); if (kvm_mwait_in_guest(vmx->vcpu.kvm)) exec_control &= ~(CPU_BASED_MWAIT_EXITING | CPU_BASED_MONITOR_EXITING); if (kvm_hlt_in_guest(vmx->vcpu.kvm)) exec_control &= ~CPU_BASED_HLT_EXITING; return exec_control; } static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx) { u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl; /* * IPI virtualization relies on APICv. Disable IPI virtualization if * APICv is inhibited. */ if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu)) exec_control &= ~TERTIARY_EXEC_IPI_VIRT; return exec_control; } /* * Adjust a single secondary execution control bit to intercept/allow an * instruction in the guest. This is usually done based on whether or not a * feature has been exposed to the guest in order to correctly emulate faults. */ static inline void vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control, u32 control, bool enabled, bool exiting) { /* * If the control is for an opt-in feature, clear the control if the * feature is not exposed to the guest, i.e. not enabled. If the * control is opt-out, i.e. an exiting control, clear the control if * the feature _is_ exposed to the guest, i.e. exiting/interception is * disabled for the associated instruction. Note, the caller is * responsible presetting exec_control to set all supported bits. */ if (enabled == exiting) *exec_control &= ~control; /* * Update the nested MSR settings so that a nested VMM can/can't set * controls for features that are/aren't exposed to the guest. */ if (nested && kvm_check_has_quirk(vmx->vcpu.kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) { /* * All features that can be added or removed to VMX MSRs must * be supported in the first place for nested virtualization. */ if (WARN_ON_ONCE(!(vmcs_config.nested.secondary_ctls_high & control))) enabled = false; if (enabled) vmx->nested.msrs.secondary_ctls_high |= control; else vmx->nested.msrs.secondary_ctls_high &= ~control; } } /* * Wrapper macro for the common case of adjusting a secondary execution control * based on a single guest CPUID bit, with a dedicated feature bit. This also * verifies that the control is actually supported by KVM and hardware. */ #define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \ ({ \ struct kvm_vcpu *__vcpu = &(vmx)->vcpu; \ bool __enabled; \ \ if (cpu_has_vmx_##name()) { \ __enabled = guest_cpu_cap_has(__vcpu, X86_FEATURE_##feat_name); \ vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\ __enabled, exiting); \ } \ }) /* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */ #define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \ vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false) #define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \ vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true) static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) { struct kvm_vcpu *vcpu = &vmx->vcpu; u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; if (vmx_pt_mode_is_system()) exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX); if (!cpu_need_virtualize_apic_accesses(vcpu)) exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; if (vmx->vpid == 0) exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; if (!enable_ept) { exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE; enable_unrestricted_guest = 0; } if (!enable_unrestricted_guest) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; if (kvm_pause_in_guest(vmx->vcpu.kvm)) exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; if (!kvm_vcpu_apicv_active(vcpu)) exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; /* * KVM doesn't support VMFUNC for L1, but the control is set in KVM's * base configuration as KVM emulates VMFUNC[EPTP_SWITCHING] for L2. */ exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC; /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP, * in vmx_set_cr4. */ exec_control &= ~SECONDARY_EXEC_DESC; /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD (handle_vmptrld). We can NOT enable shadow_vmcs here because we don't have yet a current VMCS12 */ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; /* * PML is enabled/disabled when dirty logging of memsmlots changes, but * it needs to be set here when dirty logging is already active, e.g. * if this vCPU was created after dirty logging was enabled. */ if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging)) exec_control &= ~SECONDARY_EXEC_ENABLE_PML; vmx_adjust_sec_exec_feature(vmx, &exec_control, xsaves, XSAVES); /* * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either * feature is exposed to the guest. This creates a virtualization hole * if both are supported in hardware but only one is exposed to the * guest, but letting the guest execute RDTSCP or RDPID when either one * is advertised is preferable to emulating the advertised instruction * in KVM on #UD, and obviously better than incorrectly injecting #UD. */ if (cpu_has_vmx_rdtscp()) { bool rdpid_or_rdtscp_enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) || guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID); vmx_adjust_secondary_exec_control(vmx, &exec_control, SECONDARY_EXEC_ENABLE_RDTSCP, rdpid_or_rdtscp_enabled, false); } vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID); vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND); vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED); vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG, ENABLE_USR_WAIT_PAUSE, false); if (!vcpu->kvm->arch.bus_lock_detection_enabled) exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION; if (!kvm_notify_vmexit_enabled(vcpu->kvm)) exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING; return exec_control; } static inline int vmx_get_pid_table_order(struct kvm *kvm) { return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table)); } static int vmx_alloc_ipiv_pid_table(struct kvm *kvm) { struct page *pages; struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); if (!irqchip_in_kernel(kvm) || !enable_ipiv) return 0; if (kvm_vmx->pid_table) return 0; pages = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, vmx_get_pid_table_order(kvm)); if (!pages) return -ENOMEM; kvm_vmx->pid_table = (void *)page_address(pages); return 0; } int vmx_vcpu_precreate(struct kvm *kvm) { return vmx_alloc_ipiv_pid_table(kvm); } #define VMX_XSS_EXIT_BITMAP 0 static void init_vmcs(struct vcpu_vmx *vmx) { struct kvm *kvm = vmx->vcpu.kvm; struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); if (nested) nested_vmx_set_vmcs_shadowing_bitmap(); if (cpu_has_vmx_msr_bitmap()) vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap)); vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */ /* Control */ pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); exec_controls_set(vmx, vmx_exec_control(vmx)); if (cpu_has_secondary_exec_ctrls()) { secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx)); if (vmx->ve_info) vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info)); } if (cpu_has_tertiary_exec_ctrls()) tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx)); if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); vmcs_write64(EOI_EXIT_BITMAP1, 0); vmcs_write64(EOI_EXIT_BITMAP2, 0); vmcs_write64(EOI_EXIT_BITMAP3, 0); vmcs_write16(GUEST_INTR_STATUS, 0); vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->vt.pi_desc))); } if (vmx_can_use_ipiv(&vmx->vcpu)) { vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table)); vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1); } if (!kvm_pause_in_guest(kvm)) { vmcs_write32(PLE_GAP, ple_gap); vmx->ple_window = ple_window; vmx->ple_window_dirty = true; } if (kvm_notify_vmexit_enabled(kvm)) vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ vmx_set_constant_host_state(vmx); vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ if (cpu_has_vmx_vmfunc()) vmcs_write64(VM_FUNCTION_CONTROL, 0); vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); vm_exit_controls_set(vmx, vmx_get_initial_vmexit_ctrl()); /* 22.2.1, 20.8.1 */ vm_entry_controls_set(vmx, vmx_get_initial_vmentry_ctrl()); vmx->vcpu.arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits); set_cr4_guest_host_mask(vmx); if (vmx->vpid != 0) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); if (cpu_has_vmx_xsaves()) vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); if (enable_pml) { vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX); } vmx_write_encls_bitmap(&vmx->vcpu, NULL); if (vmx_pt_mode_is_host_guest()) { memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc)); /* Bit[6~0] are forced to 1, writes are ignored. */ vmx->pt_desc.guest.output_mask = 0x7F; vmcs_write64(GUEST_IA32_RTIT_CTL, 0); } vmcs_write32(GUEST_SYSENTER_CS, 0); vmcs_writel(GUEST_SYSENTER_ESP, 0); vmcs_writel(GUEST_SYSENTER_EIP, 0); vmx_guest_debugctl_write(&vmx->vcpu, 0); if (cpu_has_vmx_tpr_shadow()) { vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); if (cpu_need_tpr_shadow(&vmx->vcpu)) vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, __pa(vmx->vcpu.arch.apic->regs)); vmcs_write32(TPR_THRESHOLD, 0); } vmx_setup_uret_msrs(vmx); } static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); init_vmcs(vmx); if (nested && kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs)); vcpu_setup_sgx_lepubkeyhash(vcpu); vmx->nested.posted_intr_nv = -1; vmx->nested.vmxon_ptr = INVALID_GPA; vmx->nested.current_vmptr = INVALID_GPA; #ifdef CONFIG_KVM_HYPERV vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; #endif if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) vcpu->arch.microcode_version = 0x100000000ULL; vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED; /* * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR * or POSTED_INTR_WAKEUP_VECTOR. */ vmx->vt.pi_desc.nv = POSTED_INTR_VECTOR; __pi_set_sn(&vmx->vt.pi_desc); } void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_vmx *vmx = to_vmx(vcpu); if (!init_event) __vmx_vcpu_reset(vcpu); vmx->rmode.vm86_active = 0; vmx->spec_ctrl = 0; vmx->msr_ia32_umwait_control = 0; vmx->hv_deadline_tsc = -1; kvm_set_cr8(vcpu, 0); seg_setup(VCPU_SREG_CS); vmcs_write16(GUEST_CS_SELECTOR, 0xf000); vmcs_writel(GUEST_CS_BASE, 0xffff0000ul); seg_setup(VCPU_SREG_DS); seg_setup(VCPU_SREG_ES); seg_setup(VCPU_SREG_FS); seg_setup(VCPU_SREG_GS); seg_setup(VCPU_SREG_SS); vmcs_write16(GUEST_TR_SELECTOR, 0); vmcs_writel(GUEST_TR_BASE, 0); vmcs_write32(GUEST_TR_LIMIT, 0xffff); vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); vmcs_write16(GUEST_LDTR_SELECTOR, 0); vmcs_writel(GUEST_LDTR_BASE, 0); vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); vmcs_writel(GUEST_GDTR_BASE, 0); vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); vmcs_writel(GUEST_IDTR_BASE, 0); vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); vmx_segment_cache_clear(vmx); kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS); vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0); if (kvm_mpx_supported()) vmcs_write64(GUEST_BNDCFGS, 0); vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) { vmcs_writel(GUEST_SSP, 0); vmcs_writel(GUEST_INTR_SSP_TABLE, 0); } if (kvm_cpu_cap_has(X86_FEATURE_IBT) || kvm_cpu_cap_has(X86_FEATURE_SHSTK)) vmcs_writel(GUEST_S_CET, 0); kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); vpid_sync_context(vmx->vpid); vmx_update_fb_clear_dis(vcpu, vmx); } void vmx_enable_irq_window(struct kvm_vcpu *vcpu) { exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); } void vmx_enable_nmi_window(struct kvm_vcpu *vcpu) { if (!enable_vnmi || vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { vmx_enable_irq_window(vcpu); return; } exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); } void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) { struct vcpu_vmx *vmx = to_vmx(vcpu); uint32_t intr; int irq = vcpu->arch.interrupt.nr; trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected); ++vcpu->stat.irq_injections; if (vmx->rmode.vm86_active) { int inc_eip = 0; if (vcpu->arch.interrupt.soft) inc_eip = vcpu->arch.event_exit_inst_len; kvm_inject_realmode_interrupt(vcpu, irq, inc_eip); return; } intr = irq | INTR_INFO_VALID_MASK; if (vcpu->arch.interrupt.soft) { intr |= INTR_TYPE_SOFT_INTR; vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, vmx->vcpu.arch.event_exit_inst_len); } else intr |= INTR_TYPE_EXT_INTR; vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); vmx_clear_hlt(vcpu); } void vmx_inject_nmi(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); if (!enable_vnmi) { /* * Tracking the NMI-blocked state in software is built upon * finding the next open IRQ window. This, in turn, depends on * well-behaving guests: They have to keep IRQs disabled at * least as long as the NMI handler runs. Otherwise we may * cause NMI nesting, maybe breaking the guest. But as this is * highly unlikely, we can live with the residual risk. */ vmx->loaded_vmcs->soft_vnmi_blocked = 1; vmx->loaded_vmcs->vnmi_blocked_time = 0; } ++vcpu->stat.nmi_injections; vmx->loaded_vmcs->nmi_known_unmasked = false; if (vmx->rmode.vm86_active) { kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0); return; } vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); vmx_clear_hlt(vcpu); } bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); bool masked; if (!enable_vnmi) return vmx->loaded_vmcs->soft_vnmi_blocked; if (vmx->loaded_vmcs->nmi_known_unmasked) return false; masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; vmx->loaded_vmcs->nmi_known_unmasked = !masked; return masked; } void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) { struct vcpu_vmx *vmx = to_vmx(vcpu); if (!enable_vnmi) { if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) { vmx->loaded_vmcs->soft_vnmi_blocked = masked; vmx->loaded_vmcs->vnmi_blocked_time = 0; } } else { vmx->loaded_vmcs->nmi_known_unmasked = !masked; if (masked) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); else vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); } } bool vmx_nmi_blocked(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu)) return false; if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked) return true; return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI | GUEST_INTR_STATE_NMI)); } int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) { if (to_vmx(vcpu)->nested.nested_run_pending) return -EBUSY; /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu)) return -EBUSY; return !vmx_nmi_blocked(vcpu); } bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu) { return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) || (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); } bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) return false; return __vmx_interrupt_blocked(vcpu); } int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) { if (to_vmx(vcpu)->nested.nested_run_pending) return -EBUSY; /* * An IRQ must not be injected into L2 if it's supposed to VM-Exit, * e.g. if the IRQ arrived asynchronously after checking nested events. */ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) return -EBUSY; return !vmx_interrupt_blocked(vcpu); } int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) { void __user *ret; if (enable_unrestricted_guest) return 0; mutex_lock(&kvm->slots_lock); ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr, PAGE_SIZE * 3); mutex_unlock(&kvm->slots_lock); if (IS_ERR(ret)) return PTR_ERR(ret); to_kvm_vmx(kvm)->tss_addr = addr; return init_rmode_tss(kvm, ret); } int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) { to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr; return 0; } static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) { switch (vec) { case BP_VECTOR: /* * Update instruction length as we may reinject the exception * from user space while in guest debugging mode. */ to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) return false; fallthrough; case DB_VECTOR: return !(vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)); case DE_VECTOR: case OF_VECTOR: case BR_VECTOR: case UD_VECTOR: case DF_VECTOR: case SS_VECTOR: case GP_VECTOR: case MF_VECTOR: return true; } return false; } static int handle_rmode_exception(struct kvm_vcpu *vcpu, int vec, u32 err_code) { /* * Instruction with address size override prefix opcode 0x67 * Cause the #SS fault with 0 error code in VM86 mode. */ if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) { if (kvm_emulate_instruction(vcpu, 0)) { if (vcpu->arch.halt_request) { vcpu->arch.halt_request = 0; return kvm_emulate_halt_noskip(vcpu); } return 1; } return 0; } /* * Forward all other exceptions that are valid in real mode. * FIXME: Breaks guest debugging in real mode, needs to be fixed with * the required debugging infrastructure rework. */ kvm_queue_exception(vcpu, vec); return 1; } static int handle_machine_check(struct kvm_vcpu *vcpu) { /* handled by vmx_vcpu_run() */ return 1; } /* * If the host has split lock detection disabled, then #AC is * unconditionally injected into the guest, which is the pre split lock * detection behaviour. * * If the host has split lock detection enabled then #AC is * only injected into the guest when: * - Guest CPL == 3 (user mode) * - Guest has #AC detection enabled in CR0 * - Guest EFLAGS has AC bit set */ bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu) { if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) return true; return vmx_get_cpl(vcpu) == 3 && kvm_is_cr0_bit_set(vcpu, X86_CR0_AM) && (kvm_get_rflags(vcpu) & X86_EFLAGS_AC); } static bool is_xfd_nm_fault(struct kvm_vcpu *vcpu) { return vcpu->arch.guest_fpu.fpstate->xfd && !kvm_is_cr0_bit_set(vcpu, X86_CR0_TS); } static int handle_exception_nmi(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_run *kvm_run = vcpu->run; u32 intr_info, ex_no, error_code; unsigned long cr2, dr6; u32 vect_info; vect_info = vmx->idt_vectoring_info; intr_info = vmx_get_intr_info(vcpu); /* * Machine checks are handled by handle_exception_irqoff(), or by * vmx_vcpu_run() if a #MC occurs on VM-Entry. NMIs are handled by * vmx_vcpu_enter_exit(). */ if (is_machine_check(intr_info) || is_nmi(intr_info)) return 1; /* * Queue the exception here instead of in handle_nm_fault_irqoff(). * This ensures the nested_vmx check is not skipped so vmexit can * be reflected to L1 (when it intercepts #NM) before reaching this * point. */ if (is_nm_fault(intr_info)) { kvm_queue_exception_p(vcpu, NM_VECTOR, is_xfd_nm_fault(vcpu) ? vcpu->arch.guest_fpu.xfd_err : 0); return 1; } if (is_invalid_opcode(intr_info)) return handle_ud(vcpu); if (WARN_ON_ONCE(is_ve_fault(intr_info))) { struct vmx_ve_information *ve_info = vmx->ve_info; WARN_ONCE(ve_info->exit_reason != EXIT_REASON_EPT_VIOLATION, "Unexpected #VE on VM-Exit reason 0x%x", ve_info->exit_reason); dump_vmcs(vcpu); kvm_mmu_print_sptes(vcpu, ve_info->guest_physical_address, "#VE"); return 1; } error_code = 0; if (intr_info & INTR_INFO_DELIVER_CODE_MASK) error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) { WARN_ON_ONCE(!enable_vmware_backdoor); /* * VMware backdoor emulation on #GP interception only handles * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero * error code on #GP. */ if (error_code) { kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); return 1; } return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP); } /* * The #PF with PFEC.RSVD = 1 indicates the guest is accessing * MMIO, it is better to report an internal error. * See the comments in vmx_handle_exit. */ if ((vect_info & VECTORING_INFO_VALID_MASK) && !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; vcpu->run->internal.ndata = 4; vcpu->run->internal.data[0] = vect_info; vcpu->run->internal.data[1] = intr_info; vcpu->run->internal.data[2] = error_code; vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu; return 0; } if (is_page_fault(intr_info)) { cr2 = vmx_get_exit_qual(vcpu); if (enable_ept && !vcpu->arch.apf.host_apf_flags) { /* * EPT will cause page fault only if we need to * detect illegal GPAs. */ WARN_ON_ONCE(!allow_smaller_maxphyaddr); kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code); return 1; } else return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); } ex_no = intr_info & INTR_INFO_VECTOR_MASK; if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no)) return handle_rmode_exception(vcpu, ex_no, error_code); switch (ex_no) { case DB_VECTOR: dr6 = vmx_get_exit_qual(vcpu); if (!(vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { /* * If the #DB was due to ICEBP, a.k.a. INT1, skip the * instruction. ICEBP generates a trap-like #DB, but * despite its interception control being tied to #DB, * is an instruction intercept, i.e. the VM-Exit occurs * on the ICEBP itself. Use the inner "skip" helper to * avoid single-step #DB and MTF updates, as ICEBP is * higher priority. Note, skipping ICEBP still clears * STI and MOVSS blocking. * * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS * if single-step is enabled in RFLAGS and STI or MOVSS * blocking is active, as the CPU doesn't set the bit * on VM-Exit due to #DB interception. VM-Entry has a * consistency check that a single-step #DB is pending * in this scenario as the previous instruction cannot * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV * don't modify RFLAGS), therefore the one instruction * delay when activating single-step breakpoints must * have already expired. Note, the CPU sets/clears BS * as appropriate for all other VM-Exits types. */ if (is_icebp(intr_info)) WARN_ON(!skip_emulated_instruction(vcpu)); else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) && (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS))) vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS); kvm_queue_exception_p(vcpu, DB_VECTOR, dr6); return 1; } kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW; kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); fallthrough; case BP_VECTOR: /* * Update instruction length as we may reinject #BP from * user space while in guest debugging mode. Reading it for * #DB as well causes no harm, it is not used in that case. */ vmx->vcpu.arch.event_exit_inst_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); kvm_run->exit_reason = KVM_EXIT_DEBUG; kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu); kvm_run->debug.arch.exception = ex_no; break; case AC_VECTOR: if (vmx_guest_inject_ac(vcpu)) { kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); return 1; } /* * Handle split lock. Depending on detection mode this will * either warn and disable split lock detection for this * task or force SIGBUS on it. */ if (handle_guest_split_lock(kvm_rip_read(vcpu))) return 1; fallthrough; default: kvm_run->exit_reason = KVM_EXIT_EXCEPTION; kvm_run->ex.exception = ex_no; kvm_run->ex.error_code = error_code; break; } return 0; } static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu) { ++vcpu->stat.irq_exits; return 1; } static int handle_triple_fault(struct kvm_vcpu *vcpu) { vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; vcpu->mmio_needed = 0; return 0; } static int handle_io(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; int size, in, string; unsigned port; exit_qualification = vmx_get_exit_qual(vcpu); string = (exit_qualification & 16) != 0; ++vcpu->stat.io_exits; if (string) return kvm_emulate_instruction(vcpu, 0); port = exit_qualification >> 16; size = (exit_qualification & 7) + 1; in = (exit_qualification & 8) != 0; return kvm_fast_pio(vcpu, size, port, in); } void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) { /* * Patch in the VMCALL instruction: */ hypercall[0] = 0x0f; hypercall[1] = 0x01; hypercall[2] = 0xc1; } /* called to set cr0 as appropriate for a mov-to-cr0 exit. */ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) { if (is_guest_mode(vcpu)) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); unsigned long orig_val = val; /* * We get here when L2 changed cr0 in a way that did not change * any of L1's shadowed bits (see nested_vmx_exit_handled_cr), * but did change L0 shadowed bits. So we first calculate the * effective cr0 value that L1 would like to write into the * hardware. It consists of the L2-owned bits from the new * value combined with the L1-owned bits from L1's guest_cr0. */ val = (val & ~vmcs12->cr0_guest_host_mask) | (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); if (kvm_set_cr0(vcpu, val)) return 1; vmcs_writel(CR0_READ_SHADOW, orig_val); return 0; } else { return kvm_set_cr0(vcpu, val); } } static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) { if (is_guest_mode(vcpu)) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); unsigned long orig_val = val; /* analogously to handle_set_cr0 */ val = (val & ~vmcs12->cr4_guest_host_mask) | (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask); if (kvm_set_cr4(vcpu, val)) return 1; vmcs_writel(CR4_READ_SHADOW, orig_val); return 0; } else return kvm_set_cr4(vcpu, val); } static int handle_desc(struct kvm_vcpu *vcpu) { /* * UMIP emulation relies on intercepting writes to CR4.UMIP, i.e. this * and other code needs to be updated if UMIP can be guest owned. */ BUILD_BUG_ON(KVM_POSSIBLE_CR4_GUEST_BITS & X86_CR4_UMIP); WARN_ON_ONCE(!kvm_is_cr4_bit_set(vcpu, X86_CR4_UMIP)); return kvm_emulate_instruction(vcpu, 0); } static int handle_cr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification, val; int cr; int reg; int err; int ret; exit_qualification = vmx_get_exit_qual(vcpu); cr = exit_qualification & 15; reg = (exit_qualification >> 8) & 15; switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */ val = kvm_register_read(vcpu, reg); trace_kvm_cr_write(cr, val); switch (cr) { case 0: err = handle_set_cr0(vcpu, val); return kvm_complete_insn_gp(vcpu, err); case 3: WARN_ON_ONCE(enable_unrestricted_guest); err = kvm_set_cr3(vcpu, val); return kvm_complete_insn_gp(vcpu, err); case 4: err = handle_set_cr4(vcpu, val); return kvm_complete_insn_gp(vcpu, err); case 8: { u8 cr8_prev = kvm_get_cr8(vcpu); u8 cr8 = (u8)val; err = kvm_set_cr8(vcpu, cr8); ret = kvm_complete_insn_gp(vcpu, err); if (lapic_in_kernel(vcpu)) return ret; if (cr8_prev <= cr8) return ret; /* * TODO: we might be squashing a * KVM_GUESTDBG_SINGLESTEP-triggered * KVM_EXIT_DEBUG here. */ vcpu->run->exit_reason = KVM_EXIT_SET_TPR; return 0; } } break; case 2: /* clts */ KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS"); return -EIO; case 1: /*mov from cr*/ switch (cr) { case 3: WARN_ON_ONCE(enable_unrestricted_guest); val = kvm_read_cr3(vcpu); kvm_register_write(vcpu, reg, val); trace_kvm_cr_read(cr, val); return kvm_skip_emulated_instruction(vcpu); case 8: val = kvm_get_cr8(vcpu); kvm_register_write(vcpu, reg, val); trace_kvm_cr_read(cr, val); return kvm_skip_emulated_instruction(vcpu); } break; case 3: /* lmsw */ val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; trace_kvm_cr_write(0, (kvm_read_cr0_bits(vcpu, ~0xful) | val)); kvm_lmsw(vcpu, val); return kvm_skip_emulated_instruction(vcpu); default: break; } vcpu->run->exit_reason = 0; vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n", (int)(exit_qualification >> 4) & 3, cr); return 0; } static int handle_dr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; int dr, dr7, reg; int err = 1; exit_qualification = vmx_get_exit_qual(vcpu); dr = exit_qualification & DEBUG_REG_ACCESS_NUM; /* First, if DR does not exist, trigger UD */ if (!kvm_require_dr(vcpu, dr)) return 1; if (vmx_get_cpl(vcpu) > 0) goto out; dr7 = vmcs_readl(GUEST_DR7); if (dr7 & DR7_GD) { /* * As the vm-exit takes precedence over the debug trap, we * need to emulate the latter, either for the host or the * guest debugging itself. */ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW; vcpu->run->debug.arch.dr7 = dr7; vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu); vcpu->run->debug.arch.exception = DB_VECTOR; vcpu->run->exit_reason = KVM_EXIT_DEBUG; return 0; } else { kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD); return 1; } } if (vcpu->guest_debug == 0) { exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); /* * No more DR vmexits; force a reload of the debug registers * and reenter on this instruction. The next vmexit will * retrieve the full state of the debug registers. */ vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; return 1; } reg = DEBUG_REG_ACCESS_REG(exit_qualification); if (exit_qualification & TYPE_MOV_FROM_DR) { kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr)); err = 0; } else { err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg)); } out: return kvm_complete_insn_gp(vcpu, err); } void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) { get_debugreg(vcpu->arch.db[0], 0); get_debugreg(vcpu->arch.db[1], 1); get_debugreg(vcpu->arch.db[2], 2); get_debugreg(vcpu->arch.db[3], 3); get_debugreg(vcpu->arch.dr6, 6); vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); /* * exc_debug expects dr6 to be cleared after it runs, avoid that it sees * a stale dr6 from the guest. */ set_debugreg(DR6_RESERVED, 6); } void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) { vmcs_writel(GUEST_DR7, val); } static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) { kvm_apic_update_ppr(vcpu); return 1; } static int handle_interrupt_window(struct kvm_vcpu *vcpu) { exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); kvm_make_request(KVM_REQ_EVENT, vcpu); ++vcpu->stat.irq_window_exits; return 1; } static int handle_invlpg(struct kvm_vcpu *vcpu) { unsigned long exit_qualification = vmx_get_exit_qual(vcpu); kvm_mmu_invlpg(vcpu, exit_qualification); return kvm_skip_emulated_instruction(vcpu); } static int handle_apic_access(struct kvm_vcpu *vcpu) { if (likely(fasteoi)) { unsigned long exit_qualification = vmx_get_exit_qual(vcpu); int access_type, offset; access_type = exit_qualification & APIC_ACCESS_TYPE; offset = exit_qualification & APIC_ACCESS_OFFSET; /* * Sane guest uses MOV to write EOI, with written value * not cared. So make a short-circuit here by avoiding * heavy instruction emulation. */ if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) && (offset == APIC_EOI)) { kvm_lapic_set_eoi(vcpu); return kvm_skip_emulated_instruction(vcpu); } } return kvm_emulate_instruction(vcpu, 0); } static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) { unsigned long exit_qualification = vmx_get_exit_qual(vcpu); int vector = exit_qualification & 0xff; /* EOI-induced VM exit is trap-like and thus no need to adjust IP */ kvm_apic_set_eoi_accelerated(vcpu, vector); return 1; } static int handle_apic_write(struct kvm_vcpu *vcpu) { unsigned long exit_qualification = vmx_get_exit_qual(vcpu); /* * APIC-write VM-Exit is trap-like, KVM doesn't need to advance RIP and * hardware has done any necessary aliasing, offset adjustments, etc... * for the access. I.e. the correct value has already been written to * the vAPIC page for the correct 16-byte chunk. KVM needs only to * retrieve the register value and emulate the access. */ u32 offset = exit_qualification & 0xff0; kvm_apic_write_nodecode(vcpu, offset); return 1; } static int handle_task_switch(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long exit_qualification; bool has_error_code = false; u32 error_code = 0; u16 tss_selector; int reason, type, idt_v, idt_index; idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); exit_qualification = vmx_get_exit_qual(vcpu); reason = (u32)exit_qualification >> 30; if (reason == TASK_SWITCH_GATE && idt_v) { switch (type) { case INTR_TYPE_NMI_INTR: vcpu->arch.nmi_injected = false; vmx_set_nmi_mask(vcpu, true); break; case INTR_TYPE_EXT_INTR: case INTR_TYPE_SOFT_INTR: kvm_clear_interrupt_queue(vcpu); break; case INTR_TYPE_HARD_EXCEPTION: if (vmx->idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { has_error_code = true; error_code = vmcs_read32(IDT_VECTORING_ERROR_CODE); } fallthrough; case INTR_TYPE_SOFT_EXCEPTION: kvm_clear_exception_queue(vcpu); break; default: break; } } tss_selector = exit_qualification; if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION && type != INTR_TYPE_EXT_INTR && type != INTR_TYPE_NMI_INTR)) WARN_ON(!skip_emulated_instruction(vcpu)); /* * TODO: What about debug traps on tss switch? * Are we supposed to inject them and update dr6? */ return kvm_task_switch(vcpu, tss_selector, type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason, has_error_code, error_code); } static int handle_ept_violation(struct kvm_vcpu *vcpu) { unsigned long exit_qualification = vmx_get_exit_qual(vcpu); gpa_t gpa; /* * EPT violation happened while executing iret from NMI, * "blocked by NMI" bit has to be set before next VM entry. * There are errata that may cause this bit to not be set: * AAK134, BY25. */ if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && enable_vnmi && (exit_qualification & INTR_INFO_UNBLOCK_NMI)) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); trace_kvm_page_fault(vcpu, gpa, exit_qualification); /* * Check that the GPA doesn't exceed physical memory limits, as that is * a guest page fault. We have to emulate the instruction here, because * if the illegal address is that of a paging structure, then * EPT_VIOLATION_ACC_WRITE bit is set. Alternatively, if supported we * would also use advanced VM-exit information for EPT violations to * reconstruct the page fault error code. */ if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu, gpa))) return kvm_emulate_instruction(vcpu, 0); return __vmx_handle_ept_violation(vcpu, gpa, exit_qualification); } static int handle_ept_misconfig(struct kvm_vcpu *vcpu) { gpa_t gpa; if (vmx_check_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0)) return 1; /* * A nested guest cannot optimize MMIO vmexits, because we have an * nGPA here instead of the required GPA. */ gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); if (!is_guest_mode(vcpu) && !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { trace_kvm_fast_mmio(gpa); return kvm_skip_emulated_instruction(vcpu); } return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); } static int handle_nmi_window(struct kvm_vcpu *vcpu) { if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm)) return -EIO; exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); ++vcpu->stat.nmi_window_exits; kvm_make_request(KVM_REQ_EVENT, vcpu); return 1; } /* * Returns true if emulation is required (due to the vCPU having invalid state * with unsrestricted guest mode disabled) and KVM can't faithfully emulate the * current vCPU state. */ static bool vmx_unhandleable_emulation_required(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); if (!vmx->vt.emulation_required) return false; /* * It is architecturally impossible for emulation to be required when a * nested VM-Enter is pending completion, as VM-Enter will VM-Fail if * guest state is invalid and unrestricted guest is disabled, i.e. KVM * should synthesize VM-Fail instead emulation L2 code. This path is * only reachable if userspace modifies L2 guest state after KVM has * performed the nested VM-Enter consistency checks. */ if (vmx->nested.nested_run_pending) return true; /* * KVM only supports emulating exceptions if the vCPU is in Real Mode. * If emulation is required, KVM can't perform a successful VM-Enter to * inject the exception. */ return !vmx->rmode.vm86_active && (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected); } static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); bool intr_window_requested; unsigned count = 130; intr_window_requested = exec_controls_get(vmx) & CPU_BASED_INTR_WINDOW_EXITING; while (vmx->vt.emulation_required && count-- != 0) { if (intr_window_requested && !vmx_interrupt_blocked(vcpu)) return handle_interrupt_window(&vmx->vcpu); if (kvm_test_request(KVM_REQ_EVENT, vcpu)) return 1; /* * Ensure that any updates to kvm->buses[] observed by the * previous instruction (emulated or otherwise) are also * visible to the instruction KVM is about to emulate. */ smp_rmb(); if (!kvm_emulate_instruction(vcpu, 0)) return 0; if (vmx_unhandleable_emulation_required(vcpu)) { kvm_prepare_emulation_failure_exit(vcpu); return 0; } if (vcpu->arch.halt_request) { vcpu->arch.halt_request = 0; return kvm_emulate_halt_noskip(vcpu); } /* * Note, return 1 and not 0, vcpu_run() will invoke * xfer_to_guest_mode() which will create a proper return * code. */ if (__xfer_to_guest_mode_work_pending()) return 1; } return 1; } int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu) { if (vmx_unhandleable_emulation_required(vcpu)) { kvm_prepare_emulation_failure_exit(vcpu); return 0; } return 1; } /* * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE * exiting, so only get here on cpu with PAUSE-Loop-Exiting. */ static int handle_pause(struct kvm_vcpu *vcpu) { if (!kvm_pause_in_guest(vcpu->kvm)) grow_ple_window(vcpu); /* * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting" * VM-execution control is ignored if CPL > 0. OTOH, KVM * never set PAUSE_EXITING and just set PLE if supported, * so the vcpu must be CPL=0 if it gets a PAUSE exit. */ kvm_vcpu_on_spin(vcpu, true); return kvm_skip_emulated_instruction(vcpu); } static int handle_monitor_trap(struct kvm_vcpu *vcpu) { return 1; } static int handle_invpcid(struct kvm_vcpu *vcpu) { u32 vmx_instruction_info; unsigned long type; gva_t gva; struct { u64 pcid; u64 gla; } operand; int gpr_index; if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); type = kvm_register_read(vcpu, gpr_index); /* According to the Intel instruction reference, the memory operand * is read even if it isn't needed (e.g., for type==all) */ if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), vmx_instruction_info, false, sizeof(operand), &gva)) return 1; return kvm_handle_invpcid(vcpu, type, gva); } static int handle_pml_full(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; trace_kvm_pml_full(vcpu->vcpu_id); exit_qualification = vmx_get_exit_qual(vcpu); /* * PML buffer FULL happened while executing iret from NMI, * "blocked by NMI" bit has to be set before next VM entry. */ if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && enable_vnmi && (exit_qualification & INTR_INFO_UNBLOCK_NMI)) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); /* * PML buffer already flushed at beginning of VMEXIT. Nothing to do * here.., and there's no userspace involvement needed for PML. */ return 1; } static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit) { struct vcpu_vmx *vmx = to_vmx(vcpu); /* * In the *extremely* unlikely scenario that this is a spurious VM-Exit * due to the timer expiring while it was "soft" disabled, just eat the * exit and re-enter the guest. */ if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) return EXIT_FASTPATH_REENTER_GUEST; /* * If the timer expired because KVM used it to force an immediate exit, * then mission accomplished. */ if (force_immediate_exit) return EXIT_FASTPATH_EXIT_HANDLED; /* * If L2 is active, go down the slow path as emulating the guest timer * expiration likely requires synthesizing a nested VM-Exit. */ if (is_guest_mode(vcpu)) return EXIT_FASTPATH_NONE; kvm_lapic_expired_hv_timer(vcpu); return EXIT_FASTPATH_REENTER_GUEST; } static int handle_preemption_timer(struct kvm_vcpu *vcpu) { /* * This non-fastpath handler is reached if and only if the preemption * timer was being used to emulate a guest timer while L2 is active. * All other scenarios are supposed to be handled in the fastpath. */ WARN_ON_ONCE(!is_guest_mode(vcpu)); kvm_lapic_expired_hv_timer(vcpu); return 1; } /* * When nested=0, all VMX instruction VM Exits filter here. The handlers * are overwritten by nested_vmx_hardware_setup() when nested=1. */ static int handle_vmx_instruction(struct kvm_vcpu *vcpu) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } #ifndef CONFIG_X86_SGX_KVM static int handle_encls(struct kvm_vcpu *vcpu) { /* * SGX virtualization is disabled. There is no software enable bit for * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent * the guest from executing ENCLS (when SGX is supported by hardware). */ kvm_queue_exception(vcpu, UD_VECTOR); return 1; } #endif /* CONFIG_X86_SGX_KVM */ static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) { /* * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK * VM-Exits. Unconditionally set the flag here and leave the handling to * vmx_handle_exit(). */ to_vt(vcpu)->exit_reason.bus_lock_detected = true; return 1; } static int handle_notify(struct kvm_vcpu *vcpu) { unsigned long exit_qual = vmx_get_exit_qual(vcpu); bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID; ++vcpu->stat.notify_window_exits; /* * Notify VM exit happened while executing iret from NMI, * "blocked by NMI" bit has to be set before next VM entry. */ if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI)) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER || context_invalid) { vcpu->run->exit_reason = KVM_EXIT_NOTIFY; vcpu->run->notify.flags = context_invalid ? KVM_NOTIFY_CONTEXT_INVALID : 0; return 0; } return 1; } static int vmx_get_msr_imm_reg(struct kvm_vcpu *vcpu) { return vmx_get_instr_info_reg(vmcs_read32(VMX_INSTRUCTION_INFO)); } static int handle_rdmsr_imm(struct kvm_vcpu *vcpu) { return kvm_emulate_rdmsr_imm(vcpu, vmx_get_exit_qual(vcpu), vmx_get_msr_imm_reg(vcpu)); } static int handle_wrmsr_imm(struct kvm_vcpu *vcpu) { return kvm_emulate_wrmsr_imm(vcpu, vmx_get_exit_qual(vcpu), vmx_get_msr_imm_reg(vcpu)); } /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs * to be done to userspace and return 0. */ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_EXCEPTION_NMI] = handle_exception_nmi, [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, [EXIT_REASON_NMI_WINDOW] = handle_nmi_window, [EXIT_REASON_IO_INSTRUCTION] = handle_io, [EXIT_REASON_CR_ACCESS] = handle_cr, [EXIT_REASON_DR_ACCESS] = handle_dr, [EXIT_REASON_CPUID] = kvm_emulate_cpuid, [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr, [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr, [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window, [EXIT_REASON_HLT] = kvm_emulate_halt, [EXIT_REASON_INVD] = kvm_emulate_invd, [EXIT_REASON_INVLPG] = handle_invlpg, [EXIT_REASON_RDPMC] = kvm_emulate_rdpmc, [EXIT_REASON_VMCALL] = kvm_emulate_hypercall, [EXIT_REASON_VMCLEAR] = handle_vmx_instruction, [EXIT_REASON_VMLAUNCH] = handle_vmx_instruction, [EXIT_REASON_VMPTRLD] = handle_vmx_instruction, [EXIT_REASON_VMPTRST] = handle_vmx_instruction, [EXIT_REASON_VMREAD] = handle_vmx_instruction, [EXIT_REASON_VMRESUME] = handle_vmx_instruction, [EXIT_REASON_VMWRITE] = handle_vmx_instruction, [EXIT_REASON_VMOFF] = handle_vmx_instruction, [EXIT_REASON_VMON] = handle_vmx_instruction, [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, [EXIT_REASON_APIC_ACCESS] = handle_apic_access, [EXIT_REASON_APIC_WRITE] = handle_apic_write, [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced, [EXIT_REASON_WBINVD] = kvm_emulate_wbinvd, [EXIT_REASON_XSETBV] = kvm_emulate_xsetbv, [EXIT_REASON_TASK_SWITCH] = handle_task_switch, [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, [EXIT_REASON_GDTR_IDTR] = handle_desc, [EXIT_REASON_LDTR_TR] = handle_desc, [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, [EXIT_REASON_MWAIT_INSTRUCTION] = kvm_emulate_mwait, [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap, [EXIT_REASON_MONITOR_INSTRUCTION] = kvm_emulate_monitor, [EXIT_REASON_INVEPT] = handle_vmx_instruction, [EXIT_REASON_INVVPID] = handle_vmx_instruction, [EXIT_REASON_RDRAND] = kvm_handle_invalid_op, [EXIT_REASON_RDSEED] = kvm_handle_invalid_op, [EXIT_REASON_PML_FULL] = handle_pml_full, [EXIT_REASON_INVPCID] = handle_invpcid, [EXIT_REASON_VMFUNC] = handle_vmx_instruction, [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, [EXIT_REASON_ENCLS] = handle_encls, [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit, [EXIT_REASON_NOTIFY] = handle_notify, [EXIT_REASON_MSR_READ_IMM] = handle_rdmsr_imm, [EXIT_REASON_MSR_WRITE_IMM] = handle_wrmsr_imm, }; static const int kvm_vmx_max_exit_handlers = ARRAY_SIZE(kvm_vmx_exit_handlers); void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); *reason = vmx->vt.exit_reason.full; *info1 = vmx_get_exit_qual(vcpu); if (!(vmx->vt.exit_reason.failed_vmentry)) { *info2 = vmx->idt_vectoring_info; *intr_info = vmx_get_intr_info(vcpu); if (is_exception_with_error_code(*intr_info)) *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); else *error_code = 0; } else { *info2 = 0; *intr_info = 0; *error_code = 0; } } void vmx_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code) { *intr_info = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); if (is_exception_with_error_code(*intr_info)) *error_code = vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE); else *error_code = 0; } static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) { if (vmx->pml_pg) { __free_page(vmx->pml_pg); vmx->pml_pg = NULL; } } static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u16 pml_idx, pml_tail_index; u64 *pml_buf; int i; pml_idx = vmcs_read16(GUEST_PML_INDEX); /* Do nothing if PML buffer is empty */ if (pml_idx == PML_HEAD_INDEX) return; /* * PML index always points to the next available PML buffer entity * unless PML log has just overflowed. */ pml_tail_index = (pml_idx >= PML_LOG_NR_ENTRIES) ? 0 : pml_idx + 1; /* * PML log is written backwards: the CPU first writes the entry 511 * then the entry 510, and so on. * * Read the entries in the same order they were written, to ensure that * the dirty ring is filled in the same order the CPU wrote them. */ pml_buf = page_address(vmx->pml_pg); for (i = PML_HEAD_INDEX; i >= pml_tail_index; i--) { u64 gpa; gpa = pml_buf[i]; WARN_ON(gpa & (PAGE_SIZE - 1)); kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); } /* reset PML index */ vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX); } static void vmx_dump_sel(char *name, uint32_t sel) { pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", name, vmcs_read16(sel), vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); } static void vmx_dump_dtsel(char *name, uint32_t limit) { pr_err("%s limit=0x%08x, base=0x%016lx\n", name, vmcs_read32(limit), vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); } static void vmx_dump_msrs(char *name, struct vmx_msrs *m) { unsigned int i; struct vmx_msr_entry *e; pr_err("MSR %s:\n", name); for (i = 0, e = m->val; i < m->nr; ++i, ++e) pr_err(" %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value); } void dump_vmcs(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 vmentry_ctl, vmexit_ctl; u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control; u64 tertiary_exec_control; unsigned long cr4; int efer_slot; if (!dump_invalid_vmcs) { pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n"); return; } vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); cr4 = vmcs_readl(GUEST_CR4); if (cpu_has_secondary_exec_ctrls()) secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); else secondary_exec_control = 0; if (cpu_has_tertiary_exec_ctrls()) tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL); else tertiary_exec_control = 0; pr_err("VMCS %p, last attempted VM-entry on CPU %d\n", vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu); pr_err("*** Guest State ***\n"); pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW), vmcs_readl(CR0_GUEST_HOST_MASK)); pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK)); pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3)); if (cpu_has_vmx_ept()) { pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n", vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1)); pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n", vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3)); } pr_err("RSP = 0x%016lx RIP = 0x%016lx\n", vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP)); pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n", vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7)); pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", vmcs_readl(GUEST_SYSENTER_ESP), vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP)); vmx_dump_sel("CS: ", GUEST_CS_SELECTOR); vmx_dump_sel("DS: ", GUEST_DS_SELECTOR); vmx_dump_sel("SS: ", GUEST_SS_SELECTOR); vmx_dump_sel("ES: ", GUEST_ES_SELECTOR); vmx_dump_sel("FS: ", GUEST_FS_SELECTOR); vmx_dump_sel("GS: ", GUEST_GS_SELECTOR); vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT); vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR); vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT); vmx_dump_sel("TR: ", GUEST_TR_SELECTOR); efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER); if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER) pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER)); else if (efer_slot >= 0) pr_err("EFER= 0x%016llx (autoload)\n", vmx->msr_autoload.guest.val[efer_slot].value); else if (vmentry_ctl & VM_ENTRY_IA32E_MODE) pr_err("EFER= 0x%016llx (effective)\n", vcpu->arch.efer | (EFER_LMA | EFER_LME)); else pr_err("EFER= 0x%016llx (effective)\n", vcpu->arch.efer & ~(EFER_LMA | EFER_LME)); if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT) pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT)); pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n", vmcs_read64(GUEST_IA32_DEBUGCTL), vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); if (cpu_has_load_perf_global_ctrl() && vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) pr_err("PerfGlobCtl = 0x%016llx\n", vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL)); if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS)); pr_err("Interruptibility = %08x ActivityState = %08x\n", vmcs_read32(GUEST_INTERRUPTIBILITY_INFO), vmcs_read32(GUEST_ACTIVITY_STATE)); if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) pr_err("InterruptStatus = %04x\n", vmcs_read16(GUEST_INTR_STATUS)); if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0) vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest); if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0) vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest); if (vmentry_ctl & VM_ENTRY_LOAD_CET_STATE) pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n", vmcs_readl(GUEST_S_CET), vmcs_readl(GUEST_SSP), vmcs_readl(GUEST_INTR_SSP_TABLE)); pr_err("*** Host State ***\n"); pr_err("RIP = 0x%016lx RSP = 0x%016lx\n", vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP)); pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n", vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR), vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR), vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR), vmcs_read16(HOST_TR_SELECTOR)); pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n", vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE), vmcs_readl(HOST_TR_BASE)); pr_err("GDTBase=%016lx IDTBase=%016lx\n", vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE)); pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n", vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3), vmcs_readl(HOST_CR4)); pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", vmcs_readl(HOST_IA32_SYSENTER_ESP), vmcs_read32(HOST_IA32_SYSENTER_CS), vmcs_readl(HOST_IA32_SYSENTER_EIP)); if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER) pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER)); if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT) pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT)); if (cpu_has_load_perf_global_ctrl() && vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) pr_err("PerfGlobCtl = 0x%016llx\n", vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0) vmx_dump_msrs("host autoload", &vmx->msr_autoload.host); if (vmexit_ctl & VM_EXIT_LOAD_CET_STATE) pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n", vmcs_readl(HOST_S_CET), vmcs_readl(HOST_SSP), vmcs_readl(HOST_INTR_SSP_TABLE)); pr_err("*** Control State ***\n"); pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n", cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control); pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n", pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl); pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", vmcs_read32(EXCEPTION_BITMAP), vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH)); pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n", vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE), vmcs_read32(VM_ENTRY_INSTRUCTION_LEN)); pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n", vmcs_read32(VM_EXIT_INTR_INFO), vmcs_read32(VM_EXIT_INTR_ERROR_CODE), vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); pr_err(" reason=%08x qualification=%016lx\n", vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION)); pr_err("IDTVectoring: info=%08x errcode=%08x\n", vmcs_read32(IDT_VECTORING_INFO_FIELD), vmcs_read32(IDT_VECTORING_ERROR_CODE)); pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET)); if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING) pr_err("TSC Multiplier = 0x%016llx\n", vmcs_read64(TSC_MULTIPLIER)); if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) { if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { u16 status = vmcs_read16(GUEST_INTR_STATUS); pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff); } pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR)); pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR)); } if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER)); if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) pr_err("PLE Gap=%08x Window=%08x\n", vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW)); if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID) pr_err("Virtual processor ID = 0x%04x\n", vmcs_read16(VIRTUAL_PROCESSOR_ID)); if (secondary_exec_control & SECONDARY_EXEC_EPT_VIOLATION_VE) { struct vmx_ve_information *ve_info = vmx->ve_info; u64 ve_info_pa = vmcs_read64(VE_INFORMATION_ADDRESS); /* * If KVM is dumping the VMCS, then something has gone wrong * already. Derefencing an address from the VMCS, which could * very well be corrupted, is a terrible idea. The virtual * address is known so use it. */ pr_err("VE info address = 0x%016llx%s\n", ve_info_pa, ve_info_pa == __pa(ve_info) ? "" : "(corrupted!)"); pr_err("ve_info: 0x%08x 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%04x\n", ve_info->exit_reason, ve_info->delivery, ve_info->exit_qualification, ve_info->guest_linear_address, ve_info->guest_physical_address, ve_info->eptp_index); } } /* * The guest has exited. See if we can fix it or if we need userspace * assistance. */ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) { struct vcpu_vmx *vmx = to_vmx(vcpu); union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu); u32 vectoring_info = vmx->idt_vectoring_info; u16 exit_handler_index; /* * Flush logged GPAs PML buffer, this will make dirty_bitmap more * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before * querying dirty_bitmap, we only need to kick all vcpus out of guest * mode as if vcpus is in root mode, the PML buffer must has been * flushed already. Note, PML is never enabled in hardware while * running L2. */ if (enable_pml && !is_guest_mode(vcpu)) vmx_flush_pml_buffer(vcpu); /* * KVM should never reach this point with a pending nested VM-Enter. * More specifically, short-circuiting VM-Entry to emulate L2 due to * invalid guest state should never happen as that means KVM knowingly * allowed a nested VM-Enter with an invalid vmcs12. More below. */ if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm)) return -EIO; if (is_guest_mode(vcpu)) { /* * PML is never enabled when running L2, bail immediately if a * PML full exit occurs as something is horribly wrong. */ if (exit_reason.basic == EXIT_REASON_PML_FULL) goto unexpected_vmexit; /* * The host physical addresses of some pages of guest memory * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC * Page). The CPU may write to these pages via their host * physical address while L2 is running, bypassing any * address-translation-based dirty tracking (e.g. EPT write * protection). * * Mark them dirty on every exit from L2 to prevent them from * getting out of sync with dirty tracking. */ nested_mark_vmcs12_pages_dirty(vcpu); /* * Synthesize a triple fault if L2 state is invalid. In normal * operation, nested VM-Enter rejects any attempt to enter L2 * with invalid state. However, those checks are skipped if * state is being stuffed via RSM or KVM_SET_NESTED_STATE. If * L2 state is invalid, it means either L1 modified SMRAM state * or userspace provided bad state. Synthesize TRIPLE_FAULT as * doing so is architecturally allowed in the RSM case, and is * the least awful solution for the userspace case without * risking false positives. */ if (vmx->vt.emulation_required) { nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); return 1; } if (nested_vmx_reflect_vmexit(vcpu)) return 1; } /* If guest state is invalid, start emulating. L2 is handled above. */ if (vmx->vt.emulation_required) return handle_invalid_guest_state(vcpu); if (exit_reason.failed_vmentry) { dump_vmcs(vcpu); vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason.full; vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; return 0; } if (unlikely(vmx->fail)) { dump_vmcs(vcpu); vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason = vmcs_read32(VM_INSTRUCTION_ERROR); vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; return 0; } if ((vectoring_info & VECTORING_INFO_VALID_MASK) && (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI && exit_reason.basic != EXIT_REASON_EPT_VIOLATION && exit_reason.basic != EXIT_REASON_PML_FULL && exit_reason.basic != EXIT_REASON_APIC_ACCESS && exit_reason.basic != EXIT_REASON_TASK_SWITCH && exit_reason.basic != EXIT_REASON_NOTIFY && exit_reason.basic != EXIT_REASON_EPT_MISCONFIG)) { kvm_prepare_event_vectoring_exit(vcpu, INVALID_GPA); return 0; } if (unlikely(!enable_vnmi && vmx->loaded_vmcs->soft_vnmi_blocked)) { if (!vmx_interrupt_blocked(vcpu)) { vmx->loaded_vmcs->soft_vnmi_blocked = 0; } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL && vcpu->arch.nmi_pending) { /* * This CPU don't support us in finding the end of an * NMI-blocked window if the guest runs with IRQs * disabled. So we pull the trigger after 1 s of * futile waiting, but inform the user about this. */ printk(KERN_WARNING "%s: Breaking out of NMI-blocked " "state on VCPU %d after 1 s timeout\n", __func__, vcpu->vcpu_id); vmx->loaded_vmcs->soft_vnmi_blocked = 0; } } if (exit_fastpath != EXIT_FASTPATH_NONE) return 1; if (exit_reason.basic >= kvm_vmx_max_exit_handlers) goto unexpected_vmexit; #ifdef CONFIG_MITIGATION_RETPOLINE if (exit_reason.basic == EXIT_REASON_MSR_WRITE) return kvm_emulate_wrmsr(vcpu); else if (exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM) return handle_wrmsr_imm(vcpu); else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER) return handle_preemption_timer(vcpu); else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW) return handle_interrupt_window(vcpu); else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) return handle_external_interrupt(vcpu); else if (exit_reason.basic == EXIT_REASON_HLT) return kvm_emulate_halt(vcpu); else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) return handle_ept_misconfig(vcpu); #endif exit_handler_index = array_index_nospec((u16)exit_reason.basic, kvm_vmx_max_exit_handlers); if (!kvm_vmx_exit_handlers[exit_handler_index]) goto unexpected_vmexit; return kvm_vmx_exit_handlers[exit_handler_index](vcpu); unexpected_vmexit: vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", exit_reason.full); dump_vmcs(vcpu); vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; vcpu->run->internal.ndata = 2; vcpu->run->internal.data[0] = exit_reason.full; vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; return 0; } int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) { int ret = __vmx_handle_exit(vcpu, exit_fastpath); /* * Exit to user space when bus lock detected to inform that there is * a bus lock in guest. */ if (vmx_get_exit_reason(vcpu).bus_lock_detected) { if (ret > 0) vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK; return 0; } return ret; } /* * Software based L1D cache flush which is used when microcode providing * the cache control MSR is not loaded. * * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to * flush it is required to read in 64 KiB because the replacement algorithm * is not exactly LRU. This could be sized at runtime via topology * information but as all relevant affected CPUs have 32KiB L1D cache size * there is no point in doing so. */ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) { int size = PAGE_SIZE << L1D_CACHE_ORDER; /* * This code is only executed when the flush mode is 'cond' or * 'always' */ if (static_branch_likely(&vmx_l1d_flush_cond)) { bool flush_l1d; /* * Clear the per-vcpu flush bit, it gets set again if the vCPU * is reloaded, i.e. if the vCPU is scheduled out or if KVM * exits to userspace, or if KVM reaches one of the unsafe * VMEXIT handlers, e.g. if KVM calls into the emulator. */ flush_l1d = vcpu->arch.l1tf_flush_l1d; vcpu->arch.l1tf_flush_l1d = false; /* * Clear the per-cpu flush bit, it gets set again from * the interrupt handlers. */ flush_l1d |= kvm_get_cpu_l1tf_flush_l1d(); kvm_clear_cpu_l1tf_flush_l1d(); if (!flush_l1d) return; } vcpu->stat.l1d_flush++; if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { native_wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH); return; } asm volatile( /* First ensure the pages are in the TLB */ "xorl %%eax, %%eax\n" ".Lpopulate_tlb:\n\t" "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" "addl $4096, %%eax\n\t" "cmpl %%eax, %[size]\n\t" "jne .Lpopulate_tlb\n\t" "xorl %%eax, %%eax\n\t" "cpuid\n\t" /* Now fill the cache */ "xorl %%eax, %%eax\n" ".Lfill_cache:\n" "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" "addl $64, %%eax\n\t" "cmpl %%eax, %[size]\n\t" "jne .Lfill_cache\n\t" "lfence\n" :: [flush_pages] "r" (vmx_l1d_flush_pages), [size] "r" (size) : "eax", "ebx", "ecx", "edx"); } void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); int tpr_threshold; if (is_guest_mode(vcpu) && nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) return; tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr; if (is_guest_mode(vcpu)) to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold; else vmcs_write32(TPR_THRESHOLD, tpr_threshold); } void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 sec_exec_control; if (!lapic_in_kernel(vcpu)) return; if (!flexpriority_enabled && !cpu_has_vmx_virtualize_x2apic_mode()) return; /* Postpone execution until vmcs01 is the current VMCS. */ if (is_guest_mode(vcpu)) { vmx->nested.change_vmcs01_virtual_apic_mode = true; return; } sec_exec_control = secondary_exec_controls_get(vmx); sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); switch (kvm_get_apic_mode(vcpu)) { case LAPIC_MODE_INVALID: WARN_ONCE(true, "Invalid local APIC state"); break; case LAPIC_MODE_DISABLED: break; case LAPIC_MODE_XAPIC: if (flexpriority_enabled) { sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); /* * Flush the TLB, reloading the APIC access page will * only do so if its physical address has changed, but * the guest may have inserted a non-APIC mapping into * the TLB while the APIC access page was disabled. */ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } break; case LAPIC_MODE_X2APIC: if (cpu_has_vmx_virtualize_x2apic_mode()) sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; break; } secondary_exec_controls_set(vmx, sec_exec_control); vmx_update_msr_bitmap_x2apic(vcpu); } void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) { const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT; struct kvm *kvm = vcpu->kvm; struct kvm_memslots *slots = kvm_memslots(kvm); struct kvm_memory_slot *slot; struct page *refcounted_page; unsigned long mmu_seq; kvm_pfn_t pfn; bool writable; /* Defer reload until vmcs01 is the current VMCS. */ if (is_guest_mode(vcpu)) { to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true; return; } if (!(secondary_exec_controls_get(to_vmx(vcpu)) & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) return; /* * Explicitly grab the memslot using KVM's internal slot ID to ensure * KVM doesn't unintentionally grab a userspace memslot. It _should_ * be impossible for userspace to create a memslot for the APIC when * APICv is enabled, but paranoia won't hurt in this case. */ slot = id_to_memslot(slots, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT); if (!slot || slot->flags & KVM_MEMSLOT_INVALID) return; /* * Ensure that the mmu_notifier sequence count is read before KVM * retrieves the pfn from the primary MMU. Note, the memslot is * protected by SRCU, not the mmu_notifier. Pairs with the smp_wmb() * in kvm_mmu_invalidate_end(). */ mmu_seq = kvm->mmu_invalidate_seq; smp_rmb(); /* * No need to retry if the memslot does not exist or is invalid. KVM * controls the APIC-access page memslot, and only deletes the memslot * if APICv is permanently inhibited, i.e. the memslot won't reappear. */ pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &refcounted_page); if (is_error_noslot_pfn(pfn)) return; read_lock(&vcpu->kvm->mmu_lock); if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn)) kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); else vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn)); /* * Do not pin the APIC access page in memory so that it can be freely * migrated, the MMU notifier will call us again if it is migrated or * swapped out. KVM backs the memslot with anonymous memory, the pfn * should always point at a refcounted page (if the pfn is valid). */ if (!WARN_ON_ONCE(!refcounted_page)) kvm_release_page_clean(refcounted_page); /* * No need for a manual TLB flush at this point, KVM has already done a * flush if there were SPTEs pointing at the previous page. */ read_unlock(&vcpu->kvm->mmu_lock); } void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) { u16 status; u8 old; /* * If L2 is active, defer the SVI update until vmcs01 is loaded, as SVI * is only relevant for if and only if Virtual Interrupt Delivery is * enabled in vmcs12, and if VID is enabled then L2 EOIs affect L2's * vAPIC, not L1's vAPIC. KVM must update vmcs01 on the next nested * VM-Exit, otherwise L1 with run with a stale SVI. */ if (is_guest_mode(vcpu)) { /* * KVM is supposed to forward intercepted L2 EOIs to L1 if VID * is enabled in vmcs12; as above, the EOIs affect L2's vAPIC. * Note, userspace can stuff state while L2 is active; assert * that VID is disabled if and only if the vCPU is in KVM_RUN * to avoid false positives if userspace is setting APIC state. */ WARN_ON_ONCE(vcpu->wants_to_run && nested_cpu_has_vid(get_vmcs12(vcpu))); to_vmx(vcpu)->nested.update_vmcs01_hwapic_isr = true; return; } if (max_isr == -1) max_isr = 0; status = vmcs_read16(GUEST_INTR_STATUS); old = status >> 8; if (max_isr != old) { status &= 0xff; status |= max_isr << 8; vmcs_write16(GUEST_INTR_STATUS, status); } } static void vmx_set_rvi(int vector) { u16 status; u8 old; if (vector == -1) vector = 0; status = vmcs_read16(GUEST_INTR_STATUS); old = (u8)status & 0xff; if ((u8)vector != old) { status &= ~0xff; status |= (u8)vector; vmcs_write16(GUEST_INTR_STATUS, status); } } int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) { struct vcpu_vt *vt = to_vt(vcpu); int max_irr; bool got_posted_interrupt; if (KVM_BUG_ON(!enable_apicv, vcpu->kvm)) return -EIO; if (pi_test_on(&vt->pi_desc)) { pi_clear_on(&vt->pi_desc); /* * IOMMU can write to PID.ON, so the barrier matters even on UP. * But on x86 this is just a compiler barrier anyway. */ smp_mb__after_atomic(); got_posted_interrupt = kvm_apic_update_irr(vcpu, vt->pi_desc.pir, &max_irr); } else { max_irr = kvm_lapic_find_highest_irr(vcpu); got_posted_interrupt = false; } /* * Newly recognized interrupts are injected via either virtual interrupt * delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is * disabled in two cases: * * 1) If L2 is running and the vCPU has a new pending interrupt. If L1 * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a * VM-Exit to L1. If L1 doesn't want to exit, the interrupt is injected * into L2, but KVM doesn't use virtual interrupt delivery to inject * interrupts into L2, and so KVM_REQ_EVENT is again needed. * * 2) If APICv is disabled for this vCPU, assigned devices may still * attempt to post interrupts. The posted interrupt vector will cause * a VM-Exit and the subsequent entry will call sync_pir_to_irr. */ if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu)) vmx_set_rvi(max_irr); else if (got_posted_interrupt) kvm_make_request(KVM_REQ_EVENT, vcpu); return max_irr; } void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { if (!kvm_vcpu_apicv_active(vcpu)) return; vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); } void vmx_do_interrupt_irqoff(unsigned long entry); void vmx_do_nmi_irqoff(void); static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) { /* * Save xfd_err to guest_fpu before interrupt is enabled, so the * MSR value is not clobbered by the host activity before the guest * has chance to consume it. * * Update the guest's XFD_ERR if and only if XFD is enabled, as the #NM * interception may have been caused by L1 interception. Per the SDM, * XFD_ERR is not modified for non-XFD #NM, i.e. if CR0.TS=1. * * Note, XFD_ERR is updated _before_ the #NM interception check, i.e. * unlike CR2 and DR6, the value is not a payload that is attached to * the #NM exception. */ if (is_xfd_nm_fault(vcpu)) rdmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); } static void handle_exception_irqoff(struct kvm_vcpu *vcpu, u32 intr_info) { /* if exit due to PF check for async PF */ if (is_page_fault(intr_info)) vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); /* if exit due to NM, handle before interrupts are enabled */ else if (is_nm_fault(intr_info)) handle_nm_fault_irqoff(vcpu); /* Handle machine checks before interrupts are enabled */ else if (is_machine_check(intr_info)) kvm_machine_check(); } static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu, u32 intr_info) { unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm, "unexpected VM-Exit interrupt info: 0x%x", intr_info)) return; /* * Invoke the kernel's IRQ handler for the vector. Use the FRED path * when it's available even if FRED isn't fully enabled, e.g. even if * FRED isn't supported in hardware, in order to avoid the indirect * CALL in the non-FRED path. */ kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ); if (IS_ENABLED(CONFIG_X86_FRED)) fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector); else vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector)); kvm_after_interrupt(vcpu); vcpu->arch.at_instruction_boundary = true; } void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) { if (to_vt(vcpu)->emulation_required) return; if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXTERNAL_INTERRUPT) handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu)); else if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXCEPTION_NMI) handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu)); } /* * The kvm parameter can be NULL (module initialization, or invocation before * VM creation). Be sure to check the kvm parameter before using it. */ bool vmx_has_emulated_msr(struct kvm *kvm, u32 index) { switch (index) { case MSR_IA32_SMBASE: if (!IS_ENABLED(CONFIG_KVM_SMM)) return false; /* * We cannot do SMM unless we can run the guest in big * real mode. */ return enable_unrestricted_guest || emulate_invalid_guest_state; case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: return nested; case MSR_AMD64_VIRT_SPEC_CTRL: case MSR_AMD64_TSC_RATIO: /* This is AMD only. */ return false; default: return true; } } static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { u32 exit_intr_info; bool unblock_nmi; u8 vector; bool idtv_info_valid; idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; if (enable_vnmi) { if (vmx->loaded_vmcs->nmi_known_unmasked) return; exit_intr_info = vmx_get_intr_info(&vmx->vcpu); unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; vector = exit_intr_info & INTR_INFO_VECTOR_MASK; /* * SDM 3: 27.7.1.2 (September 2008) * Re-set bit "block by NMI" before VM entry if vmexit caused by * a guest IRET fault. * SDM 3: 23.2.2 (September 2008) * Bit 12 is undefined in any of the following cases: * If the VM exit sets the valid bit in the IDT-vectoring * information field. * If the VM exit is due to a double fault. */ if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && vector != DF_VECTOR && !idtv_info_valid) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); else vmx->loaded_vmcs->nmi_known_unmasked = !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI); } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked)) vmx->loaded_vmcs->vnmi_blocked_time += ktime_to_ns(ktime_sub(ktime_get(), vmx->loaded_vmcs->entry_time)); } static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, u32 idt_vectoring_info, int instr_len_field, int error_code_field) { u8 vector; int type; bool idtv_info_valid; idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; vcpu->arch.nmi_injected = false; kvm_clear_exception_queue(vcpu); kvm_clear_interrupt_queue(vcpu); if (!idtv_info_valid) return; kvm_make_request(KVM_REQ_EVENT, vcpu); vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; switch (type) { case INTR_TYPE_NMI_INTR: vcpu->arch.nmi_injected = true; /* * SDM 3: 27.7.1.2 (September 2008) * Clear bit "block by NMI" before VM entry if a NMI * delivery faulted. */ vmx_set_nmi_mask(vcpu, false); break; case INTR_TYPE_SOFT_EXCEPTION: vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); fallthrough; case INTR_TYPE_HARD_EXCEPTION: { u32 error_code = 0; if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) error_code = vmcs_read32(error_code_field); kvm_requeue_exception(vcpu, vector, idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK, error_code); break; } case INTR_TYPE_SOFT_INTR: vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); fallthrough; case INTR_TYPE_EXT_INTR: kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR); break; default: break; } } static void vmx_complete_interrupts(struct vcpu_vmx *vmx) { __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info, VM_EXIT_INSTRUCTION_LEN, IDT_VECTORING_ERROR_CODE); } void vmx_cancel_injection(struct kvm_vcpu *vcpu) { __vmx_complete_interrupts(vcpu, vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), VM_ENTRY_INSTRUCTION_LEN, VM_ENTRY_EXCEPTION_ERROR_CODE); vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); } static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) { int i, nr_msrs; struct perf_guest_switch_msr *msrs; struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu); pmu->host_cross_mapped_mask = 0; if (pmu->pebs_enable & pmu->global_ctrl) intel_pmu_cross_mapped_check(pmu); /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */ msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu); if (!msrs) return; for (i = 0; i < nr_msrs; i++) if (msrs[i].host == msrs[i].guest) clear_atomic_switch_msr(vmx, msrs[i].msr); else add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, msrs[i].host, false); } static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 tscl; u32 delta_tsc; if (force_immediate_exit) { vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0); vmx->loaded_vmcs->hv_timer_soft_disabled = false; } else if (vmx->hv_deadline_tsc != -1) { tscl = rdtsc(); if (vmx->hv_deadline_tsc > tscl) /* set_hv_timer ensures the delta fits in 32-bits */ delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> cpu_preemption_timer_multi); else delta_tsc = 0; vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc); vmx->loaded_vmcs->hv_timer_soft_disabled = false; } else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) { vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1); vmx->loaded_vmcs->hv_timer_soft_disabled = true; } } void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) { if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) { vmx->loaded_vmcs->host_state.rsp = host_rsp; vmcs_writel(HOST_RSP, host_rsp); } } void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, unsigned int flags) { u64 hostval = this_cpu_read(x86_spec_ctrl_current); if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) return; if (flags & VMX_RUN_SAVE_SPEC_CTRL) vmx->spec_ctrl = native_rdmsrq(MSR_IA32_SPEC_CTRL); /* * If the guest/host SPEC_CTRL values differ, restore the host value. * * For legacy IBRS, the IBRS bit always needs to be written after * transitioning from a less privileged predictor mode, regardless of * whether the guest/host values differ. */ if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) || vmx->spec_ctrl != hostval) native_wrmsrq(MSR_IA32_SPEC_CTRL, hostval); barrier_nospec(); } static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu, bool force_immediate_exit) { /* * If L2 is active, some VMX preemption timer exits can be handled in * the fastpath even, all other exits must use the slow path. */ if (is_guest_mode(vcpu) && vmx_get_exit_reason(vcpu).basic != EXIT_REASON_PREEMPTION_TIMER) return EXIT_FASTPATH_NONE; switch (vmx_get_exit_reason(vcpu).basic) { case EXIT_REASON_MSR_WRITE: return handle_fastpath_wrmsr(vcpu); case EXIT_REASON_MSR_WRITE_IMM: return handle_fastpath_wrmsr_imm(vcpu, vmx_get_exit_qual(vcpu), vmx_get_msr_imm_reg(vcpu)); case EXIT_REASON_PREEMPTION_TIMER: return handle_fastpath_preemption_timer(vcpu, force_immediate_exit); case EXIT_REASON_HLT: return handle_fastpath_hlt(vcpu); case EXIT_REASON_INVD: return handle_fastpath_invd(vcpu); default: return EXIT_FASTPATH_NONE; } } noinstr void vmx_handle_nmi(struct kvm_vcpu *vcpu) { if ((u16)vmx_get_exit_reason(vcpu).basic != EXIT_REASON_EXCEPTION_NMI || !is_nmi(vmx_get_intr_info(vcpu))) return; kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); if (cpu_feature_enabled(X86_FEATURE_FRED)) fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR); else vmx_do_nmi_irqoff(); kvm_after_interrupt(vcpu); } static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, unsigned int flags) { struct vcpu_vmx *vmx = to_vmx(vcpu); guest_state_enter_irqoff(); /* * L1D Flush includes CPU buffer clear to mitigate MDS, but VERW * mitigation for MDS is done late in VMentry and is still * executed in spite of L1D Flush. This is because an extra VERW * should not matter much after the big hammer L1D Flush. * * cpu_buf_vm_clear is used when system is not vulnerable to MDS/TAA, * and is affected by MMIO Stale Data. In such cases mitigation in only * needed against an MMIO capable guest. */ if (static_branch_unlikely(&vmx_l1d_should_flush)) vmx_l1d_flush(vcpu); else if (static_branch_unlikely(&cpu_buf_vm_clear) && (flags & VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO)) x86_clear_cpu_buffers(); vmx_disable_fb_clear(vmx); if (vcpu->arch.cr2 != native_read_cr2()) native_write_cr2(vcpu->arch.cr2); vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, flags); vcpu->arch.cr2 = native_read_cr2(); vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET; vmx->idt_vectoring_info = 0; vmx_enable_fb_clear(vmx); if (unlikely(vmx->fail)) { vmx->vt.exit_reason.full = 0xdead; goto out; } vmx->vt.exit_reason.full = vmcs_read32(VM_EXIT_REASON); if (likely(!vmx_get_exit_reason(vcpu).failed_vmentry)) vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); vmx_handle_nmi(vcpu); out: guest_state_exit_irqoff(); } fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) { bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT; struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long cr3, cr4; /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!enable_vnmi && vmx->loaded_vmcs->soft_vnmi_blocked)) vmx->loaded_vmcs->entry_time = ktime_get(); /* * Don't enter VMX if guest state is invalid, let the exit handler * start emulation until we arrive back to a valid state. Synthesize a * consistency check VM-Exit due to invalid guest state and bail. */ if (unlikely(vmx->vt.emulation_required)) { vmx->fail = 0; vmx->vt.exit_reason.full = EXIT_REASON_INVALID_STATE; vmx->vt.exit_reason.failed_vmentry = 1; kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1); vmx->vt.exit_qualification = ENTRY_FAIL_DEFAULT; kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2); vmx->vt.exit_intr_info = 0; return EXIT_FASTPATH_NONE; } trace_kvm_entry(vcpu, force_immediate_exit); if (vmx->ple_window_dirty) { vmx->ple_window_dirty = false; vmcs_write32(PLE_WINDOW, vmx->ple_window); } /* * We did this in prepare_switch_to_guest, because it needs to * be within srcu_read_lock. */ WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync); if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP)) vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP)) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); vcpu->arch.regs_dirty = 0; if (run_flags & KVM_RUN_LOAD_GUEST_DR6) set_debugreg(vcpu->arch.dr6, 6); if (run_flags & KVM_RUN_LOAD_DEBUGCTL) vmx_reload_guest_debugctl(vcpu); /* * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time * it switches back to the current->mm, which can occur in KVM context * when switching to a temporary mm to patch kernel code, e.g. if KVM * toggles a static key while handling a VM-Exit. */ cr3 = __get_current_cr3_fast(); if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { vmcs_writel(HOST_CR3, cr3); vmx->loaded_vmcs->host_state.cr3 = cr3; } cr4 = cr4_read_shadow(); if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { vmcs_writel(HOST_CR4, cr4); vmx->loaded_vmcs->host_state.cr4 = cr4; } /* When single-stepping over STI and MOV SS, we must clear the * corresponding interruptibility bits in the guest state. Otherwise * vmentry fails as it then expects bit 14 (BS) in pending debug * exceptions being set, but that's not correct for the guest debugging * case. */ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) vmx_set_interrupt_shadow(vcpu, 0); kvm_load_guest_xsave_state(vcpu); pt_guest_enter(vmx); atomic_switch_perf_msrs(vmx); if (intel_pmu_lbr_is_enabled(vcpu)) vmx_passthrough_lbr_msrs(vcpu); if (enable_preemption_timer) vmx_update_hv_timer(vcpu, force_immediate_exit); else if (force_immediate_exit) smp_send_reschedule(vcpu->cpu); kvm_wait_lapic_expire(vcpu); /* The actual VMENTER/EXIT is in the .noinstr.text section. */ vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx)); /* All fields are clean at this point */ if (kvm_is_using_evmcs()) { current_evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu); } /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ if (vcpu->arch.host_debugctl) update_debugctlmsr(vcpu->arch.host_debugctl); #ifndef CONFIG_X86_64 /* * The sysexit path does not restore ds/es, so we must set them to * a reasonable value ourselves. * * We can't defer this to vmx_prepare_switch_to_host() since that * function may be executed in interrupt context, which saves and * restore segments around it, nullifying its effect. */ loadsegment(ds, __USER_DS); loadsegment(es, __USER_DS); #endif pt_guest_exit(vmx); kvm_load_host_xsave_state(vcpu); if (is_guest_mode(vcpu)) { /* * Track VMLAUNCH/VMRESUME that have made past guest state * checking. */ if (vmx->nested.nested_run_pending && !vmx_get_exit_reason(vcpu).failed_vmentry) ++vcpu->stat.nested_run; vmx->nested.nested_run_pending = 0; } if (unlikely(vmx->fail)) return EXIT_FASTPATH_NONE; if (unlikely((u16)vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY)) kvm_machine_check(); trace_kvm_exit(vcpu, KVM_ISA_VMX); if (unlikely(vmx_get_exit_reason(vcpu).failed_vmentry)) return EXIT_FASTPATH_NONE; vmx->loaded_vmcs->launched = 1; vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit); } void vmx_vcpu_free(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); if (enable_pml) vmx_destroy_pml_buffer(vmx); free_vpid(vmx->vpid); nested_vmx_free_vcpu(vcpu); free_loaded_vmcs(vmx->loaded_vmcs); free_page((unsigned long)vmx->ve_info); } int vmx_vcpu_create(struct kvm_vcpu *vcpu) { struct vmx_uret_msr *tsx_ctrl; struct vcpu_vmx *vmx; int i, err; BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); vmx = to_vmx(vcpu); INIT_LIST_HEAD(&vmx->vt.pi_wakeup_list); err = -ENOMEM; vmx->vpid = allocate_vpid(); /* * If PML is turned on, failure on enabling PML just results in failure * of creating the vcpu, therefore we can simplify PML logic (by * avoiding dealing with cases, such as enabling PML partially on vcpus * for the guest), etc. */ if (enable_pml) { vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!vmx->pml_pg) goto free_vpid; } for (i = 0; i < kvm_nr_uret_msrs; ++i) vmx->guest_uret_msrs[i].mask = -1ull; if (boot_cpu_has(X86_FEATURE_RTM)) { /* * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception. * Keep the host value unchanged to avoid changing CPUID bits * under the host kernel's feet. */ tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); if (tsx_ctrl) tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR; } err = alloc_loaded_vmcs(&vmx->vmcs01); if (err < 0) goto free_pml; /* * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the * feature only for vmcs01, KVM currently isn't equipped to realize any * performance benefits from enabling it for vmcs02. */ if (kvm_is_using_evmcs() && (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) { struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs; evmcs->hv_enlightenments_control.msr_bitmap = 1; } vmx->loaded_vmcs = &vmx->vmcs01; if (cpu_need_virtualize_apic_accesses(vcpu)) { err = kvm_alloc_apic_access_page(vcpu->kvm); if (err) goto free_vmcs; } if (enable_ept && !enable_unrestricted_guest) { err = init_rmode_identity_map(vcpu->kvm); if (err) goto free_vmcs; } err = -ENOMEM; if (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_EPT_VIOLATION_VE) { struct page *page; BUILD_BUG_ON(sizeof(*vmx->ve_info) > PAGE_SIZE); /* ve_info must be page aligned. */ page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!page) goto free_vmcs; vmx->ve_info = page_to_virt(page); } if (vmx_can_use_ipiv(vcpu)) WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id], __pa(&vmx->vt.pi_desc) | PID_TABLE_ENTRY_VALID); return 0; free_vmcs: free_loaded_vmcs(vmx->loaded_vmcs); free_pml: vmx_destroy_pml_buffer(vmx); free_vpid: free_vpid(vmx->vpid); return err; } #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" #define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" int vmx_vm_init(struct kvm *kvm) { if (!ple_gap) kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE); if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) { switch (l1tf_mitigation) { case L1TF_MITIGATION_OFF: case L1TF_MITIGATION_FLUSH_NOWARN: /* 'I explicitly don't care' is set */ break; case L1TF_MITIGATION_AUTO: case L1TF_MITIGATION_FLUSH: case L1TF_MITIGATION_FLUSH_NOSMT: case L1TF_MITIGATION_FULL: /* * Warn upon starting the first VM in a potentially * insecure environment. */ if (sched_smt_active()) pr_warn_once(L1TF_MSG_SMT); if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER) pr_warn_once(L1TF_MSG_L1D); break; case L1TF_MITIGATION_FULL_FORCE: /* Flush is enforced */ break; } } if (enable_pml) kvm->arch.cpu_dirty_log_size = PML_LOG_NR_ENTRIES; return 0; } static inline bool vmx_ignore_guest_pat(struct kvm *kvm) { /* * Non-coherent DMA devices need the guest to flush CPU properly. * In that case it is not possible to map all guest RAM as WB, so * always trust guest PAT. */ return !kvm_arch_has_noncoherent_dma(kvm) && kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT); } u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { /* * Force UC for host MMIO regions, as allowing the guest to access MMIO * with cacheable accesses will result in Machine Checks. */ if (is_mmio) return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; /* Force WB if ignoring guest PAT */ if (vmx_ignore_guest_pat(vcpu->kvm)) return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT); } static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl) { /* * These bits in the secondary execution controls field * are dynamic, the others are mostly based on the hypervisor * architecture and the guest's CPUID. Do not touch the * dynamic bits. */ u32 mask = SECONDARY_EXEC_SHADOW_VMCS | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_DESC; u32 cur_ctl = secondary_exec_controls_get(vmx); secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask)); } /* * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits * (indicating "allowed-1") if they are supported in the guest's CPUID. */ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_cpuid_entry2 *entry; vmx->nested.msrs.cr0_fixed1 = 0xffffffff; vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE; #define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \ if (entry && (entry->_reg & (_cpuid_mask))) \ vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \ } while (0) entry = kvm_find_cpuid_entry(vcpu, 0x1); cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME)); cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME)); cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC)); cr4_fixed1_update(X86_CR4_DE, edx, feature_bit(DE)); cr4_fixed1_update(X86_CR4_PSE, edx, feature_bit(PSE)); cr4_fixed1_update(X86_CR4_PAE, edx, feature_bit(PAE)); cr4_fixed1_update(X86_CR4_MCE, edx, feature_bit(MCE)); cr4_fixed1_update(X86_CR4_PGE, edx, feature_bit(PGE)); cr4_fixed1_update(X86_CR4_OSFXSR, edx, feature_bit(FXSR)); cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM)); cr4_fixed1_update(X86_CR4_VMXE, ecx, feature_bit(VMX)); cr4_fixed1_update(X86_CR4_SMXE, ecx, feature_bit(SMX)); cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID)); cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE)); entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0); cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE)); cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP)); cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP)); cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU)); cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP)); cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57)); cr4_fixed1_update(X86_CR4_CET, ecx, feature_bit(SHSTK)); cr4_fixed1_update(X86_CR4_CET, edx, feature_bit(IBT)); entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 1); cr4_fixed1_update(X86_CR4_LAM_SUP, eax, feature_bit(LAM)); #undef cr4_fixed1_update } static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_cpuid_entry2 *best = NULL; int i; for (i = 0; i < PT_CPUID_LEAVES; i++) { best = kvm_find_cpuid_entry_index(vcpu, 0x14, i); if (!best) return; vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax; vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx; vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx; vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx; } /* Get the number of configurable Address Ranges for filtering */ vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_num_address_ranges); /* Initialize and clear the no dependency bits */ vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS | RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC | RTIT_CTL_BRANCH_EN); /* * If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set otherwise * will inject an #GP */ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering)) vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN; /* * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and * PSBFreq can be set */ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc)) vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC | RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ); /* * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set */ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc)) vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN | RTIT_CTL_MTC_RANGE); /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite)) vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW | RTIT_CTL_PTW_EN); /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace)) vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN; /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output)) vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA; /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys)) vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN; /* unmask address range configure area */ for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4)); } void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); /* * XSAVES is effectively enabled if and only if XSAVE is also exposed * to the guest. XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be * set if and only if XSAVE is supported. */ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE)) guest_cpu_cap_clear(vcpu, X86_FEATURE_XSAVES); vmx_setup_uret_msrs(vmx); if (cpu_has_secondary_exec_ctrls()) vmcs_set_secondary_exec_control(vmx, vmx_secondary_exec_control(vmx)); if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; else vmx->msr_ia32_feature_control_valid_bits &= ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX); if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) nested_vmx_cr_fixed1_bits_update(vcpu); if (boot_cpu_has(X86_FEATURE_INTEL_PT) && guest_cpu_cap_has(vcpu, X86_FEATURE_INTEL_PT)) update_intel_pt_cfg(vcpu); if (boot_cpu_has(X86_FEATURE_RTM)) { struct vmx_uret_msr *msr; msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); if (msr) { bool enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_RTM); vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE); } } set_cr4_guest_host_mask(vmx); vmx_write_encls_bitmap(vcpu, NULL); if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX)) vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED; else vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED; if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC)) vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_LC_ENABLED; else vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_LC_ENABLED; /* Refresh #PF interception to account for MAXPHYADDR changes. */ vmx_update_exception_bitmap(vcpu); } static __init u64 vmx_get_perf_capabilities(void) { u64 perf_cap = PERF_CAP_FW_WRITES; u64 host_perf_cap = 0; if (!enable_pmu) return 0; if (boot_cpu_has(X86_FEATURE_PDCM)) rdmsrq(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) { x86_perf_get_lbr(&vmx_lbr_caps); /* * KVM requires LBR callstack support, as the overhead due to * context switching LBRs without said support is too high. * See intel_pmu_create_guest_lbr_event() for more info. */ if (!vmx_lbr_caps.has_callstack) memset(&vmx_lbr_caps, 0, sizeof(vmx_lbr_caps)); else if (vmx_lbr_caps.nr) perf_cap |= host_perf_cap & PERF_CAP_LBR_FMT; } if (vmx_pebs_supported()) { perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK; /* * Disallow adaptive PEBS as it is functionally broken, can be * used by the guest to read *host* LBRs, and can be used to * bypass userspace event filters. To correctly and safely * support adaptive PEBS, KVM needs to: * * 1. Account for the ADAPTIVE flag when (re)programming fixed * counters. * * 2. Gain support from perf (or take direct control of counter * programming) to support events without adaptive PEBS * enabled for the hardware counter. * * 3. Ensure LBR MSRs cannot hold host data on VM-Entry with * adaptive PEBS enabled and MSR_PEBS_DATA_CFG.LBRS=1. * * 4. Document which PMU events are effectively exposed to the * guest via adaptive PEBS, and make adaptive PEBS mutually * exclusive with KVM_SET_PMU_EVENT_FILTER if necessary. */ perf_cap &= ~PERF_CAP_PEBS_BASELINE; } return perf_cap; } static __init void vmx_set_cpu_caps(void) { kvm_set_cpu_caps(); /* CPUID 0x1 */ if (nested) kvm_cpu_cap_set(X86_FEATURE_VMX); /* CPUID 0x7 */ if (kvm_mpx_supported()) kvm_cpu_cap_check_and_set(X86_FEATURE_MPX); if (!cpu_has_vmx_invpcid()) kvm_cpu_cap_clear(X86_FEATURE_INVPCID); if (vmx_pt_mode_is_host_guest()) kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); if (vmx_pebs_supported()) { kvm_cpu_cap_check_and_set(X86_FEATURE_DS); kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64); } if (!enable_pmu) kvm_cpu_cap_clear(X86_FEATURE_PDCM); kvm_caps.supported_perf_cap = vmx_get_perf_capabilities(); if (!enable_sgx) { kvm_cpu_cap_clear(X86_FEATURE_SGX); kvm_cpu_cap_clear(X86_FEATURE_SGX_LC); kvm_cpu_cap_clear(X86_FEATURE_SGX1); kvm_cpu_cap_clear(X86_FEATURE_SGX2); kvm_cpu_cap_clear(X86_FEATURE_SGX_EDECCSSA); } if (vmx_umip_emulated()) kvm_cpu_cap_set(X86_FEATURE_UMIP); /* CPUID 0xD.1 */ if (!cpu_has_vmx_xsaves()) kvm_cpu_cap_clear(X86_FEATURE_XSAVES); /* CPUID 0x80000001 and 0x7 (RDPID) */ if (!cpu_has_vmx_rdtscp()) { kvm_cpu_cap_clear(X86_FEATURE_RDTSCP); kvm_cpu_cap_clear(X86_FEATURE_RDPID); } if (cpu_has_vmx_waitpkg()) kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); /* * Disable CET if unrestricted_guest is unsupported as KVM doesn't * enforce CET HW behaviors in emulator. On platforms with * VMX_BASIC[bit56] == 0, inject #CP at VMX entry with error code * fails, so disable CET in this case too. */ if (!cpu_has_load_cet_ctrl() || !enable_unrestricted_guest || !cpu_has_vmx_basic_no_hw_errcode_cc()) { kvm_cpu_cap_clear(X86_FEATURE_SHSTK); kvm_cpu_cap_clear(X86_FEATURE_IBT); } } static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu, struct x86_instruction_info *info, unsigned long *exit_qualification) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); unsigned short port; int size; bool imm; /* * If the 'use IO bitmaps' VM-execution control is 0, IO instruction * VM-exits depend on the 'unconditional IO exiting' VM-execution * control. * * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps. */ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); if (info->intercept == x86_intercept_in || info->intercept == x86_intercept_ins) { port = info->src_val; size = info->dst_bytes; imm = info->src_type == OP_IMM; } else { port = info->dst_val; size = info->src_bytes; imm = info->dst_type == OP_IMM; } *exit_qualification = ((unsigned long)port << 16) | (size - 1); if (info->intercept == x86_intercept_ins || info->intercept == x86_intercept_outs) *exit_qualification |= BIT(4); if (info->rep_prefix) *exit_qualification |= BIT(5); if (imm) *exit_qualification |= BIT(6); return nested_vmx_check_io_bitmaps(vcpu, port, size); } int vmx_check_intercept(struct kvm_vcpu *vcpu, struct x86_instruction_info *info, enum x86_intercept_stage stage, struct x86_exception *exception) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); unsigned long exit_qualification = 0; u32 vm_exit_reason; u64 exit_insn_len; switch (info->intercept) { case x86_intercept_rdpid: /* * RDPID causes #UD if not enabled through secondary execution * controls (ENABLE_RDTSCP). Note, the implicit MSR access to * TSC_AUX is NOT subject to interception, i.e. checking only * the dedicated execution control is architecturally correct. */ if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) { exception->vector = UD_VECTOR; exception->error_code_valid = false; return X86EMUL_PROPAGATE_FAULT; } return X86EMUL_CONTINUE; case x86_intercept_in: case x86_intercept_ins: case x86_intercept_out: case x86_intercept_outs: if (!vmx_is_io_intercepted(vcpu, info, &exit_qualification)) return X86EMUL_CONTINUE; vm_exit_reason = EXIT_REASON_IO_INSTRUCTION; break; case x86_intercept_lgdt: case x86_intercept_lidt: case x86_intercept_lldt: case x86_intercept_ltr: case x86_intercept_sgdt: case x86_intercept_sidt: case x86_intercept_sldt: case x86_intercept_str: if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC)) return X86EMUL_CONTINUE; if (info->intercept == x86_intercept_lldt || info->intercept == x86_intercept_ltr || info->intercept == x86_intercept_sldt || info->intercept == x86_intercept_str) vm_exit_reason = EXIT_REASON_LDTR_TR; else vm_exit_reason = EXIT_REASON_GDTR_IDTR; /* * FIXME: Decode the ModR/M to generate the correct exit * qualification for memory operands. */ break; case x86_intercept_hlt: if (!nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING)) return X86EMUL_CONTINUE; vm_exit_reason = EXIT_REASON_HLT; break; case x86_intercept_pause: /* * PAUSE is a single-byte NOP with a REPE prefix, i.e. collides * with vanilla NOPs in the emulator. Apply the interception * check only to actual PAUSE instructions. Don't check * PAUSE-loop-exiting, software can't expect a given PAUSE to * exit, i.e. KVM is within its rights to allow L2 to execute * the PAUSE. */ if ((info->rep_prefix != REPE_PREFIX) || !nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING)) return X86EMUL_CONTINUE; vm_exit_reason = EXIT_REASON_PAUSE_INSTRUCTION; break; /* TODO: check more intercepts... */ default: return X86EMUL_UNHANDLEABLE; } exit_insn_len = abs_diff((s64)info->next_rip, (s64)info->rip); if (!exit_insn_len || exit_insn_len > X86_MAX_INSTRUCTION_LENGTH) return X86EMUL_UNHANDLEABLE; __nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification, exit_insn_len); return X86EMUL_INTERCEPTED; } #ifdef CONFIG_X86_64 /* (a << shift) / divisor, return 1 if overflow otherwise 0 */ static inline int u64_shl_div_u64(u64 a, unsigned int shift, u64 divisor, u64 *result) { u64 low = a << shift, high = a >> (64 - shift); /* To avoid the overflow on divq */ if (high >= divisor) return 1; /* Low hold the result, high hold rem which is discarded */ asm("divq %2\n\t" : "=a" (low), "=d" (high) : "rm" (divisor), "0" (low), "1" (high)); *result = low; return 0; } int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, bool *expired) { struct vcpu_vmx *vmx; u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles; struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer; vmx = to_vmx(vcpu); tscl = rdtsc(); guest_tscl = kvm_read_l1_tsc(vcpu, tscl); delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; lapic_timer_advance_cycles = nsec_to_cycles(vcpu, ktimer->timer_advance_ns); if (delta_tsc > lapic_timer_advance_cycles) delta_tsc -= lapic_timer_advance_cycles; else delta_tsc = 0; /* Convert to host delta tsc if tsc scaling is enabled */ if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio && delta_tsc && u64_shl_div_u64(delta_tsc, kvm_caps.tsc_scaling_ratio_frac_bits, vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc)) return -ERANGE; /* * If the delta tsc can't fit in the 32 bit after the multi shift, * we can't use the preemption timer. * It's possible that it fits on later vmentries, but checking * on every vmentry is costly so we just use an hrtimer. */ if (delta_tsc >> (cpu_preemption_timer_multi + 32)) return -ERANGE; vmx->hv_deadline_tsc = tscl + delta_tsc; *expired = !delta_tsc; return 0; } void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) { to_vmx(vcpu)->hv_deadline_tsc = -1; } #endif void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); if (WARN_ON_ONCE(!enable_pml)) return; if (is_guest_mode(vcpu)) { vmx->nested.update_vmcs01_cpu_dirty_logging = true; return; } /* * Note, nr_memslots_dirty_logging can be changed concurrent with this * code, but in that case another update request will be made and so * the guest will never run with a stale PML value. */ if (atomic_read(&vcpu->kvm->nr_memslots_dirty_logging)) secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML); else secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); } void vmx_setup_mce(struct kvm_vcpu *vcpu) { if (vcpu->arch.mcg_cap & MCG_LMCE_P) to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= FEAT_CTL_LMCE_ENABLED; else to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_LMCE_ENABLED; } #ifdef CONFIG_KVM_SMM int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) { /* we need a nested vmexit to enter SMM, postpone if run is pending */ if (to_vmx(vcpu)->nested.nested_run_pending) return -EBUSY; return !is_smm(vcpu); } int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) { struct vcpu_vmx *vmx = to_vmx(vcpu); /* * TODO: Implement custom flows for forcing the vCPU out/in of L2 on * SMI and RSM. Using the common VM-Exit + VM-Enter routines is wrong * SMI and RSM only modify state that is saved and restored via SMRAM. * E.g. most MSRs are left untouched, but many are modified by VM-Exit * and VM-Enter, and thus L2's values may be corrupted on SMI+RSM. */ vmx->nested.smm.guest_mode = is_guest_mode(vcpu); if (vmx->nested.smm.guest_mode) nested_vmx_vmexit(vcpu, -1, 0, 0); vmx->nested.smm.vmxon = vmx->nested.vmxon; vmx->nested.vmxon = false; vmx_clear_hlt(vcpu); return 0; } int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) { struct vcpu_vmx *vmx = to_vmx(vcpu); int ret; if (vmx->nested.smm.vmxon) { vmx->nested.vmxon = true; vmx->nested.smm.vmxon = false; } if (vmx->nested.smm.guest_mode) { ret = nested_vmx_enter_non_root_mode(vcpu, false); if (ret) return ret; vmx->nested.nested_run_pending = 1; vmx->nested.smm.guest_mode = false; } return 0; } void vmx_enable_smi_window(struct kvm_vcpu *vcpu) { /* RSM will cause a vmexit anyway. */ } #endif bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) { return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu); } void vmx_migrate_timers(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu)) { struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer; if (hrtimer_try_to_cancel(timer) == 1) hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); } } void vmx_hardware_unsetup(void) { kvm_set_posted_intr_wakeup_handler(NULL); if (nested) nested_vmx_hardware_unsetup(); free_kvm_area(); } void vmx_vm_destroy(struct kvm *kvm) { struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm)); } /* * Note, the SDM states that the linear address is masked *after* the modified * canonicality check, whereas KVM masks (untags) the address and then performs * a "normal" canonicality check. Functionally, the two methods are identical, * and when the masking occurs relative to the canonicality check isn't visible * to software, i.e. KVM's behavior doesn't violate the SDM. */ gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags) { int lam_bit; unsigned long cr3_bits; if (flags & (X86EMUL_F_FETCH | X86EMUL_F_IMPLICIT | X86EMUL_F_INVLPG)) return gva; if (!is_64_bit_mode(vcpu)) return gva; /* * Bit 63 determines if the address should be treated as user address * or a supervisor address. */ if (!(gva & BIT_ULL(63))) { cr3_bits = kvm_get_active_cr3_lam_bits(vcpu); if (!(cr3_bits & (X86_CR3_LAM_U57 | X86_CR3_LAM_U48))) return gva; /* LAM_U48 is ignored if LAM_U57 is set. */ lam_bit = cr3_bits & X86_CR3_LAM_U57 ? 56 : 47; } else { if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_LAM_SUP)) return gva; lam_bit = kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 56 : 47; } /* * Untag the address by sign-extending the lam_bit, but NOT to bit 63. * Bit 63 is retained from the raw virtual address so that untagging * doesn't change a user access to a supervisor access, and vice versa. */ return (sign_extend64(gva, lam_bit) & ~BIT_ULL(63)) | (gva & BIT_ULL(63)); } static unsigned int vmx_handle_intel_pt_intr(void) { struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); /* '0' on failure so that the !PT case can use a RET0 static call. */ if (!vcpu || !kvm_handling_nmi_from_guest(vcpu)) return 0; kvm_make_request(KVM_REQ_PMI, vcpu); __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT, (unsigned long *)&vcpu->arch.pmu.global_status); return 1; } static __init void vmx_setup_user_return_msrs(void) { /* * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm * will emulate SYSCALL in legacy mode if the vendor string in guest * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To * support this emulation, MSR_STAR is included in the list for i386, * but is never loaded into hardware. MSR_CSTAR is also never loaded * into hardware and is here purely for emulation purposes. */ const u32 vmx_uret_msrs_list[] = { #ifdef CONFIG_X86_64 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, #endif MSR_EFER, MSR_TSC_AUX, MSR_STAR, MSR_IA32_TSX_CTRL, }; int i; BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS); for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) kvm_add_user_return_msr(vmx_uret_msrs_list[i]); } static void __init vmx_setup_me_spte_mask(void) { u64 me_mask = 0; /* * On pre-MKTME system, boot_cpu_data.x86_phys_bits equals to * kvm_host.maxphyaddr. On MKTME and/or TDX capable systems, * boot_cpu_data.x86_phys_bits holds the actual physical address * w/o the KeyID bits, and kvm_host.maxphyaddr equals to * MAXPHYADDR reported by CPUID. Those bits between are KeyID bits. */ if (boot_cpu_data.x86_phys_bits != kvm_host.maxphyaddr) me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits, kvm_host.maxphyaddr - 1); /* * Unlike SME, host kernel doesn't support setting up any * MKTME KeyID on Intel platforms. No memory encryption * bits should be included into the SPTE. */ kvm_mmu_set_me_spte_mask(0, me_mask); } __init int vmx_hardware_setup(void) { unsigned long host_bndcfgs; struct desc_ptr dt; int r; store_idt(&dt); host_idt_base = dt.address; vmx_setup_user_return_msrs(); if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); if (boot_cpu_has(X86_FEATURE_MPX)) { rdmsrq(MSR_IA32_BNDCFGS, host_bndcfgs); WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost"); } if (!cpu_has_vmx_mpx()) kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) enable_vpid = 0; if (!cpu_has_vmx_ept() || !cpu_has_vmx_ept_4levels() || !cpu_has_vmx_ept_mt_wb() || !cpu_has_vmx_invept_global()) enable_ept = 0; /* NX support is required for shadow paging. */ if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) { pr_err_ratelimited("NX (Execute Disable) not supported\n"); return -EOPNOTSUPP; } /* * Shadow paging doesn't have a (further) performance penalty * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it * by default */ if (!enable_ept) allow_smaller_maxphyaddr = true; if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) enable_ept_ad_bits = 0; if (!cpu_has_vmx_unrestricted_guest() || !enable_ept) enable_unrestricted_guest = 0; if (!cpu_has_vmx_flexpriority()) flexpriority_enabled = 0; if (!cpu_has_virtual_nmis()) enable_vnmi = 0; #ifdef CONFIG_X86_SGX_KVM if (!cpu_has_vmx_encls_vmexit()) enable_sgx = false; #endif /* * set_apic_access_page_addr() is used to reload apic access * page upon invalidation. No need to do anything if not * using the APIC_ACCESS_ADDR VMCS field. */ if (!flexpriority_enabled) vt_x86_ops.set_apic_access_page_addr = NULL; if (!cpu_has_vmx_tpr_shadow()) vt_x86_ops.update_cr8_intercept = NULL; #if IS_ENABLED(CONFIG_HYPERV) if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH && enable_ept) { vt_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs; vt_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range; } #endif if (!cpu_has_vmx_ple()) { ple_gap = 0; ple_window = 0; ple_window_grow = 0; ple_window_max = 0; ple_window_shrink = 0; } if (!cpu_has_vmx_apicv()) enable_apicv = 0; if (!enable_apicv) vt_x86_ops.sync_pir_to_irr = NULL; if (!enable_apicv || !cpu_has_vmx_ipiv()) enable_ipiv = false; if (cpu_has_vmx_tsc_scaling()) kvm_caps.has_tsc_control = true; kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; kvm_caps.tsc_scaling_ratio_frac_bits = 48; kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection(); kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit(); set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ if (enable_ept) kvm_mmu_set_ept_masks(enable_ept_ad_bits, cpu_has_vmx_ept_execute_only()); else vt_x86_ops.get_mt_mask = NULL; /* * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID * bits to shadow_zero_check. */ vmx_setup_me_spte_mask(); kvm_configure_mmu(enable_ept, 0, vmx_get_max_ept_level(), ept_caps_to_lpage_level(vmx_capability.ept)); /* * Only enable PML when hardware supports PML feature, and both EPT * and EPT A/D bit features are enabled -- PML depends on them to work. */ if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) enable_pml = 0; if (!cpu_has_vmx_preemption_timer()) enable_preemption_timer = false; if (enable_preemption_timer) { u64 use_timer_freq = 5000ULL * 1000 * 1000; cpu_preemption_timer_multi = vmx_misc_preemption_timer_rate(vmcs_config.misc); if (tsc_khz) use_timer_freq = (u64)tsc_khz * 1000; use_timer_freq >>= cpu_preemption_timer_multi; /* * KVM "disables" the preemption timer by setting it to its max * value. Don't use the timer if it might cause spurious exits * at a rate faster than 0.1 Hz (of uninterrupted guest time). */ if (use_timer_freq > 0xffffffffu / 10) enable_preemption_timer = false; } if (!enable_preemption_timer) { vt_x86_ops.set_hv_timer = NULL; vt_x86_ops.cancel_hv_timer = NULL; } kvm_caps.supported_mce_cap |= MCG_LMCE_P; kvm_caps.supported_mce_cap |= MCG_CMCI_P; if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST) return -EINVAL; if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt()) pt_mode = PT_MODE_SYSTEM; if (pt_mode == PT_MODE_HOST_GUEST) vt_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr; else vt_init_ops.handle_intel_pt_intr = NULL; setup_default_sgx_lepubkeyhash(); vmx_set_cpu_caps(); /* * Configure nested capabilities after core CPU capabilities so that * nested support can be conditional on base support, e.g. so that KVM * can hide/show features based on kvm_cpu_cap_has(). */ if (nested) { nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept); r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers); if (r) return r; } r = alloc_kvm_area(); if (r && nested) nested_vmx_hardware_unsetup(); kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); /* * On Intel CPUs that lack self-snoop feature, letting the guest control * memory types may result in unexpected behavior. So always ignore guest * PAT on those CPUs and map VM as writeback, not allowing userspace to * disable the quirk. * * On certain Intel CPUs (e.g. SPR, ICX), though self-snoop feature is * supported, UC is slow enough to cause issues with some older guests (e.g. * an old version of bochs driver uses ioremap() instead of ioremap_wc() to * map the video RAM, causing wayland desktop to fail to get started * correctly). To avoid breaking those older guests that rely on KVM to force * memory type to WB, provide KVM_X86_QUIRK_IGNORE_GUEST_PAT to preserve the * safer (for performance) default behavior. * * On top of this, non-coherent DMA devices need the guest to flush CPU * caches properly. This also requires honoring guest PAT, and is forced * independent of the quirk in vmx_ignore_guest_pat(). */ if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; return r; } static void vmx_cleanup_l1d_flush(void) { if (vmx_l1d_flush_pages) { free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER); vmx_l1d_flush_pages = NULL; } /* Restore state so sysfs ignores VMX */ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; } void vmx_exit(void) { allow_smaller_maxphyaddr = false; vmx_cleanup_l1d_flush(); kvm_x86_vendor_exit(); } int __init vmx_init(void) { int r, cpu; KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_vmx); if (!kvm_is_vmx_supported()) return -EOPNOTSUPP; /* * Note, VMCS and eVMCS configuration only touch VMX knobs/variables, * i.e. there's nothing to unwind if a later step fails. */ hv_init_evmcs(); /* * Parse the VMCS config and VMX capabilities before anything else, so * that the information is available to all setup flows. */ if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) return -EIO; r = kvm_x86_vendor_init(&vt_init_ops); if (r) return r; /* * Must be called after common x86 init so enable_ept is properly set * up. Hand the parameter mitigation value in which was stored in * the pre module init parser. If no parameter was given, it will * contain 'auto' which will be turned into the default 'cond' * mitigation mode. */ r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); if (r) goto err_l1d_flush; for_each_possible_cpu(cpu) { INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); pi_init_cpu(cpu); } vmx_check_vmcs12_offsets(); return 0; err_l1d_flush: kvm_x86_vendor_exit(); return r; } |
| 55 81 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 | // SPDX-License-Identifier: GPL-2.0-only /* * This file provides /sys/class/ieee80211/<wiphy name>/ * and some default attributes. * * Copyright 2005-2006 Jiri Benc <jbenc@suse.cz> * Copyright 2006 Johannes Berg <johannes@sipsolutions.net> * Copyright (C) 2020-2021, 2023-2024 Intel Corporation */ #include <linux/device.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/nl80211.h> #include <linux/rtnetlink.h> #include <net/cfg80211.h> #include "sysfs.h" #include "core.h" #include "rdev-ops.h" static inline struct cfg80211_registered_device *dev_to_rdev( struct device *dev) { return container_of(dev, struct cfg80211_registered_device, wiphy.dev); } #define SHOW_FMT(name, fmt, member) \ static ssize_t name ## _show(struct device *dev, \ struct device_attribute *attr, \ char *buf) \ { \ return sprintf(buf, fmt "\n", dev_to_rdev(dev)->member); \ } \ static DEVICE_ATTR_RO(name) SHOW_FMT(index, "%d", wiphy_idx); SHOW_FMT(macaddress, "%pM", wiphy.perm_addr); SHOW_FMT(address_mask, "%pM", wiphy.addr_mask); static ssize_t name_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wiphy *wiphy = &dev_to_rdev(dev)->wiphy; return sprintf(buf, "%s\n", wiphy_name(wiphy)); } static DEVICE_ATTR_RO(name); static ssize_t addresses_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wiphy *wiphy = &dev_to_rdev(dev)->wiphy; char *start = buf; int i; if (!wiphy->addresses) return sprintf(buf, "%pM\n", wiphy->perm_addr); for (i = 0; i < wiphy->n_addresses; i++) buf += sprintf(buf, "%pM\n", wiphy->addresses[i].addr); return buf - start; } static DEVICE_ATTR_RO(addresses); static struct attribute *ieee80211_attrs[] = { &dev_attr_index.attr, &dev_attr_macaddress.attr, &dev_attr_address_mask.attr, &dev_attr_addresses.attr, &dev_attr_name.attr, NULL, }; ATTRIBUTE_GROUPS(ieee80211); static void wiphy_dev_release(struct device *dev) { struct cfg80211_registered_device *rdev = dev_to_rdev(dev); cfg80211_dev_free(rdev); } #ifdef CONFIG_PM_SLEEP static void cfg80211_leave_all(struct cfg80211_registered_device *rdev) { struct wireless_dev *wdev; list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) cfg80211_leave(rdev, wdev); } static int wiphy_suspend(struct device *dev) { struct cfg80211_registered_device *rdev = dev_to_rdev(dev); int ret = 0; rdev->suspend_at = ktime_get_boottime_seconds(); rtnl_lock(); wiphy_lock(&rdev->wiphy); if (rdev->wiphy.registered) { if (!rdev->wiphy.wowlan_config) { cfg80211_leave_all(rdev); cfg80211_process_rdev_events(rdev); } cfg80211_process_wiphy_works(rdev, NULL); if (rdev->ops->suspend) ret = rdev_suspend(rdev, rdev->wiphy.wowlan_config); if (ret == 1) { /* Driver refuse to configure wowlan */ cfg80211_leave_all(rdev); cfg80211_process_rdev_events(rdev); cfg80211_process_wiphy_works(rdev, NULL); ret = rdev_suspend(rdev, NULL); } if (ret == 0) rdev->suspended = true; } wiphy_unlock(&rdev->wiphy); rtnl_unlock(); return ret; } static int wiphy_resume(struct device *dev) { struct cfg80211_registered_device *rdev = dev_to_rdev(dev); int ret = 0; /* Age scan results with time spent in suspend */ cfg80211_bss_age(rdev, ktime_get_boottime_seconds() - rdev->suspend_at); rtnl_lock(); wiphy_lock(&rdev->wiphy); if (rdev->wiphy.registered && rdev->ops->resume) ret = rdev_resume(rdev); rdev->suspended = false; queue_work(system_unbound_wq, &rdev->wiphy_work); wiphy_unlock(&rdev->wiphy); if (ret) cfg80211_shutdown_all_interfaces(&rdev->wiphy); rtnl_unlock(); return ret; } static SIMPLE_DEV_PM_OPS(wiphy_pm_ops, wiphy_suspend, wiphy_resume); #define WIPHY_PM_OPS (&wiphy_pm_ops) #else #define WIPHY_PM_OPS NULL #endif static const void *wiphy_namespace(const struct device *d) { struct wiphy *wiphy = container_of(d, struct wiphy, dev); return wiphy_net(wiphy); } struct class ieee80211_class = { .name = "ieee80211", .dev_release = wiphy_dev_release, .dev_groups = ieee80211_groups, .pm = WIPHY_PM_OPS, .ns_type = &net_ns_type_operations, .namespace = wiphy_namespace, }; int wiphy_sysfs_init(void) { return class_register(&ieee80211_class); } void wiphy_sysfs_exit(void) { class_unregister(&ieee80211_class); } |
| 2 2 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 | // SPDX-License-Identifier: GPL-2.0-only /* * xt_u32 - kernel module to match u32 packet content * * Original author: Don Cohen <don@isis.cs3-inc.com> * (C) CC Computer Consultants GmbH, 2007 */ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/spinlock.h> #include <linux/skbuff.h> #include <linux/types.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_u32.h> static bool u32_match_it(const struct xt_u32 *data, const struct sk_buff *skb) { const struct xt_u32_test *ct; unsigned int testind; unsigned int nnums; unsigned int nvals; unsigned int i; __be32 n; u_int32_t pos; u_int32_t val; u_int32_t at; /* * Small example: "0 >> 28 == 4 && 8 & 0xFF0000 >> 16 = 6, 17" * (=IPv4 and (TCP or UDP)). Outer loop runs over the "&&" operands. */ for (testind = 0; testind < data->ntests; ++testind) { ct = &data->tests[testind]; at = 0; pos = ct->location[0].number; if (skb->len < 4 || pos > skb->len - 4) return false; if (skb_copy_bits(skb, pos, &n, sizeof(n)) < 0) BUG(); val = ntohl(n); nnums = ct->nnums; /* Inner loop runs over "&", "<<", ">>" and "@" operands */ for (i = 1; i < nnums; ++i) { u_int32_t number = ct->location[i].number; switch (ct->location[i].nextop) { case XT_U32_AND: val &= number; break; case XT_U32_LEFTSH: val <<= number; break; case XT_U32_RIGHTSH: val >>= number; break; case XT_U32_AT: if (at + val < at) return false; at += val; pos = number; if (at + 4 < at || skb->len < at + 4 || pos > skb->len - at - 4) return false; if (skb_copy_bits(skb, at + pos, &n, sizeof(n)) < 0) BUG(); val = ntohl(n); break; } } /* Run over the "," and ":" operands */ nvals = ct->nvalues; for (i = 0; i < nvals; ++i) if (ct->value[i].min <= val && val <= ct->value[i].max) break; if (i >= ct->nvalues) return false; } return true; } static bool u32_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_u32 *data = par->matchinfo; bool ret; ret = u32_match_it(data, skb); return ret ^ data->invert; } static int u32_mt_checkentry(const struct xt_mtchk_param *par) { const struct xt_u32 *data = par->matchinfo; const struct xt_u32_test *ct; unsigned int i; if (data->ntests > ARRAY_SIZE(data->tests)) return -EINVAL; for (i = 0; i < data->ntests; ++i) { ct = &data->tests[i]; if (ct->nnums > ARRAY_SIZE(ct->location) || ct->nvalues > ARRAY_SIZE(ct->value)) return -EINVAL; } return 0; } static struct xt_match xt_u32_mt_reg __read_mostly = { .name = "u32", .revision = 0, .family = NFPROTO_UNSPEC, .match = u32_mt, .checkentry = u32_mt_checkentry, .matchsize = sizeof(struct xt_u32), .me = THIS_MODULE, }; static int __init u32_mt_init(void) { return xt_register_match(&xt_u32_mt_reg); } static void __exit u32_mt_exit(void) { xt_unregister_match(&xt_u32_mt_reg); } module_init(u32_mt_init); module_exit(u32_mt_exit); MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); MODULE_DESCRIPTION("Xtables: arbitrary byte matching"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_u32"); MODULE_ALIAS("ip6t_u32"); |
| 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 | // SPDX-License-Identifier: GPL-2.0-only /* * Linux driver for digital TV devices equipped with B2C2 FlexcopII(b)/III * flexcop-usb.c - covers the USB part * see flexcop.c for copyright information */ #define FC_LOG_PREFIX "flexcop_usb" #include "flexcop-usb.h" #include "flexcop-common.h" /* Version information */ #define DRIVER_VERSION "0.1" #define DRIVER_NAME "Technisat/B2C2 FlexCop II/IIb/III Digital TV USB Driver" #define DRIVER_AUTHOR "Patrick Boettcher <patrick.boettcher@posteo.de>" /* debug */ #ifdef CONFIG_DVB_B2C2_FLEXCOP_DEBUG #define dprintk(level, args...) \ do { if ((debug & (level))) printk(args); } while (0) #define debug_dump(b, l, method) do {\ int i; \ for (i = 0; i < l; i++) \ method("%02x ", b[i]); \ method("\n"); \ } while (0) #define DEBSTATUS "" #else #define dprintk(level, args...) no_printk(args) #define debug_dump(b, l, method) do { } while (0) #define DEBSTATUS " (debugging is not enabled)" #endif static int debug; module_param(debug, int, 0644); MODULE_PARM_DESC(debug, "set debugging level (1=info,ts=2,ctrl=4,i2c=8,v8mem=16 (or-able))." DEBSTATUS); #undef DEBSTATUS #define deb_info(args...) dprintk(0x01, args) #define deb_ts(args...) dprintk(0x02, args) #define deb_ctrl(args...) dprintk(0x04, args) #define deb_i2c(args...) dprintk(0x08, args) #define deb_v8(args...) dprintk(0x10, args) /* JLP 111700: we will include the 1 bit gap between the upper and lower 3 bits * in the IBI address, to make the V8 code simpler. * PCI ADDRESS FORMAT: 0x71C -> 0000 0111 0001 1100 (the six bits used) * in general: 0000 0HHH 000L LL00 * IBI ADDRESS FORMAT: RHHH BLLL * * where R is the read(1)/write(0) bit, B is the busy bit * and HHH and LLL are the two sets of three bits from the PCI address. */ #define B2C2_FLEX_PCIOFFSET_TO_INTERNALADDR(usPCI) (u8) \ (((usPCI >> 2) & 0x07) + ((usPCI >> 4) & 0x70)) #define B2C2_FLEX_INTERNALADDR_TO_PCIOFFSET(ucAddr) (u16) \ (((ucAddr & 0x07) << 2) + ((ucAddr & 0x70) << 4)) /* * DKT 020228 * - forget about this VENDOR_BUFFER_SIZE, read and write register * deal with DWORD or 4 bytes, that should be should from now on * - from now on, we don't support anything older than firm 1.00 * I eliminated the write register as a 2 trip of writing hi word and lo word * and force this to write only 4 bytes at a time. * NOTE: this should work with all the firmware from 1.00 and newer */ static int flexcop_usb_readwrite_dw(struct flexcop_device *fc, u16 wRegOffsPCI, u32 *val, u8 read) { struct flexcop_usb *fc_usb = fc->bus_specific; u8 request = read ? B2C2_USB_READ_REG : B2C2_USB_WRITE_REG; u8 request_type = (read ? USB_DIR_IN : USB_DIR_OUT) | USB_TYPE_VENDOR; u8 wAddress = B2C2_FLEX_PCIOFFSET_TO_INTERNALADDR(wRegOffsPCI) | (read ? 0x80 : 0); int ret; mutex_lock(&fc_usb->data_mutex); if (!read) memcpy(fc_usb->data, val, sizeof(*val)); ret = usb_control_msg(fc_usb->udev, read ? B2C2_USB_CTRL_PIPE_IN : B2C2_USB_CTRL_PIPE_OUT, request, request_type, /* 0xc0 read or 0x40 write */ wAddress, 0, fc_usb->data, sizeof(u32), B2C2_WAIT_FOR_OPERATION_RDW); if (ret != sizeof(u32)) { err("error while %s dword from %d (%d).", read ? "reading" : "writing", wAddress, wRegOffsPCI); if (ret >= 0) ret = -EIO; } if (read && ret >= 0) memcpy(val, fc_usb->data, sizeof(*val)); mutex_unlock(&fc_usb->data_mutex); return ret; } /* * DKT 010817 - add support for V8 memory read/write and flash update */ static int flexcop_usb_v8_memory_req(struct flexcop_usb *fc_usb, flexcop_usb_request_t req, u8 page, u16 wAddress, u8 *pbBuffer, u32 buflen) { u8 request_type = USB_TYPE_VENDOR; u16 wIndex; int nWaitTime, pipe, ret; wIndex = page << 8; if (buflen > sizeof(fc_usb->data)) { err("Buffer size bigger than max URB control message\n"); return -EIO; } switch (req) { case B2C2_USB_READ_V8_MEM: nWaitTime = B2C2_WAIT_FOR_OPERATION_V8READ; request_type |= USB_DIR_IN; pipe = B2C2_USB_CTRL_PIPE_IN; break; case B2C2_USB_WRITE_V8_MEM: wIndex |= pbBuffer[0]; request_type |= USB_DIR_OUT; nWaitTime = B2C2_WAIT_FOR_OPERATION_V8WRITE; pipe = B2C2_USB_CTRL_PIPE_OUT; break; case B2C2_USB_FLASH_BLOCK: request_type |= USB_DIR_OUT; nWaitTime = B2C2_WAIT_FOR_OPERATION_V8FLASH; pipe = B2C2_USB_CTRL_PIPE_OUT; break; default: deb_info("unsupported request for v8_mem_req %x.\n", req); return -EINVAL; } deb_v8("v8mem: %02x %02x %04x %04x, len: %d\n", request_type, req, wAddress, wIndex, buflen); mutex_lock(&fc_usb->data_mutex); if ((request_type & USB_ENDPOINT_DIR_MASK) == USB_DIR_OUT) memcpy(fc_usb->data, pbBuffer, buflen); ret = usb_control_msg(fc_usb->udev, pipe, req, request_type, wAddress, wIndex, fc_usb->data, buflen, nWaitTime); if (ret != buflen) ret = -EIO; if (ret >= 0) { ret = 0; if ((request_type & USB_ENDPOINT_DIR_MASK) == USB_DIR_IN) memcpy(pbBuffer, fc_usb->data, buflen); } mutex_unlock(&fc_usb->data_mutex); debug_dump(pbBuffer, ret, deb_v8); return ret; } #define bytes_left_to_read_on_page(paddr, buflen) \ ((V8_MEMORY_PAGE_SIZE - (paddr & V8_MEMORY_PAGE_MASK)) > buflen \ ? buflen : (V8_MEMORY_PAGE_SIZE - (paddr & V8_MEMORY_PAGE_MASK))) static int flexcop_usb_memory_req(struct flexcop_usb *fc_usb, flexcop_usb_request_t req, flexcop_usb_mem_page_t page_start, u32 addr, int extended, u8 *buf, u32 len) { int ret = 0; u16 wMax; u32 pagechunk = 0; switch (req) { case B2C2_USB_READ_V8_MEM: wMax = USB_MEM_READ_MAX; break; case B2C2_USB_WRITE_V8_MEM: wMax = USB_MEM_WRITE_MAX; break; case B2C2_USB_FLASH_BLOCK: wMax = USB_FLASH_MAX; break; default: return -EINVAL; } while (len) { pagechunk = min(wMax, bytes_left_to_read_on_page(addr, len)); deb_info("%x\n", (addr & V8_MEMORY_PAGE_MASK) | (V8_MEMORY_EXTENDED*extended)); ret = flexcop_usb_v8_memory_req(fc_usb, req, page_start + (addr / V8_MEMORY_PAGE_SIZE), (addr & V8_MEMORY_PAGE_MASK) | (V8_MEMORY_EXTENDED*extended), buf, pagechunk); if (ret < 0) return ret; addr += pagechunk; buf += pagechunk; len -= pagechunk; } return 0; } static int flexcop_usb_get_mac_addr(struct flexcop_device *fc, int extended) { return flexcop_usb_memory_req(fc->bus_specific, B2C2_USB_READ_V8_MEM, V8_MEMORY_PAGE_FLASH, 0x1f010, 1, fc->dvb_adapter.proposed_mac, 6); } /* usb i2c stuff */ static int flexcop_usb_i2c_req(struct flexcop_i2c_adapter *i2c, flexcop_usb_request_t req, flexcop_usb_i2c_function_t func, u8 chipaddr, u8 addr, u8 *buf, u8 buflen) { struct flexcop_usb *fc_usb = i2c->fc->bus_specific; u16 wValue, wIndex; int nWaitTime, pipe, ret; u8 request_type = USB_TYPE_VENDOR; if (buflen > sizeof(fc_usb->data)) { err("Buffer size bigger than max URB control message\n"); return -EIO; } switch (func) { case USB_FUNC_I2C_WRITE: case USB_FUNC_I2C_MULTIWRITE: case USB_FUNC_I2C_REPEATWRITE: /* DKT 020208 - add this to support special case of DiSEqC */ case USB_FUNC_I2C_CHECKWRITE: pipe = B2C2_USB_CTRL_PIPE_OUT; nWaitTime = 2000; request_type |= USB_DIR_OUT; break; case USB_FUNC_I2C_READ: case USB_FUNC_I2C_REPEATREAD: pipe = B2C2_USB_CTRL_PIPE_IN; nWaitTime = 2000; request_type |= USB_DIR_IN; break; default: deb_info("unsupported function for i2c_req %x\n", func); return -EINVAL; } wValue = (func << 8) | (i2c->port << 4); wIndex = (chipaddr << 8 ) | addr; deb_i2c("i2c %2d: %02x %02x %02x %02x %02x %02x\n", func, request_type, req, wValue & 0xff, wValue >> 8, wIndex & 0xff, wIndex >> 8); mutex_lock(&fc_usb->data_mutex); if ((request_type & USB_ENDPOINT_DIR_MASK) == USB_DIR_OUT) memcpy(fc_usb->data, buf, buflen); ret = usb_control_msg(fc_usb->udev, pipe, req, request_type, wValue, wIndex, fc_usb->data, buflen, nWaitTime); if (ret != buflen) ret = -EIO; if (ret >= 0) { ret = 0; if ((request_type & USB_ENDPOINT_DIR_MASK) == USB_DIR_IN) memcpy(buf, fc_usb->data, buflen); } mutex_unlock(&fc_usb->data_mutex); return ret; } /* actual bus specific access functions, make sure prototype are/will be equal to pci */ static flexcop_ibi_value flexcop_usb_read_ibi_reg(struct flexcop_device *fc, flexcop_ibi_register reg) { flexcop_ibi_value val; val.raw = 0; flexcop_usb_readwrite_dw(fc, reg, &val.raw, 1); return val; } static int flexcop_usb_write_ibi_reg(struct flexcop_device *fc, flexcop_ibi_register reg, flexcop_ibi_value val) { return flexcop_usb_readwrite_dw(fc, reg, &val.raw, 0); } static int flexcop_usb_i2c_request(struct flexcop_i2c_adapter *i2c, flexcop_access_op_t op, u8 chipaddr, u8 addr, u8 *buf, u16 len) { if (op == FC_READ) return flexcop_usb_i2c_req(i2c, B2C2_USB_I2C_REQUEST, USB_FUNC_I2C_READ, chipaddr, addr, buf, len); else return flexcop_usb_i2c_req(i2c, B2C2_USB_I2C_REQUEST, USB_FUNC_I2C_WRITE, chipaddr, addr, buf, len); } static void flexcop_usb_process_frame(struct flexcop_usb *fc_usb, u8 *buffer, int buffer_length) { u8 *b; int l; deb_ts("tmp_buffer_length=%d, buffer_length=%d\n", fc_usb->tmp_buffer_length, buffer_length); if (fc_usb->tmp_buffer_length > 0) { memcpy(fc_usb->tmp_buffer+fc_usb->tmp_buffer_length, buffer, buffer_length); fc_usb->tmp_buffer_length += buffer_length; b = fc_usb->tmp_buffer; l = fc_usb->tmp_buffer_length; } else { b = buffer; l = buffer_length; } while (l >= 190) { if (*b == 0xff) { switch (*(b+1) & 0x03) { case 0x01: /* media packet */ if (*(b+2) == 0x47) flexcop_pass_dmx_packets( fc_usb->fc_dev, b+2, 1); else deb_ts("not ts packet %*ph\n", 4, b+2); b += 190; l -= 190; break; default: deb_ts("wrong packet type\n"); l = 0; break; } } else { deb_ts("wrong header\n"); l = 0; } } if (l > 0) memcpy(fc_usb->tmp_buffer, b, l); fc_usb->tmp_buffer_length = l; } static void flexcop_usb_urb_complete(struct urb *urb) { struct flexcop_usb *fc_usb = urb->context; int i; if (urb->actual_length > 0) deb_ts("urb completed, bufsize: %d actlen; %d\n", urb->transfer_buffer_length, urb->actual_length); for (i = 0; i < urb->number_of_packets; i++) { if (urb->iso_frame_desc[i].status < 0) { err("iso frame descriptor %d has an error: %d\n", i, urb->iso_frame_desc[i].status); } else if (urb->iso_frame_desc[i].actual_length > 0) { deb_ts("passed %d bytes to the demux\n", urb->iso_frame_desc[i].actual_length); flexcop_usb_process_frame(fc_usb, urb->transfer_buffer + urb->iso_frame_desc[i].offset, urb->iso_frame_desc[i].actual_length); } urb->iso_frame_desc[i].status = 0; urb->iso_frame_desc[i].actual_length = 0; } usb_submit_urb(urb, GFP_ATOMIC); } static int flexcop_usb_stream_control(struct flexcop_device *fc, int onoff) { /* submit/kill iso packets */ return 0; } static void flexcop_usb_transfer_exit(struct flexcop_usb *fc_usb) { int i; for (i = 0; i < B2C2_USB_NUM_ISO_URB; i++) if (fc_usb->iso_urb[i] != NULL) { deb_ts("unlinking/killing urb no. %d\n", i); usb_kill_urb(fc_usb->iso_urb[i]); usb_free_urb(fc_usb->iso_urb[i]); } usb_free_coherent(fc_usb->udev, fc_usb->buffer_size, fc_usb->iso_buffer, fc_usb->dma_addr); } static int flexcop_usb_transfer_init(struct flexcop_usb *fc_usb) { struct usb_host_interface *alt = fc_usb->uintf->cur_altsetting; u16 frame_size; int bufsize, i, j, ret; int buffer_offset = 0; frame_size = usb_endpoint_maxp(&alt->endpoint[0].desc); bufsize = B2C2_USB_NUM_ISO_URB * B2C2_USB_FRAMES_PER_ISO * frame_size; deb_ts("creating %d iso-urbs with %d frames each of %d bytes size = %d.\n", B2C2_USB_NUM_ISO_URB, B2C2_USB_FRAMES_PER_ISO, frame_size, bufsize); fc_usb->iso_buffer = usb_alloc_coherent(fc_usb->udev, bufsize, GFP_KERNEL, &fc_usb->dma_addr); if (fc_usb->iso_buffer == NULL) return -ENOMEM; memset(fc_usb->iso_buffer, 0, bufsize); fc_usb->buffer_size = bufsize; /* creating iso urbs */ for (i = 0; i < B2C2_USB_NUM_ISO_URB; i++) { fc_usb->iso_urb[i] = usb_alloc_urb(B2C2_USB_FRAMES_PER_ISO, GFP_KERNEL); if (fc_usb->iso_urb[i] == NULL) { ret = -ENOMEM; goto urb_error; } } /* initialising and submitting iso urbs */ for (i = 0; i < B2C2_USB_NUM_ISO_URB; i++) { int frame_offset = 0; struct urb *urb = fc_usb->iso_urb[i]; deb_ts("initializing and submitting urb no. %d (buf_offset: %d).\n", i, buffer_offset); urb->dev = fc_usb->udev; urb->context = fc_usb; urb->complete = flexcop_usb_urb_complete; urb->pipe = B2C2_USB_DATA_PIPE; urb->transfer_flags = URB_ISO_ASAP; urb->interval = 1; urb->number_of_packets = B2C2_USB_FRAMES_PER_ISO; urb->transfer_buffer_length = frame_size * B2C2_USB_FRAMES_PER_ISO; urb->transfer_buffer = fc_usb->iso_buffer + buffer_offset; buffer_offset += frame_size * B2C2_USB_FRAMES_PER_ISO; for (j = 0; j < B2C2_USB_FRAMES_PER_ISO; j++) { deb_ts("urb no: %d, frame: %d, frame_offset: %d\n", i, j, frame_offset); urb->iso_frame_desc[j].offset = frame_offset; urb->iso_frame_desc[j].length = frame_size; frame_offset += frame_size; } if ((ret = usb_submit_urb(fc_usb->iso_urb[i],GFP_KERNEL))) { err("submitting urb %d failed with %d.", i, ret); goto urb_error; } deb_ts("submitted urb no. %d.\n", i); } /* SRAM */ flexcop_sram_set_dest(fc_usb->fc_dev, FC_SRAM_DEST_MEDIA | FC_SRAM_DEST_NET | FC_SRAM_DEST_CAO | FC_SRAM_DEST_CAI, FC_SRAM_DEST_TARGET_WAN_USB); flexcop_wan_set_speed(fc_usb->fc_dev, FC_WAN_SPEED_8MBITS); flexcop_sram_ctrl(fc_usb->fc_dev, 1, 1, 1); return 0; urb_error: flexcop_usb_transfer_exit(fc_usb); return ret; } static int flexcop_usb_init(struct flexcop_usb *fc_usb) { struct usb_host_interface *alt; int ret; /* use the alternate setting with the largest buffer */ ret = usb_set_interface(fc_usb->udev, 0, 1); if (ret) { err("set interface failed."); return ret; } alt = fc_usb->uintf->cur_altsetting; if (alt->desc.bNumEndpoints < 2) return -ENODEV; if (!usb_endpoint_is_isoc_in(&alt->endpoint[0].desc)) return -ENODEV; switch (fc_usb->udev->speed) { case USB_SPEED_LOW: err("cannot handle USB speed because it is too slow."); return -ENODEV; break; case USB_SPEED_FULL: info("running at FULL speed."); break; case USB_SPEED_HIGH: info("running at HIGH speed."); break; case USB_SPEED_SUPER: info("running at SUPER speed."); break; case USB_SPEED_SUPER_PLUS: info("running at SUPER+ speed."); break; case USB_SPEED_UNKNOWN: default: err("cannot handle USB speed because it is unknown."); return -ENODEV; } usb_set_intfdata(fc_usb->uintf, fc_usb); return 0; } static void flexcop_usb_exit(struct flexcop_usb *fc_usb) { usb_set_intfdata(fc_usb->uintf, NULL); } static int flexcop_usb_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usb_device *udev = interface_to_usbdev(intf); struct flexcop_usb *fc_usb = NULL; struct flexcop_device *fc = NULL; int ret; if ((fc = flexcop_device_kmalloc(sizeof(struct flexcop_usb))) == NULL) { err("out of memory\n"); return -ENOMEM; } /* general flexcop init */ fc_usb = fc->bus_specific; fc_usb->fc_dev = fc; mutex_init(&fc_usb->data_mutex); fc->read_ibi_reg = flexcop_usb_read_ibi_reg; fc->write_ibi_reg = flexcop_usb_write_ibi_reg; fc->i2c_request = flexcop_usb_i2c_request; fc->get_mac_addr = flexcop_usb_get_mac_addr; fc->stream_control = flexcop_usb_stream_control; fc->pid_filtering = 1; fc->bus_type = FC_USB; fc->dev = &udev->dev; fc->owner = THIS_MODULE; /* bus specific part */ fc_usb->udev = udev; fc_usb->uintf = intf; if ((ret = flexcop_usb_init(fc_usb)) != 0) goto err_kfree; /* init flexcop */ if ((ret = flexcop_device_initialize(fc)) != 0) goto err_usb_exit; /* xfer init */ if ((ret = flexcop_usb_transfer_init(fc_usb)) != 0) goto err_fc_exit; info("%s successfully initialized and connected.", DRIVER_NAME); return 0; err_fc_exit: flexcop_device_exit(fc); err_usb_exit: flexcop_usb_exit(fc_usb); err_kfree: flexcop_device_kfree(fc); return ret; } static void flexcop_usb_disconnect(struct usb_interface *intf) { struct flexcop_usb *fc_usb = usb_get_intfdata(intf); flexcop_usb_transfer_exit(fc_usb); flexcop_device_exit(fc_usb->fc_dev); flexcop_usb_exit(fc_usb); flexcop_device_kfree(fc_usb->fc_dev); info("%s successfully deinitialized and disconnected.", DRIVER_NAME); } static const struct usb_device_id flexcop_usb_table[] = { { USB_DEVICE(0x0af7, 0x0101) }, { } }; MODULE_DEVICE_TABLE (usb, flexcop_usb_table); /* usb specific object needed to register this driver with the usb subsystem */ static struct usb_driver flexcop_usb_driver = { .name = "b2c2_flexcop_usb", .probe = flexcop_usb_probe, .disconnect = flexcop_usb_disconnect, .id_table = flexcop_usb_table, }; module_usb_driver(flexcop_usb_driver); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_NAME); MODULE_LICENSE("GPL"); |
| 30 30 16 14 29 16 16 16 1 15 1 15 16 1 15 16 16 16 16 16 16 16 1 17 1 29 29 29 16 16 16 16 16 16 16 16 16 16 16 16 1 14 1 1 1 14 16 16 1 14 16 16 16 13 13 1 11 1 1 13 13 9 4 4 9 13 9 4 4 4 13 4 9 13 1 12 13 13 13 12 1 12 1 12 9 4 1 1 8 1 7 4 3 6 4 4 8 3 8 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/proc/array.c * * Copyright (C) 1992 by Linus Torvalds * based on ideas by Darren Senn * * Fixes: * Michael. K. Johnson: stat,statm extensions. * <johnsonm@stolaf.edu> * * Pauline Middelink : Made cmdline,envline only break at '\0's, to * make sure SET_PROCTITLE works. Also removed * bad '!' which forced address recalculation for * EVERY character on the current page. * <middelin@polyware.iaf.nl> * * Danny ter Haar : added cpuinfo * <dth@cistron.nl> * * Alessandro Rubini : profile extension. * <rubini@ipvvis.unipv.it> * * Jeff Tranter : added BogoMips field to cpuinfo * <Jeff_Tranter@Mitel.COM> * * Bruno Haible : remove 4K limit for the maps file * <haible@ma2s2.mathematik.uni-karlsruhe.de> * * Yves Arrouye : remove removal of trailing spaces in get_array. * <Yves.Arrouye@marin.fdn.fr> * * Jerome Forissier : added per-CPU time information to /proc/stat * and /proc/<pid>/cpu extension * <forissier@isia.cma.fr> * - Incorporation and non-SMP safe operation * of forissier patch in 2.1.78 by * Hans Marcus <crowbar@concepts.nl> * * aeb@cwi.nl : /proc/partitions * * * Alan Cox : security fixes. * <alan@lxorguk.ukuu.org.uk> * * Al Viro : safe handling of mm_struct * * Gerhard Wichert : added BIGMEM support * Siemens AG <Gerhard.Wichert@pdb.siemens.de> * * Al Viro & Jeff Garzik : moved most of the thing into base.c and * : proc_misc.c. The rest may eventually go into * : base.c too. */ #include <linux/types.h> #include <linux/errno.h> #include <linux/time.h> #include <linux/time_namespace.h> #include <linux/kernel.h> #include <linux/kernel_stat.h> #include <linux/tty.h> #include <linux/string.h> #include <linux/mman.h> #include <linux/sched/mm.h> #include <linux/sched/numa_balancing.h> #include <linux/sched/task_stack.h> #include <linux/sched/task.h> #include <linux/sched/cputime.h> #include <linux/proc_fs.h> #include <linux/ioport.h> #include <linux/io.h> #include <linux/mm.h> #include <linux/hugetlb.h> #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/smp.h> #include <linux/signal.h> #include <linux/highmem.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/times.h> #include <linux/cpuset.h> #include <linux/rcupdate.h> #include <linux/delayacct.h> #include <linux/seq_file.h> #include <linux/pid_namespace.h> #include <linux/prctl.h> #include <linux/ptrace.h> #include <linux/string_helpers.h> #include <linux/user_namespace.h> #include <linux/fs_struct.h> #include <linux/kthread.h> #include <linux/mmu_context.h> #include <asm/processor.h> #include "internal.h" void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape) { char tcomm[64]; /* * Test before PF_KTHREAD because all workqueue worker threads are * kernel threads. */ if (p->flags & PF_WQ_WORKER) wq_worker_comm(tcomm, sizeof(tcomm), p); else if (p->flags & PF_KTHREAD) get_kthread_comm(tcomm, sizeof(tcomm), p); else get_task_comm(tcomm, p); if (escape) seq_escape_str(m, tcomm, ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\"); else seq_printf(m, "%.64s", tcomm); } /* * The task state array is a strange "bitmap" of * reasons to sleep. Thus "running" is zero, and * you can test for combinations of others with * simple bit tests. */ static const char * const task_state_array[] = { /* states in TASK_REPORT: */ "R (running)", /* 0x00 */ "S (sleeping)", /* 0x01 */ "D (disk sleep)", /* 0x02 */ "T (stopped)", /* 0x04 */ "t (tracing stop)", /* 0x08 */ "X (dead)", /* 0x10 */ "Z (zombie)", /* 0x20 */ "P (parked)", /* 0x40 */ /* states beyond TASK_REPORT: */ "I (idle)", /* 0x80 */ }; static inline const char *get_task_state(struct task_struct *tsk) { BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != ARRAY_SIZE(task_state_array)); return task_state_array[task_state_index(tsk)]; } static inline void task_state(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *p) { struct user_namespace *user_ns = seq_user_ns(m); struct group_info *group_info; int g, umask = -1; struct task_struct *tracer; const struct cred *cred; pid_t ppid, tpid = 0, tgid, ngid; unsigned int max_fds = 0; rcu_read_lock(); tracer = ptrace_parent(p); if (tracer) tpid = task_pid_nr_ns(tracer, ns); ppid = task_ppid_nr_ns(p, ns); tgid = task_tgid_nr_ns(p, ns); ngid = task_numa_group_id(p); cred = get_task_cred(p); task_lock(p); if (p->fs) umask = p->fs->umask; if (p->files) max_fds = files_fdtable(p->files)->max_fds; task_unlock(p); rcu_read_unlock(); if (umask >= 0) seq_printf(m, "Umask:\t%#04o\n", umask); seq_puts(m, "State:\t"); seq_puts(m, get_task_state(p)); seq_put_decimal_ull(m, "\nTgid:\t", tgid); seq_put_decimal_ull(m, "\nNgid:\t", ngid); seq_put_decimal_ull(m, "\nPid:\t", pid_nr_ns(pid, ns)); seq_put_decimal_ull(m, "\nPPid:\t", ppid); seq_put_decimal_ull(m, "\nTracerPid:\t", tpid); seq_put_decimal_ull(m, "\nUid:\t", from_kuid_munged(user_ns, cred->uid)); seq_put_decimal_ull(m, "\t", from_kuid_munged(user_ns, cred->euid)); seq_put_decimal_ull(m, "\t", from_kuid_munged(user_ns, cred->suid)); seq_put_decimal_ull(m, "\t", from_kuid_munged(user_ns, cred->fsuid)); seq_put_decimal_ull(m, "\nGid:\t", from_kgid_munged(user_ns, cred->gid)); seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->egid)); seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->sgid)); seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->fsgid)); seq_put_decimal_ull(m, "\nFDSize:\t", max_fds); seq_puts(m, "\nGroups:\t"); group_info = cred->group_info; for (g = 0; g < group_info->ngroups; g++) seq_put_decimal_ull(m, g ? " " : "", from_kgid_munged(user_ns, group_info->gid[g])); put_cred(cred); /* Trailing space shouldn't have been added in the first place. */ seq_putc(m, ' '); #ifdef CONFIG_PID_NS seq_puts(m, "\nNStgid:"); for (g = ns->level; g <= pid->level; g++) seq_put_decimal_ull(m, "\t", task_tgid_nr_ns(p, pid->numbers[g].ns)); seq_puts(m, "\nNSpid:"); for (g = ns->level; g <= pid->level; g++) seq_put_decimal_ull(m, "\t", task_pid_nr_ns(p, pid->numbers[g].ns)); seq_puts(m, "\nNSpgid:"); for (g = ns->level; g <= pid->level; g++) seq_put_decimal_ull(m, "\t", task_pgrp_nr_ns(p, pid->numbers[g].ns)); seq_puts(m, "\nNSsid:"); for (g = ns->level; g <= pid->level; g++) seq_put_decimal_ull(m, "\t", task_session_nr_ns(p, pid->numbers[g].ns)); #endif seq_putc(m, '\n'); seq_printf(m, "Kthread:\t%c\n", p->flags & PF_KTHREAD ? '1' : '0'); } void render_sigset_t(struct seq_file *m, const char *header, sigset_t *set) { int i; seq_puts(m, header); i = _NSIG; do { int x = 0; i -= 4; if (sigismember(set, i+1)) x |= 1; if (sigismember(set, i+2)) x |= 2; if (sigismember(set, i+3)) x |= 4; if (sigismember(set, i+4)) x |= 8; seq_putc(m, hex_asc[x]); } while (i >= 4); seq_putc(m, '\n'); } static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *sigign, sigset_t *sigcatch) { struct k_sigaction *k; int i; k = p->sighand->action; for (i = 1; i <= _NSIG; ++i, ++k) { if (k->sa.sa_handler == SIG_IGN) sigaddset(sigign, i); else if (k->sa.sa_handler != SIG_DFL) sigaddset(sigcatch, i); } } static inline void task_sig(struct seq_file *m, struct task_struct *p) { unsigned long flags; sigset_t pending, shpending, blocked, ignored, caught; int num_threads = 0; unsigned int qsize = 0; unsigned long qlim = 0; sigemptyset(&pending); sigemptyset(&shpending); sigemptyset(&blocked); sigemptyset(&ignored); sigemptyset(&caught); if (lock_task_sighand(p, &flags)) { pending = p->pending.signal; shpending = p->signal->shared_pending.signal; blocked = p->blocked; collect_sigign_sigcatch(p, &ignored, &caught); num_threads = get_nr_threads(p); rcu_read_lock(); /* FIXME: is this correct? */ qsize = get_rlimit_value(task_ucounts(p), UCOUNT_RLIMIT_SIGPENDING); rcu_read_unlock(); qlim = task_rlimit(p, RLIMIT_SIGPENDING); unlock_task_sighand(p, &flags); } seq_put_decimal_ull(m, "Threads:\t", num_threads); seq_put_decimal_ull(m, "\nSigQ:\t", qsize); seq_put_decimal_ull(m, "/", qlim); /* render them all */ render_sigset_t(m, "\nSigPnd:\t", &pending); render_sigset_t(m, "ShdPnd:\t", &shpending); render_sigset_t(m, "SigBlk:\t", &blocked); render_sigset_t(m, "SigIgn:\t", &ignored); render_sigset_t(m, "SigCgt:\t", &caught); } static void render_cap_t(struct seq_file *m, const char *header, kernel_cap_t *a) { seq_puts(m, header); seq_put_hex_ll(m, NULL, a->val, 16); seq_putc(m, '\n'); } static inline void task_cap(struct seq_file *m, struct task_struct *p) { const struct cred *cred; kernel_cap_t cap_inheritable, cap_permitted, cap_effective, cap_bset, cap_ambient; rcu_read_lock(); cred = __task_cred(p); cap_inheritable = cred->cap_inheritable; cap_permitted = cred->cap_permitted; cap_effective = cred->cap_effective; cap_bset = cred->cap_bset; cap_ambient = cred->cap_ambient; rcu_read_unlock(); render_cap_t(m, "CapInh:\t", &cap_inheritable); render_cap_t(m, "CapPrm:\t", &cap_permitted); render_cap_t(m, "CapEff:\t", &cap_effective); render_cap_t(m, "CapBnd:\t", &cap_bset); render_cap_t(m, "CapAmb:\t", &cap_ambient); } static inline void task_seccomp(struct seq_file *m, struct task_struct *p) { seq_put_decimal_ull(m, "NoNewPrivs:\t", task_no_new_privs(p)); #ifdef CONFIG_SECCOMP seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode); #ifdef CONFIG_SECCOMP_FILTER seq_put_decimal_ull(m, "\nSeccomp_filters:\t", atomic_read(&p->seccomp.filter_count)); #endif #endif seq_puts(m, "\nSpeculation_Store_Bypass:\t"); switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) { case -EINVAL: seq_puts(m, "unknown"); break; case PR_SPEC_NOT_AFFECTED: seq_puts(m, "not vulnerable"); break; case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE: seq_puts(m, "thread force mitigated"); break; case PR_SPEC_PRCTL | PR_SPEC_DISABLE: seq_puts(m, "thread mitigated"); break; case PR_SPEC_PRCTL | PR_SPEC_ENABLE: seq_puts(m, "thread vulnerable"); break; case PR_SPEC_DISABLE: seq_puts(m, "globally mitigated"); break; default: seq_puts(m, "vulnerable"); break; } seq_puts(m, "\nSpeculationIndirectBranch:\t"); switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_INDIRECT_BRANCH)) { case -EINVAL: seq_puts(m, "unsupported"); break; case PR_SPEC_NOT_AFFECTED: seq_puts(m, "not affected"); break; case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE: seq_puts(m, "conditional force disabled"); break; case PR_SPEC_PRCTL | PR_SPEC_DISABLE: seq_puts(m, "conditional disabled"); break; case PR_SPEC_PRCTL | PR_SPEC_ENABLE: seq_puts(m, "conditional enabled"); break; case PR_SPEC_ENABLE: seq_puts(m, "always enabled"); break; case PR_SPEC_DISABLE: seq_puts(m, "always disabled"); break; default: seq_puts(m, "unknown"); break; } seq_putc(m, '\n'); } static inline void task_context_switch_counts(struct seq_file *m, struct task_struct *p) { seq_put_decimal_ull(m, "voluntary_ctxt_switches:\t", p->nvcsw); seq_put_decimal_ull(m, "\nnonvoluntary_ctxt_switches:\t", p->nivcsw); seq_putc(m, '\n'); } static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) { seq_printf(m, "Cpus_allowed:\t%*pb\n", cpumask_pr_args(&task->cpus_mask)); seq_printf(m, "Cpus_allowed_list:\t%*pbl\n", cpumask_pr_args(&task->cpus_mask)); } static inline void task_core_dumping(struct seq_file *m, struct task_struct *task) { seq_put_decimal_ull(m, "CoreDumping:\t", !!task->signal->core_state); seq_putc(m, '\n'); } static inline void task_thp_status(struct seq_file *m, struct mm_struct *mm) { bool thp_enabled = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE); if (thp_enabled) thp_enabled = !mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm); seq_printf(m, "THP_enabled:\t%d\n", thp_enabled); } static inline void task_untag_mask(struct seq_file *m, struct mm_struct *mm) { seq_printf(m, "untag_mask:\t%#lx\n", mm_untag_mask(mm)); } __weak void arch_proc_pid_thread_features(struct seq_file *m, struct task_struct *task) { } int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { struct mm_struct *mm = get_task_mm(task); seq_puts(m, "Name:\t"); proc_task_name(m, task, true); seq_putc(m, '\n'); task_state(m, ns, pid, task); if (mm) { task_mem(m, mm); task_core_dumping(m, task); task_thp_status(m, mm); task_untag_mask(m, mm); mmput(mm); } task_sig(m, task); task_cap(m, task); task_seccomp(m, task); task_cpus_allowed(m, task); cpuset_task_status_allowed(m, task); task_context_switch_counts(m, task); arch_proc_pid_thread_features(m, task); return 0; } static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task, int whole) { unsigned long vsize, eip, esp, wchan = 0; int priority, nice; int tty_pgrp = -1, tty_nr = 0; sigset_t sigign, sigcatch; char state; pid_t ppid = 0, pgid = -1, sid = -1; int num_threads = 0; int permitted; struct mm_struct *mm; unsigned long long start_time; unsigned long cmin_flt, cmaj_flt, min_flt, maj_flt; u64 cutime, cstime, cgtime, utime, stime, gtime; unsigned long rsslim = 0; unsigned long flags; int exit_code = task->exit_code; struct signal_struct *sig = task->signal; unsigned int seq = 1; state = *get_task_state(task); vsize = eip = esp = 0; permitted = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS | PTRACE_MODE_NOAUDIT); mm = get_task_mm(task); if (mm) { vsize = task_vsize(mm); /* * esp and eip are intentionally zeroed out. There is no * non-racy way to read them without freezing the task. * Programs that need reliable values can use ptrace(2). * * The only exception is if the task is core dumping because * a program is not able to use ptrace(2) in that case. It is * safe because the task has stopped executing permanently. */ if (permitted && (task->flags & (PF_EXITING|PF_DUMPCORE|PF_POSTCOREDUMP))) { if (try_get_task_stack(task)) { eip = KSTK_EIP(task); esp = KSTK_ESP(task); put_task_stack(task); } } } sigemptyset(&sigign); sigemptyset(&sigcatch); if (lock_task_sighand(task, &flags)) { if (sig->tty) { struct pid *pgrp = tty_get_pgrp(sig->tty); tty_pgrp = pid_nr_ns(pgrp, ns); put_pid(pgrp); tty_nr = new_encode_dev(tty_devnum(sig->tty)); } num_threads = get_nr_threads(task); collect_sigign_sigcatch(task, &sigign, &sigcatch); rsslim = READ_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur); if (whole) { if (sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_STOP_STOPPED)) exit_code = sig->group_exit_code; } sid = task_session_nr_ns(task, ns); ppid = task_tgid_nr_ns(task->real_parent, ns); pgid = task_pgrp_nr_ns(task, ns); unlock_task_sighand(task, &flags); } if (permitted && (!whole || num_threads < 2)) wchan = !task_is_running(task); do { seq++; /* 2 on the 1st/lockless path, otherwise odd */ flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); cmin_flt = sig->cmin_flt; cmaj_flt = sig->cmaj_flt; cutime = sig->cutime; cstime = sig->cstime; cgtime = sig->cgtime; if (whole) { struct task_struct *t; min_flt = sig->min_flt; maj_flt = sig->maj_flt; gtime = sig->gtime; rcu_read_lock(); __for_each_thread(sig, t) { min_flt += t->min_flt; maj_flt += t->maj_flt; gtime += task_gtime(t); } rcu_read_unlock(); } } while (need_seqretry(&sig->stats_lock, seq)); done_seqretry_irqrestore(&sig->stats_lock, seq, flags); if (whole) { thread_group_cputime_adjusted(task, &utime, &stime); } else { task_cputime_adjusted(task, &utime, &stime); min_flt = task->min_flt; maj_flt = task->maj_flt; gtime = task_gtime(task); } /* scale priority and nice values from timeslices to -20..20 */ /* to make it look like a "normal" Unix priority/nice value */ priority = task_prio(task); nice = task_nice(task); /* apply timens offset for boottime and convert nsec -> ticks */ start_time = nsec_to_clock_t(timens_add_boottime_ns(task->start_boottime)); seq_put_decimal_ull(m, "", pid_nr_ns(pid, ns)); seq_puts(m, " ("); proc_task_name(m, task, false); seq_puts(m, ") "); seq_putc(m, state); seq_put_decimal_ll(m, " ", ppid); seq_put_decimal_ll(m, " ", pgid); seq_put_decimal_ll(m, " ", sid); seq_put_decimal_ll(m, " ", tty_nr); seq_put_decimal_ll(m, " ", tty_pgrp); seq_put_decimal_ull(m, " ", task->flags); seq_put_decimal_ull(m, " ", min_flt); seq_put_decimal_ull(m, " ", cmin_flt); seq_put_decimal_ull(m, " ", maj_flt); seq_put_decimal_ull(m, " ", cmaj_flt); seq_put_decimal_ull(m, " ", nsec_to_clock_t(utime)); seq_put_decimal_ull(m, " ", nsec_to_clock_t(stime)); seq_put_decimal_ll(m, " ", nsec_to_clock_t(cutime)); seq_put_decimal_ll(m, " ", nsec_to_clock_t(cstime)); seq_put_decimal_ll(m, " ", priority); seq_put_decimal_ll(m, " ", nice); seq_put_decimal_ll(m, " ", num_threads); seq_put_decimal_ull(m, " ", 0); seq_put_decimal_ull(m, " ", start_time); seq_put_decimal_ull(m, " ", vsize); seq_put_decimal_ull(m, " ", mm ? get_mm_rss(mm) : 0); seq_put_decimal_ull(m, " ", rsslim); seq_put_decimal_ull(m, " ", mm ? (permitted ? mm->start_code : 1) : 0); seq_put_decimal_ull(m, " ", mm ? (permitted ? mm->end_code : 1) : 0); seq_put_decimal_ull(m, " ", (permitted && mm) ? mm->start_stack : 0); seq_put_decimal_ull(m, " ", esp); seq_put_decimal_ull(m, " ", eip); /* The signal information here is obsolete. * It must be decimal for Linux 2.0 compatibility. * Use /proc/#/status for real-time signals. */ seq_put_decimal_ull(m, " ", task->pending.signal.sig[0] & 0x7fffffffUL); seq_put_decimal_ull(m, " ", task->blocked.sig[0] & 0x7fffffffUL); seq_put_decimal_ull(m, " ", sigign.sig[0] & 0x7fffffffUL); seq_put_decimal_ull(m, " ", sigcatch.sig[0] & 0x7fffffffUL); /* * We used to output the absolute kernel address, but that's an * information leak - so instead we show a 0/1 flag here, to signal * to user-space whether there's a wchan field in /proc/PID/wchan. * * This works with older implementations of procps as well. */ seq_put_decimal_ull(m, " ", wchan); seq_put_decimal_ull(m, " ", 0); seq_put_decimal_ull(m, " ", 0); seq_put_decimal_ll(m, " ", task->exit_signal); seq_put_decimal_ll(m, " ", task_cpu(task)); seq_put_decimal_ull(m, " ", task->rt_priority); seq_put_decimal_ull(m, " ", task->policy); seq_put_decimal_ull(m, " ", delayacct_blkio_ticks(task)); seq_put_decimal_ull(m, " ", nsec_to_clock_t(gtime)); seq_put_decimal_ll(m, " ", nsec_to_clock_t(cgtime)); if (mm && permitted) { seq_put_decimal_ull(m, " ", mm->start_data); seq_put_decimal_ull(m, " ", mm->end_data); seq_put_decimal_ull(m, " ", mm->start_brk); seq_put_decimal_ull(m, " ", mm->arg_start); seq_put_decimal_ull(m, " ", mm->arg_end); seq_put_decimal_ull(m, " ", mm->env_start); seq_put_decimal_ull(m, " ", mm->env_end); } else seq_puts(m, " 0 0 0 0 0 0 0"); if (permitted) seq_put_decimal_ll(m, " ", exit_code); else seq_puts(m, " 0"); seq_putc(m, '\n'); if (mm) mmput(mm); return 0; } int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { return do_task_stat(m, ns, pid, task, 0); } int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { return do_task_stat(m, ns, pid, task, 1); } int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { struct mm_struct *mm = get_task_mm(task); if (mm) { unsigned long size; unsigned long resident = 0; unsigned long shared = 0; unsigned long text = 0; unsigned long data = 0; size = task_statm(mm, &shared, &text, &data, &resident); mmput(mm); /* * For quick read, open code by putting numbers directly * expected format is * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n", * size, resident, shared, text, data); */ seq_put_decimal_ull(m, "", size); seq_put_decimal_ull(m, " ", resident); seq_put_decimal_ull(m, " ", shared); seq_put_decimal_ull(m, " ", text); seq_put_decimal_ull(m, " ", 0); seq_put_decimal_ull(m, " ", data); seq_put_decimal_ull(m, " ", 0); seq_putc(m, '\n'); } else { seq_write(m, "0 0 0 0 0 0 0\n", 14); } return 0; } #ifdef CONFIG_PROC_CHILDREN static struct pid * get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos) { struct task_struct *start, *task; struct pid *pid = NULL; read_lock(&tasklist_lock); start = pid_task(proc_pid(inode), PIDTYPE_PID); if (!start) goto out; /* * Lets try to continue searching first, this gives * us significant speedup on children-rich processes. */ if (pid_prev) { task = pid_task(pid_prev, PIDTYPE_PID); if (task && task->real_parent == start && !(list_empty(&task->sibling))) { if (list_is_last(&task->sibling, &start->children)) goto out; task = list_first_entry(&task->sibling, struct task_struct, sibling); pid = get_pid(task_pid(task)); goto out; } } /* * Slow search case. * * We might miss some children here if children * are exited while we were not holding the lock, * but it was never promised to be accurate that * much. * * "Just suppose that the parent sleeps, but N children * exit after we printed their tids. Now the slow paths * skips N extra children, we miss N tasks." (c) * * So one need to stop or freeze the leader and all * its children to get a precise result. */ list_for_each_entry(task, &start->children, sibling) { if (pos-- == 0) { pid = get_pid(task_pid(task)); break; } } out: read_unlock(&tasklist_lock); return pid; } static int children_seq_show(struct seq_file *seq, void *v) { struct inode *inode = file_inode(seq->file); seq_printf(seq, "%d ", pid_nr_ns(v, proc_pid_ns(inode->i_sb))); return 0; } static void *children_seq_start(struct seq_file *seq, loff_t *pos) { return get_children_pid(file_inode(seq->file), NULL, *pos); } static void *children_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct pid *pid; pid = get_children_pid(file_inode(seq->file), v, *pos + 1); put_pid(v); ++*pos; return pid; } static void children_seq_stop(struct seq_file *seq, void *v) { put_pid(v); } static const struct seq_operations children_seq_ops = { .start = children_seq_start, .next = children_seq_next, .stop = children_seq_stop, .show = children_seq_show, }; static int children_seq_open(struct inode *inode, struct file *file) { return seq_open(file, &children_seq_ops); } const struct file_operations proc_tid_children_operations = { .open = children_seq_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; #endif /* CONFIG_PROC_CHILDREN */ |
| 2121 2101 782 2103 2104 2103 6786 6784 5256 16 5256 5252 5257 67 3 1850 3 1848 1916 6784 1891 5073 2429 5263 1918 3583 3590 3588 1859 6848 6600 4290 6846 1700 1 509 2429 6755 5362 6756 6370 484 2426 6750 80 5239 5845 3590 5536 2103 6442 6782 6782 6790 1 3 5 1 4 7 202 202 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 | // SPDX-License-Identifier: GPL-2.0 /* * kernel userspace event delivery * * Copyright (C) 2004 Red Hat, Inc. All rights reserved. * Copyright (C) 2004 Novell, Inc. All rights reserved. * Copyright (C) 2004 IBM, Inc. All rights reserved. * * Authors: * Robert Love <rml@novell.com> * Kay Sievers <kay.sievers@vrfy.org> * Arjan van de Ven <arjanv@redhat.com> * Greg Kroah-Hartman <greg@kroah.com> */ #include <linux/spinlock.h> #include <linux/string.h> #include <linux/kobject.h> #include <linux/export.h> #include <linux/kmod.h> #include <linux/slab.h> #include <linux/socket.h> #include <linux/skbuff.h> #include <linux/netlink.h> #include <linux/uidgid.h> #include <linux/uuid.h> #include <linux/ctype.h> #include <net/sock.h> #include <net/netlink.h> #include <net/net_namespace.h> atomic64_t uevent_seqnum; #ifdef CONFIG_UEVENT_HELPER char uevent_helper[UEVENT_HELPER_PATH_LEN] = CONFIG_UEVENT_HELPER_PATH; #endif struct uevent_sock { struct list_head list; struct sock *sk; }; #ifdef CONFIG_NET static LIST_HEAD(uevent_sock_list); /* This lock protects uevent_sock_list */ static DEFINE_MUTEX(uevent_sock_mutex); #endif /* the strings here must match the enum in include/linux/kobject.h */ static const char *kobject_actions[] = { [KOBJ_ADD] = "add", [KOBJ_REMOVE] = "remove", [KOBJ_CHANGE] = "change", [KOBJ_MOVE] = "move", [KOBJ_ONLINE] = "online", [KOBJ_OFFLINE] = "offline", [KOBJ_BIND] = "bind", [KOBJ_UNBIND] = "unbind", }; static int kobject_action_type(const char *buf, size_t count, enum kobject_action *type, const char **args) { enum kobject_action action; size_t count_first; const char *args_start; int ret = -EINVAL; if (count && (buf[count-1] == '\n' || buf[count-1] == '\0')) count--; if (!count) goto out; args_start = strnchr(buf, count, ' '); if (args_start) { count_first = args_start - buf; args_start = args_start + 1; } else count_first = count; for (action = 0; action < ARRAY_SIZE(kobject_actions); action++) { if (strncmp(kobject_actions[action], buf, count_first) != 0) continue; if (kobject_actions[action][count_first] != '\0') continue; if (args) *args = args_start; *type = action; ret = 0; break; } out: return ret; } static const char *action_arg_word_end(const char *buf, const char *buf_end, char delim) { const char *next = buf; while (next <= buf_end && *next != delim) if (!isalnum(*next++)) return NULL; if (next == buf) return NULL; return next; } static int kobject_action_args(const char *buf, size_t count, struct kobj_uevent_env **ret_env) { struct kobj_uevent_env *env = NULL; const char *next, *buf_end, *key; int key_len; int r = -EINVAL; if (count && (buf[count - 1] == '\n' || buf[count - 1] == '\0')) count--; if (!count) return -EINVAL; env = kzalloc(sizeof(*env), GFP_KERNEL); if (!env) return -ENOMEM; /* first arg is UUID */ if (count < UUID_STRING_LEN || !uuid_is_valid(buf) || add_uevent_var(env, "SYNTH_UUID=%.*s", UUID_STRING_LEN, buf)) goto out; /* * the rest are custom environment variables in KEY=VALUE * format with ' ' delimiter between each KEY=VALUE pair */ next = buf + UUID_STRING_LEN; buf_end = buf + count - 1; while (next <= buf_end) { if (*next != ' ') goto out; /* skip the ' ', key must follow */ key = ++next; if (key > buf_end) goto out; buf = next; next = action_arg_word_end(buf, buf_end, '='); if (!next || next > buf_end || *next != '=') goto out; key_len = next - buf; /* skip the '=', value must follow */ if (++next > buf_end) goto out; buf = next; next = action_arg_word_end(buf, buf_end, ' '); if (!next) goto out; if (add_uevent_var(env, "SYNTH_ARG_%.*s=%.*s", key_len, key, (int) (next - buf), buf)) goto out; } r = 0; out: if (r) kfree(env); else *ret_env = env; return r; } /** * kobject_synth_uevent - send synthetic uevent with arguments * * @kobj: struct kobject for which synthetic uevent is to be generated * @buf: buffer containing action type and action args, newline is ignored * @count: length of buffer * * Returns 0 if kobject_synthetic_uevent() is completed with success or the * corresponding error when it fails. */ int kobject_synth_uevent(struct kobject *kobj, const char *buf, size_t count) { char *no_uuid_envp[] = { "SYNTH_UUID=0", NULL }; enum kobject_action action; const char *action_args; struct kobj_uevent_env *env; const char *msg = NULL, *devpath; int r; r = kobject_action_type(buf, count, &action, &action_args); if (r) { msg = "unknown uevent action string"; goto out; } if (!action_args) { r = kobject_uevent_env(kobj, action, no_uuid_envp); goto out; } r = kobject_action_args(action_args, count - (action_args - buf), &env); if (r == -EINVAL) { msg = "incorrect uevent action arguments"; goto out; } if (r) goto out; r = kobject_uevent_env(kobj, action, env->envp); kfree(env); out: if (r) { devpath = kobject_get_path(kobj, GFP_KERNEL); pr_warn("synth uevent: %s: %s\n", devpath ?: "unknown device", msg ?: "failed to send uevent"); kfree(devpath); } return r; } #ifdef CONFIG_UEVENT_HELPER static int kobj_usermode_filter(struct kobject *kobj) { const struct kobj_ns_type_operations *ops; ops = kobj_ns_ops(kobj); if (ops) { const void *init_ns, *ns; ns = kobj->ktype->namespace(kobj); init_ns = ops->initial_ns(); return ns != init_ns; } return 0; } static int init_uevent_argv(struct kobj_uevent_env *env, const char *subsystem) { int buffer_size = sizeof(env->buf) - env->buflen; int len; len = strscpy(&env->buf[env->buflen], subsystem, buffer_size); if (len < 0) { pr_warn("%s: insufficient buffer space (%u left) for %s\n", __func__, buffer_size, subsystem); return -ENOMEM; } env->argv[0] = uevent_helper; env->argv[1] = &env->buf[env->buflen]; env->argv[2] = NULL; env->buflen += len + 1; return 0; } static void cleanup_uevent_env(struct subprocess_info *info) { kfree(info->data); } #endif #ifdef CONFIG_NET static struct sk_buff *alloc_uevent_skb(struct kobj_uevent_env *env, const char *action_string, const char *devpath) { struct netlink_skb_parms *parms; struct sk_buff *skb = NULL; char *scratch; size_t len; /* allocate message with maximum possible size */ len = strlen(action_string) + strlen(devpath) + 2; skb = alloc_skb(len + env->buflen, GFP_KERNEL); if (!skb) return NULL; /* add header */ scratch = skb_put(skb, len); sprintf(scratch, "%s@%s", action_string, devpath); skb_put_data(skb, env->buf, env->buflen); parms = &NETLINK_CB(skb); parms->creds.uid = GLOBAL_ROOT_UID; parms->creds.gid = GLOBAL_ROOT_GID; parms->dst_group = 1; parms->portid = 0; return skb; } static int uevent_net_broadcast_untagged(struct kobj_uevent_env *env, const char *action_string, const char *devpath) { struct sk_buff *skb = NULL; struct uevent_sock *ue_sk; int retval = 0; /* send netlink message */ mutex_lock(&uevent_sock_mutex); list_for_each_entry(ue_sk, &uevent_sock_list, list) { struct sock *uevent_sock = ue_sk->sk; if (!netlink_has_listeners(uevent_sock, 1)) continue; if (!skb) { retval = -ENOMEM; skb = alloc_uevent_skb(env, action_string, devpath); if (!skb) continue; } retval = netlink_broadcast(uevent_sock, skb_get(skb), 0, 1, GFP_KERNEL); /* ENOBUFS should be handled in userspace */ if (retval == -ENOBUFS || retval == -ESRCH) retval = 0; } mutex_unlock(&uevent_sock_mutex); consume_skb(skb); return retval; } static int uevent_net_broadcast_tagged(struct sock *usk, struct kobj_uevent_env *env, const char *action_string, const char *devpath) { struct user_namespace *owning_user_ns = sock_net(usk)->user_ns; struct sk_buff *skb = NULL; int ret = 0; skb = alloc_uevent_skb(env, action_string, devpath); if (!skb) return -ENOMEM; /* fix credentials */ if (owning_user_ns != &init_user_ns) { struct netlink_skb_parms *parms = &NETLINK_CB(skb); kuid_t root_uid; kgid_t root_gid; /* fix uid */ root_uid = make_kuid(owning_user_ns, 0); if (uid_valid(root_uid)) parms->creds.uid = root_uid; /* fix gid */ root_gid = make_kgid(owning_user_ns, 0); if (gid_valid(root_gid)) parms->creds.gid = root_gid; } ret = netlink_broadcast(usk, skb, 0, 1, GFP_KERNEL); /* ENOBUFS should be handled in userspace */ if (ret == -ENOBUFS || ret == -ESRCH) ret = 0; return ret; } #endif static int kobject_uevent_net_broadcast(struct kobject *kobj, struct kobj_uevent_env *env, const char *action_string, const char *devpath) { int ret = 0; #ifdef CONFIG_NET const struct kobj_ns_type_operations *ops; const struct net *net = NULL; ops = kobj_ns_ops(kobj); if (!ops && kobj->kset) { struct kobject *ksobj = &kobj->kset->kobj; if (ksobj->parent != NULL) ops = kobj_ns_ops(ksobj->parent); } /* kobjects currently only carry network namespace tags and they * are the only tag relevant here since we want to decide which * network namespaces to broadcast the uevent into. */ if (ops && ops->netlink_ns && kobj->ktype->namespace) if (ops->type == KOBJ_NS_TYPE_NET) net = kobj->ktype->namespace(kobj); if (!net) ret = uevent_net_broadcast_untagged(env, action_string, devpath); else ret = uevent_net_broadcast_tagged(net->uevent_sock->sk, env, action_string, devpath); #endif return ret; } static void zap_modalias_env(struct kobj_uevent_env *env) { static const char modalias_prefix[] = "MODALIAS="; size_t len; int i, j; for (i = 0; i < env->envp_idx;) { if (strncmp(env->envp[i], modalias_prefix, sizeof(modalias_prefix) - 1)) { i++; continue; } len = strlen(env->envp[i]) + 1; if (i != env->envp_idx - 1) { /* @env->envp[] contains pointers to @env->buf[] * with @env->buflen chars, and we are removing * variable MODALIAS here pointed by @env->envp[i] * with length @len as shown below: * * 0 @env->buf[] @env->buflen * --------------------------------------------- * ^ ^ ^ ^ * | |-> @len <-| target block | * @env->envp[0] @env->envp[i] @env->envp[i + 1] * * so the "target block" indicated above is moved * backward by @len, and its right size is * @env->buflen - (@env->envp[i + 1] - @env->envp[0]). */ memmove(env->envp[i], env->envp[i + 1], env->buflen - (env->envp[i + 1] - env->envp[0])); for (j = i; j < env->envp_idx - 1; j++) env->envp[j] = env->envp[j + 1] - len; } env->envp_idx--; env->buflen -= len; } } /** * kobject_uevent_env - send an uevent with environmental data * * @kobj: struct kobject that the action is happening to * @action: action that is happening * @envp_ext: pointer to environmental data * * Returns 0 if kobject_uevent_env() is completed with success or the * corresponding error when it fails. */ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, char *envp_ext[]) { struct kobj_uevent_env *env; const char *action_string = kobject_actions[action]; const char *devpath = NULL; const char *subsystem; struct kobject *top_kobj; struct kset *kset; const struct kset_uevent_ops *uevent_ops; int i = 0; int retval = 0; /* * Mark "remove" event done regardless of result, for some subsystems * do not want to re-trigger "remove" event via automatic cleanup. */ if (action == KOBJ_REMOVE) kobj->state_remove_uevent_sent = 1; pr_debug("kobject: '%s' (%p): %s\n", kobject_name(kobj), kobj, __func__); /* search the kset we belong to */ top_kobj = kobj; while (!top_kobj->kset && top_kobj->parent) top_kobj = top_kobj->parent; if (!top_kobj->kset) { pr_debug("kobject: '%s' (%p): %s: attempted to send uevent " "without kset!\n", kobject_name(kobj), kobj, __func__); return -EINVAL; } kset = top_kobj->kset; uevent_ops = kset->uevent_ops; /* skip the event, if uevent_suppress is set*/ if (kobj->uevent_suppress) { pr_debug("kobject: '%s' (%p): %s: uevent_suppress " "caused the event to drop!\n", kobject_name(kobj), kobj, __func__); return 0; } /* skip the event, if the filter returns zero. */ if (uevent_ops && uevent_ops->filter) if (!uevent_ops->filter(kobj)) { pr_debug("kobject: '%s' (%p): %s: filter function " "caused the event to drop!\n", kobject_name(kobj), kobj, __func__); return 0; } /* originating subsystem */ if (uevent_ops && uevent_ops->name) subsystem = uevent_ops->name(kobj); else subsystem = kobject_name(&kset->kobj); if (!subsystem) { pr_debug("kobject: '%s' (%p): %s: unset subsystem caused the " "event to drop!\n", kobject_name(kobj), kobj, __func__); return 0; } /* environment buffer */ env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL); if (!env) return -ENOMEM; /* complete object path */ devpath = kobject_get_path(kobj, GFP_KERNEL); if (!devpath) { retval = -ENOENT; goto exit; } /* default keys */ retval = add_uevent_var(env, "ACTION=%s", action_string); if (retval) goto exit; retval = add_uevent_var(env, "DEVPATH=%s", devpath); if (retval) goto exit; retval = add_uevent_var(env, "SUBSYSTEM=%s", subsystem); if (retval) goto exit; /* keys passed in from the caller */ if (envp_ext) { for (i = 0; envp_ext[i]; i++) { retval = add_uevent_var(env, "%s", envp_ext[i]); if (retval) goto exit; } } /* let the kset specific function add its stuff */ if (uevent_ops && uevent_ops->uevent) { retval = uevent_ops->uevent(kobj, env); if (retval) { pr_debug("kobject: '%s' (%p): %s: uevent() returned " "%d\n", kobject_name(kobj), kobj, __func__, retval); goto exit; } } switch (action) { case KOBJ_ADD: /* * Mark "add" event so we can make sure we deliver "remove" * event to userspace during automatic cleanup. If * the object did send an "add" event, "remove" will * automatically generated by the core, if not already done * by the caller. */ kobj->state_add_uevent_sent = 1; break; case KOBJ_UNBIND: zap_modalias_env(env); break; default: break; } /* we will send an event, so request a new sequence number */ retval = add_uevent_var(env, "SEQNUM=%llu", atomic64_inc_return(&uevent_seqnum)); if (retval) goto exit; retval = kobject_uevent_net_broadcast(kobj, env, action_string, devpath); #ifdef CONFIG_UEVENT_HELPER /* call uevent_helper, usually only enabled during early boot */ if (uevent_helper[0] && !kobj_usermode_filter(kobj)) { struct subprocess_info *info; retval = add_uevent_var(env, "HOME=/"); if (retval) goto exit; retval = add_uevent_var(env, "PATH=/sbin:/bin:/usr/sbin:/usr/bin"); if (retval) goto exit; retval = init_uevent_argv(env, subsystem); if (retval) goto exit; retval = -ENOMEM; info = call_usermodehelper_setup(env->argv[0], env->argv, env->envp, GFP_KERNEL, NULL, cleanup_uevent_env, env); if (info) { retval = call_usermodehelper_exec(info, UMH_NO_WAIT); env = NULL; /* freed by cleanup_uevent_env */ } } #endif exit: kfree(devpath); kfree(env); return retval; } EXPORT_SYMBOL_GPL(kobject_uevent_env); /** * kobject_uevent - notify userspace by sending an uevent * * @kobj: struct kobject that the action is happening to * @action: action that is happening * * Returns 0 if kobject_uevent() is completed with success or the * corresponding error when it fails. */ int kobject_uevent(struct kobject *kobj, enum kobject_action action) { return kobject_uevent_env(kobj, action, NULL); } EXPORT_SYMBOL_GPL(kobject_uevent); /** * add_uevent_var - add key value string to the environment buffer * @env: environment buffer structure * @format: printf format for the key=value pair * * Returns 0 if environment variable was added successfully or -ENOMEM * if no space was available. */ int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...) { va_list args; int len; if (env->envp_idx >= ARRAY_SIZE(env->envp)) { WARN(1, KERN_ERR "add_uevent_var: too many keys\n"); return -ENOMEM; } va_start(args, format); len = vsnprintf(&env->buf[env->buflen], sizeof(env->buf) - env->buflen, format, args); va_end(args); if (len >= (sizeof(env->buf) - env->buflen)) { WARN(1, KERN_ERR "add_uevent_var: buffer size too small\n"); return -ENOMEM; } env->envp[env->envp_idx++] = &env->buf[env->buflen]; env->buflen += len + 1; return 0; } EXPORT_SYMBOL_GPL(add_uevent_var); #if defined(CONFIG_NET) static int uevent_net_broadcast(struct sock *usk, struct sk_buff *skb, struct netlink_ext_ack *extack) { /* u64 to chars: 2^64 - 1 = 21 chars */ char buf[sizeof("SEQNUM=") + 21]; struct sk_buff *skbc; int ret; /* bump and prepare sequence number */ ret = snprintf(buf, sizeof(buf), "SEQNUM=%llu", atomic64_inc_return(&uevent_seqnum)); if (ret < 0 || (size_t)ret >= sizeof(buf)) return -ENOMEM; ret++; /* verify message does not overflow */ if ((skb->len + ret) > UEVENT_BUFFER_SIZE) { NL_SET_ERR_MSG(extack, "uevent message too big"); return -EINVAL; } /* copy skb and extend to accommodate sequence number */ skbc = skb_copy_expand(skb, 0, ret, GFP_KERNEL); if (!skbc) return -ENOMEM; /* append sequence number */ skb_put_data(skbc, buf, ret); /* remove msg header */ skb_pull(skbc, NLMSG_HDRLEN); /* set portid 0 to inform userspace message comes from kernel */ NETLINK_CB(skbc).portid = 0; NETLINK_CB(skbc).dst_group = 1; ret = netlink_broadcast(usk, skbc, 0, 1, GFP_KERNEL); /* ENOBUFS should be handled in userspace */ if (ret == -ENOBUFS || ret == -ESRCH) ret = 0; return ret; } static int uevent_net_rcv_skb(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net; int ret; if (!nlmsg_data(nlh)) return -EINVAL; /* * Verify that we are allowed to send messages to the target * network namespace. The caller must have CAP_SYS_ADMIN in the * owning user namespace of the target network namespace. */ net = sock_net(NETLINK_CB(skb).sk); if (!netlink_ns_capable(skb, net->user_ns, CAP_SYS_ADMIN)) { NL_SET_ERR_MSG(extack, "missing CAP_SYS_ADMIN capability"); return -EPERM; } ret = uevent_net_broadcast(net->uevent_sock->sk, skb, extack); return ret; } static void uevent_net_rcv(struct sk_buff *skb) { netlink_rcv_skb(skb, &uevent_net_rcv_skb); } static int uevent_net_init(struct net *net) { struct uevent_sock *ue_sk; struct netlink_kernel_cfg cfg = { .groups = 1, .input = uevent_net_rcv, .flags = NL_CFG_F_NONROOT_RECV }; ue_sk = kzalloc(sizeof(*ue_sk), GFP_KERNEL); if (!ue_sk) return -ENOMEM; ue_sk->sk = netlink_kernel_create(net, NETLINK_KOBJECT_UEVENT, &cfg); if (!ue_sk->sk) { pr_err("kobject_uevent: unable to create netlink socket!\n"); kfree(ue_sk); return -ENODEV; } net->uevent_sock = ue_sk; /* Restrict uevents to initial user namespace. */ if (sock_net(ue_sk->sk)->user_ns == &init_user_ns) { mutex_lock(&uevent_sock_mutex); list_add_tail(&ue_sk->list, &uevent_sock_list); mutex_unlock(&uevent_sock_mutex); } return 0; } static void uevent_net_exit(struct net *net) { struct uevent_sock *ue_sk = net->uevent_sock; if (sock_net(ue_sk->sk)->user_ns == &init_user_ns) { mutex_lock(&uevent_sock_mutex); list_del(&ue_sk->list); mutex_unlock(&uevent_sock_mutex); } netlink_kernel_release(ue_sk->sk); kfree(ue_sk); } static struct pernet_operations uevent_net_ops = { .init = uevent_net_init, .exit = uevent_net_exit, }; static int __init kobject_uevent_init(void) { return register_pernet_subsys(&uevent_net_ops); } postcore_initcall(kobject_uevent_init); #endif #ifdef CONFIG_UEVENT_HELPER static const struct ctl_table uevent_helper_sysctl_table[] = { { .procname = "hotplug", .data = &uevent_helper, .maxlen = UEVENT_HELPER_PATH_LEN, .mode = 0644, .proc_handler = proc_dostring, }, }; static int __init init_uevent_helper_sysctl(void) { register_sysctl_init("kernel", uevent_helper_sysctl_table); return 0; } postcore_initcall(init_uevent_helper_sysctl); #endif |
| 1 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Force feedback support for Holtek On Line Grip based gamepads * * These include at least a Brazilian "Clone Joypad Super Power Fire" * which uses vendor ID 0x1241 and identifies as "HOLTEK On Line Grip". * * Copyright (c) 2011 Anssi Hannula <anssi.hannula@iki.fi> */ /* */ #include <linux/hid.h> #include <linux/input.h> #include <linux/module.h> #include <linux/slab.h> #include "hid-ids.h" #ifdef CONFIG_HOLTEK_FF /* * These commands and parameters are currently known: * * byte 0: command id: * 01 set effect parameters * 02 play specified effect * 03 stop specified effect * 04 stop all effects * 06 stop all effects * (the difference between 04 and 06 isn't known; win driver * sends 06,04 on application init, and 06 otherwise) * * Commands 01 and 02 need to be sent as pairs, i.e. you need to send 01 * before each 02. * * The rest of the bytes are parameters. Command 01 takes all of them, and * commands 02,03 take only the effect id. * * byte 1: * bits 0-3: effect id: * 1: very strong rumble * 2: periodic rumble, short intervals * 3: very strong rumble * 4: periodic rumble, long intervals * 5: weak periodic rumble, long intervals * 6: weak periodic rumble, short intervals * 7: periodic rumble, short intervals * 8: strong periodic rumble, short intervals * 9: very strong rumble * a: causes an error * b: very strong periodic rumble, very short intervals * c-f: nothing * bit 6: right (weak) motor enabled * bit 7: left (strong) motor enabled * * bytes 2-3: time in milliseconds, big-endian * bytes 5-6: unknown (win driver seems to use at least 10e0 with effect 1 * and 0014 with effect 6) * byte 7: * bits 0-3: effect magnitude */ #define HOLTEKFF_MSG_LENGTH 7 static const u8 start_effect_1[] = { 0x02, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00 }; static const u8 stop_all4[] = { 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; static const u8 stop_all6[] = { 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; struct holtekff_device { struct hid_field *field; }; static void holtekff_send(struct holtekff_device *holtekff, struct hid_device *hid, const u8 data[HOLTEKFF_MSG_LENGTH]) { int i; for (i = 0; i < HOLTEKFF_MSG_LENGTH; i++) { holtekff->field->value[i] = data[i]; } dbg_hid("sending %7ph\n", data); hid_hw_request(hid, holtekff->field->report, HID_REQ_SET_REPORT); } static int holtekff_play(struct input_dev *dev, void *data, struct ff_effect *effect) { struct hid_device *hid = input_get_drvdata(dev); struct holtekff_device *holtekff = data; int left, right; /* effect type 1, length 65535 msec */ u8 buf[HOLTEKFF_MSG_LENGTH] = { 0x01, 0x01, 0xff, 0xff, 0x10, 0xe0, 0x00 }; left = effect->u.rumble.strong_magnitude; right = effect->u.rumble.weak_magnitude; dbg_hid("called with 0x%04x 0x%04x\n", left, right); if (!left && !right) { holtekff_send(holtekff, hid, stop_all6); return 0; } if (left) buf[1] |= 0x80; if (right) buf[1] |= 0x40; /* The device takes a single magnitude, so we just sum them up. */ buf[6] = min(0xf, (left >> 12) + (right >> 12)); holtekff_send(holtekff, hid, buf); holtekff_send(holtekff, hid, start_effect_1); return 0; } static int holtekff_init(struct hid_device *hid) { struct holtekff_device *holtekff; struct hid_report *report; struct hid_input *hidinput; struct list_head *report_list = &hid->report_enum[HID_OUTPUT_REPORT].report_list; struct input_dev *dev; int error; if (list_empty(&hid->inputs)) { hid_err(hid, "no inputs found\n"); return -ENODEV; } hidinput = list_entry(hid->inputs.next, struct hid_input, list); dev = hidinput->input; if (list_empty(report_list)) { hid_err(hid, "no output report found\n"); return -ENODEV; } report = list_entry(report_list->next, struct hid_report, list); if (report->maxfield < 1 || report->field[0]->report_count != 7) { hid_err(hid, "unexpected output report layout\n"); return -ENODEV; } holtekff = kzalloc(sizeof(*holtekff), GFP_KERNEL); if (!holtekff) return -ENOMEM; set_bit(FF_RUMBLE, dev->ffbit); holtekff->field = report->field[0]; /* initialize the same way as win driver does */ holtekff_send(holtekff, hid, stop_all4); holtekff_send(holtekff, hid, stop_all6); error = input_ff_create_memless(dev, holtekff, holtekff_play); if (error) { kfree(holtekff); return error; } hid_info(hid, "Force feedback for Holtek On Line Grip based devices by Anssi Hannula <anssi.hannula@iki.fi>\n"); return 0; } #else static inline int holtekff_init(struct hid_device *hid) { return 0; } #endif static int holtek_probe(struct hid_device *hdev, const struct hid_device_id *id) { int ret; ret = hid_parse(hdev); if (ret) { hid_err(hdev, "parse failed\n"); goto err; } ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT & ~HID_CONNECT_FF); if (ret) { hid_err(hdev, "hw start failed\n"); goto err; } holtekff_init(hdev); return 0; err: return ret; } static const struct hid_device_id holtek_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_HOLTEK, USB_DEVICE_ID_HOLTEK_ON_LINE_GRIP) }, { } }; MODULE_DEVICE_TABLE(hid, holtek_devices); static struct hid_driver holtek_driver = { .name = "holtek", .id_table = holtek_devices, .probe = holtek_probe, }; module_hid_driver(holtek_driver); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Anssi Hannula <anssi.hannula@iki.fi>"); MODULE_DESCRIPTION("Force feedback support for Holtek On Line Grip based devices"); |
| 8 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 | #ifndef _NET_FLOW_OFFLOAD_H #define _NET_FLOW_OFFLOAD_H #include <linux/kernel.h> #include <linux/list.h> #include <linux/netlink.h> #include <net/flow_dissector.h> struct flow_match { struct flow_dissector *dissector; void *mask; void *key; }; struct flow_match_meta { struct flow_dissector_key_meta *key, *mask; }; struct flow_match_basic { struct flow_dissector_key_basic *key, *mask; }; struct flow_match_control { struct flow_dissector_key_control *key, *mask; }; struct flow_match_eth_addrs { struct flow_dissector_key_eth_addrs *key, *mask; }; struct flow_match_vlan { struct flow_dissector_key_vlan *key, *mask; }; struct flow_match_arp { struct flow_dissector_key_arp *key, *mask; }; struct flow_match_ipv4_addrs { struct flow_dissector_key_ipv4_addrs *key, *mask; }; struct flow_match_ipv6_addrs { struct flow_dissector_key_ipv6_addrs *key, *mask; }; struct flow_match_ip { struct flow_dissector_key_ip *key, *mask; }; struct flow_match_ports { struct flow_dissector_key_ports *key, *mask; }; struct flow_match_ports_range { struct flow_dissector_key_ports_range *key, *mask; }; struct flow_match_icmp { struct flow_dissector_key_icmp *key, *mask; }; struct flow_match_tcp { struct flow_dissector_key_tcp *key, *mask; }; struct flow_match_ipsec { struct flow_dissector_key_ipsec *key, *mask; }; struct flow_match_mpls { struct flow_dissector_key_mpls *key, *mask; }; struct flow_match_enc_keyid { struct flow_dissector_key_keyid *key, *mask; }; struct flow_match_enc_opts { struct flow_dissector_key_enc_opts *key, *mask; }; struct flow_match_ct { struct flow_dissector_key_ct *key, *mask; }; struct flow_match_pppoe { struct flow_dissector_key_pppoe *key, *mask; }; struct flow_match_l2tpv3 { struct flow_dissector_key_l2tpv3 *key, *mask; }; struct flow_rule; void flow_rule_match_meta(const struct flow_rule *rule, struct flow_match_meta *out); void flow_rule_match_basic(const struct flow_rule *rule, struct flow_match_basic *out); void flow_rule_match_control(const struct flow_rule *rule, struct flow_match_control *out); void flow_rule_match_eth_addrs(const struct flow_rule *rule, struct flow_match_eth_addrs *out); void flow_rule_match_vlan(const struct flow_rule *rule, struct flow_match_vlan *out); void flow_rule_match_cvlan(const struct flow_rule *rule, struct flow_match_vlan *out); void flow_rule_match_arp(const struct flow_rule *rule, struct flow_match_arp *out); void flow_rule_match_ipv4_addrs(const struct flow_rule *rule, struct flow_match_ipv4_addrs *out); void flow_rule_match_ipv6_addrs(const struct flow_rule *rule, struct flow_match_ipv6_addrs *out); void flow_rule_match_ip(const struct flow_rule *rule, struct flow_match_ip *out); void flow_rule_match_ports(const struct flow_rule *rule, struct flow_match_ports *out); void flow_rule_match_ports_range(const struct flow_rule *rule, struct flow_match_ports_range *out); void flow_rule_match_tcp(const struct flow_rule *rule, struct flow_match_tcp *out); void flow_rule_match_ipsec(const struct flow_rule *rule, struct flow_match_ipsec *out); void flow_rule_match_icmp(const struct flow_rule *rule, struct flow_match_icmp *out); void flow_rule_match_mpls(const struct flow_rule *rule, struct flow_match_mpls *out); void flow_rule_match_enc_control(const struct flow_rule *rule, struct flow_match_control *out); void flow_rule_match_enc_ipv4_addrs(const struct flow_rule *rule, struct flow_match_ipv4_addrs *out); void flow_rule_match_enc_ipv6_addrs(const struct flow_rule *rule, struct flow_match_ipv6_addrs *out); void flow_rule_match_enc_ip(const struct flow_rule *rule, struct flow_match_ip *out); void flow_rule_match_enc_ports(const struct flow_rule *rule, struct flow_match_ports *out); void flow_rule_match_enc_keyid(const struct flow_rule *rule, struct flow_match_enc_keyid *out); void flow_rule_match_enc_opts(const struct flow_rule *rule, struct flow_match_enc_opts *out); void flow_rule_match_ct(const struct flow_rule *rule, struct flow_match_ct *out); void flow_rule_match_pppoe(const struct flow_rule *rule, struct flow_match_pppoe *out); void flow_rule_match_l2tpv3(const struct flow_rule *rule, struct flow_match_l2tpv3 *out); enum flow_action_id { FLOW_ACTION_ACCEPT = 0, FLOW_ACTION_DROP, FLOW_ACTION_TRAP, FLOW_ACTION_GOTO, FLOW_ACTION_REDIRECT, FLOW_ACTION_MIRRED, FLOW_ACTION_REDIRECT_INGRESS, FLOW_ACTION_MIRRED_INGRESS, FLOW_ACTION_VLAN_PUSH, FLOW_ACTION_VLAN_POP, FLOW_ACTION_VLAN_MANGLE, FLOW_ACTION_TUNNEL_ENCAP, FLOW_ACTION_TUNNEL_DECAP, FLOW_ACTION_MANGLE, FLOW_ACTION_ADD, FLOW_ACTION_CSUM, FLOW_ACTION_MARK, FLOW_ACTION_PTYPE, FLOW_ACTION_PRIORITY, FLOW_ACTION_RX_QUEUE_MAPPING, FLOW_ACTION_WAKE, FLOW_ACTION_QUEUE, FLOW_ACTION_SAMPLE, FLOW_ACTION_POLICE, FLOW_ACTION_CT, FLOW_ACTION_CT_METADATA, FLOW_ACTION_MPLS_PUSH, FLOW_ACTION_MPLS_POP, FLOW_ACTION_MPLS_MANGLE, FLOW_ACTION_GATE, FLOW_ACTION_PPPOE_PUSH, FLOW_ACTION_JUMP, FLOW_ACTION_PIPE, FLOW_ACTION_VLAN_PUSH_ETH, FLOW_ACTION_VLAN_POP_ETH, FLOW_ACTION_CONTINUE, NUM_FLOW_ACTIONS, }; /* This is mirroring enum pedit_header_type definition for easy mapping between * tc pedit action. Legacy TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK is mapped to * FLOW_ACT_MANGLE_UNSPEC, which is supported by no driver. */ enum flow_action_mangle_base { FLOW_ACT_MANGLE_UNSPEC = 0, FLOW_ACT_MANGLE_HDR_TYPE_ETH, FLOW_ACT_MANGLE_HDR_TYPE_IP4, FLOW_ACT_MANGLE_HDR_TYPE_IP6, FLOW_ACT_MANGLE_HDR_TYPE_TCP, FLOW_ACT_MANGLE_HDR_TYPE_UDP, }; enum flow_action_hw_stats_bit { FLOW_ACTION_HW_STATS_IMMEDIATE_BIT, FLOW_ACTION_HW_STATS_DELAYED_BIT, FLOW_ACTION_HW_STATS_DISABLED_BIT, FLOW_ACTION_HW_STATS_NUM_BITS }; enum flow_action_hw_stats { FLOW_ACTION_HW_STATS_IMMEDIATE = BIT(FLOW_ACTION_HW_STATS_IMMEDIATE_BIT), FLOW_ACTION_HW_STATS_DELAYED = BIT(FLOW_ACTION_HW_STATS_DELAYED_BIT), FLOW_ACTION_HW_STATS_ANY = FLOW_ACTION_HW_STATS_IMMEDIATE | FLOW_ACTION_HW_STATS_DELAYED, FLOW_ACTION_HW_STATS_DISABLED = BIT(FLOW_ACTION_HW_STATS_DISABLED_BIT), FLOW_ACTION_HW_STATS_DONT_CARE = BIT(FLOW_ACTION_HW_STATS_NUM_BITS) - 1, }; typedef void (*action_destr)(void *priv); struct flow_action_cookie { u32 cookie_len; u8 cookie[]; }; struct flow_action_cookie *flow_action_cookie_create(void *data, unsigned int len, gfp_t gfp); void flow_action_cookie_destroy(struct flow_action_cookie *cookie); struct flow_action_entry { enum flow_action_id id; u32 hw_index; unsigned long cookie; u64 miss_cookie; enum flow_action_hw_stats hw_stats; action_destr destructor; void *destructor_priv; union { u32 chain_index; /* FLOW_ACTION_GOTO */ struct net_device *dev; /* FLOW_ACTION_REDIRECT */ struct { /* FLOW_ACTION_VLAN */ u16 vid; __be16 proto; u8 prio; } vlan; struct { /* FLOW_ACTION_VLAN_PUSH_ETH */ unsigned char dst[ETH_ALEN]; unsigned char src[ETH_ALEN]; } vlan_push_eth; struct { /* FLOW_ACTION_MANGLE */ /* FLOW_ACTION_ADD */ enum flow_action_mangle_base htype; u32 offset; u32 mask; u32 val; } mangle; struct ip_tunnel_info *tunnel; /* FLOW_ACTION_TUNNEL_ENCAP */ u32 csum_flags; /* FLOW_ACTION_CSUM */ u32 mark; /* FLOW_ACTION_MARK */ u16 ptype; /* FLOW_ACTION_PTYPE */ u16 rx_queue; /* FLOW_ACTION_RX_QUEUE_MAPPING */ u32 priority; /* FLOW_ACTION_PRIORITY */ struct { /* FLOW_ACTION_QUEUE */ u32 ctx; u32 index; u8 vf; } queue; struct { /* FLOW_ACTION_SAMPLE */ struct psample_group *psample_group; u32 rate; u32 trunc_size; bool truncate; } sample; struct { /* FLOW_ACTION_POLICE */ u32 burst; u64 rate_bytes_ps; u64 peakrate_bytes_ps; u32 avrate; u16 overhead; u64 burst_pkt; u64 rate_pkt_ps; u32 mtu; struct { enum flow_action_id act_id; u32 extval; } exceed, notexceed; } police; struct { /* FLOW_ACTION_CT */ int action; u16 zone; struct nf_flowtable *flow_table; } ct; struct { unsigned long cookie; u32 mark; u32 labels[4]; bool orig_dir; } ct_metadata; struct { /* FLOW_ACTION_MPLS_PUSH */ u32 label; __be16 proto; u8 tc; u8 bos; u8 ttl; } mpls_push; struct { /* FLOW_ACTION_MPLS_POP */ __be16 proto; } mpls_pop; struct { /* FLOW_ACTION_MPLS_MANGLE */ u32 label; u8 tc; u8 bos; u8 ttl; } mpls_mangle; struct { s32 prio; u64 basetime; u64 cycletime; u64 cycletimeext; u32 num_entries; struct action_gate_entry *entries; } gate; struct { /* FLOW_ACTION_PPPOE_PUSH */ u16 sid; } pppoe; }; struct flow_action_cookie *user_cookie; /* user defined action cookie */ }; struct flow_action { unsigned int num_entries; struct flow_action_entry entries[] __counted_by(num_entries); }; static inline bool flow_action_has_entries(const struct flow_action *action) { return action->num_entries; } /** * flow_offload_has_one_action() - check if exactly one action is present * @action: tc filter flow offload action * * Return: true if exactly one action is present. */ static inline bool flow_offload_has_one_action(const struct flow_action *action) { return action->num_entries == 1; } static inline bool flow_action_is_last_entry(const struct flow_action *action, const struct flow_action_entry *entry) { return entry == &action->entries[action->num_entries - 1]; } #define flow_action_for_each(__i, __act, __actions) \ for (__i = 0, __act = &(__actions)->entries[0]; \ __i < (__actions)->num_entries; \ __act = &(__actions)->entries[++__i]) static inline bool flow_action_mixed_hw_stats_check(const struct flow_action *action, struct netlink_ext_ack *extack) { const struct flow_action_entry *action_entry; u8 last_hw_stats; int i; if (flow_offload_has_one_action(action)) return true; flow_action_for_each(i, action_entry, action) { if (i && action_entry->hw_stats != last_hw_stats) { NL_SET_ERR_MSG_MOD(extack, "Mixing HW stats types for actions is not supported"); return false; } last_hw_stats = action_entry->hw_stats; } return true; } static inline const struct flow_action_entry * flow_action_first_entry_get(const struct flow_action *action) { WARN_ON(!flow_action_has_entries(action)); return &action->entries[0]; } static inline bool __flow_action_hw_stats_check(const struct flow_action *action, struct netlink_ext_ack *extack, bool check_allow_bit, enum flow_action_hw_stats_bit allow_bit) { const struct flow_action_entry *action_entry; if (!flow_action_has_entries(action)) return true; if (!flow_action_mixed_hw_stats_check(action, extack)) return false; action_entry = flow_action_first_entry_get(action); /* Zero is not a legal value for hw_stats, catch anyone passing it */ WARN_ON_ONCE(!action_entry->hw_stats); if (!check_allow_bit && ~action_entry->hw_stats & FLOW_ACTION_HW_STATS_ANY) { NL_SET_ERR_MSG_MOD(extack, "Driver supports only default HW stats type \"any\""); return false; } else if (check_allow_bit && !(action_entry->hw_stats & BIT(allow_bit))) { NL_SET_ERR_MSG_MOD(extack, "Driver does not support selected HW stats type"); return false; } return true; } static inline bool flow_action_hw_stats_check(const struct flow_action *action, struct netlink_ext_ack *extack, enum flow_action_hw_stats_bit allow_bit) { return __flow_action_hw_stats_check(action, extack, true, allow_bit); } static inline bool flow_action_basic_hw_stats_check(const struct flow_action *action, struct netlink_ext_ack *extack) { return __flow_action_hw_stats_check(action, extack, false, 0); } struct flow_rule { struct flow_match match; struct flow_action action; }; struct flow_rule *flow_rule_alloc(unsigned int num_actions); static inline bool flow_rule_match_key(const struct flow_rule *rule, enum flow_dissector_key_id key) { return dissector_uses_key(rule->match.dissector, key); } /** * flow_rule_is_supp_control_flags() - check for supported control flags * @supp_flags: control flags supported by driver * @ctrl_flags: control flags present in rule * @extack: The netlink extended ACK for reporting errors. * * Return: true if only supported control flags are set, false otherwise. */ static inline bool flow_rule_is_supp_control_flags(const u32 supp_flags, const u32 ctrl_flags, struct netlink_ext_ack *extack) { if (likely((ctrl_flags & ~supp_flags) == 0)) return true; NL_SET_ERR_MSG_FMT_MOD(extack, "Unsupported match on control.flags %#x", ctrl_flags); return false; } /** * flow_rule_is_supp_enc_control_flags() - check for supported control flags * @supp_enc_flags: encapsulation control flags supported by driver * @enc_ctrl_flags: encapsulation control flags present in rule * @extack: The netlink extended ACK for reporting errors. * * Return: true if only supported control flags are set, false otherwise. */ static inline bool flow_rule_is_supp_enc_control_flags(const u32 supp_enc_flags, const u32 enc_ctrl_flags, struct netlink_ext_ack *extack) { if (likely((enc_ctrl_flags & ~supp_enc_flags) == 0)) return true; NL_SET_ERR_MSG_FMT_MOD(extack, "Unsupported match on enc_control.flags %#x", enc_ctrl_flags); return false; } /** * flow_rule_has_control_flags() - check for presence of any control flags * @ctrl_flags: control flags present in rule * @extack: The netlink extended ACK for reporting errors. * * Return: true if control flags are set, false otherwise. */ static inline bool flow_rule_has_control_flags(const u32 ctrl_flags, struct netlink_ext_ack *extack) { return !flow_rule_is_supp_control_flags(0, ctrl_flags, extack); } /** * flow_rule_has_enc_control_flags() - check for presence of any control flags * @enc_ctrl_flags: encapsulation control flags present in rule * @extack: The netlink extended ACK for reporting errors. * * Return: true if control flags are set, false otherwise. */ static inline bool flow_rule_has_enc_control_flags(const u32 enc_ctrl_flags, struct netlink_ext_ack *extack) { return !flow_rule_is_supp_enc_control_flags(0, enc_ctrl_flags, extack); } /** * flow_rule_match_has_control_flags() - match and check for any control flags * @rule: The flow_rule under evaluation. * @extack: The netlink extended ACK for reporting errors. * * Return: true if control flags are set, false otherwise. */ static inline bool flow_rule_match_has_control_flags(struct flow_rule *rule, struct netlink_ext_ack *extack) { struct flow_match_control match; if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_CONTROL)) return false; flow_rule_match_control(rule, &match); return flow_rule_has_control_flags(match.mask->flags, extack); } struct flow_stats { u64 pkts; u64 bytes; u64 drops; u64 lastused; enum flow_action_hw_stats used_hw_stats; bool used_hw_stats_valid; }; static inline void flow_stats_update(struct flow_stats *flow_stats, u64 bytes, u64 pkts, u64 drops, u64 lastused, enum flow_action_hw_stats used_hw_stats) { flow_stats->pkts += pkts; flow_stats->bytes += bytes; flow_stats->drops += drops; flow_stats->lastused = max_t(u64, flow_stats->lastused, lastused); /* The driver should pass value with a maximum of one bit set. * Passing FLOW_ACTION_HW_STATS_ANY is invalid. */ WARN_ON(used_hw_stats == FLOW_ACTION_HW_STATS_ANY); flow_stats->used_hw_stats |= used_hw_stats; flow_stats->used_hw_stats_valid = true; } enum flow_block_command { FLOW_BLOCK_BIND, FLOW_BLOCK_UNBIND, }; enum flow_block_binder_type { FLOW_BLOCK_BINDER_TYPE_UNSPEC, FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS, FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS, FLOW_BLOCK_BINDER_TYPE_RED_EARLY_DROP, FLOW_BLOCK_BINDER_TYPE_RED_MARK, }; struct flow_block { struct list_head cb_list; }; struct netlink_ext_ack; struct flow_block_offload { enum flow_block_command command; enum flow_block_binder_type binder_type; bool block_shared; bool unlocked_driver_cb; struct net *net; struct flow_block *block; struct list_head cb_list; struct list_head *driver_block_list; struct netlink_ext_ack *extack; struct Qdisc *sch; struct list_head *cb_list_head; }; enum tc_setup_type; typedef int flow_setup_cb_t(enum tc_setup_type type, void *type_data, void *cb_priv); struct flow_block_cb; struct flow_block_indr { struct list_head list; struct net_device *dev; struct Qdisc *sch; enum flow_block_binder_type binder_type; void *data; void *cb_priv; void (*cleanup)(struct flow_block_cb *block_cb); }; struct flow_block_cb { struct list_head driver_list; struct list_head list; flow_setup_cb_t *cb; void *cb_ident; void *cb_priv; void (*release)(void *cb_priv); struct flow_block_indr indr; unsigned int refcnt; }; struct flow_block_cb *flow_block_cb_alloc(flow_setup_cb_t *cb, void *cb_ident, void *cb_priv, void (*release)(void *cb_priv)); struct flow_block_cb *flow_indr_block_cb_alloc(flow_setup_cb_t *cb, void *cb_ident, void *cb_priv, void (*release)(void *cb_priv), struct flow_block_offload *bo, struct net_device *dev, struct Qdisc *sch, void *data, void *indr_cb_priv, void (*cleanup)(struct flow_block_cb *block_cb)); void flow_block_cb_free(struct flow_block_cb *block_cb); struct flow_block_cb *flow_block_cb_lookup(struct flow_block *block, flow_setup_cb_t *cb, void *cb_ident); void *flow_block_cb_priv(struct flow_block_cb *block_cb); void flow_block_cb_incref(struct flow_block_cb *block_cb); unsigned int flow_block_cb_decref(struct flow_block_cb *block_cb); static inline void flow_block_cb_add(struct flow_block_cb *block_cb, struct flow_block_offload *offload) { list_add_tail(&block_cb->list, &offload->cb_list); } static inline void flow_block_cb_remove(struct flow_block_cb *block_cb, struct flow_block_offload *offload) { list_move(&block_cb->list, &offload->cb_list); } static inline void flow_indr_block_cb_remove(struct flow_block_cb *block_cb, struct flow_block_offload *offload) { list_del(&block_cb->indr.list); list_move(&block_cb->list, &offload->cb_list); } bool flow_block_cb_is_busy(flow_setup_cb_t *cb, void *cb_ident, struct list_head *driver_block_list); int flow_block_cb_setup_simple(struct flow_block_offload *f, struct list_head *driver_list, flow_setup_cb_t *cb, void *cb_ident, void *cb_priv, bool ingress_only); enum flow_cls_command { FLOW_CLS_REPLACE, FLOW_CLS_DESTROY, FLOW_CLS_STATS, FLOW_CLS_TMPLT_CREATE, FLOW_CLS_TMPLT_DESTROY, }; struct flow_cls_common_offload { u32 chain_index; __be16 protocol; u32 prio; bool skip_sw; struct netlink_ext_ack *extack; }; struct flow_cls_offload { struct flow_cls_common_offload common; enum flow_cls_command command; bool use_act_stats; unsigned long cookie; struct flow_rule *rule; struct flow_stats stats; u32 classid; }; enum offload_act_command { FLOW_ACT_REPLACE, FLOW_ACT_DESTROY, FLOW_ACT_STATS, }; struct flow_offload_action { struct netlink_ext_ack *extack; /* NULL in FLOW_ACT_STATS process*/ enum offload_act_command command; enum flow_action_id id; u32 index; unsigned long cookie; struct flow_stats stats; struct flow_action action; }; struct flow_offload_action *offload_action_alloc(unsigned int num_actions); static inline struct flow_rule * flow_cls_offload_flow_rule(struct flow_cls_offload *flow_cmd) { return flow_cmd->rule; } static inline void flow_block_init(struct flow_block *flow_block) { INIT_LIST_HEAD(&flow_block->cb_list); } typedef int flow_indr_block_bind_cb_t(struct net_device *dev, struct Qdisc *sch, void *cb_priv, enum tc_setup_type type, void *type_data, void *data, void (*cleanup)(struct flow_block_cb *block_cb)); int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv); void flow_indr_dev_unregister(flow_indr_block_bind_cb_t *cb, void *cb_priv, void (*release)(void *cb_priv)); int flow_indr_dev_setup_offload(struct net_device *dev, struct Qdisc *sch, enum tc_setup_type type, void *data, struct flow_block_offload *bo, void (*cleanup)(struct flow_block_cb *block_cb)); bool flow_indr_dev_exists(void); #endif /* _NET_FLOW_OFFLOAD_H */ |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NET_ESP_H #define _NET_ESP_H #include <linux/skbuff.h> struct ip_esp_hdr; struct xfrm_state; static inline struct ip_esp_hdr *ip_esp_hdr(const struct sk_buff *skb) { return (struct ip_esp_hdr *)skb_transport_header(skb); } static inline void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto) { /* Fill padding... */ if (tfclen) { memset(tail, 0, tfclen); tail += tfclen; } do { int i; for (i = 0; i < plen - 2; i++) tail[i] = i + 1; } while (0); tail[plen - 2] = plen - 2; tail[plen - 1] = proto; } struct esp_info { struct ip_esp_hdr *esph; __be64 seqno; int tfclen; int tailen; int plen; int clen; int len; int nfrags; __u8 proto; bool inplace; }; int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp); int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp); int esp_input_done2(struct sk_buff *skb, int err); int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp); int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp); int esp6_input_done2(struct sk_buff *skb, int err); #endif |
| 24 24 122 8 5 109 108 104 81 85 24 24 22 24 24 24 24 24 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 | // SPDX-License-Identifier: GPL-2.0 /* * fs/partitions/atari.c * * Code extracted from drivers/block/genhd.c * * Copyright (C) 1991-1998 Linus Torvalds * Re-organised Feb 1998 Russell King */ #include <linux/ctype.h> #include "check.h" #include "atari.h" /* ++guenther: this should be settable by the user ("make config")?. */ #define ICD_PARTS /* check if a partition entry looks valid -- Atari format is assumed if at least one of the primary entries is ok this way */ #define VALID_PARTITION(pi,hdsiz) \ (((pi)->flg & 1) && \ isalnum((pi)->id[0]) && isalnum((pi)->id[1]) && isalnum((pi)->id[2]) && \ be32_to_cpu((pi)->st) <= (hdsiz) && \ be32_to_cpu((pi)->st) + be32_to_cpu((pi)->siz) <= (hdsiz)) static inline int OK_id(char *s) { return memcmp (s, "GEM", 3) == 0 || memcmp (s, "BGM", 3) == 0 || memcmp (s, "LNX", 3) == 0 || memcmp (s, "SWP", 3) == 0 || memcmp (s, "RAW", 3) == 0 ; } int atari_partition(struct parsed_partitions *state) { Sector sect; struct rootsector *rs; struct partition_info *pi; u32 extensect; u32 hd_size; int slot; #ifdef ICD_PARTS int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */ #endif /* * ATARI partition scheme supports 512 lba only. If this is not * the case, bail early to avoid miscalculating hd_size. */ if (queue_logical_block_size(state->disk->queue) != 512) return 0; rs = read_part_sector(state, 0, §); if (!rs) return -1; /* Verify this is an Atari rootsector: */ hd_size = get_capacity(state->disk); if (!VALID_PARTITION(&rs->part[0], hd_size) && !VALID_PARTITION(&rs->part[1], hd_size) && !VALID_PARTITION(&rs->part[2], hd_size) && !VALID_PARTITION(&rs->part[3], hd_size)) { /* * if there's no valid primary partition, assume that no Atari * format partition table (there's no reliable magic or the like * :-() */ put_dev_sector(sect); return 0; } pi = &rs->part[0]; strlcat(state->pp_buf, " AHDI", PAGE_SIZE); for (slot = 1; pi < &rs->part[4] && slot < state->limit; slot++, pi++) { struct rootsector *xrs; Sector sect2; ulong partsect; if ( !(pi->flg & 1) ) continue; /* active partition */ if (memcmp (pi->id, "XGM", 3) != 0) { /* we don't care about other id's */ put_partition (state, slot, be32_to_cpu(pi->st), be32_to_cpu(pi->siz)); continue; } /* extension partition */ #ifdef ICD_PARTS part_fmt = 1; #endif strlcat(state->pp_buf, " XGM<", PAGE_SIZE); partsect = extensect = be32_to_cpu(pi->st); while (1) { xrs = read_part_sector(state, partsect, §2); if (!xrs) { printk (" block %ld read failed\n", partsect); put_dev_sector(sect); return -1; } /* ++roman: sanity check: bit 0 of flg field must be set */ if (!(xrs->part[0].flg & 1)) { printk( "\nFirst sub-partition in extended partition is not valid!\n" ); put_dev_sector(sect2); break; } put_partition(state, slot, partsect + be32_to_cpu(xrs->part[0].st), be32_to_cpu(xrs->part[0].siz)); if (!(xrs->part[1].flg & 1)) { /* end of linked partition list */ put_dev_sector(sect2); break; } if (memcmp( xrs->part[1].id, "XGM", 3 ) != 0) { printk("\nID of extended partition is not XGM!\n"); put_dev_sector(sect2); break; } partsect = be32_to_cpu(xrs->part[1].st) + extensect; put_dev_sector(sect2); if (++slot == state->limit) { printk( "\nMaximum number of partitions reached!\n" ); break; } } strlcat(state->pp_buf, " >", PAGE_SIZE); } #ifdef ICD_PARTS if ( part_fmt!=1 ) { /* no extended partitions -> test ICD-format */ pi = &rs->icdpart[0]; /* sanity check: no ICD format if first partition invalid */ if (OK_id(pi->id)) { strlcat(state->pp_buf, " ICD<", PAGE_SIZE); for (; pi < &rs->icdpart[8] && slot < state->limit; slot++, pi++) { /* accept only GEM,BGM,RAW,LNX,SWP partitions */ if (!((pi->flg & 1) && OK_id(pi->id))) continue; put_partition (state, slot, be32_to_cpu(pi->st), be32_to_cpu(pi->siz)); } strlcat(state->pp_buf, " >", PAGE_SIZE); } } #endif put_dev_sector(sect); strlcat(state->pp_buf, "\n", PAGE_SIZE); return 1; } |
| 1 1 1 3 2 2 2 1 3 3 5 3 11 5 6 2 1 1 1 1 2 2 1 3 3 3 3 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 | // SPDX-License-Identifier: GPL-2.0-only /* * keyspan_remote: USB driver for the Keyspan DMR * * Copyright (C) 2005 Zymeta Corporation - Michael Downey (downey@zymeta.com) * * This driver has been put together with the support of Innosys, Inc. * and Keyspan, Inc the manufacturers of the Keyspan USB DMR product. */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/usb/input.h> /* Parameters that can be passed to the driver. */ static int debug; module_param(debug, int, 0444); MODULE_PARM_DESC(debug, "Enable extra debug messages and information"); /* Vendor and product ids */ #define USB_KEYSPAN_VENDOR_ID 0x06CD #define USB_KEYSPAN_PRODUCT_UIA11 0x0202 /* Defines for converting the data from the remote. */ #define ZERO 0x18 #define ZERO_MASK 0x1F /* 5 bits for a 0 */ #define ONE 0x3C #define ONE_MASK 0x3F /* 6 bits for a 1 */ #define SYNC 0x3F80 #define SYNC_MASK 0x3FFF /* 14 bits for a SYNC sequence */ #define STOP 0x00 #define STOP_MASK 0x1F /* 5 bits for the STOP sequence */ #define GAP 0xFF #define RECV_SIZE 8 /* The UIA-11 type have a 8 byte limit. */ /* * Table that maps the 31 possible keycodes to input keys. * Currently there are 15 and 17 button models so RESERVED codes * are blank areas in the mapping. */ static const unsigned short keyspan_key_table[] = { KEY_RESERVED, /* 0 is just a place holder. */ KEY_RESERVED, KEY_STOP, KEY_PLAYCD, KEY_RESERVED, KEY_PREVIOUSSONG, KEY_REWIND, KEY_FORWARD, KEY_NEXTSONG, KEY_RESERVED, KEY_RESERVED, KEY_RESERVED, KEY_PAUSE, KEY_VOLUMEUP, KEY_RESERVED, KEY_RESERVED, KEY_RESERVED, KEY_VOLUMEDOWN, KEY_RESERVED, KEY_UP, KEY_RESERVED, KEY_MUTE, KEY_LEFT, KEY_ENTER, KEY_RIGHT, KEY_RESERVED, KEY_RESERVED, KEY_DOWN, KEY_RESERVED, KEY_KPASTERISK, KEY_RESERVED, KEY_MENU }; /* table of devices that work with this driver */ static const struct usb_device_id keyspan_table[] = { { USB_DEVICE(USB_KEYSPAN_VENDOR_ID, USB_KEYSPAN_PRODUCT_UIA11) }, { } /* Terminating entry */ }; /* Structure to store all the real stuff that a remote sends to us. */ struct keyspan_message { u16 system; u8 button; u8 toggle; }; /* Structure used for all the bit testing magic needed to be done. */ struct bit_tester { u32 tester; int len; int pos; int bits_left; u8 buffer[32]; }; /* Structure to hold all of our driver specific stuff */ struct usb_keyspan { char name[128]; char phys[64]; unsigned short keymap[ARRAY_SIZE(keyspan_key_table)]; struct usb_device *udev; struct input_dev *input; struct usb_interface *interface; struct usb_endpoint_descriptor *in_endpoint; struct urb* irq_urb; int open; dma_addr_t in_dma; unsigned char *in_buffer; /* variables used to parse messages from remote. */ struct bit_tester data; int stage; int toggle; }; static struct usb_driver keyspan_driver; /* * Debug routine that prints out what we've received from the remote. */ static void keyspan_print(struct usb_keyspan* dev) /*unsigned char* data)*/ { char codes[4 * RECV_SIZE]; int i; for (i = 0; i < RECV_SIZE; i++) snprintf(codes + i * 3, 4, "%02x ", dev->in_buffer[i]); dev_info(&dev->udev->dev, "%s\n", codes); } /* * Routine that manages the bit_tester structure. It makes sure that there are * at least bits_needed bits loaded into the tester. */ static int keyspan_load_tester(struct usb_keyspan* dev, int bits_needed) { if (dev->data.bits_left >= bits_needed) return 0; /* * Somehow we've missed the last message. The message will be repeated * though so it's not too big a deal */ if (dev->data.pos >= dev->data.len) { dev_dbg(&dev->interface->dev, "%s - Error ran out of data. pos: %d, len: %d\n", __func__, dev->data.pos, dev->data.len); return -1; } /* Load as much as we can into the tester. */ while ((dev->data.bits_left + 7 < (sizeof(dev->data.tester) * 8)) && (dev->data.pos < dev->data.len)) { dev->data.tester += (dev->data.buffer[dev->data.pos++] << dev->data.bits_left); dev->data.bits_left += 8; } return 0; } static void keyspan_report_button(struct usb_keyspan *remote, int button, int press) { struct input_dev *input = remote->input; input_event(input, EV_MSC, MSC_SCAN, button); input_report_key(input, remote->keymap[button], press); input_sync(input); } /* * Routine that handles all the logic needed to parse out the message from the remote. */ static void keyspan_check_data(struct usb_keyspan *remote) { int i; int found = 0; struct keyspan_message message; switch(remote->stage) { case 0: /* * In stage 0 we want to find the start of a message. The remote sends a 0xFF as filler. * So the first byte that isn't a FF should be the start of a new message. */ for (i = 0; i < RECV_SIZE && remote->in_buffer[i] == GAP; ++i); if (i < RECV_SIZE) { memcpy(remote->data.buffer, remote->in_buffer, RECV_SIZE); remote->data.len = RECV_SIZE; remote->data.pos = 0; remote->data.tester = 0; remote->data.bits_left = 0; remote->stage = 1; } break; case 1: /* * Stage 1 we should have 16 bytes and should be able to detect a * SYNC. The SYNC is 14 bits, 7 0's and then 7 1's. */ memcpy(remote->data.buffer + remote->data.len, remote->in_buffer, RECV_SIZE); remote->data.len += RECV_SIZE; found = 0; while ((remote->data.bits_left >= 14 || remote->data.pos < remote->data.len) && !found) { for (i = 0; i < 8; ++i) { if (keyspan_load_tester(remote, 14) != 0) { remote->stage = 0; return; } if ((remote->data.tester & SYNC_MASK) == SYNC) { remote->data.tester = remote->data.tester >> 14; remote->data.bits_left -= 14; found = 1; break; } else { remote->data.tester = remote->data.tester >> 1; --remote->data.bits_left; } } } if (!found) { remote->stage = 0; remote->data.len = 0; } else { remote->stage = 2; } break; case 2: /* * Stage 2 we should have 24 bytes which will be enough for a full * message. We need to parse out the system code, button code, * toggle code, and stop. */ memcpy(remote->data.buffer + remote->data.len, remote->in_buffer, RECV_SIZE); remote->data.len += RECV_SIZE; message.system = 0; for (i = 0; i < 9; i++) { keyspan_load_tester(remote, 6); if ((remote->data.tester & ZERO_MASK) == ZERO) { message.system = message.system << 1; remote->data.tester = remote->data.tester >> 5; remote->data.bits_left -= 5; } else if ((remote->data.tester & ONE_MASK) == ONE) { message.system = (message.system << 1) + 1; remote->data.tester = remote->data.tester >> 6; remote->data.bits_left -= 6; } else { dev_err(&remote->interface->dev, "%s - Unknown sequence found in system data.\n", __func__); remote->stage = 0; return; } } message.button = 0; for (i = 0; i < 5; i++) { keyspan_load_tester(remote, 6); if ((remote->data.tester & ZERO_MASK) == ZERO) { message.button = message.button << 1; remote->data.tester = remote->data.tester >> 5; remote->data.bits_left -= 5; } else if ((remote->data.tester & ONE_MASK) == ONE) { message.button = (message.button << 1) + 1; remote->data.tester = remote->data.tester >> 6; remote->data.bits_left -= 6; } else { dev_err(&remote->interface->dev, "%s - Unknown sequence found in button data.\n", __func__); remote->stage = 0; return; } } keyspan_load_tester(remote, 6); if ((remote->data.tester & ZERO_MASK) == ZERO) { message.toggle = 0; remote->data.tester = remote->data.tester >> 5; remote->data.bits_left -= 5; } else if ((remote->data.tester & ONE_MASK) == ONE) { message.toggle = 1; remote->data.tester = remote->data.tester >> 6; remote->data.bits_left -= 6; } else { dev_err(&remote->interface->dev, "%s - Error in message, invalid toggle.\n", __func__); remote->stage = 0; return; } keyspan_load_tester(remote, 5); if ((remote->data.tester & STOP_MASK) == STOP) { remote->data.tester = remote->data.tester >> 5; remote->data.bits_left -= 5; } else { dev_err(&remote->interface->dev, "Bad message received, no stop bit found.\n"); } dev_dbg(&remote->interface->dev, "%s found valid message: system: %d, button: %d, toggle: %d\n", __func__, message.system, message.button, message.toggle); if (message.toggle != remote->toggle) { keyspan_report_button(remote, message.button, 1); keyspan_report_button(remote, message.button, 0); remote->toggle = message.toggle; } remote->stage = 0; break; } } /* * Routine for sending all the initialization messages to the remote. */ static int keyspan_setup(struct usb_device* dev) { int retval = 0; retval = usb_control_msg(dev, usb_sndctrlpipe(dev, 0), 0x11, 0x40, 0x5601, 0x0, NULL, 0, USB_CTRL_SET_TIMEOUT); if (retval) { dev_dbg(&dev->dev, "%s - failed to set bit rate due to error: %d\n", __func__, retval); return(retval); } retval = usb_control_msg(dev, usb_sndctrlpipe(dev, 0), 0x44, 0x40, 0x0, 0x0, NULL, 0, USB_CTRL_SET_TIMEOUT); if (retval) { dev_dbg(&dev->dev, "%s - failed to set resume sensitivity due to error: %d\n", __func__, retval); return(retval); } retval = usb_control_msg(dev, usb_sndctrlpipe(dev, 0), 0x22, 0x40, 0x0, 0x0, NULL, 0, USB_CTRL_SET_TIMEOUT); if (retval) { dev_dbg(&dev->dev, "%s - failed to turn receive on due to error: %d\n", __func__, retval); return(retval); } dev_dbg(&dev->dev, "%s - Setup complete.\n", __func__); return(retval); } /* * Routine used to handle a new message that has come in. */ static void keyspan_irq_recv(struct urb *urb) { struct usb_keyspan *dev = urb->context; int retval; /* Check our status in case we need to bail out early. */ switch (urb->status) { case 0: break; /* Device went away so don't keep trying to read from it. */ case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: return; default: goto resubmit; } if (debug) keyspan_print(dev); keyspan_check_data(dev); resubmit: retval = usb_submit_urb(urb, GFP_ATOMIC); if (retval) dev_err(&dev->interface->dev, "%s - usb_submit_urb failed with result: %d\n", __func__, retval); } static int keyspan_open(struct input_dev *dev) { struct usb_keyspan *remote = input_get_drvdata(dev); remote->irq_urb->dev = remote->udev; if (usb_submit_urb(remote->irq_urb, GFP_KERNEL)) return -EIO; return 0; } static void keyspan_close(struct input_dev *dev) { struct usb_keyspan *remote = input_get_drvdata(dev); usb_kill_urb(remote->irq_urb); } static struct usb_endpoint_descriptor *keyspan_get_in_endpoint(struct usb_host_interface *iface) { struct usb_endpoint_descriptor *endpoint; int i; for (i = 0; i < iface->desc.bNumEndpoints; ++i) { endpoint = &iface->endpoint[i].desc; if (usb_endpoint_is_int_in(endpoint)) { /* we found our interrupt in endpoint */ return endpoint; } } return NULL; } /* * Routine that sets up the driver to handle a specific USB device detected on the bus. */ static int keyspan_probe(struct usb_interface *interface, const struct usb_device_id *id) { struct usb_device *udev = interface_to_usbdev(interface); struct usb_endpoint_descriptor *endpoint; struct usb_keyspan *remote; struct input_dev *input_dev; int i, error; endpoint = keyspan_get_in_endpoint(interface->cur_altsetting); if (!endpoint) return -ENODEV; remote = kzalloc(sizeof(*remote), GFP_KERNEL); input_dev = input_allocate_device(); if (!remote || !input_dev) { error = -ENOMEM; goto fail1; } remote->udev = udev; remote->input = input_dev; remote->interface = interface; remote->in_endpoint = endpoint; remote->toggle = -1; /* Set to -1 so we will always not match the toggle from the first remote message. */ remote->in_buffer = usb_alloc_coherent(udev, RECV_SIZE, GFP_KERNEL, &remote->in_dma); if (!remote->in_buffer) { error = -ENOMEM; goto fail1; } remote->irq_urb = usb_alloc_urb(0, GFP_KERNEL); if (!remote->irq_urb) { error = -ENOMEM; goto fail2; } error = keyspan_setup(udev); if (error) { error = -ENODEV; goto fail3; } if (udev->manufacturer) strscpy(remote->name, udev->manufacturer, sizeof(remote->name)); if (udev->product) { if (udev->manufacturer) strlcat(remote->name, " ", sizeof(remote->name)); strlcat(remote->name, udev->product, sizeof(remote->name)); } if (!strlen(remote->name)) snprintf(remote->name, sizeof(remote->name), "USB Keyspan Remote %04x:%04x", le16_to_cpu(udev->descriptor.idVendor), le16_to_cpu(udev->descriptor.idProduct)); usb_make_path(udev, remote->phys, sizeof(remote->phys)); strlcat(remote->phys, "/input0", sizeof(remote->phys)); memcpy(remote->keymap, keyspan_key_table, sizeof(remote->keymap)); input_dev->name = remote->name; input_dev->phys = remote->phys; usb_to_input_id(udev, &input_dev->id); input_dev->dev.parent = &interface->dev; input_dev->keycode = remote->keymap; input_dev->keycodesize = sizeof(unsigned short); input_dev->keycodemax = ARRAY_SIZE(remote->keymap); input_set_capability(input_dev, EV_MSC, MSC_SCAN); __set_bit(EV_KEY, input_dev->evbit); for (i = 0; i < ARRAY_SIZE(keyspan_key_table); i++) __set_bit(keyspan_key_table[i], input_dev->keybit); __clear_bit(KEY_RESERVED, input_dev->keybit); input_set_drvdata(input_dev, remote); input_dev->open = keyspan_open; input_dev->close = keyspan_close; /* * Initialize the URB to access the device. * The urb gets sent to the device in keyspan_open() */ usb_fill_int_urb(remote->irq_urb, remote->udev, usb_rcvintpipe(remote->udev, endpoint->bEndpointAddress), remote->in_buffer, RECV_SIZE, keyspan_irq_recv, remote, endpoint->bInterval); remote->irq_urb->transfer_dma = remote->in_dma; remote->irq_urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; /* we can register the device now, as it is ready */ error = input_register_device(remote->input); if (error) goto fail3; /* save our data pointer in this interface device */ usb_set_intfdata(interface, remote); return 0; fail3: usb_free_urb(remote->irq_urb); fail2: usb_free_coherent(udev, RECV_SIZE, remote->in_buffer, remote->in_dma); fail1: kfree(remote); input_free_device(input_dev); return error; } /* * Routine called when a device is disconnected from the USB. */ static void keyspan_disconnect(struct usb_interface *interface) { struct usb_keyspan *remote; remote = usb_get_intfdata(interface); usb_set_intfdata(interface, NULL); if (remote) { /* We have a valid driver structure so clean up everything we allocated. */ input_unregister_device(remote->input); usb_kill_urb(remote->irq_urb); usb_free_urb(remote->irq_urb); usb_free_coherent(remote->udev, RECV_SIZE, remote->in_buffer, remote->in_dma); kfree(remote); } } /* * Standard driver set up sections */ static struct usb_driver keyspan_driver = { .name = "keyspan_remote", .probe = keyspan_probe, .disconnect = keyspan_disconnect, .id_table = keyspan_table }; module_usb_driver(keyspan_driver); MODULE_DEVICE_TABLE(usb, keyspan_table); MODULE_AUTHOR("Michael Downey <downey@zymeta.com>"); MODULE_DESCRIPTION("Driver for the USB Keyspan remote control."); MODULE_LICENSE("GPL"); |
| 18334 106 1529 1315 164 151 466 20609 438 1067 18334 1530 466 151 20784 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* I/O iterator iteration building functions. * * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #ifndef _LINUX_IOV_ITER_H #define _LINUX_IOV_ITER_H #include <linux/uio.h> #include <linux/bvec.h> #include <linux/folio_queue.h> typedef size_t (*iov_step_f)(void *iter_base, size_t progress, size_t len, void *priv, void *priv2); typedef size_t (*iov_ustep_f)(void __user *iter_base, size_t progress, size_t len, void *priv, void *priv2); /* * Handle ITER_UBUF. */ static __always_inline size_t iterate_ubuf(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_ustep_f step) { void __user *base = iter->ubuf; size_t progress = 0, remain; remain = step(base + iter->iov_offset, 0, len, priv, priv2); progress = len - remain; iter->iov_offset += progress; iter->count -= progress; return progress; } /* * Handle ITER_IOVEC. */ static __always_inline size_t iterate_iovec(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_ustep_f step) { const struct iovec *p = iter->__iov; size_t progress = 0, skip = iter->iov_offset; do { size_t remain, consumed; size_t part = min(len, p->iov_len - skip); if (likely(part)) { remain = step(p->iov_base + skip, progress, part, priv, priv2); consumed = part - remain; progress += consumed; skip += consumed; len -= consumed; if (skip < p->iov_len) break; } p++; skip = 0; } while (len); iter->nr_segs -= p - iter->__iov; iter->__iov = p; iter->iov_offset = skip; iter->count -= progress; return progress; } /* * Handle ITER_KVEC. */ static __always_inline size_t iterate_kvec(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_step_f step) { const struct kvec *p = iter->kvec; size_t progress = 0, skip = iter->iov_offset; do { size_t remain, consumed; size_t part = min(len, p->iov_len - skip); if (likely(part)) { remain = step(p->iov_base + skip, progress, part, priv, priv2); consumed = part - remain; progress += consumed; skip += consumed; len -= consumed; if (skip < p->iov_len) break; } p++; skip = 0; } while (len); iter->nr_segs -= p - iter->kvec; iter->kvec = p; iter->iov_offset = skip; iter->count -= progress; return progress; } /* * Handle ITER_BVEC. */ static __always_inline size_t iterate_bvec(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_step_f step) { const struct bio_vec *p = iter->bvec; size_t progress = 0, skip = iter->iov_offset; do { size_t remain, consumed; size_t offset = p->bv_offset + skip, part; void *kaddr = kmap_local_page(p->bv_page + offset / PAGE_SIZE); part = min3(len, (size_t)(p->bv_len - skip), (size_t)(PAGE_SIZE - offset % PAGE_SIZE)); remain = step(kaddr + offset % PAGE_SIZE, progress, part, priv, priv2); kunmap_local(kaddr); consumed = part - remain; len -= consumed; progress += consumed; skip += consumed; if (skip >= p->bv_len) { skip = 0; p++; } if (remain) break; } while (len); iter->nr_segs -= p - iter->bvec; iter->bvec = p; iter->iov_offset = skip; iter->count -= progress; return progress; } /* * Handle ITER_FOLIOQ. */ static __always_inline size_t iterate_folioq(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_step_f step) { const struct folio_queue *folioq = iter->folioq; unsigned int slot = iter->folioq_slot; size_t progress = 0, skip = iter->iov_offset; if (slot == folioq_nr_slots(folioq)) { /* The iterator may have been extended. */ folioq = folioq->next; slot = 0; } do { struct folio *folio = folioq_folio(folioq, slot); size_t part, remain = 0, consumed; size_t fsize; void *base; if (!folio) break; fsize = folioq_folio_size(folioq, slot); if (skip < fsize) { base = kmap_local_folio(folio, skip); part = umin(len, PAGE_SIZE - skip % PAGE_SIZE); remain = step(base, progress, part, priv, priv2); kunmap_local(base); consumed = part - remain; len -= consumed; progress += consumed; skip += consumed; } if (skip >= fsize) { skip = 0; slot++; if (slot == folioq_nr_slots(folioq) && folioq->next) { folioq = folioq->next; slot = 0; } } if (remain) break; } while (len); iter->folioq_slot = slot; iter->folioq = folioq; iter->iov_offset = skip; iter->count -= progress; return progress; } /* * Handle ITER_XARRAY. */ static __always_inline size_t iterate_xarray(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_step_f step) { struct folio *folio; size_t progress = 0; loff_t start = iter->xarray_start + iter->iov_offset; pgoff_t index = start / PAGE_SIZE; XA_STATE(xas, iter->xarray, index); rcu_read_lock(); xas_for_each(&xas, folio, ULONG_MAX) { size_t remain, consumed, offset, part, flen; if (xas_retry(&xas, folio)) continue; if (WARN_ON(xa_is_value(folio))) break; if (WARN_ON(folio_test_hugetlb(folio))) break; offset = offset_in_folio(folio, start + progress); flen = min(folio_size(folio) - offset, len); while (flen) { void *base = kmap_local_folio(folio, offset); part = min_t(size_t, flen, PAGE_SIZE - offset_in_page(offset)); remain = step(base, progress, part, priv, priv2); kunmap_local(base); consumed = part - remain; progress += consumed; len -= consumed; if (remain || len == 0) goto out; flen -= consumed; offset += consumed; } } out: rcu_read_unlock(); iter->iov_offset += progress; iter->count -= progress; return progress; } /* * Handle ITER_DISCARD. */ static __always_inline size_t iterate_discard(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_step_f step) { size_t progress = len; iter->count -= progress; return progress; } /** * iterate_and_advance2 - Iterate over an iterator * @iter: The iterator to iterate over. * @len: The amount to iterate over. * @priv: Data for the step functions. * @priv2: More data for the step functions. * @ustep: Function for UBUF/IOVEC iterators; given __user addresses. * @step: Function for other iterators; given kernel addresses. * * Iterate over the next part of an iterator, up to the specified length. The * buffer is presented in segments, which for kernel iteration are broken up by * physical pages and mapped, with the mapped address being presented. * * Two step functions, @step and @ustep, must be provided, one for handling * mapped kernel addresses and the other is given user addresses which have the * potential to fault since no pinning is performed. * * The step functions are passed the address and length of the segment, @priv, * @priv2 and the amount of data so far iterated over (which can, for example, * be added to @priv to point to the right part of a second buffer). The step * functions should return the amount of the segment they didn't process (ie. 0 * indicates complete processsing). * * This function returns the amount of data processed (ie. 0 means nothing was * processed and the value of @len means processes to completion). */ static __always_inline size_t iterate_and_advance2(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_ustep_f ustep, iov_step_f step) { if (unlikely(iter->count < len)) len = iter->count; if (unlikely(!len)) return 0; if (likely(iter_is_ubuf(iter))) return iterate_ubuf(iter, len, priv, priv2, ustep); if (likely(iter_is_iovec(iter))) return iterate_iovec(iter, len, priv, priv2, ustep); if (iov_iter_is_bvec(iter)) return iterate_bvec(iter, len, priv, priv2, step); if (iov_iter_is_kvec(iter)) return iterate_kvec(iter, len, priv, priv2, step); if (iov_iter_is_folioq(iter)) return iterate_folioq(iter, len, priv, priv2, step); if (iov_iter_is_xarray(iter)) return iterate_xarray(iter, len, priv, priv2, step); return iterate_discard(iter, len, priv, priv2, step); } /** * iterate_and_advance - Iterate over an iterator * @iter: The iterator to iterate over. * @len: The amount to iterate over. * @priv: Data for the step functions. * @ustep: Function for UBUF/IOVEC iterators; given __user addresses. * @step: Function for other iterators; given kernel addresses. * * As iterate_and_advance2(), but priv2 is always NULL. */ static __always_inline size_t iterate_and_advance(struct iov_iter *iter, size_t len, void *priv, iov_ustep_f ustep, iov_step_f step) { return iterate_and_advance2(iter, len, priv, NULL, ustep, step); } /** * iterate_and_advance_kernel - Iterate over a kernel-internal iterator * @iter: The iterator to iterate over. * @len: The amount to iterate over. * @priv: Data for the step functions. * @priv2: More data for the step functions. * @step: Function for other iterators; given kernel addresses. * * Iterate over the next part of an iterator, up to the specified length. The * buffer is presented in segments, which for kernel iteration are broken up by * physical pages and mapped, with the mapped address being presented. * * [!] Note This will only handle BVEC, KVEC, FOLIOQ, XARRAY and DISCARD-type * iterators; it will not handle UBUF or IOVEC-type iterators. * * A step functions, @step, must be provided, one for handling mapped kernel * addresses and the other is given user addresses which have the potential to * fault since no pinning is performed. * * The step functions are passed the address and length of the segment, @priv, * @priv2 and the amount of data so far iterated over (which can, for example, * be added to @priv to point to the right part of a second buffer). The step * functions should return the amount of the segment they didn't process (ie. 0 * indicates complete processsing). * * This function returns the amount of data processed (ie. 0 means nothing was * processed and the value of @len means processes to completion). */ static __always_inline size_t iterate_and_advance_kernel(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_step_f step) { if (unlikely(iter->count < len)) len = iter->count; if (unlikely(!len)) return 0; if (iov_iter_is_bvec(iter)) return iterate_bvec(iter, len, priv, priv2, step); if (iov_iter_is_kvec(iter)) return iterate_kvec(iter, len, priv, priv2, step); if (iov_iter_is_folioq(iter)) return iterate_folioq(iter, len, priv, priv2, step); if (iov_iter_is_xarray(iter)) return iterate_xarray(iter, len, priv, priv2, step); return iterate_discard(iter, len, priv, priv2, step); } #endif /* _LINUX_IOV_ITER_H */ |
| 1 1 1 1 1 1 1 1 1 1 1 1 1 19 1 8 8 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __KVM_FPU_H_ #define __KVM_FPU_H_ #include <asm/fpu/api.h> typedef u32 __attribute__((vector_size(16))) sse128_t; #define __sse128_u union { sse128_t vec; u64 as_u64[2]; u32 as_u32[4]; } #define sse128_lo(x) ({ __sse128_u t; t.vec = x; t.as_u64[0]; }) #define sse128_hi(x) ({ __sse128_u t; t.vec = x; t.as_u64[1]; }) #define sse128_l0(x) ({ __sse128_u t; t.vec = x; t.as_u32[0]; }) #define sse128_l1(x) ({ __sse128_u t; t.vec = x; t.as_u32[1]; }) #define sse128_l2(x) ({ __sse128_u t; t.vec = x; t.as_u32[2]; }) #define sse128_l3(x) ({ __sse128_u t; t.vec = x; t.as_u32[3]; }) #define sse128(lo, hi) ({ __sse128_u t; t.as_u64[0] = lo; t.as_u64[1] = hi; t.vec; }) static inline void _kvm_read_sse_reg(int reg, sse128_t *data) { switch (reg) { case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break; case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break; case 2: asm("movdqa %%xmm2, %0" : "=m"(*data)); break; case 3: asm("movdqa %%xmm3, %0" : "=m"(*data)); break; case 4: asm("movdqa %%xmm4, %0" : "=m"(*data)); break; case 5: asm("movdqa %%xmm5, %0" : "=m"(*data)); break; case 6: asm("movdqa %%xmm6, %0" : "=m"(*data)); break; case 7: asm("movdqa %%xmm7, %0" : "=m"(*data)); break; #ifdef CONFIG_X86_64 case 8: asm("movdqa %%xmm8, %0" : "=m"(*data)); break; case 9: asm("movdqa %%xmm9, %0" : "=m"(*data)); break; case 10: asm("movdqa %%xmm10, %0" : "=m"(*data)); break; case 11: asm("movdqa %%xmm11, %0" : "=m"(*data)); break; case 12: asm("movdqa %%xmm12, %0" : "=m"(*data)); break; case 13: asm("movdqa %%xmm13, %0" : "=m"(*data)); break; case 14: asm("movdqa %%xmm14, %0" : "=m"(*data)); break; case 15: asm("movdqa %%xmm15, %0" : "=m"(*data)); break; #endif default: BUG(); } } static inline void _kvm_write_sse_reg(int reg, const sse128_t *data) { switch (reg) { case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break; case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break; case 2: asm("movdqa %0, %%xmm2" : : "m"(*data)); break; case 3: asm("movdqa %0, %%xmm3" : : "m"(*data)); break; case 4: asm("movdqa %0, %%xmm4" : : "m"(*data)); break; case 5: asm("movdqa %0, %%xmm5" : : "m"(*data)); break; case 6: asm("movdqa %0, %%xmm6" : : "m"(*data)); break; case 7: asm("movdqa %0, %%xmm7" : : "m"(*data)); break; #ifdef CONFIG_X86_64 case 8: asm("movdqa %0, %%xmm8" : : "m"(*data)); break; case 9: asm("movdqa %0, %%xmm9" : : "m"(*data)); break; case 10: asm("movdqa %0, %%xmm10" : : "m"(*data)); break; case 11: asm("movdqa %0, %%xmm11" : : "m"(*data)); break; case 12: asm("movdqa %0, %%xmm12" : : "m"(*data)); break; case 13: asm("movdqa %0, %%xmm13" : : "m"(*data)); break; case 14: asm("movdqa %0, %%xmm14" : : "m"(*data)); break; case 15: asm("movdqa %0, %%xmm15" : : "m"(*data)); break; #endif default: BUG(); } } static inline void _kvm_read_mmx_reg(int reg, u64 *data) { switch (reg) { case 0: asm("movq %%mm0, %0" : "=m"(*data)); break; case 1: asm("movq %%mm1, %0" : "=m"(*data)); break; case 2: asm("movq %%mm2, %0" : "=m"(*data)); break; case 3: asm("movq %%mm3, %0" : "=m"(*data)); break; case 4: asm("movq %%mm4, %0" : "=m"(*data)); break; case 5: asm("movq %%mm5, %0" : "=m"(*data)); break; case 6: asm("movq %%mm6, %0" : "=m"(*data)); break; case 7: asm("movq %%mm7, %0" : "=m"(*data)); break; default: BUG(); } } static inline void _kvm_write_mmx_reg(int reg, const u64 *data) { switch (reg) { case 0: asm("movq %0, %%mm0" : : "m"(*data)); break; case 1: asm("movq %0, %%mm1" : : "m"(*data)); break; case 2: asm("movq %0, %%mm2" : : "m"(*data)); break; case 3: asm("movq %0, %%mm3" : : "m"(*data)); break; case 4: asm("movq %0, %%mm4" : : "m"(*data)); break; case 5: asm("movq %0, %%mm5" : : "m"(*data)); break; case 6: asm("movq %0, %%mm6" : : "m"(*data)); break; case 7: asm("movq %0, %%mm7" : : "m"(*data)); break; default: BUG(); } } static inline void kvm_fpu_get(void) { fpregs_lock(); fpregs_assert_state_consistent(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) switch_fpu_return(); } static inline void kvm_fpu_put(void) { fpregs_unlock(); } static inline void kvm_read_sse_reg(int reg, sse128_t *data) { kvm_fpu_get(); _kvm_read_sse_reg(reg, data); kvm_fpu_put(); } static inline void kvm_write_sse_reg(int reg, const sse128_t *data) { kvm_fpu_get(); _kvm_write_sse_reg(reg, data); kvm_fpu_put(); } static inline void kvm_read_mmx_reg(int reg, u64 *data) { kvm_fpu_get(); _kvm_read_mmx_reg(reg, data); kvm_fpu_put(); } static inline void kvm_write_mmx_reg(int reg, const u64 *data) { kvm_fpu_get(); _kvm_write_mmx_reg(reg, data); kvm_fpu_put(); } #endif |
| 436 422 436 436 29 266 14 266 43 304 283 435 435 436 436 436 424 436 266 424 304 265 19 303 304 266 283 2 2 2 2 436 434 433 29 2 29 280 29 436 435 436 433 53 435 417 49 436 448 245 334 197 336 18 12 181 298 1 434 296 17 8 6 4 5 7 13 14 6 6 6 177 402 265 170 20 12 6 2 3 7 18 18 9 6 5 435 436 433 12 86 365 355 3 362 6 433 4 433 436 436 1 435 428 424 423 36 436 3 27 139 438 18 12 433 436 436 435 436 436 435 435 12 12 12 434 436 436 435 12 5 164 180 245 393 58 12 433 246 310 12 433 168 334 401 401 42 2 182 90 43 175 168 7 354 215 75 173 37 1 189 182 131 157 206 3 4 17 104 105 26 105 21 77 77 9 18 77 59 12 69 13 37 61 110 58 4 58 53 12 7 5 84 84 68 104 77 38 38 30 50 6 11 2 48 80 2 3 16 35 35 31 21 37 37 70 9 6 8 6 1 2 4 20 20 18 4 2 2 1 1 134 65 2 1 68 63 20 44 1 55 1 6 68 5 58 19 1 104 42 24 4 40 38 3 35 2 49 1 29 6 45 43 2 41 2 1 105 2 8 100 8 8 8 8 103 2 1 34 6 98 98 6 95 2 2 4 22 33 16 9 28 4 9 13 7 15 28 15 4 26 1 1 1 1 2 2 3 3 4 90 67 23 5 84 60 3 7 2 1 5 11 14 5 7 58 4 11 68 3 3 3 16 1 2 5 8 13 13 1 1 2 6 3 6 9 5 1 4 5 5 3 2 19 1 1 11 6 4 2 7 2 5 8 126 2 126 126 126 101 101 126 74 104 2 84 47 84 3 3 126 125 2 128 47 84 41 91 9 119 128 128 103 47 57 103 103 384 1 1 4 14 38 9 32 1 7 35 2 1 1 4 5 1 1 2 13 94 2 1 2 5 36 2 2 6 1 16 17 30 1 2 1 2 3 29 1 4 13 1 385 82 82 111 112 47 90 34 61 61 61 19 78 78 78 47 56 63 23 4 15 4 22 1 3 13 2 5 2 2 4 9 39 34 35 39 35 35 35 35 13 21 1 35 35 35 41 39 33 33 33 33 39 39 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Digital Audio (PCM) abstract layer / OSS compatible * Copyright (c) by Jaroslav Kysela <perex@perex.cz> */ #if 0 #define PLUGIN_DEBUG #endif #if 0 #define OSS_DEBUG #endif #include <linux/init.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/time.h> #include <linux/vmalloc.h> #include <linux/module.h> #include <linux/math64.h> #include <linux/string.h> #include <linux/compat.h> #include <sound/core.h> #include <sound/minors.h> #include <sound/pcm.h> #include <sound/pcm_params.h> #include "pcm_plugin.h" #include <sound/info.h> #include <linux/soundcard.h> #include <sound/initval.h> #include <sound/mixer_oss.h> #define OSS_ALSAEMULVER _SIOR ('M', 249, int) static int dsp_map[SNDRV_CARDS]; static int adsp_map[SNDRV_CARDS] = {[0 ... (SNDRV_CARDS-1)] = 1}; static bool nonblock_open = 1; MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>, Abramo Bagnara <abramo@alsa-project.org>"); MODULE_DESCRIPTION("PCM OSS emulation for ALSA."); MODULE_LICENSE("GPL"); module_param_array(dsp_map, int, NULL, 0444); MODULE_PARM_DESC(dsp_map, "PCM device number assigned to 1st OSS device."); module_param_array(adsp_map, int, NULL, 0444); MODULE_PARM_DESC(adsp_map, "PCM device number assigned to 2nd OSS device."); module_param(nonblock_open, bool, 0644); MODULE_PARM_DESC(nonblock_open, "Don't block opening busy PCM devices."); MODULE_ALIAS_SNDRV_MINOR(SNDRV_MINOR_OSS_PCM); MODULE_ALIAS_SNDRV_MINOR(SNDRV_MINOR_OSS_PCM1); static int snd_pcm_oss_get_rate(struct snd_pcm_oss_file *pcm_oss_file); static int snd_pcm_oss_get_channels(struct snd_pcm_oss_file *pcm_oss_file); static int snd_pcm_oss_get_format(struct snd_pcm_oss_file *pcm_oss_file); /* * helper functions to process hw_params */ static int snd_interval_refine_min(struct snd_interval *i, unsigned int min, int openmin) { int changed = 0; if (i->min < min) { i->min = min; i->openmin = openmin; changed = 1; } else if (i->min == min && !i->openmin && openmin) { i->openmin = 1; changed = 1; } if (i->integer) { if (i->openmin) { i->min++; i->openmin = 0; } } if (snd_interval_checkempty(i)) { snd_interval_none(i); return -EINVAL; } return changed; } static int snd_interval_refine_max(struct snd_interval *i, unsigned int max, int openmax) { int changed = 0; if (i->max > max) { i->max = max; i->openmax = openmax; changed = 1; } else if (i->max == max && !i->openmax && openmax) { i->openmax = 1; changed = 1; } if (i->integer) { if (i->openmax) { i->max--; i->openmax = 0; } } if (snd_interval_checkempty(i)) { snd_interval_none(i); return -EINVAL; } return changed; } static int snd_interval_refine_set(struct snd_interval *i, unsigned int val) { struct snd_interval t; t.empty = 0; t.min = t.max = val; t.openmin = t.openmax = 0; t.integer = 1; return snd_interval_refine(i, &t); } /** * snd_pcm_hw_param_value_min * @params: the hw_params instance * @var: parameter to retrieve * @dir: pointer to the direction (-1,0,1) or NULL * * Return the minimum value for field PAR. */ static unsigned int snd_pcm_hw_param_value_min(const struct snd_pcm_hw_params *params, snd_pcm_hw_param_t var, int *dir) { if (hw_is_mask(var)) { if (dir) *dir = 0; return snd_mask_min(hw_param_mask_c(params, var)); } if (hw_is_interval(var)) { const struct snd_interval *i = hw_param_interval_c(params, var); if (dir) *dir = i->openmin; return snd_interval_min(i); } return -EINVAL; } /** * snd_pcm_hw_param_value_max * @params: the hw_params instance * @var: parameter to retrieve * @dir: pointer to the direction (-1,0,1) or NULL * * Return the maximum value for field PAR. */ static int snd_pcm_hw_param_value_max(const struct snd_pcm_hw_params *params, snd_pcm_hw_param_t var, int *dir) { if (hw_is_mask(var)) { if (dir) *dir = 0; return snd_mask_max(hw_param_mask_c(params, var)); } if (hw_is_interval(var)) { const struct snd_interval *i = hw_param_interval_c(params, var); if (dir) *dir = - (int) i->openmax; return snd_interval_max(i); } return -EINVAL; } static int _snd_pcm_hw_param_mask(struct snd_pcm_hw_params *params, snd_pcm_hw_param_t var, const struct snd_mask *val) { int changed; changed = snd_mask_refine(hw_param_mask(params, var), val); if (changed > 0) { params->cmask |= 1 << var; params->rmask |= 1 << var; } return changed; } static int snd_pcm_hw_param_mask(struct snd_pcm_substream *pcm, struct snd_pcm_hw_params *params, snd_pcm_hw_param_t var, const struct snd_mask *val) { int changed = _snd_pcm_hw_param_mask(params, var, val); if (changed < 0) return changed; if (params->rmask) { int err = snd_pcm_hw_refine(pcm, params); if (err < 0) return err; } return 0; } static int _snd_pcm_hw_param_min(struct snd_pcm_hw_params *params, snd_pcm_hw_param_t var, unsigned int val, int dir) { int changed; int open = 0; if (dir) { if (dir > 0) { open = 1; } else if (dir < 0) { if (val > 0) { open = 1; val--; } } } if (hw_is_mask(var)) changed = snd_mask_refine_min(hw_param_mask(params, var), val + !!open); else if (hw_is_interval(var)) changed = snd_interval_refine_min(hw_param_interval(params, var), val, open); else return -EINVAL; if (changed > 0) { params->cmask |= 1 << var; params->rmask |= 1 << var; } return changed; } /** * snd_pcm_hw_param_min * @pcm: PCM instance * @params: the hw_params instance * @var: parameter to retrieve * @val: minimal value * @dir: pointer to the direction (-1,0,1) or NULL * * Inside configuration space defined by PARAMS remove from PAR all * values < VAL. Reduce configuration space accordingly. * Return new minimum or -EINVAL if the configuration space is empty */ static int snd_pcm_hw_param_min(struct snd_pcm_substream *pcm, struct snd_pcm_hw_params *params, snd_pcm_hw_param_t var, unsigned int val, int *dir) { int changed = _snd_pcm_hw_param_min(params, var, val, dir ? *dir : 0); if (changed < 0) return changed; if (params->rmask) { int err = snd_pcm_hw_refine(pcm, params); if (err < 0) return err; } return snd_pcm_hw_param_value_min(params, var, dir); } static int _snd_pcm_hw_param_max(struct snd_pcm_hw_params *params, snd_pcm_hw_param_t var, unsigned int val, int dir) { int changed; int open = 0; if (dir) { if (dir < 0) { open = 1; } else if (dir > 0) { open = 1; val++; } } if (hw_is_mask(var)) { if (val == 0 && open) { snd_mask_none(hw_param_mask(params, var)); changed = -EINVAL; } else changed = snd_mask_refine_max(hw_param_mask(params, var), val - !!open); } else if (hw_is_interval(var)) changed = snd_interval_refine_max(hw_param_interval(params, var), val, open); else return -EINVAL; if (changed > 0) { params->cmask |= 1 << var; params->rmask |= 1 << var; } return changed; } /** * snd_pcm_hw_param_max * @pcm: PCM instance * @params: the hw_params instance * @var: parameter to retrieve * @val: maximal value * @dir: pointer to the direction (-1,0,1) or NULL * * Inside configuration space defined by PARAMS remove from PAR all * values >= VAL + 1. Reduce configuration space accordingly. * Return new maximum or -EINVAL if the configuration space is empty */ static int snd_pcm_hw_param_max(struct snd_pcm_substream *pcm, struct snd_pcm_hw_params *params, snd_pcm_hw_param_t var, unsigned int val, int *dir) { int changed = _snd_pcm_hw_param_max(params, var, val, dir ? *dir : 0); if (changed < 0) return changed; if (params->rmask) { int err = snd_pcm_hw_refine(pcm, params); if (err < 0) return err; } return snd_pcm_hw_param_value_max(params, var, dir); } static int boundary_sub(int a, int adir, int b, int bdir, int *c, int *cdir) { adir = adir < 0 ? -1 : (adir > 0 ? 1 : 0); bdir = bdir < 0 ? -1 : (bdir > 0 ? 1 : 0); *c = a - b; *cdir = adir - bdir; if (*cdir == -2) { (*c)--; } else if (*cdir == 2) { (*c)++; } return 0; } static int boundary_lt(unsigned int a, int adir, unsigned int b, int bdir) { if (adir < 0) { a--; adir = 1; } else if (adir > 0) adir = 1; if (bdir < 0) { b--; bdir = 1; } else if (bdir > 0) bdir = 1; return a < b || (a == b && adir < bdir); } /* Return 1 if min is nearer to best than max */ static int boundary_nearer(int min, int mindir, int best, int bestdir, int max, int maxdir) { int dmin, dmindir; int dmax, dmaxdir; boundary_sub(best, bestdir, min, mindir, &dmin, &dmindir); boundary_sub(max, maxdir, best, bestdir, &dmax, &dmaxdir); return boundary_lt(dmin, dmindir, dmax, dmaxdir); } /** * snd_pcm_hw_param_near * @pcm: PCM instance * @params: the hw_params instance * @var: parameter to retrieve * @best: value to set * @dir: pointer to the direction (-1,0,1) or NULL * * Inside configuration space defined by PARAMS set PAR to the available value * nearest to VAL. Reduce configuration space accordingly. * This function cannot be called for SNDRV_PCM_HW_PARAM_ACCESS, * SNDRV_PCM_HW_PARAM_FORMAT, SNDRV_PCM_HW_PARAM_SUBFORMAT. * Return the value found. */ static int snd_pcm_hw_param_near(struct snd_pcm_substream *pcm, struct snd_pcm_hw_params *params, snd_pcm_hw_param_t var, unsigned int best, int *dir) { struct snd_pcm_hw_params *save __free(kfree) = NULL; int v; unsigned int saved_min; int last = 0; int min, max; int mindir, maxdir; int valdir = dir ? *dir : 0; /* FIXME */ if (best > INT_MAX) best = INT_MAX; min = max = best; mindir = maxdir = valdir; if (maxdir > 0) maxdir = 0; else if (maxdir == 0) maxdir = -1; else { maxdir = 1; max--; } save = kmalloc(sizeof(*save), GFP_KERNEL); if (save == NULL) return -ENOMEM; *save = *params; saved_min = min; min = snd_pcm_hw_param_min(pcm, params, var, min, &mindir); if (min >= 0) { struct snd_pcm_hw_params *params1 __free(kfree) = NULL; if (max < 0) goto _end; if ((unsigned int)min == saved_min && mindir == valdir) goto _end; params1 = kmalloc(sizeof(*params1), GFP_KERNEL); if (params1 == NULL) return -ENOMEM; *params1 = *save; max = snd_pcm_hw_param_max(pcm, params1, var, max, &maxdir); if (max < 0) goto _end; if (boundary_nearer(max, maxdir, best, valdir, min, mindir)) { *params = *params1; last = 1; } } else { *params = *save; max = snd_pcm_hw_param_max(pcm, params, var, max, &maxdir); if (max < 0) return max; last = 1; } _end: if (last) v = snd_pcm_hw_param_last(pcm, params, var, dir); else v = snd_pcm_hw_param_first(pcm, params, var, dir); return v; } static int _snd_pcm_hw_param_set(struct snd_pcm_hw_params *params, snd_pcm_hw_param_t var, unsigned int val, int dir) { int changed; if (hw_is_mask(var)) { struct snd_mask *m = hw_param_mask(params, var); if (val == 0 && dir < 0) { changed = -EINVAL; snd_mask_none(m); } else { if (dir > 0) val++; else if (dir < 0) val--; changed = snd_mask_refine_set(hw_param_mask(params, var), val); } } else if (hw_is_interval(var)) { struct snd_interval *i = hw_param_interval(params, var); if (val == 0 && dir < 0) { changed = -EINVAL; snd_interval_none(i); } else if (dir == 0) changed = snd_interval_refine_set(i, val); else { struct snd_interval t; t.openmin = 1; t.openmax = 1; t.empty = 0; t.integer = 0; if (dir < 0) { t.min = val - 1; t.max = val; } else { t.min = val; t.max = val+1; } changed = snd_interval_refine(i, &t); } } else return -EINVAL; if (changed > 0) { params->cmask |= 1 << var; params->rmask |= 1 << var; } return changed; } /** * snd_pcm_hw_param_set * @pcm: PCM instance * @params: the hw_params instance * @var: parameter to retrieve * @val: value to set * @dir: pointer to the direction (-1,0,1) or NULL * * Inside configuration space defined by PARAMS remove from PAR all * values != VAL. Reduce configuration space accordingly. * Return VAL or -EINVAL if the configuration space is empty */ static int snd_pcm_hw_param_set(struct snd_pcm_substream *pcm, struct snd_pcm_hw_params *params, snd_pcm_hw_param_t var, unsigned int val, int dir) { int changed = _snd_pcm_hw_param_set(params, var, val, dir); if (changed < 0) return changed; if (params->rmask) { int err = snd_pcm_hw_refine(pcm, params); if (err < 0) return err; } return snd_pcm_hw_param_value(params, var, NULL); } static int _snd_pcm_hw_param_setinteger(struct snd_pcm_hw_params *params, snd_pcm_hw_param_t var) { int changed; changed = snd_interval_setinteger(hw_param_interval(params, var)); if (changed > 0) { params->cmask |= 1 << var; params->rmask |= 1 << var; } return changed; } /* * plugin */ #ifdef CONFIG_SND_PCM_OSS_PLUGINS static int snd_pcm_oss_plugin_clear(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; struct snd_pcm_plugin *plugin, *next; plugin = runtime->oss.plugin_first; while (plugin) { next = plugin->next; snd_pcm_plugin_free(plugin); plugin = next; } runtime->oss.plugin_first = runtime->oss.plugin_last = NULL; return 0; } static int snd_pcm_plugin_insert(struct snd_pcm_plugin *plugin) { struct snd_pcm_runtime *runtime = plugin->plug->runtime; plugin->next = runtime->oss.plugin_first; plugin->prev = NULL; if (runtime->oss.plugin_first) { runtime->oss.plugin_first->prev = plugin; runtime->oss.plugin_first = plugin; } else { runtime->oss.plugin_last = runtime->oss.plugin_first = plugin; } return 0; } int snd_pcm_plugin_append(struct snd_pcm_plugin *plugin) { struct snd_pcm_runtime *runtime = plugin->plug->runtime; plugin->next = NULL; plugin->prev = runtime->oss.plugin_last; if (runtime->oss.plugin_last) { runtime->oss.plugin_last->next = plugin; runtime->oss.plugin_last = plugin; } else { runtime->oss.plugin_last = runtime->oss.plugin_first = plugin; } return 0; } #endif /* CONFIG_SND_PCM_OSS_PLUGINS */ static long snd_pcm_oss_bytes(struct snd_pcm_substream *substream, long frames) { struct snd_pcm_runtime *runtime = substream->runtime; long buffer_size = snd_pcm_lib_buffer_bytes(substream); long bytes = frames_to_bytes(runtime, frames); if (buffer_size == runtime->oss.buffer_bytes) return bytes; #if BITS_PER_LONG >= 64 return runtime->oss.buffer_bytes * bytes / buffer_size; #else { u64 bsize = (u64)runtime->oss.buffer_bytes * (u64)bytes; return div_u64(bsize, buffer_size); } #endif } static long snd_pcm_alsa_frames(struct snd_pcm_substream *substream, long bytes) { struct snd_pcm_runtime *runtime = substream->runtime; long buffer_size = snd_pcm_lib_buffer_bytes(substream); if (buffer_size == runtime->oss.buffer_bytes) return bytes_to_frames(runtime, bytes); return bytes_to_frames(runtime, (buffer_size * bytes) / runtime->oss.buffer_bytes); } static inline snd_pcm_uframes_t get_hw_ptr_period(struct snd_pcm_runtime *runtime) { return runtime->hw_ptr_interrupt; } /* define extended formats in the recent OSS versions (if any) */ /* linear formats */ #define AFMT_S32_LE 0x00001000 #define AFMT_S32_BE 0x00002000 #define AFMT_S24_LE 0x00008000 #define AFMT_S24_BE 0x00010000 #define AFMT_S24_PACKED 0x00040000 /* other supported formats */ #define AFMT_FLOAT 0x00004000 #define AFMT_SPDIF_RAW 0x00020000 /* unsupported formats */ #define AFMT_AC3 0x00000400 #define AFMT_VORBIS 0x00000800 static snd_pcm_format_t snd_pcm_oss_format_from(int format) { switch (format) { case AFMT_MU_LAW: return SNDRV_PCM_FORMAT_MU_LAW; case AFMT_A_LAW: return SNDRV_PCM_FORMAT_A_LAW; case AFMT_IMA_ADPCM: return SNDRV_PCM_FORMAT_IMA_ADPCM; case AFMT_U8: return SNDRV_PCM_FORMAT_U8; case AFMT_S16_LE: return SNDRV_PCM_FORMAT_S16_LE; case AFMT_S16_BE: return SNDRV_PCM_FORMAT_S16_BE; case AFMT_S8: return SNDRV_PCM_FORMAT_S8; case AFMT_U16_LE: return SNDRV_PCM_FORMAT_U16_LE; case AFMT_U16_BE: return SNDRV_PCM_FORMAT_U16_BE; case AFMT_MPEG: return SNDRV_PCM_FORMAT_MPEG; case AFMT_S32_LE: return SNDRV_PCM_FORMAT_S32_LE; case AFMT_S32_BE: return SNDRV_PCM_FORMAT_S32_BE; case AFMT_S24_LE: return SNDRV_PCM_FORMAT_S24_LE; case AFMT_S24_BE: return SNDRV_PCM_FORMAT_S24_BE; case AFMT_S24_PACKED: return SNDRV_PCM_FORMAT_S24_3LE; case AFMT_FLOAT: return SNDRV_PCM_FORMAT_FLOAT; case AFMT_SPDIF_RAW: return SNDRV_PCM_FORMAT_IEC958_SUBFRAME; default: return SNDRV_PCM_FORMAT_U8; } } static int snd_pcm_oss_format_to(snd_pcm_format_t format) { switch (format) { case SNDRV_PCM_FORMAT_MU_LAW: return AFMT_MU_LAW; case SNDRV_PCM_FORMAT_A_LAW: return AFMT_A_LAW; case SNDRV_PCM_FORMAT_IMA_ADPCM: return AFMT_IMA_ADPCM; case SNDRV_PCM_FORMAT_U8: return AFMT_U8; case SNDRV_PCM_FORMAT_S16_LE: return AFMT_S16_LE; case SNDRV_PCM_FORMAT_S16_BE: return AFMT_S16_BE; case SNDRV_PCM_FORMAT_S8: return AFMT_S8; case SNDRV_PCM_FORMAT_U16_LE: return AFMT_U16_LE; case SNDRV_PCM_FORMAT_U16_BE: return AFMT_U16_BE; case SNDRV_PCM_FORMAT_MPEG: return AFMT_MPEG; case SNDRV_PCM_FORMAT_S32_LE: return AFMT_S32_LE; case SNDRV_PCM_FORMAT_S32_BE: return AFMT_S32_BE; case SNDRV_PCM_FORMAT_S24_LE: return AFMT_S24_LE; case SNDRV_PCM_FORMAT_S24_BE: return AFMT_S24_BE; case SNDRV_PCM_FORMAT_S24_3LE: return AFMT_S24_PACKED; case SNDRV_PCM_FORMAT_FLOAT: return AFMT_FLOAT; case SNDRV_PCM_FORMAT_IEC958_SUBFRAME: return AFMT_SPDIF_RAW; default: return -EINVAL; } } static int snd_pcm_oss_period_size(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *oss_params, struct snd_pcm_hw_params *slave_params) { ssize_t s; ssize_t oss_buffer_size; ssize_t oss_period_size, oss_periods; ssize_t min_period_size, max_period_size; struct snd_pcm_runtime *runtime = substream->runtime; size_t oss_frame_size; oss_frame_size = snd_pcm_format_physical_width(params_format(oss_params)) * params_channels(oss_params) / 8; oss_buffer_size = snd_pcm_hw_param_value_max(slave_params, SNDRV_PCM_HW_PARAM_BUFFER_SIZE, NULL); if (oss_buffer_size <= 0) return -EINVAL; oss_buffer_size = snd_pcm_plug_client_size(substream, oss_buffer_size * oss_frame_size); if (oss_buffer_size <= 0) return -EINVAL; oss_buffer_size = rounddown_pow_of_two(oss_buffer_size); if (atomic_read(&substream->mmap_count)) { if (oss_buffer_size > runtime->oss.mmap_bytes) oss_buffer_size = runtime->oss.mmap_bytes; } if (substream->oss.setup.period_size > 16) oss_period_size = substream->oss.setup.period_size; else if (runtime->oss.fragshift) { oss_period_size = 1 << runtime->oss.fragshift; if (oss_period_size > oss_buffer_size / 2) oss_period_size = oss_buffer_size / 2; } else { int sd; size_t bytes_per_sec = params_rate(oss_params) * snd_pcm_format_physical_width(params_format(oss_params)) * params_channels(oss_params) / 8; oss_period_size = oss_buffer_size; do { oss_period_size /= 2; } while (oss_period_size > bytes_per_sec); if (runtime->oss.subdivision == 0) { sd = 4; if (oss_period_size / sd > 4096) sd *= 2; if (oss_period_size / sd < 4096) sd = 1; } else sd = runtime->oss.subdivision; oss_period_size /= sd; if (oss_period_size < 16) oss_period_size = 16; } min_period_size = snd_pcm_plug_client_size(substream, snd_pcm_hw_param_value_min(slave_params, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, NULL)); if (min_period_size > 0) { min_period_size *= oss_frame_size; min_period_size = roundup_pow_of_two(min_period_size); if (oss_period_size < min_period_size) oss_period_size = min_period_size; } max_period_size = snd_pcm_plug_client_size(substream, snd_pcm_hw_param_value_max(slave_params, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, NULL)); if (max_period_size > 0) { max_period_size *= oss_frame_size; max_period_size = rounddown_pow_of_two(max_period_size); if (oss_period_size > max_period_size) oss_period_size = max_period_size; } oss_periods = oss_buffer_size / oss_period_size; if (substream->oss.setup.periods > 1) oss_periods = substream->oss.setup.periods; s = snd_pcm_hw_param_value_max(slave_params, SNDRV_PCM_HW_PARAM_PERIODS, NULL); if (s > 0 && runtime->oss.maxfrags && s > runtime->oss.maxfrags) s = runtime->oss.maxfrags; if (oss_periods > s) oss_periods = s; s = snd_pcm_hw_param_value_min(slave_params, SNDRV_PCM_HW_PARAM_PERIODS, NULL); if (s < 2) s = 2; if (oss_periods < s) oss_periods = s; while (oss_period_size * oss_periods > oss_buffer_size) oss_period_size /= 2; if (oss_period_size < 16) return -EINVAL; /* don't allocate too large period; 1MB period must be enough */ if (oss_period_size > 1024 * 1024) return -ENOMEM; runtime->oss.period_bytes = oss_period_size; runtime->oss.period_frames = 1; runtime->oss.periods = oss_periods; return 0; } static int choose_rate(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params, unsigned int best_rate) { const struct snd_interval *it; struct snd_pcm_hw_params *save __free(kfree) = NULL; unsigned int rate, prev; save = kmalloc(sizeof(*save), GFP_KERNEL); if (save == NULL) return -ENOMEM; *save = *params; it = hw_param_interval_c(save, SNDRV_PCM_HW_PARAM_RATE); /* try multiples of the best rate */ rate = best_rate; for (;;) { if (it->max < rate || (it->max == rate && it->openmax)) break; if (it->min < rate || (it->min == rate && !it->openmin)) { int ret; ret = snd_pcm_hw_param_set(substream, params, SNDRV_PCM_HW_PARAM_RATE, rate, 0); if (ret == (int)rate) return rate; *params = *save; } prev = rate; rate += best_rate; if (rate <= prev) break; } /* not found, use the nearest rate */ return snd_pcm_hw_param_near(substream, params, SNDRV_PCM_HW_PARAM_RATE, best_rate, NULL); } /* parameter locking: returns immediately if tried during streaming */ static int lock_params(struct snd_pcm_runtime *runtime) { if (mutex_lock_interruptible(&runtime->oss.params_lock)) return -ERESTARTSYS; if (atomic_read(&runtime->oss.rw_ref)) { mutex_unlock(&runtime->oss.params_lock); return -EBUSY; } return 0; } static void unlock_params(struct snd_pcm_runtime *runtime) { mutex_unlock(&runtime->oss.params_lock); } static void snd_pcm_oss_release_buffers(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; kvfree(runtime->oss.buffer); runtime->oss.buffer = NULL; #ifdef CONFIG_SND_PCM_OSS_PLUGINS snd_pcm_oss_plugin_clear(substream); #endif } /* call with params_lock held */ static int snd_pcm_oss_change_params_locked(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; struct snd_pcm_hw_params *params, *sparams; struct snd_pcm_sw_params *sw_params; ssize_t oss_buffer_size, oss_period_size; size_t oss_frame_size; int err; int direct; snd_pcm_format_t format, sformat; int n; const struct snd_mask *sformat_mask; struct snd_mask mask; if (!runtime->oss.params) return 0; sw_params = kzalloc(sizeof(*sw_params), GFP_KERNEL); params = kmalloc(sizeof(*params), GFP_KERNEL); sparams = kmalloc(sizeof(*sparams), GFP_KERNEL); if (!sw_params || !params || !sparams) { err = -ENOMEM; goto failure; } if (atomic_read(&substream->mmap_count)) direct = 1; else direct = substream->oss.setup.direct; _snd_pcm_hw_params_any(sparams); _snd_pcm_hw_param_setinteger(sparams, SNDRV_PCM_HW_PARAM_PERIODS); _snd_pcm_hw_param_min(sparams, SNDRV_PCM_HW_PARAM_PERIODS, 2, 0); snd_mask_none(&mask); if (atomic_read(&substream->mmap_count)) snd_mask_set(&mask, (__force int)SNDRV_PCM_ACCESS_MMAP_INTERLEAVED); else { snd_mask_set(&mask, (__force int)SNDRV_PCM_ACCESS_RW_INTERLEAVED); if (!direct) snd_mask_set(&mask, (__force int)SNDRV_PCM_ACCESS_RW_NONINTERLEAVED); } err = snd_pcm_hw_param_mask(substream, sparams, SNDRV_PCM_HW_PARAM_ACCESS, &mask); if (err < 0) { pcm_dbg(substream->pcm, "No usable accesses\n"); err = -EINVAL; goto failure; } err = choose_rate(substream, sparams, runtime->oss.rate); if (err < 0) goto failure; err = snd_pcm_hw_param_near(substream, sparams, SNDRV_PCM_HW_PARAM_CHANNELS, runtime->oss.channels, NULL); if (err < 0) goto failure; format = snd_pcm_oss_format_from(runtime->oss.format); sformat_mask = hw_param_mask_c(sparams, SNDRV_PCM_HW_PARAM_FORMAT); if (direct) sformat = format; else sformat = snd_pcm_plug_slave_format(format, sformat_mask); if ((__force int)sformat < 0 || !snd_mask_test_format(sformat_mask, sformat)) { pcm_for_each_format(sformat) { if (snd_mask_test_format(sformat_mask, sformat) && snd_pcm_oss_format_to(sformat) >= 0) goto format_found; } pcm_dbg(substream->pcm, "Cannot find a format!!!\n"); err = -EINVAL; goto failure; } format_found: err = _snd_pcm_hw_param_set(sparams, SNDRV_PCM_HW_PARAM_FORMAT, (__force int)sformat, 0); if (err < 0) goto failure; if (direct) { memcpy(params, sparams, sizeof(*params)); } else { _snd_pcm_hw_params_any(params); _snd_pcm_hw_param_set(params, SNDRV_PCM_HW_PARAM_ACCESS, (__force int)SNDRV_PCM_ACCESS_RW_INTERLEAVED, 0); _snd_pcm_hw_param_set(params, SNDRV_PCM_HW_PARAM_FORMAT, (__force int)snd_pcm_oss_format_from(runtime->oss.format), 0); _snd_pcm_hw_param_set(params, SNDRV_PCM_HW_PARAM_CHANNELS, runtime->oss.channels, 0); _snd_pcm_hw_param_set(params, SNDRV_PCM_HW_PARAM_RATE, runtime->oss.rate, 0); pdprintf("client: access = %i, format = %i, channels = %i, rate = %i\n", params_access(params), params_format(params), params_channels(params), params_rate(params)); } pdprintf("slave: access = %i, format = %i, channels = %i, rate = %i\n", params_access(sparams), params_format(sparams), params_channels(sparams), params_rate(sparams)); oss_frame_size = snd_pcm_format_physical_width(params_format(params)) * params_channels(params) / 8; err = snd_pcm_oss_period_size(substream, params, sparams); if (err < 0) goto failure; n = snd_pcm_plug_slave_size(substream, runtime->oss.period_bytes / oss_frame_size); err = snd_pcm_hw_param_near(substream, sparams, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, n, NULL); if (err < 0) goto failure; err = snd_pcm_hw_param_near(substream, sparams, SNDRV_PCM_HW_PARAM_PERIODS, runtime->oss.periods, NULL); if (err < 0) goto failure; snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_DROP, NULL); err = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_HW_PARAMS, sparams); if (err < 0) { pcm_dbg(substream->pcm, "HW_PARAMS failed: %i\n", err); goto failure; } #ifdef CONFIG_SND_PCM_OSS_PLUGINS snd_pcm_oss_plugin_clear(substream); if (!direct) { /* add necessary plugins */ err = snd_pcm_plug_format_plugins(substream, params, sparams); if (err < 0) { pcm_dbg(substream->pcm, "snd_pcm_plug_format_plugins failed: %i\n", err); goto failure; } if (runtime->oss.plugin_first) { struct snd_pcm_plugin *plugin; err = snd_pcm_plugin_build_io(substream, sparams, &plugin); if (err < 0) { pcm_dbg(substream->pcm, "snd_pcm_plugin_build_io failed: %i\n", err); goto failure; } if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) { err = snd_pcm_plugin_append(plugin); } else { err = snd_pcm_plugin_insert(plugin); } if (err < 0) goto failure; } } #endif if (runtime->oss.trigger) { sw_params->start_threshold = 1; } else { sw_params->start_threshold = runtime->boundary; } if (atomic_read(&substream->mmap_count) || substream->stream == SNDRV_PCM_STREAM_CAPTURE) sw_params->stop_threshold = runtime->boundary; else sw_params->stop_threshold = runtime->buffer_size; sw_params->tstamp_mode = SNDRV_PCM_TSTAMP_NONE; sw_params->period_step = 1; sw_params->avail_min = substream->stream == SNDRV_PCM_STREAM_PLAYBACK ? 1 : runtime->period_size; if (atomic_read(&substream->mmap_count) || substream->oss.setup.nosilence) { sw_params->silence_threshold = 0; sw_params->silence_size = 0; } else { snd_pcm_uframes_t frames; frames = runtime->period_size + 16; if (frames > runtime->buffer_size) frames = runtime->buffer_size; sw_params->silence_threshold = frames; sw_params->silence_size = frames; } err = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_SW_PARAMS, sw_params); if (err < 0) { pcm_dbg(substream->pcm, "SW_PARAMS failed: %i\n", err); goto failure; } runtime->oss.periods = params_periods(sparams); oss_period_size = snd_pcm_plug_client_size(substream, params_period_size(sparams)); if (oss_period_size < 0) { err = -EINVAL; goto failure; } #ifdef CONFIG_SND_PCM_OSS_PLUGINS if (runtime->oss.plugin_first) { err = snd_pcm_plug_alloc(substream, oss_period_size); if (err < 0) goto failure; } #endif oss_period_size = array_size(oss_period_size, oss_frame_size); oss_buffer_size = array_size(oss_period_size, runtime->oss.periods); if (oss_buffer_size <= 0) { err = -EINVAL; goto failure; } runtime->oss.period_bytes = oss_period_size; runtime->oss.buffer_bytes = oss_buffer_size; pdprintf("oss: period bytes = %i, buffer bytes = %i\n", runtime->oss.period_bytes, runtime->oss.buffer_bytes); pdprintf("slave: period_size = %i, buffer_size = %i\n", params_period_size(sparams), params_buffer_size(sparams)); runtime->oss.format = snd_pcm_oss_format_to(params_format(params)); runtime->oss.channels = params_channels(params); runtime->oss.rate = params_rate(params); kvfree(runtime->oss.buffer); runtime->oss.buffer = kvzalloc(runtime->oss.period_bytes, GFP_KERNEL); if (!runtime->oss.buffer) { err = -ENOMEM; goto failure; } runtime->oss.params = 0; runtime->oss.prepare = 1; runtime->oss.buffer_used = 0; snd_pcm_runtime_buffer_set_silence(runtime); runtime->oss.period_frames = snd_pcm_alsa_frames(substream, oss_period_size); err = 0; failure: if (err) snd_pcm_oss_release_buffers(substream); kfree(sw_params); kfree(params); kfree(sparams); return err; } /* this one takes the lock by itself */ static int snd_pcm_oss_change_params(struct snd_pcm_substream *substream, bool trylock) { struct snd_pcm_runtime *runtime = substream->runtime; int err; if (trylock) { if (!(mutex_trylock(&runtime->oss.params_lock))) return -EAGAIN; } else if (mutex_lock_interruptible(&runtime->oss.params_lock)) return -ERESTARTSYS; err = snd_pcm_oss_change_params_locked(substream); mutex_unlock(&runtime->oss.params_lock); return err; } static int snd_pcm_oss_get_active_substream(struct snd_pcm_oss_file *pcm_oss_file, struct snd_pcm_substream **r_substream) { int idx, err; struct snd_pcm_substream *asubstream = NULL, *substream; for (idx = 0; idx < 2; idx++) { substream = pcm_oss_file->streams[idx]; if (substream == NULL) continue; if (asubstream == NULL) asubstream = substream; if (substream->runtime->oss.params) { err = snd_pcm_oss_change_params(substream, false); if (err < 0) return err; } } if (!asubstream) return -EIO; if (r_substream) *r_substream = asubstream; return 0; } /* call with params_lock held */ /* NOTE: this always call PREPARE unconditionally no matter whether * runtime->oss.prepare is set or not */ static int snd_pcm_oss_prepare(struct snd_pcm_substream *substream) { int err; struct snd_pcm_runtime *runtime = substream->runtime; err = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_PREPARE, NULL); if (err < 0) { pcm_dbg(substream->pcm, "snd_pcm_oss_prepare: SNDRV_PCM_IOCTL_PREPARE failed\n"); return err; } runtime->oss.prepare = 0; runtime->oss.prev_hw_ptr_period = 0; runtime->oss.period_ptr = 0; runtime->oss.buffer_used = 0; return 0; } static int snd_pcm_oss_make_ready(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime; int err; runtime = substream->runtime; if (runtime->oss.params) { err = snd_pcm_oss_change_params(substream, false); if (err < 0) return err; } if (runtime->oss.prepare) { if (mutex_lock_interruptible(&runtime->oss.params_lock)) return -ERESTARTSYS; err = snd_pcm_oss_prepare(substream); mutex_unlock(&runtime->oss.params_lock); if (err < 0) return err; } return 0; } /* call with params_lock held */ static int snd_pcm_oss_make_ready_locked(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime; int err; runtime = substream->runtime; if (runtime->oss.params) { err = snd_pcm_oss_change_params_locked(substream); if (err < 0) return err; } if (runtime->oss.prepare) { err = snd_pcm_oss_prepare(substream); if (err < 0) return err; } return 0; } static int snd_pcm_oss_capture_position_fixup(struct snd_pcm_substream *substream, snd_pcm_sframes_t *delay) { struct snd_pcm_runtime *runtime; snd_pcm_uframes_t frames; int err = 0; while (1) { err = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_DELAY, delay); if (err < 0) break; runtime = substream->runtime; if (*delay <= (snd_pcm_sframes_t)runtime->buffer_size) break; /* in case of overrun, skip whole periods like OSS/Linux driver does */ /* until avail(delay) <= buffer_size */ frames = (*delay - runtime->buffer_size) + runtime->period_size - 1; frames /= runtime->period_size; frames *= runtime->period_size; err = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_FORWARD, &frames); if (err < 0) break; } return err; } snd_pcm_sframes_t snd_pcm_oss_write3(struct snd_pcm_substream *substream, const char *ptr, snd_pcm_uframes_t frames, int in_kernel) { struct snd_pcm_runtime *runtime = substream->runtime; int ret; while (1) { if (runtime->state == SNDRV_PCM_STATE_XRUN || runtime->state == SNDRV_PCM_STATE_SUSPENDED) { #ifdef OSS_DEBUG pcm_dbg(substream->pcm, "pcm_oss: write: recovering from %s\n", runtime->state == SNDRV_PCM_STATE_XRUN ? "XRUN" : "SUSPEND"); #endif ret = snd_pcm_oss_prepare(substream); if (ret < 0) break; } mutex_unlock(&runtime->oss.params_lock); ret = __snd_pcm_lib_xfer(substream, (void *)ptr, true, frames, in_kernel); mutex_lock(&runtime->oss.params_lock); if (ret != -EPIPE && ret != -ESTRPIPE) break; /* test, if we can't store new data, because the stream */ /* has not been started */ if (runtime->state == SNDRV_PCM_STATE_PREPARED) return -EAGAIN; } return ret; } snd_pcm_sframes_t snd_pcm_oss_read3(struct snd_pcm_substream *substream, char *ptr, snd_pcm_uframes_t frames, int in_kernel) { struct snd_pcm_runtime *runtime = substream->runtime; snd_pcm_sframes_t delay; int ret; while (1) { if (runtime->state == SNDRV_PCM_STATE_XRUN || runtime->state == SNDRV_PCM_STATE_SUSPENDED) { #ifdef OSS_DEBUG pcm_dbg(substream->pcm, "pcm_oss: read: recovering from %s\n", runtime->state == SNDRV_PCM_STATE_XRUN ? "XRUN" : "SUSPEND"); #endif ret = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_DRAIN, NULL); if (ret < 0) break; } else if (runtime->state == SNDRV_PCM_STATE_SETUP) { ret = snd_pcm_oss_prepare(substream); if (ret < 0) break; } ret = snd_pcm_oss_capture_position_fixup(substream, &delay); if (ret < 0) break; mutex_unlock(&runtime->oss.params_lock); ret = __snd_pcm_lib_xfer(substream, (void *)ptr, true, frames, in_kernel); mutex_lock(&runtime->oss.params_lock); if (ret == -EPIPE) { if (runtime->state == SNDRV_PCM_STATE_DRAINING) { ret = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_DROP, NULL); if (ret < 0) break; } continue; } if (ret != -ESTRPIPE) break; } return ret; } #ifdef CONFIG_SND_PCM_OSS_PLUGINS snd_pcm_sframes_t snd_pcm_oss_writev3(struct snd_pcm_substream *substream, void **bufs, snd_pcm_uframes_t frames) { struct snd_pcm_runtime *runtime = substream->runtime; int ret; while (1) { if (runtime->state == SNDRV_PCM_STATE_XRUN || runtime->state == SNDRV_PCM_STATE_SUSPENDED) { #ifdef OSS_DEBUG pcm_dbg(substream->pcm, "pcm_oss: writev: recovering from %s\n", runtime->state == SNDRV_PCM_STATE_XRUN ? "XRUN" : "SUSPEND"); #endif ret = snd_pcm_oss_prepare(substream); if (ret < 0) break; } ret = snd_pcm_kernel_writev(substream, bufs, frames); if (ret != -EPIPE && ret != -ESTRPIPE) break; /* test, if we can't store new data, because the stream */ /* has not been started */ if (runtime->state == SNDRV_PCM_STATE_PREPARED) return -EAGAIN; } return ret; } snd_pcm_sframes_t snd_pcm_oss_readv3(struct snd_pcm_substream *substream, void **bufs, snd_pcm_uframes_t frames) { struct snd_pcm_runtime *runtime = substream->runtime; int ret; while (1) { if (runtime->state == SNDRV_PCM_STATE_XRUN || runtime->state == SNDRV_PCM_STATE_SUSPENDED) { #ifdef OSS_DEBUG pcm_dbg(substream->pcm, "pcm_oss: readv: recovering from %s\n", runtime->state == SNDRV_PCM_STATE_XRUN ? "XRUN" : "SUSPEND"); #endif ret = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_DRAIN, NULL); if (ret < 0) break; } else if (runtime->state == SNDRV_PCM_STATE_SETUP) { ret = snd_pcm_oss_prepare(substream); if (ret < 0) break; } ret = snd_pcm_kernel_readv(substream, bufs, frames); if (ret != -EPIPE && ret != -ESTRPIPE) break; } return ret; } #endif /* CONFIG_SND_PCM_OSS_PLUGINS */ static ssize_t snd_pcm_oss_write2(struct snd_pcm_substream *substream, const char *buf, size_t bytes, int in_kernel) { struct snd_pcm_runtime *runtime = substream->runtime; snd_pcm_sframes_t frames, frames1; #ifdef CONFIG_SND_PCM_OSS_PLUGINS if (runtime->oss.plugin_first) { struct snd_pcm_plugin_channel *channels; size_t oss_frame_bytes = (runtime->oss.plugin_first->src_width * runtime->oss.plugin_first->src_format.channels) / 8; if (!in_kernel) { if (copy_from_user(runtime->oss.buffer, (const char __force __user *)buf, bytes)) return -EFAULT; buf = runtime->oss.buffer; } frames = bytes / oss_frame_bytes; frames1 = snd_pcm_plug_client_channels_buf(substream, (char *)buf, frames, &channels); if (frames1 < 0) return frames1; frames1 = snd_pcm_plug_write_transfer(substream, channels, frames1); if (frames1 <= 0) return frames1; bytes = frames1 * oss_frame_bytes; } else #endif { frames = bytes_to_frames(runtime, bytes); frames1 = snd_pcm_oss_write3(substream, buf, frames, in_kernel); if (frames1 <= 0) return frames1; bytes = frames_to_bytes(runtime, frames1); } return bytes; } static ssize_t snd_pcm_oss_write1(struct snd_pcm_substream *substream, const char __user *buf, size_t bytes) { size_t xfer = 0; ssize_t tmp = 0; struct snd_pcm_runtime *runtime = substream->runtime; if (atomic_read(&substream->mmap_count)) return -ENXIO; atomic_inc(&runtime->oss.rw_ref); while (bytes > 0) { if (mutex_lock_interruptible(&runtime->oss.params_lock)) { tmp = -ERESTARTSYS; break; } tmp = snd_pcm_oss_make_ready_locked(substream); if (tmp < 0) goto err; if (bytes < runtime->oss.period_bytes || runtime->oss.buffer_used > 0) { tmp = bytes; if (tmp + runtime->oss.buffer_used > runtime->oss.period_bytes) tmp = runtime->oss.period_bytes - runtime->oss.buffer_used; if (tmp > 0) { if (copy_from_user(runtime->oss.buffer + runtime->oss.buffer_used, buf, tmp)) { tmp = -EFAULT; goto err; } } runtime->oss.buffer_used += tmp; buf += tmp; bytes -= tmp; xfer += tmp; if (substream->oss.setup.partialfrag || runtime->oss.buffer_used == runtime->oss.period_bytes) { tmp = snd_pcm_oss_write2(substream, runtime->oss.buffer + runtime->oss.period_ptr, runtime->oss.buffer_used - runtime->oss.period_ptr, 1); if (tmp <= 0) goto err; runtime->oss.bytes += tmp; runtime->oss.period_ptr += tmp; runtime->oss.period_ptr %= runtime->oss.period_bytes; if (runtime->oss.period_ptr == 0 || runtime->oss.period_ptr == runtime->oss.buffer_used) runtime->oss.buffer_used = 0; else if ((substream->f_flags & O_NONBLOCK) != 0) { tmp = -EAGAIN; goto err; } } } else { tmp = snd_pcm_oss_write2(substream, (const char __force *)buf, runtime->oss.period_bytes, 0); if (tmp <= 0) goto err; runtime->oss.bytes += tmp; buf += tmp; bytes -= tmp; xfer += tmp; if ((substream->f_flags & O_NONBLOCK) != 0 && tmp != runtime->oss.period_bytes) tmp = -EAGAIN; } err: mutex_unlock(&runtime->oss.params_lock); if (tmp < 0) break; if (signal_pending(current)) { tmp = -ERESTARTSYS; break; } tmp = 0; } atomic_dec(&runtime->oss.rw_ref); return xfer > 0 ? (snd_pcm_sframes_t)xfer : tmp; } static ssize_t snd_pcm_oss_read2(struct snd_pcm_substream *substream, char *buf, size_t bytes, int in_kernel) { struct snd_pcm_runtime *runtime = substream->runtime; snd_pcm_sframes_t frames, frames1; #ifdef CONFIG_SND_PCM_OSS_PLUGINS char __user *final_dst = (char __force __user *)buf; if (runtime->oss.plugin_first) { struct snd_pcm_plugin_channel *channels; size_t oss_frame_bytes = (runtime->oss.plugin_last->dst_width * runtime->oss.plugin_last->dst_format.channels) / 8; if (!in_kernel) buf = runtime->oss.buffer; frames = bytes / oss_frame_bytes; frames1 = snd_pcm_plug_client_channels_buf(substream, buf, frames, &channels); if (frames1 < 0) return frames1; frames1 = snd_pcm_plug_read_transfer(substream, channels, frames1); if (frames1 <= 0) return frames1; bytes = frames1 * oss_frame_bytes; if (!in_kernel && copy_to_user(final_dst, buf, bytes)) return -EFAULT; } else #endif { frames = bytes_to_frames(runtime, bytes); frames1 = snd_pcm_oss_read3(substream, buf, frames, in_kernel); if (frames1 <= 0) return frames1; bytes = frames_to_bytes(runtime, frames1); } return bytes; } static ssize_t snd_pcm_oss_read1(struct snd_pcm_substream *substream, char __user *buf, size_t bytes) { size_t xfer = 0; ssize_t tmp = 0; struct snd_pcm_runtime *runtime = substream->runtime; if (atomic_read(&substream->mmap_count)) return -ENXIO; atomic_inc(&runtime->oss.rw_ref); while (bytes > 0) { if (mutex_lock_interruptible(&runtime->oss.params_lock)) { tmp = -ERESTARTSYS; break; } tmp = snd_pcm_oss_make_ready_locked(substream); if (tmp < 0) goto err; if (bytes < runtime->oss.period_bytes || runtime->oss.buffer_used > 0) { if (runtime->oss.buffer_used == 0) { tmp = snd_pcm_oss_read2(substream, runtime->oss.buffer, runtime->oss.period_bytes, 1); if (tmp <= 0) goto err; runtime->oss.bytes += tmp; runtime->oss.period_ptr = tmp; runtime->oss.buffer_used = tmp; } tmp = bytes; if ((size_t) tmp > runtime->oss.buffer_used) tmp = runtime->oss.buffer_used; if (copy_to_user(buf, runtime->oss.buffer + (runtime->oss.period_ptr - runtime->oss.buffer_used), tmp)) { tmp = -EFAULT; goto err; } buf += tmp; bytes -= tmp; xfer += tmp; runtime->oss.buffer_used -= tmp; } else { tmp = snd_pcm_oss_read2(substream, (char __force *)buf, runtime->oss.period_bytes, 0); if (tmp <= 0) goto err; runtime->oss.bytes += tmp; buf += tmp; bytes -= tmp; xfer += tmp; } err: mutex_unlock(&runtime->oss.params_lock); if (tmp < 0) break; if (signal_pending(current)) { tmp = -ERESTARTSYS; break; } tmp = 0; } atomic_dec(&runtime->oss.rw_ref); return xfer > 0 ? (snd_pcm_sframes_t)xfer : tmp; } static int snd_pcm_oss_reset(struct snd_pcm_oss_file *pcm_oss_file) { struct snd_pcm_substream *substream; struct snd_pcm_runtime *runtime; int i; for (i = 0; i < 2; i++) { substream = pcm_oss_file->streams[i]; if (!substream) continue; runtime = substream->runtime; snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_DROP, NULL); mutex_lock(&runtime->oss.params_lock); runtime->oss.prepare = 1; runtime->oss.buffer_used = 0; runtime->oss.prev_hw_ptr_period = 0; runtime->oss.period_ptr = 0; mutex_unlock(&runtime->oss.params_lock); } return 0; } static int snd_pcm_oss_post(struct snd_pcm_oss_file *pcm_oss_file) { struct snd_pcm_substream *substream; int err; substream = pcm_oss_file->streams[SNDRV_PCM_STREAM_PLAYBACK]; if (substream != NULL) { err = snd_pcm_oss_make_ready(substream); if (err < 0) return err; snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_START, NULL); } /* note: all errors from the start action are ignored */ /* OSS apps do not know, how to handle them */ return 0; } static int snd_pcm_oss_sync1(struct snd_pcm_substream *substream, size_t size) { struct snd_pcm_runtime *runtime; ssize_t result = 0; snd_pcm_state_t state; long res; wait_queue_entry_t wait; runtime = substream->runtime; init_waitqueue_entry(&wait, current); add_wait_queue(&runtime->sleep, &wait); #ifdef OSS_DEBUG pcm_dbg(substream->pcm, "sync1: size = %li\n", size); #endif while (1) { result = snd_pcm_oss_write2(substream, runtime->oss.buffer, size, 1); if (result > 0) { runtime->oss.buffer_used = 0; result = 0; break; } if (result != 0 && result != -EAGAIN) break; result = 0; set_current_state(TASK_INTERRUPTIBLE); scoped_guard(pcm_stream_lock_irq, substream) state = runtime->state; if (state != SNDRV_PCM_STATE_RUNNING) { set_current_state(TASK_RUNNING); break; } res = schedule_timeout(10 * HZ); if (signal_pending(current)) { result = -ERESTARTSYS; break; } if (res == 0) { pcm_err(substream->pcm, "OSS sync error - DMA timeout\n"); result = -EIO; break; } } remove_wait_queue(&runtime->sleep, &wait); return result; } static int snd_pcm_oss_sync(struct snd_pcm_oss_file *pcm_oss_file) { int err = 0; unsigned int saved_f_flags; struct snd_pcm_substream *substream; struct snd_pcm_runtime *runtime; snd_pcm_format_t format; unsigned long width; size_t size; substream = pcm_oss_file->streams[SNDRV_PCM_STREAM_PLAYBACK]; if (substream != NULL) { runtime = substream->runtime; if (atomic_read(&substream->mmap_count)) goto __direct; atomic_inc(&runtime->oss.rw_ref); if (mutex_lock_interruptible(&runtime->oss.params_lock)) { atomic_dec(&runtime->oss.rw_ref); return -ERESTARTSYS; } err = snd_pcm_oss_make_ready_locked(substream); if (err < 0) goto unlock; format = snd_pcm_oss_format_from(runtime->oss.format); width = snd_pcm_format_physical_width(format); if (runtime->oss.buffer_used > 0) { #ifdef OSS_DEBUG pcm_dbg(substream->pcm, "sync: buffer_used\n"); #endif size = (8 * (runtime->oss.period_bytes - runtime->oss.buffer_used) + 7) / width; snd_pcm_format_set_silence(format, runtime->oss.buffer + runtime->oss.buffer_used, size); err = snd_pcm_oss_sync1(substream, runtime->oss.period_bytes); if (err < 0) goto unlock; } else if (runtime->oss.period_ptr > 0) { #ifdef OSS_DEBUG pcm_dbg(substream->pcm, "sync: period_ptr\n"); #endif size = runtime->oss.period_bytes - runtime->oss.period_ptr; snd_pcm_format_set_silence(format, runtime->oss.buffer, size * 8 / width); err = snd_pcm_oss_sync1(substream, size); if (err < 0) goto unlock; } /* * The ALSA's period might be a bit large than OSS one. * Fill the remain portion of ALSA period with zeros. */ size = runtime->control->appl_ptr % runtime->period_size; if (size > 0) { size = runtime->period_size - size; if (runtime->access == SNDRV_PCM_ACCESS_RW_INTERLEAVED) snd_pcm_lib_write(substream, NULL, size); else if (runtime->access == SNDRV_PCM_ACCESS_RW_NONINTERLEAVED) snd_pcm_lib_writev(substream, NULL, size); } unlock: mutex_unlock(&runtime->oss.params_lock); atomic_dec(&runtime->oss.rw_ref); if (err < 0) return err; /* * finish sync: drain the buffer */ __direct: saved_f_flags = substream->f_flags; substream->f_flags &= ~O_NONBLOCK; err = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_DRAIN, NULL); substream->f_flags = saved_f_flags; if (err < 0) return err; mutex_lock(&runtime->oss.params_lock); runtime->oss.prepare = 1; mutex_unlock(&runtime->oss.params_lock); } substream = pcm_oss_file->streams[SNDRV_PCM_STREAM_CAPTURE]; if (substream != NULL) { err = snd_pcm_oss_make_ready(substream); if (err < 0) return err; runtime = substream->runtime; err = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_DROP, NULL); if (err < 0) return err; mutex_lock(&runtime->oss.params_lock); runtime->oss.buffer_used = 0; runtime->oss.prepare = 1; mutex_unlock(&runtime->oss.params_lock); } return 0; } static int snd_pcm_oss_set_rate(struct snd_pcm_oss_file *pcm_oss_file, int rate) { int idx; for (idx = 1; idx >= 0; --idx) { struct snd_pcm_substream *substream = pcm_oss_file->streams[idx]; struct snd_pcm_runtime *runtime; int err; if (substream == NULL) continue; runtime = substream->runtime; if (rate < 1000) rate = 1000; else if (rate > 192000) rate = 192000; err = lock_params(runtime); if (err < 0) return err; if (runtime->oss.rate != rate) { runtime->oss.params = 1; runtime->oss.rate = rate; } unlock_params(runtime); } return snd_pcm_oss_get_rate(pcm_oss_file); } static int snd_pcm_oss_get_rate(struct snd_pcm_oss_file *pcm_oss_file) { struct snd_pcm_substream *substream; int err; err = snd_pcm_oss_get_active_substream(pcm_oss_file, &substream); if (err < 0) return err; return substream->runtime->oss.rate; } static int snd_pcm_oss_set_channels(struct snd_pcm_oss_file *pcm_oss_file, unsigned int channels) { int idx; if (channels < 1) channels = 1; if (channels > 128) return -EINVAL; for (idx = 1; idx >= 0; --idx) { struct snd_pcm_substream *substream = pcm_oss_file->streams[idx]; struct snd_pcm_runtime *runtime; int err; if (substream == NULL) continue; runtime = substream->runtime; err = lock_params(runtime); if (err < 0) return err; if (runtime->oss.channels != channels) { runtime->oss.params = 1; runtime->oss.channels = channels; } unlock_params(runtime); } return snd_pcm_oss_get_channels(pcm_oss_file); } static int snd_pcm_oss_get_channels(struct snd_pcm_oss_file *pcm_oss_file) { struct snd_pcm_substream *substream; int err; err = snd_pcm_oss_get_active_substream(pcm_oss_file, &substream); if (err < 0) return err; return substream->runtime->oss.channels; } static int snd_pcm_oss_get_block_size(struct snd_pcm_oss_file *pcm_oss_file) { struct snd_pcm_substream *substream; int err; err = snd_pcm_oss_get_active_substream(pcm_oss_file, &substream); if (err < 0) return err; return substream->runtime->oss.period_bytes; } static int snd_pcm_oss_get_formats(struct snd_pcm_oss_file *pcm_oss_file) { struct snd_pcm_substream *substream; int err; int direct; struct snd_pcm_hw_params *params __free(kfree) = NULL; unsigned int formats = 0; const struct snd_mask *format_mask; int fmt; err = snd_pcm_oss_get_active_substream(pcm_oss_file, &substream); if (err < 0) return err; if (atomic_read(&substream->mmap_count)) direct = 1; else direct = substream->oss.setup.direct; if (!direct) return AFMT_MU_LAW | AFMT_U8 | AFMT_S16_LE | AFMT_S16_BE | AFMT_S8 | AFMT_U16_LE | AFMT_U16_BE | AFMT_S32_LE | AFMT_S32_BE | AFMT_S24_LE | AFMT_S24_BE | AFMT_S24_PACKED; params = kmalloc(sizeof(*params), GFP_KERNEL); if (!params) return -ENOMEM; _snd_pcm_hw_params_any(params); err = snd_pcm_hw_refine(substream, params); if (err < 0) return err; format_mask = hw_param_mask_c(params, SNDRV_PCM_HW_PARAM_FORMAT); for (fmt = 0; fmt < 32; ++fmt) { if (snd_mask_test(format_mask, fmt)) { int f = snd_pcm_oss_format_to((__force snd_pcm_format_t)fmt); if (f >= 0) formats |= f; } } return formats; } static int snd_pcm_oss_set_format(struct snd_pcm_oss_file *pcm_oss_file, int format) { int formats, idx; int err; if (format != AFMT_QUERY) { formats = snd_pcm_oss_get_formats(pcm_oss_file); if (formats < 0) return formats; if (!(formats & format)) format = AFMT_U8; for (idx = 1; idx >= 0; --idx) { struct snd_pcm_substream *substream = pcm_oss_file->streams[idx]; struct snd_pcm_runtime *runtime; if (substream == NULL) continue; runtime = substream->runtime; err = lock_params(runtime); if (err < 0) return err; if (runtime->oss.format != format) { runtime->oss.params = 1; runtime->oss.format = format; } unlock_params(runtime); } } return snd_pcm_oss_get_format(pcm_oss_file); } static int snd_pcm_oss_get_format(struct snd_pcm_oss_file *pcm_oss_file) { struct snd_pcm_substream *substream; int err; err = snd_pcm_oss_get_active_substream(pcm_oss_file, &substream); if (err < 0) return err; return substream->runtime->oss.format; } static int snd_pcm_oss_set_subdivide1(struct snd_pcm_substream *substream, int subdivide) { struct snd_pcm_runtime *runtime; runtime = substream->runtime; if (subdivide == 0) { subdivide = runtime->oss.subdivision; if (subdivide == 0) subdivide = 1; return subdivide; } if (runtime->oss.subdivision || runtime->oss.fragshift) return -EINVAL; if (subdivide != 1 && subdivide != 2 && subdivide != 4 && subdivide != 8 && subdivide != 16) return -EINVAL; runtime->oss.subdivision = subdivide; runtime->oss.params = 1; return subdivide; } static int snd_pcm_oss_set_subdivide(struct snd_pcm_oss_file *pcm_oss_file, int subdivide) { int err = -EINVAL, idx; for (idx = 1; idx >= 0; --idx) { struct snd_pcm_substream *substream = pcm_oss_file->streams[idx]; struct snd_pcm_runtime *runtime; if (substream == NULL) continue; runtime = substream->runtime; err = lock_params(runtime); if (err < 0) return err; err = snd_pcm_oss_set_subdivide1(substream, subdivide); unlock_params(runtime); if (err < 0) return err; } return err; } static int snd_pcm_oss_set_fragment1(struct snd_pcm_substream *substream, unsigned int val) { struct snd_pcm_runtime *runtime; int fragshift; runtime = substream->runtime; if (runtime->oss.subdivision || runtime->oss.fragshift) return -EINVAL; fragshift = val & 0xffff; if (fragshift >= 25) /* should be large enough */ return -EINVAL; runtime->oss.fragshift = fragshift; runtime->oss.maxfrags = (val >> 16) & 0xffff; if (runtime->oss.fragshift < 4) /* < 16 */ runtime->oss.fragshift = 4; if (runtime->oss.maxfrags < 2) runtime->oss.maxfrags = 2; runtime->oss.params = 1; return 0; } static int snd_pcm_oss_set_fragment(struct snd_pcm_oss_file *pcm_oss_file, unsigned int val) { int err = -EINVAL, idx; for (idx = 1; idx >= 0; --idx) { struct snd_pcm_substream *substream = pcm_oss_file->streams[idx]; struct snd_pcm_runtime *runtime; if (substream == NULL) continue; runtime = substream->runtime; err = lock_params(runtime); if (err < 0) return err; err = snd_pcm_oss_set_fragment1(substream, val); unlock_params(runtime); if (err < 0) return err; } return err; } static int snd_pcm_oss_nonblock(struct file * file) { guard(spinlock)(&file->f_lock); file->f_flags |= O_NONBLOCK; return 0; } static int snd_pcm_oss_get_caps1(struct snd_pcm_substream *substream, int res) { if (substream == NULL) { res &= ~DSP_CAP_DUPLEX; return res; } #ifdef DSP_CAP_MULTI if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) if (substream->pstr->substream_count > 1) res |= DSP_CAP_MULTI; #endif /* DSP_CAP_REALTIME is set all times: */ /* all ALSA drivers can return actual pointer in ring buffer */ #if defined(DSP_CAP_REALTIME) && 0 { struct snd_pcm_runtime *runtime = substream->runtime; if (runtime->info & (SNDRV_PCM_INFO_BLOCK_TRANSFER|SNDRV_PCM_INFO_BATCH)) res &= ~DSP_CAP_REALTIME; } #endif return res; } static int snd_pcm_oss_get_caps(struct snd_pcm_oss_file *pcm_oss_file) { int result, idx; result = DSP_CAP_TRIGGER | DSP_CAP_MMAP | DSP_CAP_DUPLEX | DSP_CAP_REALTIME; for (idx = 0; idx < 2; idx++) { struct snd_pcm_substream *substream = pcm_oss_file->streams[idx]; result = snd_pcm_oss_get_caps1(substream, result); } result |= 0x0001; /* revision - same as SB AWE 64 */ return result; } static void snd_pcm_oss_simulate_fill(struct snd_pcm_substream *substream, snd_pcm_uframes_t hw_ptr) { struct snd_pcm_runtime *runtime = substream->runtime; snd_pcm_uframes_t appl_ptr; appl_ptr = hw_ptr + runtime->buffer_size; appl_ptr %= runtime->boundary; runtime->control->appl_ptr = appl_ptr; } static int snd_pcm_oss_set_trigger(struct snd_pcm_oss_file *pcm_oss_file, int trigger) { struct snd_pcm_runtime *runtime; struct snd_pcm_substream *psubstream = NULL, *csubstream = NULL; int err, cmd; #ifdef OSS_DEBUG pr_debug("pcm_oss: trigger = 0x%x\n", trigger); #endif psubstream = pcm_oss_file->streams[SNDRV_PCM_STREAM_PLAYBACK]; csubstream = pcm_oss_file->streams[SNDRV_PCM_STREAM_CAPTURE]; if (psubstream) { err = snd_pcm_oss_make_ready(psubstream); if (err < 0) return err; } if (csubstream) { err = snd_pcm_oss_make_ready(csubstream); if (err < 0) return err; } if (psubstream) { runtime = psubstream->runtime; cmd = 0; if (mutex_lock_interruptible(&runtime->oss.params_lock)) return -ERESTARTSYS; if (trigger & PCM_ENABLE_OUTPUT) { if (runtime->oss.trigger) goto _skip1; if (atomic_read(&psubstream->mmap_count)) snd_pcm_oss_simulate_fill(psubstream, get_hw_ptr_period(runtime)); runtime->oss.trigger = 1; runtime->start_threshold = 1; cmd = SNDRV_PCM_IOCTL_START; } else { if (!runtime->oss.trigger) goto _skip1; runtime->oss.trigger = 0; runtime->start_threshold = runtime->boundary; cmd = SNDRV_PCM_IOCTL_DROP; runtime->oss.prepare = 1; } _skip1: mutex_unlock(&runtime->oss.params_lock); if (cmd) { err = snd_pcm_kernel_ioctl(psubstream, cmd, NULL); if (err < 0) return err; } } if (csubstream) { runtime = csubstream->runtime; cmd = 0; if (mutex_lock_interruptible(&runtime->oss.params_lock)) return -ERESTARTSYS; if (trigger & PCM_ENABLE_INPUT) { if (runtime->oss.trigger) goto _skip2; runtime->oss.trigger = 1; runtime->start_threshold = 1; cmd = SNDRV_PCM_IOCTL_START; } else { if (!runtime->oss.trigger) goto _skip2; runtime->oss.trigger = 0; runtime->start_threshold = runtime->boundary; cmd = SNDRV_PCM_IOCTL_DROP; runtime->oss.prepare = 1; } _skip2: mutex_unlock(&runtime->oss.params_lock); if (cmd) { err = snd_pcm_kernel_ioctl(csubstream, cmd, NULL); if (err < 0) return err; } } return 0; } static int snd_pcm_oss_get_trigger(struct snd_pcm_oss_file *pcm_oss_file) { struct snd_pcm_substream *psubstream = NULL, *csubstream = NULL; int result = 0; psubstream = pcm_oss_file->streams[SNDRV_PCM_STREAM_PLAYBACK]; csubstream = pcm_oss_file->streams[SNDRV_PCM_STREAM_CAPTURE]; if (psubstream && psubstream->runtime && psubstream->runtime->oss.trigger) result |= PCM_ENABLE_OUTPUT; if (csubstream && csubstream->runtime && csubstream->runtime->oss.trigger) result |= PCM_ENABLE_INPUT; return result; } static int snd_pcm_oss_get_odelay(struct snd_pcm_oss_file *pcm_oss_file) { struct snd_pcm_substream *substream; struct snd_pcm_runtime *runtime; snd_pcm_sframes_t delay; int err; substream = pcm_oss_file->streams[SNDRV_PCM_STREAM_PLAYBACK]; if (substream == NULL) return -EINVAL; err = snd_pcm_oss_make_ready(substream); if (err < 0) return err; runtime = substream->runtime; if (runtime->oss.params || runtime->oss.prepare) return 0; err = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_DELAY, &delay); if (err == -EPIPE) delay = 0; /* hack for broken OSS applications */ else if (err < 0) return err; return snd_pcm_oss_bytes(substream, delay); } static int snd_pcm_oss_get_ptr(struct snd_pcm_oss_file *pcm_oss_file, int stream, struct count_info __user * _info) { struct snd_pcm_substream *substream; struct snd_pcm_runtime *runtime; snd_pcm_sframes_t delay; int fixup; struct count_info info; int err; if (_info == NULL) return -EFAULT; substream = pcm_oss_file->streams[stream]; if (substream == NULL) return -EINVAL; err = snd_pcm_oss_make_ready(substream); if (err < 0) return err; runtime = substream->runtime; if (runtime->oss.params || runtime->oss.prepare) { memset(&info, 0, sizeof(info)); if (copy_to_user(_info, &info, sizeof(info))) return -EFAULT; return 0; } if (stream == SNDRV_PCM_STREAM_PLAYBACK) { err = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_DELAY, &delay); if (err == -EPIPE || err == -ESTRPIPE || (! err && delay < 0)) { err = 0; delay = 0; fixup = 0; } else { fixup = runtime->oss.buffer_used; } } else { err = snd_pcm_oss_capture_position_fixup(substream, &delay); fixup = -runtime->oss.buffer_used; } if (err < 0) return err; info.ptr = snd_pcm_oss_bytes(substream, runtime->status->hw_ptr % runtime->buffer_size); if (atomic_read(&substream->mmap_count)) { snd_pcm_sframes_t n; delay = get_hw_ptr_period(runtime); n = delay - runtime->oss.prev_hw_ptr_period; if (n < 0) n += runtime->boundary; info.blocks = n / runtime->period_size; runtime->oss.prev_hw_ptr_period = delay; if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) snd_pcm_oss_simulate_fill(substream, delay); info.bytes = snd_pcm_oss_bytes(substream, runtime->status->hw_ptr) & INT_MAX; } else { delay = snd_pcm_oss_bytes(substream, delay); if (stream == SNDRV_PCM_STREAM_PLAYBACK) { if (substream->oss.setup.buggyptr) info.blocks = (runtime->oss.buffer_bytes - delay - fixup) / runtime->oss.period_bytes; else info.blocks = (delay + fixup) / runtime->oss.period_bytes; info.bytes = (runtime->oss.bytes - delay) & INT_MAX; } else { delay += fixup; info.blocks = delay / runtime->oss.period_bytes; info.bytes = (runtime->oss.bytes + delay) & INT_MAX; } } if (copy_to_user(_info, &info, sizeof(info))) return -EFAULT; return 0; } static int snd_pcm_oss_get_space(struct snd_pcm_oss_file *pcm_oss_file, int stream, struct audio_buf_info __user *_info) { struct snd_pcm_substream *substream; struct snd_pcm_runtime *runtime; snd_pcm_sframes_t avail; int fixup; struct audio_buf_info info; int err; if (_info == NULL) return -EFAULT; substream = pcm_oss_file->streams[stream]; if (substream == NULL) return -EINVAL; runtime = substream->runtime; if (runtime->oss.params) { err = snd_pcm_oss_change_params(substream, false); if (err < 0) return err; } info.fragsize = runtime->oss.period_bytes; info.fragstotal = runtime->periods; if (runtime->oss.prepare) { if (stream == SNDRV_PCM_STREAM_PLAYBACK) { info.bytes = runtime->oss.period_bytes * runtime->oss.periods; info.fragments = runtime->oss.periods; } else { info.bytes = 0; info.fragments = 0; } } else { if (stream == SNDRV_PCM_STREAM_PLAYBACK) { err = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_DELAY, &avail); if (err == -EPIPE || err == -ESTRPIPE || (! err && avail < 0)) { avail = runtime->buffer_size; err = 0; fixup = 0; } else { avail = runtime->buffer_size - avail; fixup = -runtime->oss.buffer_used; } } else { err = snd_pcm_oss_capture_position_fixup(substream, &avail); fixup = runtime->oss.buffer_used; } if (err < 0) return err; info.bytes = snd_pcm_oss_bytes(substream, avail) + fixup; info.fragments = info.bytes / runtime->oss.period_bytes; } #ifdef OSS_DEBUG pcm_dbg(substream->pcm, "pcm_oss: space: bytes = %i, fragments = %i, fragstotal = %i, fragsize = %i\n", info.bytes, info.fragments, info.fragstotal, info.fragsize); #endif if (copy_to_user(_info, &info, sizeof(info))) return -EFAULT; return 0; } static int snd_pcm_oss_get_mapbuf(struct snd_pcm_oss_file *pcm_oss_file, int stream, struct buffmem_desc __user * _info) { // it won't be probably implemented // pr_debug("TODO: snd_pcm_oss_get_mapbuf\n"); return -EINVAL; } static const char *strip_task_path(const char *path) { const char *ptr, *ptrl = NULL; for (ptr = path; *ptr; ptr++) { if (*ptr == '/') ptrl = ptr + 1; } return ptrl; } static void snd_pcm_oss_look_for_setup(struct snd_pcm *pcm, int stream, const char *task_name, struct snd_pcm_oss_setup *rsetup) { struct snd_pcm_oss_setup *setup; guard(mutex)(&pcm->streams[stream].oss.setup_mutex); do { for (setup = pcm->streams[stream].oss.setup_list; setup; setup = setup->next) { if (!strcmp(setup->task_name, task_name)) goto out; } } while ((task_name = strip_task_path(task_name)) != NULL); out: if (setup) *rsetup = *setup; } static void snd_pcm_oss_release_substream(struct snd_pcm_substream *substream) { snd_pcm_oss_release_buffers(substream); substream->oss.oss = 0; } static void snd_pcm_oss_init_substream(struct snd_pcm_substream *substream, struct snd_pcm_oss_setup *setup, int minor) { struct snd_pcm_runtime *runtime; substream->oss.oss = 1; substream->oss.setup = *setup; if (setup->nonblock) substream->f_flags |= O_NONBLOCK; else if (setup->block) substream->f_flags &= ~O_NONBLOCK; runtime = substream->runtime; runtime->oss.params = 1; runtime->oss.trigger = 1; runtime->oss.rate = 8000; mutex_init(&runtime->oss.params_lock); switch (SNDRV_MINOR_OSS_DEVICE(minor)) { case SNDRV_MINOR_OSS_PCM_8: runtime->oss.format = AFMT_U8; break; case SNDRV_MINOR_OSS_PCM_16: runtime->oss.format = AFMT_S16_LE; break; default: runtime->oss.format = AFMT_MU_LAW; } runtime->oss.channels = 1; runtime->oss.fragshift = 0; runtime->oss.maxfrags = 0; runtime->oss.subdivision = 0; substream->pcm_release = snd_pcm_oss_release_substream; atomic_set(&runtime->oss.rw_ref, 0); } static int snd_pcm_oss_release_file(struct snd_pcm_oss_file *pcm_oss_file) { int cidx; if (!pcm_oss_file) return 0; for (cidx = 0; cidx < 2; ++cidx) { struct snd_pcm_substream *substream = pcm_oss_file->streams[cidx]; if (substream) snd_pcm_release_substream(substream); } kfree(pcm_oss_file); return 0; } static int snd_pcm_oss_open_file(struct file *file, struct snd_pcm *pcm, struct snd_pcm_oss_file **rpcm_oss_file, int minor, struct snd_pcm_oss_setup *setup) { int idx, err; struct snd_pcm_oss_file *pcm_oss_file; struct snd_pcm_substream *substream; fmode_t f_mode = file->f_mode; if (rpcm_oss_file) *rpcm_oss_file = NULL; pcm_oss_file = kzalloc(sizeof(*pcm_oss_file), GFP_KERNEL); if (pcm_oss_file == NULL) return -ENOMEM; if ((f_mode & (FMODE_WRITE|FMODE_READ)) == (FMODE_WRITE|FMODE_READ) && (pcm->info_flags & SNDRV_PCM_INFO_HALF_DUPLEX)) f_mode = FMODE_WRITE; file->f_flags &= ~O_APPEND; for (idx = 0; idx < 2; idx++) { if (setup[idx].disable) continue; if (! pcm->streams[idx].substream_count) continue; /* no matching substream */ if (idx == SNDRV_PCM_STREAM_PLAYBACK) { if (! (f_mode & FMODE_WRITE)) continue; } else { if (! (f_mode & FMODE_READ)) continue; } err = snd_pcm_open_substream(pcm, idx, file, &substream); if (err < 0) { snd_pcm_oss_release_file(pcm_oss_file); return err; } pcm_oss_file->streams[idx] = substream; snd_pcm_oss_init_substream(substream, &setup[idx], minor); } if (!pcm_oss_file->streams[0] && !pcm_oss_file->streams[1]) { snd_pcm_oss_release_file(pcm_oss_file); return -EINVAL; } file->private_data = pcm_oss_file; if (rpcm_oss_file) *rpcm_oss_file = pcm_oss_file; return 0; } static int snd_task_name(struct task_struct *task, char *name, size_t size) { unsigned int idx; if (snd_BUG_ON(!task || !name || size < 2)) return -EINVAL; for (idx = 0; idx < sizeof(task->comm) && idx + 1 < size; idx++) name[idx] = task->comm[idx]; name[idx] = '\0'; return 0; } static int snd_pcm_oss_open(struct inode *inode, struct file *file) { int err; char task_name[32]; struct snd_pcm *pcm; struct snd_pcm_oss_file *pcm_oss_file; struct snd_pcm_oss_setup setup[2]; int nonblock; wait_queue_entry_t wait; err = nonseekable_open(inode, file); if (err < 0) return err; pcm = snd_lookup_oss_minor_data(iminor(inode), SNDRV_OSS_DEVICE_TYPE_PCM); if (pcm == NULL) { err = -ENODEV; goto __error1; } err = snd_card_file_add(pcm->card, file); if (err < 0) goto __error1; if (!try_module_get(pcm->card->module)) { err = -EFAULT; goto __error2; } if (snd_task_name(current, task_name, sizeof(task_name)) < 0) { err = -EFAULT; goto __error; } memset(setup, 0, sizeof(setup)); if (file->f_mode & FMODE_WRITE) snd_pcm_oss_look_for_setup(pcm, SNDRV_PCM_STREAM_PLAYBACK, task_name, &setup[0]); if (file->f_mode & FMODE_READ) snd_pcm_oss_look_for_setup(pcm, SNDRV_PCM_STREAM_CAPTURE, task_name, &setup[1]); nonblock = !!(file->f_flags & O_NONBLOCK); if (!nonblock) nonblock = nonblock_open; init_waitqueue_entry(&wait, current); add_wait_queue(&pcm->open_wait, &wait); mutex_lock(&pcm->open_mutex); while (1) { err = snd_pcm_oss_open_file(file, pcm, &pcm_oss_file, iminor(inode), setup); if (err >= 0) break; if (err == -EAGAIN) { if (nonblock) { err = -EBUSY; break; } } else break; set_current_state(TASK_INTERRUPTIBLE); mutex_unlock(&pcm->open_mutex); schedule(); mutex_lock(&pcm->open_mutex); if (pcm->card->shutdown) { err = -ENODEV; break; } if (signal_pending(current)) { err = -ERESTARTSYS; break; } } remove_wait_queue(&pcm->open_wait, &wait); mutex_unlock(&pcm->open_mutex); if (err < 0) goto __error; snd_card_unref(pcm->card); return err; __error: module_put(pcm->card->module); __error2: snd_card_file_remove(pcm->card, file); __error1: if (pcm) snd_card_unref(pcm->card); return err; } static int snd_pcm_oss_release(struct inode *inode, struct file *file) { struct snd_pcm *pcm; struct snd_pcm_substream *substream; struct snd_pcm_oss_file *pcm_oss_file; pcm_oss_file = file->private_data; substream = pcm_oss_file->streams[SNDRV_PCM_STREAM_PLAYBACK]; if (substream == NULL) substream = pcm_oss_file->streams[SNDRV_PCM_STREAM_CAPTURE]; if (snd_BUG_ON(!substream)) return -ENXIO; pcm = substream->pcm; if (!pcm->card->shutdown) snd_pcm_oss_sync(pcm_oss_file); mutex_lock(&pcm->open_mutex); snd_pcm_oss_release_file(pcm_oss_file); mutex_unlock(&pcm->open_mutex); wake_up(&pcm->open_wait); module_put(pcm->card->module); snd_card_file_remove(pcm->card, file); return 0; } static long snd_pcm_oss_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct snd_pcm_oss_file *pcm_oss_file; int __user *p = (int __user *)arg; int res; pcm_oss_file = file->private_data; if (cmd == OSS_GETVERSION) return put_user(SNDRV_OSS_VERSION, p); if (cmd == OSS_ALSAEMULVER) return put_user(1, p); #if IS_REACHABLE(CONFIG_SND_MIXER_OSS) if (((cmd >> 8) & 0xff) == 'M') { /* mixer ioctl - for OSS compatibility */ struct snd_pcm_substream *substream; int idx; for (idx = 0; idx < 2; ++idx) { substream = pcm_oss_file->streams[idx]; if (substream != NULL) break; } if (snd_BUG_ON(idx >= 2)) return -ENXIO; return snd_mixer_oss_ioctl_card(substream->pcm->card, cmd, arg); } #endif if (((cmd >> 8) & 0xff) != 'P') return -EINVAL; #ifdef OSS_DEBUG pr_debug("pcm_oss: ioctl = 0x%x\n", cmd); #endif switch (cmd) { case SNDCTL_DSP_RESET: return snd_pcm_oss_reset(pcm_oss_file); case SNDCTL_DSP_SYNC: return snd_pcm_oss_sync(pcm_oss_file); case SNDCTL_DSP_SPEED: if (get_user(res, p)) return -EFAULT; res = snd_pcm_oss_set_rate(pcm_oss_file, res); if (res < 0) return res; return put_user(res, p); case SOUND_PCM_READ_RATE: res = snd_pcm_oss_get_rate(pcm_oss_file); if (res < 0) return res; return put_user(res, p); case SNDCTL_DSP_STEREO: if (get_user(res, p)) return -EFAULT; res = res > 0 ? 2 : 1; res = snd_pcm_oss_set_channels(pcm_oss_file, res); if (res < 0) return res; return put_user(--res, p); case SNDCTL_DSP_GETBLKSIZE: res = snd_pcm_oss_get_block_size(pcm_oss_file); if (res < 0) return res; return put_user(res, p); case SNDCTL_DSP_SETFMT: if (get_user(res, p)) return -EFAULT; res = snd_pcm_oss_set_format(pcm_oss_file, res); if (res < 0) return res; return put_user(res, p); case SOUND_PCM_READ_BITS: res = snd_pcm_oss_get_format(pcm_oss_file); if (res < 0) return res; return put_user(res, p); case SNDCTL_DSP_CHANNELS: if (get_user(res, p)) return -EFAULT; res = snd_pcm_oss_set_channels(pcm_oss_file, res); if (res < 0) return res; return put_user(res, p); case SOUND_PCM_READ_CHANNELS: res = snd_pcm_oss_get_channels(pcm_oss_file); if (res < 0) return res; return put_user(res, p); case SOUND_PCM_WRITE_FILTER: case SOUND_PCM_READ_FILTER: return -EIO; case SNDCTL_DSP_POST: return snd_pcm_oss_post(pcm_oss_file); case SNDCTL_DSP_SUBDIVIDE: if (get_user(res, p)) return -EFAULT; res = snd_pcm_oss_set_subdivide(pcm_oss_file, res); if (res < 0) return res; return put_user(res, p); case SNDCTL_DSP_SETFRAGMENT: if (get_user(res, p)) return -EFAULT; return snd_pcm_oss_set_fragment(pcm_oss_file, res); case SNDCTL_DSP_GETFMTS: res = snd_pcm_oss_get_formats(pcm_oss_file); if (res < 0) return res; return put_user(res, p); case SNDCTL_DSP_GETOSPACE: case SNDCTL_DSP_GETISPACE: return snd_pcm_oss_get_space(pcm_oss_file, cmd == SNDCTL_DSP_GETISPACE ? SNDRV_PCM_STREAM_CAPTURE : SNDRV_PCM_STREAM_PLAYBACK, (struct audio_buf_info __user *) arg); case SNDCTL_DSP_NONBLOCK: return snd_pcm_oss_nonblock(file); case SNDCTL_DSP_GETCAPS: res = snd_pcm_oss_get_caps(pcm_oss_file); if (res < 0) return res; return put_user(res, p); case SNDCTL_DSP_GETTRIGGER: res = snd_pcm_oss_get_trigger(pcm_oss_file); if (res < 0) return res; return put_user(res, p); case SNDCTL_DSP_SETTRIGGER: if (get_user(res, p)) return -EFAULT; return snd_pcm_oss_set_trigger(pcm_oss_file, res); case SNDCTL_DSP_GETIPTR: case SNDCTL_DSP_GETOPTR: return snd_pcm_oss_get_ptr(pcm_oss_file, cmd == SNDCTL_DSP_GETIPTR ? SNDRV_PCM_STREAM_CAPTURE : SNDRV_PCM_STREAM_PLAYBACK, (struct count_info __user *) arg); case SNDCTL_DSP_MAPINBUF: case SNDCTL_DSP_MAPOUTBUF: return snd_pcm_oss_get_mapbuf(pcm_oss_file, cmd == SNDCTL_DSP_MAPINBUF ? SNDRV_PCM_STREAM_CAPTURE : SNDRV_PCM_STREAM_PLAYBACK, (struct buffmem_desc __user *) arg); case SNDCTL_DSP_SETSYNCRO: /* stop DMA now.. */ return 0; case SNDCTL_DSP_SETDUPLEX: if (snd_pcm_oss_get_caps(pcm_oss_file) & DSP_CAP_DUPLEX) return 0; return -EIO; case SNDCTL_DSP_GETODELAY: res = snd_pcm_oss_get_odelay(pcm_oss_file); if (res < 0) { /* it's for sure, some broken apps don't check for error codes */ put_user(0, p); return res; } return put_user(res, p); case SNDCTL_DSP_PROFILE: return 0; /* silently ignore */ default: pr_debug("pcm_oss: unknown command = 0x%x\n", cmd); } return -EINVAL; } #ifdef CONFIG_COMPAT /* all compatible */ static long snd_pcm_oss_ioctl_compat(struct file *file, unsigned int cmd, unsigned long arg) { /* * Everything is compatbile except SNDCTL_DSP_MAPINBUF/SNDCTL_DSP_MAPOUTBUF, * which are not implemented for the native case either */ return snd_pcm_oss_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); } #else #define snd_pcm_oss_ioctl_compat NULL #endif static ssize_t snd_pcm_oss_read(struct file *file, char __user *buf, size_t count, loff_t *offset) { struct snd_pcm_oss_file *pcm_oss_file; struct snd_pcm_substream *substream; pcm_oss_file = file->private_data; substream = pcm_oss_file->streams[SNDRV_PCM_STREAM_CAPTURE]; if (substream == NULL) return -ENXIO; substream->f_flags = file->f_flags & O_NONBLOCK; #ifndef OSS_DEBUG return snd_pcm_oss_read1(substream, buf, count); #else { ssize_t res = snd_pcm_oss_read1(substream, buf, count); pcm_dbg(substream->pcm, "pcm_oss: read %li bytes (returned %li bytes)\n", (long)count, (long)res); return res; } #endif } static ssize_t snd_pcm_oss_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { struct snd_pcm_oss_file *pcm_oss_file; struct snd_pcm_substream *substream; long result; pcm_oss_file = file->private_data; substream = pcm_oss_file->streams[SNDRV_PCM_STREAM_PLAYBACK]; if (substream == NULL) return -ENXIO; substream->f_flags = file->f_flags & O_NONBLOCK; result = snd_pcm_oss_write1(substream, buf, count); #ifdef OSS_DEBUG pcm_dbg(substream->pcm, "pcm_oss: write %li bytes (wrote %li bytes)\n", (long)count, (long)result); #endif return result; } static int snd_pcm_oss_playback_ready(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; if (atomic_read(&substream->mmap_count)) return runtime->oss.prev_hw_ptr_period != get_hw_ptr_period(runtime); else return snd_pcm_playback_avail(runtime) >= runtime->oss.period_frames; } static int snd_pcm_oss_capture_ready(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; if (atomic_read(&substream->mmap_count)) return runtime->oss.prev_hw_ptr_period != get_hw_ptr_period(runtime); else return snd_pcm_capture_avail(runtime) >= runtime->oss.period_frames; } static __poll_t snd_pcm_oss_poll(struct file *file, poll_table * wait) { struct snd_pcm_oss_file *pcm_oss_file; __poll_t mask; struct snd_pcm_substream *psubstream = NULL, *csubstream = NULL; pcm_oss_file = file->private_data; psubstream = pcm_oss_file->streams[SNDRV_PCM_STREAM_PLAYBACK]; csubstream = pcm_oss_file->streams[SNDRV_PCM_STREAM_CAPTURE]; mask = 0; if (psubstream != NULL) { struct snd_pcm_runtime *runtime = psubstream->runtime; poll_wait(file, &runtime->sleep, wait); scoped_guard(pcm_stream_lock_irq, psubstream) { if (runtime->state != SNDRV_PCM_STATE_DRAINING && (runtime->state != SNDRV_PCM_STATE_RUNNING || snd_pcm_oss_playback_ready(psubstream))) mask |= EPOLLOUT | EPOLLWRNORM; } } if (csubstream != NULL) { struct snd_pcm_runtime *runtime = csubstream->runtime; snd_pcm_state_t ostate; poll_wait(file, &runtime->sleep, wait); scoped_guard(pcm_stream_lock_irq, csubstream) { ostate = runtime->state; if (ostate != SNDRV_PCM_STATE_RUNNING || snd_pcm_oss_capture_ready(csubstream)) mask |= EPOLLIN | EPOLLRDNORM; } if (ostate != SNDRV_PCM_STATE_RUNNING && runtime->oss.trigger) { struct snd_pcm_oss_file ofile; memset(&ofile, 0, sizeof(ofile)); ofile.streams[SNDRV_PCM_STREAM_CAPTURE] = pcm_oss_file->streams[SNDRV_PCM_STREAM_CAPTURE]; runtime->oss.trigger = 0; snd_pcm_oss_set_trigger(&ofile, PCM_ENABLE_INPUT); } } return mask; } static int snd_pcm_oss_mmap(struct file *file, struct vm_area_struct *area) { struct snd_pcm_oss_file *pcm_oss_file; struct snd_pcm_substream *substream = NULL; struct snd_pcm_runtime *runtime; int err; #ifdef OSS_DEBUG pr_debug("pcm_oss: mmap begin\n"); #endif pcm_oss_file = file->private_data; switch ((area->vm_flags & (VM_READ | VM_WRITE))) { case VM_READ | VM_WRITE: substream = pcm_oss_file->streams[SNDRV_PCM_STREAM_PLAYBACK]; if (substream) break; fallthrough; case VM_READ: substream = pcm_oss_file->streams[SNDRV_PCM_STREAM_CAPTURE]; break; case VM_WRITE: substream = pcm_oss_file->streams[SNDRV_PCM_STREAM_PLAYBACK]; break; default: return -EINVAL; } /* set VM_READ access as well to fix memset() routines that do reads before writes (to improve performance) */ vm_flags_set(area, VM_READ); if (substream == NULL) return -ENXIO; runtime = substream->runtime; if (!(runtime->info & SNDRV_PCM_INFO_MMAP_VALID)) return -EIO; if (runtime->info & SNDRV_PCM_INFO_INTERLEAVED) runtime->access = SNDRV_PCM_ACCESS_MMAP_INTERLEAVED; else return -EIO; if (runtime->oss.params) { /* use mutex_trylock() for params_lock for avoiding a deadlock * between mmap_lock and params_lock taken by * copy_from/to_user() in snd_pcm_oss_write/read() */ err = snd_pcm_oss_change_params(substream, true); if (err < 0) return err; } #ifdef CONFIG_SND_PCM_OSS_PLUGINS if (runtime->oss.plugin_first != NULL) return -EIO; #endif if (area->vm_pgoff != 0) return -EINVAL; err = snd_pcm_mmap_data(substream, file, area); if (err < 0) return err; runtime->oss.mmap_bytes = area->vm_end - area->vm_start; runtime->silence_threshold = 0; runtime->silence_size = 0; #ifdef OSS_DEBUG pr_debug("pcm_oss: mmap ok, bytes = 0x%x\n", runtime->oss.mmap_bytes); #endif /* In mmap mode we never stop */ runtime->stop_threshold = runtime->boundary; return 0; } #ifdef CONFIG_SND_VERBOSE_PROCFS /* * /proc interface */ static void snd_pcm_oss_proc_read(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { struct snd_pcm_str *pstr = entry->private_data; struct snd_pcm_oss_setup *setup = pstr->oss.setup_list; guard(mutex)(&pstr->oss.setup_mutex); while (setup) { snd_iprintf(buffer, "%s %u %u%s%s%s%s%s%s\n", setup->task_name, setup->periods, setup->period_size, setup->disable ? " disable" : "", setup->direct ? " direct" : "", setup->block ? " block" : "", setup->nonblock ? " non-block" : "", setup->partialfrag ? " partial-frag" : "", setup->nosilence ? " no-silence" : ""); setup = setup->next; } } static void snd_pcm_oss_proc_free_setup_list(struct snd_pcm_str * pstr) { struct snd_pcm_oss_setup *setup, *setupn; for (setup = pstr->oss.setup_list, pstr->oss.setup_list = NULL; setup; setup = setupn) { setupn = setup->next; kfree(setup->task_name); kfree(setup); } pstr->oss.setup_list = NULL; } static void snd_pcm_oss_proc_write(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { struct snd_pcm_str *pstr = entry->private_data; char line[128], str[32], task_name[32]; const char *ptr; int idx1; struct snd_pcm_oss_setup *setup, *setup1, template; while (!snd_info_get_line(buffer, line, sizeof(line))) { guard(mutex)(&pstr->oss.setup_mutex); memset(&template, 0, sizeof(template)); ptr = snd_info_get_str(task_name, line, sizeof(task_name)); if (!strcmp(task_name, "clear") || !strcmp(task_name, "erase")) { snd_pcm_oss_proc_free_setup_list(pstr); continue; } for (setup = pstr->oss.setup_list; setup; setup = setup->next) { if (!strcmp(setup->task_name, task_name)) { template = *setup; break; } } ptr = snd_info_get_str(str, ptr, sizeof(str)); template.periods = simple_strtoul(str, NULL, 10); ptr = snd_info_get_str(str, ptr, sizeof(str)); template.period_size = simple_strtoul(str, NULL, 10); for (idx1 = 31; idx1 >= 0; idx1--) if (template.period_size & (1 << idx1)) break; for (idx1--; idx1 >= 0; idx1--) template.period_size &= ~(1 << idx1); do { ptr = snd_info_get_str(str, ptr, sizeof(str)); if (!strcmp(str, "disable")) { template.disable = 1; } else if (!strcmp(str, "direct")) { template.direct = 1; } else if (!strcmp(str, "block")) { template.block = 1; } else if (!strcmp(str, "non-block")) { template.nonblock = 1; } else if (!strcmp(str, "partial-frag")) { template.partialfrag = 1; } else if (!strcmp(str, "no-silence")) { template.nosilence = 1; } else if (!strcmp(str, "buggy-ptr")) { template.buggyptr = 1; } } while (*str); if (setup == NULL) { setup = kmalloc(sizeof(*setup), GFP_KERNEL); if (! setup) { buffer->error = -ENOMEM; return; } if (pstr->oss.setup_list == NULL) pstr->oss.setup_list = setup; else { for (setup1 = pstr->oss.setup_list; setup1->next; setup1 = setup1->next); setup1->next = setup; } template.task_name = kstrdup(task_name, GFP_KERNEL); if (! template.task_name) { kfree(setup); buffer->error = -ENOMEM; return; } } *setup = template; } } static void snd_pcm_oss_proc_init(struct snd_pcm *pcm) { int stream; for (stream = 0; stream < 2; ++stream) { struct snd_info_entry *entry; struct snd_pcm_str *pstr = &pcm->streams[stream]; if (pstr->substream_count == 0) continue; entry = snd_info_create_card_entry(pcm->card, "oss", pstr->proc_root); if (entry) { entry->content = SNDRV_INFO_CONTENT_TEXT; entry->mode = S_IFREG | 0644; entry->c.text.read = snd_pcm_oss_proc_read; entry->c.text.write = snd_pcm_oss_proc_write; entry->private_data = pstr; if (snd_info_register(entry) < 0) { snd_info_free_entry(entry); entry = NULL; } } pstr->oss.proc_entry = entry; } } static void snd_pcm_oss_proc_done(struct snd_pcm *pcm) { int stream; for (stream = 0; stream < 2; ++stream) { struct snd_pcm_str *pstr = &pcm->streams[stream]; snd_info_free_entry(pstr->oss.proc_entry); pstr->oss.proc_entry = NULL; snd_pcm_oss_proc_free_setup_list(pstr); } } #else /* !CONFIG_SND_VERBOSE_PROCFS */ static inline void snd_pcm_oss_proc_init(struct snd_pcm *pcm) { } static inline void snd_pcm_oss_proc_done(struct snd_pcm *pcm) { } #endif /* CONFIG_SND_VERBOSE_PROCFS */ /* * ENTRY functions */ static const struct file_operations snd_pcm_oss_f_reg = { .owner = THIS_MODULE, .read = snd_pcm_oss_read, .write = snd_pcm_oss_write, .open = snd_pcm_oss_open, .release = snd_pcm_oss_release, .poll = snd_pcm_oss_poll, .unlocked_ioctl = snd_pcm_oss_ioctl, .compat_ioctl = snd_pcm_oss_ioctl_compat, .mmap = snd_pcm_oss_mmap, }; static void register_oss_dsp(struct snd_pcm *pcm, int index) { if (snd_register_oss_device(SNDRV_OSS_DEVICE_TYPE_PCM, pcm->card, index, &snd_pcm_oss_f_reg, pcm) < 0) { pcm_err(pcm, "unable to register OSS PCM device %i:%i\n", pcm->card->number, pcm->device); } } static int snd_pcm_oss_register_minor(struct snd_pcm *pcm) { pcm->oss.reg = 0; if (dsp_map[pcm->card->number] == (int)pcm->device) { char name[128]; int duplex; register_oss_dsp(pcm, 0); duplex = (pcm->streams[SNDRV_PCM_STREAM_PLAYBACK].substream_count > 0 && pcm->streams[SNDRV_PCM_STREAM_CAPTURE].substream_count && !(pcm->info_flags & SNDRV_PCM_INFO_HALF_DUPLEX)); sprintf(name, "%s%s", pcm->name, duplex ? " (DUPLEX)" : ""); #ifdef SNDRV_OSS_INFO_DEV_AUDIO snd_oss_info_register(SNDRV_OSS_INFO_DEV_AUDIO, pcm->card->number, name); #endif pcm->oss.reg++; pcm->oss.reg_mask |= 1; } if (adsp_map[pcm->card->number] == (int)pcm->device) { register_oss_dsp(pcm, 1); pcm->oss.reg++; pcm->oss.reg_mask |= 2; } if (pcm->oss.reg) snd_pcm_oss_proc_init(pcm); return 0; } static int snd_pcm_oss_disconnect_minor(struct snd_pcm *pcm) { if (pcm->oss.reg) { if (pcm->oss.reg_mask & 1) { pcm->oss.reg_mask &= ~1; snd_unregister_oss_device(SNDRV_OSS_DEVICE_TYPE_PCM, pcm->card, 0); } if (pcm->oss.reg_mask & 2) { pcm->oss.reg_mask &= ~2; snd_unregister_oss_device(SNDRV_OSS_DEVICE_TYPE_PCM, pcm->card, 1); } if (dsp_map[pcm->card->number] == (int)pcm->device) { #ifdef SNDRV_OSS_INFO_DEV_AUDIO snd_oss_info_unregister(SNDRV_OSS_INFO_DEV_AUDIO, pcm->card->number); #endif } pcm->oss.reg = 0; } return 0; } static int snd_pcm_oss_unregister_minor(struct snd_pcm *pcm) { snd_pcm_oss_disconnect_minor(pcm); snd_pcm_oss_proc_done(pcm); return 0; } static struct snd_pcm_notify snd_pcm_oss_notify = { .n_register = snd_pcm_oss_register_minor, .n_disconnect = snd_pcm_oss_disconnect_minor, .n_unregister = snd_pcm_oss_unregister_minor, }; static int __init alsa_pcm_oss_init(void) { int i; int err; /* check device map table */ for (i = 0; i < SNDRV_CARDS; i++) { if (dsp_map[i] < 0 || dsp_map[i] >= SNDRV_PCM_DEVICES) { pr_err("ALSA: pcm_oss: invalid dsp_map[%d] = %d\n", i, dsp_map[i]); dsp_map[i] = 0; } if (adsp_map[i] < 0 || adsp_map[i] >= SNDRV_PCM_DEVICES) { pr_err("ALSA: pcm_oss: invalid adsp_map[%d] = %d\n", i, adsp_map[i]); adsp_map[i] = 1; } } err = snd_pcm_notify(&snd_pcm_oss_notify, 0); if (err < 0) return err; return 0; } static void __exit alsa_pcm_oss_exit(void) { snd_pcm_notify(&snd_pcm_oss_notify, 1); } module_init(alsa_pcm_oss_init) module_exit(alsa_pcm_oss_exit) |
| 2 1 1 5 1 1 1 2 1 4 5 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 | // SPDX-License-Identifier: GPL-2.0+ /* * Driver for SCM Microsystems (a.k.a. Shuttle) USB-ATAPI cable * * Current development and maintenance by: * (c) 2000, 2001 Robert Baruch (autophile@starband.net) * (c) 2004, 2005 Daniel Drake <dsd@gentoo.org> * * Developed with the assistance of: * (c) 2002 Alan Stern <stern@rowland.org> * * Flash support based on earlier work by: * (c) 2002 Thomas Kreiling <usbdev@sm04.de> * * Many originally ATAPI devices were slightly modified to meet the USB * market by using some kind of translation from ATAPI to USB on the host, * and the peripheral would translate from USB back to ATAPI. * * SCM Microsystems (www.scmmicro.com) makes a device, sold to OEM's only, * which does the USB-to-ATAPI conversion. By obtaining the data sheet on * their device under nondisclosure agreement, I have been able to write * this driver for Linux. * * The chip used in the device can also be used for EPP and ISA translation * as well. This driver is only guaranteed to work with the ATAPI * translation. * * See the Kconfig help text for a list of devices known to be supported by * this driver. */ #include <linux/errno.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/string_choices.h> #include <linux/cdrom.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include "usb.h" #include "transport.h" #include "protocol.h" #include "debug.h" #include "scsiglue.h" #define DRV_NAME "ums-usbat" MODULE_DESCRIPTION("Driver for SCM Microsystems (a.k.a. Shuttle) USB-ATAPI cable"); MODULE_AUTHOR("Daniel Drake <dsd@gentoo.org>, Robert Baruch <autophile@starband.net>"); MODULE_LICENSE("GPL"); MODULE_IMPORT_NS("USB_STORAGE"); /* Supported device types */ #define USBAT_DEV_HP8200 0x01 #define USBAT_DEV_FLASH 0x02 #define USBAT_EPP_PORT 0x10 #define USBAT_EPP_REGISTER 0x30 #define USBAT_ATA 0x40 #define USBAT_ISA 0x50 /* Commands (need to be logically OR'd with an access type */ #define USBAT_CMD_READ_REG 0x00 #define USBAT_CMD_WRITE_REG 0x01 #define USBAT_CMD_READ_BLOCK 0x02 #define USBAT_CMD_WRITE_BLOCK 0x03 #define USBAT_CMD_COND_READ_BLOCK 0x04 #define USBAT_CMD_COND_WRITE_BLOCK 0x05 #define USBAT_CMD_WRITE_REGS 0x07 /* Commands (these don't need an access type) */ #define USBAT_CMD_EXEC_CMD 0x80 #define USBAT_CMD_SET_FEAT 0x81 #define USBAT_CMD_UIO 0x82 /* Methods of accessing UIO register */ #define USBAT_UIO_READ 1 #define USBAT_UIO_WRITE 0 /* Qualifier bits */ #define USBAT_QUAL_FCQ 0x20 /* full compare */ #define USBAT_QUAL_ALQ 0x10 /* auto load subcount */ /* USBAT Flash Media status types */ #define USBAT_FLASH_MEDIA_NONE 0 #define USBAT_FLASH_MEDIA_CF 1 /* USBAT Flash Media change types */ #define USBAT_FLASH_MEDIA_SAME 0 #define USBAT_FLASH_MEDIA_CHANGED 1 /* USBAT ATA registers */ #define USBAT_ATA_DATA 0x10 /* read/write data (R/W) */ #define USBAT_ATA_FEATURES 0x11 /* set features (W) */ #define USBAT_ATA_ERROR 0x11 /* error (R) */ #define USBAT_ATA_SECCNT 0x12 /* sector count (R/W) */ #define USBAT_ATA_SECNUM 0x13 /* sector number (R/W) */ #define USBAT_ATA_LBA_ME 0x14 /* cylinder low (R/W) */ #define USBAT_ATA_LBA_HI 0x15 /* cylinder high (R/W) */ #define USBAT_ATA_DEVICE 0x16 /* head/device selection (R/W) */ #define USBAT_ATA_STATUS 0x17 /* device status (R) */ #define USBAT_ATA_CMD 0x17 /* device command (W) */ #define USBAT_ATA_ALTSTATUS 0x0E /* status (no clear IRQ) (R) */ /* USBAT User I/O Data registers */ #define USBAT_UIO_EPAD 0x80 /* Enable Peripheral Control Signals */ #define USBAT_UIO_CDT 0x40 /* Card Detect (Read Only) */ /* CDT = ACKD & !UI1 & !UI0 */ #define USBAT_UIO_1 0x20 /* I/O 1 */ #define USBAT_UIO_0 0x10 /* I/O 0 */ #define USBAT_UIO_EPP_ATA 0x08 /* 1=EPP mode, 0=ATA mode */ #define USBAT_UIO_UI1 0x04 /* Input 1 */ #define USBAT_UIO_UI0 0x02 /* Input 0 */ #define USBAT_UIO_INTR_ACK 0x01 /* Interrupt (ATA/ISA)/Acknowledge (EPP) */ /* USBAT User I/O Enable registers */ #define USBAT_UIO_DRVRST 0x80 /* Reset Peripheral */ #define USBAT_UIO_ACKD 0x40 /* Enable Card Detect */ #define USBAT_UIO_OE1 0x20 /* I/O 1 set=output/clr=input */ /* If ACKD=1, set OE1 to 1 also. */ #define USBAT_UIO_OE0 0x10 /* I/O 0 set=output/clr=input */ #define USBAT_UIO_ADPRST 0x01 /* Reset SCM chip */ /* USBAT Features */ #define USBAT_FEAT_ETEN 0x80 /* External trigger enable */ #define USBAT_FEAT_U1 0x08 #define USBAT_FEAT_U0 0x04 #define USBAT_FEAT_ET1 0x02 #define USBAT_FEAT_ET2 0x01 struct usbat_info { int devicetype; /* Used for Flash readers only */ unsigned long sectors; /* total sector count */ unsigned long ssize; /* sector size in bytes */ unsigned char sense_key; unsigned long sense_asc; /* additional sense code */ unsigned long sense_ascq; /* additional sense code qualifier */ }; #define short_pack(LSB,MSB) ( ((u16)(LSB)) | ( ((u16)(MSB))<<8 ) ) #define LSB_of(s) ((s)&0xFF) #define MSB_of(s) ((s)>>8) static int transferred = 0; static int usbat_flash_transport(struct scsi_cmnd * srb, struct us_data *us); static int usbat_hp8200e_transport(struct scsi_cmnd *srb, struct us_data *us); static int init_usbat_cd(struct us_data *us); static int init_usbat_flash(struct us_data *us); /* * The table of devices */ #define UNUSUAL_DEV(id_vendor, id_product, bcdDeviceMin, bcdDeviceMax, \ vendorName, productName, useProtocol, useTransport, \ initFunction, flags) \ { USB_DEVICE_VER(id_vendor, id_product, bcdDeviceMin, bcdDeviceMax), \ .driver_info = (flags) } static const struct usb_device_id usbat_usb_ids[] = { # include "unusual_usbat.h" { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, usbat_usb_ids); #undef UNUSUAL_DEV /* * The flags table */ #define UNUSUAL_DEV(idVendor, idProduct, bcdDeviceMin, bcdDeviceMax, \ vendor_name, product_name, use_protocol, use_transport, \ init_function, Flags) \ { \ .vendorName = vendor_name, \ .productName = product_name, \ .useProtocol = use_protocol, \ .useTransport = use_transport, \ .initFunction = init_function, \ } static const struct us_unusual_dev usbat_unusual_dev_list[] = { # include "unusual_usbat.h" { } /* Terminating entry */ }; #undef UNUSUAL_DEV /* * Convenience function to produce an ATA read/write sectors command * Use cmd=0x20 for read, cmd=0x30 for write */ static void usbat_pack_ata_sector_cmd(unsigned char *buf, unsigned char thistime, u32 sector, unsigned char cmd) { buf[0] = 0; buf[1] = thistime; buf[2] = sector & 0xFF; buf[3] = (sector >> 8) & 0xFF; buf[4] = (sector >> 16) & 0xFF; buf[5] = 0xE0 | ((sector >> 24) & 0x0F); buf[6] = cmd; } /* * Convenience function to get the device type (flash or hp8200) */ static int usbat_get_device_type(struct us_data *us) { return ((struct usbat_info*)us->extra)->devicetype; } /* * Read a register from the device */ static int usbat_read(struct us_data *us, unsigned char access, unsigned char reg, unsigned char *content) { return usb_stor_ctrl_transfer(us, us->recv_ctrl_pipe, access | USBAT_CMD_READ_REG, 0xC0, (u16)reg, 0, content, 1); } /* * Write to a register on the device */ static int usbat_write(struct us_data *us, unsigned char access, unsigned char reg, unsigned char content) { return usb_stor_ctrl_transfer(us, us->send_ctrl_pipe, access | USBAT_CMD_WRITE_REG, 0x40, short_pack(reg, content), 0, NULL, 0); } /* * Convenience function to perform a bulk read */ static int usbat_bulk_read(struct us_data *us, void* buf, unsigned int len, int use_sg) { if (len == 0) return USB_STOR_XFER_GOOD; usb_stor_dbg(us, "len = %d\n", len); return usb_stor_bulk_transfer_sg(us, us->recv_bulk_pipe, buf, len, use_sg, NULL); } /* * Convenience function to perform a bulk write */ static int usbat_bulk_write(struct us_data *us, void* buf, unsigned int len, int use_sg) { if (len == 0) return USB_STOR_XFER_GOOD; usb_stor_dbg(us, "len = %d\n", len); return usb_stor_bulk_transfer_sg(us, us->send_bulk_pipe, buf, len, use_sg, NULL); } /* * Some USBAT-specific commands can only be executed over a command transport * This transport allows one (len=8) or two (len=16) vendor-specific commands * to be executed. */ static int usbat_execute_command(struct us_data *us, unsigned char *commands, unsigned int len) { return usb_stor_ctrl_transfer(us, us->send_ctrl_pipe, USBAT_CMD_EXEC_CMD, 0x40, 0, 0, commands, len); } /* * Read the status register */ static int usbat_get_status(struct us_data *us, unsigned char *status) { int rc; rc = usbat_read(us, USBAT_ATA, USBAT_ATA_STATUS, status); usb_stor_dbg(us, "0x%02X\n", *status); return rc; } /* * Check the device status */ static int usbat_check_status(struct us_data *us) { unsigned char *reply = us->iobuf; int rc; rc = usbat_get_status(us, reply); if (rc != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_FAILED; /* error/check condition (0x51 is ok) */ if (*reply & 0x01 && *reply != 0x51) return USB_STOR_TRANSPORT_FAILED; /* device fault */ if (*reply & 0x20) return USB_STOR_TRANSPORT_FAILED; return USB_STOR_TRANSPORT_GOOD; } /* * Stores critical information in internal registers in preparation for the execution * of a conditional usbat_read_blocks or usbat_write_blocks call. */ static int usbat_set_shuttle_features(struct us_data *us, unsigned char external_trigger, unsigned char epp_control, unsigned char mask_byte, unsigned char test_pattern, unsigned char subcountH, unsigned char subcountL) { unsigned char *command = us->iobuf; command[0] = 0x40; command[1] = USBAT_CMD_SET_FEAT; /* * The only bit relevant to ATA access is bit 6 * which defines 8 bit data access (set) or 16 bit (unset) */ command[2] = epp_control; /* * If FCQ is set in the qualifier (defined in R/W cmd), then bits U0, U1, * ET1 and ET2 define an external event to be checked for on event of a * _read_blocks or _write_blocks operation. The read/write will not take * place unless the defined trigger signal is active. */ command[3] = external_trigger; /* * The resultant byte of the mask operation (see mask_byte) is compared for * equivalence with this test pattern. If equal, the read/write will take * place. */ command[4] = test_pattern; /* * This value is logically ANDed with the status register field specified * in the read/write command. */ command[5] = mask_byte; /* * If ALQ is set in the qualifier, this field contains the address of the * registers where the byte count should be read for transferring the data. * If ALQ is not set, then this field contains the number of bytes to be * transferred. */ command[6] = subcountL; command[7] = subcountH; return usbat_execute_command(us, command, 8); } /* * Block, waiting for an ATA device to become not busy or to report * an error condition. */ static int usbat_wait_not_busy(struct us_data *us, int minutes) { int i; int result; unsigned char *status = us->iobuf; /* * Synchronizing cache on a CDR could take a heck of a long time, * but probably not more than 10 minutes or so. On the other hand, * doing a full blank on a CDRW at speed 1 will take about 75 * minutes! */ for (i=0; i<1200+minutes*60; i++) { result = usbat_get_status(us, status); if (result!=USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; if (*status & 0x01) { /* check condition */ result = usbat_read(us, USBAT_ATA, 0x10, status); return USB_STOR_TRANSPORT_FAILED; } if (*status & 0x20) /* device fault */ return USB_STOR_TRANSPORT_FAILED; if ((*status & 0x80)==0x00) { /* not busy */ usb_stor_dbg(us, "Waited not busy for %d steps\n", i); return USB_STOR_TRANSPORT_GOOD; } if (i<500) msleep(10); /* 5 seconds */ else if (i<700) msleep(50); /* 10 seconds */ else if (i<1200) msleep(100); /* 50 seconds */ else msleep(1000); /* X minutes */ } usb_stor_dbg(us, "Waited not busy for %d minutes, timing out\n", minutes); return USB_STOR_TRANSPORT_FAILED; } /* * Read block data from the data register */ static int usbat_read_block(struct us_data *us, void* buf, unsigned short len, int use_sg) { int result; unsigned char *command = us->iobuf; if (!len) return USB_STOR_TRANSPORT_GOOD; command[0] = 0xC0; command[1] = USBAT_ATA | USBAT_CMD_READ_BLOCK; command[2] = USBAT_ATA_DATA; command[3] = 0; command[4] = 0; command[5] = 0; command[6] = LSB_of(len); command[7] = MSB_of(len); result = usbat_execute_command(us, command, 8); if (result != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; result = usbat_bulk_read(us, buf, len, use_sg); return (result == USB_STOR_XFER_GOOD ? USB_STOR_TRANSPORT_GOOD : USB_STOR_TRANSPORT_ERROR); } /* * Write block data via the data register */ static int usbat_write_block(struct us_data *us, unsigned char access, void* buf, unsigned short len, int minutes, int use_sg) { int result; unsigned char *command = us->iobuf; if (!len) return USB_STOR_TRANSPORT_GOOD; command[0] = 0x40; command[1] = access | USBAT_CMD_WRITE_BLOCK; command[2] = USBAT_ATA_DATA; command[3] = 0; command[4] = 0; command[5] = 0; command[6] = LSB_of(len); command[7] = MSB_of(len); result = usbat_execute_command(us, command, 8); if (result != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; result = usbat_bulk_write(us, buf, len, use_sg); if (result != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; return usbat_wait_not_busy(us, minutes); } /* * Process read and write requests */ static int usbat_hp8200e_rw_block_test(struct us_data *us, unsigned char access, unsigned char *registers, unsigned char *data_out, unsigned short num_registers, unsigned char data_reg, unsigned char status_reg, unsigned char timeout, unsigned char qualifier, int direction, void *buf, unsigned short len, int use_sg, int minutes) { int result; unsigned int pipe = (direction == DMA_FROM_DEVICE) ? us->recv_bulk_pipe : us->send_bulk_pipe; unsigned char *command = us->iobuf; int i, j; int cmdlen; unsigned char *data = us->iobuf; unsigned char *status = us->iobuf; BUG_ON(num_registers > US_IOBUF_SIZE/2); for (i=0; i<20; i++) { /* * The first time we send the full command, which consists * of downloading the SCSI command followed by downloading * the data via a write-and-test. Any other time we only * send the command to download the data -- the SCSI command * is still 'active' in some sense in the device. * * We're only going to try sending the data 10 times. After * that, we just return a failure. */ if (i==0) { cmdlen = 16; /* * Write to multiple registers * Not really sure the 0x07, 0x17, 0xfc, 0xe7 is * necessary here, but that's what came out of the * trace every single time. */ command[0] = 0x40; command[1] = access | USBAT_CMD_WRITE_REGS; command[2] = 0x07; command[3] = 0x17; command[4] = 0xFC; command[5] = 0xE7; command[6] = LSB_of(num_registers*2); command[7] = MSB_of(num_registers*2); } else cmdlen = 8; /* Conditionally read or write blocks */ command[cmdlen-8] = (direction==DMA_TO_DEVICE ? 0x40 : 0xC0); command[cmdlen-7] = access | (direction==DMA_TO_DEVICE ? USBAT_CMD_COND_WRITE_BLOCK : USBAT_CMD_COND_READ_BLOCK); command[cmdlen-6] = data_reg; command[cmdlen-5] = status_reg; command[cmdlen-4] = timeout; command[cmdlen-3] = qualifier; command[cmdlen-2] = LSB_of(len); command[cmdlen-1] = MSB_of(len); result = usbat_execute_command(us, command, cmdlen); if (result != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; if (i==0) { for (j=0; j<num_registers; j++) { data[j<<1] = registers[j]; data[1+(j<<1)] = data_out[j]; } result = usbat_bulk_write(us, data, num_registers*2, 0); if (result != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; } result = usb_stor_bulk_transfer_sg(us, pipe, buf, len, use_sg, NULL); /* * If we get a stall on the bulk download, we'll retry * the bulk download -- but not the SCSI command because * in some sense the SCSI command is still 'active' and * waiting for the data. Don't ask me why this should be; * I'm only following what the Windoze driver did. * * Note that a stall for the test-and-read/write command means * that the test failed. In this case we're testing to make * sure that the device is error-free * (i.e. bit 0 -- CHK -- of status is 0). The most likely * hypothesis is that the USBAT chip somehow knows what * the device will accept, but doesn't give the device any * data until all data is received. Thus, the device would * still be waiting for the first byte of data if a stall * occurs, even if the stall implies that some data was * transferred. */ if (result == USB_STOR_XFER_SHORT || result == USB_STOR_XFER_STALLED) { /* * If we're reading and we stalled, then clear * the bulk output pipe only the first time. */ if (direction==DMA_FROM_DEVICE && i==0) { if (usb_stor_clear_halt(us, us->send_bulk_pipe) < 0) return USB_STOR_TRANSPORT_ERROR; } /* * Read status: is the device angry, or just busy? */ result = usbat_read(us, USBAT_ATA, direction==DMA_TO_DEVICE ? USBAT_ATA_STATUS : USBAT_ATA_ALTSTATUS, status); if (result!=USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; if (*status & 0x01) /* check condition */ return USB_STOR_TRANSPORT_FAILED; if (*status & 0x20) /* device fault */ return USB_STOR_TRANSPORT_FAILED; usb_stor_dbg(us, "Redoing %s\n", str_write_read(direction == DMA_TO_DEVICE)); } else if (result != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; else return usbat_wait_not_busy(us, minutes); } usb_stor_dbg(us, "Bummer! %s bulk data 20 times failed\n", direction == DMA_TO_DEVICE ? "Writing" : "Reading"); return USB_STOR_TRANSPORT_FAILED; } /* * Write to multiple registers: * Allows us to write specific data to any registers. The data to be written * gets packed in this sequence: reg0, data0, reg1, data1, ..., regN, dataN * which gets sent through bulk out. * Not designed for large transfers of data! */ static int usbat_multiple_write(struct us_data *us, unsigned char *registers, unsigned char *data_out, unsigned short num_registers) { int i, result; unsigned char *data = us->iobuf; unsigned char *command = us->iobuf; BUG_ON(num_registers > US_IOBUF_SIZE/2); /* Write to multiple registers, ATA access */ command[0] = 0x40; command[1] = USBAT_ATA | USBAT_CMD_WRITE_REGS; /* No relevance */ command[2] = 0; command[3] = 0; command[4] = 0; command[5] = 0; /* Number of bytes to be transferred (incl. addresses and data) */ command[6] = LSB_of(num_registers*2); command[7] = MSB_of(num_registers*2); /* The setup command */ result = usbat_execute_command(us, command, 8); if (result != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; /* Create the reg/data, reg/data sequence */ for (i=0; i<num_registers; i++) { data[i<<1] = registers[i]; data[1+(i<<1)] = data_out[i]; } /* Send the data */ result = usbat_bulk_write(us, data, num_registers*2, 0); if (result != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; if (usbat_get_device_type(us) == USBAT_DEV_HP8200) return usbat_wait_not_busy(us, 0); else return USB_STOR_TRANSPORT_GOOD; } /* * Conditionally read blocks from device: * Allows us to read blocks from a specific data register, based upon the * condition that a status register can be successfully masked with a status * qualifier. If this condition is not initially met, the read will wait * up until a maximum amount of time has elapsed, as specified by timeout. * The read will start when the condition is met, otherwise the command aborts. * * The qualifier defined here is not the value that is masked, it defines * conditions for the write to take place. The actual masked qualifier (and * other related details) are defined beforehand with _set_shuttle_features(). */ static int usbat_read_blocks(struct us_data *us, void* buffer, int len, int use_sg) { int result; unsigned char *command = us->iobuf; command[0] = 0xC0; command[1] = USBAT_ATA | USBAT_CMD_COND_READ_BLOCK; command[2] = USBAT_ATA_DATA; command[3] = USBAT_ATA_STATUS; command[4] = 0xFD; /* Timeout (ms); */ command[5] = USBAT_QUAL_FCQ; command[6] = LSB_of(len); command[7] = MSB_of(len); /* Multiple block read setup command */ result = usbat_execute_command(us, command, 8); if (result != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_FAILED; /* Read the blocks we just asked for */ result = usbat_bulk_read(us, buffer, len, use_sg); if (result != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_FAILED; return USB_STOR_TRANSPORT_GOOD; } /* * Conditionally write blocks to device: * Allows us to write blocks to a specific data register, based upon the * condition that a status register can be successfully masked with a status * qualifier. If this condition is not initially met, the write will wait * up until a maximum amount of time has elapsed, as specified by timeout. * The read will start when the condition is met, otherwise the command aborts. * * The qualifier defined here is not the value that is masked, it defines * conditions for the write to take place. The actual masked qualifier (and * other related details) are defined beforehand with _set_shuttle_features(). */ static int usbat_write_blocks(struct us_data *us, void* buffer, int len, int use_sg) { int result; unsigned char *command = us->iobuf; command[0] = 0x40; command[1] = USBAT_ATA | USBAT_CMD_COND_WRITE_BLOCK; command[2] = USBAT_ATA_DATA; command[3] = USBAT_ATA_STATUS; command[4] = 0xFD; /* Timeout (ms) */ command[5] = USBAT_QUAL_FCQ; command[6] = LSB_of(len); command[7] = MSB_of(len); /* Multiple block write setup command */ result = usbat_execute_command(us, command, 8); if (result != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_FAILED; /* Write the data */ result = usbat_bulk_write(us, buffer, len, use_sg); if (result != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_FAILED; return USB_STOR_TRANSPORT_GOOD; } /* * Read the User IO register */ static int usbat_read_user_io(struct us_data *us, unsigned char *data_flags) { int result; result = usb_stor_ctrl_transfer(us, us->recv_ctrl_pipe, USBAT_CMD_UIO, 0xC0, 0, 0, data_flags, USBAT_UIO_READ); usb_stor_dbg(us, "UIO register reads %02X\n", *data_flags); return result; } /* * Write to the User IO register */ static int usbat_write_user_io(struct us_data *us, unsigned char enable_flags, unsigned char data_flags) { return usb_stor_ctrl_transfer(us, us->send_ctrl_pipe, USBAT_CMD_UIO, 0x40, short_pack(enable_flags, data_flags), 0, NULL, USBAT_UIO_WRITE); } /* * Reset the device * Often needed on media change. */ static int usbat_device_reset(struct us_data *us) { int rc; /* * Reset peripheral, enable peripheral control signals * (bring reset signal up) */ rc = usbat_write_user_io(us, USBAT_UIO_DRVRST | USBAT_UIO_OE1 | USBAT_UIO_OE0, USBAT_UIO_EPAD | USBAT_UIO_1); if (rc != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; /* * Enable peripheral control signals * (bring reset signal down) */ rc = usbat_write_user_io(us, USBAT_UIO_OE1 | USBAT_UIO_OE0, USBAT_UIO_EPAD | USBAT_UIO_1); if (rc != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; return USB_STOR_TRANSPORT_GOOD; } /* * Enable card detect */ static int usbat_device_enable_cdt(struct us_data *us) { int rc; /* Enable peripheral control signals and card detect */ rc = usbat_write_user_io(us, USBAT_UIO_ACKD | USBAT_UIO_OE1 | USBAT_UIO_OE0, USBAT_UIO_EPAD | USBAT_UIO_1); if (rc != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; return USB_STOR_TRANSPORT_GOOD; } /* * Determine if media is present. */ static int usbat_flash_check_media_present(struct us_data *us, unsigned char *uio) { if (*uio & USBAT_UIO_UI0) { usb_stor_dbg(us, "no media detected\n"); return USBAT_FLASH_MEDIA_NONE; } return USBAT_FLASH_MEDIA_CF; } /* * Determine if media has changed since last operation */ static int usbat_flash_check_media_changed(struct us_data *us, unsigned char *uio) { if (*uio & USBAT_UIO_0) { usb_stor_dbg(us, "media change detected\n"); return USBAT_FLASH_MEDIA_CHANGED; } return USBAT_FLASH_MEDIA_SAME; } /* * Check for media change / no media and handle the situation appropriately */ static int usbat_flash_check_media(struct us_data *us, struct usbat_info *info) { int rc; unsigned char *uio = us->iobuf; rc = usbat_read_user_io(us, uio); if (rc != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; /* Check for media existence */ rc = usbat_flash_check_media_present(us, uio); if (rc == USBAT_FLASH_MEDIA_NONE) { info->sense_key = 0x02; info->sense_asc = 0x3A; info->sense_ascq = 0x00; return USB_STOR_TRANSPORT_FAILED; } /* Check for media change */ rc = usbat_flash_check_media_changed(us, uio); if (rc == USBAT_FLASH_MEDIA_CHANGED) { /* Reset and re-enable card detect */ rc = usbat_device_reset(us); if (rc != USB_STOR_TRANSPORT_GOOD) return rc; rc = usbat_device_enable_cdt(us); if (rc != USB_STOR_TRANSPORT_GOOD) return rc; msleep(50); rc = usbat_read_user_io(us, uio); if (rc != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; info->sense_key = UNIT_ATTENTION; info->sense_asc = 0x28; info->sense_ascq = 0x00; return USB_STOR_TRANSPORT_FAILED; } return USB_STOR_TRANSPORT_GOOD; } /* * Determine whether we are controlling a flash-based reader/writer, * or a HP8200-based CD drive. * Sets transport functions as appropriate. */ static int usbat_identify_device(struct us_data *us, struct usbat_info *info) { int rc; unsigned char status; if (!us || !info) return USB_STOR_TRANSPORT_ERROR; rc = usbat_device_reset(us); if (rc != USB_STOR_TRANSPORT_GOOD) return rc; msleep(500); /* * In attempt to distinguish between HP CDRW's and Flash readers, we now * execute the IDENTIFY PACKET DEVICE command. On ATA devices (i.e. flash * readers), this command should fail with error. On ATAPI devices (i.e. * CDROM drives), it should succeed. */ rc = usbat_write(us, USBAT_ATA, USBAT_ATA_CMD, 0xA1); if (rc != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; rc = usbat_get_status(us, &status); if (rc != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; /* Check for error bit, or if the command 'fell through' */ if (status == 0xA1 || !(status & 0x01)) { /* Device is HP 8200 */ usb_stor_dbg(us, "Detected HP8200 CDRW\n"); info->devicetype = USBAT_DEV_HP8200; } else { /* Device is a CompactFlash reader/writer */ usb_stor_dbg(us, "Detected Flash reader/writer\n"); info->devicetype = USBAT_DEV_FLASH; } return USB_STOR_TRANSPORT_GOOD; } /* * Set the transport function based on the device type */ static int usbat_set_transport(struct us_data *us, struct usbat_info *info, int devicetype) { if (!info->devicetype) info->devicetype = devicetype; if (!info->devicetype) usbat_identify_device(us, info); switch (info->devicetype) { default: return USB_STOR_TRANSPORT_ERROR; case USBAT_DEV_HP8200: us->transport = usbat_hp8200e_transport; break; case USBAT_DEV_FLASH: us->transport = usbat_flash_transport; break; } return 0; } /* * Read the media capacity */ static int usbat_flash_get_sector_count(struct us_data *us, struct usbat_info *info) { unsigned char registers[3] = { USBAT_ATA_SECCNT, USBAT_ATA_DEVICE, USBAT_ATA_CMD, }; unsigned char command[3] = { 0x01, 0xA0, 0xEC }; unsigned char *reply; unsigned char status; int rc; if (!us || !info) return USB_STOR_TRANSPORT_ERROR; reply = kmalloc(512, GFP_NOIO); if (!reply) return USB_STOR_TRANSPORT_ERROR; /* ATA command : IDENTIFY DEVICE */ rc = usbat_multiple_write(us, registers, command, 3); if (rc != USB_STOR_XFER_GOOD) { usb_stor_dbg(us, "Gah! identify_device failed\n"); rc = USB_STOR_TRANSPORT_ERROR; goto leave; } /* Read device status */ if (usbat_get_status(us, &status) != USB_STOR_XFER_GOOD) { rc = USB_STOR_TRANSPORT_ERROR; goto leave; } msleep(100); /* Read the device identification data */ rc = usbat_read_block(us, reply, 512, 0); if (rc != USB_STOR_TRANSPORT_GOOD) goto leave; info->sectors = ((u32)(reply[117]) << 24) | ((u32)(reply[116]) << 16) | ((u32)(reply[115]) << 8) | ((u32)(reply[114]) ); rc = USB_STOR_TRANSPORT_GOOD; leave: kfree(reply); return rc; } /* * Read data from device */ static int usbat_flash_read_data(struct us_data *us, struct usbat_info *info, u32 sector, u32 sectors) { unsigned char registers[7] = { USBAT_ATA_FEATURES, USBAT_ATA_SECCNT, USBAT_ATA_SECNUM, USBAT_ATA_LBA_ME, USBAT_ATA_LBA_HI, USBAT_ATA_DEVICE, USBAT_ATA_STATUS, }; unsigned char command[7]; unsigned char *buffer; unsigned char thistime; unsigned int totallen, alloclen; int len, result; unsigned int sg_offset = 0; struct scatterlist *sg = NULL; result = usbat_flash_check_media(us, info); if (result != USB_STOR_TRANSPORT_GOOD) return result; /* * we're working in LBA mode. according to the ATA spec, * we can support up to 28-bit addressing. I don't know if Jumpshot * supports beyond 24-bit addressing. It's kind of hard to test * since it requires > 8GB CF card. */ if (sector > 0x0FFFFFFF) return USB_STOR_TRANSPORT_ERROR; totallen = sectors * info->ssize; /* * Since we don't read more than 64 KB at a time, we have to create * a bounce buffer and move the data a piece at a time between the * bounce buffer and the actual transfer buffer. */ alloclen = min(totallen, 65536u); buffer = kmalloc(alloclen, GFP_NOIO); if (buffer == NULL) return USB_STOR_TRANSPORT_ERROR; do { /* * loop, never allocate or transfer more than 64k at once * (min(128k, 255*info->ssize) is the real limit) */ len = min(totallen, alloclen); thistime = (len / info->ssize) & 0xff; /* ATA command 0x20 (READ SECTORS) */ usbat_pack_ata_sector_cmd(command, thistime, sector, 0x20); /* Write/execute ATA read command */ result = usbat_multiple_write(us, registers, command, 7); if (result != USB_STOR_TRANSPORT_GOOD) goto leave; /* Read the data we just requested */ result = usbat_read_blocks(us, buffer, len, 0); if (result != USB_STOR_TRANSPORT_GOOD) goto leave; usb_stor_dbg(us, "%d bytes\n", len); /* Store the data in the transfer buffer */ usb_stor_access_xfer_buf(buffer, len, us->srb, &sg, &sg_offset, TO_XFER_BUF); sector += thistime; totallen -= len; } while (totallen > 0); kfree(buffer); return USB_STOR_TRANSPORT_GOOD; leave: kfree(buffer); return USB_STOR_TRANSPORT_ERROR; } /* * Write data to device */ static int usbat_flash_write_data(struct us_data *us, struct usbat_info *info, u32 sector, u32 sectors) { unsigned char registers[7] = { USBAT_ATA_FEATURES, USBAT_ATA_SECCNT, USBAT_ATA_SECNUM, USBAT_ATA_LBA_ME, USBAT_ATA_LBA_HI, USBAT_ATA_DEVICE, USBAT_ATA_STATUS, }; unsigned char command[7]; unsigned char *buffer; unsigned char thistime; unsigned int totallen, alloclen; int len, result; unsigned int sg_offset = 0; struct scatterlist *sg = NULL; result = usbat_flash_check_media(us, info); if (result != USB_STOR_TRANSPORT_GOOD) return result; /* * we're working in LBA mode. according to the ATA spec, * we can support up to 28-bit addressing. I don't know if the device * supports beyond 24-bit addressing. It's kind of hard to test * since it requires > 8GB media. */ if (sector > 0x0FFFFFFF) return USB_STOR_TRANSPORT_ERROR; totallen = sectors * info->ssize; /* * Since we don't write more than 64 KB at a time, we have to create * a bounce buffer and move the data a piece at a time between the * bounce buffer and the actual transfer buffer. */ alloclen = min(totallen, 65536u); buffer = kmalloc(alloclen, GFP_NOIO); if (buffer == NULL) return USB_STOR_TRANSPORT_ERROR; do { /* * loop, never allocate or transfer more than 64k at once * (min(128k, 255*info->ssize) is the real limit) */ len = min(totallen, alloclen); thistime = (len / info->ssize) & 0xff; /* Get the data from the transfer buffer */ usb_stor_access_xfer_buf(buffer, len, us->srb, &sg, &sg_offset, FROM_XFER_BUF); /* ATA command 0x30 (WRITE SECTORS) */ usbat_pack_ata_sector_cmd(command, thistime, sector, 0x30); /* Write/execute ATA write command */ result = usbat_multiple_write(us, registers, command, 7); if (result != USB_STOR_TRANSPORT_GOOD) goto leave; /* Write the data */ result = usbat_write_blocks(us, buffer, len, 0); if (result != USB_STOR_TRANSPORT_GOOD) goto leave; sector += thistime; totallen -= len; } while (totallen > 0); kfree(buffer); return result; leave: kfree(buffer); return USB_STOR_TRANSPORT_ERROR; } /* * Squeeze a potentially huge (> 65535 byte) read10 command into * a little ( <= 65535 byte) ATAPI pipe */ static int usbat_hp8200e_handle_read10(struct us_data *us, unsigned char *registers, unsigned char *data, struct scsi_cmnd *srb) { int result = USB_STOR_TRANSPORT_GOOD; unsigned char *buffer; unsigned int len; unsigned int sector; unsigned int sg_offset = 0; struct scatterlist *sg = NULL; usb_stor_dbg(us, "transfersize %d\n", srb->transfersize); if (scsi_bufflen(srb) < 0x10000) { result = usbat_hp8200e_rw_block_test(us, USBAT_ATA, registers, data, 19, USBAT_ATA_DATA, USBAT_ATA_STATUS, 0xFD, (USBAT_QUAL_FCQ | USBAT_QUAL_ALQ), DMA_FROM_DEVICE, scsi_sglist(srb), scsi_bufflen(srb), scsi_sg_count(srb), 1); return result; } /* * Since we're requesting more data than we can handle in * a single read command (max is 64k-1), we will perform * multiple reads, but each read must be in multiples of * a sector. Luckily the sector size is in srb->transfersize * (see linux/drivers/scsi/sr.c). */ if (data[7+0] == GPCMD_READ_CD) { len = short_pack(data[7+9], data[7+8]); len <<= 16; len |= data[7+7]; usb_stor_dbg(us, "GPCMD_READ_CD: len %d\n", len); srb->transfersize = scsi_bufflen(srb)/len; } if (!srb->transfersize) { srb->transfersize = 2048; /* A guess */ usb_stor_dbg(us, "transfersize 0, forcing %d\n", srb->transfersize); } /* * Since we only read in one block at a time, we have to create * a bounce buffer and move the data a piece at a time between the * bounce buffer and the actual transfer buffer. */ len = (65535/srb->transfersize) * srb->transfersize; usb_stor_dbg(us, "Max read is %d bytes\n", len); len = min(len, scsi_bufflen(srb)); buffer = kmalloc(len, GFP_NOIO); if (buffer == NULL) /* bloody hell! */ return USB_STOR_TRANSPORT_FAILED; sector = short_pack(data[7+3], data[7+2]); sector <<= 16; sector |= short_pack(data[7+5], data[7+4]); transferred = 0; while (transferred != scsi_bufflen(srb)) { if (len > scsi_bufflen(srb) - transferred) len = scsi_bufflen(srb) - transferred; data[3] = len&0xFF; /* (cylL) = expected length (L) */ data[4] = (len>>8)&0xFF; /* (cylH) = expected length (H) */ /* Fix up the SCSI command sector and num sectors */ data[7+2] = MSB_of(sector>>16); /* SCSI command sector */ data[7+3] = LSB_of(sector>>16); data[7+4] = MSB_of(sector&0xFFFF); data[7+5] = LSB_of(sector&0xFFFF); if (data[7+0] == GPCMD_READ_CD) data[7+6] = 0; data[7+7] = MSB_of(len / srb->transfersize); /* SCSI command */ data[7+8] = LSB_of(len / srb->transfersize); /* num sectors */ result = usbat_hp8200e_rw_block_test(us, USBAT_ATA, registers, data, 19, USBAT_ATA_DATA, USBAT_ATA_STATUS, 0xFD, (USBAT_QUAL_FCQ | USBAT_QUAL_ALQ), DMA_FROM_DEVICE, buffer, len, 0, 1); if (result != USB_STOR_TRANSPORT_GOOD) break; /* Store the data in the transfer buffer */ usb_stor_access_xfer_buf(buffer, len, srb, &sg, &sg_offset, TO_XFER_BUF); /* Update the amount transferred and the sector number */ transferred += len; sector += len / srb->transfersize; } /* while transferred != scsi_bufflen(srb) */ kfree(buffer); return result; } static int usbat_select_and_test_registers(struct us_data *us) { int selector; unsigned char *status = us->iobuf; /* try device = master, then device = slave. */ for (selector = 0xA0; selector <= 0xB0; selector += 0x10) { if (usbat_write(us, USBAT_ATA, USBAT_ATA_DEVICE, selector) != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; if (usbat_read(us, USBAT_ATA, USBAT_ATA_STATUS, status) != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; if (usbat_read(us, USBAT_ATA, USBAT_ATA_DEVICE, status) != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; if (usbat_read(us, USBAT_ATA, USBAT_ATA_LBA_ME, status) != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; if (usbat_read(us, USBAT_ATA, USBAT_ATA_LBA_HI, status) != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; if (usbat_write(us, USBAT_ATA, USBAT_ATA_LBA_ME, 0x55) != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; if (usbat_write(us, USBAT_ATA, USBAT_ATA_LBA_HI, 0xAA) != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; if (usbat_read(us, USBAT_ATA, USBAT_ATA_LBA_ME, status) != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; if (usbat_read(us, USBAT_ATA, USBAT_ATA_LBA_ME, status) != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; } return USB_STOR_TRANSPORT_GOOD; } /* * Initialize the USBAT processor and the storage device */ static int init_usbat(struct us_data *us, int devicetype) { int rc; struct usbat_info *info; unsigned char subcountH = USBAT_ATA_LBA_HI; unsigned char subcountL = USBAT_ATA_LBA_ME; unsigned char *status = us->iobuf; us->extra = kzalloc(sizeof(struct usbat_info), GFP_NOIO); if (!us->extra) return -ENOMEM; info = (struct usbat_info *) (us->extra); /* Enable peripheral control signals */ rc = usbat_write_user_io(us, USBAT_UIO_OE1 | USBAT_UIO_OE0, USBAT_UIO_EPAD | USBAT_UIO_1); if (rc != USB_STOR_XFER_GOOD) return -EIO; usb_stor_dbg(us, "INIT 1\n"); msleep(2000); rc = usbat_read_user_io(us, status); if (rc != USB_STOR_TRANSPORT_GOOD) return -EIO; usb_stor_dbg(us, "INIT 2\n"); rc = usbat_read_user_io(us, status); if (rc != USB_STOR_XFER_GOOD) return -EIO; rc = usbat_read_user_io(us, status); if (rc != USB_STOR_XFER_GOOD) return -EIO; usb_stor_dbg(us, "INIT 3\n"); rc = usbat_select_and_test_registers(us); if (rc != USB_STOR_TRANSPORT_GOOD) return -EIO; usb_stor_dbg(us, "INIT 4\n"); rc = usbat_read_user_io(us, status); if (rc != USB_STOR_XFER_GOOD) return -EIO; usb_stor_dbg(us, "INIT 5\n"); /* Enable peripheral control signals and card detect */ rc = usbat_device_enable_cdt(us); if (rc != USB_STOR_TRANSPORT_GOOD) return -EIO; usb_stor_dbg(us, "INIT 6\n"); rc = usbat_read_user_io(us, status); if (rc != USB_STOR_XFER_GOOD) return -EIO; usb_stor_dbg(us, "INIT 7\n"); msleep(1400); rc = usbat_read_user_io(us, status); if (rc != USB_STOR_XFER_GOOD) return -EIO; usb_stor_dbg(us, "INIT 8\n"); rc = usbat_select_and_test_registers(us); if (rc != USB_STOR_TRANSPORT_GOOD) return -EIO; usb_stor_dbg(us, "INIT 9\n"); /* At this point, we need to detect which device we are using */ if (usbat_set_transport(us, info, devicetype)) return -EIO; usb_stor_dbg(us, "INIT 10\n"); if (usbat_get_device_type(us) == USBAT_DEV_FLASH) { subcountH = 0x02; subcountL = 0x00; } rc = usbat_set_shuttle_features(us, (USBAT_FEAT_ETEN | USBAT_FEAT_ET2 | USBAT_FEAT_ET1), 0x00, 0x88, 0x08, subcountH, subcountL); if (rc != USB_STOR_XFER_GOOD) return -EIO; usb_stor_dbg(us, "INIT 11\n"); return 0; } /* * Transport for the HP 8200e */ static int usbat_hp8200e_transport(struct scsi_cmnd *srb, struct us_data *us) { int result; unsigned char *status = us->iobuf; unsigned char registers[32]; unsigned char data[32]; unsigned int len; int i; len = scsi_bufflen(srb); /* * Send A0 (ATA PACKET COMMAND). * Note: I guess we're never going to get any of the ATA * commands... just ATA Packet Commands. */ registers[0] = USBAT_ATA_FEATURES; registers[1] = USBAT_ATA_SECCNT; registers[2] = USBAT_ATA_SECNUM; registers[3] = USBAT_ATA_LBA_ME; registers[4] = USBAT_ATA_LBA_HI; registers[5] = USBAT_ATA_DEVICE; registers[6] = USBAT_ATA_CMD; data[0] = 0x00; data[1] = 0x00; data[2] = 0x00; data[3] = len&0xFF; /* (cylL) = expected length (L) */ data[4] = (len>>8)&0xFF; /* (cylH) = expected length (H) */ data[5] = 0xB0; /* (device sel) = slave */ data[6] = 0xA0; /* (command) = ATA PACKET COMMAND */ for (i=7; i<19; i++) { registers[i] = 0x10; data[i] = (i-7 >= srb->cmd_len) ? 0 : srb->cmnd[i-7]; } result = usbat_get_status(us, status); usb_stor_dbg(us, "Status = %02X\n", *status); if (result != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; if (srb->cmnd[0] == TEST_UNIT_READY) transferred = 0; if (srb->sc_data_direction == DMA_TO_DEVICE) { result = usbat_hp8200e_rw_block_test(us, USBAT_ATA, registers, data, 19, USBAT_ATA_DATA, USBAT_ATA_STATUS, 0xFD, (USBAT_QUAL_FCQ | USBAT_QUAL_ALQ), DMA_TO_DEVICE, scsi_sglist(srb), len, scsi_sg_count(srb), 10); if (result == USB_STOR_TRANSPORT_GOOD) { transferred += len; usb_stor_dbg(us, "Wrote %08X bytes\n", transferred); } return result; } else if (srb->cmnd[0] == READ_10 || srb->cmnd[0] == GPCMD_READ_CD) { return usbat_hp8200e_handle_read10(us, registers, data, srb); } if (len > 0xFFFF) { usb_stor_dbg(us, "Error: len = %08X... what do I do now?\n", len); return USB_STOR_TRANSPORT_ERROR; } result = usbat_multiple_write(us, registers, data, 7); if (result != USB_STOR_TRANSPORT_GOOD) return result; /* * Write the 12-byte command header. * * If the command is BLANK then set the timer for 75 minutes. * Otherwise set it for 10 minutes. * * NOTE: THE 8200 DOCUMENTATION STATES THAT BLANKING A CDRW * AT SPEED 4 IS UNRELIABLE!!! */ result = usbat_write_block(us, USBAT_ATA, srb->cmnd, 12, srb->cmnd[0] == GPCMD_BLANK ? 75 : 10, 0); if (result != USB_STOR_TRANSPORT_GOOD) return result; /* If there is response data to be read in then do it here. */ if (len != 0 && (srb->sc_data_direction == DMA_FROM_DEVICE)) { /* How many bytes to read in? Check cylL register */ if (usbat_read(us, USBAT_ATA, USBAT_ATA_LBA_ME, status) != USB_STOR_XFER_GOOD) { return USB_STOR_TRANSPORT_ERROR; } if (len > 0xFF) { /* need to read cylH also */ len = *status; if (usbat_read(us, USBAT_ATA, USBAT_ATA_LBA_HI, status) != USB_STOR_XFER_GOOD) { return USB_STOR_TRANSPORT_ERROR; } len += ((unsigned int) *status)<<8; } else len = *status; result = usbat_read_block(us, scsi_sglist(srb), len, scsi_sg_count(srb)); } return result; } /* * Transport for USBAT02-based CompactFlash and similar storage devices */ static int usbat_flash_transport(struct scsi_cmnd * srb, struct us_data *us) { int rc; struct usbat_info *info = (struct usbat_info *) (us->extra); unsigned long block, blocks; unsigned char *ptr = us->iobuf; static const unsigned char inquiry_response[36] = { 0x00, 0x80, 0x00, 0x01, 0x1F, 0x00, 0x00, 0x00 }; if (srb->cmnd[0] == INQUIRY) { usb_stor_dbg(us, "INQUIRY - Returning bogus response\n"); memcpy(ptr, inquiry_response, sizeof(inquiry_response)); fill_inquiry_response(us, ptr, 36); return USB_STOR_TRANSPORT_GOOD; } if (srb->cmnd[0] == READ_CAPACITY) { rc = usbat_flash_check_media(us, info); if (rc != USB_STOR_TRANSPORT_GOOD) return rc; rc = usbat_flash_get_sector_count(us, info); if (rc != USB_STOR_TRANSPORT_GOOD) return rc; /* hard coded 512 byte sectors as per ATA spec */ info->ssize = 0x200; usb_stor_dbg(us, "READ_CAPACITY: %ld sectors, %ld bytes per sector\n", info->sectors, info->ssize); /* * build the reply * note: must return the sector number of the last sector, * *not* the total number of sectors */ ((__be32 *) ptr)[0] = cpu_to_be32(info->sectors - 1); ((__be32 *) ptr)[1] = cpu_to_be32(info->ssize); usb_stor_set_xfer_buf(ptr, 8, srb); return USB_STOR_TRANSPORT_GOOD; } if (srb->cmnd[0] == MODE_SELECT_10) { usb_stor_dbg(us, "Gah! MODE_SELECT_10\n"); return USB_STOR_TRANSPORT_ERROR; } if (srb->cmnd[0] == READ_10) { block = ((u32)(srb->cmnd[2]) << 24) | ((u32)(srb->cmnd[3]) << 16) | ((u32)(srb->cmnd[4]) << 8) | ((u32)(srb->cmnd[5])); blocks = ((u32)(srb->cmnd[7]) << 8) | ((u32)(srb->cmnd[8])); usb_stor_dbg(us, "READ_10: read block 0x%04lx count %ld\n", block, blocks); return usbat_flash_read_data(us, info, block, blocks); } if (srb->cmnd[0] == READ_12) { /* * I don't think we'll ever see a READ_12 but support it anyway */ block = ((u32)(srb->cmnd[2]) << 24) | ((u32)(srb->cmnd[3]) << 16) | ((u32)(srb->cmnd[4]) << 8) | ((u32)(srb->cmnd[5])); blocks = ((u32)(srb->cmnd[6]) << 24) | ((u32)(srb->cmnd[7]) << 16) | ((u32)(srb->cmnd[8]) << 8) | ((u32)(srb->cmnd[9])); usb_stor_dbg(us, "READ_12: read block 0x%04lx count %ld\n", block, blocks); return usbat_flash_read_data(us, info, block, blocks); } if (srb->cmnd[0] == WRITE_10) { block = ((u32)(srb->cmnd[2]) << 24) | ((u32)(srb->cmnd[3]) << 16) | ((u32)(srb->cmnd[4]) << 8) | ((u32)(srb->cmnd[5])); blocks = ((u32)(srb->cmnd[7]) << 8) | ((u32)(srb->cmnd[8])); usb_stor_dbg(us, "WRITE_10: write block 0x%04lx count %ld\n", block, blocks); return usbat_flash_write_data(us, info, block, blocks); } if (srb->cmnd[0] == WRITE_12) { /* * I don't think we'll ever see a WRITE_12 but support it anyway */ block = ((u32)(srb->cmnd[2]) << 24) | ((u32)(srb->cmnd[3]) << 16) | ((u32)(srb->cmnd[4]) << 8) | ((u32)(srb->cmnd[5])); blocks = ((u32)(srb->cmnd[6]) << 24) | ((u32)(srb->cmnd[7]) << 16) | ((u32)(srb->cmnd[8]) << 8) | ((u32)(srb->cmnd[9])); usb_stor_dbg(us, "WRITE_12: write block 0x%04lx count %ld\n", block, blocks); return usbat_flash_write_data(us, info, block, blocks); } if (srb->cmnd[0] == TEST_UNIT_READY) { usb_stor_dbg(us, "TEST_UNIT_READY\n"); rc = usbat_flash_check_media(us, info); if (rc != USB_STOR_TRANSPORT_GOOD) return rc; return usbat_check_status(us); } if (srb->cmnd[0] == REQUEST_SENSE) { usb_stor_dbg(us, "REQUEST_SENSE\n"); memset(ptr, 0, 18); ptr[0] = 0xF0; ptr[2] = info->sense_key; ptr[7] = 11; ptr[12] = info->sense_asc; ptr[13] = info->sense_ascq; usb_stor_set_xfer_buf(ptr, 18, srb); return USB_STOR_TRANSPORT_GOOD; } if (srb->cmnd[0] == ALLOW_MEDIUM_REMOVAL) { /* * sure. whatever. not like we can stop the user from popping * the media out of the device (no locking doors, etc) */ return USB_STOR_TRANSPORT_GOOD; } usb_stor_dbg(us, "Gah! Unknown command: %d (0x%x)\n", srb->cmnd[0], srb->cmnd[0]); info->sense_key = 0x05; info->sense_asc = 0x20; info->sense_ascq = 0x00; return USB_STOR_TRANSPORT_FAILED; } static int init_usbat_cd(struct us_data *us) { return init_usbat(us, USBAT_DEV_HP8200); } static int init_usbat_flash(struct us_data *us) { return init_usbat(us, USBAT_DEV_FLASH); } static struct scsi_host_template usbat_host_template; static int usbat_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct us_data *us; int result; result = usb_stor_probe1(&us, intf, id, (id - usbat_usb_ids) + usbat_unusual_dev_list, &usbat_host_template); if (result) return result; /* * The actual transport will be determined later by the * initialization routine; this is just a placeholder. */ us->transport_name = "Shuttle USBAT"; us->transport = usbat_flash_transport; us->transport_reset = usb_stor_CB_reset; us->max_lun = 0; result = usb_stor_probe2(us); return result; } static struct usb_driver usbat_driver = { .name = DRV_NAME, .probe = usbat_probe, .disconnect = usb_stor_disconnect, .suspend = usb_stor_suspend, .resume = usb_stor_resume, .reset_resume = usb_stor_reset_resume, .pre_reset = usb_stor_pre_reset, .post_reset = usb_stor_post_reset, .id_table = usbat_usb_ids, .soft_unbind = 1, .no_dynamic_id = 1, }; module_usb_stor_driver(usbat_driver, usbat_host_template, DRV_NAME); |
| 15 13 1 2 2 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 | // SPDX-License-Identifier: GPL-2.0-or-later /* * IP6 tables REJECT target module * Linux INET6 implementation * * Copyright (C)2003 USAGI/WIDE Project * * Authors: * Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp> * * Copyright (c) 2005-2007 Patrick McHardy <kaber@trash.net> * * Based on net/ipv4/netfilter/ipt_REJECT.c */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/gfp.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/icmpv6.h> #include <linux/netdevice.h> #include <net/icmp.h> #include <net/flow.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter_ipv6/ip6t_REJECT.h> #include <net/netfilter/ipv6/nf_reject.h> MODULE_AUTHOR("Yasuyuki KOZAKAI <yasuyuki.kozakai@toshiba.co.jp>"); MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv6"); MODULE_LICENSE("GPL"); static unsigned int reject_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct ip6t_reject_info *reject = par->targinfo; struct net *net = xt_net(par); switch (reject->with) { case IP6T_ICMP6_NO_ROUTE: nf_send_unreach6(net, skb, ICMPV6_NOROUTE, xt_hooknum(par)); break; case IP6T_ICMP6_ADM_PROHIBITED: nf_send_unreach6(net, skb, ICMPV6_ADM_PROHIBITED, xt_hooknum(par)); break; case IP6T_ICMP6_NOT_NEIGHBOUR: nf_send_unreach6(net, skb, ICMPV6_NOT_NEIGHBOUR, xt_hooknum(par)); break; case IP6T_ICMP6_ADDR_UNREACH: nf_send_unreach6(net, skb, ICMPV6_ADDR_UNREACH, xt_hooknum(par)); break; case IP6T_ICMP6_PORT_UNREACH: nf_send_unreach6(net, skb, ICMPV6_PORT_UNREACH, xt_hooknum(par)); break; case IP6T_ICMP6_ECHOREPLY: /* Do nothing */ break; case IP6T_TCP_RESET: nf_send_reset6(net, par->state->sk, skb, xt_hooknum(par)); break; case IP6T_ICMP6_POLICY_FAIL: nf_send_unreach6(net, skb, ICMPV6_POLICY_FAIL, xt_hooknum(par)); break; case IP6T_ICMP6_REJECT_ROUTE: nf_send_unreach6(net, skb, ICMPV6_REJECT_ROUTE, xt_hooknum(par)); break; } return NF_DROP; } static int reject_tg6_check(const struct xt_tgchk_param *par) { const struct ip6t_reject_info *rejinfo = par->targinfo; const struct ip6t_entry *e = par->entryinfo; if (rejinfo->with == IP6T_ICMP6_ECHOREPLY) { pr_info_ratelimited("ECHOREPLY is not supported\n"); return -EINVAL; } else if (rejinfo->with == IP6T_TCP_RESET) { /* Must specify that it's a TCP packet */ if (!(e->ipv6.flags & IP6T_F_PROTO) || e->ipv6.proto != IPPROTO_TCP || (e->ipv6.invflags & XT_INV_PROTO)) { pr_info_ratelimited("TCP_RESET illegal for non-tcp\n"); return -EINVAL; } } return 0; } static struct xt_target reject_tg6_reg __read_mostly = { .name = "REJECT", .family = NFPROTO_IPV6, .target = reject_tg6, .targetsize = sizeof(struct ip6t_reject_info), .table = "filter", .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT), .checkentry = reject_tg6_check, .me = THIS_MODULE }; static int __init reject_tg6_init(void) { return xt_register_target(&reject_tg6_reg); } static void __exit reject_tg6_exit(void) { xt_unregister_target(&reject_tg6_reg); } module_init(reject_tg6_init); module_exit(reject_tg6_exit); |
| 6396 2207 6386 7197 7087 7086 6447 6265 6286 164 164 4786 4811 360 361 329 330 3373 3 3390 291 290 1475 22 14 1441 63 62 51 1553 1553 1553 1552 674 929 114 114 10 11 10 222 221 378 4879 4880 1 1 1 173 173 172 83 4 4 4 1779 1776 17 1 17 56 138 138 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/mm.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/compiler.h> #include <linux/export.h> #include <linux/err.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/sched/task_stack.h> #include <linux/security.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/sysctl.h> #include <linux/mman.h> #include <linux/hugetlb.h> #include <linux/vmalloc.h> #include <linux/userfaultfd_k.h> #include <linux/elf.h> #include <linux/elf-randomize.h> #include <linux/personality.h> #include <linux/random.h> #include <linux/processor.h> #include <linux/sizes.h> #include <linux/compat.h> #include <linux/fsnotify.h> #include <linux/page_idle.h> #include <linux/uaccess.h> #include <kunit/visibility.h> #include "internal.h" #include "swap.h" /** * kfree_const - conditionally free memory * @x: pointer to the memory * * Function calls kfree only if @x is not in .rodata section. */ void kfree_const(const void *x) { if (!is_kernel_rodata((unsigned long)x)) kfree(x); } EXPORT_SYMBOL(kfree_const); /** * __kmemdup_nul - Create a NUL-terminated string from @s, which might be unterminated. * @s: The data to copy * @len: The size of the data, not including the NUL terminator * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Return: newly allocated copy of @s with NUL-termination or %NULL in * case of error */ static __always_inline char *__kmemdup_nul(const char *s, size_t len, gfp_t gfp) { char *buf; /* '+1' for the NUL terminator */ buf = kmalloc_track_caller(len + 1, gfp); if (!buf) return NULL; memcpy(buf, s, len); /* Ensure the buf is always NUL-terminated, regardless of @s. */ buf[len] = '\0'; return buf; } /** * kstrdup - allocate space for and copy an existing string * @s: the string to duplicate * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Return: newly allocated copy of @s or %NULL in case of error */ noinline char *kstrdup(const char *s, gfp_t gfp) { return s ? __kmemdup_nul(s, strlen(s), gfp) : NULL; } EXPORT_SYMBOL(kstrdup); /** * kstrdup_const - conditionally duplicate an existing const string * @s: the string to duplicate * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Note: Strings allocated by kstrdup_const should be freed by kfree_const and * must not be passed to krealloc(). * * Return: source string if it is in .rodata section otherwise * fallback to kstrdup. */ const char *kstrdup_const(const char *s, gfp_t gfp) { if (is_kernel_rodata((unsigned long)s)) return s; return kstrdup(s, gfp); } EXPORT_SYMBOL(kstrdup_const); /** * kstrndup - allocate space for and copy an existing string * @s: the string to duplicate * @max: read at most @max chars from @s * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Note: Use kmemdup_nul() instead if the size is known exactly. * * Return: newly allocated copy of @s or %NULL in case of error */ char *kstrndup(const char *s, size_t max, gfp_t gfp) { return s ? __kmemdup_nul(s, strnlen(s, max), gfp) : NULL; } EXPORT_SYMBOL(kstrndup); /** * kmemdup - duplicate region of memory * * @src: memory region to duplicate * @len: memory region length * @gfp: GFP mask to use * * Return: newly allocated copy of @src or %NULL in case of error, * result is physically contiguous. Use kfree() to free. */ void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp) { void *p; p = kmalloc_node_track_caller_noprof(len, gfp, NUMA_NO_NODE, _RET_IP_); if (p) memcpy(p, src, len); return p; } EXPORT_SYMBOL(kmemdup_noprof); /** * kmemdup_array - duplicate a given array. * * @src: array to duplicate. * @count: number of elements to duplicate from array. * @element_size: size of each element of array. * @gfp: GFP mask to use. * * Return: duplicated array of @src or %NULL in case of error, * result is physically contiguous. Use kfree() to free. */ void *kmemdup_array(const void *src, size_t count, size_t element_size, gfp_t gfp) { return kmemdup(src, size_mul(element_size, count), gfp); } EXPORT_SYMBOL(kmemdup_array); /** * kvmemdup - duplicate region of memory * * @src: memory region to duplicate * @len: memory region length * @gfp: GFP mask to use * * Return: newly allocated copy of @src or %NULL in case of error, * result may be not physically contiguous. Use kvfree() to free. */ void *kvmemdup(const void *src, size_t len, gfp_t gfp) { void *p; p = kvmalloc(len, gfp); if (p) memcpy(p, src, len); return p; } EXPORT_SYMBOL(kvmemdup); /** * kmemdup_nul - Create a NUL-terminated string from unterminated data * @s: The data to stringify * @len: The size of the data * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Return: newly allocated copy of @s with NUL-termination or %NULL in * case of error */ char *kmemdup_nul(const char *s, size_t len, gfp_t gfp) { return s ? __kmemdup_nul(s, len, gfp) : NULL; } EXPORT_SYMBOL(kmemdup_nul); static kmem_buckets *user_buckets __ro_after_init; static int __init init_user_buckets(void) { user_buckets = kmem_buckets_create("memdup_user", 0, 0, INT_MAX, NULL); return 0; } subsys_initcall(init_user_buckets); /** * memdup_user - duplicate memory region from user space * * @src: source address in user space * @len: number of bytes to copy * * Return: an ERR_PTR() on failure. Result is physically * contiguous, to be freed by kfree(). */ void *memdup_user(const void __user *src, size_t len) { void *p; p = kmem_buckets_alloc_track_caller(user_buckets, len, GFP_USER | __GFP_NOWARN); if (!p) return ERR_PTR(-ENOMEM); if (copy_from_user(p, src, len)) { kfree(p); return ERR_PTR(-EFAULT); } return p; } EXPORT_SYMBOL(memdup_user); /** * vmemdup_user - duplicate memory region from user space * * @src: source address in user space * @len: number of bytes to copy * * Return: an ERR_PTR() on failure. Result may be not * physically contiguous. Use kvfree() to free. */ void *vmemdup_user(const void __user *src, size_t len) { void *p; p = kmem_buckets_valloc(user_buckets, len, GFP_USER); if (!p) return ERR_PTR(-ENOMEM); if (copy_from_user(p, src, len)) { kvfree(p); return ERR_PTR(-EFAULT); } return p; } EXPORT_SYMBOL(vmemdup_user); /** * strndup_user - duplicate an existing string from user space * @s: The string to duplicate * @n: Maximum number of bytes to copy, including the trailing NUL. * * Return: newly allocated copy of @s or an ERR_PTR() in case of error */ char *strndup_user(const char __user *s, long n) { char *p; long length; length = strnlen_user(s, n); if (!length) return ERR_PTR(-EFAULT); if (length > n) return ERR_PTR(-EINVAL); p = memdup_user(s, length); if (IS_ERR(p)) return p; p[length - 1] = '\0'; return p; } EXPORT_SYMBOL(strndup_user); /** * memdup_user_nul - duplicate memory region from user space and NUL-terminate * * @src: source address in user space * @len: number of bytes to copy * * Return: an ERR_PTR() on failure. */ void *memdup_user_nul(const void __user *src, size_t len) { char *p; p = kmem_buckets_alloc_track_caller(user_buckets, len + 1, GFP_USER | __GFP_NOWARN); if (!p) return ERR_PTR(-ENOMEM); if (copy_from_user(p, src, len)) { kfree(p); return ERR_PTR(-EFAULT); } p[len] = '\0'; return p; } EXPORT_SYMBOL(memdup_user_nul); /* Check if the vma is being used as a stack by this task */ int vma_is_stack_for_current(const struct vm_area_struct *vma) { struct task_struct * __maybe_unused t = current; return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t)); } /* * Change backing file, only valid to use during initial VMA setup. */ void vma_set_file(struct vm_area_struct *vma, struct file *file) { /* Changing an anonymous vma with this is illegal */ get_file(file); swap(vma->vm_file, file); fput(file); } EXPORT_SYMBOL(vma_set_file); #ifndef STACK_RND_MASK #define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */ #endif unsigned long randomize_stack_top(unsigned long stack_top) { unsigned long random_variable = 0; if (current->flags & PF_RANDOMIZE) { random_variable = get_random_long(); random_variable &= STACK_RND_MASK; random_variable <<= PAGE_SHIFT; } #ifdef CONFIG_STACK_GROWSUP return PAGE_ALIGN(stack_top) + random_variable; #else return PAGE_ALIGN(stack_top) - random_variable; #endif } /** * randomize_page - Generate a random, page aligned address * @start: The smallest acceptable address the caller will take. * @range: The size of the area, starting at @start, within which the * random address must fall. * * If @start + @range would overflow, @range is capped. * * NOTE: Historical use of randomize_range, which this replaces, presumed that * @start was already page aligned. We now align it regardless. * * Return: A page aligned address within [start, start + range). On error, * @start is returned. */ unsigned long randomize_page(unsigned long start, unsigned long range) { if (!PAGE_ALIGNED(start)) { range -= PAGE_ALIGN(start) - start; start = PAGE_ALIGN(start); } if (start > ULONG_MAX - range) range = ULONG_MAX - start; range >>= PAGE_SHIFT; if (range == 0) return start; return start + (get_random_long() % range << PAGE_SHIFT); } #ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT unsigned long __weak arch_randomize_brk(struct mm_struct *mm) { /* Is the current task 32bit ? */ if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task()) return randomize_page(mm->brk, SZ_32M); return randomize_page(mm->brk, SZ_1G); } unsigned long arch_mmap_rnd(void) { unsigned long rnd; #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS if (is_compat_task()) rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1); else #endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */ rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); return rnd << PAGE_SHIFT; } static int mmap_is_legacy(const struct rlimit *rlim_stack) { if (current->personality & ADDR_COMPAT_LAYOUT) return 1; /* On parisc the stack always grows up - so a unlimited stack should * not be an indicator to use the legacy memory layout. */ if (rlim_stack->rlim_cur == RLIM_INFINITY && !IS_ENABLED(CONFIG_STACK_GROWSUP)) return 1; return sysctl_legacy_va_layout; } /* * Leave enough space between the mmap area and the stack to honour ulimit in * the face of randomisation. */ #define MIN_GAP (SZ_128M) #define MAX_GAP (STACK_TOP / 6 * 5) static unsigned long mmap_base(const unsigned long rnd, const struct rlimit *rlim_stack) { #ifdef CONFIG_STACK_GROWSUP /* * For an upwards growing stack the calculation is much simpler. * Memory for the maximum stack size is reserved at the top of the * task. mmap_base starts directly below the stack and grows * downwards. */ return PAGE_ALIGN_DOWN(mmap_upper_limit(rlim_stack) - rnd); #else unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = stack_guard_gap; /* Account for stack randomization if necessary */ if (current->flags & PF_RANDOMIZE) pad += (STACK_RND_MASK << PAGE_SHIFT); /* Values close to RLIM_INFINITY can overflow. */ if (gap + pad > gap) gap += pad; if (gap < MIN_GAP && MIN_GAP < MAX_GAP) gap = MIN_GAP; else if (gap > MAX_GAP) gap = MAX_GAP; return PAGE_ALIGN(STACK_TOP - gap - rnd); #endif } void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack) { unsigned long random_factor = 0UL; if (current->flags & PF_RANDOMIZE) random_factor = arch_mmap_rnd(); if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; mm_flags_clear(MMF_TOPDOWN, mm); } else { mm->mmap_base = mmap_base(random_factor, rlim_stack); mm_flags_set(MMF_TOPDOWN, mm); } } #elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack) { mm->mmap_base = TASK_UNMAPPED_BASE; mm_flags_clear(MMF_TOPDOWN, mm); } #endif #ifdef CONFIG_MMU EXPORT_SYMBOL_IF_KUNIT(arch_pick_mmap_layout); #endif /** * __account_locked_vm - account locked pages to an mm's locked_vm * @mm: mm to account against * @pages: number of pages to account * @inc: %true if @pages should be considered positive, %false if not * @task: task used to check RLIMIT_MEMLOCK * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped * * Assumes @task and @mm are valid (i.e. at least one reference on each), and * that mmap_lock is held as writer. * * Return: * * 0 on success * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded. */ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, const struct task_struct *task, bool bypass_rlim) { unsigned long locked_vm, limit; int ret = 0; mmap_assert_write_locked(mm); locked_vm = mm->locked_vm; if (inc) { if (!bypass_rlim) { limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT; if (locked_vm + pages > limit) ret = -ENOMEM; } if (!ret) mm->locked_vm = locked_vm + pages; } else { WARN_ON_ONCE(pages > locked_vm); mm->locked_vm = locked_vm - pages; } pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid, (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT, locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK), ret ? " - exceeded" : ""); return ret; } EXPORT_SYMBOL_GPL(__account_locked_vm); /** * account_locked_vm - account locked pages to an mm's locked_vm * @mm: mm to account against, may be NULL * @pages: number of pages to account * @inc: %true if @pages should be considered positive, %false if not * * Assumes a non-NULL @mm is valid (i.e. at least one reference on it). * * Return: * * 0 on success, or if mm is NULL * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded. */ int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc) { int ret; if (pages == 0 || !mm) return 0; mmap_write_lock(mm); ret = __account_locked_vm(mm, pages, inc, current, capable(CAP_IPC_LOCK)); mmap_write_unlock(mm); return ret; } EXPORT_SYMBOL_GPL(account_locked_vm); unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long pgoff) { loff_t off = (loff_t)pgoff << PAGE_SHIFT; unsigned long ret; struct mm_struct *mm = current->mm; unsigned long populate; LIST_HEAD(uf); ret = security_mmap_file(file, prot, flag); if (!ret) ret = fsnotify_mmap_perm(file, prot, off, len); if (!ret) { if (mmap_write_lock_killable(mm)) return -EINTR; ret = do_mmap(file, addr, len, prot, flag, 0, pgoff, &populate, &uf); mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(ret, populate); } return ret; } /* * Perform a userland memory mapping into the current process address space. See * the comment for do_mmap() for more details on this operation in general. * * This differs from do_mmap() in that: * * a. An offset parameter is provided rather than pgoff, which is both checked * for overflow and page alignment. * b. mmap locking is performed on the caller's behalf. * c. Userfaultfd unmap events and memory population are handled. * * This means that this function performs essentially the same work as if * userland were invoking mmap (2). * * Returns either an error, or the address at which the requested mapping has * been performed. */ unsigned long vm_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long offset) { if (unlikely(offset + PAGE_ALIGN(len) < offset)) return -EINVAL; if (unlikely(offset_in_page(offset))) return -EINVAL; return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); } EXPORT_SYMBOL(vm_mmap); /** * __vmalloc_array - allocate memory for a virtually contiguous array. * @n: number of elements. * @size: element size. * @flags: the type of memory to allocate (see kmalloc). */ void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags) { size_t bytes; if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL; return __vmalloc_noprof(bytes, flags); } EXPORT_SYMBOL(__vmalloc_array_noprof); /** * vmalloc_array - allocate memory for a virtually contiguous array. * @n: number of elements. * @size: element size. */ void *vmalloc_array_noprof(size_t n, size_t size) { return __vmalloc_array_noprof(n, size, GFP_KERNEL); } EXPORT_SYMBOL(vmalloc_array_noprof); /** * __vcalloc - allocate and zero memory for a virtually contiguous array. * @n: number of elements. * @size: element size. * @flags: the type of memory to allocate (see kmalloc). */ void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) { return __vmalloc_array_noprof(n, size, flags | __GFP_ZERO); } EXPORT_SYMBOL(__vcalloc_noprof); /** * vcalloc - allocate and zero memory for a virtually contiguous array. * @n: number of elements. * @size: element size. */ void *vcalloc_noprof(size_t n, size_t size) { return __vmalloc_array_noprof(n, size, GFP_KERNEL | __GFP_ZERO); } EXPORT_SYMBOL(vcalloc_noprof); struct anon_vma *folio_anon_vma(const struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; if ((mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON) return NULL; return (void *)(mapping - FOLIO_MAPPING_ANON); } /** * folio_mapping - Find the mapping where this folio is stored. * @folio: The folio. * * For folios which are in the page cache, return the mapping that this * page belongs to. Folios in the swap cache return the swap mapping * this page is stored in (which is different from the mapping for the * swap file or swap device where the data is stored). * * You can call this for folios which aren't in the swap cache or page * cache and it will return NULL. */ struct address_space *folio_mapping(const struct folio *folio) { struct address_space *mapping; /* This happens if someone calls flush_dcache_page on slab page */ if (unlikely(folio_test_slab(folio))) return NULL; if (unlikely(folio_test_swapcache(folio))) return swap_address_space(folio->swap); mapping = folio->mapping; if ((unsigned long)mapping & FOLIO_MAPPING_FLAGS) return NULL; return mapping; } EXPORT_SYMBOL(folio_mapping); /** * folio_copy - Copy the contents of one folio to another. * @dst: Folio to copy to. * @src: Folio to copy from. * * The bytes in the folio represented by @src are copied to @dst. * Assumes the caller has validated that @dst is at least as large as @src. * Can be called in atomic context for order-0 folios, but if the folio is * larger, it may sleep. */ void folio_copy(struct folio *dst, struct folio *src) { long i = 0; long nr = folio_nr_pages(src); for (;;) { copy_highpage(folio_page(dst, i), folio_page(src, i)); if (++i == nr) break; cond_resched(); } } EXPORT_SYMBOL(folio_copy); int folio_mc_copy(struct folio *dst, struct folio *src) { long nr = folio_nr_pages(src); long i = 0; for (;;) { if (copy_mc_highpage(folio_page(dst, i), folio_page(src, i))) return -EHWPOISON; if (++i == nr) break; cond_resched(); } return 0; } EXPORT_SYMBOL(folio_mc_copy); int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; static int sysctl_overcommit_ratio __read_mostly = 50; static unsigned long sysctl_overcommit_kbytes __read_mostly; int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ #ifdef CONFIG_SYSCTL static int overcommit_ratio_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret == 0 && write) sysctl_overcommit_kbytes = 0; return ret; } static void sync_overcommit_as(struct work_struct *dummy) { percpu_counter_sync(&vm_committed_as); } static int overcommit_policy_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; int new_policy = -1; int ret; /* * The deviation of sync_overcommit_as could be big with loose policy * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply * with the strict "NEVER", and to avoid possible race condition (even * though user usually won't too frequently do the switching to policy * OVERCOMMIT_NEVER), the switch is done in the following order: * 1. changing the batch * 2. sync percpu count on each CPU * 3. switch the policy */ if (write) { t = *table; t.data = &new_policy; ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); if (ret || new_policy == -1) return ret; mm_compute_batch(new_policy); if (new_policy == OVERCOMMIT_NEVER) schedule_on_each_cpu(sync_overcommit_as); sysctl_overcommit_memory = new_policy; } else { ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); } return ret; } static int overcommit_kbytes_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) sysctl_overcommit_ratio = 0; return ret; } static const struct ctl_table util_sysctl_table[] = { { .procname = "overcommit_memory", .data = &sysctl_overcommit_memory, .maxlen = sizeof(sysctl_overcommit_memory), .mode = 0644, .proc_handler = overcommit_policy_handler, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { .procname = "overcommit_ratio", .data = &sysctl_overcommit_ratio, .maxlen = sizeof(sysctl_overcommit_ratio), .mode = 0644, .proc_handler = overcommit_ratio_handler, }, { .procname = "overcommit_kbytes", .data = &sysctl_overcommit_kbytes, .maxlen = sizeof(sysctl_overcommit_kbytes), .mode = 0644, .proc_handler = overcommit_kbytes_handler, }, { .procname = "user_reserve_kbytes", .data = &sysctl_user_reserve_kbytes, .maxlen = sizeof(sysctl_user_reserve_kbytes), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "admin_reserve_kbytes", .data = &sysctl_admin_reserve_kbytes, .maxlen = sizeof(sysctl_admin_reserve_kbytes), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, }; static int __init init_vm_util_sysctls(void) { register_sysctl_init("vm", util_sysctl_table); return 0; } subsys_initcall(init_vm_util_sysctls); #endif /* CONFIG_SYSCTL */ /* * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used */ unsigned long vm_commit_limit(void) { unsigned long allowed; if (sysctl_overcommit_kbytes) allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10); else allowed = ((totalram_pages() - hugetlb_total_pages()) * sysctl_overcommit_ratio / 100); allowed += total_swap_pages; return allowed; } /* * Make sure vm_committed_as in one cacheline and not cacheline shared with * other variables. It can be updated by several CPUs frequently. */ struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; /* * The global memory commitment made in the system can be a metric * that can be used to drive ballooning decisions when Linux is hosted * as a guest. On Hyper-V, the host implements a policy engine for dynamically * balancing memory across competing virtual machines that are hosted. * Several metrics drive this policy engine including the guest reported * memory commitment. * * The time cost of this is very low for small platforms, and for big * platform like a 2S/36C/72T Skylake server, in worst case where * vm_committed_as's spinlock is under severe contention, the time cost * could be about 30~40 microseconds. */ unsigned long vm_memory_committed(void) { return percpu_counter_sum_positive(&vm_committed_as); } EXPORT_SYMBOL_GPL(vm_memory_committed); /* * Check that a process has enough memory to allocate a new virtual * mapping. 0 means there is enough memory for the allocation to * succeed and -ENOMEM implies there is not. * * We currently support three overcommit policies, which are set via the * vm.overcommit_memory sysctl. See Documentation/mm/overcommit-accounting.rst * * Strict overcommit modes added 2002 Feb 26 by Alan Cox. * Additional code 2002 Jul 20 by Robert Love. * * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. * * Note this is a helper function intended to be used by LSMs which * wish to use this logic. */ int __vm_enough_memory(const struct mm_struct *mm, long pages, int cap_sys_admin) { long allowed; unsigned long bytes_failed; vm_acct_memory(pages); /* * Sometimes we want to use more memory than we have */ if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) return 0; if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { if (pages > totalram_pages() + total_swap_pages) goto error; return 0; } allowed = vm_commit_limit(); /* * Reserve some for root */ if (!cap_sys_admin) allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); /* * Don't let a single process grow so big a user can't recover */ if (mm) { long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); allowed -= min_t(long, mm->total_vm / 32, reserve); } if (percpu_counter_read_positive(&vm_committed_as) < allowed) return 0; error: bytes_failed = pages << PAGE_SHIFT; pr_warn_ratelimited("%s: pid: %d, comm: %s, bytes: %lu not enough memory for the allocation\n", __func__, current->pid, current->comm, bytes_failed); vm_unacct_memory(pages); return -ENOMEM; } /** * get_cmdline() - copy the cmdline value to a buffer. * @task: the task whose cmdline value to copy. * @buffer: the buffer to copy to. * @buflen: the length of the buffer. Larger cmdline values are truncated * to this length. * * Return: the size of the cmdline field copied. Note that the copy does * not guarantee an ending NULL byte. */ int get_cmdline(struct task_struct *task, char *buffer, int buflen) { int res = 0; unsigned int len; struct mm_struct *mm = get_task_mm(task); unsigned long arg_start, arg_end, env_start, env_end; if (!mm) goto out; if (!mm->arg_end) goto out_mm; /* Shh! No looking before we're done */ spin_lock(&mm->arg_lock); arg_start = mm->arg_start; arg_end = mm->arg_end; env_start = mm->env_start; env_end = mm->env_end; spin_unlock(&mm->arg_lock); len = arg_end - arg_start; if (len > buflen) len = buflen; res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE); /* * If the nul at the end of args has been overwritten, then * assume application is using setproctitle(3). */ if (res > 0 && buffer[res-1] != '\0' && len < buflen) { len = strnlen(buffer, res); if (len < res) { res = len; } else { len = env_end - env_start; if (len > buflen - res) len = buflen - res; res += access_process_vm(task, env_start, buffer+res, len, FOLL_FORCE); res = strnlen(buffer, res); } } out_mm: mmput(mm); out: return res; } int __weak memcmp_pages(struct page *page1, struct page *page2) { char *addr1, *addr2; int ret; addr1 = kmap_local_page(page1); addr2 = kmap_local_page(page2); ret = memcmp(addr1, addr2, PAGE_SIZE); kunmap_local(addr2); kunmap_local(addr1); return ret; } #ifdef CONFIG_PRINTK /** * mem_dump_obj - Print available provenance information * @object: object for which to find provenance information. * * This function uses pr_cont(), so that the caller is expected to have * printed out whatever preamble is appropriate. The provenance information * depends on the type of object and on how much debugging is enabled. * For example, for a slab-cache object, the slab name is printed, and, * if available, the return address and stack trace from the allocation * and last free path of that object. */ void mem_dump_obj(void *object) { const char *type; if (kmem_dump_obj(object)) return; if (vmalloc_dump_obj(object)) return; if (is_vmalloc_addr(object)) type = "vmalloc memory"; else if (virt_addr_valid(object)) type = "non-slab/vmalloc memory"; else if (object == NULL) type = "NULL pointer"; else if (object == ZERO_SIZE_PTR) type = "zero-size pointer"; else type = "non-paged memory"; pr_cont(" %s\n", type); } EXPORT_SYMBOL_GPL(mem_dump_obj); #endif /* * A driver might set a page logically offline -- PageOffline() -- and * turn the page inaccessible in the hypervisor; after that, access to page * content can be fatal. * * Some special PFN walkers -- i.e., /proc/kcore -- read content of random * pages after checking PageOffline(); however, these PFN walkers can race * with drivers that set PageOffline(). * * page_offline_freeze()/page_offline_thaw() allows for a subsystem to * synchronize with such drivers, achieving that a page cannot be set * PageOffline() while frozen. * * page_offline_begin()/page_offline_end() is used by drivers that care about * such races when setting a page PageOffline(). */ static DECLARE_RWSEM(page_offline_rwsem); void page_offline_freeze(void) { down_read(&page_offline_rwsem); } void page_offline_thaw(void) { up_read(&page_offline_rwsem); } void page_offline_begin(void) { down_write(&page_offline_rwsem); } EXPORT_SYMBOL(page_offline_begin); void page_offline_end(void) { up_write(&page_offline_rwsem); } EXPORT_SYMBOL(page_offline_end); #ifndef flush_dcache_folio void flush_dcache_folio(struct folio *folio) { long i, nr = folio_nr_pages(folio); for (i = 0; i < nr; i++) flush_dcache_page(folio_page(folio, i)); } EXPORT_SYMBOL(flush_dcache_folio); #endif /** * __compat_vma_mmap_prepare() - See description for compat_vma_mmap_prepare() * for details. This is the same operation, only with a specific file operations * struct which may or may not be the same as vma->vm_file->f_op. * @f_op: The file operations whose .mmap_prepare() hook is specified. * @file: The file which backs or will back the mapping. * @vma: The VMA to apply the .mmap_prepare() hook to. * Returns: 0 on success or error. */ int __compat_vma_mmap_prepare(const struct file_operations *f_op, struct file *file, struct vm_area_struct *vma) { struct vm_area_desc desc = { .mm = vma->vm_mm, .file = file, .start = vma->vm_start, .end = vma->vm_end, .pgoff = vma->vm_pgoff, .vm_file = vma->vm_file, .vm_flags = vma->vm_flags, .page_prot = vma->vm_page_prot, }; int err; err = f_op->mmap_prepare(&desc); if (err) return err; set_vma_from_desc(vma, &desc); return 0; } EXPORT_SYMBOL(__compat_vma_mmap_prepare); /** * compat_vma_mmap_prepare() - Apply the file's .mmap_prepare() hook to an * existing VMA. * @file: The file which possesss an f_op->mmap_prepare() hook. * @vma: The VMA to apply the .mmap_prepare() hook to. * * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain * stacked filesystems invoke a nested mmap hook of an underlying file. * * Until all filesystems are converted to use .mmap_prepare(), we must be * conservative and continue to invoke these stacked filesystems using the * deprecated .mmap() hook. * * However we have a problem if the underlying file system possesses an * .mmap_prepare() hook, as we are in a different context when we invoke the * .mmap() hook, already having a VMA to deal with. * * compat_vma_mmap_prepare() is a compatibility function that takes VMA state, * establishes a struct vm_area_desc descriptor, passes to the underlying * .mmap_prepare() hook and applies any changes performed by it. * * Once the conversion of filesystems is complete this function will no longer * be required and will be removed. * * Returns: 0 on success or error. */ int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma) { return __compat_vma_mmap_prepare(file->f_op, file, vma); } EXPORT_SYMBOL(compat_vma_mmap_prepare); static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio, const struct page *page) { /* * Only the first page of a high-order buddy page has PageBuddy() set. * So we have to check manually whether this page is part of a high- * order buddy page. */ if (PageBuddy(page)) ps->flags |= PAGE_SNAPSHOT_PG_BUDDY; else if (page_count(page) == 0 && is_free_buddy_page(page)) ps->flags |= PAGE_SNAPSHOT_PG_BUDDY; if (folio_test_idle(folio)) ps->flags |= PAGE_SNAPSHOT_PG_IDLE; } /** * snapshot_page() - Create a snapshot of a struct page * @ps: Pointer to a struct page_snapshot to store the page snapshot * @page: The page to snapshot * * Create a snapshot of the page and store both its struct page and struct * folio representations in @ps. * * A snapshot is marked as "faithful" if the compound state of @page was * stable and allowed safe reconstruction of the folio representation. In * rare cases where this is not possible (e.g. due to folio splitting), * snapshot_page() falls back to treating @page as a single page and the * snapshot is marked as "unfaithful". The snapshot_page_is_faithful() * helper can be used to check for this condition. */ void snapshot_page(struct page_snapshot *ps, const struct page *page) { unsigned long head, nr_pages = 1; struct folio *foliop; int loops = 5; ps->pfn = page_to_pfn(page); ps->flags = PAGE_SNAPSHOT_FAITHFUL; again: memset(&ps->folio_snapshot, 0, sizeof(struct folio)); memcpy(&ps->page_snapshot, page, sizeof(*page)); head = ps->page_snapshot.compound_head; if ((head & 1) == 0) { ps->idx = 0; foliop = (struct folio *)&ps->page_snapshot; if (!folio_test_large(foliop)) { set_ps_flags(ps, page_folio(page), page); memcpy(&ps->folio_snapshot, foliop, sizeof(struct page)); return; } foliop = (struct folio *)page; } else { foliop = (struct folio *)(head - 1); ps->idx = folio_page_idx(foliop, page); } if (ps->idx < MAX_FOLIO_NR_PAGES) { memcpy(&ps->folio_snapshot, foliop, 2 * sizeof(struct page)); nr_pages = folio_nr_pages(&ps->folio_snapshot); if (nr_pages > 1) memcpy(&ps->folio_snapshot.__page_2, &foliop->__page_2, sizeof(struct page)); set_ps_flags(ps, foliop, page); } if (ps->idx > nr_pages) { if (loops-- > 0) goto again; clear_compound_head(&ps->page_snapshot); foliop = (struct folio *)&ps->page_snapshot; memcpy(&ps->folio_snapshot, foliop, sizeof(struct page)); ps->flags = 0; ps->idx = 0; } } #ifdef CONFIG_MMU /** * folio_pte_batch - detect a PTE batch for a large folio * @folio: The large folio to detect a PTE batch for. * @ptep: Page table pointer for the first entry. * @pte: Page table entry for the first page. * @max_nr: The maximum number of table entries to consider. * * This is a simplified variant of folio_pte_batch_flags(). * * Detect a PTE batch: consecutive (present) PTEs that map consecutive * pages of the same large folio in a single VMA and a single page table. * * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN, * the accessed bit, writable bit, dirt-bit and soft-dirty bit. * * ptep must map any page of the folio. max_nr must be at least one and * must be limited by the caller so scanning cannot exceed a single VMA and * a single page table. * * Return: the number of table entries in the batch. */ unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte, unsigned int max_nr) { return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr, 0); } #endif /* CONFIG_MMU */ #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) /** * page_range_contiguous - test whether the page range is contiguous * @page: the start of the page range. * @nr_pages: the number of pages in the range. * * Test whether the page range is contiguous, such that they can be iterated * naively, corresponding to iterating a contiguous PFN range. * * This function should primarily only be used for debug checks, or when * working with page ranges that are not naturally contiguous (e.g., pages * within a folio are). * * Returns true if contiguous, otherwise false. */ bool page_range_contiguous(const struct page *page, unsigned long nr_pages) { const unsigned long start_pfn = page_to_pfn(page); const unsigned long end_pfn = start_pfn + nr_pages; unsigned long pfn; /* * The memmap is allocated per memory section, so no need to check * within the first section. However, we need to check each other * spanned memory section once, making sure the first page in a * section could similarly be reached by just iterating pages. */ for (pfn = ALIGN(start_pfn, PAGES_PER_SECTION); pfn < end_pfn; pfn += PAGES_PER_SECTION) if (unlikely(page + (pfn - start_pfn) != pfn_to_page(pfn))) return false; return true; } EXPORT_SYMBOL(page_range_contiguous); #endif |
| 2978 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _X_TABLES_H #define _X_TABLES_H #include <linux/netdevice.h> #include <linux/static_key.h> #include <linux/netfilter.h> #include <uapi/linux/netfilter/x_tables.h> /* Test a struct->invflags and a boolean for inequality */ #define NF_INVF(ptr, flag, boolean) \ ((boolean) ^ !!((ptr)->invflags & (flag))) /** * struct xt_action_param - parameters for matches/targets * * @match: the match extension * @target: the target extension * @matchinfo: per-match data * @targetinfo: per-target data * @state: pointer to hook state this packet came from * @fragoff: packet is a fragment, this is the data offset * @thoff: position of transport header relative to skb->data * * Fields written to by extensions: * * @hotdrop: drop packet if we had inspection problems */ struct xt_action_param { union { const struct xt_match *match; const struct xt_target *target; }; union { const void *matchinfo, *targinfo; }; const struct nf_hook_state *state; unsigned int thoff; u16 fragoff; bool hotdrop; }; static inline struct net *xt_net(const struct xt_action_param *par) { return par->state->net; } static inline struct net_device *xt_in(const struct xt_action_param *par) { return par->state->in; } static inline struct net_device *xt_out(const struct xt_action_param *par) { return par->state->out; } static inline unsigned int xt_hooknum(const struct xt_action_param *par) { return par->state->hook; } static inline u_int8_t xt_family(const struct xt_action_param *par) { return par->state->pf; } /** * struct xt_mtchk_param - parameters for match extensions' * checkentry functions * * @net: network namespace through which the check was invoked * @table: table the rule is tried to be inserted into * @entryinfo: the family-specific rule data * (struct ipt_ip, ip6t_ip, arpt_arp or (note) ebt_entry) * @match: struct xt_match through which this function was invoked * @matchinfo: per-match data * @hook_mask: via which hooks the new rule is reachable * Other fields as above. */ struct xt_mtchk_param { struct net *net; const char *table; const void *entryinfo; const struct xt_match *match; void *matchinfo; unsigned int hook_mask; u_int8_t family; bool nft_compat; }; /** * struct xt_mdtor_param - match destructor parameters * Fields as above. */ struct xt_mtdtor_param { struct net *net; const struct xt_match *match; void *matchinfo; u_int8_t family; }; /** * struct xt_tgchk_param - parameters for target extensions' * checkentry functions * * @entryinfo: the family-specific rule data * (struct ipt_entry, ip6t_entry, arpt_entry, ebt_entry) * * Other fields see above. */ struct xt_tgchk_param { struct net *net; const char *table; const void *entryinfo; const struct xt_target *target; void *targinfo; unsigned int hook_mask; u_int8_t family; bool nft_compat; }; /* Target destructor parameters */ struct xt_tgdtor_param { struct net *net; const struct xt_target *target; void *targinfo; u_int8_t family; }; struct xt_match { struct list_head list; const char name[XT_EXTENSION_MAXNAMELEN]; u_int8_t revision; /* Return true or false: return FALSE and set *hotdrop = 1 to force immediate packet drop. */ /* Arguments changed since 2.6.9, as this must now handle non-linear skb, using skb_header_pointer and skb_ip_make_writable. */ bool (*match)(const struct sk_buff *skb, struct xt_action_param *); /* Called when user tries to insert an entry of this type. */ int (*checkentry)(const struct xt_mtchk_param *); /* Called when entry of this type deleted. */ void (*destroy)(const struct xt_mtdtor_param *); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT /* Called when userspace align differs from kernel space one */ void (*compat_from_user)(void *dst, const void *src); int (*compat_to_user)(void __user *dst, const void *src); #endif /* Set this to THIS_MODULE if you are a module, otherwise NULL */ struct module *me; const char *table; unsigned int matchsize; unsigned int usersize; #ifdef CONFIG_NETFILTER_XTABLES_COMPAT unsigned int compatsize; #endif unsigned int hooks; unsigned short proto; unsigned short family; }; /* Registration hooks for targets. */ struct xt_target { struct list_head list; const char name[XT_EXTENSION_MAXNAMELEN]; u_int8_t revision; /* Returns verdict. Argument order changed since 2.6.9, as this must now handle non-linear skbs, using skb_copy_bits and skb_ip_make_writable. */ unsigned int (*target)(struct sk_buff *skb, const struct xt_action_param *); /* Called when user tries to insert an entry of this type: hook_mask is a bitmask of hooks from which it can be called. */ /* Should return 0 on success or an error code otherwise (-Exxxx). */ int (*checkentry)(const struct xt_tgchk_param *); /* Called when entry of this type deleted. */ void (*destroy)(const struct xt_tgdtor_param *); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT /* Called when userspace align differs from kernel space one */ void (*compat_from_user)(void *dst, const void *src); int (*compat_to_user)(void __user *dst, const void *src); #endif /* Set this to THIS_MODULE if you are a module, otherwise NULL */ struct module *me; const char *table; unsigned int targetsize; unsigned int usersize; #ifdef CONFIG_NETFILTER_XTABLES_COMPAT unsigned int compatsize; #endif unsigned int hooks; unsigned short proto; unsigned short family; }; /* Furniture shopping... */ struct xt_table { struct list_head list; /* What hooks you will enter on */ unsigned int valid_hooks; /* Man behind the curtain... */ struct xt_table_info *private; /* hook ops that register the table with the netfilter core */ struct nf_hook_ops *ops; /* Set this to THIS_MODULE if you are a module, otherwise NULL */ struct module *me; u_int8_t af; /* address/protocol family */ int priority; /* hook order */ /* A unique name... */ const char name[XT_TABLE_MAXNAMELEN]; }; #include <linux/netfilter_ipv4.h> /* The table itself */ struct xt_table_info { /* Size per table */ unsigned int size; /* Number of entries: FIXME. --RR */ unsigned int number; /* Initial number of entries. Needed for module usage count */ unsigned int initial_entries; /* Entry points and underflows */ unsigned int hook_entry[NF_INET_NUMHOOKS]; unsigned int underflow[NF_INET_NUMHOOKS]; /* * Number of user chains. Since tables cannot have loops, at most * @stacksize jumps (number of user chains) can possibly be made. */ unsigned int stacksize; void ***jumpstack; unsigned char entries[] __aligned(8); }; int xt_register_target(struct xt_target *target); void xt_unregister_target(struct xt_target *target); int xt_register_targets(struct xt_target *target, unsigned int n); void xt_unregister_targets(struct xt_target *target, unsigned int n); int xt_register_match(struct xt_match *target); void xt_unregister_match(struct xt_match *target); int xt_register_matches(struct xt_match *match, unsigned int n); void xt_unregister_matches(struct xt_match *match, unsigned int n); int xt_check_entry_offsets(const void *base, const char *elems, unsigned int target_offset, unsigned int next_offset); int xt_check_table_hooks(const struct xt_table_info *info, unsigned int valid_hooks); unsigned int *xt_alloc_entry_offsets(unsigned int size); bool xt_find_jump_offset(const unsigned int *offsets, unsigned int target, unsigned int size); int xt_check_proc_name(const char *name, unsigned int size); int xt_check_match(struct xt_mtchk_param *, unsigned int size, u16 proto, bool inv_proto); int xt_check_target(struct xt_tgchk_param *, unsigned int size, u16 proto, bool inv_proto); int xt_match_to_user(const struct xt_entry_match *m, struct xt_entry_match __user *u); int xt_target_to_user(const struct xt_entry_target *t, struct xt_entry_target __user *u); int xt_data_to_user(void __user *dst, const void *src, int usersize, int size, int aligned_size); void *xt_copy_counters(sockptr_t arg, unsigned int len, struct xt_counters_info *info); struct xt_counters *xt_counters_alloc(unsigned int counters); struct xt_table *xt_register_table(struct net *net, const struct xt_table *table, struct xt_table_info *bootstrap, struct xt_table_info *newinfo); void *xt_unregister_table(struct xt_table *table); struct xt_table_info *xt_replace_table(struct xt_table *table, unsigned int num_counters, struct xt_table_info *newinfo, int *error); struct xt_match *xt_find_match(u8 af, const char *name, u8 revision); struct xt_match *xt_request_find_match(u8 af, const char *name, u8 revision); struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision); int xt_find_revision(u8 af, const char *name, u8 revision, int target, int *err); struct xt_table *xt_find_table(struct net *net, u8 af, const char *name); struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, const char *name); struct xt_table *xt_request_find_table_lock(struct net *net, u_int8_t af, const char *name); void xt_table_unlock(struct xt_table *t); int xt_proto_init(struct net *net, u_int8_t af); void xt_proto_fini(struct net *net, u_int8_t af); struct xt_table_info *xt_alloc_table_info(unsigned int size); void xt_free_table_info(struct xt_table_info *info); /** * xt_recseq - recursive seqcount for netfilter use * * Packet processing changes the seqcount only if no recursion happened * get_counters() can use read_seqcount_begin()/read_seqcount_retry(), * because we use the normal seqcount convention : * Low order bit set to 1 if a writer is active. */ DECLARE_PER_CPU(seqcount_t, xt_recseq); /* xt_tee_enabled - true if x_tables needs to handle reentrancy * * Enabled if current ip(6)tables ruleset has at least one -j TEE rule. */ extern struct static_key xt_tee_enabled; /** * xt_write_recseq_begin - start of a write section * * Begin packet processing : all readers must wait the end * 1) Must be called with preemption disabled * 2) softirqs must be disabled too (or we should use this_cpu_add()) * Returns: * 1 if no recursion on this cpu * 0 if recursion detected */ static inline unsigned int xt_write_recseq_begin(void) { unsigned int addend; /* * Low order bit of sequence is set if we already * called xt_write_recseq_begin(). */ addend = (__this_cpu_read(xt_recseq.sequence) + 1) & 1; /* * This is kind of a write_seqcount_begin(), but addend is 0 or 1 * We dont check addend value to avoid a test and conditional jump, * since addend is most likely 1 */ __this_cpu_add(xt_recseq.sequence, addend); smp_mb(); return addend; } /** * xt_write_recseq_end - end of a write section * @addend: return value from previous xt_write_recseq_begin() * * End packet processing : all readers can proceed * 1) Must be called with preemption disabled * 2) softirqs must be disabled too (or we should use this_cpu_add()) */ static inline void xt_write_recseq_end(unsigned int addend) { /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */ smp_wmb(); __this_cpu_add(xt_recseq.sequence, addend); } /* * This helper is performance critical and must be inlined */ static inline unsigned long ifname_compare_aligned(const char *_a, const char *_b, const char *_mask) { const unsigned long *a = (const unsigned long *)_a; const unsigned long *b = (const unsigned long *)_b; const unsigned long *mask = (const unsigned long *)_mask; unsigned long ret; ret = (a[0] ^ b[0]) & mask[0]; if (IFNAMSIZ > sizeof(unsigned long)) ret |= (a[1] ^ b[1]) & mask[1]; if (IFNAMSIZ > 2 * sizeof(unsigned long)) ret |= (a[2] ^ b[2]) & mask[2]; if (IFNAMSIZ > 3 * sizeof(unsigned long)) ret |= (a[3] ^ b[3]) & mask[3]; BUILD_BUG_ON(IFNAMSIZ > 4 * sizeof(unsigned long)); return ret; } struct xt_percpu_counter_alloc_state { unsigned int off; const char __percpu *mem; }; bool xt_percpu_counter_alloc(struct xt_percpu_counter_alloc_state *state, struct xt_counters *counter); void xt_percpu_counter_free(struct xt_counters *cnt); static inline struct xt_counters * xt_get_this_cpu_counter(struct xt_counters *cnt) { if (nr_cpu_ids > 1) return this_cpu_ptr((void __percpu *) (unsigned long) cnt->pcnt); return cnt; } static inline struct xt_counters * xt_get_per_cpu_counter(struct xt_counters *cnt, unsigned int cpu) { if (nr_cpu_ids > 1) return per_cpu_ptr((void __percpu *) (unsigned long) cnt->pcnt, cpu); return cnt; } struct nf_hook_ops *xt_hook_ops_alloc(const struct xt_table *, nf_hookfn *); int xt_register_template(const struct xt_table *t, int(*table_init)(struct net *net)); void xt_unregister_template(const struct xt_table *t); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT #include <net/compat.h> struct compat_xt_entry_match { union { struct { u_int16_t match_size; char name[XT_FUNCTION_MAXNAMELEN - 1]; u_int8_t revision; } user; struct { u_int16_t match_size; compat_uptr_t match; } kernel; u_int16_t match_size; } u; unsigned char data[]; }; struct compat_xt_entry_target { union { struct { u_int16_t target_size; char name[XT_FUNCTION_MAXNAMELEN - 1]; u_int8_t revision; } user; struct { u_int16_t target_size; compat_uptr_t target; } kernel; u_int16_t target_size; } u; unsigned char data[]; }; /* FIXME: this works only on 32 bit tasks * need to change whole approach in order to calculate align as function of * current task alignment */ struct compat_xt_counters { compat_u64 pcnt, bcnt; /* Packet and byte counters */ }; struct compat_xt_counters_info { char name[XT_TABLE_MAXNAMELEN]; compat_uint_t num_counters; struct compat_xt_counters counters[]; }; struct _compat_xt_align { __u8 u8; __u16 u16; __u32 u32; compat_u64 u64; }; #define COMPAT_XT_ALIGN(s) __ALIGN_KERNEL((s), __alignof__(struct _compat_xt_align)) void xt_compat_lock(u_int8_t af); void xt_compat_unlock(u_int8_t af); int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta); void xt_compat_flush_offsets(u_int8_t af); int xt_compat_init_offsets(u8 af, unsigned int number); int xt_compat_calc_jump(u_int8_t af, unsigned int offset); int xt_compat_match_offset(const struct xt_match *match); void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr, unsigned int *size); int xt_compat_match_to_user(const struct xt_entry_match *m, void __user **dstptr, unsigned int *size); int xt_compat_target_offset(const struct xt_target *target); void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr, unsigned int *size); int xt_compat_target_to_user(const struct xt_entry_target *t, void __user **dstptr, unsigned int *size); int xt_compat_check_entry_offsets(const void *base, const char *elems, unsigned int target_offset, unsigned int next_offset); #endif /* CONFIG_NETFILTER_XTABLES_COMPAT */ #endif /* _X_TABLES_H */ |
| 1 1 1 1 1 1 1 1 1 1 1 1 6 2 1 2 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 | /* * Davicom DM96xx USB 10/100Mbps ethernet devices * * Peter Korsgaard <jacmet@sunsite.dk> * * This file is licensed under the terms of the GNU General Public License * version 2. This program is licensed "as is" without any warranty of any * kind, whether express or implied. */ //#define DEBUG #include <linux/module.h> #include <linux/sched.h> #include <linux/stddef.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/mii.h> #include <linux/usb.h> #include <linux/crc32.h> #include <linux/usb/usbnet.h> #include <linux/slab.h> /* datasheet: http://ptm2.cc.utu.fi/ftp/network/cards/DM9601/From_NET/DM9601-DS-P01-930914.pdf */ /* control requests */ #define DM_READ_REGS 0x00 #define DM_WRITE_REGS 0x01 #define DM_READ_MEMS 0x02 #define DM_WRITE_REG 0x03 #define DM_WRITE_MEMS 0x05 #define DM_WRITE_MEM 0x07 /* registers */ #define DM_NET_CTRL 0x00 #define DM_RX_CTRL 0x05 #define DM_SHARED_CTRL 0x0b #define DM_SHARED_ADDR 0x0c #define DM_SHARED_DATA 0x0d /* low + high */ #define DM_PHY_ADDR 0x10 /* 6 bytes */ #define DM_MCAST_ADDR 0x16 /* 8 bytes */ #define DM_GPR_CTRL 0x1e #define DM_GPR_DATA 0x1f #define DM_CHIP_ID 0x2c #define DM_MODE_CTRL 0x91 /* only on dm9620 */ /* chip id values */ #define ID_DM9601 0 #define ID_DM9620 1 #define DM_MAX_MCAST 64 #define DM_MCAST_SIZE 8 #define DM_EEPROM_LEN 256 #define DM_TX_OVERHEAD 2 /* 2 byte header */ #define DM_RX_OVERHEAD 7 /* 3 byte header + 4 byte crc tail */ #define DM_TIMEOUT 1000 static int dm_read(struct usbnet *dev, u8 reg, u16 length, void *data) { int err; err = usbnet_read_cmd(dev, DM_READ_REGS, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0, reg, data, length); if(err != length && err >= 0) err = -EINVAL; return err; } static int dm_read_reg(struct usbnet *dev, u8 reg, u8 *value) { return dm_read(dev, reg, 1, value); } static int dm_write(struct usbnet *dev, u8 reg, u16 length, void *data) { int err; err = usbnet_write_cmd(dev, DM_WRITE_REGS, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0, reg, data, length); if (err >= 0 && err < length) err = -EINVAL; return err; } static int dm_write_reg(struct usbnet *dev, u8 reg, u8 value) { return usbnet_write_cmd(dev, DM_WRITE_REG, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, value, reg, NULL, 0); } static void dm_write_async(struct usbnet *dev, u8 reg, u16 length, const void *data) { usbnet_write_cmd_async(dev, DM_WRITE_REGS, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0, reg, data, length); } static void dm_write_reg_async(struct usbnet *dev, u8 reg, u8 value) { usbnet_write_cmd_async(dev, DM_WRITE_REG, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, value, reg, NULL, 0); } static int dm_read_shared_word(struct usbnet *dev, int phy, u8 reg, __le16 *value) { int ret, i; mutex_lock(&dev->phy_mutex); dm_write_reg(dev, DM_SHARED_ADDR, phy ? (reg | 0x40) : reg); dm_write_reg(dev, DM_SHARED_CTRL, phy ? 0xc : 0x4); for (i = 0; i < DM_TIMEOUT; i++) { u8 tmp = 0; udelay(1); ret = dm_read_reg(dev, DM_SHARED_CTRL, &tmp); if (ret < 0) goto out; /* ready */ if ((tmp & 1) == 0) break; } if (i == DM_TIMEOUT) { netdev_err(dev->net, "%s read timed out!\n", phy ? "phy" : "eeprom"); ret = -EIO; goto out; } dm_write_reg(dev, DM_SHARED_CTRL, 0x0); ret = dm_read(dev, DM_SHARED_DATA, 2, value); netdev_dbg(dev->net, "read shared %d 0x%02x returned 0x%04x, %d\n", phy, reg, *value, ret); out: mutex_unlock(&dev->phy_mutex); return ret; } static int dm_write_shared_word(struct usbnet *dev, int phy, u8 reg, __le16 value) { int ret, i; mutex_lock(&dev->phy_mutex); ret = dm_write(dev, DM_SHARED_DATA, 2, &value); if (ret < 0) goto out; dm_write_reg(dev, DM_SHARED_ADDR, phy ? (reg | 0x40) : reg); dm_write_reg(dev, DM_SHARED_CTRL, phy ? 0x1a : 0x12); for (i = 0; i < DM_TIMEOUT; i++) { u8 tmp = 0; udelay(1); ret = dm_read_reg(dev, DM_SHARED_CTRL, &tmp); if (ret < 0) goto out; /* ready */ if ((tmp & 1) == 0) break; } if (i == DM_TIMEOUT) { netdev_err(dev->net, "%s write timed out!\n", phy ? "phy" : "eeprom"); ret = -EIO; goto out; } dm_write_reg(dev, DM_SHARED_CTRL, 0x0); out: mutex_unlock(&dev->phy_mutex); return ret; } static int dm_read_eeprom_word(struct usbnet *dev, u8 offset, void *value) { return dm_read_shared_word(dev, 0, offset, value); } static int dm9601_get_eeprom_len(struct net_device *dev) { return DM_EEPROM_LEN; } static int dm9601_get_eeprom(struct net_device *net, struct ethtool_eeprom *eeprom, u8 * data) { struct usbnet *dev = netdev_priv(net); __le16 *ebuf = (__le16 *) data; int i; /* access is 16bit */ if ((eeprom->offset % 2) || (eeprom->len % 2)) return -EINVAL; for (i = 0; i < eeprom->len / 2; i++) { if (dm_read_eeprom_word(dev, eeprom->offset / 2 + i, &ebuf[i]) < 0) return -EINVAL; } return 0; } static int dm9601_mdio_read(struct net_device *netdev, int phy_id, int loc) { struct usbnet *dev = netdev_priv(netdev); __le16 res; int err; if (phy_id) { netdev_dbg(dev->net, "Only internal phy supported\n"); return 0; } err = dm_read_shared_word(dev, 1, loc, &res); if (err < 0) { netdev_err(dev->net, "MDIO read error: %d\n", err); return 0; } netdev_dbg(dev->net, "dm9601_mdio_read() phy_id=0x%02x, loc=0x%02x, returns=0x%04x\n", phy_id, loc, le16_to_cpu(res)); return le16_to_cpu(res); } static void dm9601_mdio_write(struct net_device *netdev, int phy_id, int loc, int val) { struct usbnet *dev = netdev_priv(netdev); __le16 res = cpu_to_le16(val); if (phy_id) { netdev_dbg(dev->net, "Only internal phy supported\n"); return; } netdev_dbg(dev->net, "dm9601_mdio_write() phy_id=0x%02x, loc=0x%02x, val=0x%04x\n", phy_id, loc, val); dm_write_shared_word(dev, 1, loc, res); } static void dm9601_get_drvinfo(struct net_device *net, struct ethtool_drvinfo *info) { /* Inherit standard device info */ usbnet_get_drvinfo(net, info); } static u32 dm9601_get_link(struct net_device *net) { struct usbnet *dev = netdev_priv(net); return mii_link_ok(&dev->mii); } static int dm9601_ioctl(struct net_device *net, struct ifreq *rq, int cmd) { struct usbnet *dev = netdev_priv(net); return generic_mii_ioctl(&dev->mii, if_mii(rq), cmd, NULL); } static const struct ethtool_ops dm9601_ethtool_ops = { .get_drvinfo = dm9601_get_drvinfo, .get_link = dm9601_get_link, .get_msglevel = usbnet_get_msglevel, .set_msglevel = usbnet_set_msglevel, .get_eeprom_len = dm9601_get_eeprom_len, .get_eeprom = dm9601_get_eeprom, .nway_reset = usbnet_nway_reset, .get_link_ksettings = usbnet_get_link_ksettings_mii, .set_link_ksettings = usbnet_set_link_ksettings_mii, }; static void dm9601_set_multicast(struct net_device *net) { struct usbnet *dev = netdev_priv(net); /* We use the 20 byte dev->data for our 8 byte filter buffer * to avoid allocating memory that is tricky to free later */ u8 *hashes = (u8 *) & dev->data; u8 rx_ctl = 0x31; memset(hashes, 0x00, DM_MCAST_SIZE); hashes[DM_MCAST_SIZE - 1] |= 0x80; /* broadcast address */ if (net->flags & IFF_PROMISC) { rx_ctl |= 0x02; } else if (net->flags & IFF_ALLMULTI || netdev_mc_count(net) > DM_MAX_MCAST) { rx_ctl |= 0x08; } else if (!netdev_mc_empty(net)) { struct netdev_hw_addr *ha; netdev_for_each_mc_addr(ha, net) { u32 crc = ether_crc(ETH_ALEN, ha->addr) >> 26; hashes[crc >> 3] |= 1 << (crc & 0x7); } } dm_write_async(dev, DM_MCAST_ADDR, DM_MCAST_SIZE, hashes); dm_write_reg_async(dev, DM_RX_CTRL, rx_ctl); } static void __dm9601_set_mac_address(struct usbnet *dev) { dm_write_async(dev, DM_PHY_ADDR, ETH_ALEN, dev->net->dev_addr); } static int dm9601_set_mac_address(struct net_device *net, void *p) { struct sockaddr *addr = p; struct usbnet *dev = netdev_priv(net); if (!is_valid_ether_addr(addr->sa_data)) { dev_err(&net->dev, "not setting invalid mac address %pM\n", addr->sa_data); return -EINVAL; } eth_hw_addr_set(net, addr->sa_data); __dm9601_set_mac_address(dev); return 0; } static const struct net_device_ops dm9601_netdev_ops = { .ndo_open = usbnet_open, .ndo_stop = usbnet_stop, .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_change_mtu = usbnet_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_validate_addr = eth_validate_addr, .ndo_eth_ioctl = dm9601_ioctl, .ndo_set_rx_mode = dm9601_set_multicast, .ndo_set_mac_address = dm9601_set_mac_address, }; static int dm9601_bind(struct usbnet *dev, struct usb_interface *intf) { int ret; u8 mac[ETH_ALEN], id; ret = usbnet_get_endpoints(dev, intf); if (ret) goto out; dev->net->netdev_ops = &dm9601_netdev_ops; dev->net->ethtool_ops = &dm9601_ethtool_ops; dev->net->hard_header_len += DM_TX_OVERHEAD; dev->hard_mtu = dev->net->mtu + dev->net->hard_header_len; /* dm9620/21a require room for 4 byte padding, even in dm9601 * mode, so we need +1 to be able to receive full size * ethernet frames. */ dev->rx_urb_size = dev->net->mtu + ETH_HLEN + DM_RX_OVERHEAD + 1; dev->mii.dev = dev->net; dev->mii.mdio_read = dm9601_mdio_read; dev->mii.mdio_write = dm9601_mdio_write; dev->mii.phy_id_mask = 0x1f; dev->mii.reg_num_mask = 0x1f; /* reset */ dm_write_reg(dev, DM_NET_CTRL, 1); udelay(20); /* read MAC */ if (dm_read(dev, DM_PHY_ADDR, ETH_ALEN, mac) < 0) { printk(KERN_ERR "Error reading MAC address\n"); ret = -ENODEV; goto out; } /* * Overwrite the auto-generated address only with good ones. */ if (is_valid_ether_addr(mac)) eth_hw_addr_set(dev->net, mac); else { printk(KERN_WARNING "dm9601: No valid MAC address in EEPROM, using %pM\n", dev->net->dev_addr); __dm9601_set_mac_address(dev); } if (dm_read_reg(dev, DM_CHIP_ID, &id) < 0) { netdev_err(dev->net, "Error reading chip ID\n"); ret = -ENODEV; goto out; } /* put dm9620 devices in dm9601 mode */ if (id == ID_DM9620) { u8 mode; if (dm_read_reg(dev, DM_MODE_CTRL, &mode) < 0) { netdev_err(dev->net, "Error reading MODE_CTRL\n"); ret = -ENODEV; goto out; } dm_write_reg(dev, DM_MODE_CTRL, mode & 0x7f); } /* power up phy */ dm_write_reg(dev, DM_GPR_CTRL, 1); dm_write_reg(dev, DM_GPR_DATA, 0); /* receive broadcast packets */ dm9601_set_multicast(dev->net); dm9601_mdio_write(dev->net, dev->mii.phy_id, MII_BMCR, BMCR_RESET); dm9601_mdio_write(dev->net, dev->mii.phy_id, MII_ADVERTISE, ADVERTISE_ALL | ADVERTISE_CSMA | ADVERTISE_PAUSE_CAP); mii_nway_restart(&dev->mii); out: return ret; } static int dm9601_rx_fixup(struct usbnet *dev, struct sk_buff *skb) { u8 status; int len; /* format: b1: rx status b2: packet length (incl crc) low b3: packet length (incl crc) high b4..n-4: packet data bn-3..bn: ethernet crc */ if (unlikely(skb->len < DM_RX_OVERHEAD)) { dev_err(&dev->udev->dev, "unexpected tiny rx frame\n"); return 0; } status = skb->data[0]; len = (skb->data[1] | (skb->data[2] << 8)) - 4; if (unlikely(status & 0xbf)) { if (status & 0x01) dev->net->stats.rx_fifo_errors++; if (status & 0x02) dev->net->stats.rx_crc_errors++; if (status & 0x04) dev->net->stats.rx_frame_errors++; if (status & 0x20) dev->net->stats.rx_missed_errors++; if (status & 0x90) dev->net->stats.rx_length_errors++; return 0; } skb_pull(skb, 3); skb_trim(skb, len); return 1; } static struct sk_buff *dm9601_tx_fixup(struct usbnet *dev, struct sk_buff *skb, gfp_t flags) { int len, pad; /* format: b1: packet length low b2: packet length high b3..n: packet data */ len = skb->len + DM_TX_OVERHEAD; /* workaround for dm962x errata with tx fifo getting out of * sync if a USB bulk transfer retry happens right after a * packet with odd / maxpacket length by adding up to 3 bytes * padding. */ while ((len & 1) || !(len % dev->maxpacket)) len++; len -= DM_TX_OVERHEAD; /* hw header doesn't count as part of length */ pad = len - skb->len; if (skb_headroom(skb) < DM_TX_OVERHEAD || skb_tailroom(skb) < pad) { struct sk_buff *skb2; skb2 = skb_copy_expand(skb, DM_TX_OVERHEAD, pad, flags); dev_kfree_skb_any(skb); skb = skb2; if (!skb) return NULL; } __skb_push(skb, DM_TX_OVERHEAD); if (pad) { memset(skb->data + skb->len, 0, pad); __skb_put(skb, pad); } skb->data[0] = len; skb->data[1] = len >> 8; return skb; } static void dm9601_status(struct usbnet *dev, struct urb *urb) { int link; u8 *buf; /* format: b0: net status b1: tx status 1 b2: tx status 2 b3: rx status b4: rx overflow b5: rx count b6: tx count b7: gpr */ if (urb->actual_length < 8) return; buf = urb->transfer_buffer; link = !!(buf[0] & 0x40); if (netif_carrier_ok(dev->net) != link) { usbnet_link_change(dev, link, 1); netdev_dbg(dev->net, "Link Status is: %d\n", link); } } static int dm9601_link_reset(struct usbnet *dev) { struct ethtool_cmd ecmd = { .cmd = ETHTOOL_GSET }; mii_check_media(&dev->mii, 1, 1); mii_ethtool_gset(&dev->mii, &ecmd); netdev_dbg(dev->net, "link_reset() speed: %u duplex: %d\n", ethtool_cmd_speed(&ecmd), ecmd.duplex); return 0; } static const struct driver_info dm9601_info = { .description = "Davicom DM96xx USB 10/100 Ethernet", .flags = FLAG_ETHER | FLAG_LINK_INTR, .bind = dm9601_bind, .rx_fixup = dm9601_rx_fixup, .tx_fixup = dm9601_tx_fixup, .status = dm9601_status, .link_reset = dm9601_link_reset, .reset = dm9601_link_reset, }; static const struct usb_device_id products[] = { { USB_DEVICE(0x07aa, 0x9601), /* Corega FEther USB-TXC */ .driver_info = (unsigned long)&dm9601_info, }, { USB_DEVICE(0x0a46, 0x9601), /* Davicom USB-100 */ .driver_info = (unsigned long)&dm9601_info, }, { USB_DEVICE(0x0a46, 0x6688), /* ZT6688 USB NIC */ .driver_info = (unsigned long)&dm9601_info, }, { USB_DEVICE(0x0a46, 0x0268), /* ShanTou ST268 USB NIC */ .driver_info = (unsigned long)&dm9601_info, }, { USB_DEVICE(0x0a46, 0x8515), /* ADMtek ADM8515 USB NIC */ .driver_info = (unsigned long)&dm9601_info, }, { USB_DEVICE(0x0a47, 0x9601), /* Hirose USB-100 */ .driver_info = (unsigned long)&dm9601_info, }, { USB_DEVICE(0x0fe6, 0x8101), /* DM9601 USB to Fast Ethernet Adapter */ .driver_info = (unsigned long)&dm9601_info, }, { USB_DEVICE(0x0fe6, 0x9700), /* DM9601 USB to Fast Ethernet Adapter */ .driver_info = (unsigned long)&dm9601_info, }, { USB_DEVICE(0x0a46, 0x9000), /* DM9000E */ .driver_info = (unsigned long)&dm9601_info, }, { USB_DEVICE(0x0a46, 0x9620), /* DM9620 USB to Fast Ethernet Adapter */ .driver_info = (unsigned long)&dm9601_info, }, { USB_DEVICE(0x0a46, 0x9621), /* DM9621A USB to Fast Ethernet Adapter */ .driver_info = (unsigned long)&dm9601_info, }, { USB_DEVICE(0x0a46, 0x9622), /* DM9622 USB to Fast Ethernet Adapter */ .driver_info = (unsigned long)&dm9601_info, }, { USB_DEVICE(0x0a46, 0x0269), /* DM962OA USB to Fast Ethernet Adapter */ .driver_info = (unsigned long)&dm9601_info, }, { USB_DEVICE(0x0a46, 0x1269), /* DM9621A USB to Fast Ethernet Adapter */ .driver_info = (unsigned long)&dm9601_info, }, { USB_DEVICE(0x0586, 0x3427), /* ZyXEL Keenetic Plus DSL xDSL modem */ .driver_info = (unsigned long)&dm9601_info, }, {}, // END }; MODULE_DEVICE_TABLE(usb, products); static struct usb_driver dm9601_driver = { .name = "dm9601", .id_table = products, .probe = usbnet_probe, .disconnect = usbnet_disconnect, .suspend = usbnet_suspend, .resume = usbnet_resume, .disable_hub_initiated_lpm = 1, }; module_usb_driver(dm9601_driver); MODULE_AUTHOR("Peter Korsgaard <jacmet@sunsite.dk>"); MODULE_DESCRIPTION("Davicom DM96xx USB 10/100 ethernet devices"); MODULE_LICENSE("GPL"); |
| 8 6 2 2 7 5 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 | // SPDX-License-Identifier: GPL-2.0-or-later /* Kernel cryptographic api. * cast5.c - Cast5 cipher algorithm (rfc2144). * * Derived from GnuPG implementation of cast5. * * Major Changes. * Complete conformance to rfc2144. * Supports key size from 40 to 128 bits. * * Copyright (C) 1998, 1999, 2000, 2001 Free Software Foundation, Inc. * Copyright (C) 2003 Kartikey Mahendra Bhatt <kartik_me@hotmail.com>. */ #include <linux/unaligned.h> #include <crypto/algapi.h> #include <linux/init.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/types.h> #include <crypto/cast5.h> static const u32 s5[256] = { 0x7ec90c04, 0x2c6e74b9, 0x9b0e66df, 0xa6337911, 0xb86a7fff, 0x1dd358f5, 0x44dd9d44, 0x1731167f, 0x08fbf1fa, 0xe7f511cc, 0xd2051b00, 0x735aba00, 0x2ab722d8, 0x386381cb, 0xacf6243a, 0x69befd7a, 0xe6a2e77f, 0xf0c720cd, 0xc4494816, 0xccf5c180, 0x38851640, 0x15b0a848, 0xe68b18cb, 0x4caadeff, 0x5f480a01, 0x0412b2aa, 0x259814fc, 0x41d0efe2, 0x4e40b48d, 0x248eb6fb, 0x8dba1cfe, 0x41a99b02, 0x1a550a04, 0xba8f65cb, 0x7251f4e7, 0x95a51725, 0xc106ecd7, 0x97a5980a, 0xc539b9aa, 0x4d79fe6a, 0xf2f3f763, 0x68af8040, 0xed0c9e56, 0x11b4958b, 0xe1eb5a88, 0x8709e6b0, 0xd7e07156, 0x4e29fea7, 0x6366e52d, 0x02d1c000, 0xc4ac8e05, 0x9377f571, 0x0c05372a, 0x578535f2, 0x2261be02, 0xd642a0c9, 0xdf13a280, 0x74b55bd2, 0x682199c0, 0xd421e5ec, 0x53fb3ce8, 0xc8adedb3, 0x28a87fc9, 0x3d959981, 0x5c1ff900, 0xfe38d399, 0x0c4eff0b, 0x062407ea, 0xaa2f4fb1, 0x4fb96976, 0x90c79505, 0xb0a8a774, 0xef55a1ff, 0xe59ca2c2, 0xa6b62d27, 0xe66a4263, 0xdf65001f, 0x0ec50966, 0xdfdd55bc, 0x29de0655, 0x911e739a, 0x17af8975, 0x32c7911c, 0x89f89468, 0x0d01e980, 0x524755f4, 0x03b63cc9, 0x0cc844b2, 0xbcf3f0aa, 0x87ac36e9, 0xe53a7426, 0x01b3d82b, 0x1a9e7449, 0x64ee2d7e, 0xcddbb1da, 0x01c94910, 0xb868bf80, 0x0d26f3fd, 0x9342ede7, 0x04a5c284, 0x636737b6, 0x50f5b616, 0xf24766e3, 0x8eca36c1, 0x136e05db, 0xfef18391, 0xfb887a37, 0xd6e7f7d4, 0xc7fb7dc9, 0x3063fcdf, 0xb6f589de, 0xec2941da, 0x26e46695, 0xb7566419, 0xf654efc5, 0xd08d58b7, 0x48925401, 0xc1bacb7f, 0xe5ff550f, 0xb6083049, 0x5bb5d0e8, 0x87d72e5a, 0xab6a6ee1, 0x223a66ce, 0xc62bf3cd, 0x9e0885f9, 0x68cb3e47, 0x086c010f, 0xa21de820, 0xd18b69de, 0xf3f65777, 0xfa02c3f6, 0x407edac3, 0xcbb3d550, 0x1793084d, 0xb0d70eba, 0x0ab378d5, 0xd951fb0c, 0xded7da56, 0x4124bbe4, 0x94ca0b56, 0x0f5755d1, 0xe0e1e56e, 0x6184b5be, 0x580a249f, 0x94f74bc0, 0xe327888e, 0x9f7b5561, 0xc3dc0280, 0x05687715, 0x646c6bd7, 0x44904db3, 0x66b4f0a3, 0xc0f1648a, 0x697ed5af, 0x49e92ff6, 0x309e374f, 0x2cb6356a, 0x85808573, 0x4991f840, 0x76f0ae02, 0x083be84d, 0x28421c9a, 0x44489406, 0x736e4cb8, 0xc1092910, 0x8bc95fc6, 0x7d869cf4, 0x134f616f, 0x2e77118d, 0xb31b2be1, 0xaa90b472, 0x3ca5d717, 0x7d161bba, 0x9cad9010, 0xaf462ba2, 0x9fe459d2, 0x45d34559, 0xd9f2da13, 0xdbc65487, 0xf3e4f94e, 0x176d486f, 0x097c13ea, 0x631da5c7, 0x445f7382, 0x175683f4, 0xcdc66a97, 0x70be0288, 0xb3cdcf72, 0x6e5dd2f3, 0x20936079, 0x459b80a5, 0xbe60e2db, 0xa9c23101, 0xeba5315c, 0x224e42f2, 0x1c5c1572, 0xf6721b2c, 0x1ad2fff3, 0x8c25404e, 0x324ed72f, 0x4067b7fd, 0x0523138e, 0x5ca3bc78, 0xdc0fd66e, 0x75922283, 0x784d6b17, 0x58ebb16e, 0x44094f85, 0x3f481d87, 0xfcfeae7b, 0x77b5ff76, 0x8c2302bf, 0xaaf47556, 0x5f46b02a, 0x2b092801, 0x3d38f5f7, 0x0ca81f36, 0x52af4a8a, 0x66d5e7c0, 0xdf3b0874, 0x95055110, 0x1b5ad7a8, 0xf61ed5ad, 0x6cf6e479, 0x20758184, 0xd0cefa65, 0x88f7be58, 0x4a046826, 0x0ff6f8f3, 0xa09c7f70, 0x5346aba0, 0x5ce96c28, 0xe176eda3, 0x6bac307f, 0x376829d2, 0x85360fa9, 0x17e3fe2a, 0x24b79767, 0xf5a96b20, 0xd6cd2595, 0x68ff1ebf, 0x7555442c, 0xf19f06be, 0xf9e0659a, 0xeeb9491d, 0x34010718, 0xbb30cab8, 0xe822fe15, 0x88570983, 0x750e6249, 0xda627e55, 0x5e76ffa8, 0xb1534546, 0x6d47de08, 0xefe9e7d4 }; static const u32 s6[256] = { 0xf6fa8f9d, 0x2cac6ce1, 0x4ca34867, 0xe2337f7c, 0x95db08e7, 0x016843b4, 0xeced5cbc, 0x325553ac, 0xbf9f0960, 0xdfa1e2ed, 0x83f0579d, 0x63ed86b9, 0x1ab6a6b8, 0xde5ebe39, 0xf38ff732, 0x8989b138, 0x33f14961, 0xc01937bd, 0xf506c6da, 0xe4625e7e, 0xa308ea99, 0x4e23e33c, 0x79cbd7cc, 0x48a14367, 0xa3149619, 0xfec94bd5, 0xa114174a, 0xeaa01866, 0xa084db2d, 0x09a8486f, 0xa888614a, 0x2900af98, 0x01665991, 0xe1992863, 0xc8f30c60, 0x2e78ef3c, 0xd0d51932, 0xcf0fec14, 0xf7ca07d2, 0xd0a82072, 0xfd41197e, 0x9305a6b0, 0xe86be3da, 0x74bed3cd, 0x372da53c, 0x4c7f4448, 0xdab5d440, 0x6dba0ec3, 0x083919a7, 0x9fbaeed9, 0x49dbcfb0, 0x4e670c53, 0x5c3d9c01, 0x64bdb941, 0x2c0e636a, 0xba7dd9cd, 0xea6f7388, 0xe70bc762, 0x35f29adb, 0x5c4cdd8d, 0xf0d48d8c, 0xb88153e2, 0x08a19866, 0x1ae2eac8, 0x284caf89, 0xaa928223, 0x9334be53, 0x3b3a21bf, 0x16434be3, 0x9aea3906, 0xefe8c36e, 0xf890cdd9, 0x80226dae, 0xc340a4a3, 0xdf7e9c09, 0xa694a807, 0x5b7c5ecc, 0x221db3a6, 0x9a69a02f, 0x68818a54, 0xceb2296f, 0x53c0843a, 0xfe893655, 0x25bfe68a, 0xb4628abc, 0xcf222ebf, 0x25ac6f48, 0xa9a99387, 0x53bddb65, 0xe76ffbe7, 0xe967fd78, 0x0ba93563, 0x8e342bc1, 0xe8a11be9, 0x4980740d, 0xc8087dfc, 0x8de4bf99, 0xa11101a0, 0x7fd37975, 0xda5a26c0, 0xe81f994f, 0x9528cd89, 0xfd339fed, 0xb87834bf, 0x5f04456d, 0x22258698, 0xc9c4c83b, 0x2dc156be, 0x4f628daa, 0x57f55ec5, 0xe2220abe, 0xd2916ebf, 0x4ec75b95, 0x24f2c3c0, 0x42d15d99, 0xcd0d7fa0, 0x7b6e27ff, 0xa8dc8af0, 0x7345c106, 0xf41e232f, 0x35162386, 0xe6ea8926, 0x3333b094, 0x157ec6f2, 0x372b74af, 0x692573e4, 0xe9a9d848, 0xf3160289, 0x3a62ef1d, 0xa787e238, 0xf3a5f676, 0x74364853, 0x20951063, 0x4576698d, 0xb6fad407, 0x592af950, 0x36f73523, 0x4cfb6e87, 0x7da4cec0, 0x6c152daa, 0xcb0396a8, 0xc50dfe5d, 0xfcd707ab, 0x0921c42f, 0x89dff0bb, 0x5fe2be78, 0x448f4f33, 0x754613c9, 0x2b05d08d, 0x48b9d585, 0xdc049441, 0xc8098f9b, 0x7dede786, 0xc39a3373, 0x42410005, 0x6a091751, 0x0ef3c8a6, 0x890072d6, 0x28207682, 0xa9a9f7be, 0xbf32679d, 0xd45b5b75, 0xb353fd00, 0xcbb0e358, 0x830f220a, 0x1f8fb214, 0xd372cf08, 0xcc3c4a13, 0x8cf63166, 0x061c87be, 0x88c98f88, 0x6062e397, 0x47cf8e7a, 0xb6c85283, 0x3cc2acfb, 0x3fc06976, 0x4e8f0252, 0x64d8314d, 0xda3870e3, 0x1e665459, 0xc10908f0, 0x513021a5, 0x6c5b68b7, 0x822f8aa0, 0x3007cd3e, 0x74719eef, 0xdc872681, 0x073340d4, 0x7e432fd9, 0x0c5ec241, 0x8809286c, 0xf592d891, 0x08a930f6, 0x957ef305, 0xb7fbffbd, 0xc266e96f, 0x6fe4ac98, 0xb173ecc0, 0xbc60b42a, 0x953498da, 0xfba1ae12, 0x2d4bd736, 0x0f25faab, 0xa4f3fceb, 0xe2969123, 0x257f0c3d, 0x9348af49, 0x361400bc, 0xe8816f4a, 0x3814f200, 0xa3f94043, 0x9c7a54c2, 0xbc704f57, 0xda41e7f9, 0xc25ad33a, 0x54f4a084, 0xb17f5505, 0x59357cbe, 0xedbd15c8, 0x7f97c5ab, 0xba5ac7b5, 0xb6f6deaf, 0x3a479c3a, 0x5302da25, 0x653d7e6a, 0x54268d49, 0x51a477ea, 0x5017d55b, 0xd7d25d88, 0x44136c76, 0x0404a8c8, 0xb8e5a121, 0xb81a928a, 0x60ed5869, 0x97c55b96, 0xeaec991b, 0x29935913, 0x01fdb7f1, 0x088e8dfa, 0x9ab6f6f5, 0x3b4cbf9f, 0x4a5de3ab, 0xe6051d35, 0xa0e1d855, 0xd36b4cf1, 0xf544edeb, 0xb0e93524, 0xbebb8fbd, 0xa2d762cf, 0x49c92f54, 0x38b5f331, 0x7128a454, 0x48392905, 0xa65b1db8, 0x851c97bd, 0xd675cf2f }; static const u32 s7[256] = { 0x85e04019, 0x332bf567, 0x662dbfff, 0xcfc65693, 0x2a8d7f6f, 0xab9bc912, 0xde6008a1, 0x2028da1f, 0x0227bce7, 0x4d642916, 0x18fac300, 0x50f18b82, 0x2cb2cb11, 0xb232e75c, 0x4b3695f2, 0xb28707de, 0xa05fbcf6, 0xcd4181e9, 0xe150210c, 0xe24ef1bd, 0xb168c381, 0xfde4e789, 0x5c79b0d8, 0x1e8bfd43, 0x4d495001, 0x38be4341, 0x913cee1d, 0x92a79c3f, 0x089766be, 0xbaeeadf4, 0x1286becf, 0xb6eacb19, 0x2660c200, 0x7565bde4, 0x64241f7a, 0x8248dca9, 0xc3b3ad66, 0x28136086, 0x0bd8dfa8, 0x356d1cf2, 0x107789be, 0xb3b2e9ce, 0x0502aa8f, 0x0bc0351e, 0x166bf52a, 0xeb12ff82, 0xe3486911, 0xd34d7516, 0x4e7b3aff, 0x5f43671b, 0x9cf6e037, 0x4981ac83, 0x334266ce, 0x8c9341b7, 0xd0d854c0, 0xcb3a6c88, 0x47bc2829, 0x4725ba37, 0xa66ad22b, 0x7ad61f1e, 0x0c5cbafa, 0x4437f107, 0xb6e79962, 0x42d2d816, 0x0a961288, 0xe1a5c06e, 0x13749e67, 0x72fc081a, 0xb1d139f7, 0xf9583745, 0xcf19df58, 0xbec3f756, 0xc06eba30, 0x07211b24, 0x45c28829, 0xc95e317f, 0xbc8ec511, 0x38bc46e9, 0xc6e6fa14, 0xbae8584a, 0xad4ebc46, 0x468f508b, 0x7829435f, 0xf124183b, 0x821dba9f, 0xaff60ff4, 0xea2c4e6d, 0x16e39264, 0x92544a8b, 0x009b4fc3, 0xaba68ced, 0x9ac96f78, 0x06a5b79a, 0xb2856e6e, 0x1aec3ca9, 0xbe838688, 0x0e0804e9, 0x55f1be56, 0xe7e5363b, 0xb3a1f25d, 0xf7debb85, 0x61fe033c, 0x16746233, 0x3c034c28, 0xda6d0c74, 0x79aac56c, 0x3ce4e1ad, 0x51f0c802, 0x98f8f35a, 0x1626a49f, 0xeed82b29, 0x1d382fe3, 0x0c4fb99a, 0xbb325778, 0x3ec6d97b, 0x6e77a6a9, 0xcb658b5c, 0xd45230c7, 0x2bd1408b, 0x60c03eb7, 0xb9068d78, 0xa33754f4, 0xf430c87d, 0xc8a71302, 0xb96d8c32, 0xebd4e7be, 0xbe8b9d2d, 0x7979fb06, 0xe7225308, 0x8b75cf77, 0x11ef8da4, 0xe083c858, 0x8d6b786f, 0x5a6317a6, 0xfa5cf7a0, 0x5dda0033, 0xf28ebfb0, 0xf5b9c310, 0xa0eac280, 0x08b9767a, 0xa3d9d2b0, 0x79d34217, 0x021a718d, 0x9ac6336a, 0x2711fd60, 0x438050e3, 0x069908a8, 0x3d7fedc4, 0x826d2bef, 0x4eeb8476, 0x488dcf25, 0x36c9d566, 0x28e74e41, 0xc2610aca, 0x3d49a9cf, 0xbae3b9df, 0xb65f8de6, 0x92aeaf64, 0x3ac7d5e6, 0x9ea80509, 0xf22b017d, 0xa4173f70, 0xdd1e16c3, 0x15e0d7f9, 0x50b1b887, 0x2b9f4fd5, 0x625aba82, 0x6a017962, 0x2ec01b9c, 0x15488aa9, 0xd716e740, 0x40055a2c, 0x93d29a22, 0xe32dbf9a, 0x058745b9, 0x3453dc1e, 0xd699296e, 0x496cff6f, 0x1c9f4986, 0xdfe2ed07, 0xb87242d1, 0x19de7eae, 0x053e561a, 0x15ad6f8c, 0x66626c1c, 0x7154c24c, 0xea082b2a, 0x93eb2939, 0x17dcb0f0, 0x58d4f2ae, 0x9ea294fb, 0x52cf564c, 0x9883fe66, 0x2ec40581, 0x763953c3, 0x01d6692e, 0xd3a0c108, 0xa1e7160e, 0xe4f2dfa6, 0x693ed285, 0x74904698, 0x4c2b0edd, 0x4f757656, 0x5d393378, 0xa132234f, 0x3d321c5d, 0xc3f5e194, 0x4b269301, 0xc79f022f, 0x3c997e7e, 0x5e4f9504, 0x3ffafbbd, 0x76f7ad0e, 0x296693f4, 0x3d1fce6f, 0xc61e45be, 0xd3b5ab34, 0xf72bf9b7, 0x1b0434c0, 0x4e72b567, 0x5592a33d, 0xb5229301, 0xcfd2a87f, 0x60aeb767, 0x1814386b, 0x30bcc33d, 0x38a0c07d, 0xfd1606f2, 0xc363519b, 0x589dd390, 0x5479f8e6, 0x1cb8d647, 0x97fd61a9, 0xea7759f4, 0x2d57539d, 0x569a58cf, 0xe84e63ad, 0x462e1b78, 0x6580f87e, 0xf3817914, 0x91da55f4, 0x40a230f3, 0xd1988f35, 0xb6e318d2, 0x3ffa50bc, 0x3d40f021, 0xc3c0bdae, 0x4958c24c, 0x518f36b2, 0x84b1d370, 0x0fedce83, 0x878ddada, 0xf2a279c7, 0x94e01be8, 0x90716f4b, 0x954b8aa3 }; static const u32 sb8[256] = { 0xe216300d, 0xbbddfffc, 0xa7ebdabd, 0x35648095, 0x7789f8b7, 0xe6c1121b, 0x0e241600, 0x052ce8b5, 0x11a9cfb0, 0xe5952f11, 0xece7990a, 0x9386d174, 0x2a42931c, 0x76e38111, 0xb12def3a, 0x37ddddfc, 0xde9adeb1, 0x0a0cc32c, 0xbe197029, 0x84a00940, 0xbb243a0f, 0xb4d137cf, 0xb44e79f0, 0x049eedfd, 0x0b15a15d, 0x480d3168, 0x8bbbde5a, 0x669ded42, 0xc7ece831, 0x3f8f95e7, 0x72df191b, 0x7580330d, 0x94074251, 0x5c7dcdfa, 0xabbe6d63, 0xaa402164, 0xb301d40a, 0x02e7d1ca, 0x53571dae, 0x7a3182a2, 0x12a8ddec, 0xfdaa335d, 0x176f43e8, 0x71fb46d4, 0x38129022, 0xce949ad4, 0xb84769ad, 0x965bd862, 0x82f3d055, 0x66fb9767, 0x15b80b4e, 0x1d5b47a0, 0x4cfde06f, 0xc28ec4b8, 0x57e8726e, 0x647a78fc, 0x99865d44, 0x608bd593, 0x6c200e03, 0x39dc5ff6, 0x5d0b00a3, 0xae63aff2, 0x7e8bd632, 0x70108c0c, 0xbbd35049, 0x2998df04, 0x980cf42a, 0x9b6df491, 0x9e7edd53, 0x06918548, 0x58cb7e07, 0x3b74ef2e, 0x522fffb1, 0xd24708cc, 0x1c7e27cd, 0xa4eb215b, 0x3cf1d2e2, 0x19b47a38, 0x424f7618, 0x35856039, 0x9d17dee7, 0x27eb35e6, 0xc9aff67b, 0x36baf5b8, 0x09c467cd, 0xc18910b1, 0xe11dbf7b, 0x06cd1af8, 0x7170c608, 0x2d5e3354, 0xd4de495a, 0x64c6d006, 0xbcc0c62c, 0x3dd00db3, 0x708f8f34, 0x77d51b42, 0x264f620f, 0x24b8d2bf, 0x15c1b79e, 0x46a52564, 0xf8d7e54e, 0x3e378160, 0x7895cda5, 0x859c15a5, 0xe6459788, 0xc37bc75f, 0xdb07ba0c, 0x0676a3ab, 0x7f229b1e, 0x31842e7b, 0x24259fd7, 0xf8bef472, 0x835ffcb8, 0x6df4c1f2, 0x96f5b195, 0xfd0af0fc, 0xb0fe134c, 0xe2506d3d, 0x4f9b12ea, 0xf215f225, 0xa223736f, 0x9fb4c428, 0x25d04979, 0x34c713f8, 0xc4618187, 0xea7a6e98, 0x7cd16efc, 0x1436876c, 0xf1544107, 0xbedeee14, 0x56e9af27, 0xa04aa441, 0x3cf7c899, 0x92ecbae6, 0xdd67016d, 0x151682eb, 0xa842eedf, 0xfdba60b4, 0xf1907b75, 0x20e3030f, 0x24d8c29e, 0xe139673b, 0xefa63fb8, 0x71873054, 0xb6f2cf3b, 0x9f326442, 0xcb15a4cc, 0xb01a4504, 0xf1e47d8d, 0x844a1be5, 0xbae7dfdc, 0x42cbda70, 0xcd7dae0a, 0x57e85b7a, 0xd53f5af6, 0x20cf4d8c, 0xcea4d428, 0x79d130a4, 0x3486ebfb, 0x33d3cddc, 0x77853b53, 0x37effcb5, 0xc5068778, 0xe580b3e6, 0x4e68b8f4, 0xc5c8b37e, 0x0d809ea2, 0x398feb7c, 0x132a4f94, 0x43b7950e, 0x2fee7d1c, 0x223613bd, 0xdd06caa2, 0x37df932b, 0xc4248289, 0xacf3ebc3, 0x5715f6b7, 0xef3478dd, 0xf267616f, 0xc148cbe4, 0x9052815e, 0x5e410fab, 0xb48a2465, 0x2eda7fa4, 0xe87b40e4, 0xe98ea084, 0x5889e9e1, 0xefd390fc, 0xdd07d35b, 0xdb485694, 0x38d7e5b2, 0x57720101, 0x730edebc, 0x5b643113, 0x94917e4f, 0x503c2fba, 0x646f1282, 0x7523d24a, 0xe0779695, 0xf9c17a8f, 0x7a5b2121, 0xd187b896, 0x29263a4d, 0xba510cdf, 0x81f47c9f, 0xad1163ed, 0xea7b5965, 0x1a00726e, 0x11403092, 0x00da6d77, 0x4a0cdd61, 0xad1f4603, 0x605bdfb0, 0x9eedc364, 0x22ebe6a8, 0xcee7d28a, 0xa0e736a0, 0x5564a6b9, 0x10853209, 0xc7eb8f37, 0x2de705ca, 0x8951570f, 0xdf09822b, 0xbd691a6c, 0xaa12e4f2, 0x87451c0f, 0xe0f6a27a, 0x3ada4819, 0x4cf1764f, 0x0d771c2b, 0x67cdb156, 0x350d8384, 0x5938fa0f, 0x42399ef3, 0x36997b07, 0x0e84093d, 0x4aa93e61, 0x8360d87b, 0x1fa98b0c, 0x1149382c, 0xe97625a5, 0x0614d1b7, 0x0e25244b, 0x0c768347, 0x589e8d82, 0x0d2059d1, 0xa466bb1e, 0xf8da0a82, 0x04f19130, 0xba6e4ec0, 0x99265164, 0x1ee7230d, 0x50b2ad80, 0xeaee6801, 0x8db2a283, 0xea8bf59e }; #define s1 cast_s1 #define s2 cast_s2 #define s3 cast_s3 #define s4 cast_s4 #define F1(D, m, r) ((I = ((m) + (D))), (I = rol32(I, (r))), \ (((s1[I >> 24] ^ s2[(I>>16)&0xff]) - s3[(I>>8)&0xff]) + s4[I&0xff])) #define F2(D, m, r) ((I = ((m) ^ (D))), (I = rol32(I, (r))), \ (((s1[I >> 24] - s2[(I>>16)&0xff]) + s3[(I>>8)&0xff]) ^ s4[I&0xff])) #define F3(D, m, r) ((I = ((m) - (D))), (I = rol32(I, (r))), \ (((s1[I >> 24] + s2[(I>>16)&0xff]) ^ s3[(I>>8)&0xff]) - s4[I&0xff])) void __cast5_encrypt(struct cast5_ctx *c, u8 *outbuf, const u8 *inbuf) { u32 l, r, t; u32 I; /* used by the Fx macros */ u32 *Km; u8 *Kr; Km = c->Km; Kr = c->Kr; /* (L0,R0) <-- (m1...m64). (Split the plaintext into left and * right 32-bit halves L0 = m1...m32 and R0 = m33...m64.) */ l = get_unaligned_be32(inbuf); r = get_unaligned_be32(inbuf + 4); /* (16 rounds) for i from 1 to 16, compute Li and Ri as follows: * Li = Ri-1; * Ri = Li-1 ^ f(Ri-1,Kmi,Kri), where f is defined in Section 2.2 * Rounds 1, 4, 7, 10, 13, and 16 use f function Type 1. * Rounds 2, 5, 8, 11, and 14 use f function Type 2. * Rounds 3, 6, 9, 12, and 15 use f function Type 3. */ t = l; l = r; r = t ^ F1(r, Km[0], Kr[0]); t = l; l = r; r = t ^ F2(r, Km[1], Kr[1]); t = l; l = r; r = t ^ F3(r, Km[2], Kr[2]); t = l; l = r; r = t ^ F1(r, Km[3], Kr[3]); t = l; l = r; r = t ^ F2(r, Km[4], Kr[4]); t = l; l = r; r = t ^ F3(r, Km[5], Kr[5]); t = l; l = r; r = t ^ F1(r, Km[6], Kr[6]); t = l; l = r; r = t ^ F2(r, Km[7], Kr[7]); t = l; l = r; r = t ^ F3(r, Km[8], Kr[8]); t = l; l = r; r = t ^ F1(r, Km[9], Kr[9]); t = l; l = r; r = t ^ F2(r, Km[10], Kr[10]); t = l; l = r; r = t ^ F3(r, Km[11], Kr[11]); if (!(c->rr)) { t = l; l = r; r = t ^ F1(r, Km[12], Kr[12]); t = l; l = r; r = t ^ F2(r, Km[13], Kr[13]); t = l; l = r; r = t ^ F3(r, Km[14], Kr[14]); t = l; l = r; r = t ^ F1(r, Km[15], Kr[15]); } /* c1...c64 <-- (R16,L16). (Exchange final blocks L16, R16 and * concatenate to form the ciphertext.) */ put_unaligned_be32(r, outbuf); put_unaligned_be32(l, outbuf + 4); } EXPORT_SYMBOL_GPL(__cast5_encrypt); static void cast5_encrypt(struct crypto_tfm *tfm, u8 *outbuf, const u8 *inbuf) { __cast5_encrypt(crypto_tfm_ctx(tfm), outbuf, inbuf); } void __cast5_decrypt(struct cast5_ctx *c, u8 *outbuf, const u8 *inbuf) { u32 l, r, t; u32 I; u32 *Km; u8 *Kr; Km = c->Km; Kr = c->Kr; l = get_unaligned_be32(inbuf); r = get_unaligned_be32(inbuf + 4); if (!(c->rr)) { t = l; l = r; r = t ^ F1(r, Km[15], Kr[15]); t = l; l = r; r = t ^ F3(r, Km[14], Kr[14]); t = l; l = r; r = t ^ F2(r, Km[13], Kr[13]); t = l; l = r; r = t ^ F1(r, Km[12], Kr[12]); } t = l; l = r; r = t ^ F3(r, Km[11], Kr[11]); t = l; l = r; r = t ^ F2(r, Km[10], Kr[10]); t = l; l = r; r = t ^ F1(r, Km[9], Kr[9]); t = l; l = r; r = t ^ F3(r, Km[8], Kr[8]); t = l; l = r; r = t ^ F2(r, Km[7], Kr[7]); t = l; l = r; r = t ^ F1(r, Km[6], Kr[6]); t = l; l = r; r = t ^ F3(r, Km[5], Kr[5]); t = l; l = r; r = t ^ F2(r, Km[4], Kr[4]); t = l; l = r; r = t ^ F1(r, Km[3], Kr[3]); t = l; l = r; r = t ^ F3(r, Km[2], Kr[2]); t = l; l = r; r = t ^ F2(r, Km[1], Kr[1]); t = l; l = r; r = t ^ F1(r, Km[0], Kr[0]); put_unaligned_be32(r, outbuf); put_unaligned_be32(l, outbuf + 4); } EXPORT_SYMBOL_GPL(__cast5_decrypt); static void cast5_decrypt(struct crypto_tfm *tfm, u8 *outbuf, const u8 *inbuf) { __cast5_decrypt(crypto_tfm_ctx(tfm), outbuf, inbuf); } static void key_schedule(u32 *x, u32 *z, u32 *k) { #define xi(i) ((x[(i)/4] >> (8*(3-((i)%4)))) & 0xff) #define zi(i) ((z[(i)/4] >> (8*(3-((i)%4)))) & 0xff) z[0] = x[0] ^ s5[xi(13)] ^ s6[xi(15)] ^ s7[xi(12)] ^ sb8[xi(14)] ^ s7[xi(8)]; z[1] = x[2] ^ s5[zi(0)] ^ s6[zi(2)] ^ s7[zi(1)] ^ sb8[zi(3)] ^ sb8[xi(10)]; z[2] = x[3] ^ s5[zi(7)] ^ s6[zi(6)] ^ s7[zi(5)] ^ sb8[zi(4)] ^ s5[xi(9)]; z[3] = x[1] ^ s5[zi(10)] ^ s6[zi(9)] ^ s7[zi(11)] ^ sb8[zi(8)] ^ s6[xi(11)]; k[0] = s5[zi(8)] ^ s6[zi(9)] ^ s7[zi(7)] ^ sb8[zi(6)] ^ s5[zi(2)]; k[1] = s5[zi(10)] ^ s6[zi(11)] ^ s7[zi(5)] ^ sb8[zi(4)] ^ s6[zi(6)]; k[2] = s5[zi(12)] ^ s6[zi(13)] ^ s7[zi(3)] ^ sb8[zi(2)] ^ s7[zi(9)]; k[3] = s5[zi(14)] ^ s6[zi(15)] ^ s7[zi(1)] ^ sb8[zi(0)] ^ sb8[zi(12)]; x[0] = z[2] ^ s5[zi(5)] ^ s6[zi(7)] ^ s7[zi(4)] ^ sb8[zi(6)] ^ s7[zi(0)]; x[1] = z[0] ^ s5[xi(0)] ^ s6[xi(2)] ^ s7[xi(1)] ^ sb8[xi(3)] ^ sb8[zi(2)]; x[2] = z[1] ^ s5[xi(7)] ^ s6[xi(6)] ^ s7[xi(5)] ^ sb8[xi(4)] ^ s5[zi(1)]; x[3] = z[3] ^ s5[xi(10)] ^ s6[xi(9)] ^ s7[xi(11)] ^ sb8[xi(8)] ^ s6[zi(3)]; k[4] = s5[xi(3)] ^ s6[xi(2)] ^ s7[xi(12)] ^ sb8[xi(13)] ^ s5[xi(8)]; k[5] = s5[xi(1)] ^ s6[xi(0)] ^ s7[xi(14)] ^ sb8[xi(15)] ^ s6[xi(13)]; k[6] = s5[xi(7)] ^ s6[xi(6)] ^ s7[xi(8)] ^ sb8[xi(9)] ^ s7[xi(3)]; k[7] = s5[xi(5)] ^ s6[xi(4)] ^ s7[xi(10)] ^ sb8[xi(11)] ^ sb8[xi(7)]; z[0] = x[0] ^ s5[xi(13)] ^ s6[xi(15)] ^ s7[xi(12)] ^ sb8[xi(14)] ^ s7[xi(8)]; z[1] = x[2] ^ s5[zi(0)] ^ s6[zi(2)] ^ s7[zi(1)] ^ sb8[zi(3)] ^ sb8[xi(10)]; z[2] = x[3] ^ s5[zi(7)] ^ s6[zi(6)] ^ s7[zi(5)] ^ sb8[zi(4)] ^ s5[xi(9)]; z[3] = x[1] ^ s5[zi(10)] ^ s6[zi(9)] ^ s7[zi(11)] ^ sb8[zi(8)] ^ s6[xi(11)]; k[8] = s5[zi(3)] ^ s6[zi(2)] ^ s7[zi(12)] ^ sb8[zi(13)] ^ s5[zi(9)]; k[9] = s5[zi(1)] ^ s6[zi(0)] ^ s7[zi(14)] ^ sb8[zi(15)] ^ s6[zi(12)]; k[10] = s5[zi(7)] ^ s6[zi(6)] ^ s7[zi(8)] ^ sb8[zi(9)] ^ s7[zi(2)]; k[11] = s5[zi(5)] ^ s6[zi(4)] ^ s7[zi(10)] ^ sb8[zi(11)] ^ sb8[zi(6)]; x[0] = z[2] ^ s5[zi(5)] ^ s6[zi(7)] ^ s7[zi(4)] ^ sb8[zi(6)] ^ s7[zi(0)]; x[1] = z[0] ^ s5[xi(0)] ^ s6[xi(2)] ^ s7[xi(1)] ^ sb8[xi(3)] ^ sb8[zi(2)]; x[2] = z[1] ^ s5[xi(7)] ^ s6[xi(6)] ^ s7[xi(5)] ^ sb8[xi(4)] ^ s5[zi(1)]; x[3] = z[3] ^ s5[xi(10)] ^ s6[xi(9)] ^ s7[xi(11)] ^ sb8[xi(8)] ^ s6[zi(3)]; k[12] = s5[xi(8)] ^ s6[xi(9)] ^ s7[xi(7)] ^ sb8[xi(6)] ^ s5[xi(3)]; k[13] = s5[xi(10)] ^ s6[xi(11)] ^ s7[xi(5)] ^ sb8[xi(4)] ^ s6[xi(7)]; k[14] = s5[xi(12)] ^ s6[xi(13)] ^ s7[xi(3)] ^ sb8[xi(2)] ^ s7[xi(8)]; k[15] = s5[xi(14)] ^ s6[xi(15)] ^ s7[xi(1)] ^ sb8[xi(0)] ^ sb8[xi(13)]; #undef xi #undef zi } int cast5_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int key_len) { struct cast5_ctx *c = crypto_tfm_ctx(tfm); int i; u32 x[4]; u32 z[4]; u32 k[16]; __be32 p_key[4]; c->rr = key_len <= 10 ? 1 : 0; memset(p_key, 0, 16); memcpy(p_key, key, key_len); x[0] = be32_to_cpu(p_key[0]); x[1] = be32_to_cpu(p_key[1]); x[2] = be32_to_cpu(p_key[2]); x[3] = be32_to_cpu(p_key[3]); key_schedule(x, z, k); for (i = 0; i < 16; i++) c->Km[i] = k[i]; key_schedule(x, z, k); for (i = 0; i < 16; i++) c->Kr[i] = k[i] & 0x1f; return 0; } EXPORT_SYMBOL_GPL(cast5_setkey); static struct crypto_alg alg = { .cra_name = "cast5", .cra_driver_name = "cast5-generic", .cra_priority = 100, .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = CAST5_BLOCK_SIZE, .cra_ctxsize = sizeof(struct cast5_ctx), .cra_module = THIS_MODULE, .cra_u = { .cipher = { .cia_min_keysize = CAST5_MIN_KEY_SIZE, .cia_max_keysize = CAST5_MAX_KEY_SIZE, .cia_setkey = cast5_setkey, .cia_encrypt = cast5_encrypt, .cia_decrypt = cast5_decrypt } } }; static int __init cast5_mod_init(void) { return crypto_register_alg(&alg); } static void __exit cast5_mod_fini(void) { crypto_unregister_alg(&alg); } module_init(cast5_mod_init); module_exit(cast5_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Cast5 Cipher Algorithm"); MODULE_ALIAS_CRYPTO("cast5"); MODULE_ALIAS_CRYPTO("cast5-generic"); |
| 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 | // SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <net/ip.h> #include <net/ipv6.h> #include <linux/sctp.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_sctp.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Kiran Kumar Immidi"); MODULE_DESCRIPTION("Xtables: SCTP protocol packet match"); MODULE_ALIAS("ipt_sctp"); MODULE_ALIAS("ip6t_sctp"); #define SCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \ || (!!((invflag) & (option)) ^ (cond))) static bool match_flags(const struct xt_sctp_flag_info *flag_info, const int flag_count, u_int8_t chunktype, u_int8_t chunkflags) { int i; for (i = 0; i < flag_count; i++) if (flag_info[i].chunktype == chunktype) return (chunkflags & flag_info[i].flag_mask) == flag_info[i].flag; return true; } static inline bool match_packet(const struct sk_buff *skb, unsigned int offset, const struct xt_sctp_info *info, bool *hotdrop) { u_int32_t chunkmapcopy[256 / sizeof (u_int32_t)]; const struct sctp_chunkhdr *sch; struct sctp_chunkhdr _sch; int chunk_match_type = info->chunk_match_type; const struct xt_sctp_flag_info *flag_info = info->flag_info; int flag_count = info->flag_count; #ifdef DEBUG int i = 0; #endif if (chunk_match_type == SCTP_CHUNK_MATCH_ALL) SCTP_CHUNKMAP_COPY(chunkmapcopy, info->chunkmap); do { sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch); if (sch == NULL || sch->length == 0) { pr_debug("Dropping invalid SCTP packet.\n"); *hotdrop = true; return false; } #ifdef DEBUG pr_debug("Chunk num: %d\toffset: %d\ttype: %d\tlength: %d" "\tflags: %x\n", ++i, offset, sch->type, htons(sch->length), sch->flags); #endif offset += SCTP_PAD4(ntohs(sch->length)); pr_debug("skb->len: %d\toffset: %d\n", skb->len, offset); if (SCTP_CHUNKMAP_IS_SET(info->chunkmap, sch->type)) { switch (chunk_match_type) { case SCTP_CHUNK_MATCH_ANY: if (match_flags(flag_info, flag_count, sch->type, sch->flags)) { return true; } break; case SCTP_CHUNK_MATCH_ALL: if (match_flags(flag_info, flag_count, sch->type, sch->flags)) SCTP_CHUNKMAP_CLEAR(chunkmapcopy, sch->type); break; case SCTP_CHUNK_MATCH_ONLY: if (!match_flags(flag_info, flag_count, sch->type, sch->flags)) return false; break; } } else { switch (chunk_match_type) { case SCTP_CHUNK_MATCH_ONLY: return false; } } } while (offset < skb->len); switch (chunk_match_type) { case SCTP_CHUNK_MATCH_ALL: return SCTP_CHUNKMAP_IS_CLEAR(chunkmapcopy); case SCTP_CHUNK_MATCH_ANY: return false; case SCTP_CHUNK_MATCH_ONLY: return true; } /* This will never be reached, but required to stop compiler whine */ return false; } static bool sctp_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_sctp_info *info = par->matchinfo; const struct sctphdr *sh; struct sctphdr _sh; if (par->fragoff != 0) { pr_debug("Dropping non-first fragment.. FIXME\n"); return false; } sh = skb_header_pointer(skb, par->thoff, sizeof(_sh), &_sh); if (sh == NULL) { pr_debug("Dropping evil TCP offset=0 tinygram.\n"); par->hotdrop = true; return false; } pr_debug("spt: %d\tdpt: %d\n", ntohs(sh->source), ntohs(sh->dest)); return SCCHECK(ntohs(sh->source) >= info->spts[0] && ntohs(sh->source) <= info->spts[1], XT_SCTP_SRC_PORTS, info->flags, info->invflags) && SCCHECK(ntohs(sh->dest) >= info->dpts[0] && ntohs(sh->dest) <= info->dpts[1], XT_SCTP_DEST_PORTS, info->flags, info->invflags) && SCCHECK(match_packet(skb, par->thoff + sizeof(_sh), info, &par->hotdrop), XT_SCTP_CHUNK_TYPES, info->flags, info->invflags); } static int sctp_mt_check(const struct xt_mtchk_param *par) { const struct xt_sctp_info *info = par->matchinfo; if (info->flag_count > ARRAY_SIZE(info->flag_info)) return -EINVAL; if (info->flags & ~XT_SCTP_VALID_FLAGS) return -EINVAL; if (info->invflags & ~XT_SCTP_VALID_FLAGS) return -EINVAL; if (info->invflags & ~info->flags) return -EINVAL; if (!(info->flags & XT_SCTP_CHUNK_TYPES)) return 0; if (info->chunk_match_type & (SCTP_CHUNK_MATCH_ALL | SCTP_CHUNK_MATCH_ANY | SCTP_CHUNK_MATCH_ONLY)) return 0; return -EINVAL; } static struct xt_match sctp_mt_reg[] __read_mostly = { { .name = "sctp", .family = NFPROTO_IPV4, .checkentry = sctp_mt_check, .match = sctp_mt, .matchsize = sizeof(struct xt_sctp_info), .proto = IPPROTO_SCTP, .me = THIS_MODULE }, { .name = "sctp", .family = NFPROTO_IPV6, .checkentry = sctp_mt_check, .match = sctp_mt, .matchsize = sizeof(struct xt_sctp_info), .proto = IPPROTO_SCTP, .me = THIS_MODULE }, }; static int __init sctp_mt_init(void) { return xt_register_matches(sctp_mt_reg, ARRAY_SIZE(sctp_mt_reg)); } static void __exit sctp_mt_exit(void) { xt_unregister_matches(sctp_mt_reg, ARRAY_SIZE(sctp_mt_reg)); } module_init(sctp_mt_init); module_exit(sctp_mt_exit); |
| 96 100 1 99 99 96 40 40 73 72 73 73 40 40 39 40 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 | /* * net/tipc/subscr.c: TIPC network topology service * * Copyright (c) 2000-2017, Ericsson AB * Copyright (c) 2005-2007, 2010-2013, Wind River Systems * Copyright (c) 2020-2021, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "name_table.h" #include "subscr.h" static void tipc_sub_send_event(struct tipc_subscription *sub, struct publication *p, u32 event) { struct tipc_subscr *s = &sub->evt.s; struct tipc_event *evt = &sub->evt; if (sub->inactive) return; tipc_evt_write(evt, event, event); if (p) { tipc_evt_write(evt, found_lower, p->sr.lower); tipc_evt_write(evt, found_upper, p->sr.upper); tipc_evt_write(evt, port.ref, p->sk.ref); tipc_evt_write(evt, port.node, p->sk.node); } else { tipc_evt_write(evt, found_lower, s->seq.lower); tipc_evt_write(evt, found_upper, s->seq.upper); tipc_evt_write(evt, port.ref, 0); tipc_evt_write(evt, port.node, 0); } tipc_topsrv_queue_evt(sub->net, sub->conid, event, evt); } /** * tipc_sub_check_overlap - test for subscription overlap with the given values * @subscribed: the service range subscribed for * @found: the service range we are checking for match * * Returns true if there is overlap, otherwise false. */ static bool tipc_sub_check_overlap(struct tipc_service_range *subscribed, struct tipc_service_range *found) { u32 found_lower = found->lower; u32 found_upper = found->upper; if (found_lower < subscribed->lower) found_lower = subscribed->lower; if (found_upper > subscribed->upper) found_upper = subscribed->upper; return found_lower <= found_upper; } void tipc_sub_report_overlap(struct tipc_subscription *sub, struct publication *p, u32 event, bool must) { struct tipc_service_range *sr = &sub->s.seq; u32 filter = sub->s.filter; if (!tipc_sub_check_overlap(sr, &p->sr)) return; if (!must && !(filter & TIPC_SUB_PORTS)) return; if (filter & TIPC_SUB_CLUSTER_SCOPE && p->scope == TIPC_NODE_SCOPE) return; if (filter & TIPC_SUB_NODE_SCOPE && p->scope != TIPC_NODE_SCOPE) return; spin_lock(&sub->lock); tipc_sub_send_event(sub, p, event); spin_unlock(&sub->lock); } static void tipc_sub_timeout(struct timer_list *t) { struct tipc_subscription *sub = timer_container_of(sub, t, timer); spin_lock(&sub->lock); tipc_sub_send_event(sub, NULL, TIPC_SUBSCR_TIMEOUT); sub->inactive = true; spin_unlock(&sub->lock); } static void tipc_sub_kref_release(struct kref *kref) { kfree(container_of(kref, struct tipc_subscription, kref)); } void tipc_sub_put(struct tipc_subscription *subscription) { kref_put(&subscription->kref, tipc_sub_kref_release); } void tipc_sub_get(struct tipc_subscription *subscription) { kref_get(&subscription->kref); } struct tipc_subscription *tipc_sub_subscribe(struct net *net, struct tipc_subscr *s, int conid) { u32 lower = tipc_sub_read(s, seq.lower); u32 upper = tipc_sub_read(s, seq.upper); u32 filter = tipc_sub_read(s, filter); struct tipc_subscription *sub; u32 timeout; if ((filter & TIPC_SUB_PORTS && filter & TIPC_SUB_SERVICE) || lower > upper) { pr_warn("Subscription rejected, illegal request\n"); return NULL; } sub = kmalloc(sizeof(*sub), GFP_ATOMIC); if (!sub) { pr_warn("Subscription rejected, no memory\n"); return NULL; } INIT_LIST_HEAD(&sub->service_list); INIT_LIST_HEAD(&sub->sub_list); sub->net = net; sub->conid = conid; sub->inactive = false; memcpy(&sub->evt.s, s, sizeof(*s)); sub->s.seq.type = tipc_sub_read(s, seq.type); sub->s.seq.lower = lower; sub->s.seq.upper = upper; sub->s.filter = filter; sub->s.timeout = tipc_sub_read(s, timeout); memcpy(sub->s.usr_handle, s->usr_handle, 8); spin_lock_init(&sub->lock); kref_init(&sub->kref); if (!tipc_nametbl_subscribe(sub)) { kfree(sub); return NULL; } timer_setup(&sub->timer, tipc_sub_timeout, 0); timeout = tipc_sub_read(&sub->evt.s, timeout); if (timeout != TIPC_WAIT_FOREVER) mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout)); return sub; } void tipc_sub_unsubscribe(struct tipc_subscription *sub) { tipc_nametbl_unsubscribe(sub); if (sub->evt.s.timeout != TIPC_WAIT_FOREVER) timer_delete_sync(&sub->timer); list_del(&sub->sub_list); tipc_sub_put(sub); } |
| 46 167 206 2 2 5 3 3 2 1 3 1 1 243 8 3 19 169 168 3 1 17 57 124 188 46 254 254 254 10 7 4 252 227 156 253 2 2 2 52 29 190 132 70 119 25 189 171 18 2 18 18 189 23 191 26 46 91 133 38 254 84 144 46 251 206 26 124 147 128 6 14 57 14 85 86 40 9 168 89 50 57 57 57 48 11 3 10 10 4 6 5 5 2 8 4 3 3 2 2 15 2 9 2 7 2 1 14 47 47 5 5 45 11 4 58 15 42 47 9 49 6 53 18 35 42 5 45 2 47 12 35 43 4 44 3 44 3 43 4 44 3 41 6 45 2 42 5 37 10 3 43 53 57 1 45 15 53 47 7 30 23 30 3 27 30 21 5 6 5 6 10 30 2 2 17 17 23 27 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 | // SPDX-License-Identifier: GPL-2.0-only /* * net/sched/sch_netem.c Network emulator * * Many of the algorithms and ideas for this came from * NIST Net which is not copyrighted. * * Authors: Stephen Hemminger <shemminger@osdl.org> * Catalin(ux aka Dino) BOIE <catab at umbrella dot ro> */ #include <linux/mm.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> #include <linux/prandom.h> #include <linux/rtnetlink.h> #include <linux/reciprocal_div.h> #include <linux/rbtree.h> #include <net/gso.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/inet_ecn.h> #define VERSION "1.3" /* Network Emulation Queuing algorithm. ==================================== Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based Network Emulation Tool [2] Luigi Rizzo, DummyNet for FreeBSD ---------------------------------------------------------------- This started out as a simple way to delay outgoing packets to test TCP but has grown to include most of the functionality of a full blown network emulator like NISTnet. It can delay packets and add random jitter (and correlation). The random distribution can be loaded from a table as well to provide normal, Pareto, or experimental curves. Packet loss, duplication, and reordering can also be emulated. This qdisc does not do classification that can be handled in layering other disciplines. It does not need to do bandwidth control either since that can be handled by using token bucket or other rate control. Correlated Loss Generator models Added generation of correlated loss according to the "Gilbert-Elliot" model, a 4-state markov model. References: [1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG [2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general and intuitive loss model for packet networks and its implementation in the Netem module in the Linux kernel", available in [1] Authors: Stefano Salsano <stefano.salsano at uniroma2.it Fabio Ludovici <fabio.ludovici at yahoo.it> */ struct disttable { u32 size; s16 table[] __counted_by(size); }; struct netem_sched_data { /* internal t(ime)fifo qdisc uses t_root and sch->limit */ struct rb_root t_root; /* a linear queue; reduces rbtree rebalancing when jitter is low */ struct sk_buff *t_head; struct sk_buff *t_tail; u32 t_len; /* optional qdisc for classful handling (NULL at netem init) */ struct Qdisc *qdisc; struct qdisc_watchdog watchdog; s64 latency; s64 jitter; u32 loss; u32 ecn; u32 limit; u32 counter; u32 gap; u32 duplicate; u32 reorder; u32 corrupt; u64 rate; s32 packet_overhead; u32 cell_size; struct reciprocal_value cell_size_reciprocal; s32 cell_overhead; struct crndstate { u32 last; u32 rho; } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor; struct prng { u64 seed; struct rnd_state prng_state; } prng; struct disttable *delay_dist; enum { CLG_RANDOM, CLG_4_STATES, CLG_GILB_ELL, } loss_model; enum { TX_IN_GAP_PERIOD = 1, TX_IN_BURST_PERIOD, LOST_IN_GAP_PERIOD, LOST_IN_BURST_PERIOD, } _4_state_model; enum { GOOD_STATE = 1, BAD_STATE, } GE_state_model; /* Correlated Loss Generation models */ struct clgstate { /* state of the Markov chain */ u8 state; /* 4-states and Gilbert-Elliot models */ u32 a1; /* p13 for 4-states or p for GE */ u32 a2; /* p31 for 4-states or r for GE */ u32 a3; /* p32 for 4-states or h for GE */ u32 a4; /* p14 for 4-states or 1-k for GE */ u32 a5; /* p23 used only in 4-states */ } clg; struct tc_netem_slot slot_config; struct slotstate { u64 slot_next; s32 packets_left; s32 bytes_left; } slot; struct disttable *slot_dist; }; /* Time stamp put into socket buffer control block * Only valid when skbs are in our internal t(ime)fifo queue. * * As skb->rbnode uses same storage than skb->next, skb->prev and skb->tstamp, * and skb->next & skb->prev are scratch space for a qdisc, * we save skb->tstamp value in skb->cb[] before destroying it. */ struct netem_skb_cb { u64 time_to_send; }; static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb) { /* we assume we can use skb next/prev/tstamp as storage for rb_node */ qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb)); return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data; } /* init_crandom - initialize correlated random number generator * Use entropy source for initial seed. */ static void init_crandom(struct crndstate *state, unsigned long rho) { state->rho = rho; state->last = get_random_u32(); } /* get_crandom - correlated random number generator * Next number depends on last value. * rho is scaled to avoid floating point. */ static u32 get_crandom(struct crndstate *state, struct prng *p) { u64 value, rho; unsigned long answer; struct rnd_state *s = &p->prng_state; if (!state || state->rho == 0) /* no correlation */ return prandom_u32_state(s); value = prandom_u32_state(s); rho = (u64)state->rho + 1; answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32; state->last = answer; return answer; } /* loss_4state - 4-state model loss generator * Generates losses according to the 4-state Markov chain adopted in * the GI (General and Intuitive) loss model. */ static bool loss_4state(struct netem_sched_data *q) { struct clgstate *clg = &q->clg; u32 rnd = prandom_u32_state(&q->prng.prng_state); /* * Makes a comparison between rnd and the transition * probabilities outgoing from the current state, then decides the * next state and if the next packet has to be transmitted or lost. * The four states correspond to: * TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period * LOST_IN_GAP_PERIOD => isolated losses within a gap period * LOST_IN_BURST_PERIOD => lost packets within a burst period * TX_IN_BURST_PERIOD => successfully transmitted packets within a burst period */ switch (clg->state) { case TX_IN_GAP_PERIOD: if (rnd < clg->a4) { clg->state = LOST_IN_GAP_PERIOD; return true; } else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) { clg->state = LOST_IN_BURST_PERIOD; return true; } else if (clg->a1 + clg->a4 < rnd) { clg->state = TX_IN_GAP_PERIOD; } break; case TX_IN_BURST_PERIOD: if (rnd < clg->a5) { clg->state = LOST_IN_BURST_PERIOD; return true; } else { clg->state = TX_IN_BURST_PERIOD; } break; case LOST_IN_BURST_PERIOD: if (rnd < clg->a3) clg->state = TX_IN_BURST_PERIOD; else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) { clg->state = TX_IN_GAP_PERIOD; } else if (clg->a2 + clg->a3 < rnd) { clg->state = LOST_IN_BURST_PERIOD; return true; } break; case LOST_IN_GAP_PERIOD: clg->state = TX_IN_GAP_PERIOD; break; } return false; } /* loss_gilb_ell - Gilbert-Elliot model loss generator * Generates losses according to the Gilbert-Elliot loss model or * its special cases (Gilbert or Simple Gilbert) * * Makes a comparison between random number and the transition * probabilities outgoing from the current state, then decides the * next state. A second random number is extracted and the comparison * with the loss probability of the current state decides if the next * packet will be transmitted or lost. */ static bool loss_gilb_ell(struct netem_sched_data *q) { struct clgstate *clg = &q->clg; struct rnd_state *s = &q->prng.prng_state; switch (clg->state) { case GOOD_STATE: if (prandom_u32_state(s) < clg->a1) clg->state = BAD_STATE; if (prandom_u32_state(s) < clg->a4) return true; break; case BAD_STATE: if (prandom_u32_state(s) < clg->a2) clg->state = GOOD_STATE; if (prandom_u32_state(s) > clg->a3) return true; } return false; } static bool loss_event(struct netem_sched_data *q) { switch (q->loss_model) { case CLG_RANDOM: /* Random packet drop 0 => none, ~0 => all */ return q->loss && q->loss >= get_crandom(&q->loss_cor, &q->prng); case CLG_4_STATES: /* 4state loss model algorithm (used also for GI model) * Extracts a value from the markov 4 state loss generator, * if it is 1 drops a packet and if needed writes the event in * the kernel logs */ return loss_4state(q); case CLG_GILB_ELL: /* Gilbert-Elliot loss model algorithm * Extracts a value from the Gilbert-Elliot loss generator, * if it is 1 drops a packet and if needed writes the event in * the kernel logs */ return loss_gilb_ell(q); } return false; /* not reached */ } /* tabledist - return a pseudo-randomly distributed value with mean mu and * std deviation sigma. Uses table lookup to approximate the desired * distribution, and a uniformly-distributed pseudo-random source. */ static s64 tabledist(s64 mu, s32 sigma, struct crndstate *state, struct prng *prng, const struct disttable *dist) { s64 x; long t; u32 rnd; if (sigma == 0) return mu; rnd = get_crandom(state, prng); /* default uniform distribution */ if (dist == NULL) return ((rnd % (2 * (u32)sigma)) + mu) - sigma; t = dist->table[rnd % dist->size]; x = (sigma % NETEM_DIST_SCALE) * t; if (x >= 0) x += NETEM_DIST_SCALE/2; else x -= NETEM_DIST_SCALE/2; return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu; } static u64 packet_time_ns(u64 len, const struct netem_sched_data *q) { len += q->packet_overhead; if (q->cell_size) { u32 cells = reciprocal_divide(len, q->cell_size_reciprocal); if (len > cells * q->cell_size) /* extra cell needed for remainder */ cells++; len = cells * (q->cell_size + q->cell_overhead); } return div64_u64(len * NSEC_PER_SEC, q->rate); } static void tfifo_reset(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); struct rb_node *p = rb_first(&q->t_root); while (p) { struct sk_buff *skb = rb_to_skb(p); p = rb_next(p); rb_erase(&skb->rbnode, &q->t_root); rtnl_kfree_skbs(skb, skb); } rtnl_kfree_skbs(q->t_head, q->t_tail); q->t_head = NULL; q->t_tail = NULL; q->t_len = 0; } static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); u64 tnext = netem_skb_cb(nskb)->time_to_send; if (!q->t_tail || tnext >= netem_skb_cb(q->t_tail)->time_to_send) { if (q->t_tail) q->t_tail->next = nskb; else q->t_head = nskb; q->t_tail = nskb; } else { struct rb_node **p = &q->t_root.rb_node, *parent = NULL; while (*p) { struct sk_buff *skb; parent = *p; skb = rb_to_skb(parent); if (tnext >= netem_skb_cb(skb)->time_to_send) p = &parent->rb_right; else p = &parent->rb_left; } rb_link_node(&nskb->rbnode, parent, p); rb_insert_color(&nskb->rbnode, &q->t_root); } q->t_len++; sch->q.qlen++; } /* netem can't properly corrupt a megapacket (like we get from GSO), so instead * when we statistically choose to corrupt one, we instead segment it, returning * the first packet to be corrupted, and re-enqueue the remaining frames */ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct sk_buff *segs; netdev_features_t features = netif_skb_features(skb); segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) { qdisc_drop(skb, sch, to_free); return NULL; } consume_skb(skb); return segs; } /* * Insert one skb into qdisc. * Note: parent depends on return value to account for queue length. * NET_XMIT_DROP: queue length didn't change. * NET_XMIT_SUCCESS: one skb was queued. */ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct netem_sched_data *q = qdisc_priv(sch); /* We don't fill cb now as skb_unshare() may invalidate it */ struct netem_skb_cb *cb; struct sk_buff *skb2 = NULL; struct sk_buff *segs = NULL; unsigned int prev_len = qdisc_pkt_len(skb); int count = 1; /* Do not fool qdisc_drop_all() */ skb->prev = NULL; /* Random duplication */ if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor, &q->prng)) ++count; /* Drop packet? */ if (loss_event(q)) { if (q->ecn && INET_ECN_set_ce(skb)) qdisc_qstats_drop(sch); /* mark packet */ else --count; } if (count == 0) { qdisc_qstats_drop(sch); __qdisc_drop(skb, to_free); return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; } /* If a delay is expected, orphan the skb. (orphaning usually takes * place at TX completion time, so _before_ the link transit delay) */ if (q->latency || q->jitter || q->rate) skb_orphan_partial(skb); /* * If we need to duplicate packet, then clone it before * original is modified. */ if (count > 1) skb2 = skb_clone(skb, GFP_ATOMIC); /* * Randomized packet corruption. * Make copy if needed since we are modifying * If packet is going to be hardware checksummed, then * do it now in software before we mangle it. */ if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor, &q->prng)) { if (skb_is_gso(skb)) { skb = netem_segment(skb, sch, to_free); if (!skb) goto finish_segs; segs = skb->next; skb_mark_not_on_list(skb); qdisc_skb_cb(skb)->pkt_len = skb->len; } skb = skb_unshare(skb, GFP_ATOMIC); if (unlikely(!skb)) { qdisc_qstats_drop(sch); goto finish_segs; } if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb)) { qdisc_drop(skb, sch, to_free); skb = NULL; goto finish_segs; } skb->data[get_random_u32_below(skb_headlen(skb))] ^= 1<<get_random_u32_below(8); } if (unlikely(q->t_len >= sch->limit)) { /* re-link segs, so that qdisc_drop_all() frees them all */ skb->next = segs; qdisc_drop_all(skb, sch, to_free); if (skb2) __qdisc_drop(skb2, to_free); return NET_XMIT_DROP; } /* * If doing duplication then re-insert at top of the * qdisc tree, since parent queuer expects that only one * skb will be queued. */ if (skb2) { struct Qdisc *rootq = qdisc_root_bh(sch); u32 dupsave = q->duplicate; /* prevent duplicating a dup... */ q->duplicate = 0; rootq->enqueue(skb2, rootq, to_free); q->duplicate = dupsave; skb2 = NULL; } qdisc_qstats_backlog_inc(sch, skb); cb = netem_skb_cb(skb); if (q->gap == 0 || /* not doing reordering */ q->counter < q->gap - 1 || /* inside last reordering gap */ q->reorder < get_crandom(&q->reorder_cor, &q->prng)) { u64 now; s64 delay; delay = tabledist(q->latency, q->jitter, &q->delay_cor, &q->prng, q->delay_dist); now = ktime_get_ns(); if (q->rate) { struct netem_skb_cb *last = NULL; if (sch->q.tail) last = netem_skb_cb(sch->q.tail); if (q->t_root.rb_node) { struct sk_buff *t_skb; struct netem_skb_cb *t_last; t_skb = skb_rb_last(&q->t_root); t_last = netem_skb_cb(t_skb); if (!last || t_last->time_to_send > last->time_to_send) last = t_last; } if (q->t_tail) { struct netem_skb_cb *t_last = netem_skb_cb(q->t_tail); if (!last || t_last->time_to_send > last->time_to_send) last = t_last; } if (last) { /* * Last packet in queue is reference point (now), * calculate this time bonus and subtract * from delay. */ delay -= last->time_to_send - now; delay = max_t(s64, 0, delay); now = last->time_to_send; } delay += packet_time_ns(qdisc_pkt_len(skb), q); } cb->time_to_send = now + delay; ++q->counter; tfifo_enqueue(skb, sch); } else { /* * Do re-ordering by putting one out of N packets at the front * of the queue. */ cb->time_to_send = ktime_get_ns(); q->counter = 0; __qdisc_enqueue_head(skb, &sch->q); sch->qstats.requeues++; } finish_segs: if (skb2) __qdisc_drop(skb2, to_free); if (segs) { unsigned int len, last_len; int rc, nb; len = skb ? skb->len : 0; nb = skb ? 1 : 0; while (segs) { skb2 = segs->next; skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; last_len = segs->len; rc = qdisc_enqueue(segs, sch, to_free); if (rc != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(rc)) qdisc_qstats_drop(sch); } else { nb++; len += last_len; } segs = skb2; } /* Parent qdiscs accounted for 1 skb of size @prev_len */ qdisc_tree_reduce_backlog(sch, -(nb - 1), -(len - prev_len)); } else if (!skb) { return NET_XMIT_DROP; } return NET_XMIT_SUCCESS; } /* Delay the next round with a new future slot with a * correct number of bytes and packets. */ static void get_slot_next(struct netem_sched_data *q, u64 now) { s64 next_delay; if (!q->slot_dist) next_delay = q->slot_config.min_delay + (get_random_u32() * (q->slot_config.max_delay - q->slot_config.min_delay) >> 32); else next_delay = tabledist(q->slot_config.dist_delay, (s32)(q->slot_config.dist_jitter), NULL, &q->prng, q->slot_dist); q->slot.slot_next = now + next_delay; q->slot.packets_left = q->slot_config.max_packets; q->slot.bytes_left = q->slot_config.max_bytes; } static struct sk_buff *netem_peek(struct netem_sched_data *q) { struct sk_buff *skb = skb_rb_first(&q->t_root); u64 t1, t2; if (!skb) return q->t_head; if (!q->t_head) return skb; t1 = netem_skb_cb(skb)->time_to_send; t2 = netem_skb_cb(q->t_head)->time_to_send; if (t1 < t2) return skb; return q->t_head; } static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb) { if (skb == q->t_head) { q->t_head = skb->next; if (!q->t_head) q->t_tail = NULL; } else { rb_erase(&skb->rbnode, &q->t_root); } } static struct sk_buff *netem_dequeue(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; tfifo_dequeue: skb = __qdisc_dequeue_head(&sch->q); if (skb) { deliver: qdisc_qstats_backlog_dec(sch, skb); qdisc_bstats_update(sch, skb); return skb; } skb = netem_peek(q); if (skb) { u64 time_to_send; u64 now = ktime_get_ns(); /* if more time remaining? */ time_to_send = netem_skb_cb(skb)->time_to_send; if (q->slot.slot_next && q->slot.slot_next < time_to_send) get_slot_next(q, now); if (time_to_send <= now && q->slot.slot_next <= now) { netem_erase_head(q, skb); q->t_len--; skb->next = NULL; skb->prev = NULL; /* skb->dev shares skb->rbnode area, * we need to restore its value. */ skb->dev = qdisc_dev(sch); if (q->slot.slot_next) { q->slot.packets_left--; q->slot.bytes_left -= qdisc_pkt_len(skb); if (q->slot.packets_left <= 0 || q->slot.bytes_left <= 0) get_slot_next(q, now); } if (q->qdisc) { unsigned int pkt_len = qdisc_pkt_len(skb); struct sk_buff *to_free = NULL; int err; err = qdisc_enqueue(skb, q->qdisc, &to_free); kfree_skb_list(to_free); if (err != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(err)) qdisc_qstats_drop(sch); sch->qstats.backlog -= pkt_len; sch->q.qlen--; qdisc_tree_reduce_backlog(sch, 1, pkt_len); } goto tfifo_dequeue; } sch->q.qlen--; goto deliver; } if (q->qdisc) { skb = q->qdisc->ops->dequeue(q->qdisc); if (skb) { sch->q.qlen--; goto deliver; } } qdisc_watchdog_schedule_ns(&q->watchdog, max(time_to_send, q->slot.slot_next)); } if (q->qdisc) { skb = q->qdisc->ops->dequeue(q->qdisc); if (skb) { sch->q.qlen--; goto deliver; } } return NULL; } static void netem_reset(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); qdisc_reset_queue(sch); tfifo_reset(sch); if (q->qdisc) qdisc_reset(q->qdisc); qdisc_watchdog_cancel(&q->watchdog); } static void dist_free(struct disttable *d) { kvfree(d); } /* * Distribution data is a variable size payload containing * signed 16 bit values. */ static int get_dist_table(struct disttable **tbl, const struct nlattr *attr) { size_t n = nla_len(attr)/sizeof(__s16); const __s16 *data = nla_data(attr); struct disttable *d; int i; if (!n || n > NETEM_DIST_MAX) return -EINVAL; d = kvmalloc(struct_size(d, table, n), GFP_KERNEL); if (!d) return -ENOMEM; d->size = n; for (i = 0; i < n; i++) d->table[i] = data[i]; *tbl = d; return 0; } static void get_slot(struct netem_sched_data *q, const struct nlattr *attr) { const struct tc_netem_slot *c = nla_data(attr); q->slot_config = *c; if (q->slot_config.max_packets == 0) q->slot_config.max_packets = INT_MAX; if (q->slot_config.max_bytes == 0) q->slot_config.max_bytes = INT_MAX; /* capping dist_jitter to the range acceptable by tabledist() */ q->slot_config.dist_jitter = min_t(__s64, INT_MAX, abs(q->slot_config.dist_jitter)); q->slot.packets_left = q->slot_config.max_packets; q->slot.bytes_left = q->slot_config.max_bytes; if (q->slot_config.min_delay | q->slot_config.max_delay | q->slot_config.dist_jitter) q->slot.slot_next = ktime_get_ns(); else q->slot.slot_next = 0; } static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr) { const struct tc_netem_corr *c = nla_data(attr); init_crandom(&q->delay_cor, c->delay_corr); init_crandom(&q->loss_cor, c->loss_corr); init_crandom(&q->dup_cor, c->dup_corr); } static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr) { const struct tc_netem_reorder *r = nla_data(attr); q->reorder = r->probability; init_crandom(&q->reorder_cor, r->correlation); } static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr) { const struct tc_netem_corrupt *r = nla_data(attr); q->corrupt = r->probability; init_crandom(&q->corrupt_cor, r->correlation); } static void get_rate(struct netem_sched_data *q, const struct nlattr *attr) { const struct tc_netem_rate *r = nla_data(attr); q->rate = r->rate; q->packet_overhead = r->packet_overhead; q->cell_size = r->cell_size; q->cell_overhead = r->cell_overhead; if (q->cell_size) q->cell_size_reciprocal = reciprocal_value(q->cell_size); else q->cell_size_reciprocal = (struct reciprocal_value) { 0 }; } static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr) { const struct nlattr *la; int rem; nla_for_each_nested(la, attr, rem) { u16 type = nla_type(la); switch (type) { case NETEM_LOSS_GI: { const struct tc_netem_gimodel *gi = nla_data(la); if (nla_len(la) < sizeof(struct tc_netem_gimodel)) { pr_info("netem: incorrect gi model size\n"); return -EINVAL; } q->loss_model = CLG_4_STATES; q->clg.state = TX_IN_GAP_PERIOD; q->clg.a1 = gi->p13; q->clg.a2 = gi->p31; q->clg.a3 = gi->p32; q->clg.a4 = gi->p14; q->clg.a5 = gi->p23; break; } case NETEM_LOSS_GE: { const struct tc_netem_gemodel *ge = nla_data(la); if (nla_len(la) < sizeof(struct tc_netem_gemodel)) { pr_info("netem: incorrect ge model size\n"); return -EINVAL; } q->loss_model = CLG_GILB_ELL; q->clg.state = GOOD_STATE; q->clg.a1 = ge->p; q->clg.a2 = ge->r; q->clg.a3 = ge->h; q->clg.a4 = ge->k1; break; } default: pr_info("netem: unknown loss type %u\n", type); return -EINVAL; } } return 0; } static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = { [TCA_NETEM_CORR] = { .len = sizeof(struct tc_netem_corr) }, [TCA_NETEM_REORDER] = { .len = sizeof(struct tc_netem_reorder) }, [TCA_NETEM_CORRUPT] = { .len = sizeof(struct tc_netem_corrupt) }, [TCA_NETEM_RATE] = { .len = sizeof(struct tc_netem_rate) }, [TCA_NETEM_LOSS] = { .type = NLA_NESTED }, [TCA_NETEM_ECN] = { .type = NLA_U32 }, [TCA_NETEM_RATE64] = { .type = NLA_U64 }, [TCA_NETEM_LATENCY64] = { .type = NLA_S64 }, [TCA_NETEM_JITTER64] = { .type = NLA_S64 }, [TCA_NETEM_SLOT] = { .len = sizeof(struct tc_netem_slot) }, [TCA_NETEM_PRNG_SEED] = { .type = NLA_U64 }, }; static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, const struct nla_policy *policy, int len) { int nested_len = nla_len(nla) - NLA_ALIGN(len); if (nested_len < 0) { pr_info("netem: invalid attributes len %d\n", nested_len); return -EINVAL; } if (nested_len >= nla_attr_size(0)) return nla_parse_deprecated(tb, maxtype, nla_data(nla) + NLA_ALIGN(len), nested_len, policy, NULL); memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); return 0; } static const struct Qdisc_class_ops netem_class_ops; static int check_netem_in_tree(struct Qdisc *sch, bool duplicates, struct netlink_ext_ack *extack) { struct Qdisc *root, *q; unsigned int i; root = qdisc_root_sleeping(sch); if (sch != root && root->ops->cl_ops == &netem_class_ops) { if (duplicates || ((struct netem_sched_data *)qdisc_priv(root))->duplicate) goto err; } if (!qdisc_dev(root)) return 0; hash_for_each(qdisc_dev(root)->qdisc_hash, i, q, hash) { if (sch != q && q->ops->cl_ops == &netem_class_ops) { if (duplicates || ((struct netem_sched_data *)qdisc_priv(q))->duplicate) goto err; } } return 0; err: NL_SET_ERR_MSG(extack, "netem: cannot mix duplicating netems with other netems in tree"); return -EINVAL; } /* Parse netlink message to set options */ static int netem_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct netem_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_NETEM_MAX + 1]; struct disttable *delay_dist = NULL; struct disttable *slot_dist = NULL; struct tc_netem_qopt *qopt; struct clgstate old_clg; int old_loss_model = CLG_RANDOM; int ret; qopt = nla_data(opt); ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt)); if (ret < 0) return ret; if (tb[TCA_NETEM_DELAY_DIST]) { ret = get_dist_table(&delay_dist, tb[TCA_NETEM_DELAY_DIST]); if (ret) goto table_free; } if (tb[TCA_NETEM_SLOT_DIST]) { ret = get_dist_table(&slot_dist, tb[TCA_NETEM_SLOT_DIST]); if (ret) goto table_free; } sch_tree_lock(sch); /* backup q->clg and q->loss_model */ old_clg = q->clg; old_loss_model = q->loss_model; if (tb[TCA_NETEM_LOSS]) { ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]); if (ret) { q->loss_model = old_loss_model; q->clg = old_clg; goto unlock; } } else { q->loss_model = CLG_RANDOM; } if (delay_dist) swap(q->delay_dist, delay_dist); if (slot_dist) swap(q->slot_dist, slot_dist); sch->limit = qopt->limit; q->latency = PSCHED_TICKS2NS(qopt->latency); q->jitter = PSCHED_TICKS2NS(qopt->jitter); q->limit = qopt->limit; q->gap = qopt->gap; q->counter = 0; q->loss = qopt->loss; ret = check_netem_in_tree(sch, qopt->duplicate, extack); if (ret) goto unlock; q->duplicate = qopt->duplicate; /* for compatibility with earlier versions. * if gap is set, need to assume 100% probability */ if (q->gap) q->reorder = ~0; if (tb[TCA_NETEM_CORR]) get_correlation(q, tb[TCA_NETEM_CORR]); if (tb[TCA_NETEM_REORDER]) get_reorder(q, tb[TCA_NETEM_REORDER]); if (tb[TCA_NETEM_CORRUPT]) get_corrupt(q, tb[TCA_NETEM_CORRUPT]); if (tb[TCA_NETEM_RATE]) get_rate(q, tb[TCA_NETEM_RATE]); if (tb[TCA_NETEM_RATE64]) q->rate = max_t(u64, q->rate, nla_get_u64(tb[TCA_NETEM_RATE64])); if (tb[TCA_NETEM_LATENCY64]) q->latency = nla_get_s64(tb[TCA_NETEM_LATENCY64]); if (tb[TCA_NETEM_JITTER64]) q->jitter = nla_get_s64(tb[TCA_NETEM_JITTER64]); if (tb[TCA_NETEM_ECN]) q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]); if (tb[TCA_NETEM_SLOT]) get_slot(q, tb[TCA_NETEM_SLOT]); /* capping jitter to the range acceptable by tabledist() */ q->jitter = min_t(s64, abs(q->jitter), INT_MAX); if (tb[TCA_NETEM_PRNG_SEED]) q->prng.seed = nla_get_u64(tb[TCA_NETEM_PRNG_SEED]); else q->prng.seed = get_random_u64(); prandom_seed_state(&q->prng.prng_state, q->prng.seed); unlock: sch_tree_unlock(sch); table_free: dist_free(delay_dist); dist_free(slot_dist); return ret; } static int netem_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct netem_sched_data *q = qdisc_priv(sch); int ret; qdisc_watchdog_init(&q->watchdog, sch); if (!opt) return -EINVAL; q->loss_model = CLG_RANDOM; ret = netem_change(sch, opt, extack); if (ret) pr_info("netem: change failed\n"); return ret; } static void netem_destroy(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); qdisc_watchdog_cancel(&q->watchdog); if (q->qdisc) qdisc_put(q->qdisc); dist_free(q->delay_dist); dist_free(q->slot_dist); } static int dump_loss_model(const struct netem_sched_data *q, struct sk_buff *skb) { struct nlattr *nest; nest = nla_nest_start_noflag(skb, TCA_NETEM_LOSS); if (nest == NULL) goto nla_put_failure; switch (q->loss_model) { case CLG_RANDOM: /* legacy loss model */ nla_nest_cancel(skb, nest); return 0; /* no data */ case CLG_4_STATES: { struct tc_netem_gimodel gi = { .p13 = q->clg.a1, .p31 = q->clg.a2, .p32 = q->clg.a3, .p14 = q->clg.a4, .p23 = q->clg.a5, }; if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi)) goto nla_put_failure; break; } case CLG_GILB_ELL: { struct tc_netem_gemodel ge = { .p = q->clg.a1, .r = q->clg.a2, .h = q->clg.a3, .k1 = q->clg.a4, }; if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge)) goto nla_put_failure; break; } } nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -1; } static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) { const struct netem_sched_data *q = qdisc_priv(sch); struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb); struct tc_netem_qopt qopt; struct tc_netem_corr cor; struct tc_netem_reorder reorder; struct tc_netem_corrupt corrupt; struct tc_netem_rate rate; struct tc_netem_slot slot; qopt.latency = min_t(psched_time_t, PSCHED_NS2TICKS(q->latency), UINT_MAX); qopt.jitter = min_t(psched_time_t, PSCHED_NS2TICKS(q->jitter), UINT_MAX); qopt.limit = q->limit; qopt.loss = q->loss; qopt.gap = q->gap; qopt.duplicate = q->duplicate; if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt)) goto nla_put_failure; if (nla_put(skb, TCA_NETEM_LATENCY64, sizeof(q->latency), &q->latency)) goto nla_put_failure; if (nla_put(skb, TCA_NETEM_JITTER64, sizeof(q->jitter), &q->jitter)) goto nla_put_failure; cor.delay_corr = q->delay_cor.rho; cor.loss_corr = q->loss_cor.rho; cor.dup_corr = q->dup_cor.rho; if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor)) goto nla_put_failure; reorder.probability = q->reorder; reorder.correlation = q->reorder_cor.rho; if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder)) goto nla_put_failure; corrupt.probability = q->corrupt; corrupt.correlation = q->corrupt_cor.rho; if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt)) goto nla_put_failure; if (q->rate >= (1ULL << 32)) { if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate, TCA_NETEM_PAD)) goto nla_put_failure; rate.rate = ~0U; } else { rate.rate = q->rate; } rate.packet_overhead = q->packet_overhead; rate.cell_size = q->cell_size; rate.cell_overhead = q->cell_overhead; if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate)) goto nla_put_failure; if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn)) goto nla_put_failure; if (dump_loss_model(q, skb) != 0) goto nla_put_failure; if (q->slot_config.min_delay | q->slot_config.max_delay | q->slot_config.dist_jitter) { slot = q->slot_config; if (slot.max_packets == INT_MAX) slot.max_packets = 0; if (slot.max_bytes == INT_MAX) slot.max_bytes = 0; if (nla_put(skb, TCA_NETEM_SLOT, sizeof(slot), &slot)) goto nla_put_failure; } if (nla_put_u64_64bit(skb, TCA_NETEM_PRNG_SEED, q->prng.seed, TCA_NETEM_PAD)) goto nla_put_failure; return nla_nest_end(skb, nla); nla_put_failure: nlmsg_trim(skb, nla); return -1; } static int netem_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { struct netem_sched_data *q = qdisc_priv(sch); if (cl != 1 || !q->qdisc) /* only one class */ return -ENOENT; tcm->tcm_handle |= TC_H_MIN(1); tcm->tcm_info = q->qdisc->handle; return 0; } static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct netem_sched_data *q = qdisc_priv(sch); *old = qdisc_replace(sch, new, &q->qdisc); return 0; } static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg) { struct netem_sched_data *q = qdisc_priv(sch); return q->qdisc; } static unsigned long netem_find(struct Qdisc *sch, u32 classid) { return 1; } static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker) { if (!walker->stop) { if (!tc_qdisc_stats_dump(sch, 1, walker)) return; } } static const struct Qdisc_class_ops netem_class_ops = { .graft = netem_graft, .leaf = netem_leaf, .find = netem_find, .walk = netem_walk, .dump = netem_dump_class, }; static struct Qdisc_ops netem_qdisc_ops __read_mostly = { .id = "netem", .cl_ops = &netem_class_ops, .priv_size = sizeof(struct netem_sched_data), .enqueue = netem_enqueue, .dequeue = netem_dequeue, .peek = qdisc_peek_dequeued, .init = netem_init, .reset = netem_reset, .destroy = netem_destroy, .change = netem_change, .dump = netem_dump, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("netem"); static int __init netem_module_init(void) { pr_info("netem: version " VERSION "\n"); return register_qdisc(&netem_qdisc_ops); } static void __exit netem_module_exit(void) { unregister_qdisc(&netem_qdisc_ops); } module_init(netem_module_init) module_exit(netem_module_exit) MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Network characteristics emulator qdisc"); |
| 1 1 1 1 2 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 | // SPDX-License-Identifier: GPL-2.0-or-later /* * ECB: Electronic CodeBook mode * * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/internal/cipher.h> #include <crypto/internal/skcipher.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> static int crypto_ecb_crypt(struct crypto_cipher *cipher, const u8 *src, u8 *dst, unsigned nbytes, bool final, void (*fn)(struct crypto_tfm *, u8 *, const u8 *)) { const unsigned int bsize = crypto_cipher_blocksize(cipher); while (nbytes >= bsize) { fn(crypto_cipher_tfm(cipher), dst, src); src += bsize; dst += bsize; nbytes -= bsize; } return nbytes && final ? -EINVAL : nbytes; } static int crypto_ecb_encrypt2(struct crypto_lskcipher *tfm, const u8 *src, u8 *dst, unsigned len, u8 *iv, u32 flags) { struct crypto_cipher **ctx = crypto_lskcipher_ctx(tfm); struct crypto_cipher *cipher = *ctx; return crypto_ecb_crypt(cipher, src, dst, len, flags & CRYPTO_LSKCIPHER_FLAG_FINAL, crypto_cipher_alg(cipher)->cia_encrypt); } static int crypto_ecb_decrypt2(struct crypto_lskcipher *tfm, const u8 *src, u8 *dst, unsigned len, u8 *iv, u32 flags) { struct crypto_cipher **ctx = crypto_lskcipher_ctx(tfm); struct crypto_cipher *cipher = *ctx; return crypto_ecb_crypt(cipher, src, dst, len, flags & CRYPTO_LSKCIPHER_FLAG_FINAL, crypto_cipher_alg(cipher)->cia_decrypt); } static int lskcipher_setkey_simple2(struct crypto_lskcipher *tfm, const u8 *key, unsigned int keylen) { struct crypto_cipher **ctx = crypto_lskcipher_ctx(tfm); struct crypto_cipher *cipher = *ctx; crypto_cipher_clear_flags(cipher, CRYPTO_TFM_REQ_MASK); crypto_cipher_set_flags(cipher, crypto_lskcipher_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); return crypto_cipher_setkey(cipher, key, keylen); } static int lskcipher_init_tfm_simple2(struct crypto_lskcipher *tfm) { struct lskcipher_instance *inst = lskcipher_alg_instance(tfm); struct crypto_cipher **ctx = crypto_lskcipher_ctx(tfm); struct crypto_cipher_spawn *spawn; struct crypto_cipher *cipher; spawn = lskcipher_instance_ctx(inst); cipher = crypto_spawn_cipher(spawn); if (IS_ERR(cipher)) return PTR_ERR(cipher); *ctx = cipher; return 0; } static void lskcipher_exit_tfm_simple2(struct crypto_lskcipher *tfm) { struct crypto_cipher **ctx = crypto_lskcipher_ctx(tfm); crypto_free_cipher(*ctx); } static void lskcipher_free_instance_simple2(struct lskcipher_instance *inst) { crypto_drop_cipher(lskcipher_instance_ctx(inst)); kfree(inst); } static struct lskcipher_instance *lskcipher_alloc_instance_simple2( struct crypto_template *tmpl, struct rtattr **tb) { struct crypto_cipher_spawn *spawn; struct lskcipher_instance *inst; struct crypto_alg *cipher_alg; u32 mask; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_LSKCIPHER, &mask); if (err) return ERR_PTR(err); inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL); if (!inst) return ERR_PTR(-ENOMEM); spawn = lskcipher_instance_ctx(inst); err = crypto_grab_cipher(spawn, lskcipher_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; cipher_alg = crypto_spawn_cipher_alg(spawn); err = crypto_inst_setname(lskcipher_crypto_instance(inst), tmpl->name, cipher_alg); if (err) goto err_free_inst; inst->free = lskcipher_free_instance_simple2; /* Default algorithm properties, can be overridden */ inst->alg.co.base.cra_blocksize = cipher_alg->cra_blocksize; inst->alg.co.base.cra_alignmask = cipher_alg->cra_alignmask; inst->alg.co.base.cra_priority = cipher_alg->cra_priority; inst->alg.co.min_keysize = cipher_alg->cra_cipher.cia_min_keysize; inst->alg.co.max_keysize = cipher_alg->cra_cipher.cia_max_keysize; inst->alg.co.ivsize = cipher_alg->cra_blocksize; /* Use struct crypto_cipher * by default, can be overridden */ inst->alg.co.base.cra_ctxsize = sizeof(struct crypto_cipher *); inst->alg.setkey = lskcipher_setkey_simple2; inst->alg.init = lskcipher_init_tfm_simple2; inst->alg.exit = lskcipher_exit_tfm_simple2; return inst; err_free_inst: lskcipher_free_instance_simple2(inst); return ERR_PTR(err); } static int crypto_ecb_create2(struct crypto_template *tmpl, struct rtattr **tb) { struct lskcipher_instance *inst; int err; inst = lskcipher_alloc_instance_simple2(tmpl, tb); if (IS_ERR(inst)) return PTR_ERR(inst); /* ECB mode doesn't take an IV */ inst->alg.co.ivsize = 0; inst->alg.encrypt = crypto_ecb_encrypt2; inst->alg.decrypt = crypto_ecb_decrypt2; err = lskcipher_register_instance(tmpl, inst); if (err) inst->free(inst); return err; } static int crypto_ecb_create(struct crypto_template *tmpl, struct rtattr **tb) { struct crypto_lskcipher_spawn *spawn; struct lskcipher_alg *cipher_alg; struct lskcipher_instance *inst; int err; inst = lskcipher_alloc_instance_simple(tmpl, tb); if (IS_ERR(inst)) { err = crypto_ecb_create2(tmpl, tb); return err; } spawn = lskcipher_instance_ctx(inst); cipher_alg = crypto_lskcipher_spawn_alg(spawn); /* ECB mode doesn't take an IV */ inst->alg.co.ivsize = 0; if (cipher_alg->co.ivsize) return -EINVAL; inst->alg.co.base.cra_ctxsize = cipher_alg->co.base.cra_ctxsize; inst->alg.setkey = cipher_alg->setkey; inst->alg.encrypt = cipher_alg->encrypt; inst->alg.decrypt = cipher_alg->decrypt; inst->alg.init = cipher_alg->init; inst->alg.exit = cipher_alg->exit; err = lskcipher_register_instance(tmpl, inst); if (err) inst->free(inst); return err; } static struct crypto_template crypto_ecb_tmpl = { .name = "ecb", .create = crypto_ecb_create, .module = THIS_MODULE, }; static int __init crypto_ecb_module_init(void) { return crypto_register_template(&crypto_ecb_tmpl); } static void __exit crypto_ecb_module_exit(void) { crypto_unregister_template(&crypto_ecb_tmpl); } module_init(crypto_ecb_module_init); module_exit(crypto_ecb_module_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("ECB block cipher mode of operation"); MODULE_ALIAS_CRYPTO("ecb"); MODULE_IMPORT_NS("CRYPTO_INTERNAL"); |
| 21 22 13 9 56 56 1 4 56 52 51 52 52 30 22 6 34 3 2 33 34 34 33 5 234 24 229 232 2 4 4 4 4 2 2 2 2 2 2 37 2 37 18 16 2 15 38 11 6 1 4 9 49 48 49 96 6 102 102 102 470 350 37 37 35 44 191 12 44 677 699 621 128 622 623 5 621 623 622 305 127 201 1 1 304 304 5 10 301 302 7 294 5 257 257 256 257 257 5 1 1 9 3 12 2 3 2 16 37 15 1 21 37 37 37 32 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 | // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001-2003 International Business Machines, Corp. * Copyright (c) 2001 Intel Corp. * Copyright (c) 2001 Nokia, Inc. * Copyright (c) 2001 La Monte H.P. Yarroll * * This file is part of the SCTP kernel implementation * * These functions handle all input from the IP layer into SCTP. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Xingang Guo <xingang.guo@intel.com> * Jon Grimm <jgrimm@us.ibm.com> * Hui Huang <hui.huang@nokia.com> * Daisy Chang <daisyc@us.ibm.com> * Sridhar Samudrala <sri@us.ibm.com> * Ardelle Fan <ardelle.fan@intel.com> */ #include <linux/types.h> #include <linux/list.h> /* For struct list_head */ #include <linux/socket.h> #include <linux/ip.h> #include <linux/time.h> /* For struct timeval */ #include <linux/slab.h> #include <net/ip.h> #include <net/icmp.h> #include <net/snmp.h> #include <net/sock.h> #include <net/xfrm.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/checksum.h> #include <net/net_namespace.h> #include <linux/rhashtable.h> #include <net/sock_reuseport.h> /* Forward declarations for internal helpers. */ static int sctp_rcv_ootb(struct sk_buff *); static struct sctp_association *__sctp_rcv_lookup(struct net *net, struct sk_buff *skb, const union sctp_addr *paddr, const union sctp_addr *laddr, struct sctp_transport **transportp, int dif, int sdif); static struct sctp_endpoint *__sctp_rcv_lookup_endpoint( struct net *net, struct sk_buff *skb, const union sctp_addr *laddr, const union sctp_addr *daddr, int dif, int sdif); static struct sctp_association *__sctp_lookup_association( struct net *net, const union sctp_addr *local, const union sctp_addr *peer, struct sctp_transport **pt, int dif, int sdif); static int sctp_add_backlog(struct sock *sk, struct sk_buff *skb); /* Calculate the SCTP checksum of an SCTP packet. */ static inline int sctp_rcv_checksum(struct net *net, struct sk_buff *skb) { struct sctphdr *sh = sctp_hdr(skb); __le32 cmp = sh->checksum; __le32 val = sctp_compute_cksum(skb, 0); if (val != cmp) { /* CRC failure, dump it. */ __SCTP_INC_STATS(net, SCTP_MIB_CHECKSUMERRORS); return -1; } return 0; } /* * This is the routine which IP calls when receiving an SCTP packet. */ int sctp_rcv(struct sk_buff *skb) { struct sock *sk; struct sctp_association *asoc; struct sctp_endpoint *ep = NULL; struct sctp_ep_common *rcvr; struct sctp_transport *transport = NULL; struct sctp_chunk *chunk; union sctp_addr src; union sctp_addr dest; int family; struct sctp_af *af; struct net *net = dev_net(skb->dev); bool is_gso = skb_is_gso(skb) && skb_is_gso_sctp(skb); int dif, sdif; if (skb->pkt_type != PACKET_HOST) goto discard_it; __SCTP_INC_STATS(net, SCTP_MIB_INSCTPPACKS); /* If packet is too small to contain a single chunk, let's not * waste time on it anymore. */ if (skb->len < sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + skb_transport_offset(skb)) goto discard_it; /* If the packet is fragmented and we need to do crc checking, * it's better to just linearize it otherwise crc computing * takes longer. */ if (((!is_gso || skb_cloned(skb)) && skb_linearize(skb)) || !pskb_may_pull(skb, sizeof(struct sctphdr))) goto discard_it; /* Pull up the IP header. */ __skb_pull(skb, skb_transport_offset(skb)); skb->csum_valid = 0; /* Previous value not applicable */ if (skb_csum_unnecessary(skb)) __skb_decr_checksum_unnecessary(skb); else if (!sctp_checksum_disable && !is_gso && sctp_rcv_checksum(net, skb) < 0) goto discard_it; skb->csum_valid = 1; __skb_pull(skb, sizeof(struct sctphdr)); family = ipver2af(ip_hdr(skb)->version); af = sctp_get_af_specific(family); if (unlikely(!af)) goto discard_it; SCTP_INPUT_CB(skb)->af = af; /* Initialize local addresses for lookups. */ af->from_skb(&src, skb, 1); af->from_skb(&dest, skb, 0); dif = af->skb_iif(skb); sdif = af->skb_sdif(skb); /* If the packet is to or from a non-unicast address, * silently discard the packet. * * This is not clearly defined in the RFC except in section * 8.4 - OOTB handling. However, based on the book "Stream Control * Transmission Protocol" 2.1, "It is important to note that the * IP address of an SCTP transport address must be a routable * unicast address. In other words, IP multicast addresses and * IP broadcast addresses cannot be used in an SCTP transport * address." */ if (!af->addr_valid(&src, NULL, skb) || !af->addr_valid(&dest, NULL, skb)) goto discard_it; asoc = __sctp_rcv_lookup(net, skb, &src, &dest, &transport, dif, sdif); if (!asoc) ep = __sctp_rcv_lookup_endpoint(net, skb, &dest, &src, dif, sdif); /* Retrieve the common input handling substructure. */ rcvr = asoc ? &asoc->base : &ep->base; sk = rcvr->sk; /* * RFC 2960, 8.4 - Handle "Out of the blue" Packets. * An SCTP packet is called an "out of the blue" (OOTB) * packet if it is correctly formed, i.e., passed the * receiver's checksum check, but the receiver is not * able to identify the association to which this * packet belongs. */ if (!asoc) { if (sctp_rcv_ootb(skb)) { __SCTP_INC_STATS(net, SCTP_MIB_OUTOFBLUES); goto discard_release; } } if (!xfrm_policy_check(sk, XFRM_POLICY_IN, skb, family)) goto discard_release; nf_reset_ct(skb); if (sk_filter(sk, skb)) goto discard_release; /* Create an SCTP packet structure. */ chunk = sctp_chunkify(skb, asoc, sk, GFP_ATOMIC); if (!chunk) goto discard_release; SCTP_INPUT_CB(skb)->chunk = chunk; /* Remember what endpoint is to handle this packet. */ chunk->rcvr = rcvr; /* Remember the SCTP header. */ chunk->sctp_hdr = sctp_hdr(skb); /* Set the source and destination addresses of the incoming chunk. */ sctp_init_addrs(chunk, &src, &dest); /* Remember where we came from. */ chunk->transport = transport; /* Acquire access to the sock lock. Note: We are safe from other * bottom halves on this lock, but a user may be in the lock too, * so check if it is busy. */ bh_lock_sock(sk); if (sk != rcvr->sk) { /* Our cached sk is different from the rcvr->sk. This is * because migrate()/accept() may have moved the association * to a new socket and released all the sockets. So now we * are holding a lock on the old socket while the user may * be doing something with the new socket. Switch our veiw * of the current sk. */ bh_unlock_sock(sk); sk = rcvr->sk; bh_lock_sock(sk); } if (sock_owned_by_user(sk) || !sctp_newsk_ready(sk)) { if (sctp_add_backlog(sk, skb)) { bh_unlock_sock(sk); sctp_chunk_free(chunk); skb = NULL; /* sctp_chunk_free already freed the skb */ goto discard_release; } __SCTP_INC_STATS(net, SCTP_MIB_IN_PKT_BACKLOG); } else { __SCTP_INC_STATS(net, SCTP_MIB_IN_PKT_SOFTIRQ); sctp_inq_push(&chunk->rcvr->inqueue, chunk); } bh_unlock_sock(sk); /* Release the asoc/ep ref we took in the lookup calls. */ if (transport) sctp_transport_put(transport); else sctp_endpoint_put(ep); return 0; discard_it: __SCTP_INC_STATS(net, SCTP_MIB_IN_PKT_DISCARDS); kfree_skb(skb); return 0; discard_release: /* Release the asoc/ep ref we took in the lookup calls. */ if (transport) sctp_transport_put(transport); else sctp_endpoint_put(ep); goto discard_it; } /* Process the backlog queue of the socket. Every skb on * the backlog holds a ref on an association or endpoint. * We hold this ref throughout the state machine to make * sure that the structure we need is still around. */ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) { struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk; struct sctp_inq *inqueue = &chunk->rcvr->inqueue; struct sctp_transport *t = chunk->transport; struct sctp_ep_common *rcvr = NULL; int backloged = 0; rcvr = chunk->rcvr; /* If the rcvr is dead then the association or endpoint * has been deleted and we can safely drop the chunk * and refs that we are holding. */ if (rcvr->dead) { sctp_chunk_free(chunk); goto done; } if (unlikely(rcvr->sk != sk)) { /* In this case, the association moved from one socket to * another. We are currently sitting on the backlog of the * old socket, so we need to move. * However, since we are here in the process context we * need to take make sure that the user doesn't own * the new socket when we process the packet. * If the new socket is user-owned, queue the chunk to the * backlog of the new socket without dropping any refs. * Otherwise, we can safely push the chunk on the inqueue. */ sk = rcvr->sk; local_bh_disable(); bh_lock_sock(sk); if (sock_owned_by_user(sk) || !sctp_newsk_ready(sk)) { if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) sctp_chunk_free(chunk); else backloged = 1; } else sctp_inq_push(inqueue, chunk); bh_unlock_sock(sk); local_bh_enable(); /* If the chunk was backloged again, don't drop refs */ if (backloged) return 0; } else { if (!sctp_newsk_ready(sk)) { if (!sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) return 0; sctp_chunk_free(chunk); } else { sctp_inq_push(inqueue, chunk); } } done: /* Release the refs we took in sctp_add_backlog */ if (SCTP_EP_TYPE_ASSOCIATION == rcvr->type) sctp_transport_put(t); else if (SCTP_EP_TYPE_SOCKET == rcvr->type) sctp_endpoint_put(sctp_ep(rcvr)); else BUG(); return 0; } static int sctp_add_backlog(struct sock *sk, struct sk_buff *skb) { struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk; struct sctp_transport *t = chunk->transport; struct sctp_ep_common *rcvr = chunk->rcvr; int ret; ret = sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)); if (!ret) { /* Hold the assoc/ep while hanging on the backlog queue. * This way, we know structures we need will not disappear * from us */ if (SCTP_EP_TYPE_ASSOCIATION == rcvr->type) sctp_transport_hold(t); else if (SCTP_EP_TYPE_SOCKET == rcvr->type) sctp_endpoint_hold(sctp_ep(rcvr)); else BUG(); } return ret; } /* Handle icmp frag needed error. */ void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc, struct sctp_transport *t, __u32 pmtu) { if (!t || (t->pathmtu <= pmtu && t->pl.probe_size + sctp_transport_pl_hlen(t) <= pmtu)) return; if (sock_owned_by_user(sk)) { atomic_set(&t->mtu_info, pmtu); asoc->pmtu_pending = 1; t->pmtu_pending = 1; return; } if (!(t->param_flags & SPP_PMTUD_ENABLE)) /* We can't allow retransmitting in such case, as the * retransmission would be sized just as before, and thus we * would get another icmp, and retransmit again. */ return; /* Update transports view of the MTU. Return if no update was needed. * If an update wasn't needed/possible, it also doesn't make sense to * try to retransmit now. */ if (!sctp_transport_update_pmtu(t, pmtu)) return; /* Update association pmtu. */ sctp_assoc_sync_pmtu(asoc); /* Retransmit with the new pmtu setting. */ sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD); } void sctp_icmp_redirect(struct sock *sk, struct sctp_transport *t, struct sk_buff *skb) { struct dst_entry *dst; if (sock_owned_by_user(sk) || !t) return; dst = sctp_transport_dst_check(t); if (dst) dst->ops->redirect(dst, sk, skb); } /* * SCTP Implementer's Guide, 2.37 ICMP handling procedures * * ICMP8) If the ICMP code is a "Unrecognized next header type encountered" * or a "Protocol Unreachable" treat this message as an abort * with the T bit set. * * This function sends an event to the state machine, which will abort the * association. * */ void sctp_icmp_proto_unreachable(struct sock *sk, struct sctp_association *asoc, struct sctp_transport *t) { if (sock_owned_by_user(sk)) { if (timer_pending(&t->proto_unreach_timer)) return; else { if (!mod_timer(&t->proto_unreach_timer, jiffies + (HZ/20))) sctp_transport_hold(t); } } else { struct net *net = sock_net(sk); pr_debug("%s: unrecognized next header type " "encountered!\n", __func__); if (timer_delete(&t->proto_unreach_timer)) sctp_transport_put(t); sctp_do_sm(net, SCTP_EVENT_T_OTHER, SCTP_ST_OTHER(SCTP_EVENT_ICMP_PROTO_UNREACH), asoc->state, asoc->ep, asoc, t, GFP_ATOMIC); } } /* Common lookup code for icmp/icmpv6 error handler. */ struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *skb, struct sctphdr *sctphdr, struct sctp_association **app, struct sctp_transport **tpp) { struct sctp_init_chunk *chunkhdr, _chunkhdr; union sctp_addr saddr; union sctp_addr daddr; struct sctp_af *af; struct sock *sk = NULL; struct sctp_association *asoc; struct sctp_transport *transport = NULL; __u32 vtag = ntohl(sctphdr->vtag); int sdif = inet_sdif(skb); int dif = inet_iif(skb); *app = NULL; *tpp = NULL; af = sctp_get_af_specific(family); if (unlikely(!af)) { return NULL; } /* Initialize local addresses for lookups. */ af->from_skb(&saddr, skb, 1); af->from_skb(&daddr, skb, 0); /* Look for an association that matches the incoming ICMP error * packet. */ asoc = __sctp_lookup_association(net, &saddr, &daddr, &transport, dif, sdif); if (!asoc) return NULL; sk = asoc->base.sk; /* RFC 4960, Appendix C. ICMP Handling * * ICMP6) An implementation MUST validate that the Verification Tag * contained in the ICMP message matches the Verification Tag of * the peer. If the Verification Tag is not 0 and does NOT * match, discard the ICMP message. If it is 0 and the ICMP * message contains enough bytes to verify that the chunk type is * an INIT chunk and that the Initiate Tag matches the tag of the * peer, continue with ICMP7. If the ICMP message is too short * or the chunk type or the Initiate Tag does not match, silently * discard the packet. */ if (vtag == 0) { /* chunk header + first 4 octects of init header */ chunkhdr = skb_header_pointer(skb, skb_transport_offset(skb) + sizeof(struct sctphdr), sizeof(struct sctp_chunkhdr) + sizeof(__be32), &_chunkhdr); if (!chunkhdr || chunkhdr->chunk_hdr.type != SCTP_CID_INIT || ntohl(chunkhdr->init_hdr.init_tag) != asoc->c.my_vtag) goto out; } else if (vtag != asoc->c.peer_vtag) { goto out; } bh_lock_sock(sk); /* If too many ICMPs get dropped on busy * servers this needs to be solved differently. */ if (sock_owned_by_user(sk)) __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); *app = asoc; *tpp = transport; return sk; out: sctp_transport_put(transport); return NULL; } /* Common cleanup code for icmp/icmpv6 error handler. */ void sctp_err_finish(struct sock *sk, struct sctp_transport *t) __releases(&((__sk)->sk_lock.slock)) { bh_unlock_sock(sk); sctp_transport_put(t); } static void sctp_v4_err_handle(struct sctp_transport *t, struct sk_buff *skb, __u8 type, __u8 code, __u32 info) { struct sctp_association *asoc = t->asoc; struct sock *sk = asoc->base.sk; int err = 0; switch (type) { case ICMP_PARAMETERPROB: err = EPROTO; break; case ICMP_DEST_UNREACH: if (code > NR_ICMP_UNREACH) return; if (code == ICMP_FRAG_NEEDED) { sctp_icmp_frag_needed(sk, asoc, t, SCTP_TRUNC4(info)); return; } if (code == ICMP_PROT_UNREACH) { sctp_icmp_proto_unreachable(sk, asoc, t); return; } err = icmp_err_convert[code].errno; break; case ICMP_TIME_EXCEEDED: if (code == ICMP_EXC_FRAGTIME) return; err = EHOSTUNREACH; break; case ICMP_REDIRECT: sctp_icmp_redirect(sk, t, skb); return; default: return; } if (!sock_owned_by_user(sk) && inet_test_bit(RECVERR, sk)) { sk->sk_err = err; sk_error_report(sk); } else { /* Only an error on timeout */ WRITE_ONCE(sk->sk_err_soft, err); } } /* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should * be closed and the error returned to the user. If err > 0 * it's just the icmp type << 8 | icmp code. After adjustment * header points to the first 8 bytes of the sctp header. We need * to find the appropriate port. * * The locking strategy used here is very "optimistic". When * someone else accesses the socket the ICMP is just dropped * and for some paths there is no check at all. * A more general error queue to queue errors for later handling * is probably better. * */ int sctp_v4_err(struct sk_buff *skb, __u32 info) { const struct iphdr *iph = (const struct iphdr *)skb->data; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct net *net = dev_net(skb->dev); struct sctp_transport *transport; struct sctp_association *asoc; __u16 saveip, savesctp; struct sock *sk; /* Fix up skb to look at the embedded net header. */ saveip = skb->network_header; savesctp = skb->transport_header; skb_reset_network_header(skb); skb_set_transport_header(skb, iph->ihl * 4); sk = sctp_err_lookup(net, AF_INET, skb, sctp_hdr(skb), &asoc, &transport); /* Put back, the original values. */ skb->network_header = saveip; skb->transport_header = savesctp; if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return -ENOENT; } sctp_v4_err_handle(transport, skb, type, code, info); sctp_err_finish(sk, transport); return 0; } int sctp_udp_v4_err(struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(skb->dev); struct sctp_association *asoc; struct sctp_transport *t; struct icmphdr *hdr; __u32 info = 0; skb->transport_header += sizeof(struct udphdr); sk = sctp_err_lookup(net, AF_INET, skb, sctp_hdr(skb), &asoc, &t); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return -ENOENT; } skb->transport_header -= sizeof(struct udphdr); hdr = (struct icmphdr *)(skb_network_header(skb) - sizeof(struct icmphdr)); if (hdr->type == ICMP_REDIRECT) { /* can't be handled without outer iphdr known, leave it to udp_err */ sctp_err_finish(sk, t); return 0; } if (hdr->type == ICMP_DEST_UNREACH && hdr->code == ICMP_FRAG_NEEDED) info = ntohs(hdr->un.frag.mtu); sctp_v4_err_handle(t, skb, hdr->type, hdr->code, info); sctp_err_finish(sk, t); return 1; } /* * RFC 2960, 8.4 - Handle "Out of the blue" Packets. * * This function scans all the chunks in the OOTB packet to determine if * the packet should be discarded right away. If a response might be needed * for this packet, or, if further processing is possible, the packet will * be queued to a proper inqueue for the next phase of handling. * * Output: * Return 0 - If further processing is needed. * Return 1 - If the packet can be discarded right away. */ static int sctp_rcv_ootb(struct sk_buff *skb) { struct sctp_chunkhdr *ch, _ch; int ch_end, offset = 0; /* Scan through all the chunks in the packet. */ do { /* Make sure we have at least the header there */ if (offset + sizeof(_ch) > skb->len) break; ch = skb_header_pointer(skb, offset, sizeof(*ch), &_ch); /* Break out if chunk length is less then minimal. */ if (!ch || ntohs(ch->length) < sizeof(_ch)) break; ch_end = offset + SCTP_PAD4(ntohs(ch->length)); if (ch_end > skb->len) break; /* RFC 8.4, 2) If the OOTB packet contains an ABORT chunk, the * receiver MUST silently discard the OOTB packet and take no * further action. */ if (SCTP_CID_ABORT == ch->type) goto discard; /* RFC 8.4, 6) If the packet contains a SHUTDOWN COMPLETE * chunk, the receiver should silently discard the packet * and take no further action. */ if (SCTP_CID_SHUTDOWN_COMPLETE == ch->type) goto discard; /* RFC 4460, 2.11.2 * This will discard packets with INIT chunk bundled as * subsequent chunks in the packet. When INIT is first, * the normal INIT processing will discard the chunk. */ if (SCTP_CID_INIT == ch->type && (void *)ch != skb->data) goto discard; offset = ch_end; } while (ch_end < skb->len); return 0; discard: return 1; } /* Insert endpoint into the hash table. */ static int __sctp_hash_endpoint(struct sctp_endpoint *ep) { struct sock *sk = ep->base.sk; struct net *net = sock_net(sk); struct sctp_hashbucket *head; int err = 0; ep->hashent = sctp_ep_hashfn(net, ep->base.bind_addr.port); head = &sctp_ep_hashtable[ep->hashent]; write_lock(&head->lock); if (sk->sk_reuseport) { bool any = sctp_is_ep_boundall(sk); struct sctp_endpoint *ep2; struct list_head *list; int cnt = 0; err = 1; list_for_each(list, &ep->base.bind_addr.address_list) cnt++; sctp_for_each_hentry(ep2, &head->chain) { struct sock *sk2 = ep2->base.sk; if (!net_eq(sock_net(sk2), net) || sk2 == sk || !uid_eq(sk_uid(sk2), sk_uid(sk)) || !sk2->sk_reuseport) continue; err = sctp_bind_addrs_check(sctp_sk(sk2), sctp_sk(sk), cnt); if (!err) { err = reuseport_add_sock(sk, sk2, any); if (err) goto out; break; } else if (err < 0) { goto out; } } if (err) { err = reuseport_alloc(sk, any); if (err) goto out; } } hlist_add_head(&ep->node, &head->chain); out: write_unlock(&head->lock); return err; } /* Add an endpoint to the hash. Local BH-safe. */ int sctp_hash_endpoint(struct sctp_endpoint *ep) { int err; local_bh_disable(); err = __sctp_hash_endpoint(ep); local_bh_enable(); return err; } /* Remove endpoint from the hash table. */ static void __sctp_unhash_endpoint(struct sctp_endpoint *ep) { struct sock *sk = ep->base.sk; struct sctp_hashbucket *head; ep->hashent = sctp_ep_hashfn(sock_net(sk), ep->base.bind_addr.port); head = &sctp_ep_hashtable[ep->hashent]; write_lock(&head->lock); if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_detach_sock(sk); hlist_del_init(&ep->node); write_unlock(&head->lock); } /* Remove endpoint from the hash. Local BH-safe. */ void sctp_unhash_endpoint(struct sctp_endpoint *ep) { local_bh_disable(); __sctp_unhash_endpoint(ep); local_bh_enable(); } static inline __u32 sctp_hashfn(const struct net *net, __be16 lport, const union sctp_addr *paddr, __u32 seed) { __u32 addr; if (paddr->sa.sa_family == AF_INET6) addr = jhash(&paddr->v6.sin6_addr, 16, seed); else addr = (__force __u32)paddr->v4.sin_addr.s_addr; return jhash_3words(addr, ((__force __u32)paddr->v4.sin_port) << 16 | (__force __u32)lport, net_hash_mix(net), seed); } /* Look up an endpoint. */ static struct sctp_endpoint *__sctp_rcv_lookup_endpoint( struct net *net, struct sk_buff *skb, const union sctp_addr *laddr, const union sctp_addr *paddr, int dif, int sdif) { struct sctp_hashbucket *head; struct sctp_endpoint *ep; struct sock *sk; __be16 lport; int hash; lport = laddr->v4.sin_port; hash = sctp_ep_hashfn(net, ntohs(lport)); head = &sctp_ep_hashtable[hash]; read_lock(&head->lock); sctp_for_each_hentry(ep, &head->chain) { if (sctp_endpoint_is_match(ep, net, laddr, dif, sdif)) goto hit; } ep = sctp_sk(net->sctp.ctl_sock)->ep; hit: sk = ep->base.sk; if (sk->sk_reuseport) { __u32 phash = sctp_hashfn(net, lport, paddr, 0); sk = reuseport_select_sock(sk, phash, skb, sizeof(struct sctphdr)); if (sk) ep = sctp_sk(sk)->ep; } sctp_endpoint_hold(ep); read_unlock(&head->lock); return ep; } /* rhashtable for transport */ struct sctp_hash_cmp_arg { const union sctp_addr *paddr; const struct net *net; __be16 lport; }; static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { struct sctp_transport *t = (struct sctp_transport *)ptr; const struct sctp_hash_cmp_arg *x = arg->key; int err = 1; if (!sctp_cmp_addr_exact(&t->ipaddr, x->paddr)) return err; if (!sctp_transport_hold(t)) return err; if (!net_eq(t->asoc->base.net, x->net)) goto out; if (x->lport != htons(t->asoc->base.bind_addr.port)) goto out; err = 0; out: sctp_transport_put(t); return err; } static inline __u32 sctp_hash_obj(const void *data, u32 len, u32 seed) { const struct sctp_transport *t = data; return sctp_hashfn(t->asoc->base.net, htons(t->asoc->base.bind_addr.port), &t->ipaddr, seed); } static inline __u32 sctp_hash_key(const void *data, u32 len, u32 seed) { const struct sctp_hash_cmp_arg *x = data; return sctp_hashfn(x->net, x->lport, x->paddr, seed); } static const struct rhashtable_params sctp_hash_params = { .head_offset = offsetof(struct sctp_transport, node), .hashfn = sctp_hash_key, .obj_hashfn = sctp_hash_obj, .obj_cmpfn = sctp_hash_cmp, .automatic_shrinking = true, }; int sctp_transport_hashtable_init(void) { return rhltable_init(&sctp_transport_hashtable, &sctp_hash_params); } void sctp_transport_hashtable_destroy(void) { rhltable_destroy(&sctp_transport_hashtable); } int sctp_hash_transport(struct sctp_transport *t) { struct sctp_transport *transport; struct rhlist_head *tmp, *list; struct sctp_hash_cmp_arg arg; int err; if (t->asoc->temp) return 0; arg.net = t->asoc->base.net; arg.paddr = &t->ipaddr; arg.lport = htons(t->asoc->base.bind_addr.port); rcu_read_lock(); list = rhltable_lookup(&sctp_transport_hashtable, &arg, sctp_hash_params); rhl_for_each_entry_rcu(transport, tmp, list, node) if (transport->asoc->ep == t->asoc->ep) { rcu_read_unlock(); return -EEXIST; } rcu_read_unlock(); err = rhltable_insert_key(&sctp_transport_hashtable, &arg, &t->node, sctp_hash_params); if (err) pr_err_once("insert transport fail, errno %d\n", err); return err; } void sctp_unhash_transport(struct sctp_transport *t) { if (t->asoc->temp) return; rhltable_remove(&sctp_transport_hashtable, &t->node, sctp_hash_params); } bool sctp_sk_bound_dev_eq(struct net *net, int bound_dev_if, int dif, int sdif) { bool l3mdev_accept = true; #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) l3mdev_accept = !!READ_ONCE(net->sctp.l3mdev_accept); #endif return inet_bound_dev_eq(l3mdev_accept, bound_dev_if, dif, sdif); } /* return a transport with holding it */ struct sctp_transport *sctp_addrs_lookup_transport( struct net *net, const union sctp_addr *laddr, const union sctp_addr *paddr, int dif, int sdif) { struct rhlist_head *tmp, *list; struct sctp_transport *t; int bound_dev_if; struct sctp_hash_cmp_arg arg = { .paddr = paddr, .net = net, .lport = laddr->v4.sin_port, }; list = rhltable_lookup(&sctp_transport_hashtable, &arg, sctp_hash_params); rhl_for_each_entry_rcu(t, tmp, list, node) { if (!sctp_transport_hold(t)) continue; bound_dev_if = READ_ONCE(t->asoc->base.sk->sk_bound_dev_if); if (sctp_sk_bound_dev_eq(net, bound_dev_if, dif, sdif) && sctp_bind_addr_match(&t->asoc->base.bind_addr, laddr, sctp_sk(t->asoc->base.sk))) return t; sctp_transport_put(t); } return NULL; } /* return a transport without holding it, as it's only used under sock lock */ struct sctp_transport *sctp_epaddr_lookup_transport( const struct sctp_endpoint *ep, const union sctp_addr *paddr) { struct rhlist_head *tmp, *list; struct sctp_transport *t; struct sctp_hash_cmp_arg arg = { .paddr = paddr, .net = ep->base.net, .lport = htons(ep->base.bind_addr.port), }; list = rhltable_lookup(&sctp_transport_hashtable, &arg, sctp_hash_params); rhl_for_each_entry_rcu(t, tmp, list, node) if (ep == t->asoc->ep) return t; return NULL; } /* Look up an association. */ static struct sctp_association *__sctp_lookup_association( struct net *net, const union sctp_addr *local, const union sctp_addr *peer, struct sctp_transport **pt, int dif, int sdif) { struct sctp_transport *t; struct sctp_association *asoc = NULL; t = sctp_addrs_lookup_transport(net, local, peer, dif, sdif); if (!t) goto out; asoc = t->asoc; *pt = t; out: return asoc; } /* Look up an association. protected by RCU read lock */ static struct sctp_association *sctp_lookup_association(struct net *net, const union sctp_addr *laddr, const union sctp_addr *paddr, struct sctp_transport **transportp, int dif, int sdif) { struct sctp_association *asoc; rcu_read_lock(); asoc = __sctp_lookup_association(net, laddr, paddr, transportp, dif, sdif); rcu_read_unlock(); return asoc; } /* Is there an association matching the given local and peer addresses? */ bool sctp_has_association(struct net *net, const union sctp_addr *laddr, const union sctp_addr *paddr, int dif, int sdif) { struct sctp_transport *transport; if (sctp_lookup_association(net, laddr, paddr, &transport, dif, sdif)) { sctp_transport_put(transport); return true; } return false; } /* * SCTP Implementors Guide, 2.18 Handling of address * parameters within the INIT or INIT-ACK. * * D) When searching for a matching TCB upon reception of an INIT * or INIT-ACK chunk the receiver SHOULD use not only the * source address of the packet (containing the INIT or * INIT-ACK) but the receiver SHOULD also use all valid * address parameters contained within the chunk. * * 2.18.3 Solution description * * This new text clearly specifies to an implementor the need * to look within the INIT or INIT-ACK. Any implementation that * does not do this, may not be able to establish associations * in certain circumstances. * */ static struct sctp_association *__sctp_rcv_init_lookup(struct net *net, struct sk_buff *skb, const union sctp_addr *laddr, struct sctp_transport **transportp, int dif, int sdif) { struct sctp_association *asoc; union sctp_addr addr; union sctp_addr *paddr = &addr; struct sctphdr *sh = sctp_hdr(skb); union sctp_params params; struct sctp_init_chunk *init; struct sctp_af *af; /* * This code will NOT touch anything inside the chunk--it is * strictly READ-ONLY. * * RFC 2960 3 SCTP packet Format * * Multiple chunks can be bundled into one SCTP packet up to * the MTU size, except for the INIT, INIT ACK, and SHUTDOWN * COMPLETE chunks. These chunks MUST NOT be bundled with any * other chunk in a packet. See Section 6.10 for more details * on chunk bundling. */ /* Find the start of the TLVs and the end of the chunk. This is * the region we search for address parameters. */ init = (struct sctp_init_chunk *)skb->data; /* Walk the parameters looking for embedded addresses. */ sctp_walk_params(params, init) { /* Note: Ignoring hostname addresses. */ af = sctp_get_af_specific(param_type2af(params.p->type)); if (!af) continue; if (!af->from_addr_param(paddr, params.addr, sh->source, 0)) continue; asoc = __sctp_lookup_association(net, laddr, paddr, transportp, dif, sdif); if (asoc) return asoc; } return NULL; } /* ADD-IP, Section 5.2 * When an endpoint receives an ASCONF Chunk from the remote peer * special procedures may be needed to identify the association the * ASCONF Chunk is associated with. To properly find the association * the following procedures SHOULD be followed: * * D2) If the association is not found, use the address found in the * Address Parameter TLV combined with the port number found in the * SCTP common header. If found proceed to rule D4. * * D2-ext) If more than one ASCONF Chunks are packed together, use the * address found in the ASCONF Address Parameter TLV of each of the * subsequent ASCONF Chunks. If found, proceed to rule D4. */ static struct sctp_association *__sctp_rcv_asconf_lookup( struct net *net, struct sctp_chunkhdr *ch, const union sctp_addr *laddr, __be16 peer_port, struct sctp_transport **transportp, int dif, int sdif) { struct sctp_addip_chunk *asconf = (struct sctp_addip_chunk *)ch; struct sctp_af *af; union sctp_addr_param *param; union sctp_addr paddr; if (ntohs(ch->length) < sizeof(*asconf) + sizeof(struct sctp_paramhdr)) return NULL; /* Skip over the ADDIP header and find the Address parameter */ param = (union sctp_addr_param *)(asconf + 1); af = sctp_get_af_specific(param_type2af(param->p.type)); if (unlikely(!af)) return NULL; if (!af->from_addr_param(&paddr, param, peer_port, 0)) return NULL; return __sctp_lookup_association(net, laddr, &paddr, transportp, dif, sdif); } /* SCTP-AUTH, Section 6.3: * If the receiver does not find a STCB for a packet containing an AUTH * chunk as the first chunk and not a COOKIE-ECHO chunk as the second * chunk, it MUST use the chunks after the AUTH chunk to look up an existing * association. * * This means that any chunks that can help us identify the association need * to be looked at to find this association. */ static struct sctp_association *__sctp_rcv_walk_lookup(struct net *net, struct sk_buff *skb, const union sctp_addr *laddr, struct sctp_transport **transportp, int dif, int sdif) { struct sctp_association *asoc = NULL; struct sctp_chunkhdr *ch; int have_auth = 0; unsigned int chunk_num = 1; __u8 *ch_end; /* Walk through the chunks looking for AUTH or ASCONF chunks * to help us find the association. */ ch = (struct sctp_chunkhdr *)skb->data; do { /* Break out if chunk length is less then minimal. */ if (ntohs(ch->length) < sizeof(*ch)) break; ch_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length)); if (ch_end > skb_tail_pointer(skb)) break; switch (ch->type) { case SCTP_CID_AUTH: have_auth = chunk_num; break; case SCTP_CID_COOKIE_ECHO: /* If a packet arrives containing an AUTH chunk as * a first chunk, a COOKIE-ECHO chunk as the second * chunk, and possibly more chunks after them, and * the receiver does not have an STCB for that * packet, then authentication is based on * the contents of the COOKIE- ECHO chunk. */ if (have_auth == 1 && chunk_num == 2) return NULL; break; case SCTP_CID_ASCONF: if (have_auth || net->sctp.addip_noauth) asoc = __sctp_rcv_asconf_lookup( net, ch, laddr, sctp_hdr(skb)->source, transportp, dif, sdif); break; default: break; } if (asoc) break; ch = (struct sctp_chunkhdr *)ch_end; chunk_num++; } while (ch_end + sizeof(*ch) < skb_tail_pointer(skb)); return asoc; } /* * There are circumstances when we need to look inside the SCTP packet * for information to help us find the association. Examples * include looking inside of INIT/INIT-ACK chunks or after the AUTH * chunks. */ static struct sctp_association *__sctp_rcv_lookup_harder(struct net *net, struct sk_buff *skb, const union sctp_addr *laddr, struct sctp_transport **transportp, int dif, int sdif) { struct sctp_chunkhdr *ch; /* We do not allow GSO frames here as we need to linearize and * then cannot guarantee frame boundaries. This shouldn't be an * issue as packets hitting this are mostly INIT or INIT-ACK and * those cannot be on GSO-style anyway. */ if (skb_is_gso(skb) && skb_is_gso_sctp(skb)) return NULL; ch = (struct sctp_chunkhdr *)skb->data; /* The code below will attempt to walk the chunk and extract * parameter information. Before we do that, we need to verify * that the chunk length doesn't cause overflow. Otherwise, we'll * walk off the end. */ if (SCTP_PAD4(ntohs(ch->length)) > skb->len) return NULL; /* If this is INIT/INIT-ACK look inside the chunk too. */ if (ch->type == SCTP_CID_INIT || ch->type == SCTP_CID_INIT_ACK) return __sctp_rcv_init_lookup(net, skb, laddr, transportp, dif, sdif); return __sctp_rcv_walk_lookup(net, skb, laddr, transportp, dif, sdif); } /* Lookup an association for an inbound skb. */ static struct sctp_association *__sctp_rcv_lookup(struct net *net, struct sk_buff *skb, const union sctp_addr *paddr, const union sctp_addr *laddr, struct sctp_transport **transportp, int dif, int sdif) { struct sctp_association *asoc; asoc = __sctp_lookup_association(net, laddr, paddr, transportp, dif, sdif); if (asoc) goto out; /* Further lookup for INIT/INIT-ACK packets. * SCTP Implementors Guide, 2.18 Handling of address * parameters within the INIT or INIT-ACK. */ asoc = __sctp_rcv_lookup_harder(net, skb, laddr, transportp, dif, sdif); if (asoc) goto out; if (paddr->sa.sa_family == AF_INET) pr_debug("sctp: asoc not found for src:%pI4:%d dst:%pI4:%d\n", &laddr->v4.sin_addr, ntohs(laddr->v4.sin_port), &paddr->v4.sin_addr, ntohs(paddr->v4.sin_port)); else pr_debug("sctp: asoc not found for src:%pI6:%d dst:%pI6:%d\n", &laddr->v6.sin6_addr, ntohs(laddr->v6.sin6_port), &paddr->v6.sin6_addr, ntohs(paddr->v6.sin6_port)); out: return asoc; } |
| 7 7 8 8 7 8 8 6 8 8 8 202 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 | /* * This file implement the Wireless Extensions proc API. * * Authors : Jean Tourrilhes - HPL - <jt@hpl.hp.com> * Copyright (c) 1997-2007 Jean Tourrilhes, All Rights Reserved. * * (As all part of the Linux kernel, this file is GPL) */ /* * The /proc/net/wireless file is a human readable user-space interface * exporting various wireless specific statistics from the wireless devices. * This is the most popular part of the Wireless Extensions ;-) * * This interface is a pure clone of /proc/net/dev (in net/core/dev.c). * The content of the file is basically the content of "struct iw_statistics". */ #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/wireless.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <net/iw_handler.h> #include <net/wext.h> static void wireless_seq_printf_stats(struct seq_file *seq, struct net_device *dev) { /* Get stats from the driver */ struct iw_statistics *stats = get_wireless_stats(dev); static struct iw_statistics nullstats = {}; /* show device if it's wireless regardless of current stats */ if (!stats) { #ifdef CONFIG_WIRELESS_EXT if (dev->wireless_handlers) stats = &nullstats; #endif #ifdef CONFIG_CFG80211 if (dev->ieee80211_ptr) stats = &nullstats; #endif } if (stats) { seq_printf(seq, "%6s: %04x %3d%c %3d%c %3d%c %6d %6d %6d " "%6d %6d %6d\n", dev->name, stats->status, stats->qual.qual, stats->qual.updated & IW_QUAL_QUAL_UPDATED ? '.' : ' ', ((__s32) stats->qual.level) - ((stats->qual.updated & IW_QUAL_DBM) ? 0x100 : 0), stats->qual.updated & IW_QUAL_LEVEL_UPDATED ? '.' : ' ', ((__s32) stats->qual.noise) - ((stats->qual.updated & IW_QUAL_DBM) ? 0x100 : 0), stats->qual.updated & IW_QUAL_NOISE_UPDATED ? '.' : ' ', stats->discard.nwid, stats->discard.code, stats->discard.fragment, stats->discard.retries, stats->discard.misc, stats->miss.beacon); if (stats != &nullstats) stats->qual.updated &= ~IW_QUAL_ALL_UPDATED; } } /* ---------------------------------------------------------------- */ /* * Print info for /proc/net/wireless (print all entries) */ static int wireless_dev_seq_show(struct seq_file *seq, void *v) { might_sleep(); if (v == SEQ_START_TOKEN) seq_printf(seq, "Inter-| sta-| Quality | Discarded " "packets | Missed | WE\n" " face | tus | link level noise | nwid " "crypt frag retry misc | beacon | %d\n", WIRELESS_EXT); else |