Total coverage: 102746 (6%)of 1920261
35 2 2 1 2 2 7 5 2 1 2 2 22 33 11 2 5 87 28 10 51 2 7 7 8 7 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Matthias Schiffer */ #include "netlink.h" #include "main.h" #include <linux/array_size.h> #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/bug.h> #include <linux/byteorder/generic.h> #include <linux/cache.h> #include <linux/err.h> #include <linux/errno.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/init.h> #include <linux/limits.h> #include <linux/minmax.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/printk.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <linux/stddef.h> #include <linux/types.h> #include <net/genetlink.h> #include <net/net_namespace.h> #include <net/netlink.h> #include <net/sock.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bat_algo.h" #include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" #include "gateway_client.h" #include "gateway_common.h" #include "hard-interface.h" #include "log.h" #include "mesh-interface.h" #include "multicast.h" #include "originator.h" #include "tp_meter.h" #include "translation-table.h" struct genl_family batadv_netlink_family; /* multicast groups */ enum batadv_netlink_multicast_groups { BATADV_NL_MCGRP_CONFIG, BATADV_NL_MCGRP_TPMETER, }; /** * enum batadv_genl_ops_flags - flags for genl_ops's internal_flags */ enum batadv_genl_ops_flags { /** * @BATADV_FLAG_NEED_MESH: request requires valid mesh interface in * attribute BATADV_ATTR_MESH_IFINDEX and expects a pointer to it to be * saved in info->user_ptr[0] */ BATADV_FLAG_NEED_MESH = BIT(0), /** * @BATADV_FLAG_NEED_HARDIF: request requires valid hard interface in * attribute BATADV_ATTR_HARD_IFINDEX and expects a pointer to it to be * saved in info->user_ptr[1] */ BATADV_FLAG_NEED_HARDIF = BIT(1), /** * @BATADV_FLAG_NEED_VLAN: request requires valid vlan in * attribute BATADV_ATTR_VLANID and expects a pointer to it to be * saved in info->user_ptr[1] */ BATADV_FLAG_NEED_VLAN = BIT(2), }; static const struct genl_multicast_group batadv_netlink_mcgrps[] = { [BATADV_NL_MCGRP_CONFIG] = { .name = BATADV_NL_MCAST_GROUP_CONFIG }, [BATADV_NL_MCGRP_TPMETER] = { .name = BATADV_NL_MCAST_GROUP_TPMETER }, }; static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { [BATADV_ATTR_VERSION] = { .type = NLA_STRING }, [BATADV_ATTR_ALGO_NAME] = { .type = NLA_STRING }, [BATADV_ATTR_MESH_IFINDEX] = { .type = NLA_U32 }, [BATADV_ATTR_MESH_IFNAME] = { .type = NLA_STRING }, [BATADV_ATTR_MESH_ADDRESS] = { .len = ETH_ALEN }, [BATADV_ATTR_HARD_IFINDEX] = { .type = NLA_U32 }, [BATADV_ATTR_HARD_IFNAME] = { .type = NLA_STRING }, [BATADV_ATTR_HARD_ADDRESS] = { .len = ETH_ALEN }, [BATADV_ATTR_ORIG_ADDRESS] = { .len = ETH_ALEN }, [BATADV_ATTR_TPMETER_RESULT] = { .type = NLA_U8 }, [BATADV_ATTR_TPMETER_TEST_TIME] = { .type = NLA_U32 }, [BATADV_ATTR_TPMETER_BYTES] = { .type = NLA_U64 }, [BATADV_ATTR_TPMETER_COOKIE] = { .type = NLA_U32 }, [BATADV_ATTR_ACTIVE] = { .type = NLA_FLAG }, [BATADV_ATTR_TT_ADDRESS] = { .len = ETH_ALEN }, [BATADV_ATTR_TT_TTVN] = { .type = NLA_U8 }, [BATADV_ATTR_TT_LAST_TTVN] = { .type = NLA_U8 }, [BATADV_ATTR_TT_CRC32] = { .type = NLA_U32 }, [BATADV_ATTR_TT_VID] = { .type = NLA_U16 }, [BATADV_ATTR_TT_FLAGS] = { .type = NLA_U32 }, [BATADV_ATTR_FLAG_BEST] = { .type = NLA_FLAG }, [BATADV_ATTR_LAST_SEEN_MSECS] = { .type = NLA_U32 }, [BATADV_ATTR_NEIGH_ADDRESS] = { .len = ETH_ALEN }, [BATADV_ATTR_TQ] = { .type = NLA_U8 }, [BATADV_ATTR_THROUGHPUT] = { .type = NLA_U32 }, [BATADV_ATTR_BANDWIDTH_UP] = { .type = NLA_U32 }, [BATADV_ATTR_BANDWIDTH_DOWN] = { .type = NLA_U32 }, [BATADV_ATTR_ROUTER] = { .len = ETH_ALEN }, [BATADV_ATTR_BLA_OWN] = { .type = NLA_FLAG }, [BATADV_ATTR_BLA_ADDRESS] = { .len = ETH_ALEN }, [BATADV_ATTR_BLA_VID] = { .type = NLA_U16 }, [BATADV_ATTR_BLA_BACKBONE] = { .len = ETH_ALEN }, [BATADV_ATTR_BLA_CRC] = { .type = NLA_U16 }, [BATADV_ATTR_DAT_CACHE_IP4ADDRESS] = { .type = NLA_U32 }, [BATADV_ATTR_DAT_CACHE_HWADDRESS] = { .len = ETH_ALEN }, [BATADV_ATTR_DAT_CACHE_VID] = { .type = NLA_U16 }, [BATADV_ATTR_MCAST_FLAGS] = { .type = NLA_U32 }, [BATADV_ATTR_MCAST_FLAGS_PRIV] = { .type = NLA_U32 }, [BATADV_ATTR_VLANID] = { .type = NLA_U16 }, [BATADV_ATTR_AGGREGATED_OGMS_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_AP_ISOLATION_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_ISOLATION_MARK] = { .type = NLA_U32 }, [BATADV_ATTR_ISOLATION_MASK] = { .type = NLA_U32 }, [BATADV_ATTR_BONDING_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_FRAGMENTATION_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_GW_BANDWIDTH_DOWN] = { .type = NLA_U32 }, [BATADV_ATTR_GW_BANDWIDTH_UP] = { .type = NLA_U32 }, [BATADV_ATTR_GW_MODE] = { .type = NLA_U8 }, [BATADV_ATTR_GW_SEL_CLASS] = { .type = NLA_U32 }, [BATADV_ATTR_HOP_PENALTY] = { .type = NLA_U8 }, [BATADV_ATTR_LOG_LEVEL] = { .type = NLA_U32 }, [BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_MULTICAST_FANOUT] = { .type = NLA_U32 }, [BATADV_ATTR_ORIG_INTERVAL] = { .type = NLA_U32 }, [BATADV_ATTR_ELP_INTERVAL] = { .type = NLA_U32 }, [BATADV_ATTR_THROUGHPUT_OVERRIDE] = { .type = NLA_U32 }, }; /** * batadv_netlink_get_ifindex() - Extract an interface index from a message * @nlh: Message header * @attrtype: Attribute which holds an interface index * * Return: interface index, or 0. */ static int batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype) { struct nlattr *attr = nlmsg_find_attr(nlh, GENL_HDRLEN, attrtype); return (attr && nla_len(attr) == sizeof(u32)) ? nla_get_u32(attr) : 0; } /** * batadv_netlink_mesh_fill_ap_isolation() - Add ap_isolation meshif attribute * @msg: Netlink message to dump into * @bat_priv: the bat priv with all the mesh interface information * * Return: 0 on success or negative error number in case of failure */ static int batadv_netlink_mesh_fill_ap_isolation(struct sk_buff *msg, struct batadv_priv *bat_priv) { struct batadv_meshif_vlan *vlan; u8 ap_isolation; vlan = batadv_meshif_vlan_get(bat_priv, BATADV_NO_FLAGS); if (!vlan) return 0; ap_isolation = atomic_read(&vlan->ap_isolation); batadv_meshif_vlan_put(vlan); return nla_put_u8(msg, BATADV_ATTR_AP_ISOLATION_ENABLED, !!ap_isolation); } /** * batadv_netlink_set_mesh_ap_isolation() - Set ap_isolation from genl msg * @attr: parsed BATADV_ATTR_AP_ISOLATION_ENABLED attribute * @bat_priv: the bat priv with all the mesh interface information * * Return: 0 on success or negative error number in case of failure */ static int batadv_netlink_set_mesh_ap_isolation(struct nlattr *attr, struct batadv_priv *bat_priv) { struct batadv_meshif_vlan *vlan; vlan = batadv_meshif_vlan_get(bat_priv, BATADV_NO_FLAGS); if (!vlan) return -ENOENT; atomic_set(&vlan->ap_isolation, !!nla_get_u8(attr)); batadv_meshif_vlan_put(vlan); return 0; } /** * batadv_netlink_mesh_fill() - Fill message with mesh attributes * @msg: Netlink message to dump into * @bat_priv: the bat priv with all the mesh interface information * @cmd: type of message to generate * @portid: Port making netlink request * @seq: sequence number for message * @flags: Additional flags for message * * Return: 0 on success or negative error number in case of failure */ static int batadv_netlink_mesh_fill(struct sk_buff *msg, struct batadv_priv *bat_priv, enum batadv_nl_commands cmd, u32 portid, u32 seq, int flags) { struct net_device *mesh_iface = bat_priv->mesh_iface; struct batadv_hard_iface *primary_if = NULL; struct net_device *hard_iface; void *hdr; hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, flags, cmd); if (!hdr) return -ENOBUFS; if (nla_put_string(msg, BATADV_ATTR_VERSION, BATADV_SOURCE_VERSION) || nla_put_string(msg, BATADV_ATTR_ALGO_NAME, bat_priv->algo_ops->name) || nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX, mesh_iface->ifindex) || nla_put_string(msg, BATADV_ATTR_MESH_IFNAME, mesh_iface->name) || nla_put(msg, BATADV_ATTR_MESH_ADDRESS, ETH_ALEN, mesh_iface->dev_addr) || nla_put_u8(msg, BATADV_ATTR_TT_TTVN, (u8)atomic_read(&bat_priv->tt.vn))) goto nla_put_failure; #ifdef CONFIG_BATMAN_ADV_BLA if (nla_put_u16(msg, BATADV_ATTR_BLA_CRC, ntohs(bat_priv->bla.claim_dest.group))) goto nla_put_failure; #endif if (batadv_mcast_mesh_info_put(msg, bat_priv)) goto nla_put_failure; primary_if = batadv_primary_if_get_selected(bat_priv); if (primary_if && primary_if->if_status == BATADV_IF_ACTIVE) { hard_iface = primary_if->net_dev; if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, hard_iface->ifindex) || nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, hard_iface->name) || nla_put(msg, BATADV_ATTR_HARD_ADDRESS, ETH_ALEN, hard_iface->dev_addr)) goto nla_put_failure; } if (nla_put_u8(msg, BATADV_ATTR_AGGREGATED_OGMS_ENABLED, !!atomic_read(&bat_priv->aggregated_ogms))) goto nla_put_failure; if (batadv_netlink_mesh_fill_ap_isolation(msg, bat_priv)) goto nla_put_failure; if (nla_put_u32(msg, BATADV_ATTR_ISOLATION_MARK, bat_priv->isolation_mark)) goto nla_put_failure; if (nla_put_u32(msg, BATADV_ATTR_ISOLATION_MASK, bat_priv->isolation_mark_mask)) goto nla_put_failure; if (nla_put_u8(msg, BATADV_ATTR_BONDING_ENABLED, !!atomic_read(&bat_priv->bonding))) goto nla_put_failure; #ifdef CONFIG_BATMAN_ADV_BLA if (nla_put_u8(msg, BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED, !!atomic_read(&bat_priv->bridge_loop_avoidance))) goto nla_put_failure; #endif /* CONFIG_BATMAN_ADV_BLA */ #ifdef CONFIG_BATMAN_ADV_DAT if (nla_put_u8(msg, BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED, !!atomic_read(&bat_priv->distributed_arp_table))) goto nla_put_failure; #endif /* CONFIG_BATMAN_ADV_DAT */ if (nla_put_u8(msg, BATADV_ATTR_FRAGMENTATION_ENABLED, !!atomic_read(&bat_priv->fragmentation))) goto nla_put_failure; if (nla_put_u32(msg, BATADV_ATTR_GW_BANDWIDTH_DOWN, atomic_read(&bat_priv->gw.bandwidth_down))) goto nla_put_failure; if (nla_put_u32(msg, BATADV_ATTR_GW_BANDWIDTH_UP, atomic_read(&bat_priv->gw.bandwidth_up))) goto nla_put_failure; if (nla_put_u8(msg, BATADV_ATTR_GW_MODE, atomic_read(&bat_priv->gw.mode))) goto nla_put_failure; if (bat_priv->algo_ops->gw.get_best_gw_node && bat_priv->algo_ops->gw.is_eligible) { /* GW selection class is not available if the routing algorithm * in use does not implement the GW API */ if (nla_put_u32(msg, BATADV_ATTR_GW_SEL_CLASS, atomic_read(&bat_priv->gw.sel_class))) goto nla_put_failure; } if (nla_put_u8(msg, BATADV_ATTR_HOP_PENALTY, atomic_read(&bat_priv->hop_penalty))) goto nla_put_failure; #ifdef CONFIG_BATMAN_ADV_DEBUG if (nla_put_u32(msg, BATADV_ATTR_LOG_LEVEL, atomic_read(&bat_priv->log_level))) goto nla_put_failure; #endif /* CONFIG_BATMAN_ADV_DEBUG */ #ifdef CONFIG_BATMAN_ADV_MCAST if (nla_put_u8(msg, BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED, !atomic_read(&bat_priv->multicast_mode))) goto nla_put_failure; if (nla_put_u32(msg, BATADV_ATTR_MULTICAST_FANOUT, atomic_read(&bat_priv->multicast_fanout))) goto nla_put_failure; #endif /* CONFIG_BATMAN_ADV_MCAST */ if (nla_put_u32(msg, BATADV_ATTR_ORIG_INTERVAL, atomic_read(&bat_priv->orig_interval))) goto nla_put_failure; batadv_hardif_put(primary_if); genlmsg_end(msg, hdr); return 0; nla_put_failure: batadv_hardif_put(primary_if); genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /** * batadv_netlink_notify_mesh() - send meshif attributes to listener * @bat_priv: the bat priv with all the mesh interface information * * Return: 0 on success, < 0 on error */ static int batadv_netlink_notify_mesh(struct batadv_priv *bat_priv) { struct sk_buff *msg; int ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; ret = batadv_netlink_mesh_fill(msg, bat_priv, BATADV_CMD_SET_MESH, 0, 0, 0); if (ret < 0) { nlmsg_free(msg); return ret; } genlmsg_multicast_netns(&batadv_netlink_family, dev_net(bat_priv->mesh_iface), msg, 0, BATADV_NL_MCGRP_CONFIG, GFP_KERNEL); return 0; } /** * batadv_netlink_get_mesh() - Get meshif attributes * @skb: Netlink message with request data * @info: receiver information * * Return: 0 on success or negative error number in case of failure */ static int batadv_netlink_get_mesh(struct sk_buff *skb, struct genl_info *info) { struct batadv_priv *bat_priv = info->user_ptr[0]; struct sk_buff *msg; int ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; ret = batadv_netlink_mesh_fill(msg, bat_priv, BATADV_CMD_GET_MESH, info->snd_portid, info->snd_seq, 0); if (ret < 0) { nlmsg_free(msg); return ret; } ret = genlmsg_reply(msg, info); return ret; } /** * batadv_netlink_set_mesh() - Set meshif attributes * @skb: Netlink message with request data * @info: receiver information * * Return: 0 on success or negative error number in case of failure */ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info) { struct batadv_priv *bat_priv = info->user_ptr[0]; struct nlattr *attr; if (info->attrs[BATADV_ATTR_AGGREGATED_OGMS_ENABLED]) { attr = info->attrs[BATADV_ATTR_AGGREGATED_OGMS_ENABLED]; atomic_set(&bat_priv->aggregated_ogms, !!nla_get_u8(attr)); } if (info->attrs[BATADV_ATTR_AP_ISOLATION_ENABLED]) { attr = info->attrs[BATADV_ATTR_AP_ISOLATION_ENABLED]; batadv_netlink_set_mesh_ap_isolation(attr, bat_priv); } if (info->attrs[BATADV_ATTR_ISOLATION_MARK]) { attr = info->attrs[BATADV_ATTR_ISOLATION_MARK]; bat_priv->isolation_mark = nla_get_u32(attr); } if (info->attrs[BATADV_ATTR_ISOLATION_MASK]) { attr = info->attrs[BATADV_ATTR_ISOLATION_MASK]; bat_priv->isolation_mark_mask = nla_get_u32(attr); } if (info->attrs[BATADV_ATTR_BONDING_ENABLED]) { attr = info->attrs[BATADV_ATTR_BONDING_ENABLED]; atomic_set(&bat_priv->bonding, !!nla_get_u8(attr)); } #ifdef CONFIG_BATMAN_ADV_BLA if (info->attrs[BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED]) { attr = info->attrs[BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED]; atomic_set(&bat_priv->bridge_loop_avoidance, !!nla_get_u8(attr)); batadv_bla_status_update(bat_priv->mesh_iface); } #endif /* CONFIG_BATMAN_ADV_BLA */ #ifdef CONFIG_BATMAN_ADV_DAT if (info->attrs[BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED]) { attr = info->attrs[BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED]; atomic_set(&bat_priv->distributed_arp_table, !!nla_get_u8(attr)); batadv_dat_status_update(bat_priv->mesh_iface); } #endif /* CONFIG_BATMAN_ADV_DAT */ if (info->attrs[BATADV_ATTR_FRAGMENTATION_ENABLED]) { attr = info->attrs[BATADV_ATTR_FRAGMENTATION_ENABLED]; atomic_set(&bat_priv->fragmentation, !!nla_get_u8(attr)); rtnl_lock(); batadv_update_min_mtu(bat_priv->mesh_iface); rtnl_unlock(); } if (info->attrs[BATADV_ATTR_GW_BANDWIDTH_DOWN]) { attr = info->attrs[BATADV_ATTR_GW_BANDWIDTH_DOWN]; atomic_set(&bat_priv->gw.bandwidth_down, nla_get_u32(attr)); batadv_gw_tvlv_container_update(bat_priv); } if (info->attrs[BATADV_ATTR_GW_BANDWIDTH_UP]) { attr = info->attrs[BATADV_ATTR_GW_BANDWIDTH_UP]; atomic_set(&bat_priv->gw.bandwidth_up, nla_get_u32(attr)); batadv_gw_tvlv_container_update(bat_priv); } if (info->attrs[BATADV_ATTR_GW_MODE]) { u8 gw_mode; attr = info->attrs[BATADV_ATTR_GW_MODE]; gw_mode = nla_get_u8(attr); if (gw_mode <= BATADV_GW_MODE_SERVER) { /* Invoking batadv_gw_reselect() is not enough to really * de-select the current GW. It will only instruct the * gateway client code to perform a re-election the next * time that this is needed. * * When gw client mode is being switched off the current * GW must be de-selected explicitly otherwise no GW_ADD * uevent is thrown on client mode re-activation. This * is operation is performed in * batadv_gw_check_client_stop(). */ batadv_gw_reselect(bat_priv); /* always call batadv_gw_check_client_stop() before * changing the gateway state */ batadv_gw_check_client_stop(bat_priv); atomic_set(&bat_priv->gw.mode, gw_mode); batadv_gw_tvlv_container_update(bat_priv); } } if (info->attrs[BATADV_ATTR_GW_SEL_CLASS] && bat_priv->algo_ops->gw.get_best_gw_node && bat_priv->algo_ops->gw.is_eligible) { /* setting the GW selection class is allowed only if the routing * algorithm in use implements the GW API */ u32 sel_class_max = bat_priv->algo_ops->gw.sel_class_max; u32 sel_class; attr = info->attrs[BATADV_ATTR_GW_SEL_CLASS]; sel_class = nla_get_u32(attr); if (sel_class >= 1 && sel_class <= sel_class_max) { atomic_set(&bat_priv->gw.sel_class, sel_class); batadv_gw_reselect(bat_priv); } } if (info->attrs[BATADV_ATTR_HOP_PENALTY]) { attr = info->attrs[BATADV_ATTR_HOP_PENALTY]; atomic_set(&bat_priv->hop_penalty, nla_get_u8(attr)); } #ifdef CONFIG_BATMAN_ADV_DEBUG if (info->attrs[BATADV_ATTR_LOG_LEVEL]) { attr = info->attrs[BATADV_ATTR_LOG_LEVEL]; atomic_set(&bat_priv->log_level, nla_get_u32(attr) & BATADV_DBG_ALL); } #endif /* CONFIG_BATMAN_ADV_DEBUG */ #ifdef CONFIG_BATMAN_ADV_MCAST if (info->attrs[BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED]) { attr = info->attrs[BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED]; atomic_set(&bat_priv->multicast_mode, !nla_get_u8(attr)); } if (info->attrs[BATADV_ATTR_MULTICAST_FANOUT]) { attr = info->attrs[BATADV_ATTR_MULTICAST_FANOUT]; atomic_set(&bat_priv->multicast_fanout, nla_get_u32(attr)); } #endif /* CONFIG_BATMAN_ADV_MCAST */ if (info->attrs[BATADV_ATTR_ORIG_INTERVAL]) { u32 orig_interval; attr = info->attrs[BATADV_ATTR_ORIG_INTERVAL]; orig_interval = nla_get_u32(attr); orig_interval = min_t(u32, orig_interval, INT_MAX); orig_interval = max_t(u32, orig_interval, 2 * BATADV_JITTER); atomic_set(&bat_priv->orig_interval, orig_interval); } batadv_netlink_notify_mesh(bat_priv); return 0; } /** * batadv_netlink_tp_meter_put() - Fill information of started tp_meter session * @msg: netlink message to be sent back * @cookie: tp meter session cookie * * Return: 0 on success, < 0 on error */ static int batadv_netlink_tp_meter_put(struct sk_buff *msg, u32 cookie) { if (nla_put_u32(msg, BATADV_ATTR_TPMETER_COOKIE, cookie)) return -ENOBUFS; return 0; } /** * batadv_netlink_tpmeter_notify() - send tp_meter result via netlink to client * @bat_priv: the bat priv with all the mesh interface information * @dst: destination of tp_meter session * @result: reason for tp meter session stop * @test_time: total time of the tp_meter session * @total_bytes: bytes acked to the receiver * @cookie: cookie of tp_meter session * * Return: 0 on success, < 0 on error */ int batadv_netlink_tpmeter_notify(struct batadv_priv *bat_priv, const u8 *dst, u8 result, u32 test_time, u64 total_bytes, u32 cookie) { struct sk_buff *msg; void *hdr; int ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &batadv_netlink_family, 0, BATADV_CMD_TP_METER); if (!hdr) { ret = -ENOBUFS; goto err_genlmsg; } if (nla_put_u32(msg, BATADV_ATTR_TPMETER_COOKIE, cookie)) goto nla_put_failure; if (nla_put_u32(msg, BATADV_ATTR_TPMETER_TEST_TIME, test_time)) goto nla_put_failure; if (nla_put_u64_64bit(msg, BATADV_ATTR_TPMETER_BYTES, total_bytes, BATADV_ATTR_PAD)) goto nla_put_failure; if (nla_put_u8(msg, BATADV_ATTR_TPMETER_RESULT, result)) goto nla_put_failure; if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, dst)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast_netns(&batadv_netlink_family, dev_net(bat_priv->mesh_iface), msg, 0, BATADV_NL_MCGRP_TPMETER, GFP_KERNEL); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); ret = -EMSGSIZE; err_genlmsg: nlmsg_free(msg); return ret; } /** * batadv_netlink_tp_meter_start() - Start a new tp_meter session * @skb: received netlink message * @info: receiver information * * Return: 0 on success, < 0 on error */ static int batadv_netlink_tp_meter_start(struct sk_buff *skb, struct genl_info *info) { struct batadv_priv *bat_priv = info->user_ptr[0]; struct sk_buff *msg = NULL; u32 test_length; void *msg_head; u32 cookie; u8 *dst; int ret; if (!info->attrs[BATADV_ATTR_ORIG_ADDRESS]) return -EINVAL; if (!info->attrs[BATADV_ATTR_TPMETER_TEST_TIME]) return -EINVAL; dst = nla_data(info->attrs[BATADV_ATTR_ORIG_ADDRESS]); test_length = nla_get_u32(info->attrs[BATADV_ATTR_TPMETER_TEST_TIME]); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; goto out; } msg_head = genlmsg_put(msg, info->snd_portid, info->snd_seq, &batadv_netlink_family, 0, BATADV_CMD_TP_METER); if (!msg_head) { ret = -ENOBUFS; goto out; } batadv_tp_start(bat_priv, dst, test_length, &cookie); ret = batadv_netlink_tp_meter_put(msg, cookie); out: if (ret) { if (msg) nlmsg_free(msg); return ret; } genlmsg_end(msg, msg_head); return genlmsg_reply(msg, info); } /** * batadv_netlink_tp_meter_cancel() - Cancel a running tp_meter session * @skb: received netlink message * @info: receiver information * * Return: 0 on success, < 0 on error */ static int batadv_netlink_tp_meter_cancel(struct sk_buff *skb, struct genl_info *info) { struct batadv_priv *bat_priv = info->user_ptr[0]; u8 *dst; int ret = 0; if (!info->attrs[BATADV_ATTR_ORIG_ADDRESS]) return -EINVAL; dst = nla_data(info->attrs[BATADV_ATTR_ORIG_ADDRESS]); batadv_tp_stop(bat_priv, dst, BATADV_TP_REASON_CANCEL); return ret; } /** * batadv_netlink_hardif_fill() - Fill message with hardif attributes * @msg: Netlink message to dump into * @bat_priv: the bat priv with all the mesh interface information * @hard_iface: hard interface which was modified * @cmd: type of message to generate * @portid: Port making netlink request * @seq: sequence number for message * @flags: Additional flags for message * @cb: Control block containing additional options * * Return: 0 on success or negative error number in case of failure */ static int batadv_netlink_hardif_fill(struct sk_buff *msg, struct batadv_priv *bat_priv, struct batadv_hard_iface *hard_iface, enum batadv_nl_commands cmd, u32 portid, u32 seq, int flags, struct netlink_callback *cb) { struct net_device *net_dev = hard_iface->net_dev; void *hdr; hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, flags, cmd); if (!hdr) return -ENOBUFS; if (cb) genl_dump_check_consistent(cb, hdr); if (nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX, bat_priv->mesh_iface->ifindex)) goto nla_put_failure; if (nla_put_string(msg, BATADV_ATTR_MESH_IFNAME, bat_priv->mesh_iface->name)) goto nla_put_failure; if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, net_dev->ifindex) || nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, net_dev->name) || nla_put(msg, BATADV_ATTR_HARD_ADDRESS, ETH_ALEN, net_dev->dev_addr)) goto nla_put_failure; if (hard_iface->if_status == BATADV_IF_ACTIVE) { if (nla_put_flag(msg, BATADV_ATTR_ACTIVE)) goto nla_put_failure; } if (nla_put_u8(msg, BATADV_ATTR_HOP_PENALTY, atomic_read(&hard_iface->hop_penalty))) goto nla_put_failure; #ifdef CONFIG_BATMAN_ADV_BATMAN_V if (nla_put_u32(msg, BATADV_ATTR_ELP_INTERVAL, atomic_read(&hard_iface->bat_v.elp_interval))) goto nla_put_failure; if (nla_put_u32(msg, BATADV_ATTR_THROUGHPUT_OVERRIDE, atomic_read(&hard_iface->bat_v.throughput_override))) goto nla_put_failure; #endif /* CONFIG_BATMAN_ADV_BATMAN_V */ genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /** * batadv_netlink_notify_hardif() - send hardif attributes to listener * @bat_priv: the bat priv with all the mesh interface information * @hard_iface: hard interface which was modified * * Return: 0 on success, < 0 on error */ static int batadv_netlink_notify_hardif(struct batadv_priv *bat_priv, struct batadv_hard_iface *hard_iface) { struct sk_buff *msg; int ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; ret = batadv_netlink_hardif_fill(msg, bat_priv, hard_iface, BATADV_CMD_SET_HARDIF, 0, 0, 0, NULL); if (ret < 0) { nlmsg_free(msg); return ret; } genlmsg_multicast_netns(&batadv_netlink_family, dev_net(bat_priv->mesh_iface), msg, 0, BATADV_NL_MCGRP_CONFIG, GFP_KERNEL); return 0; } /** * batadv_netlink_cmd_get_hardif() - Get hardif attributes * @skb: Netlink message with request data * @info: receiver information * * Return: 0 on success or negative error number in case of failure */ static int batadv_netlink_cmd_get_hardif(struct sk_buff *skb, struct genl_info *info) { struct batadv_hard_iface *hard_iface = info->user_ptr[1]; struct batadv_priv *bat_priv = info->user_ptr[0]; struct sk_buff *msg; int ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; ret = batadv_netlink_hardif_fill(msg, bat_priv, hard_iface, BATADV_CMD_GET_HARDIF, info->snd_portid, info->snd_seq, 0, NULL); if (ret < 0) { nlmsg_free(msg); return ret; } ret = genlmsg_reply(msg, info); return ret; } /** * batadv_netlink_set_hardif() - Set hardif attributes * @skb: Netlink message with request data * @info: receiver information * * Return: 0 on success or negative error number in case of failure */ static int batadv_netlink_set_hardif(struct sk_buff *skb, struct genl_info *info) { struct batadv_hard_iface *hard_iface = info->user_ptr[1]; struct batadv_priv *bat_priv = info->user_ptr[0]; struct nlattr *attr; if (info->attrs[BATADV_ATTR_HOP_PENALTY]) { attr = info->attrs[BATADV_ATTR_HOP_PENALTY]; atomic_set(&hard_iface->hop_penalty, nla_get_u8(attr)); } #ifdef CONFIG_BATMAN_ADV_BATMAN_V if (info->attrs[BATADV_ATTR_ELP_INTERVAL]) { attr = info->attrs[BATADV_ATTR_ELP_INTERVAL]; atomic_set(&hard_iface->bat_v.elp_interval, nla_get_u32(attr)); } if (info->attrs[BATADV_ATTR_THROUGHPUT_OVERRIDE]) { attr = info->attrs[BATADV_ATTR_THROUGHPUT_OVERRIDE]; atomic_set(&hard_iface->bat_v.throughput_override, nla_get_u32(attr)); } #endif /* CONFIG_BATMAN_ADV_BATMAN_V */ batadv_netlink_notify_hardif(bat_priv, hard_iface); return 0; } /** * batadv_netlink_dump_hardif() - Dump all hard interface into a messages * @msg: Netlink message to dump into * @cb: Parameters from query * * Return: error code, or length of reply message on success */ static int batadv_netlink_dump_hardif(struct sk_buff *msg, struct netlink_callback *cb) { struct net_device *mesh_iface; struct batadv_hard_iface *hard_iface; struct batadv_priv *bat_priv; int portid = NETLINK_CB(cb->skb).portid; int skip = cb->args[0]; struct list_head *iter; int i = 0; mesh_iface = batadv_netlink_get_meshif(cb); if (IS_ERR(mesh_iface)) return PTR_ERR(mesh_iface); bat_priv = netdev_priv(mesh_iface); rtnl_lock(); cb->seq = batadv_hardif_generation << 1 | 1; netdev_for_each_lower_private(mesh_iface, hard_iface, iter) { if (i++ < skip) continue; if (batadv_netlink_hardif_fill(msg, bat_priv, hard_iface, BATADV_CMD_GET_HARDIF, portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb)) { i--; break; } } rtnl_unlock(); dev_put(mesh_iface); cb->args[0] = i; return msg->len; } /** * batadv_netlink_vlan_fill() - Fill message with vlan attributes * @msg: Netlink message to dump into * @bat_priv: the bat priv with all the mesh interface information * @vlan: vlan which was modified * @cmd: type of message to generate * @portid: Port making netlink request * @seq: sequence number for message * @flags: Additional flags for message * * Return: 0 on success or negative error number in case of failure */ static int batadv_netlink_vlan_fill(struct sk_buff *msg, struct batadv_priv *bat_priv, struct batadv_meshif_vlan *vlan, enum batadv_nl_commands cmd, u32 portid, u32 seq, int flags) { void *hdr; hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, flags, cmd); if (!hdr) return -ENOBUFS; if (nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX, bat_priv->mesh_iface->ifindex)) goto nla_put_failure; if (nla_put_string(msg, BATADV_ATTR_MESH_IFNAME, bat_priv->mesh_iface->name)) goto nla_put_failure; if (nla_put_u32(msg, BATADV_ATTR_VLANID, vlan->vid & VLAN_VID_MASK)) goto nla_put_failure; if (nla_put_u8(msg, BATADV_ATTR_AP_ISOLATION_ENABLED, !!atomic_read(&vlan->ap_isolation))) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /** * batadv_netlink_notify_vlan() - send vlan attributes to listener * @bat_priv: the bat priv with all the mesh interface information * @vlan: vlan which was modified * * Return: 0 on success, < 0 on error */ static int batadv_netlink_notify_vlan(struct batadv_priv *bat_priv, struct batadv_meshif_vlan *vlan) { struct sk_buff *msg; int ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; ret = batadv_netlink_vlan_fill(msg, bat_priv, vlan, BATADV_CMD_SET_VLAN, 0, 0, 0); if (ret < 0) { nlmsg_free(msg); return ret; } genlmsg_multicast_netns(&batadv_netlink_family, dev_net(bat_priv->mesh_iface), msg, 0, BATADV_NL_MCGRP_CONFIG, GFP_KERNEL); return 0; } /** * batadv_netlink_get_vlan() - Get vlan attributes * @skb: Netlink message with request data * @info: receiver information * * Return: 0 on success or negative error number in case of failure */ static int batadv_netlink_get_vlan(struct sk_buff *skb, struct genl_info *info) { struct batadv_meshif_vlan *vlan = info->user_ptr[1]; struct batadv_priv *bat_priv = info->user_ptr[0]; struct sk_buff *msg; int ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; ret = batadv_netlink_vlan_fill(msg, bat_priv, vlan, BATADV_CMD_GET_VLAN, info->snd_portid, info->snd_seq, 0); if (ret < 0) { nlmsg_free(msg); return ret; } ret = genlmsg_reply(msg, info); return ret; } /** * batadv_netlink_set_vlan() - Get vlan attributes * @skb: Netlink message with request data * @info: receiver information * * Return: 0 on success or negative error number in case of failure */ static int batadv_netlink_set_vlan(struct sk_buff *skb, struct genl_info *info) { struct batadv_meshif_vlan *vlan = info->user_ptr[1]; struct batadv_priv *bat_priv = info->user_ptr[0]; struct nlattr *attr; if (info->attrs[BATADV_ATTR_AP_ISOLATION_ENABLED]) { attr = info->attrs[BATADV_ATTR_AP_ISOLATION_ENABLED]; atomic_set(&vlan->ap_isolation, !!nla_get_u8(attr)); } batadv_netlink_notify_vlan(bat_priv, vlan); return 0; } /** * batadv_netlink_get_meshif_from_ifindex() - Get mesh-iface from ifindex * @net: the applicable net namespace * @ifindex: index of the mesh interface * * Return: Pointer to mesh interface (with increased refcnt) on success, error * pointer on error */ static struct net_device * batadv_netlink_get_meshif_from_ifindex(struct net *net, int ifindex) { struct net_device *mesh_iface; mesh_iface = dev_get_by_index(net, ifindex); if (!mesh_iface) return ERR_PTR(-ENODEV); if (!batadv_meshif_is_valid(mesh_iface)) goto err_put_meshif; return mesh_iface; err_put_meshif: dev_put(mesh_iface); return ERR_PTR(-EINVAL); } /** * batadv_netlink_get_meshif_from_info() - Get mesh-iface from genl attributes * @net: the applicable net namespace * @info: receiver information * * Return: Pointer to mesh interface (with increased refcnt) on success, error * pointer on error */ static struct net_device * batadv_netlink_get_meshif_from_info(struct net *net, struct genl_info *info) { int ifindex; if (!info->attrs[BATADV_ATTR_MESH_IFINDEX]) return ERR_PTR(-EINVAL); ifindex = nla_get_u32(info->attrs[BATADV_ATTR_MESH_IFINDEX]); return batadv_netlink_get_meshif_from_ifindex(net, ifindex); } /** * batadv_netlink_get_meshif() - Retrieve mesh interface from netlink callback * @cb: callback structure containing arguments * * Return: Pointer to mesh interface (with increased refcnt) on success, error * pointer on error */ struct net_device *batadv_netlink_get_meshif(struct netlink_callback *cb) { int ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX); if (!ifindex) return ERR_PTR(-ENONET); return batadv_netlink_get_meshif_from_ifindex(sock_net(cb->skb->sk), ifindex); } /** * batadv_netlink_get_hardif_from_ifindex() - Get hard-iface from ifindex * @bat_priv: the bat priv with all the mesh interface information * @net: the applicable net namespace * @ifindex: index of the hard interface * * Return: Pointer to hard interface (with increased refcnt) on success, error * pointer on error */ static struct batadv_hard_iface * batadv_netlink_get_hardif_from_ifindex(struct batadv_priv *bat_priv, struct net *net, int ifindex) { struct batadv_hard_iface *hard_iface; struct net_device *hard_dev; hard_dev = dev_get_by_index(net, ifindex); if (!hard_dev) return ERR_PTR(-ENODEV); hard_iface = batadv_hardif_get_by_netdev(hard_dev); if (!hard_iface) goto err_put_harddev; if (hard_iface->mesh_iface != bat_priv->mesh_iface) goto err_put_hardif; /* hard_dev is referenced by hard_iface and not needed here */ dev_put(hard_dev); return hard_iface; err_put_hardif: batadv_hardif_put(hard_iface); err_put_harddev: dev_put(hard_dev); return ERR_PTR(-EINVAL); } /** * batadv_netlink_get_hardif_from_info() - Get hard-iface from genl attributes * @bat_priv: the bat priv with all the mesh interface information * @net: the applicable net namespace * @info: receiver information * * Return: Pointer to hard interface (with increased refcnt) on success, error * pointer on error */ static struct batadv_hard_iface * batadv_netlink_get_hardif_from_info(struct batadv_priv *bat_priv, struct net *net, struct genl_info *info) { int ifindex; if (!info->attrs[BATADV_ATTR_HARD_IFINDEX]) return ERR_PTR(-EINVAL); ifindex = nla_get_u32(info->attrs[BATADV_ATTR_HARD_IFINDEX]); return batadv_netlink_get_hardif_from_ifindex(bat_priv, net, ifindex); } /** * batadv_netlink_get_hardif() - Retrieve hard interface from netlink callback * @bat_priv: the bat priv with all the mesh interface information * @cb: callback structure containing arguments * * Return: Pointer to hard interface (with increased refcnt) on success, error * pointer on error */ struct batadv_hard_iface * batadv_netlink_get_hardif(struct batadv_priv *bat_priv, struct netlink_callback *cb) { int ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_HARD_IFINDEX); if (!ifindex) return ERR_PTR(-ENONET); return batadv_netlink_get_hardif_from_ifindex(bat_priv, sock_net(cb->skb->sk), ifindex); } /** * batadv_get_vlan_from_info() - Retrieve vlan from genl attributes * @bat_priv: the bat priv with all the mesh interface information * @net: the applicable net namespace * @info: receiver information * * Return: Pointer to vlan on success (with increased refcnt), error pointer * on error */ static struct batadv_meshif_vlan * batadv_get_vlan_from_info(struct batadv_priv *bat_priv, struct net *net, struct genl_info *info) { struct batadv_meshif_vlan *vlan; u16 vid; if (!info->attrs[BATADV_ATTR_VLANID]) return ERR_PTR(-EINVAL); vid = nla_get_u16(info->attrs[BATADV_ATTR_VLANID]); vlan = batadv_meshif_vlan_get(bat_priv, vid | BATADV_VLAN_HAS_TAG); if (!vlan) return ERR_PTR(-ENOENT); return vlan; } /** * batadv_pre_doit() - Prepare batman-adv genl doit request * @ops: requested netlink operation * @skb: Netlink message with request data * @info: receiver information * * Return: 0 on success or negative error number in case of failure */ static int batadv_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct batadv_hard_iface *hard_iface; struct batadv_priv *bat_priv = NULL; struct batadv_meshif_vlan *vlan; struct net_device *mesh_iface; u8 user_ptr1_flags; u8 mesh_dep_flags; int ret; user_ptr1_flags = BATADV_FLAG_NEED_HARDIF | BATADV_FLAG_NEED_VLAN; if (WARN_ON(hweight8(ops->internal_flags & user_ptr1_flags) > 1)) return -EINVAL; mesh_dep_flags = BATADV_FLAG_NEED_HARDIF | BATADV_FLAG_NEED_VLAN; if (WARN_ON((ops->internal_flags & mesh_dep_flags) && (~ops->internal_flags & BATADV_FLAG_NEED_MESH))) return -EINVAL; if (ops->internal_flags & BATADV_FLAG_NEED_MESH) { mesh_iface = batadv_netlink_get_meshif_from_info(net, info); if (IS_ERR(mesh_iface)) return PTR_ERR(mesh_iface); bat_priv = netdev_priv(mesh_iface); info->user_ptr[0] = bat_priv; } if (ops->internal_flags & BATADV_FLAG_NEED_HARDIF) { hard_iface = batadv_netlink_get_hardif_from_info(bat_priv, net, info); if (IS_ERR(hard_iface)) { ret = PTR_ERR(hard_iface); goto err_put_meshif; } info->user_ptr[1] = hard_iface; } if (ops->internal_flags & BATADV_FLAG_NEED_VLAN) { vlan = batadv_get_vlan_from_info(bat_priv, net, info); if (IS_ERR(vlan)) { ret = PTR_ERR(vlan); goto err_put_meshif; } info->user_ptr[1] = vlan; } return 0; err_put_meshif: if (bat_priv) dev_put(bat_priv->mesh_iface); return ret; } /** * batadv_post_doit() - End batman-adv genl doit request * @ops: requested netlink operation * @skb: Netlink message with request data * @info: receiver information */ static void batadv_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { struct batadv_hard_iface *hard_iface; struct batadv_meshif_vlan *vlan; struct batadv_priv *bat_priv; if (ops->internal_flags & BATADV_FLAG_NEED_HARDIF && info->user_ptr[1]) { hard_iface = info->user_ptr[1]; batadv_hardif_put(hard_iface); } if (ops->internal_flags & BATADV_FLAG_NEED_VLAN && info->user_ptr[1]) { vlan = info->user_ptr[1]; batadv_meshif_vlan_put(vlan); } if (ops->internal_flags & BATADV_FLAG_NEED_MESH && info->user_ptr[0]) { bat_priv = info->user_ptr[0]; dev_put(bat_priv->mesh_iface); } } static const struct genl_small_ops batadv_netlink_ops[] = { { .cmd = BATADV_CMD_GET_MESH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, /* can be retrieved by unprivileged users */ .doit = batadv_netlink_get_mesh, .internal_flags = BATADV_FLAG_NEED_MESH, }, { .cmd = BATADV_CMD_TP_METER, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .doit = batadv_netlink_tp_meter_start, .internal_flags = BATADV_FLAG_NEED_MESH, }, { .cmd = BATADV_CMD_TP_METER_CANCEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .doit = batadv_netlink_tp_meter_cancel, .internal_flags = BATADV_FLAG_NEED_MESH, }, { .cmd = BATADV_CMD_GET_ROUTING_ALGOS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .dumpit = batadv_algo_dump, }, { .cmd = BATADV_CMD_GET_HARDIF, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, /* can be retrieved by unprivileged users */ .dumpit = batadv_netlink_dump_hardif, .doit = batadv_netlink_cmd_get_hardif, .internal_flags = BATADV_FLAG_NEED_MESH | BATADV_FLAG_NEED_HARDIF, }, { .cmd = BATADV_CMD_GET_TRANSTABLE_LOCAL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .dumpit = batadv_tt_local_dump, }, { .cmd = BATADV_CMD_GET_TRANSTABLE_GLOBAL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .dumpit = batadv_tt_global_dump, }, { .cmd = BATADV_CMD_GET_ORIGINATORS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .dumpit = batadv_orig_dump, }, { .cmd = BATADV_CMD_GET_NEIGHBORS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .dumpit = batadv_hardif_neigh_dump, }, { .cmd = BATADV_CMD_GET_GATEWAYS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .dumpit = batadv_gw_dump, }, { .cmd = BATADV_CMD_GET_BLA_CLAIM, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .dumpit = batadv_bla_claim_dump, }, { .cmd = BATADV_CMD_GET_BLA_BACKBONE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .dumpit = batadv_bla_backbone_dump, }, { .cmd = BATADV_CMD_GET_DAT_CACHE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .dumpit = batadv_dat_cache_dump, }, { .cmd = BATADV_CMD_GET_MCAST_FLAGS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .dumpit = batadv_mcast_flags_dump, }, { .cmd = BATADV_CMD_SET_MESH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .doit = batadv_netlink_set_mesh, .internal_flags = BATADV_FLAG_NEED_MESH, }, { .cmd = BATADV_CMD_SET_HARDIF, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .doit = batadv_netlink_set_hardif, .internal_flags = BATADV_FLAG_NEED_MESH | BATADV_FLAG_NEED_HARDIF, }, { .cmd = BATADV_CMD_GET_VLAN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, /* can be retrieved by unprivileged users */ .doit = batadv_netlink_get_vlan, .internal_flags = BATADV_FLAG_NEED_MESH | BATADV_FLAG_NEED_VLAN, }, { .cmd = BATADV_CMD_SET_VLAN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .doit = batadv_netlink_set_vlan, .internal_flags = BATADV_FLAG_NEED_MESH | BATADV_FLAG_NEED_VLAN, }, }; struct genl_family batadv_netlink_family __ro_after_init = { .hdrsize = 0, .name = BATADV_NL_NAME, .version = 1, .maxattr = BATADV_ATTR_MAX, .policy = batadv_netlink_policy, .netnsok = true, .pre_doit = batadv_pre_doit, .post_doit = batadv_post_doit, .module = THIS_MODULE, .small_ops = batadv_netlink_ops, .n_small_ops = ARRAY_SIZE(batadv_netlink_ops), .resv_start_op = BATADV_CMD_SET_VLAN + 1, .mcgrps = batadv_netlink_mcgrps, .n_mcgrps = ARRAY_SIZE(batadv_netlink_mcgrps), }; /** * batadv_netlink_register() - register batadv genl netlink family */ void __init batadv_netlink_register(void) { int ret; ret = genl_register_family(&batadv_netlink_family); if (ret) pr_warn("unable to register netlink family\n"); } /** * batadv_netlink_unregister() - unregister batadv genl netlink family */ void batadv_netlink_unregister(void) { genl_unregister_family(&batadv_netlink_family); }
3 7 7 2 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 /* * net/tipc/monitor.c * * Copyright (c) 2016, Ericsson AB * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <net/genetlink.h> #include "core.h" #include "addr.h" #include "monitor.h" #include "bearer.h" #define MAX_MON_DOMAIN 64 #define MON_TIMEOUT 120000 #define MAX_PEER_DOWN_EVENTS 4 /* struct tipc_mon_domain: domain record to be transferred between peers * @len: actual size of domain record * @gen: current generation of sender's domain * @ack_gen: most recent generation of self's domain acked by peer * @member_cnt: number of domain member nodes described in this record * @up_map: bit map indicating which of the members the sender considers up * @members: identity of the domain members */ struct tipc_mon_domain { u16 len; u16 gen; u16 ack_gen; u16 member_cnt; u64 up_map; u32 members[MAX_MON_DOMAIN]; }; /* struct tipc_peer: state of a peer node and its domain * @addr: tipc node identity of peer * @head_map: shows which other nodes currently consider peer 'up' * @domain: most recent domain record from peer * @hash: position in hashed lookup list * @list: position in linked list, in circular ascending order by 'addr' * @applied: number of reported domain members applied on this monitor list * @is_up: peer is up as seen from this node * @is_head: peer is assigned domain head as seen from this node * @is_local: peer is in local domain and should be continuously monitored * @down_cnt: - numbers of other peers which have reported this on lost */ struct tipc_peer { u32 addr; struct tipc_mon_domain *domain; struct hlist_node hash; struct list_head list; u8 applied; u8 down_cnt; bool is_up; bool is_head; bool is_local; }; struct tipc_monitor { struct hlist_head peers[NODE_HTABLE_SIZE]; int peer_cnt; struct tipc_peer *self; rwlock_t lock; struct tipc_mon_domain cache; u16 list_gen; u16 dom_gen; struct net *net; struct timer_list timer; unsigned long timer_intv; }; static struct tipc_monitor *tipc_monitor(struct net *net, int bearer_id) { return tipc_net(net)->monitors[bearer_id]; } const int tipc_max_domain_size = sizeof(struct tipc_mon_domain); static inline u16 mon_cpu_to_le16(u16 val) { return (__force __u16)htons(val); } static inline u32 mon_cpu_to_le32(u32 val) { return (__force __u32)htonl(val); } static inline u64 mon_cpu_to_le64(u64 val) { return (__force __u64)cpu_to_be64(val); } static inline u16 mon_le16_to_cpu(u16 val) { return ntohs((__force __be16)val); } static inline u32 mon_le32_to_cpu(u32 val) { return ntohl((__force __be32)val); } static inline u64 mon_le64_to_cpu(u64 val) { return be64_to_cpu((__force __be64)val); } /* dom_rec_len(): actual length of domain record for transport */ static int dom_rec_len(struct tipc_mon_domain *dom, u16 mcnt) { return (offsetof(struct tipc_mon_domain, members)) + (mcnt * sizeof(u32)); } /* dom_size() : calculate size of own domain based on number of peers */ static int dom_size(int peers) { int i = 0; while ((i * i) < peers) i++; return min(i, MAX_MON_DOMAIN); } static void map_set(u64 *up_map, int i, unsigned int v) { *up_map &= ~(1ULL << i); *up_map |= ((u64)v << i); } static int map_get(u64 up_map, int i) { return (up_map & (1ULL << i)) >> i; } static struct tipc_peer *peer_prev(struct tipc_peer *peer) { return list_last_entry(&peer->list, struct tipc_peer, list); } static struct tipc_peer *peer_nxt(struct tipc_peer *peer) { return list_first_entry(&peer->list, struct tipc_peer, list); } static struct tipc_peer *peer_head(struct tipc_peer *peer) { while (!peer->is_head) peer = peer_prev(peer); return peer; } static struct tipc_peer *get_peer(struct tipc_monitor *mon, u32 addr) { struct tipc_peer *peer; unsigned int thash = tipc_hashfn(addr); hlist_for_each_entry(peer, &mon->peers[thash], hash) { if (peer->addr == addr) return peer; } return NULL; } static struct tipc_peer *get_self(struct net *net, int bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); return mon->self; } static inline bool tipc_mon_is_active(struct net *net, struct tipc_monitor *mon) { struct tipc_net *tn = tipc_net(net); return mon->peer_cnt > tn->mon_threshold; } /* mon_identify_lost_members() : - identify amd mark potentially lost members */ static void mon_identify_lost_members(struct tipc_peer *peer, struct tipc_mon_domain *dom_bef, int applied_bef) { struct tipc_peer *member = peer; struct tipc_mon_domain *dom_aft = peer->domain; int applied_aft = peer->applied; int i; for (i = 0; i < applied_bef; i++) { member = peer_nxt(member); /* Do nothing if self or peer already see member as down */ if (!member->is_up || !map_get(dom_bef->up_map, i)) continue; /* Loss of local node must be detected by active probing */ if (member->is_local) continue; /* Start probing if member was removed from applied domain */ if (!applied_aft || (applied_aft < i)) { member->down_cnt = 1; continue; } /* Member loss is confirmed if it is still in applied domain */ if (!map_get(dom_aft->up_map, i)) member->down_cnt++; } } /* mon_apply_domain() : match a peer's domain record against monitor list */ static void mon_apply_domain(struct tipc_monitor *mon, struct tipc_peer *peer) { struct tipc_mon_domain *dom = peer->domain; struct tipc_peer *member; u32 addr; int i; if (!dom || !peer->is_up) return; /* Scan across domain members and match against monitor list */ peer->applied = 0; member = peer_nxt(peer); for (i = 0; i < dom->member_cnt; i++) { addr = dom->members[i]; if (addr != member->addr) return; peer->applied++; member = peer_nxt(member); } } /* mon_update_local_domain() : update after peer addition/removal/up/down */ static void mon_update_local_domain(struct tipc_monitor *mon) { struct tipc_peer *self = mon->self; struct tipc_mon_domain *cache = &mon->cache; struct tipc_mon_domain *dom = self->domain; struct tipc_peer *peer = self; u64 prev_up_map = dom->up_map; u16 member_cnt, i; bool diff; /* Update local domain size based on current size of cluster */ member_cnt = dom_size(mon->peer_cnt) - 1; self->applied = member_cnt; /* Update native and cached outgoing local domain records */ dom->len = dom_rec_len(dom, member_cnt); diff = dom->member_cnt != member_cnt; dom->member_cnt = member_cnt; for (i = 0; i < member_cnt; i++) { peer = peer_nxt(peer); diff |= dom->members[i] != peer->addr; dom->members[i] = peer->addr; map_set(&dom->up_map, i, peer->is_up); cache->members[i] = mon_cpu_to_le32(peer->addr); } diff |= dom->up_map != prev_up_map; if (!diff) return; dom->gen = ++mon->dom_gen; cache->len = mon_cpu_to_le16(dom->len); cache->gen = mon_cpu_to_le16(dom->gen); cache->member_cnt = mon_cpu_to_le16(member_cnt); cache->up_map = mon_cpu_to_le64(dom->up_map); mon_apply_domain(mon, self); } /* mon_update_neighbors() : update preceding neighbors of added/removed peer */ static void mon_update_neighbors(struct tipc_monitor *mon, struct tipc_peer *peer) { int dz, i; dz = dom_size(mon->peer_cnt); for (i = 0; i < dz; i++) { mon_apply_domain(mon, peer); peer = peer_prev(peer); } } /* mon_assign_roles() : reassign peer roles after a network change * The monitor list is consistent at this stage; i.e., each peer is monitoring * a set of domain members as matched between domain record and the monitor list */ static void mon_assign_roles(struct tipc_monitor *mon, struct tipc_peer *head) { struct tipc_peer *peer = peer_nxt(head); struct tipc_peer *self = mon->self; int i = 0; for (; peer != self; peer = peer_nxt(peer)) { peer->is_local = false; /* Update domain member */ if (i++ < head->applied) { peer->is_head = false; if (head == self) peer->is_local = true; continue; } /* Assign next domain head */ if (!peer->is_up) continue; if (peer->is_head) break; head = peer; head->is_head = true; i = 0; } mon->list_gen++; } void tipc_mon_remove_peer(struct net *net, u32 addr, int bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_peer *self; struct tipc_peer *peer, *prev, *head; if (!mon) return; self = get_self(net, bearer_id); write_lock_bh(&mon->lock); peer = get_peer(mon, addr); if (!peer) goto exit; prev = peer_prev(peer); list_del(&peer->list); hlist_del(&peer->hash); kfree(peer->domain); kfree(peer); mon->peer_cnt--; head = peer_head(prev); if (head == self) mon_update_local_domain(mon); mon_update_neighbors(mon, prev); /* Revert to full-mesh monitoring if we reach threshold */ if (!tipc_mon_is_active(net, mon)) { list_for_each_entry(peer, &self->list, list) { kfree(peer->domain); peer->domain = NULL; peer->applied = 0; } } mon_assign_roles(mon, head); exit: write_unlock_bh(&mon->lock); } static bool tipc_mon_add_peer(struct tipc_monitor *mon, u32 addr, struct tipc_peer **peer) { struct tipc_peer *self = mon->self; struct tipc_peer *cur, *prev, *p; p = kzalloc_obj(*p, GFP_ATOMIC); *peer = p; if (!p) return false; p->addr = addr; /* Add new peer to lookup list */ INIT_LIST_HEAD(&p->list); hlist_add_head(&p->hash, &mon->peers[tipc_hashfn(addr)]); /* Sort new peer into iterator list, in ascending circular order */ prev = self; list_for_each_entry(cur, &self->list, list) { if ((addr > prev->addr) && (addr < cur->addr)) break; if (((addr < cur->addr) || (addr > prev->addr)) && (prev->addr > cur->addr)) break; prev = cur; } list_add_tail(&p->list, &cur->list); mon->peer_cnt++; mon_update_neighbors(mon, p); return true; } void tipc_mon_peer_up(struct net *net, u32 addr, int bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_peer *self = get_self(net, bearer_id); struct tipc_peer *peer, *head; write_lock_bh(&mon->lock); peer = get_peer(mon, addr); if (!peer && !tipc_mon_add_peer(mon, addr, &peer)) goto exit; peer->is_up = true; head = peer_head(peer); if (head == self) mon_update_local_domain(mon); mon_assign_roles(mon, head); exit: write_unlock_bh(&mon->lock); } void tipc_mon_peer_down(struct net *net, u32 addr, int bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_peer *self; struct tipc_peer *peer, *head; struct tipc_mon_domain *dom; int applied; if (!mon) return; self = get_self(net, bearer_id); write_lock_bh(&mon->lock); peer = get_peer(mon, addr); if (!peer) { pr_warn("Mon: unknown link %x/%u DOWN\n", addr, bearer_id); goto exit; } applied = peer->applied; peer->applied = 0; dom = peer->domain; peer->domain = NULL; if (peer->is_head) mon_identify_lost_members(peer, dom, applied); kfree(dom); peer->is_up = false; peer->is_head = false; peer->is_local = false; peer->down_cnt = 0; head = peer_head(peer); if (head == self) mon_update_local_domain(mon); mon_assign_roles(mon, head); exit: write_unlock_bh(&mon->lock); } /* tipc_mon_rcv - process monitor domain event message */ void tipc_mon_rcv(struct net *net, void *data, u16 dlen, u32 addr, struct tipc_mon_state *state, int bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_mon_domain *arrv_dom = data; struct tipc_mon_domain dom_bef; struct tipc_mon_domain *dom; struct tipc_peer *peer; u16 new_member_cnt = mon_le16_to_cpu(arrv_dom->member_cnt); int new_dlen = dom_rec_len(arrv_dom, new_member_cnt); u16 new_gen = mon_le16_to_cpu(arrv_dom->gen); u16 acked_gen = mon_le16_to_cpu(arrv_dom->ack_gen); u16 arrv_dlen = mon_le16_to_cpu(arrv_dom->len); bool probing = state->probing; int i, applied_bef; state->probing = false; /* Sanity check received domain record */ if (new_member_cnt > MAX_MON_DOMAIN) return; if (dlen < dom_rec_len(arrv_dom, 0)) return; if (dlen != dom_rec_len(arrv_dom, new_member_cnt)) return; if (dlen < new_dlen || arrv_dlen != new_dlen) return; /* Synch generation numbers with peer if link just came up */ if (!state->synched) { state->peer_gen = new_gen - 1; state->acked_gen = acked_gen; state->synched = true; } if (more(acked_gen, state->acked_gen)) state->acked_gen = acked_gen; /* Drop duplicate unless we are waiting for a probe response */ if (!more(new_gen, state->peer_gen) && !probing) return; write_lock_bh(&mon->lock); peer = get_peer(mon, addr); if (!peer || !peer->is_up) goto exit; /* Peer is confirmed, stop any ongoing probing */ peer->down_cnt = 0; /* Task is done for duplicate record */ if (!more(new_gen, state->peer_gen)) goto exit; state->peer_gen = new_gen; /* Cache current domain record for later use */ dom_bef.member_cnt = 0; dom = peer->domain; if (dom) memcpy(&dom_bef, dom, dom->len); /* Transform and store received domain record */ if (!dom || (dom->len < new_dlen)) { kfree(dom); dom = kmalloc(new_dlen, GFP_ATOMIC); peer->domain = dom; if (!dom) goto exit; } dom->len = new_dlen; dom->gen = new_gen; dom->member_cnt = new_member_cnt; dom->up_map = mon_le64_to_cpu(arrv_dom->up_map); for (i = 0; i < new_member_cnt; i++) dom->members[i] = mon_le32_to_cpu(arrv_dom->members[i]); /* Update peers affected by this domain record */ applied_bef = peer->applied; mon_apply_domain(mon, peer); mon_identify_lost_members(peer, &dom_bef, applied_bef); mon_assign_roles(mon, peer_head(peer)); exit: write_unlock_bh(&mon->lock); } void tipc_mon_prep(struct net *net, void *data, int *dlen, struct tipc_mon_state *state, int bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_mon_domain *dom = data; u16 gen = mon->dom_gen; u16 len; /* Send invalid record if not active */ if (!tipc_mon_is_active(net, mon)) { dom->len = 0; return; } /* Send only a dummy record with ack if peer has acked our last sent */ if (likely(state->acked_gen == gen)) { len = dom_rec_len(dom, 0); *dlen = len; dom->len = mon_cpu_to_le16(len); dom->gen = mon_cpu_to_le16(gen); dom->ack_gen = mon_cpu_to_le16(state->peer_gen); dom->member_cnt = 0; return; } /* Send the full record */ read_lock_bh(&mon->lock); len = mon_le16_to_cpu(mon->cache.len); *dlen = len; memcpy(data, &mon->cache, len); read_unlock_bh(&mon->lock); dom->ack_gen = mon_cpu_to_le16(state->peer_gen); } void tipc_mon_get_state(struct net *net, u32 addr, struct tipc_mon_state *state, int bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_peer *peer; if (!tipc_mon_is_active(net, mon)) { state->probing = false; state->monitoring = true; return; } /* Used cached state if table has not changed */ if (!state->probing && (state->list_gen == mon->list_gen) && (state->acked_gen == mon->dom_gen)) return; read_lock_bh(&mon->lock); peer = get_peer(mon, addr); if (peer) { state->probing = state->acked_gen != mon->dom_gen; state->probing |= peer->down_cnt; state->reset |= peer->down_cnt >= MAX_PEER_DOWN_EVENTS; state->monitoring = peer->is_local; state->monitoring |= peer->is_head; state->list_gen = mon->list_gen; } read_unlock_bh(&mon->lock); } static void mon_timeout(struct timer_list *t) { struct tipc_monitor *mon = timer_container_of(mon, t, timer); struct tipc_peer *self; int best_member_cnt = dom_size(mon->peer_cnt) - 1; write_lock_bh(&mon->lock); self = mon->self; if (self && (best_member_cnt != self->applied)) { mon_update_local_domain(mon); mon_assign_roles(mon, self); } write_unlock_bh(&mon->lock); mod_timer(&mon->timer, jiffies + mon->timer_intv); } int tipc_mon_create(struct net *net, int bearer_id) { struct tipc_net *tn = tipc_net(net); struct tipc_monitor *mon; struct tipc_peer *self; struct tipc_mon_domain *dom; if (tn->monitors[bearer_id]) return 0; mon = kzalloc_obj(*mon, GFP_ATOMIC); self = kzalloc_obj(*self, GFP_ATOMIC); dom = kzalloc_obj(*dom, GFP_ATOMIC); if (!mon || !self || !dom) { kfree(mon); kfree(self); kfree(dom); return -ENOMEM; } tn->monitors[bearer_id] = mon; rwlock_init(&mon->lock); mon->net = net; mon->peer_cnt = 1; mon->self = self; self->domain = dom; self->addr = tipc_own_addr(net); self->is_up = true; self->is_head = true; INIT_LIST_HEAD(&self->list); timer_setup(&mon->timer, mon_timeout, 0); mon->timer_intv = msecs_to_jiffies(MON_TIMEOUT + (tn->random & 0xffff)); mod_timer(&mon->timer, jiffies + mon->timer_intv); return 0; } void tipc_mon_delete(struct net *net, int bearer_id) { struct tipc_net *tn = tipc_net(net); struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_peer *self; struct tipc_peer *peer, *tmp; if (!mon) return; self = get_self(net, bearer_id); write_lock_bh(&mon->lock); tn->monitors[bearer_id] = NULL; list_for_each_entry_safe(peer, tmp, &self->list, list) { list_del(&peer->list); hlist_del(&peer->hash); kfree(peer->domain); kfree(peer); } mon->self = NULL; write_unlock_bh(&mon->lock); timer_shutdown_sync(&mon->timer); kfree(self->domain); kfree(self); kfree(mon); } void tipc_mon_reinit_self(struct net *net) { struct tipc_monitor *mon; int bearer_id; for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) { mon = tipc_monitor(net, bearer_id); if (!mon) continue; write_lock_bh(&mon->lock); if (mon->self) mon->self->addr = tipc_own_addr(net); write_unlock_bh(&mon->lock); } } int tipc_nl_monitor_set_threshold(struct net *net, u32 cluster_size) { struct tipc_net *tn = tipc_net(net); if (cluster_size > TIPC_CLUSTER_SIZE) return -EINVAL; tn->mon_threshold = cluster_size; return 0; } int tipc_nl_monitor_get_threshold(struct net *net) { struct tipc_net *tn = tipc_net(net); return tn->mon_threshold; } static int __tipc_nl_add_monitor_peer(struct tipc_peer *peer, struct tipc_nl_msg *msg) { struct tipc_mon_domain *dom = peer->domain; struct nlattr *attrs; void *hdr; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_MON_PEER_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_MON_PEER); if (!attrs) goto msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_MON_PEER_ADDR, peer->addr)) goto attr_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_MON_PEER_APPLIED, peer->applied)) goto attr_msg_full; if (peer->is_up) if (nla_put_flag(msg->skb, TIPC_NLA_MON_PEER_UP)) goto attr_msg_full; if (peer->is_local) if (nla_put_flag(msg->skb, TIPC_NLA_MON_PEER_LOCAL)) goto attr_msg_full; if (peer->is_head) if (nla_put_flag(msg->skb, TIPC_NLA_MON_PEER_HEAD)) goto attr_msg_full; if (dom) { if (nla_put_u32(msg->skb, TIPC_NLA_MON_PEER_DOMGEN, dom->gen)) goto attr_msg_full; if (nla_put_u64_64bit(msg->skb, TIPC_NLA_MON_PEER_UPMAP, dom->up_map, TIPC_NLA_MON_PEER_PAD)) goto attr_msg_full; if (nla_put(msg->skb, TIPC_NLA_MON_PEER_MEMBERS, dom->member_cnt * sizeof(u32), &dom->members)) goto attr_msg_full; } nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); return 0; attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } int tipc_nl_add_monitor_peer(struct net *net, struct tipc_nl_msg *msg, u32 bearer_id, u32 *prev_node) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_peer *peer; if (!mon) return -EINVAL; read_lock_bh(&mon->lock); peer = mon->self; do { if (*prev_node) { if (peer->addr == *prev_node) *prev_node = 0; else continue; } if (__tipc_nl_add_monitor_peer(peer, msg)) { *prev_node = peer->addr; read_unlock_bh(&mon->lock); return -EMSGSIZE; } } while ((peer = peer_nxt(peer)) != mon->self); read_unlock_bh(&mon->lock); return 0; } int __tipc_nl_add_monitor(struct net *net, struct tipc_nl_msg *msg, u32 bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); char bearer_name[TIPC_MAX_BEARER_NAME]; struct nlattr *attrs; void *hdr; int ret; ret = tipc_bearer_get_name(net, bearer_name, bearer_id); if (ret || !mon) return 0; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_MON_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_MON); if (!attrs) goto msg_full; read_lock_bh(&mon->lock); if (nla_put_u32(msg->skb, TIPC_NLA_MON_REF, bearer_id)) goto attr_msg_full; if (tipc_mon_is_active(net, mon)) if (nla_put_flag(msg->skb, TIPC_NLA_MON_ACTIVE)) goto attr_msg_full; if (nla_put_string(msg->skb, TIPC_NLA_MON_BEARER_NAME, bearer_name)) goto attr_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_MON_PEERCNT, mon->peer_cnt)) goto attr_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_MON_LISTGEN, mon->list_gen)) goto attr_msg_full; read_unlock_bh(&mon->lock); nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); return 0; attr_msg_full: read_unlock_bh(&mon->lock); nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; }
130 3856 1375 544 542 1071 27 391 12 12 380 3224 593 593 3175 155 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_NS_COMMON_H #define _LINUX_NS_COMMON_H #include <linux/ns/ns_common_types.h> #include <linux/refcount.h> #include <linux/vfsdebug.h> #include <uapi/linux/sched.h> #include <uapi/linux/nsfs.h> bool is_current_namespace(struct ns_common *ns); int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum); void __ns_common_free(struct ns_common *ns); struct ns_common *__must_check ns_owner(struct ns_common *ns); static __always_inline bool is_ns_init_inum(const struct ns_common *ns) { VFS_WARN_ON_ONCE(ns->inum == 0); return unlikely(in_range(ns->inum, MNT_NS_INIT_INO, IPC_NS_INIT_INO - MNT_NS_INIT_INO + 1)); } static __always_inline bool is_ns_init_id(const struct ns_common *ns) { VFS_WARN_ON_ONCE(ns->ns_id == 0); return ns->ns_id <= NS_LAST_INIT_ID; } #define NS_COMMON_INIT(nsname) \ { \ .ns_type = ns_common_type(&nsname), \ .ns_id = ns_init_id(&nsname), \ .inum = ns_init_inum(&nsname), \ .ops = to_ns_operations(&nsname), \ .stashed = NULL, \ .__ns_ref = REFCOUNT_INIT(1), \ .__ns_ref_active = ATOMIC_INIT(1), \ .ns_unified_node.ns_list_entry = LIST_HEAD_INIT(nsname.ns.ns_unified_node.ns_list_entry), \ .ns_tree_node.ns_list_entry = LIST_HEAD_INIT(nsname.ns.ns_tree_node.ns_list_entry), \ .ns_owner_node.ns_list_entry = LIST_HEAD_INIT(nsname.ns.ns_owner_node.ns_list_entry), \ .ns_owner_root.ns_list_head = LIST_HEAD_INIT(nsname.ns.ns_owner_root.ns_list_head), \ } #define ns_common_init(__ns) \ __ns_common_init(to_ns_common(__ns), \ ns_common_type(__ns), \ to_ns_operations(__ns), \ (((__ns) == ns_init_ns(__ns)) ? ns_init_inum(__ns) : 0)) #define ns_common_init_inum(__ns, __inum) \ __ns_common_init(to_ns_common(__ns), \ ns_common_type(__ns), \ to_ns_operations(__ns), \ __inum) #define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns))) bool may_see_all_namespaces(void); static __always_inline __must_check int __ns_ref_active_read(const struct ns_common *ns) { return atomic_read(&ns->__ns_ref_active); } static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns) { return refcount_read(&ns->__ns_ref); } static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns) { if (is_ns_init_id(ns)) { VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1); VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1); return false; } if (refcount_dec_and_test(&ns->__ns_ref)) { VFS_WARN_ON_ONCE(__ns_ref_active_read(ns)); return true; } return false; } static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns) { if (is_ns_init_id(ns)) { VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1); VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1); return true; } if (refcount_inc_not_zero(&ns->__ns_ref)) return true; VFS_WARN_ON_ONCE(__ns_ref_active_read(ns)); return false; } static __always_inline void __ns_ref_inc(struct ns_common *ns) { if (is_ns_init_id(ns)) { VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1); VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1); return; } refcount_inc(&ns->__ns_ref); } static __always_inline __must_check bool __ns_ref_dec_and_lock(struct ns_common *ns, spinlock_t *ns_lock) { if (is_ns_init_id(ns)) { VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1); VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1); return false; } return refcount_dec_and_lock(&ns->__ns_ref, ns_lock); } #define ns_ref_read(__ns) __ns_ref_read(to_ns_common((__ns))) #define ns_ref_inc(__ns) \ do { if (__ns) __ns_ref_inc(to_ns_common((__ns))); } while (0) #define ns_ref_get(__ns) \ ((__ns) ? __ns_ref_get(to_ns_common((__ns))) : false) #define ns_ref_put(__ns) \ ((__ns) ? __ns_ref_put(to_ns_common((__ns))) : false) #define ns_ref_put_and_lock(__ns, __ns_lock) \ ((__ns) ? __ns_ref_dec_and_lock(to_ns_common((__ns)), __ns_lock) : false) #define ns_ref_active_read(__ns) \ ((__ns) ? __ns_ref_active_read(to_ns_common(__ns)) : 0) void __ns_ref_active_put(struct ns_common *ns); #define ns_ref_active_put(__ns) \ do { if (__ns) __ns_ref_active_put(to_ns_common(__ns)); } while (0) static __always_inline struct ns_common *__must_check ns_get_unless_inactive(struct ns_common *ns) { if (!__ns_ref_active_read(ns)) { VFS_WARN_ON_ONCE(is_ns_init_id(ns)); return NULL; } if (!__ns_ref_get(ns)) return NULL; return ns; } void __ns_ref_active_get(struct ns_common *ns); #define ns_ref_active_get(__ns) \ do { if (__ns) __ns_ref_active_get(to_ns_common(__ns)); } while (0) #endif
64 52 35 56 28 65 64 70 70 34 34 79 79 79 78 61 61 61 43 18 61 33 33 18 4 12 6 6 13 4 17 17 16 14 2 12 4 2 2 4 4 4 6 1 244 244 243 60 254 243 269 267 13 263 32 21 227 80 183 250 18 198 5 8 27 145 20 121 7 7 22 1 13 3 5 7 4 2 4 6 6 6 2 13 130 22 12 24 135 67 2 65 9 9 1172 68 9 67 11 6 11 20 107 19 24 74 175 16 18 27 81 30 41 33 6 9 7 23 3 49 7 7 41 4 2 4 6 4 2 6 17 20 23 3 11 2 13 5 11 5 2 4 18 9 7 10 3 3 5 126 2 14 15 12 21 16 3 2 8 46 2 4 3 5 23 25 2 20 23 50 6 6 9 8 9 22 12 10 9 9 18 9 10 9 8 2 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */ #include <linux/bpf.h> #include <linux/btf.h> #include <linux/bpf-cgroup.h> #include <linux/cgroup.h> #include <linux/rcupdate.h> #include <linux/random.h> #include <linux/smp.h> #include <linux/topology.h> #include <linux/ktime.h> #include <linux/sched.h> #include <linux/uidgid.h> #include <linux/filter.h> #include <linux/ctype.h> #include <linux/jiffies.h> #include <linux/pid_namespace.h> #include <linux/poison.h> #include <linux/proc_ns.h> #include <linux/sched/task.h> #include <linux/security.h> #include <linux/btf_ids.h> #include <linux/bpf_mem_alloc.h> #include <linux/kasan.h> #include <linux/bpf_verifier.h> #include <linux/uaccess.h> #include <linux/verification.h> #include <linux/task_work.h> #include <linux/irq_work.h> #include <linux/buildid.h> #include "../../lib/kstrtox.h" /* If kernel subsystem is allowing eBPF programs to call this function, * inside its own verifier_ops->get_func_proto() callback it should return * bpf_map_lookup_elem_proto, so that verifier can properly check the arguments * * Different map implementations will rely on rcu in map methods * lookup/update/delete, therefore eBPF programs must run under rcu lock * if program is allowed to access maps, so check rcu_read_lock_held() or * rcu_read_lock_trace_held() in all three functions. */ BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key) { WARN_ON_ONCE(!bpf_rcu_lock_held()); return (unsigned long) map->ops->map_lookup_elem(map, key); } const struct bpf_func_proto bpf_map_lookup_elem_proto = { .func = bpf_map_lookup_elem, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, }; BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, void *, value, u64, flags) { WARN_ON_ONCE(!bpf_rcu_lock_held()); return map->ops->map_update_elem(map, key, value, flags); } const struct bpf_func_proto bpf_map_update_elem_proto = { .func = bpf_map_update_elem, .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, .arg3_type = ARG_PTR_TO_MAP_VALUE, .arg4_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key) { WARN_ON_ONCE(!bpf_rcu_lock_held()); return map->ops->map_delete_elem(map, key); } const struct bpf_func_proto bpf_map_delete_elem_proto = { .func = bpf_map_delete_elem, .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, }; BPF_CALL_3(bpf_map_push_elem, struct bpf_map *, map, void *, value, u64, flags) { return map->ops->map_push_elem(map, value, flags); } const struct bpf_func_proto bpf_map_push_elem_proto = { .func = bpf_map_push_elem, .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_VALUE, .arg3_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_map_pop_elem, struct bpf_map *, map, void *, value) { return map->ops->map_pop_elem(map, value); } const struct bpf_func_proto bpf_map_pop_elem_proto = { .func = bpf_map_pop_elem, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE, }; BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value) { return map->ops->map_peek_elem(map, value); } const struct bpf_func_proto bpf_map_peek_elem_proto = { .func = bpf_map_peek_elem, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE, }; BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu) { WARN_ON_ONCE(!bpf_rcu_lock_held()); return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu); } const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto = { .func = bpf_map_lookup_percpu_elem, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, .arg3_type = ARG_ANYTHING, }; const struct bpf_func_proto bpf_get_prandom_u32_proto = { .func = bpf_user_rnd_u32, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_get_smp_processor_id) { return smp_processor_id(); } const struct bpf_func_proto bpf_get_smp_processor_id_proto = { .func = bpf_get_smp_processor_id, .gpl_only = false, .ret_type = RET_INTEGER, .allow_fastcall = true, }; BPF_CALL_0(bpf_get_numa_node_id) { return numa_node_id(); } const struct bpf_func_proto bpf_get_numa_node_id_proto = { .func = bpf_get_numa_node_id, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_ktime_get_ns) { /* NMI safe access to clock monotonic */ return ktime_get_mono_fast_ns(); } const struct bpf_func_proto bpf_ktime_get_ns_proto = { .func = bpf_ktime_get_ns, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_ktime_get_boot_ns) { /* NMI safe access to clock boottime */ return ktime_get_boot_fast_ns(); } const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = { .func = bpf_ktime_get_boot_ns, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_ktime_get_coarse_ns) { return ktime_get_coarse_ns(); } const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto = { .func = bpf_ktime_get_coarse_ns, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_ktime_get_tai_ns) { /* NMI safe access to clock tai */ return ktime_get_tai_fast_ns(); } const struct bpf_func_proto bpf_ktime_get_tai_ns_proto = { .func = bpf_ktime_get_tai_ns, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_get_current_pid_tgid) { struct task_struct *task = current; if (unlikely(!task)) return -EINVAL; return (u64) task->tgid << 32 | task->pid; } const struct bpf_func_proto bpf_get_current_pid_tgid_proto = { .func = bpf_get_current_pid_tgid, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_get_current_uid_gid) { struct task_struct *task = current; kuid_t uid; kgid_t gid; if (unlikely(!task)) return -EINVAL; current_uid_gid(&uid, &gid); return (u64) from_kgid(&init_user_ns, gid) << 32 | from_kuid(&init_user_ns, uid); } const struct bpf_func_proto bpf_get_current_uid_gid_proto = { .func = bpf_get_current_uid_gid, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size) { struct task_struct *task = current; if (unlikely(!task)) goto err_clear; /* Verifier guarantees that size > 0 */ strscpy_pad(buf, task->comm, size); return 0; err_clear: memset(buf, 0, size); return -EINVAL; } const struct bpf_func_proto bpf_get_current_comm_proto = { .func = bpf_get_current_comm, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE, }; #if defined(CONFIG_QUEUED_SPINLOCKS) || defined(CONFIG_BPF_ARCH_SPINLOCK) static inline void __bpf_spin_lock(struct bpf_spin_lock *lock) { arch_spinlock_t *l = (void *)lock; union { __u32 val; arch_spinlock_t lock; } u = { .lock = __ARCH_SPIN_LOCK_UNLOCKED }; compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0"); BUILD_BUG_ON(sizeof(*l) != sizeof(__u32)); BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32)); preempt_disable(); arch_spin_lock(l); } static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock) { arch_spinlock_t *l = (void *)lock; arch_spin_unlock(l); preempt_enable(); } #else static inline void __bpf_spin_lock(struct bpf_spin_lock *lock) { atomic_t *l = (void *)lock; BUILD_BUG_ON(sizeof(*l) != sizeof(*lock)); do { atomic_cond_read_relaxed(l, !VAL); } while (atomic_xchg(l, 1)); } static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock) { atomic_t *l = (void *)lock; atomic_set_release(l, 0); } #endif static DEFINE_PER_CPU(unsigned long, irqsave_flags); static inline void __bpf_spin_lock_irqsave(struct bpf_spin_lock *lock) { unsigned long flags; local_irq_save(flags); __bpf_spin_lock(lock); __this_cpu_write(irqsave_flags, flags); } NOTRACE_BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock) { __bpf_spin_lock_irqsave(lock); return 0; } const struct bpf_func_proto bpf_spin_lock_proto = { .func = bpf_spin_lock, .gpl_only = false, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_SPIN_LOCK, .arg1_btf_id = BPF_PTR_POISON, }; static inline void __bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock) { unsigned long flags; flags = __this_cpu_read(irqsave_flags); __bpf_spin_unlock(lock); local_irq_restore(flags); } NOTRACE_BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock) { __bpf_spin_unlock_irqrestore(lock); return 0; } const struct bpf_func_proto bpf_spin_unlock_proto = { .func = bpf_spin_unlock, .gpl_only = false, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_SPIN_LOCK, .arg1_btf_id = BPF_PTR_POISON, }; void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src) { struct bpf_spin_lock *lock; if (lock_src) lock = src + map->record->spin_lock_off; else lock = dst + map->record->spin_lock_off; preempt_disable(); __bpf_spin_lock_irqsave(lock); copy_map_value(map, dst, src); __bpf_spin_unlock_irqrestore(lock); preempt_enable(); } BPF_CALL_0(bpf_jiffies64) { return get_jiffies_64(); } const struct bpf_func_proto bpf_jiffies64_proto = { .func = bpf_jiffies64, .gpl_only = false, .ret_type = RET_INTEGER, }; #ifdef CONFIG_CGROUPS BPF_CALL_0(bpf_get_current_cgroup_id) { struct cgroup *cgrp; u64 cgrp_id; rcu_read_lock(); cgrp = task_dfl_cgroup(current); cgrp_id = cgroup_id(cgrp); rcu_read_unlock(); return cgrp_id; } const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { .func = bpf_get_current_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_1(bpf_get_current_ancestor_cgroup_id, int, ancestor_level) { struct cgroup *cgrp; struct cgroup *ancestor; u64 cgrp_id; rcu_read_lock(); cgrp = task_dfl_cgroup(current); ancestor = cgroup_ancestor(cgrp, ancestor_level); cgrp_id = ancestor ? cgroup_id(ancestor) : 0; rcu_read_unlock(); return cgrp_id; } const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = { .func = bpf_get_current_ancestor_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, }; #endif /* CONFIG_CGROUPS */ #define BPF_STRTOX_BASE_MASK 0x1F static int __bpf_strtoull(const char *buf, size_t buf_len, u64 flags, unsigned long long *res, bool *is_negative) { unsigned int base = flags & BPF_STRTOX_BASE_MASK; const char *cur_buf = buf; size_t cur_len = buf_len; unsigned int consumed; size_t val_len; char str[64]; if (!buf || !buf_len || !res || !is_negative) return -EINVAL; if (base != 0 && base != 8 && base != 10 && base != 16) return -EINVAL; if (flags & ~BPF_STRTOX_BASE_MASK) return -EINVAL; while (cur_buf < buf + buf_len && isspace(*cur_buf)) ++cur_buf; *is_negative = (cur_buf < buf + buf_len && *cur_buf == '-'); if (*is_negative) ++cur_buf; consumed = cur_buf - buf; cur_len -= consumed; if (!cur_len) return -EINVAL; cur_len = min(cur_len, sizeof(str) - 1); memcpy(str, cur_buf, cur_len); str[cur_len] = '\0'; cur_buf = str; cur_buf = _parse_integer_fixup_radix(cur_buf, &base); val_len = _parse_integer(cur_buf, base, res); if (val_len & KSTRTOX_OVERFLOW) return -ERANGE; if (val_len == 0) return -EINVAL; cur_buf += val_len; consumed += cur_buf - str; return consumed; } static int __bpf_strtoll(const char *buf, size_t buf_len, u64 flags, long long *res) { unsigned long long _res; bool is_negative; int err; err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative); if (err < 0) return err; if (is_negative) { if ((long long)-_res > 0) return -ERANGE; *res = -_res; } else { if ((long long)_res < 0) return -ERANGE; *res = _res; } return err; } BPF_CALL_4(bpf_strtol, const char *, buf, size_t, buf_len, u64, flags, s64 *, res) { long long _res; int err; *res = 0; err = __bpf_strtoll(buf, buf_len, flags, &_res); if (err < 0) return err; *res = _res; return err; } const struct bpf_func_proto bpf_strtol_proto = { .func = bpf_strtol, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg4_size = sizeof(s64), }; BPF_CALL_4(bpf_strtoul, const char *, buf, size_t, buf_len, u64, flags, u64 *, res) { unsigned long long _res; bool is_negative; int err; *res = 0; err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative); if (err < 0) return err; if (is_negative) return -EINVAL; *res = _res; return err; } const struct bpf_func_proto bpf_strtoul_proto = { .func = bpf_strtoul, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg4_size = sizeof(u64), }; BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2) { return strncmp(s1, s2, s1_sz); } static const struct bpf_func_proto bpf_strncmp_proto = { .func = bpf_strncmp, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_PTR_TO_CONST_STR, }; BPF_CALL_4(bpf_get_ns_current_pid_tgid, u64, dev, u64, ino, struct bpf_pidns_info *, nsdata, u32, size) { struct task_struct *task = current; struct pid_namespace *pidns; int err = -EINVAL; if (unlikely(size != sizeof(struct bpf_pidns_info))) goto clear; if (unlikely((u64)(dev_t)dev != dev)) goto clear; if (unlikely(!task)) goto clear; pidns = task_active_pid_ns(task); if (unlikely(!pidns)) { err = -ENOENT; goto clear; } if (!ns_match(&pidns->ns, (dev_t)dev, ino)) goto clear; nsdata->pid = task_pid_nr_ns(task, pidns); nsdata->tgid = task_tgid_nr_ns(task, pidns); return 0; clear: memset((void *)nsdata, 0, (size_t) size); return err; } const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto = { .func = bpf_get_ns_current_pid_tgid, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { .func = bpf_get_raw_cpu_id, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map, u64, flags, void *, data, u64, size) { if (unlikely(flags & ~(BPF_F_INDEX_MASK))) return -EINVAL; return bpf_event_output(map, flags, data, size, NULL, 0, NULL); } const struct bpf_func_proto bpf_event_output_data_proto = { .func = bpf_event_output_data, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_3(bpf_copy_from_user, void *, dst, u32, size, const void __user *, user_ptr) { int ret = copy_from_user(dst, user_ptr, size); if (unlikely(ret)) { memset(dst, 0, size); ret = -EFAULT; } return ret; } const struct bpf_func_proto bpf_copy_from_user_proto = { .func = bpf_copy_from_user, .gpl_only = false, .might_sleep = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size, const void __user *, user_ptr, struct task_struct *, tsk, u64, flags) { int ret; /* flags is not used yet */ if (unlikely(flags)) return -EINVAL; if (unlikely(!size)) return 0; ret = access_process_vm(tsk, (unsigned long)user_ptr, dst, size, 0); if (ret == size) return 0; memset(dst, 0, size); /* Return -EFAULT for partial read */ return ret < 0 ? ret : -EFAULT; } const struct bpf_func_proto bpf_copy_from_user_task_proto = { .func = bpf_copy_from_user_task, .gpl_only = true, .might_sleep = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_BTF_ID, .arg4_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], .arg5_type = ARG_ANYTHING }; BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu) { if (cpu >= nr_cpu_ids) return (unsigned long)NULL; return (unsigned long)per_cpu_ptr((const void __percpu *)(const uintptr_t)ptr, cpu); } const struct bpf_func_proto bpf_per_cpu_ptr_proto = { .func = bpf_per_cpu_ptr, .gpl_only = false, .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL | MEM_RDONLY, .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, .arg2_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr) { return (unsigned long)this_cpu_ptr((const void __percpu *)(const uintptr_t)percpu_ptr); } const struct bpf_func_proto bpf_this_cpu_ptr_proto = { .func = bpf_this_cpu_ptr, .gpl_only = false, .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | MEM_RDONLY, .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, }; static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, size_t bufsz) { void __user *user_ptr = (__force void __user *)unsafe_ptr; buf[0] = 0; switch (fmt_ptype) { case 's': #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE if ((unsigned long)unsafe_ptr < TASK_SIZE) return strncpy_from_user_nofault(buf, user_ptr, bufsz); fallthrough; #endif case 'k': return strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz); case 'u': return strncpy_from_user_nofault(buf, user_ptr, bufsz); } return -EINVAL; } /* Support executing three nested bprintf helper calls on a given CPU */ #define MAX_BPRINTF_NEST_LEVEL 3 static DEFINE_PER_CPU(struct bpf_bprintf_buffers[MAX_BPRINTF_NEST_LEVEL], bpf_bprintf_bufs); static DEFINE_PER_CPU(int, bpf_bprintf_nest_level); int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs) { int nest_level; preempt_disable(); nest_level = this_cpu_inc_return(bpf_bprintf_nest_level); if (WARN_ON_ONCE(nest_level > MAX_BPRINTF_NEST_LEVEL)) { this_cpu_dec(bpf_bprintf_nest_level); preempt_enable(); return -EBUSY; } *bufs = this_cpu_ptr(&bpf_bprintf_bufs[nest_level - 1]); return 0; } void bpf_put_buffers(void) { if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0)) return; this_cpu_dec(bpf_bprintf_nest_level); preempt_enable(); } void bpf_bprintf_cleanup(struct bpf_bprintf_data *data) { if (!data->bin_args && !data->buf) return; bpf_put_buffers(); } /* * bpf_bprintf_prepare - Generic pass on format strings for bprintf-like helpers * * Returns a negative value if fmt is an invalid format string or 0 otherwise. * * This can be used in two ways: * - Format string verification only: when data->get_bin_args is false * - Arguments preparation: in addition to the above verification, it writes in * data->bin_args a binary representation of arguments usable by bstr_printf * where pointers from BPF have been sanitized. * * In argument preparation mode, if 0 is returned, safe temporary buffers are * allocated and bpf_bprintf_cleanup should be called to free them after use. */ int bpf_bprintf_prepare(const char *fmt, u32 fmt_size, const u64 *raw_args, u32 num_args, struct bpf_bprintf_data *data) { bool get_buffers = (data->get_bin_args && num_args) || data->get_buf; char *unsafe_ptr = NULL, *tmp_buf = NULL, *tmp_buf_end, *fmt_end; struct bpf_bprintf_buffers *buffers = NULL; size_t sizeof_cur_arg, sizeof_cur_ip; int err, i, num_spec = 0; u64 cur_arg; char fmt_ptype, cur_ip[16], ip_spec[] = "%pXX"; fmt_end = strnchr(fmt, fmt_size, 0); if (!fmt_end) return -EINVAL; fmt_size = fmt_end - fmt; if (get_buffers && bpf_try_get_buffers(&buffers)) return -EBUSY; if (data->get_bin_args) { if (num_args) tmp_buf = buffers->bin_args; tmp_buf_end = tmp_buf + MAX_BPRINTF_BIN_ARGS; data->bin_args = (u32 *)tmp_buf; } if (data->get_buf) data->buf = buffers->buf; for (i = 0; i < fmt_size; i++) { if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) { err = -EINVAL; goto out; } if (fmt[i] != '%') continue; if (fmt[i + 1] == '%') { i++; continue; } if (num_spec >= num_args) { err = -EINVAL; goto out; } /* The string is zero-terminated so if fmt[i] != 0, we can * always access fmt[i + 1], in the worst case it will be a 0 */ i++; /* skip optional "[0 +-][num]" width formatting field */ while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' || fmt[i] == ' ') i++; if (fmt[i] >= '1' && fmt[i] <= '9') { i++; while (fmt[i] >= '0' && fmt[i] <= '9') i++; } if (fmt[i] == 'p') { sizeof_cur_arg = sizeof(long); if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) || ispunct(fmt[i + 1])) { if (tmp_buf) cur_arg = raw_args[num_spec]; goto nocopy_fmt; } if ((fmt[i + 1] == 'k' || fmt[i + 1] == 'u') && fmt[i + 2] == 's') { fmt_ptype = fmt[i + 1]; i += 2; goto fmt_str; } if (fmt[i + 1] == 'K' || fmt[i + 1] == 'x' || fmt[i + 1] == 's' || fmt[i + 1] == 'S') { if (tmp_buf) cur_arg = raw_args[num_spec]; i++; goto nocopy_fmt; } if (fmt[i + 1] == 'B') { if (tmp_buf) { err = snprintf(tmp_buf, (tmp_buf_end - tmp_buf), "%pB", (void *)(long)raw_args[num_spec]); tmp_buf += (err + 1); } i++; num_spec++; continue; } /* only support "%pI4", "%pi4", "%pI6" and "%pi6". */ if ((fmt[i + 1] != 'i' && fmt[i + 1] != 'I') || (fmt[i + 2] != '4' && fmt[i + 2] != '6')) { err = -EINVAL; goto out; } i += 2; if (!tmp_buf) goto nocopy_fmt; sizeof_cur_ip = (fmt[i] == '4') ? 4 : 16; if (tmp_buf_end - tmp_buf < sizeof_cur_ip) { err = -ENOSPC; goto out; } unsafe_ptr = (char *)(long)raw_args[num_spec]; err = copy_from_kernel_nofault(cur_ip, unsafe_ptr, sizeof_cur_ip); if (err < 0) memset(cur_ip, 0, sizeof_cur_ip); /* hack: bstr_printf expects IP addresses to be * pre-formatted as strings, ironically, the easiest way * to do that is to call snprintf. */ ip_spec[2] = fmt[i - 1]; ip_spec[3] = fmt[i]; err = snprintf(tmp_buf, tmp_buf_end - tmp_buf, ip_spec, &cur_ip); tmp_buf += err + 1; num_spec++; continue; } else if (fmt[i] == 's') { fmt_ptype = fmt[i]; fmt_str: if (fmt[i + 1] != 0 && !isspace(fmt[i + 1]) && !ispunct(fmt[i + 1])) { err = -EINVAL; goto out; } if (!tmp_buf) goto nocopy_fmt; if (tmp_buf_end == tmp_buf) { err = -ENOSPC; goto out; } unsafe_ptr = (char *)(long)raw_args[num_spec]; err = bpf_trace_copy_string(tmp_buf, unsafe_ptr, fmt_ptype, tmp_buf_end - tmp_buf); if (err < 0) { tmp_buf[0] = '\0'; err = 1; } tmp_buf += err; num_spec++; continue; } else if (fmt[i] == 'c') { if (!tmp_buf) goto nocopy_fmt; if (tmp_buf_end == tmp_buf) { err = -ENOSPC; goto out; } *tmp_buf = raw_args[num_spec]; tmp_buf++; num_spec++; continue; } sizeof_cur_arg = sizeof(int); if (fmt[i] == 'l') { sizeof_cur_arg = sizeof(long); i++; } if (fmt[i] == 'l') { sizeof_cur_arg = sizeof(long long); i++; } if (fmt[i] != 'i' && fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x' && fmt[i] != 'X') { err = -EINVAL; goto out; } if (tmp_buf) cur_arg = raw_args[num_spec]; nocopy_fmt: if (tmp_buf) { tmp_buf = PTR_ALIGN(tmp_buf, sizeof(u32)); if (tmp_buf_end - tmp_buf < sizeof_cur_arg) { err = -ENOSPC; goto out; } if (sizeof_cur_arg == 8) { *(u32 *)tmp_buf = *(u32 *)&cur_arg; *(u32 *)(tmp_buf + 4) = *((u32 *)&cur_arg + 1); } else { *(u32 *)tmp_buf = (u32)(long)cur_arg; } tmp_buf += sizeof_cur_arg; } num_spec++; } err = 0; out: if (err) bpf_bprintf_cleanup(data); return err; } BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt, const void *, args, u32, data_len) { struct bpf_bprintf_data data = { .get_bin_args = true, }; int err, num_args; if (data_len % 8 || data_len > MAX_BPRINTF_VARARGS * 8 || (data_len && !args)) return -EINVAL; num_args = data_len / 8; /* ARG_PTR_TO_CONST_STR guarantees that fmt is zero-terminated so we * can safely give an unbounded size. */ err = bpf_bprintf_prepare(fmt, UINT_MAX, args, num_args, &data); if (err < 0) return err; err = bstr_printf(str, str_size, fmt, data.bin_args); bpf_bprintf_cleanup(&data); return err + 1; } const struct bpf_func_proto bpf_snprintf_proto = { .func = bpf_snprintf, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM_OR_NULL | MEM_WRITE, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_PTR_TO_CONST_STR, .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; static void *map_key_from_value(struct bpf_map *map, void *value, u32 *arr_idx) { if (map->map_type == BPF_MAP_TYPE_ARRAY) { struct bpf_array *array = container_of(map, struct bpf_array, map); *arr_idx = ((char *)value - array->value) / array->elem_size; return arr_idx; } return (void *)value - round_up(map->key_size, 8); } enum bpf_async_type { BPF_ASYNC_TYPE_TIMER = 0, BPF_ASYNC_TYPE_WQ, }; enum bpf_async_op { BPF_ASYNC_START, BPF_ASYNC_CANCEL }; struct bpf_async_cmd { struct llist_node node; u64 nsec; u32 mode; enum bpf_async_op op; }; struct bpf_async_cb { struct bpf_map *map; struct bpf_prog *prog; void __rcu *callback_fn; void *value; struct rcu_head rcu; u64 flags; struct irq_work worker; refcount_t refcnt; enum bpf_async_type type; struct llist_head async_cmds; }; /* BPF map elements can contain 'struct bpf_timer'. * Such map owns all of its BPF timers. * 'struct bpf_timer' is allocated as part of map element allocation * and it's zero initialized. * That space is used to keep 'struct bpf_async_kern'. * bpf_timer_init() allocates 'struct bpf_hrtimer', inits hrtimer, and * remembers 'struct bpf_map *' pointer it's part of. * bpf_timer_set_callback() increments prog refcnt and assign bpf callback_fn. * bpf_timer_start() arms the timer. * If user space reference to a map goes to zero at this point * ops->map_release_uref callback is responsible for cancelling the timers, * freeing their memory, and decrementing prog's refcnts. * bpf_timer_cancel() cancels the timer and decrements prog's refcnt. * Inner maps can contain bpf timers as well. ops->map_release_uref is * freeing the timers when inner map is replaced or deleted by user space. */ struct bpf_hrtimer { struct bpf_async_cb cb; struct hrtimer timer; atomic_t cancelling; }; struct bpf_work { struct bpf_async_cb cb; struct work_struct work; }; /* the actual struct hidden inside uapi struct bpf_timer and bpf_wq */ struct bpf_async_kern { union { struct bpf_async_cb *cb; struct bpf_hrtimer *timer; struct bpf_work *work; }; } __attribute__((aligned(8))); static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running); static void bpf_async_refcount_put(struct bpf_async_cb *cb); static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer) { struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer); struct bpf_map *map = t->cb.map; void *value = t->cb.value; bpf_callback_t callback_fn; void *key; u32 idx; BTF_TYPE_EMIT(struct bpf_timer); callback_fn = rcu_dereference_check(t->cb.callback_fn, rcu_read_lock_bh_held()); if (!callback_fn) goto out; /* bpf_timer_cb() runs in hrtimer_run_softirq. It doesn't migrate and * cannot be preempted by another bpf_timer_cb() on the same cpu. * Remember the timer this callback is servicing to prevent * deadlock if callback_fn() calls bpf_timer_cancel() or * bpf_map_delete_elem() on the same timer. */ this_cpu_write(hrtimer_running, t); key = map_key_from_value(map, value, &idx); callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0); /* The verifier checked that return value is zero. */ this_cpu_write(hrtimer_running, NULL); out: return HRTIMER_NORESTART; } static void bpf_wq_work(struct work_struct *work) { struct bpf_work *w = container_of(work, struct bpf_work, work); struct bpf_async_cb *cb = &w->cb; struct bpf_map *map = cb->map; bpf_callback_t callback_fn; void *value = cb->value; void *key; u32 idx; BTF_TYPE_EMIT(struct bpf_wq); callback_fn = READ_ONCE(cb->callback_fn); if (!callback_fn) return; key = map_key_from_value(map, value, &idx); rcu_read_lock_trace(); migrate_disable(); callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0); migrate_enable(); rcu_read_unlock_trace(); } static void bpf_async_cb_rcu_free(struct rcu_head *rcu) { struct bpf_async_cb *cb = container_of(rcu, struct bpf_async_cb, rcu); /* * Drop the last reference to prog only after RCU GP, as set_callback() * may race with cancel_and_free() */ if (cb->prog) bpf_prog_put(cb->prog); kfree_nolock(cb); } /* Callback from call_rcu_tasks_trace, chains to call_rcu for final free */ static void bpf_async_cb_rcu_tasks_trace_free(struct rcu_head *rcu) { struct bpf_async_cb *cb = container_of(rcu, struct bpf_async_cb, rcu); struct bpf_hrtimer *t = container_of(cb, struct bpf_hrtimer, cb); struct bpf_work *w = container_of(cb, struct bpf_work, cb); bool retry = false; /* * bpf_async_cancel_and_free() tried to cancel timer/wq, but it * could have raced with timer/wq_start. Now refcnt is zero and * srcu/rcu GP completed. Cancel timer/wq again. */ switch (cb->type) { case BPF_ASYNC_TYPE_TIMER: if (hrtimer_try_to_cancel(&t->timer) < 0) retry = true; break; case BPF_ASYNC_TYPE_WQ: if (!cancel_work(&w->work) && work_busy(&w->work)) retry = true; break; } if (retry) { /* * hrtimer or wq callback may still be running. It must be * in rcu_tasks_trace or rcu CS, so wait for GP again. * It won't retry forever, since refcnt zero prevents all * operations on timer/wq. */ call_rcu_tasks_trace(&cb->rcu, bpf_async_cb_rcu_tasks_trace_free); return; } /* RCU Tasks Trace grace period implies RCU grace period. */ bpf_async_cb_rcu_free(rcu); } static void worker_for_call_rcu(struct irq_work *work) { struct bpf_async_cb *cb = container_of(work, struct bpf_async_cb, worker); call_rcu_tasks_trace(&cb->rcu, bpf_async_cb_rcu_tasks_trace_free); } static void bpf_async_refcount_put(struct bpf_async_cb *cb) { if (!refcount_dec_and_test(&cb->refcnt)) return; if (irqs_disabled()) { cb->worker = IRQ_WORK_INIT(worker_for_call_rcu); irq_work_queue(&cb->worker); } else { call_rcu_tasks_trace(&cb->rcu, bpf_async_cb_rcu_tasks_trace_free); } } static void bpf_async_cancel_and_free(struct bpf_async_kern *async); static void bpf_async_irq_worker(struct irq_work *work); static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags, enum bpf_async_type type) { struct bpf_async_cb *cb, *old_cb; struct bpf_hrtimer *t; struct bpf_work *w; clockid_t clockid; size_t size; switch (type) { case BPF_ASYNC_TYPE_TIMER: size = sizeof(struct bpf_hrtimer); break; case BPF_ASYNC_TYPE_WQ: size = sizeof(struct bpf_work); break; default: return -EINVAL; } old_cb = READ_ONCE(async->cb); if (old_cb) return -EBUSY; cb = bpf_map_kmalloc_nolock(map, size, 0, map->numa_node); if (!cb) return -ENOMEM; switch (type) { case BPF_ASYNC_TYPE_TIMER: clockid = flags & (MAX_CLOCKS - 1); t = (struct bpf_hrtimer *)cb; atomic_set(&t->cancelling, 0); hrtimer_setup(&t->timer, bpf_timer_cb, clockid, HRTIMER_MODE_REL_SOFT); cb->value = (void *)async - map->record->timer_off; break; case BPF_ASYNC_TYPE_WQ: w = (struct bpf_work *)cb; INIT_WORK(&w->work, bpf_wq_work); cb->value = (void *)async - map->record->wq_off; break; } cb->map = map; cb->prog = NULL; cb->flags = flags; cb->worker = IRQ_WORK_INIT(bpf_async_irq_worker); init_llist_head(&cb->async_cmds); refcount_set(&cb->refcnt, 1); /* map's reference */ cb->type = type; rcu_assign_pointer(cb->callback_fn, NULL); old_cb = cmpxchg(&async->cb, NULL, cb); if (old_cb) { /* Lost the race to initialize this bpf_async_kern, drop the allocated object */ kfree_nolock(cb); return -EBUSY; } /* Guarantee the order between async->cb and map->usercnt. So * when there are concurrent uref release and bpf timer init, either * bpf_timer_cancel_and_free() called by uref release reads a no-NULL * timer or atomic64_read() below returns a zero usercnt. */ smp_mb(); if (!atomic64_read(&map->usercnt)) { /* maps with timers must be either held by user space * or pinned in bpffs. */ bpf_async_cancel_and_free(async); return -EPERM; } return 0; } BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map, u64, flags) { clock_t clockid = flags & (MAX_CLOCKS - 1); BUILD_BUG_ON(MAX_CLOCKS != 16); BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_timer)); BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_timer)); if (flags >= MAX_CLOCKS || /* similar to timerfd except _ALARM variants are not supported */ (clockid != CLOCK_MONOTONIC && clockid != CLOCK_REALTIME && clockid != CLOCK_BOOTTIME)) return -EINVAL; return __bpf_async_init(timer, map, flags, BPF_ASYNC_TYPE_TIMER); } static const struct bpf_func_proto bpf_timer_init_proto = { .func = bpf_timer_init, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_TIMER, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, }; static int bpf_async_update_prog_callback(struct bpf_async_cb *cb, struct bpf_prog *prog, void *callback_fn) { struct bpf_prog *prev; /* Acquire a guard reference on prog to prevent it from being freed during the loop */ if (prog) { prog = bpf_prog_inc_not_zero(prog); if (IS_ERR(prog)) return PTR_ERR(prog); } do { if (prog) prog = bpf_prog_inc_not_zero(prog); prev = xchg(&cb->prog, prog); rcu_assign_pointer(cb->callback_fn, callback_fn); /* * Release previous prog, make sure that if other CPU is contending, * to set bpf_prog, references are not leaked as each iteration acquires and * releases one reference. */ if (prev) bpf_prog_put(prev); } while (READ_ONCE(cb->prog) != prog || (void __force *)READ_ONCE(cb->callback_fn) != callback_fn); if (prog) bpf_prog_put(prog); return 0; } static DEFINE_PER_CPU(struct bpf_async_cb *, async_cb_running); static int bpf_async_schedule_op(struct bpf_async_cb *cb, enum bpf_async_op op, u64 nsec, u32 timer_mode) { /* * Do not schedule another operation on this cpu if it's in irq_work * callback that is processing async_cmds queue. Otherwise the following * loop is possible: * bpf_timer_start() -> bpf_async_schedule_op() -> irq_work_queue(). * irqrestore -> bpf_async_irq_worker() -> tracepoint -> bpf_timer_start(). */ if (this_cpu_read(async_cb_running) == cb) { bpf_async_refcount_put(cb); return -EDEADLK; } struct bpf_async_cmd *cmd = kmalloc_nolock(sizeof(*cmd), 0, NUMA_NO_NODE); if (!cmd) { bpf_async_refcount_put(cb); return -ENOMEM; } init_llist_node(&cmd->node); cmd->nsec = nsec; cmd->mode = timer_mode; cmd->op = op; if (llist_add(&cmd->node, &cb->async_cmds)) irq_work_queue(&cb->worker); return 0; } static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn, struct bpf_prog *prog) { struct bpf_async_cb *cb; cb = READ_ONCE(async->cb); if (!cb) return -EINVAL; return bpf_async_update_prog_callback(cb, prog, callback_fn); } BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn, struct bpf_prog_aux *, aux) { return __bpf_async_set_callback(timer, callback_fn, aux->prog); } static const struct bpf_func_proto bpf_timer_set_callback_proto = { .func = bpf_timer_set_callback, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_TIMER, .arg2_type = ARG_PTR_TO_FUNC, }; static bool defer_timer_wq_op(void) { return in_hardirq() || irqs_disabled(); } BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, async, u64, nsecs, u64, flags) { struct bpf_hrtimer *t; u32 mode; if (flags & ~(BPF_F_TIMER_ABS | BPF_F_TIMER_CPU_PIN)) return -EINVAL; t = READ_ONCE(async->timer); if (!t || !READ_ONCE(t->cb.prog)) return -EINVAL; if (flags & BPF_F_TIMER_ABS) mode = HRTIMER_MODE_ABS_SOFT; else mode = HRTIMER_MODE_REL_SOFT; if (flags & BPF_F_TIMER_CPU_PIN) mode |= HRTIMER_MODE_PINNED; /* * bpf_async_cancel_and_free() could have dropped refcnt to zero. In * such case BPF progs are not allowed to arm the timer to prevent UAF. */ if (!refcount_inc_not_zero(&t->cb.refcnt)) return -ENOENT; if (!defer_timer_wq_op()) { hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode); bpf_async_refcount_put(&t->cb); return 0; } else { return bpf_async_schedule_op(&t->cb, BPF_ASYNC_START, nsecs, mode); } } static const struct bpf_func_proto bpf_timer_start_proto = { .func = bpf_timer_start, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_TIMER, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, async) { struct bpf_hrtimer *t, *cur_t; bool inc = false; int ret = 0; if (defer_timer_wq_op()) return -EOPNOTSUPP; t = READ_ONCE(async->timer); if (!t) return -EINVAL; cur_t = this_cpu_read(hrtimer_running); if (cur_t == t) { /* If bpf callback_fn is trying to bpf_timer_cancel() * its own timer the hrtimer_cancel() will deadlock * since it waits for callback_fn to finish. */ return -EDEADLK; } /* Only account in-flight cancellations when invoked from a timer * callback, since we want to avoid waiting only if other _callbacks_ * are waiting on us, to avoid introducing lockups. Non-callback paths * are ok, since nobody would synchronously wait for their completion. */ if (!cur_t) goto drop; atomic_inc(&t->cancelling); /* Need full barrier after relaxed atomic_inc */ smp_mb__after_atomic(); inc = true; if (atomic_read(&cur_t->cancelling)) { /* We're cancelling timer t, while some other timer callback is * attempting to cancel us. In such a case, it might be possible * that timer t belongs to the other callback, or some other * callback waiting upon it (creating transitive dependencies * upon us), and we will enter a deadlock if we continue * cancelling and waiting for it synchronously, since it might * do the same. Bail! */ atomic_dec(&t->cancelling); return -EDEADLK; } drop: bpf_async_update_prog_callback(&t->cb, NULL, NULL); /* Cancel the timer and wait for associated callback to finish * if it was running. */ ret = hrtimer_cancel(&t->timer); if (inc) atomic_dec(&t->cancelling); return ret; } static const struct bpf_func_proto bpf_timer_cancel_proto = { .func = bpf_timer_cancel, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_TIMER, }; static void bpf_async_process_op(struct bpf_async_cb *cb, u32 op, u64 timer_nsec, u32 timer_mode) { switch (cb->type) { case BPF_ASYNC_TYPE_TIMER: { struct bpf_hrtimer *t = container_of(cb, struct bpf_hrtimer, cb); switch (op) { case BPF_ASYNC_START: hrtimer_start(&t->timer, ns_to_ktime(timer_nsec), timer_mode); break; case BPF_ASYNC_CANCEL: hrtimer_try_to_cancel(&t->timer); break; } break; } case BPF_ASYNC_TYPE_WQ: { struct bpf_work *w = container_of(cb, struct bpf_work, cb); switch (op) { case BPF_ASYNC_START: schedule_work(&w->work); break; case BPF_ASYNC_CANCEL: cancel_work(&w->work); break; } break; } } bpf_async_refcount_put(cb); } static void bpf_async_irq_worker(struct irq_work *work) { struct bpf_async_cb *cb = container_of(work, struct bpf_async_cb, worker); struct llist_node *pos, *n, *list; list = llist_del_all(&cb->async_cmds); if (!list) return; list = llist_reverse_order(list); this_cpu_write(async_cb_running, cb); llist_for_each_safe(pos, n, list) { struct bpf_async_cmd *cmd; cmd = container_of(pos, struct bpf_async_cmd, node); bpf_async_process_op(cb, cmd->op, cmd->nsec, cmd->mode); kfree_nolock(cmd); } this_cpu_write(async_cb_running, NULL); } static void bpf_async_cancel_and_free(struct bpf_async_kern *async) { struct bpf_async_cb *cb; if (!READ_ONCE(async->cb)) return; cb = xchg(&async->cb, NULL); if (!cb) return; bpf_async_update_prog_callback(cb, NULL, NULL); /* * No refcount_inc_not_zero(&cb->refcnt) here. Dropping the last * refcnt. Either synchronously or asynchronously in irq_work. */ if (!defer_timer_wq_op()) { bpf_async_process_op(cb, BPF_ASYNC_CANCEL, 0, 0); } else { (void)bpf_async_schedule_op(cb, BPF_ASYNC_CANCEL, 0, 0); /* * bpf_async_schedule_op() either enqueues allocated cmd into llist * or fails with ENOMEM and drop the last refcnt. * This is unlikely, but safe, since bpf_async_cb_rcu_tasks_trace_free() * callback will do additional timer/wq_cancel due to races anyway. */ } } /* * This function is called by map_delete/update_elem for individual element and * by ops->map_release_uref when the user space reference to a map reaches zero. */ void bpf_timer_cancel_and_free(void *val) { bpf_async_cancel_and_free(val); } /* * This function is called by map_delete/update_elem for individual element and * by ops->map_release_uref when the user space reference to a map reaches zero. */ void bpf_wq_cancel_and_free(void *val) { bpf_async_cancel_and_free(val); } BPF_CALL_2(bpf_kptr_xchg, void *, dst, void *, ptr) { unsigned long *kptr = dst; /* This helper may be inlined by verifier. */ return xchg(kptr, (unsigned long)ptr); } /* Unlike other PTR_TO_BTF_ID helpers the btf_id in bpf_kptr_xchg() * helper is determined dynamically by the verifier. Use BPF_PTR_POISON to * denote type that verifier will determine. */ static const struct bpf_func_proto bpf_kptr_xchg_proto = { .func = bpf_kptr_xchg, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .ret_btf_id = BPF_PTR_POISON, .arg1_type = ARG_KPTR_XCHG_DEST, .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL | OBJ_RELEASE, .arg2_btf_id = BPF_PTR_POISON, }; struct bpf_dynptr_file_impl { struct freader freader; /* 64 bit offset and size overriding 32 bit ones in bpf_dynptr_kern */ u64 offset; u64 size; }; /* Since the upper 8 bits of dynptr->size is reserved, the * maximum supported size is 2^24 - 1. */ #define DYNPTR_MAX_SIZE ((1UL << 24) - 1) #define DYNPTR_TYPE_SHIFT 28 #define DYNPTR_SIZE_MASK 0xFFFFFF #define DYNPTR_RDONLY_BIT BIT(31) bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_RDONLY_BIT; } void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr) { ptr->size |= DYNPTR_RDONLY_BIT; } static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type) { ptr->size |= type << DYNPTR_TYPE_SHIFT; } static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *ptr) { return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT; } u64 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr) { if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) { struct bpf_dynptr_file_impl *df = ptr->data; return df->size; } return ptr->size & DYNPTR_SIZE_MASK; } static void bpf_dynptr_advance_offset(struct bpf_dynptr_kern *ptr, u64 off) { if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) { struct bpf_dynptr_file_impl *df = ptr->data; df->offset += off; return; } ptr->offset += off; } static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u64 new_size) { u32 metadata = ptr->size & ~DYNPTR_SIZE_MASK; if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) { struct bpf_dynptr_file_impl *df = ptr->data; df->size = new_size; return; } ptr->size = (u32)new_size | metadata; } int bpf_dynptr_check_size(u64 size) { return size > DYNPTR_MAX_SIZE ? -E2BIG : 0; } static int bpf_file_fetch_bytes(struct bpf_dynptr_file_impl *df, u64 offset, void *buf, u64 len) { const void *ptr; if (!buf) return -EINVAL; df->freader.buf = buf; df->freader.buf_sz = len; ptr = freader_fetch(&df->freader, offset + df->offset, len); if (!ptr) return df->freader.err; if (ptr != buf) /* Force copying into the buffer */ memcpy(buf, ptr, len); return 0; } void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type, u32 offset, u32 size) { ptr->data = data; ptr->offset = offset; ptr->size = size; bpf_dynptr_set_type(ptr, type); } void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) { memset(ptr, 0, sizeof(*ptr)); } BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u64, size, u64, flags, struct bpf_dynptr_kern *, ptr) { int err; BTF_TYPE_EMIT(struct bpf_dynptr); err = bpf_dynptr_check_size(size); if (err) goto error; /* flags is currently unsupported */ if (flags) { err = -EINVAL; goto error; } bpf_dynptr_init(ptr, data, BPF_DYNPTR_TYPE_LOCAL, 0, size); return 0; error: bpf_dynptr_set_null(ptr); return err; } static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .func = bpf_dynptr_from_mem, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT | MEM_WRITE, }; static int __bpf_dynptr_read(void *dst, u64 len, const struct bpf_dynptr_kern *src, u64 offset, u64 flags) { enum bpf_dynptr_type type; int err; if (!src->data || flags) return -EINVAL; err = bpf_dynptr_check_off_len(src, offset, len); if (err) return err; type = bpf_dynptr_get_type(src); switch (type) { case BPF_DYNPTR_TYPE_LOCAL: case BPF_DYNPTR_TYPE_RINGBUF: /* Source and destination may possibly overlap, hence use memmove to * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr * pointing to overlapping PTR_TO_MAP_VALUE regions. */ memmove(dst, src->data + src->offset + offset, len); return 0; case BPF_DYNPTR_TYPE_SKB: return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len); case BPF_DYNPTR_TYPE_XDP: return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len); case BPF_DYNPTR_TYPE_SKB_META: memmove(dst, bpf_skb_meta_pointer(src->data, src->offset + offset), len); return 0; case BPF_DYNPTR_TYPE_FILE: return bpf_file_fetch_bytes(src->data, offset, dst, len); default: WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type); return -EFAULT; } } BPF_CALL_5(bpf_dynptr_read, void *, dst, u64, len, const struct bpf_dynptr_kern *, src, u64, offset, u64, flags) { return __bpf_dynptr_read(dst, len, src, offset, flags); } static const struct bpf_func_proto bpf_dynptr_read_proto = { .func = bpf_dynptr_read, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u64 offset, void *src, u64 len, u64 flags) { enum bpf_dynptr_type type; int err; if (!dst->data || __bpf_dynptr_is_rdonly(dst)) return -EINVAL; err = bpf_dynptr_check_off_len(dst, offset, len); if (err) return err; type = bpf_dynptr_get_type(dst); switch (type) { case BPF_DYNPTR_TYPE_LOCAL: case BPF_DYNPTR_TYPE_RINGBUF: if (flags) return -EINVAL; /* Source and destination may possibly overlap, hence use memmove to * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr * pointing to overlapping PTR_TO_MAP_VALUE regions. */ memmove(dst->data + dst->offset + offset, src, len); return 0; case BPF_DYNPTR_TYPE_SKB: return __bpf_skb_store_bytes(dst->data, dst->offset + offset, src, len, flags); case BPF_DYNPTR_TYPE_XDP: if (flags) return -EINVAL; return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len); case BPF_DYNPTR_TYPE_SKB_META: return __bpf_skb_meta_store_bytes(dst->data, dst->offset + offset, src, len, flags); default: WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type); return -EFAULT; } } BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u64, offset, void *, src, u64, len, u64, flags) { return __bpf_dynptr_write(dst, offset, src, len, flags); } static const struct bpf_func_proto bpf_dynptr_write_proto = { .func = bpf_dynptr_write, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, .arg5_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u64, offset, u64, len) { enum bpf_dynptr_type type; int err; if (!ptr->data) return 0; err = bpf_dynptr_check_off_len(ptr, offset, len); if (err) return 0; if (__bpf_dynptr_is_rdonly(ptr)) return 0; type = bpf_dynptr_get_type(ptr); switch (type) { case BPF_DYNPTR_TYPE_LOCAL: case BPF_DYNPTR_TYPE_RINGBUF: return (unsigned long)(ptr->data + ptr->offset + offset); case BPF_DYNPTR_TYPE_SKB: case BPF_DYNPTR_TYPE_XDP: case BPF_DYNPTR_TYPE_SKB_META: /* skb and xdp dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */ return 0; default: WARN_ONCE(true, "bpf_dynptr_data: unknown dynptr type %d\n", type); return 0; } } static const struct bpf_func_proto bpf_dynptr_data_proto = { .func = bpf_dynptr_data, .gpl_only = false, .ret_type = RET_PTR_TO_DYNPTR_MEM_OR_NULL, .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, }; const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_get_current_task_btf_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak; const struct bpf_func_proto bpf_task_pt_regs_proto __weak; const struct bpf_func_proto bpf_perf_event_read_proto __weak; const struct bpf_func_proto bpf_send_signal_proto __weak; const struct bpf_func_proto bpf_send_signal_thread_proto __weak; const struct bpf_func_proto bpf_get_task_stack_sleepable_proto __weak; const struct bpf_func_proto bpf_get_task_stack_proto __weak; const struct bpf_func_proto bpf_get_branch_snapshot_proto __weak; const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_map_lookup_elem: return &bpf_map_lookup_elem_proto; case BPF_FUNC_map_update_elem: return &bpf_map_update_elem_proto; case BPF_FUNC_map_delete_elem: return &bpf_map_delete_elem_proto; case BPF_FUNC_map_push_elem: return &bpf_map_push_elem_proto; case BPF_FUNC_map_pop_elem: return &bpf_map_pop_elem_proto; case BPF_FUNC_map_peek_elem: return &bpf_map_peek_elem_proto; case BPF_FUNC_map_lookup_percpu_elem: return &bpf_map_lookup_percpu_elem_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_raw_smp_processor_id_proto; case BPF_FUNC_get_numa_node_id: return &bpf_get_numa_node_id_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; case BPF_FUNC_ktime_get_tai_ns: return &bpf_ktime_get_tai_ns_proto; case BPF_FUNC_ringbuf_output: return &bpf_ringbuf_output_proto; case BPF_FUNC_ringbuf_reserve: return &bpf_ringbuf_reserve_proto; case BPF_FUNC_ringbuf_submit: return &bpf_ringbuf_submit_proto; case BPF_FUNC_ringbuf_discard: return &bpf_ringbuf_discard_proto; case BPF_FUNC_ringbuf_query: return &bpf_ringbuf_query_proto; case BPF_FUNC_strncmp: return &bpf_strncmp_proto; case BPF_FUNC_strtol: return &bpf_strtol_proto; case BPF_FUNC_strtoul: return &bpf_strtoul_proto; case BPF_FUNC_get_current_pid_tgid: return &bpf_get_current_pid_tgid_proto; case BPF_FUNC_get_ns_current_pid_tgid: return &bpf_get_ns_current_pid_tgid_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; default: break; } if (!bpf_token_capable(prog->aux->token, CAP_BPF)) return NULL; switch (func_id) { case BPF_FUNC_spin_lock: return &bpf_spin_lock_proto; case BPF_FUNC_spin_unlock: return &bpf_spin_unlock_proto; case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; case BPF_FUNC_per_cpu_ptr: return &bpf_per_cpu_ptr_proto; case BPF_FUNC_this_cpu_ptr: return &bpf_this_cpu_ptr_proto; case BPF_FUNC_timer_init: return &bpf_timer_init_proto; case BPF_FUNC_timer_set_callback: return &bpf_timer_set_callback_proto; case BPF_FUNC_timer_start: return &bpf_timer_start_proto; case BPF_FUNC_timer_cancel: return &bpf_timer_cancel_proto; case BPF_FUNC_kptr_xchg: return &bpf_kptr_xchg_proto; case BPF_FUNC_for_each_map_elem: return &bpf_for_each_map_elem_proto; case BPF_FUNC_loop: return &bpf_loop_proto; case BPF_FUNC_user_ringbuf_drain: return &bpf_user_ringbuf_drain_proto; case BPF_FUNC_ringbuf_reserve_dynptr: return &bpf_ringbuf_reserve_dynptr_proto; case BPF_FUNC_ringbuf_submit_dynptr: return &bpf_ringbuf_submit_dynptr_proto; case BPF_FUNC_ringbuf_discard_dynptr: return &bpf_ringbuf_discard_dynptr_proto; case BPF_FUNC_dynptr_from_mem: return &bpf_dynptr_from_mem_proto; case BPF_FUNC_dynptr_read: return &bpf_dynptr_read_proto; case BPF_FUNC_dynptr_write: return &bpf_dynptr_write_proto; case BPF_FUNC_dynptr_data: return &bpf_dynptr_data_proto; #ifdef CONFIG_CGROUPS case BPF_FUNC_cgrp_storage_get: return &bpf_cgrp_storage_get_proto; case BPF_FUNC_cgrp_storage_delete: return &bpf_cgrp_storage_delete_proto; case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; case BPF_FUNC_get_current_ancestor_cgroup_id: return &bpf_get_current_ancestor_cgroup_id_proto; case BPF_FUNC_current_task_under_cgroup: return &bpf_current_task_under_cgroup_proto; #endif #ifdef CONFIG_CGROUP_NET_CLASSID case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_curr_proto; #endif case BPF_FUNC_task_storage_get: return &bpf_task_storage_get_proto; case BPF_FUNC_task_storage_delete: return &bpf_task_storage_delete_proto; default: break; } if (!bpf_token_capable(prog->aux->token, CAP_PERFMON)) return NULL; switch (func_id) { case BPF_FUNC_trace_printk: return bpf_get_trace_printk_proto(); case BPF_FUNC_get_current_task: return &bpf_get_current_task_proto; case BPF_FUNC_get_current_task_btf: return &bpf_get_current_task_btf_proto; case BPF_FUNC_get_current_comm: return &bpf_get_current_comm_proto; case BPF_FUNC_probe_read_user: return &bpf_probe_read_user_proto; case BPF_FUNC_probe_read_kernel: return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_kernel_proto; case BPF_FUNC_probe_read_user_str: return &bpf_probe_read_user_str_proto; case BPF_FUNC_probe_read_kernel_str: return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_kernel_str_proto; case BPF_FUNC_copy_from_user: return &bpf_copy_from_user_proto; case BPF_FUNC_copy_from_user_task: return &bpf_copy_from_user_task_proto; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; case BPF_FUNC_snprintf: return &bpf_snprintf_proto; case BPF_FUNC_task_pt_regs: return &bpf_task_pt_regs_proto; case BPF_FUNC_trace_vprintk: return bpf_get_trace_vprintk_proto(); case BPF_FUNC_perf_event_read_value: return bpf_get_perf_event_read_value_proto(); case BPF_FUNC_perf_event_read: return &bpf_perf_event_read_proto; case BPF_FUNC_send_signal: return &bpf_send_signal_proto; case BPF_FUNC_send_signal_thread: return &bpf_send_signal_thread_proto; case BPF_FUNC_get_task_stack: return prog->sleepable ? &bpf_get_task_stack_sleepable_proto : &bpf_get_task_stack_proto; case BPF_FUNC_get_branch_snapshot: return &bpf_get_branch_snapshot_proto; case BPF_FUNC_find_vma: return &bpf_find_vma_proto; default: return NULL; } } EXPORT_SYMBOL_GPL(bpf_base_func_proto); void bpf_list_head_free(const struct btf_field *field, void *list_head, struct bpf_spin_lock *spin_lock) { struct list_head *head = list_head, *orig_head = list_head; BUILD_BUG_ON(sizeof(struct list_head) > sizeof(struct bpf_list_head)); BUILD_BUG_ON(__alignof__(struct list_head) > __alignof__(struct bpf_list_head)); /* Do the actual list draining outside the lock to not hold the lock for * too long, and also prevent deadlocks if tracing programs end up * executing on entry/exit of functions called inside the critical * section, and end up doing map ops that call bpf_list_head_free for * the same map value again. */ __bpf_spin_lock_irqsave(spin_lock); if (!head->next || list_empty(head)) goto unlock; head = head->next; unlock: INIT_LIST_HEAD(orig_head); __bpf_spin_unlock_irqrestore(spin_lock); while (head != orig_head) { void *obj = head; obj -= field->graph_root.node_offset; head = head->next; /* The contained type can also have resources, including a * bpf_list_head which needs to be freed. */ __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false); } } /* Like rbtree_postorder_for_each_entry_safe, but 'pos' and 'n' are * 'rb_node *', so field name of rb_node within containing struct is not * needed. * * Since bpf_rb_tree's node type has a corresponding struct btf_field with * graph_root.node_offset, it's not necessary to know field name * or type of node struct */ #define bpf_rbtree_postorder_for_each_entry_safe(pos, n, root) \ for (pos = rb_first_postorder(root); \ pos && ({ n = rb_next_postorder(pos); 1; }); \ pos = n) void bpf_rb_root_free(const struct btf_field *field, void *rb_root, struct bpf_spin_lock *spin_lock) { struct rb_root_cached orig_root, *root = rb_root; struct rb_node *pos, *n; void *obj; BUILD_BUG_ON(sizeof(struct rb_root_cached) > sizeof(struct bpf_rb_root)); BUILD_BUG_ON(__alignof__(struct rb_root_cached) > __alignof__(struct bpf_rb_root)); __bpf_spin_lock_irqsave(spin_lock); orig_root = *root; *root = RB_ROOT_CACHED; __bpf_spin_unlock_irqrestore(spin_lock); bpf_rbtree_postorder_for_each_entry_safe(pos, n, &orig_root.rb_root) { obj = pos; obj -= field->graph_root.node_offset; __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false); } } __bpf_kfunc_start_defs(); /** * bpf_obj_new() - allocate an object described by program BTF * @local_type_id__k: type ID in program BTF * @meta: verifier-supplied struct metadata * * Allocate an object of the type identified by @local_type_id__k and * initialize its special fields. BPF programs can use * bpf_core_type_id_local() to provide @local_type_id__k. The verifier * rewrites @meta; BPF programs do not set it. * * Return: Pointer to the allocated object, or %NULL on failure. */ __bpf_kfunc void *bpf_obj_new(u64 local_type_id__k, struct btf_struct_meta *meta) { u64 size = local_type_id__k; void *p; p = bpf_mem_alloc(&bpf_global_ma, size); if (!p) return NULL; if (meta) bpf_obj_init(meta->record, p); return p; } __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign) { return bpf_obj_new(local_type_id__k, meta__ign); } /** * bpf_percpu_obj_new() - allocate a percpu object described by program BTF * @local_type_id__k: type ID in program BTF * @meta: verifier-supplied struct metadata * * Allocate a percpu object of the type identified by @local_type_id__k. BPF * programs can use bpf_core_type_id_local() to provide @local_type_id__k. * The verifier rewrites @meta; BPF programs do not set it. * * Return: Pointer to the allocated percpu object, or %NULL on failure. */ __bpf_kfunc void *bpf_percpu_obj_new(u64 local_type_id__k, struct btf_struct_meta *meta) { u64 size = local_type_id__k; /* The verifier has ensured that meta must be NULL */ return bpf_mem_alloc(&bpf_global_percpu_ma, size); } __bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign) { return bpf_percpu_obj_new(local_type_id__k, meta__ign); } /* Must be called under migrate_disable(), as required by bpf_mem_free */ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu) { struct bpf_mem_alloc *ma; if (rec && rec->refcount_off >= 0 && !refcount_dec_and_test((refcount_t *)(p + rec->refcount_off))) { /* Object is refcounted and refcount_dec didn't result in 0 * refcount. Return without freeing the object */ return; } if (rec) bpf_obj_free_fields(rec, p); if (percpu) ma = &bpf_global_percpu_ma; else ma = &bpf_global_ma; bpf_mem_free_rcu(ma, p); } /** * bpf_obj_drop() - drop a previously allocated object * @p__alloc: object to free * @meta: verifier-supplied struct metadata * * Destroy special fields in @p__alloc as needed and free the object. The * verifier rewrites @meta; BPF programs do not set it. */ __bpf_kfunc void bpf_obj_drop(void *p__alloc, struct btf_struct_meta *meta) { void *p = p__alloc; __bpf_obj_drop_impl(p, meta ? meta->record : NULL, false); } __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign) { return bpf_obj_drop(p__alloc, meta__ign); } /** * bpf_percpu_obj_drop() - drop a previously allocated percpu object * @p__alloc: percpu object to free * @meta: verifier-supplied struct metadata * * Free @p__alloc. The verifier rewrites @meta; BPF programs do not set it. */ __bpf_kfunc void bpf_percpu_obj_drop(void *p__alloc, struct btf_struct_meta *meta) { /* The verifier has ensured that meta must be NULL */ bpf_mem_free_rcu(&bpf_global_percpu_ma, p__alloc); } __bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign) { bpf_percpu_obj_drop(p__alloc, meta__ign); } /** * bpf_refcount_acquire() - turn a local kptr into an owning reference * @p__refcounted_kptr: non-owning local kptr * @meta: verifier-supplied struct metadata * * Increment the refcount for @p__refcounted_kptr. The verifier rewrites * @meta; BPF programs do not set it. * * Return: Owning reference to @p__refcounted_kptr, or %NULL on failure. */ __bpf_kfunc void *bpf_refcount_acquire(void *p__refcounted_kptr, struct btf_struct_meta *meta) { struct bpf_refcount *ref; /* Could just cast directly to refcount_t *, but need some code using * bpf_refcount type so that it is emitted in vmlinux BTF */ ref = (struct bpf_refcount *)(p__refcounted_kptr + meta->record->refcount_off); if (!refcount_inc_not_zero((refcount_t *)ref)) return NULL; /* Verifier strips KF_RET_NULL if input is owned ref, see is_kfunc_ret_null * in verifier.c */ return (void *)p__refcounted_kptr; } __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign) { return bpf_refcount_acquire(p__refcounted_kptr, meta__ign); } static int __bpf_list_add(struct bpf_list_node_kern *node, struct bpf_list_head *head, bool tail, struct btf_record *rec, u64 off) { struct list_head *n = &node->list_head, *h = (void *)head; /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't * called on its fields, so init here */ if (unlikely(!h->next)) INIT_LIST_HEAD(h); /* node->owner != NULL implies !list_empty(n), no need to separately * check the latter */ if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) { /* Only called from BPF prog, no need to migrate_disable */ __bpf_obj_drop_impl((void *)n - off, rec, false); return -EINVAL; } tail ? list_add_tail(n, h) : list_add(n, h); WRITE_ONCE(node->owner, head); return 0; } /** * bpf_list_push_front() - add a node to the front of a BPF linked list * @head: list head * @node: node to insert * @meta: verifier-supplied struct metadata * @off: verifier-supplied offset of @node within the containing object * * Insert @node at the front of @head. The verifier rewrites @meta and @off; * BPF programs do not set them. * * Return: 0 on success, or %-EINVAL if @node is already linked. */ __bpf_kfunc int bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node, struct btf_struct_meta *meta, u64 off) { struct bpf_list_node_kern *n = (void *)node; return __bpf_list_add(n, head, false, meta ? meta->record : NULL, off); } __bpf_kfunc int bpf_list_push_front_impl(struct bpf_list_head *head, struct bpf_list_node *node, void *meta__ign, u64 off) { return bpf_list_push_front(head, node, meta__ign, off); } /** * bpf_list_push_back() - add a node to the back of a BPF linked list * @head: list head * @node: node to insert * @meta: verifier-supplied struct metadata * @off: verifier-supplied offset of @node within the containing object * * Insert @node at the back of @head. The verifier rewrites @meta and @off; * BPF programs do not set them. * * Return: 0 on success, or %-EINVAL if @node is already linked. */ __bpf_kfunc int bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node, struct btf_struct_meta *meta, u64 off) { struct bpf_list_node_kern *n = (void *)node; return __bpf_list_add(n, head, true, meta ? meta->record : NULL, off); } __bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head, struct bpf_list_node *node, void *meta__ign, u64 off) { return bpf_list_push_back(head, node, meta__ign, off); } static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail) { struct list_head *n, *h = (void *)head; struct bpf_list_node_kern *node; /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't * called on its fields, so init here */ if (unlikely(!h->next)) INIT_LIST_HEAD(h); if (list_empty(h)) return NULL; n = tail ? h->prev : h->next; node = container_of(n, struct bpf_list_node_kern, list_head); if (WARN_ON_ONCE(READ_ONCE(node->owner) != head)) return NULL; list_del_init(n); WRITE_ONCE(node->owner, NULL); return (struct bpf_list_node *)n; } __bpf_kfunc struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) { return __bpf_list_del(head, false); } __bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) { return __bpf_list_del(head, true); } __bpf_kfunc struct bpf_list_node *bpf_list_front(struct bpf_list_head *head) { struct list_head *h = (struct list_head *)head; if (list_empty(h) || unlikely(!h->next)) return NULL; return (struct bpf_list_node *)h->next; } __bpf_kfunc struct bpf_list_node *bpf_list_back(struct bpf_list_head *head) { struct list_head *h = (struct list_head *)head; if (list_empty(h) || unlikely(!h->next)) return NULL; return (struct bpf_list_node *)h->prev; } __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, struct bpf_rb_node *node) { struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node; struct rb_root_cached *r = (struct rb_root_cached *)root; struct rb_node *n = &node_internal->rb_node; /* node_internal->owner != root implies either RB_EMPTY_NODE(n) or * n is owned by some other tree. No need to check RB_EMPTY_NODE(n) */ if (READ_ONCE(node_internal->owner) != root) return NULL; rb_erase_cached(n, r); RB_CLEAR_NODE(n); WRITE_ONCE(node_internal->owner, NULL); return (struct bpf_rb_node *)n; } /* Need to copy rbtree_add_cached's logic here because our 'less' is a BPF * program */ static int __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node_kern *node, void *less, struct btf_record *rec, u64 off) { struct rb_node **link = &((struct rb_root_cached *)root)->rb_root.rb_node; struct rb_node *parent = NULL, *n = &node->rb_node; bpf_callback_t cb = (bpf_callback_t)less; bool leftmost = true; /* node->owner != NULL implies !RB_EMPTY_NODE(n), no need to separately * check the latter */ if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) { /* Only called from BPF prog, no need to migrate_disable */ __bpf_obj_drop_impl((void *)n - off, rec, false); return -EINVAL; } while (*link) { parent = *link; if (cb((uintptr_t)node, (uintptr_t)parent, 0, 0, 0)) { link = &parent->rb_left; } else { link = &parent->rb_right; leftmost = false; } } rb_link_node(n, parent, link); rb_insert_color_cached(n, (struct rb_root_cached *)root, leftmost); WRITE_ONCE(node->owner, root); return 0; } /** * bpf_rbtree_add() - add a node to a BPF rbtree * @root: tree root * @node: node to insert * @less: comparator used to order nodes * @meta: verifier-supplied struct metadata * @off: verifier-supplied offset of @node within the containing object * * Insert @node into @root using @less. The verifier rewrites @meta and @off; * BPF programs do not set them. * * Return: 0 on success, or %-EINVAL if @node is already linked in a tree. */ __bpf_kfunc int bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node, bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b), struct btf_struct_meta *meta, u64 off) { struct bpf_rb_node_kern *n = (void *)node; return __bpf_rbtree_add(root, n, (void *)less, meta ? meta->record : NULL, off); } __bpf_kfunc int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node, bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b), void *meta__ign, u64 off) { return bpf_rbtree_add(root, node, less, meta__ign, off); } __bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) { struct rb_root_cached *r = (struct rb_root_cached *)root; return (struct bpf_rb_node *)rb_first_cached(r); } __bpf_kfunc struct bpf_rb_node *bpf_rbtree_root(struct bpf_rb_root *root) { struct rb_root_cached *r = (struct rb_root_cached *)root; return (struct bpf_rb_node *)r->rb_root.rb_node; } __bpf_kfunc struct bpf_rb_node *bpf_rbtree_left(struct bpf_rb_root *root, struct bpf_rb_node *node) { struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node; if (READ_ONCE(node_internal->owner) != root) return NULL; return (struct bpf_rb_node *)node_internal->rb_node.rb_left; } __bpf_kfunc struct bpf_rb_node *bpf_rbtree_right(struct bpf_rb_root *root, struct bpf_rb_node *node) { struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node; if (READ_ONCE(node_internal->owner) != root) return NULL; return (struct bpf_rb_node *)node_internal->rb_node.rb_right; } /** * bpf_task_acquire - Acquire a reference to a task. A task acquired by this * kfunc which is not stored in a map as a kptr, must be released by calling * bpf_task_release(). * @p: The task on which a reference is being acquired. */ __bpf_kfunc struct task_struct *bpf_task_acquire(struct task_struct *p) { if (refcount_inc_not_zero(&p->rcu_users)) return p; return NULL; } /** * bpf_task_release - Release the reference acquired on a task. * @p: The task on which a reference is being released. */ __bpf_kfunc void bpf_task_release(struct task_struct *p) { put_task_struct_rcu_user(p); } __bpf_kfunc void bpf_task_release_dtor(void *p) { put_task_struct_rcu_user(p); } CFI_NOSEAL(bpf_task_release_dtor); #ifdef CONFIG_CGROUPS /** * bpf_cgroup_acquire - Acquire a reference to a cgroup. A cgroup acquired by * this kfunc which is not stored in a map as a kptr, must be released by * calling bpf_cgroup_release(). * @cgrp: The cgroup on which a reference is being acquired. */ __bpf_kfunc struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp) { return cgroup_tryget(cgrp) ? cgrp : NULL; } /** * bpf_cgroup_release - Release the reference acquired on a cgroup. * If this kfunc is invoked in an RCU read region, the cgroup is guaranteed to * not be freed until the current grace period has ended, even if its refcount * drops to 0. * @cgrp: The cgroup on which a reference is being released. */ __bpf_kfunc void bpf_cgroup_release(struct cgroup *cgrp) { cgroup_put(cgrp); } __bpf_kfunc void bpf_cgroup_release_dtor(void *cgrp) { cgroup_put(cgrp); } CFI_NOSEAL(bpf_cgroup_release_dtor); /** * bpf_cgroup_ancestor - Perform a lookup on an entry in a cgroup's ancestor * array. A cgroup returned by this kfunc which is not subsequently stored in a * map, must be released by calling bpf_cgroup_release(). * @cgrp: The cgroup for which we're performing a lookup. * @level: The level of ancestor to look up. */ __bpf_kfunc struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) { struct cgroup *ancestor; if (level > cgrp->level || level < 0) return NULL; /* cgrp's refcnt could be 0 here, but ancestors can still be accessed */ ancestor = cgrp->ancestors[level]; if (!cgroup_tryget(ancestor)) return NULL; return ancestor; } /** * bpf_cgroup_from_id - Find a cgroup from its ID. A cgroup returned by this * kfunc which is not subsequently stored in a map, must be released by calling * bpf_cgroup_release(). * @cgid: cgroup id. */ __bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid) { struct cgroup *cgrp; cgrp = __cgroup_get_from_id(cgid); if (IS_ERR(cgrp)) return NULL; return cgrp; } /** * bpf_task_under_cgroup - wrap task_under_cgroup_hierarchy() as a kfunc, test * task's membership of cgroup ancestry. * @task: the task to be tested * @ancestor: possible ancestor of @task's cgroup * * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor. * It follows all the same rules as cgroup_is_descendant, and only applies * to the default hierarchy. */ __bpf_kfunc long bpf_task_under_cgroup(struct task_struct *task, struct cgroup *ancestor) { long ret; rcu_read_lock(); ret = task_under_cgroup_hierarchy(task, ancestor); rcu_read_unlock(); return ret; } BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct cgroup *cgrp; if (unlikely(idx >= array->map.max_entries)) return -E2BIG; cgrp = READ_ONCE(array->ptrs[idx]); if (unlikely(!cgrp)) return -EAGAIN; return task_under_cgroup_hierarchy(current, cgrp); } const struct bpf_func_proto bpf_current_task_under_cgroup_proto = { .func = bpf_current_task_under_cgroup, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_ANYTHING, }; /** * bpf_task_get_cgroup1 - Acquires the associated cgroup of a task within a * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its * hierarchy ID. * @task: The target task * @hierarchy_id: The ID of a cgroup1 hierarchy * * On success, the cgroup is returen. On failure, NULL is returned. */ __bpf_kfunc struct cgroup * bpf_task_get_cgroup1(struct task_struct *task, int hierarchy_id) { struct cgroup *cgrp = task_get_cgroup1(task, hierarchy_id); if (IS_ERR(cgrp)) return NULL; return cgrp; } #endif /* CONFIG_CGROUPS */ /** * bpf_task_from_pid - Find a struct task_struct from its pid by looking it up * in the root pid namespace idr. If a task is returned, it must either be * stored in a map, or released with bpf_task_release(). * @pid: The pid of the task being looked up. */ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid) { struct task_struct *p; rcu_read_lock(); p = find_task_by_pid_ns(pid, &init_pid_ns); if (p) p = bpf_task_acquire(p); rcu_read_unlock(); return p; } /** * bpf_task_from_vpid - Find a struct task_struct from its vpid by looking it up * in the pid namespace of the current task. If a task is returned, it must * either be stored in a map, or released with bpf_task_release(). * @vpid: The vpid of the task being looked up. */ __bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid) { struct task_struct *p; rcu_read_lock(); p = find_task_by_vpid(vpid); if (p) p = bpf_task_acquire(p); rcu_read_unlock(); return p; } /** * bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data. * @p: The dynptr whose data slice to retrieve * @offset: Offset into the dynptr * @buffer__nullable: User-provided buffer to copy contents into. May be NULL * @buffer__szk: Size (in bytes) of the buffer if present. This is the * length of the requested slice. This must be a constant. * * For non-skb and non-xdp type dynptrs, there is no difference between * bpf_dynptr_slice and bpf_dynptr_data. * * If buffer__nullable is NULL, the call will fail if buffer_opt was needed. * * If the intention is to write to the data slice, please use * bpf_dynptr_slice_rdwr. * * The user must check that the returned pointer is not null before using it. * * Please note that in the case of skb and xdp dynptrs, bpf_dynptr_slice * does not change the underlying packet data pointers, so a call to * bpf_dynptr_slice will not invalidate any ctx->data/data_end pointers in * the bpf program. * * Return: NULL if the call failed (eg invalid dynptr), pointer to a read-only * data slice (can be either direct pointer to the data or a pointer to the user * provided buffer, with its contents containing the data, if unable to obtain * direct pointer) */ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset, void *buffer__nullable, u64 buffer__szk) { const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; enum bpf_dynptr_type type; u64 len = buffer__szk; int err; if (!ptr->data) return NULL; err = bpf_dynptr_check_off_len(ptr, offset, len); if (err) return NULL; type = bpf_dynptr_get_type(ptr); switch (type) { case BPF_DYNPTR_TYPE_LOCAL: case BPF_DYNPTR_TYPE_RINGBUF: return ptr->data + ptr->offset + offset; case BPF_DYNPTR_TYPE_SKB: if (buffer__nullable) return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer__nullable); else return skb_pointer_if_linear(ptr->data, ptr->offset + offset, len); case BPF_DYNPTR_TYPE_XDP: { void *xdp_ptr = bpf_xdp_pointer(ptr->data, ptr->offset + offset, len); if (!IS_ERR_OR_NULL(xdp_ptr)) return xdp_ptr; if (!buffer__nullable) return NULL; bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__nullable, len, false); return buffer__nullable; } case BPF_DYNPTR_TYPE_SKB_META: return bpf_skb_meta_pointer(ptr->data, ptr->offset + offset); case BPF_DYNPTR_TYPE_FILE: err = bpf_file_fetch_bytes(ptr->data, offset, buffer__nullable, buffer__szk); return err ? NULL : buffer__nullable; default: WARN_ONCE(true, "unknown dynptr type %d\n", type); return NULL; } } /** * bpf_dynptr_slice_rdwr() - Obtain a writable pointer to the dynptr data. * @p: The dynptr whose data slice to retrieve * @offset: Offset into the dynptr * @buffer__nullable: User-provided buffer to copy contents into. May be NULL * @buffer__szk: Size (in bytes) of the buffer if present. This is the * length of the requested slice. This must be a constant. * * For non-skb and non-xdp type dynptrs, there is no difference between * bpf_dynptr_slice and bpf_dynptr_data. * * If buffer__nullable is NULL, the call will fail if buffer_opt was needed. * * The returned pointer is writable and may point to either directly the dynptr * data at the requested offset or to the buffer if unable to obtain a direct * data pointer to (example: the requested slice is to the paged area of an skb * packet). In the case where the returned pointer is to the buffer, the user * is responsible for persisting writes through calling bpf_dynptr_write(). This * usually looks something like this pattern: * * struct eth_hdr *eth = bpf_dynptr_slice_rdwr(&dynptr, 0, buffer, sizeof(buffer)); * if (!eth) * return TC_ACT_SHOT; * * // mutate eth header // * * if (eth == buffer) * bpf_dynptr_write(&ptr, 0, buffer, sizeof(buffer), 0); * * Please note that, as in the example above, the user must check that the * returned pointer is not null before using it. * * Please also note that in the case of skb and xdp dynptrs, bpf_dynptr_slice_rdwr * does not change the underlying packet data pointers, so a call to * bpf_dynptr_slice_rdwr will not invalidate any ctx->data/data_end pointers in * the bpf program. * * Return: NULL if the call failed (eg invalid dynptr), pointer to a * data slice (can be either direct pointer to the data or a pointer to the user * provided buffer, with its contents containing the data, if unable to obtain * direct pointer) */ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset, void *buffer__nullable, u64 buffer__szk) { const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; if (!ptr->data || __bpf_dynptr_is_rdonly(ptr)) return NULL; /* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice. * * For skb-type dynptrs, it is safe to write into the returned pointer * if the bpf program allows skb data writes. There are two possibilities * that may occur when calling bpf_dynptr_slice_rdwr: * * 1) The requested slice is in the head of the skb. In this case, the * returned pointer is directly to skb data, and if the skb is cloned, the * verifier will have uncloned it (see bpf_unclone_prologue()) already. * The pointer can be directly written into. * * 2) Some portion of the requested slice is in the paged buffer area. * In this case, the requested data will be copied out into the buffer * and the returned pointer will be a pointer to the buffer. The skb * will not be pulled. To persist the write, the user will need to call * bpf_dynptr_write(), which will pull the skb and commit the write. * * Similarly for xdp programs, if the requested slice is not across xdp * fragments, then a direct pointer will be returned, otherwise the data * will be copied out into the buffer and the user will need to call * bpf_dynptr_write() to commit changes. */ return bpf_dynptr_slice(p, offset, buffer__nullable, buffer__szk); } __bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u64 start, u64 end) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; u64 size; if (!ptr->data || start > end) return -EINVAL; size = __bpf_dynptr_size(ptr); if (start > size || end > size) return -ERANGE; bpf_dynptr_advance_offset(ptr, start); bpf_dynptr_set_size(ptr, end - start); return 0; } __bpf_kfunc bool bpf_dynptr_is_null(const struct bpf_dynptr *p) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; return !ptr->data; } __bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; if (!ptr->data) return false; return __bpf_dynptr_is_rdonly(ptr); } __bpf_kfunc u64 bpf_dynptr_size(const struct bpf_dynptr *p) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; if (!ptr->data) return -EINVAL; return __bpf_dynptr_size(ptr); } __bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p, struct bpf_dynptr *clone__uninit) { struct bpf_dynptr_kern *clone = (struct bpf_dynptr_kern *)clone__uninit; struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; if (!ptr->data) { bpf_dynptr_set_null(clone); return -EINVAL; } *clone = *ptr; return 0; } /** * bpf_dynptr_copy() - Copy data from one dynptr to another. * @dst_ptr: Destination dynptr - where data should be copied to * @dst_off: Offset into the destination dynptr * @src_ptr: Source dynptr - where data should be copied from * @src_off: Offset into the source dynptr * @size: Length of the data to copy from source to destination * * Copies data from source dynptr to destination dynptr. * Returns 0 on success; negative error, otherwise. */ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u64 dst_off, struct bpf_dynptr *src_ptr, u64 src_off, u64 size) { struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr; struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr; void *src_slice, *dst_slice; char buf[256]; u64 off; src_slice = bpf_dynptr_slice(src_ptr, src_off, NULL, size); dst_slice = bpf_dynptr_slice_rdwr(dst_ptr, dst_off, NULL, size); if (src_slice && dst_slice) { memmove(dst_slice, src_slice, size); return 0; } if (src_slice) return __bpf_dynptr_write(dst, dst_off, src_slice, size, 0); if (dst_slice) return __bpf_dynptr_read(dst_slice, size, src, src_off, 0); if (bpf_dynptr_check_off_len(dst, dst_off, size) || bpf_dynptr_check_off_len(src, src_off, size)) return -E2BIG; off = 0; while (off < size) { u64 chunk_sz = min_t(u64, sizeof(buf), size - off); int err; err = __bpf_dynptr_read(buf, chunk_sz, src, src_off + off, 0); if (err) return err; err = __bpf_dynptr_write(dst, dst_off + off, buf, chunk_sz, 0); if (err) return err; off += chunk_sz; } return 0; } /** * bpf_dynptr_memset() - Fill dynptr memory with a constant byte. * @p: Destination dynptr - where data will be filled * @offset: Offset into the dynptr to start filling from * @size: Number of bytes to fill * @val: Constant byte to fill the memory with * * Fills the @size bytes of the memory area pointed to by @p * at @offset with the constant byte @val. * Returns 0 on success; negative error, otherwise. */ __bpf_kfunc int bpf_dynptr_memset(struct bpf_dynptr *p, u64 offset, u64 size, u8 val) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; u64 chunk_sz, write_off; char buf[256]; void* slice; int err; slice = bpf_dynptr_slice_rdwr(p, offset, NULL, size); if (likely(slice)) { memset(slice, val, size); return 0; } if (__bpf_dynptr_is_rdonly(ptr)) return -EINVAL; err = bpf_dynptr_check_off_len(ptr, offset, size); if (err) return err; /* Non-linear data under the dynptr, write from a local buffer */ chunk_sz = min_t(u64, sizeof(buf), size); memset(buf, val, chunk_sz); for (write_off = 0; write_off < size; write_off += chunk_sz) { chunk_sz = min_t(u64, sizeof(buf), size - write_off); err = __bpf_dynptr_write(ptr, offset + write_off, buf, chunk_sz, 0); if (err) return err; } return 0; } __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj) { return obj; } __bpf_kfunc void *bpf_rdonly_cast(const void *obj__ign, u32 btf_id__k) { return (void *)obj__ign; } __bpf_kfunc void bpf_rcu_read_lock(void) { rcu_read_lock(); } __bpf_kfunc void bpf_rcu_read_unlock(void) { rcu_read_unlock(); } struct bpf_throw_ctx { struct bpf_prog_aux *aux; u64 sp; u64 bp; int cnt; }; static bool bpf_stack_walker(void *cookie, u64 ip, u64 sp, u64 bp) { struct bpf_throw_ctx *ctx = cookie; struct bpf_prog *prog; /* * The RCU read lock is held to safely traverse the latch tree, but we * don't need its protection when accessing the prog, since it has an * active stack frame on the current stack trace, and won't disappear. */ rcu_read_lock(); prog = bpf_prog_ksym_find(ip); rcu_read_unlock(); if (!prog) return !ctx->cnt; ctx->cnt++; if (bpf_is_subprog(prog)) return true; ctx->aux = prog->aux; ctx->sp = sp; ctx->bp = bp; return false; } __bpf_kfunc void bpf_throw(u64 cookie) { struct bpf_throw_ctx ctx = {}; arch_bpf_stack_walk(bpf_stack_walker, &ctx); WARN_ON_ONCE(!ctx.aux); if (ctx.aux) WARN_ON_ONCE(!ctx.aux->exception_boundary); WARN_ON_ONCE(!ctx.bp); WARN_ON_ONCE(!ctx.cnt); /* Prevent KASAN false positives for CONFIG_KASAN_STACK by unpoisoning * deeper stack depths than ctx.sp as we do not return from bpf_throw, * which skips compiler generated instrumentation to do the same. */ kasan_unpoison_task_stack_below((void *)(long)ctx.sp); ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp, 0, 0); WARN(1, "A call to BPF exception callback should never return\n"); } __bpf_kfunc int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) { struct bpf_async_kern *async = (struct bpf_async_kern *)wq; struct bpf_map *map = p__map; BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_wq)); BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_wq)); if (flags) return -EINVAL; return __bpf_async_init(async, map, flags, BPF_ASYNC_TYPE_WQ); } __bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) { struct bpf_async_kern *async = (struct bpf_async_kern *)wq; struct bpf_work *w; if (flags) return -EINVAL; w = READ_ONCE(async->work); if (!w || !READ_ONCE(w->cb.prog)) return -EINVAL; if (!refcount_inc_not_zero(&w->cb.refcnt)) return -ENOENT; if (!defer_timer_wq_op()) { schedule_work(&w->work); bpf_async_refcount_put(&w->cb); return 0; } else { return bpf_async_schedule_op(&w->cb, BPF_ASYNC_START, 0, 0); } } __bpf_kfunc int bpf_wq_set_callback(struct bpf_wq *wq, int (callback_fn)(void *map, int *key, void *value), unsigned int flags, struct bpf_prog_aux *aux) { struct bpf_async_kern *async = (struct bpf_async_kern *)wq; if (flags) return -EINVAL; return __bpf_async_set_callback(async, callback_fn, aux->prog); } __bpf_kfunc void bpf_preempt_disable(void) { preempt_disable(); } __bpf_kfunc void bpf_preempt_enable(void) { preempt_enable(); } struct bpf_iter_bits { __u64 __opaque[2]; } __aligned(8); #define BITS_ITER_NR_WORDS_MAX 511 struct bpf_iter_bits_kern { union { __u64 *bits; __u64 bits_copy; }; int nr_bits; int bit; } __aligned(8); /* On 64-bit hosts, unsigned long and u64 have the same size, so passing * a u64 pointer and an unsigned long pointer to find_next_bit() will * return the same result, as both point to the same 8-byte area. * * For 32-bit little-endian hosts, using a u64 pointer or unsigned long * pointer also makes no difference. This is because the first iterated * unsigned long is composed of bits 0-31 of the u64 and the second unsigned * long is composed of bits 32-63 of the u64. * * However, for 32-bit big-endian hosts, this is not the case. The first * iterated unsigned long will be bits 32-63 of the u64, so swap these two * ulong values within the u64. */ static void swap_ulong_in_u64(u64 *bits, unsigned int nr) { #if (BITS_PER_LONG == 32) && defined(__BIG_ENDIAN) unsigned int i; for (i = 0; i < nr; i++) bits[i] = (bits[i] >> 32) | ((u64)(u32)bits[i] << 32); #endif } /** * bpf_iter_bits_new() - Initialize a new bits iterator for a given memory area * @it: The new bpf_iter_bits to be created * @unsafe_ptr__ign: A pointer pointing to a memory area to be iterated over * @nr_words: The size of the specified memory area, measured in 8-byte units. * The maximum value of @nr_words is @BITS_ITER_NR_WORDS_MAX. This limit may be * further reduced by the BPF memory allocator implementation. * * This function initializes a new bpf_iter_bits structure for iterating over * a memory area which is specified by the @unsafe_ptr__ign and @nr_words. It * copies the data of the memory area to the newly created bpf_iter_bits @it for * subsequent iteration operations. * * On success, 0 is returned. On failure, ERR is returned. */ __bpf_kfunc int bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_words) { struct bpf_iter_bits_kern *kit = (void *)it; u32 nr_bytes = nr_words * sizeof(u64); u32 nr_bits = BYTES_TO_BITS(nr_bytes); int err; BUILD_BUG_ON(sizeof(struct bpf_iter_bits_kern) != sizeof(struct bpf_iter_bits)); BUILD_BUG_ON(__alignof__(struct bpf_iter_bits_kern) != __alignof__(struct bpf_iter_bits)); kit->nr_bits = 0; kit->bits_copy = 0; kit->bit = -1; if (!unsafe_ptr__ign || !nr_words) return -EINVAL; if (nr_words > BITS_ITER_NR_WORDS_MAX) return -E2BIG; /* Optimization for u64 mask */ if (nr_bits == 64) { err = bpf_probe_read_kernel_common(&kit->bits_copy, nr_bytes, unsafe_ptr__ign); if (err) return -EFAULT; swap_ulong_in_u64(&kit->bits_copy, nr_words); kit->nr_bits = nr_bits; return 0; } if (bpf_mem_alloc_check_size(false, nr_bytes)) return -E2BIG; /* Fallback to memalloc */ kit->bits = bpf_mem_alloc(&bpf_global_ma, nr_bytes); if (!kit->bits) return -ENOMEM; err = bpf_probe_read_kernel_common(kit->bits, nr_bytes, unsafe_ptr__ign); if (err) { bpf_mem_free(&bpf_global_ma, kit->bits); return err; } swap_ulong_in_u64(kit->bits, nr_words); kit->nr_bits = nr_bits; return 0; } /** * bpf_iter_bits_next() - Get the next bit in a bpf_iter_bits * @it: The bpf_iter_bits to be checked * * This function returns a pointer to a number representing the value of the * next bit in the bits. * * If there are no further bits available, it returns NULL. */ __bpf_kfunc int *bpf_iter_bits_next(struct bpf_iter_bits *it) { struct bpf_iter_bits_kern *kit = (void *)it; int bit = kit->bit, nr_bits = kit->nr_bits; const void *bits; if (!nr_bits || bit >= nr_bits) return NULL; bits = nr_bits == 64 ? &kit->bits_copy : kit->bits; bit = find_next_bit(bits, nr_bits, bit + 1); if (bit >= nr_bits) { kit->bit = bit; return NULL; } kit->bit = bit; return &kit->bit; } /** * bpf_iter_bits_destroy() - Destroy a bpf_iter_bits * @it: The bpf_iter_bits to be destroyed * * Destroy the resource associated with the bpf_iter_bits. */ __bpf_kfunc void bpf_iter_bits_destroy(struct bpf_iter_bits *it) { struct bpf_iter_bits_kern *kit = (void *)it; if (kit->nr_bits <= 64) return; bpf_mem_free(&bpf_global_ma, kit->bits); } /** * bpf_copy_from_user_str() - Copy a string from an unsafe user address * @dst: Destination address, in kernel space. This buffer must be * at least @dst__sz bytes long. * @dst__sz: Maximum number of bytes to copy, includes the trailing NUL. * @unsafe_ptr__ign: Source address, in user space. * @flags: The only supported flag is BPF_F_PAD_ZEROS * * Copies a NUL-terminated string from userspace to BPF space. If user string is * too long this will still ensure zero termination in the dst buffer unless * buffer size is 0. * * If BPF_F_PAD_ZEROS flag is set, memset the tail of @dst to 0 on success and * memset all of @dst on failure. */ __bpf_kfunc int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void __user *unsafe_ptr__ign, u64 flags) { int ret; if (unlikely(flags & ~BPF_F_PAD_ZEROS)) return -EINVAL; if (unlikely(!dst__sz)) return 0; ret = strncpy_from_user(dst, unsafe_ptr__ign, dst__sz - 1); if (ret < 0) { if (flags & BPF_F_PAD_ZEROS) memset((char *)dst, 0, dst__sz); return ret; } if (flags & BPF_F_PAD_ZEROS) memset((char *)dst + ret, 0, dst__sz - ret); else ((char *)dst)[ret] = '\0'; return ret + 1; } /** * bpf_copy_from_user_task_str() - Copy a string from an task's address space * @dst: Destination address, in kernel space. This buffer must be * at least @dst__sz bytes long. * @dst__sz: Maximum number of bytes to copy, includes the trailing NUL. * @unsafe_ptr__ign: Source address in the task's address space. * @tsk: The task whose address space will be used * @flags: The only supported flag is BPF_F_PAD_ZEROS * * Copies a NUL terminated string from a task's address space to @dst__sz * buffer. If user string is too long this will still ensure zero termination * in the @dst__sz buffer unless buffer size is 0. * * If BPF_F_PAD_ZEROS flag is set, memset the tail of @dst__sz to 0 on success * and memset all of @dst__sz on failure. * * Return: The number of copied bytes on success including the NUL terminator. * A negative error code on failure. */ __bpf_kfunc int bpf_copy_from_user_task_str(void *dst, u32 dst__sz, const void __user *unsafe_ptr__ign, struct task_struct *tsk, u64 flags) { int ret; if (unlikely(flags & ~BPF_F_PAD_ZEROS)) return -EINVAL; if (unlikely(dst__sz == 0)) return 0; ret = copy_remote_vm_str(tsk, (unsigned long)unsafe_ptr__ign, dst, dst__sz, 0); if (ret < 0) { if (flags & BPF_F_PAD_ZEROS) memset(dst, 0, dst__sz); return ret; } if (flags & BPF_F_PAD_ZEROS) memset(dst + ret, 0, dst__sz - ret); return ret + 1; } /* Keep unsinged long in prototype so that kfunc is usable when emitted to * vmlinux.h in BPF programs directly, but note that while in BPF prog, the * unsigned long always points to 8-byte region on stack, the kernel may only * read and write the 4-bytes on 32-bit. */ __bpf_kfunc void bpf_local_irq_save(unsigned long *flags__irq_flag) { local_irq_save(*flags__irq_flag); } __bpf_kfunc void bpf_local_irq_restore(unsigned long *flags__irq_flag) { local_irq_restore(*flags__irq_flag); } __bpf_kfunc void __bpf_trap(void) { } /* * Kfuncs for string operations. * * Since strings are not necessarily %NUL-terminated, we cannot directly call * in-kernel implementations. Instead, we open-code the implementations using * __get_kernel_nofault instead of plain dereference to make them safe. */ static int __bpf_strncasecmp(const char *s1, const char *s2, bool ignore_case, size_t len) { char c1, c2; int i; if (!copy_from_kernel_nofault_allowed(s1, 1) || !copy_from_kernel_nofault_allowed(s2, 1)) { return -ERANGE; } guard(pagefault)(); for (i = 0; i < len && i < XATTR_SIZE_MAX; i++) { __get_kernel_nofault(&c1, s1, char, err_out); __get_kernel_nofault(&c2, s2, char, err_out); if (ignore_case) { c1 = tolower(c1); c2 = tolower(c2); } if (c1 != c2) return c1 < c2 ? -1 : 1; if (c1 == '\0') return 0; s1++; s2++; } return i == XATTR_SIZE_MAX ? -E2BIG : 0; err_out: return -EFAULT; } /** * bpf_strcmp - Compare two strings * @s1__ign: One string * @s2__ign: Another string * * Return: * * %0 - Strings are equal * * %-1 - @s1__ign is smaller * * %1 - @s2__ign is smaller * * %-EFAULT - Cannot read one of the strings * * %-E2BIG - One of strings is too large * * %-ERANGE - One of strings is outside of kernel address space */ __bpf_kfunc int bpf_strcmp(const char *s1__ign, const char *s2__ign) { return __bpf_strncasecmp(s1__ign, s2__ign, false, XATTR_SIZE_MAX); } /** * bpf_strcasecmp - Compare two strings, ignoring the case of the characters * @s1__ign: One string * @s2__ign: Another string * * Return: * * %0 - Strings are equal * * %-1 - @s1__ign is smaller * * %1 - @s2__ign is smaller * * %-EFAULT - Cannot read one of the strings * * %-E2BIG - One of strings is too large * * %-ERANGE - One of strings is outside of kernel address space */ __bpf_kfunc int bpf_strcasecmp(const char *s1__ign, const char *s2__ign) { return __bpf_strncasecmp(s1__ign, s2__ign, true, XATTR_SIZE_MAX); } /* * bpf_strncasecmp - Compare two length-limited strings, ignoring case * @s1__ign: One string * @s2__ign: Another string * @len: The maximum number of characters to compare * * Return: * * %0 - Strings are equal * * %-1 - @s1__ign is smaller * * %1 - @s2__ign is smaller * * %-EFAULT - Cannot read one of the strings * * %-E2BIG - One of strings is too large * * %-ERANGE - One of strings is outside of kernel address space */ __bpf_kfunc int bpf_strncasecmp(const char *s1__ign, const char *s2__ign, size_t len) { return __bpf_strncasecmp(s1__ign, s2__ign, true, len); } /** * bpf_strnchr - Find a character in a length limited string * @s__ign: The string to be searched * @count: The number of characters to be searched * @c: The character to search for * * Note that the %NUL-terminator is considered part of the string, and can * be searched for. * * Return: * * >=0 - Index of the first occurrence of @c within @s__ign * * %-ENOENT - @c not found in the first @count characters of @s__ign * * %-EFAULT - Cannot read @s__ign * * %-E2BIG - @s__ign is too large * * %-ERANGE - @s__ign is outside of kernel address space */ __bpf_kfunc int bpf_strnchr(const char *s__ign, size_t count, char c) { char sc; int i; if (!copy_from_kernel_nofault_allowed(s__ign, 1)) return -ERANGE; guard(pagefault)(); for (i = 0; i < count && i < XATTR_SIZE_MAX; i++) { __get_kernel_nofault(&sc, s__ign, char, err_out); if (sc == c) return i; if (sc == '\0') return -ENOENT; s__ign++; } return i == XATTR_SIZE_MAX ? -E2BIG : -ENOENT; err_out: return -EFAULT; } /** * bpf_strchr - Find the first occurrence of a character in a string * @s__ign: The string to be searched * @c: The character to search for * * Note that the %NUL-terminator is considered part of the string, and can * be searched for. * * Return: * * >=0 - The index of the first occurrence of @c within @s__ign * * %-ENOENT - @c not found in @s__ign * * %-EFAULT - Cannot read @s__ign * * %-E2BIG - @s__ign is too large * * %-ERANGE - @s__ign is outside of kernel address space */ __bpf_kfunc int bpf_strchr(const char *s__ign, char c) { return bpf_strnchr(s__ign, XATTR_SIZE_MAX, c); } /** * bpf_strchrnul - Find and return a character in a string, or end of string * @s__ign: The string to be searched * @c: The character to search for * * Return: * * >=0 - Index of the first occurrence of @c within @s__ign or index of * the null byte at the end of @s__ign when @c is not found * * %-EFAULT - Cannot read @s__ign * * %-E2BIG - @s__ign is too large * * %-ERANGE - @s__ign is outside of kernel address space */ __bpf_kfunc int bpf_strchrnul(const char *s__ign, char c) { char sc; int i; if (!copy_from_kernel_nofault_allowed(s__ign, 1)) return -ERANGE; guard(pagefault)(); for (i = 0; i < XATTR_SIZE_MAX; i++) { __get_kernel_nofault(&sc, s__ign, char, err_out); if (sc == '\0' || sc == c) return i; s__ign++; } return -E2BIG; err_out: return -EFAULT; } /** * bpf_strrchr - Find the last occurrence of a character in a string * @s__ign: The string to be searched * @c: The character to search for * * Return: * * >=0 - Index of the last occurrence of @c within @s__ign * * %-ENOENT - @c not found in @s__ign * * %-EFAULT - Cannot read @s__ign * * %-E2BIG - @s__ign is too large * * %-ERANGE - @s__ign is outside of kernel address space */ __bpf_kfunc int bpf_strrchr(const char *s__ign, int c) { char sc; int i, last = -ENOENT; if (!copy_from_kernel_nofault_allowed(s__ign, 1)) return -ERANGE; guard(pagefault)(); for (i = 0; i < XATTR_SIZE_MAX; i++) { __get_kernel_nofault(&sc, s__ign, char, err_out); if (sc == c) last = i; if (sc == '\0') return last; s__ign++; } return -E2BIG; err_out: return -EFAULT; } /** * bpf_strnlen - Calculate the length of a length-limited string * @s__ign: The string * @count: The maximum number of characters to count * * Return: * * >=0 - The length of @s__ign * * %-EFAULT - Cannot read @s__ign * * %-E2BIG - @s__ign is too large * * %-ERANGE - @s__ign is outside of kernel address space */ __bpf_kfunc int bpf_strnlen(const char *s__ign, size_t count) { char c; int i; if (!copy_from_kernel_nofault_allowed(s__ign, 1)) return -ERANGE; guard(pagefault)(); for (i = 0; i < count && i < XATTR_SIZE_MAX; i++) { __get_kernel_nofault(&c, s__ign, char, err_out); if (c == '\0') return i; s__ign++; } return i == XATTR_SIZE_MAX ? -E2BIG : i; err_out: return -EFAULT; } /** * bpf_strlen - Calculate the length of a string * @s__ign: The string * * Return: * * >=0 - The length of @s__ign * * %-EFAULT - Cannot read @s__ign * * %-E2BIG - @s__ign is too large * * %-ERANGE - @s__ign is outside of kernel address space */ __bpf_kfunc int bpf_strlen(const char *s__ign) { return bpf_strnlen(s__ign, XATTR_SIZE_MAX); } /** * bpf_strspn - Calculate the length of the initial substring of @s__ign which * only contains letters in @accept__ign * @s__ign: The string to be searched * @accept__ign: The string to search for * * Return: * * >=0 - The length of the initial substring of @s__ign which only * contains letters from @accept__ign * * %-EFAULT - Cannot read one of the strings * * %-E2BIG - One of the strings is too large * * %-ERANGE - One of the strings is outside of kernel address space */ __bpf_kfunc int bpf_strspn(const char *s__ign, const char *accept__ign) { char cs, ca; int i, j; if (!copy_from_kernel_nofault_allowed(s__ign, 1) || !copy_from_kernel_nofault_allowed(accept__ign, 1)) { return -ERANGE; } guard(pagefault)(); for (i = 0; i < XATTR_SIZE_MAX; i++) { __get_kernel_nofault(&cs, s__ign, char, err_out); if (cs == '\0') return i; for (j = 0; j < XATTR_SIZE_MAX; j++) { __get_kernel_nofault(&ca, accept__ign + j, char, err_out); if (cs == ca || ca == '\0') break; } if (j == XATTR_SIZE_MAX) return -E2BIG; if (ca == '\0') return i; s__ign++; } return -E2BIG; err_out: return -EFAULT; } /** * bpf_strcspn - Calculate the length of the initial substring of @s__ign which * does not contain letters in @reject__ign * @s__ign: The string to be searched * @reject__ign: The string to search for * * Return: * * >=0 - The length of the initial substring of @s__ign which does not * contain letters from @reject__ign * * %-EFAULT - Cannot read one of the strings * * %-E2BIG - One of the strings is too large * * %-ERANGE - One of the strings is outside of kernel address space */ __bpf_kfunc int bpf_strcspn(const char *s__ign, const char *reject__ign) { char cs, cr; int i, j; if (!copy_from_kernel_nofault_allowed(s__ign, 1) || !copy_from_kernel_nofault_allowed(reject__ign, 1)) { return -ERANGE; } guard(pagefault)(); for (i = 0; i < XATTR_SIZE_MAX; i++) { __get_kernel_nofault(&cs, s__ign, char, err_out); if (cs == '\0') return i; for (j = 0; j < XATTR_SIZE_MAX; j++) { __get_kernel_nofault(&cr, reject__ign + j, char, err_out); if (cs == cr || cr == '\0') break; } if (j == XATTR_SIZE_MAX) return -E2BIG; if (cr != '\0') return i; s__ign++; } return -E2BIG; err_out: return -EFAULT; } static int __bpf_strnstr(const char *s1, const char *s2, size_t len, bool ignore_case) { char c1, c2; int i, j; if (!copy_from_kernel_nofault_allowed(s1, 1) || !copy_from_kernel_nofault_allowed(s2, 1)) { return -ERANGE; } guard(pagefault)(); for (i = 0; i < XATTR_SIZE_MAX; i++) { for (j = 0; i + j <= len && j < XATTR_SIZE_MAX; j++) { __get_kernel_nofault(&c2, s2 + j, char, err_out); if (c2 == '\0') return i; /* * We allow reading an extra byte from s2 (note the * `i + j <= len` above) to cover the case when s2 is * a suffix of the first len chars of s1. */ if (i + j == len) break; __get_kernel_nofault(&c1, s1 + j, char, err_out); if (ignore_case) { c1 = tolower(c1); c2 = tolower(c2); } if (c1 == '\0') return -ENOENT; if (c1 != c2) break; } if (j == XATTR_SIZE_MAX) return -E2BIG; if (i + j == len) return -ENOENT; s1++; } return -E2BIG; err_out: return -EFAULT; } /** * bpf_strstr - Find the first substring in a string * @s1__ign: The string to be searched * @s2__ign: The string to search for * * Return: * * >=0 - Index of the first character of the first occurrence of @s2__ign * within @s1__ign * * %-ENOENT - @s2__ign is not a substring of @s1__ign * * %-EFAULT - Cannot read one of the strings * * %-E2BIG - One of the strings is too large * * %-ERANGE - One of the strings is outside of kernel address space */ __bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign) { return __bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX, false); } /** * bpf_strcasestr - Find the first substring in a string, ignoring the case of * the characters * @s1__ign: The string to be searched * @s2__ign: The string to search for * * Return: * * >=0 - Index of the first character of the first occurrence of @s2__ign * within @s1__ign * * %-ENOENT - @s2__ign is not a substring of @s1__ign * * %-EFAULT - Cannot read one of the strings * * %-E2BIG - One of the strings is too large * * %-ERANGE - One of the strings is outside of kernel address space */ __bpf_kfunc int bpf_strcasestr(const char *s1__ign, const char *s2__ign) { return __bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX, true); } /** * bpf_strnstr - Find the first substring in a length-limited string * @s1__ign: The string to be searched * @s2__ign: The string to search for * @len: the maximum number of characters to search * * Return: * * >=0 - Index of the first character of the first occurrence of @s2__ign * within the first @len characters of @s1__ign * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign * * %-EFAULT - Cannot read one of the strings * * %-E2BIG - One of the strings is too large * * %-ERANGE - One of the strings is outside of kernel address space */ __bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len) { return __bpf_strnstr(s1__ign, s2__ign, len, false); } /** * bpf_strncasestr - Find the first substring in a length-limited string, * ignoring the case of the characters * @s1__ign: The string to be searched * @s2__ign: The string to search for * @len: the maximum number of characters to search * * Return: * * >=0 - Index of the first character of the first occurrence of @s2__ign * within the first @len characters of @s1__ign * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign * * %-EFAULT - Cannot read one of the strings * * %-E2BIG - One of the strings is too large * * %-ERANGE - One of the strings is outside of kernel address space */ __bpf_kfunc int bpf_strncasestr(const char *s1__ign, const char *s2__ign, size_t len) { return __bpf_strnstr(s1__ign, s2__ign, len, true); } #ifdef CONFIG_KEYS /** * bpf_lookup_user_key - lookup a key by its serial * @serial: key handle serial number * @flags: lookup-specific flags * * Search a key with a given *serial* and the provided *flags*. * If found, increment the reference count of the key by one, and * return it in the bpf_key structure. * * The bpf_key structure must be passed to bpf_key_put() when done * with it, so that the key reference count is decremented and the * bpf_key structure is freed. * * Permission checks are deferred to the time the key is used by * one of the available key-specific kfuncs. * * Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested * special keyring (e.g. session keyring), if it doesn't yet exist. * Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting * for the key construction, and to retrieve uninstantiated keys (keys * without data attached to them). * * Return: a bpf_key pointer with a valid key pointer if the key is found, a * NULL pointer otherwise. */ __bpf_kfunc struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags) { key_ref_t key_ref; struct bpf_key *bkey; if (flags & ~KEY_LOOKUP_ALL) return NULL; /* * Permission check is deferred until the key is used, as the * intent of the caller is unknown here. */ key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK); if (IS_ERR(key_ref)) return NULL; bkey = kmalloc_obj(*bkey); if (!bkey) { key_put(key_ref_to_ptr(key_ref)); return NULL; } bkey->key = key_ref_to_ptr(key_ref); bkey->has_ref = true; return bkey; } /** * bpf_lookup_system_key - lookup a key by a system-defined ID * @id: key ID * * Obtain a bpf_key structure with a key pointer set to the passed key ID. * The key pointer is marked as invalid, to prevent bpf_key_put() from * attempting to decrement the key reference count on that pointer. The key * pointer set in such way is currently understood only by * verify_pkcs7_signature(). * * Set *id* to one of the values defined in include/linux/verification.h: * 0 for the primary keyring (immutable keyring of system keys); * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring * (where keys can be added only if they are vouched for by existing keys * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform * keyring (primarily used by the integrity subsystem to verify a kexec'ed * kerned image and, possibly, the initramfs signature). * * Return: a bpf_key pointer with an invalid key pointer set from the * pre-determined ID on success, a NULL pointer otherwise */ __bpf_kfunc struct bpf_key *bpf_lookup_system_key(u64 id) { struct bpf_key *bkey; if (system_keyring_id_check(id) < 0) return NULL; bkey = kmalloc_obj(*bkey, GFP_ATOMIC); if (!bkey) return NULL; bkey->key = (struct key *)(unsigned long)id; bkey->has_ref = false; return bkey; } /** * bpf_key_put - decrement key reference count if key is valid and free bpf_key * @bkey: bpf_key structure * * Decrement the reference count of the key inside *bkey*, if the pointer * is valid, and free *bkey*. */ __bpf_kfunc void bpf_key_put(struct bpf_key *bkey) { if (bkey->has_ref) key_put(bkey->key); kfree(bkey); } /** * bpf_verify_pkcs7_signature - verify a PKCS#7 signature * @data_p: data to verify * @sig_p: signature of the data * @trusted_keyring: keyring with keys trusted for signature verification * * Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr* * with keys in a keyring referenced by *trusted_keyring*. * * Return: 0 on success, a negative value on error. */ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, struct bpf_dynptr *sig_p, struct bpf_key *trusted_keyring) { #ifdef CONFIG_SYSTEM_DATA_VERIFICATION struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p; struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p; const void *data, *sig; u32 data_len, sig_len; int ret; if (trusted_keyring->has_ref) { /* * Do the permission check deferred in bpf_lookup_user_key(). * See bpf_lookup_user_key() for more details. * * A call to key_task_permission() here would be redundant, as * it is already done by keyring_search() called by * find_asymmetric_key(). */ ret = key_validate(trusted_keyring->key); if (ret < 0) return ret; } data_len = __bpf_dynptr_size(data_ptr); data = __bpf_dynptr_data(data_ptr, data_len); sig_len = __bpf_dynptr_size(sig_ptr); sig = __bpf_dynptr_data(sig_ptr, sig_len); return verify_pkcs7_signature(data, data_len, sig, sig_len, trusted_keyring->key, VERIFYING_BPF_SIGNATURE, NULL, NULL); #else return -EOPNOTSUPP; #endif /* CONFIG_SYSTEM_DATA_VERIFICATION */ } #endif /* CONFIG_KEYS */ typedef int (*bpf_task_work_callback_t)(struct bpf_map *map, void *key, void *value); enum bpf_task_work_state { /* bpf_task_work is ready to be used */ BPF_TW_STANDBY = 0, /* irq work scheduling in progress */ BPF_TW_PENDING, /* task work scheduling in progress */ BPF_TW_SCHEDULING, /* task work is scheduled successfully */ BPF_TW_SCHEDULED, /* callback is running */ BPF_TW_RUNNING, /* associated BPF map value is deleted */ BPF_TW_FREED, }; struct bpf_task_work_ctx { enum bpf_task_work_state state; refcount_t refcnt; struct callback_head work; struct irq_work irq_work; /* bpf_prog that schedules task work */ struct bpf_prog *prog; /* task for which callback is scheduled */ struct task_struct *task; /* the map and map value associated with this context */ struct bpf_map *map; void *map_val; enum task_work_notify_mode mode; bpf_task_work_callback_t callback_fn; struct rcu_head rcu; } __aligned(8); /* Actual type for struct bpf_task_work */ struct bpf_task_work_kern { struct bpf_task_work_ctx *ctx; }; static void bpf_task_work_ctx_reset(struct bpf_task_work_ctx *ctx) { if (ctx->prog) { bpf_prog_put(ctx->prog); ctx->prog = NULL; } if (ctx->task) { bpf_task_release(ctx->task); ctx->task = NULL; } } static bool bpf_task_work_ctx_tryget(struct bpf_task_work_ctx *ctx) { return refcount_inc_not_zero(&ctx->refcnt); } static void bpf_task_work_destroy(struct irq_work *irq_work) { struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work); bpf_task_work_ctx_reset(ctx); kfree_rcu(ctx, rcu); } static void bpf_task_work_ctx_put(struct bpf_task_work_ctx *ctx) { if (!refcount_dec_and_test(&ctx->refcnt)) return; if (irqs_disabled()) { ctx->irq_work = IRQ_WORK_INIT(bpf_task_work_destroy); irq_work_queue(&ctx->irq_work); } else { bpf_task_work_destroy(&ctx->irq_work); } } static void bpf_task_work_cancel(struct bpf_task_work_ctx *ctx) { /* * Scheduled task_work callback holds ctx ref, so if we successfully * cancelled, we put that ref on callback's behalf. If we couldn't * cancel, callback will inevitably run or has already completed * running, and it would have taken care of its ctx ref itself. */ if (task_work_cancel(ctx->task, &ctx->work)) bpf_task_work_ctx_put(ctx); } static void bpf_task_work_callback(struct callback_head *cb) { struct bpf_task_work_ctx *ctx = container_of(cb, struct bpf_task_work_ctx, work); enum bpf_task_work_state state; u32 idx; void *key; /* Read lock is needed to protect ctx and map key/value access */ guard(rcu_tasks_trace)(); /* * This callback may start running before bpf_task_work_irq() switched to * SCHEDULED state, so handle both transition variants SCHEDULING|SCHEDULED -> RUNNING. */ state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_RUNNING); if (state == BPF_TW_SCHEDULED) state = cmpxchg(&ctx->state, BPF_TW_SCHEDULED, BPF_TW_RUNNING); if (state == BPF_TW_FREED) { bpf_task_work_ctx_put(ctx); return; } key = (void *)map_key_from_value(ctx->map, ctx->map_val, &idx); migrate_disable(); ctx->callback_fn(ctx->map, key, ctx->map_val); migrate_enable(); bpf_task_work_ctx_reset(ctx); (void)cmpxchg(&ctx->state, BPF_TW_RUNNING, BPF_TW_STANDBY); bpf_task_work_ctx_put(ctx); } static void bpf_task_work_irq(struct irq_work *irq_work) { struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work); enum bpf_task_work_state state; int err; guard(rcu)(); if (cmpxchg(&ctx->state, BPF_TW_PENDING, BPF_TW_SCHEDULING) != BPF_TW_PENDING) { bpf_task_work_ctx_put(ctx); return; } err = task_work_add(ctx->task, &ctx->work, ctx->mode); if (err) { bpf_task_work_ctx_reset(ctx); /* * try to switch back to STANDBY for another task_work reuse, but we might have * gone to FREED already, which is fine as we already cleaned up after ourselves */ (void)cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_STANDBY); bpf_task_work_ctx_put(ctx); return; } /* * It's technically possible for just scheduled task_work callback to * complete running by now, going SCHEDULING -> RUNNING and then * dropping its ctx refcount. Instead of capturing an extra ref just * to protect below ctx->state access, we rely on rcu_read_lock * above to prevent kfree_rcu from freeing ctx before we return. */ state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_SCHEDULED); if (state == BPF_TW_FREED) bpf_task_work_cancel(ctx); /* clean up if we switched into FREED state */ } static struct bpf_task_work_ctx *bpf_task_work_fetch_ctx(struct bpf_task_work *tw, struct bpf_map *map) { struct bpf_task_work_kern *twk = (void *)tw; struct bpf_task_work_ctx *ctx, *old_ctx; ctx = READ_ONCE(twk->ctx); if (ctx) return ctx; ctx = bpf_map_kmalloc_nolock(map, sizeof(*ctx), 0, NUMA_NO_NODE); if (!ctx) return ERR_PTR(-ENOMEM); memset(ctx, 0, sizeof(*ctx)); refcount_set(&ctx->refcnt, 1); /* map's own ref */ ctx->state = BPF_TW_STANDBY; old_ctx = cmpxchg(&twk->ctx, NULL, ctx); if (old_ctx) { /* * tw->ctx is set by concurrent BPF program, release allocated * memory and try to reuse already set context. */ kfree_nolock(ctx); return old_ctx; } return ctx; /* Success */ } static struct bpf_task_work_ctx *bpf_task_work_acquire_ctx(struct bpf_task_work *tw, struct bpf_map *map) { struct bpf_task_work_ctx *ctx; /* * Sleepable BPF programs hold rcu_read_lock_trace but not * regular rcu_read_lock. Since kfree_rcu waits for regular * RCU GP, the ctx can be freed while we're between reading * the pointer and incrementing the refcount. Take regular * rcu_read_lock to prevent kfree_rcu from freeing the ctx * before we can tryget it. */ scoped_guard(rcu) { ctx = bpf_task_work_fetch_ctx(tw, map); if (IS_ERR(ctx)) return ctx; /* try to get ref for task_work callback to hold */ if (!bpf_task_work_ctx_tryget(ctx)) return ERR_PTR(-EBUSY); } if (cmpxchg(&ctx->state, BPF_TW_STANDBY, BPF_TW_PENDING) != BPF_TW_STANDBY) { /* lost acquiring race or map_release_uref() stole it from us, put ref and bail */ bpf_task_work_ctx_put(ctx); return ERR_PTR(-EBUSY); } /* * If no process or bpffs is holding a reference to the map, no new callbacks should be * scheduled. This does not address any race or correctness issue, but rather is a policy * choice: dropping user references should stop everything. */ if (!atomic64_read(&map->usercnt)) { /* drop ref we just got for task_work callback itself */ bpf_task_work_ctx_put(ctx); /* transfer map's ref into cancel_and_free() */ bpf_task_work_cancel_and_free(tw); return ERR_PTR(-EBUSY); } return ctx; } static int bpf_task_work_schedule(struct task_struct *task, struct bpf_task_work *tw, struct bpf_map *map, bpf_task_work_callback_t callback_fn, struct bpf_prog_aux *aux, enum task_work_notify_mode mode) { struct bpf_prog *prog; struct bpf_task_work_ctx *ctx; int err; BTF_TYPE_EMIT(struct bpf_task_work); prog = bpf_prog_inc_not_zero(aux->prog); if (IS_ERR(prog)) return -EBADF; task = bpf_task_acquire(task); if (!task) { err = -EBADF; goto release_prog; } ctx = bpf_task_work_acquire_ctx(tw, map); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto release_all; } ctx->task = task; ctx->callback_fn = callback_fn; ctx->prog = prog; ctx->mode = mode; ctx->map = map; ctx->map_val = (void *)tw - map->record->task_work_off; init_task_work(&ctx->work, bpf_task_work_callback); init_irq_work(&ctx->irq_work, bpf_task_work_irq); irq_work_queue(&ctx->irq_work); return 0; release_all: bpf_task_release(task); release_prog: bpf_prog_put(prog); return err; } /** * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL * mode * @task: Task struct for which callback should be scheduled * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping * @map__map: bpf_map that embeds struct bpf_task_work in the values * @callback: pointer to BPF subprogram to call * @aux: pointer to bpf_prog_aux of the caller BPF program, implicitly set by the verifier * * Return: 0 if task work has been scheduled successfully, negative error code otherwise */ __bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw, void *map__map, bpf_task_work_callback_t callback, struct bpf_prog_aux *aux) { return bpf_task_work_schedule(task, tw, map__map, callback, aux, TWA_SIGNAL); } /** * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME * mode * @task: Task struct for which callback should be scheduled * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping * @map__map: bpf_map that embeds struct bpf_task_work in the values * @callback: pointer to BPF subprogram to call * @aux: pointer to bpf_prog_aux of the caller BPF program, implicitly set by the verifier * * Return: 0 if task work has been scheduled successfully, negative error code otherwise */ __bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw, void *map__map, bpf_task_work_callback_t callback, struct bpf_prog_aux *aux) { return bpf_task_work_schedule(task, tw, map__map, callback, aux, TWA_RESUME); } static int make_file_dynptr(struct file *file, u32 flags, bool may_sleep, struct bpf_dynptr_kern *ptr) { struct bpf_dynptr_file_impl *state; /* flags is currently unsupported */ if (flags) { bpf_dynptr_set_null(ptr); return -EINVAL; } state = kmalloc_nolock(sizeof(*state), 0, NUMA_NO_NODE); if (!state) { bpf_dynptr_set_null(ptr); return -ENOMEM; } state->offset = 0; state->size = U64_MAX; /* Don't restrict size, as file may change anyways */ freader_init_from_file(&state->freader, NULL, 0, file, may_sleep); bpf_dynptr_init(ptr, state, BPF_DYNPTR_TYPE_FILE, 0, 0); bpf_dynptr_set_rdonly(ptr); return 0; } __bpf_kfunc int bpf_dynptr_from_file(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit) { return make_file_dynptr(file, flags, false, (struct bpf_dynptr_kern *)ptr__uninit); } int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit) { return make_file_dynptr(file, flags, true, (struct bpf_dynptr_kern *)ptr__uninit); } __bpf_kfunc int bpf_dynptr_file_discard(struct bpf_dynptr *dynptr) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)dynptr; struct bpf_dynptr_file_impl *df = ptr->data; if (!df) return 0; freader_cleanup(&df->freader); kfree_nolock(df); bpf_dynptr_set_null(ptr); return 0; } /** * bpf_timer_cancel_async - try to deactivate a timer * @timer: bpf_timer to stop * * Returns: * * * 0 when the timer was not active * * 1 when the timer was active * * -1 when the timer is currently executing the callback function and * cannot be stopped * * -ECANCELED when the timer will be cancelled asynchronously * * -ENOMEM when out of memory * * -EINVAL when the timer was not initialized * * -ENOENT when this kfunc is racing with timer deletion */ __bpf_kfunc int bpf_timer_cancel_async(struct bpf_timer *timer) { struct bpf_async_kern *async = (void *)timer; struct bpf_async_cb *cb; int ret; cb = READ_ONCE(async->cb); if (!cb) return -EINVAL; /* * Unlike hrtimer_start() it's ok to synchronously call * hrtimer_try_to_cancel() when refcnt reached zero, but deferring to * irq_work is not, since irq callback may execute after RCU GP and * cb could be freed at that time. Check for refcnt zero for * consistency. */ if (!refcount_inc_not_zero(&cb->refcnt)) return -ENOENT; if (!defer_timer_wq_op()) { struct bpf_hrtimer *t = container_of(cb, struct bpf_hrtimer, cb); ret = hrtimer_try_to_cancel(&t->timer); bpf_async_refcount_put(cb); return ret; } else { ret = bpf_async_schedule_op(cb, BPF_ASYNC_CANCEL, 0, 0); return ret ? ret : -ECANCELED; } } __bpf_kfunc_end_defs(); static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work) { struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work); bpf_task_work_cancel(ctx); /* this might put task_work callback's ref */ bpf_task_work_ctx_put(ctx); /* and here we put map's own ref that was transferred to us */ } void bpf_task_work_cancel_and_free(void *val) { struct bpf_task_work_kern *twk = val; struct bpf_task_work_ctx *ctx; enum bpf_task_work_state state; ctx = xchg(&twk->ctx, NULL); if (!ctx) return; state = xchg(&ctx->state, BPF_TW_FREED); if (state == BPF_TW_SCHEDULED) { /* run in irq_work to avoid locks in NMI */ init_irq_work(&ctx->irq_work, bpf_task_work_cancel_scheduled); irq_work_queue(&ctx->irq_work); return; } bpf_task_work_ctx_put(ctx); /* put bpf map's ref */ } BTF_KFUNCS_START(generic_btf_ids) #ifdef CONFIG_CRASH_DUMP BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) #endif BTF_ID_FLAGS(func, bpf_obj_new, KF_ACQUIRE | KF_RET_NULL | KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_percpu_obj_new, KF_ACQUIRE | KF_RET_NULL | KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_obj_drop, KF_RELEASE | KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE) BTF_ID_FLAGS(func, bpf_percpu_obj_drop, KF_RELEASE | KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_percpu_obj_drop_impl, KF_RELEASE) BTF_ID_FLAGS(func, bpf_refcount_acquire, KF_ACQUIRE | KF_RET_NULL | KF_RCU | KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL | KF_RCU) BTF_ID_FLAGS(func, bpf_list_push_front, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_list_push_front_impl) BTF_ID_FLAGS(func, bpf_list_push_back, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_list_push_back_impl) BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_front, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_back, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_rbtree_add, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_rbtree_add_impl) BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_rbtree_root, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_rbtree_left, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_rbtree_right, KF_RET_NULL) #ifdef CONFIG_CGROUPS BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU) BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL) #endif BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_from_vpid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_throw) #ifdef CONFIG_BPF_EVENTS BTF_ID_FLAGS(func, bpf_send_signal_task) #endif #ifdef CONFIG_KEYS BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE) #ifdef CONFIG_SYSTEM_DATA_VERIFICATION BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE) #endif #endif #ifdef CONFIG_S390 BTF_ID_FLAGS(func, bpf_get_lowcore) #endif BTF_KFUNCS_END(generic_btf_ids) static const struct btf_kfunc_id_set generic_kfunc_set = { .owner = THIS_MODULE, .set = &generic_btf_ids, }; BTF_ID_LIST(generic_dtor_ids) BTF_ID(struct, task_struct) BTF_ID(func, bpf_task_release_dtor) #ifdef CONFIG_CGROUPS BTF_ID(struct, cgroup) BTF_ID(func, bpf_cgroup_release_dtor) #endif BTF_KFUNCS_START(common_btf_ids) BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx, KF_FASTCALL) BTF_ID_FLAGS(func, bpf_rdonly_cast, KF_FASTCALL) BTF_ID_FLAGS(func, bpf_rcu_read_lock) BTF_ID_FLAGS(func, bpf_rcu_read_unlock) BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW | KF_RCU) BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY) #ifdef CONFIG_CGROUPS BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_iter_css_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY) #endif BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_dynptr_adjust) BTF_ID_FLAGS(func, bpf_dynptr_is_null) BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) BTF_ID_FLAGS(func, bpf_dynptr_size) BTF_ID_FLAGS(func, bpf_dynptr_clone) BTF_ID_FLAGS(func, bpf_dynptr_copy) BTF_ID_FLAGS(func, bpf_dynptr_memset) #ifdef CONFIG_NET BTF_ID_FLAGS(func, bpf_modify_return_test_tp) #endif BTF_ID_FLAGS(func, bpf_wq_init) BTF_ID_FLAGS(func, bpf_wq_set_callback, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_wq_start) BTF_ID_FLAGS(func, bpf_preempt_disable) BTF_ID_FLAGS(func, bpf_preempt_enable) BTF_ID_FLAGS(func, bpf_iter_bits_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_bits_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_bits_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_copy_from_user_str, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_copy_from_user_task_str, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_get_kmem_cache) BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_local_irq_save) BTF_ID_FLAGS(func, bpf_local_irq_restore) #ifdef CONFIG_BPF_EVENTS BTF_ID_FLAGS(func, bpf_probe_read_user_dynptr) BTF_ID_FLAGS(func, bpf_probe_read_kernel_dynptr) BTF_ID_FLAGS(func, bpf_probe_read_user_str_dynptr) BTF_ID_FLAGS(func, bpf_probe_read_kernel_str_dynptr) BTF_ID_FLAGS(func, bpf_copy_from_user_dynptr, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_copy_from_user_str_dynptr, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE) #endif #ifdef CONFIG_DMA_SHARED_BUFFER BTF_ID_FLAGS(func, bpf_iter_dmabuf_new, KF_ITER_NEW | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_iter_dmabuf_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_iter_dmabuf_destroy, KF_ITER_DESTROY | KF_SLEEPABLE) #endif BTF_ID_FLAGS(func, __bpf_trap) BTF_ID_FLAGS(func, bpf_strcmp); BTF_ID_FLAGS(func, bpf_strcasecmp); BTF_ID_FLAGS(func, bpf_strncasecmp); BTF_ID_FLAGS(func, bpf_strchr); BTF_ID_FLAGS(func, bpf_strchrnul); BTF_ID_FLAGS(func, bpf_strnchr); BTF_ID_FLAGS(func, bpf_strrchr); BTF_ID_FLAGS(func, bpf_strlen); BTF_ID_FLAGS(func, bpf_strnlen); BTF_ID_FLAGS(func, bpf_strspn); BTF_ID_FLAGS(func, bpf_strcspn); BTF_ID_FLAGS(func, bpf_strstr); BTF_ID_FLAGS(func, bpf_strcasestr); BTF_ID_FLAGS(func, bpf_strnstr); BTF_ID_FLAGS(func, bpf_strncasestr); #if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS) BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_stream_print_stack, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_dynptr_from_file) BTF_ID_FLAGS(func, bpf_dynptr_file_discard) BTF_ID_FLAGS(func, bpf_timer_cancel_async) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { .owner = THIS_MODULE, .set = &common_btf_ids, }; static int __init kfunc_init(void) { int ret; const struct btf_id_dtor_kfunc generic_dtors[] = { { .btf_id = generic_dtor_ids[0], .kfunc_btf_id = generic_dtor_ids[1] }, #ifdef CONFIG_CGROUPS { .btf_id = generic_dtor_ids[2], .kfunc_btf_id = generic_dtor_ids[3] }, #endif }; ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SKB, &generic_kfunc_set); ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors, ARRAY_SIZE(generic_dtors), THIS_MODULE); return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set); } late_initcall(kfunc_init); /* Get a pointer to dynptr data up to len bytes for read only access. If * the dynptr doesn't have continuous data up to len bytes, return NULL. */ const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u64 len) { const struct bpf_dynptr *p = (struct bpf_dynptr *)ptr; return bpf_dynptr_slice(p, 0, NULL, len); } /* Get a pointer to dynptr data up to len bytes for read write access. If * the dynptr doesn't have continuous data up to len bytes, or the dynptr * is read only, return NULL. */ void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u64 len) { if (__bpf_dynptr_is_rdonly(ptr)) return NULL; return (void *)__bpf_dynptr_data(ptr, len); } void bpf_map_free_internal_structs(struct bpf_map *map, void *val) { if (btf_record_has_field(map->record, BPF_TIMER)) bpf_obj_free_timer(map->record, val); if (btf_record_has_field(map->record, BPF_WORKQUEUE)) bpf_obj_free_workqueue(map->record, val); if (btf_record_has_field(map->record, BPF_TASK_WORK)) bpf_obj_free_task_work(map->record, val); }
11 1 10 2 1 5 2 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> * Copyright (c) 2013 Eric Leblond <eric@regit.org> * * Development of this code funded by Astaro AG (http://www.astaro.com/) */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nft_reject.h> #include <linux/icmp.h> #include <linux/icmpv6.h> const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = { [NFTA_REJECT_TYPE] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_REJECT_ICMP_CODE] = { .type = NLA_U8 }, }; EXPORT_SYMBOL_GPL(nft_reject_policy); int nft_reject_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_PRE_ROUTING)); } EXPORT_SYMBOL_GPL(nft_reject_validate); int nft_reject_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_reject *priv = nft_expr_priv(expr); int icmp_code; if (tb[NFTA_REJECT_TYPE] == NULL) return -EINVAL; priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE])); switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: case NFT_REJECT_ICMPX_UNREACH: if (tb[NFTA_REJECT_ICMP_CODE] == NULL) return -EINVAL; icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]); if (priv->type == NFT_REJECT_ICMPX_UNREACH && icmp_code > NFT_REJECT_ICMPX_MAX) return -EINVAL; priv->icmp_code = icmp_code; break; case NFT_REJECT_TCP_RST: break; default: return -EINVAL; } return 0; } EXPORT_SYMBOL_GPL(nft_reject_init); int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_reject *priv = nft_expr_priv(expr); if (nla_put_be32(skb, NFTA_REJECT_TYPE, htonl(priv->type))) goto nla_put_failure; switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: case NFT_REJECT_ICMPX_UNREACH: if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) goto nla_put_failure; break; default: break; } return 0; nla_put_failure: return -1; } EXPORT_SYMBOL_GPL(nft_reject_dump); static u8 icmp_code_v4[NFT_REJECT_ICMPX_MAX + 1] = { [NFT_REJECT_ICMPX_NO_ROUTE] = ICMP_NET_UNREACH, [NFT_REJECT_ICMPX_PORT_UNREACH] = ICMP_PORT_UNREACH, [NFT_REJECT_ICMPX_HOST_UNREACH] = ICMP_HOST_UNREACH, [NFT_REJECT_ICMPX_ADMIN_PROHIBITED] = ICMP_PKT_FILTERED, }; int nft_reject_icmp_code(u8 code) { if (WARN_ON_ONCE(code > NFT_REJECT_ICMPX_MAX)) return ICMP_NET_UNREACH; return icmp_code_v4[code]; } EXPORT_SYMBOL_GPL(nft_reject_icmp_code); static u8 icmp_code_v6[NFT_REJECT_ICMPX_MAX + 1] = { [NFT_REJECT_ICMPX_NO_ROUTE] = ICMPV6_NOROUTE, [NFT_REJECT_ICMPX_PORT_UNREACH] = ICMPV6_PORT_UNREACH, [NFT_REJECT_ICMPX_HOST_UNREACH] = ICMPV6_ADDR_UNREACH, [NFT_REJECT_ICMPX_ADMIN_PROHIBITED] = ICMPV6_ADM_PROHIBITED, }; int nft_reject_icmpv6_code(u8 code) { if (WARN_ON_ONCE(code > NFT_REJECT_ICMPX_MAX)) return ICMPV6_NOROUTE; return icmp_code_v6[code]; } EXPORT_SYMBOL_GPL(nft_reject_icmpv6_code); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("Netfilter x_tables over nftables module");
7 7 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/balloc.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993 * Big-endian to little-endian byte-swapping/bitmaps by * David S. Miller (davem@caip.rutgers.edu), 1995 */ #include <linux/time.h> #include <linux/capability.h> #include <linux/fs.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> #include "ext4.h" #include "ext4_jbd2.h" #include "mballoc.h" #include <trace/events/ext4.h> #include <kunit/static_stub.h> static unsigned ext4_num_base_meta_clusters(struct super_block *sb, ext4_group_t block_group); /* * balloc.c contains the blocks allocation and deallocation routines */ /* * Calculate block group number for a given block number */ ext4_group_t ext4_get_group_number(struct super_block *sb, ext4_fsblk_t block) { ext4_group_t group; if (test_opt2(sb, STD_GROUP_SIZE)) group = (block - le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) >> (EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3); else ext4_get_group_no_and_offset(sb, block, &group, NULL); return group; } /* * Calculate the block group number and offset into the block/cluster * allocation bitmap, given a block number */ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; ext4_grpblk_t offset; blocknr = blocknr - le32_to_cpu(es->s_first_data_block); offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)) >> EXT4_SB(sb)->s_cluster_bits; if (offsetp) *offsetp = offset; if (blockgrpp) *blockgrpp = blocknr; } /* * Check whether the 'block' lives within the 'block_group'. Returns 1 if so * and 0 otherwise. */ static inline int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block, ext4_group_t block_group) { ext4_group_t actual_group; actual_group = ext4_get_group_number(sb, block); return (actual_group == block_group) ? 1 : 0; } /* * Return the number of clusters used for file system metadata; this * represents the overhead needed by the file system. */ static unsigned ext4_num_overhead_clusters(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp) { unsigned base_clusters, num_clusters; int block_cluster = -1, inode_cluster; int itbl_cluster_start = -1, itbl_cluster_end = -1; ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group); ext4_fsblk_t end = start + EXT4_BLOCKS_PER_GROUP(sb) - 1; ext4_fsblk_t itbl_blk_start, itbl_blk_end; struct ext4_sb_info *sbi = EXT4_SB(sb); /* This is the number of clusters used by the superblock, * block group descriptors, and reserved block group * descriptor blocks */ base_clusters = ext4_num_base_meta_clusters(sb, block_group); num_clusters = base_clusters; /* * Account and record inode table clusters if any cluster * is in the block group, or inode table cluster range is * [-1, -1] and won't overlap with block/inode bitmap cluster * accounted below. */ itbl_blk_start = ext4_inode_table(sb, gdp); itbl_blk_end = itbl_blk_start + sbi->s_itb_per_group - 1; if (itbl_blk_start <= end && itbl_blk_end >= start) { itbl_blk_start = max(itbl_blk_start, start); itbl_blk_end = min(itbl_blk_end, end); itbl_cluster_start = EXT4_B2C(sbi, itbl_blk_start - start); itbl_cluster_end = EXT4_B2C(sbi, itbl_blk_end - start); num_clusters += itbl_cluster_end - itbl_cluster_start + 1; /* check if border cluster is overlapped */ if (itbl_cluster_start == base_clusters - 1) num_clusters--; } /* * For the allocation bitmaps, we first need to check to see * if the block is in the block group. If it is, then check * to see if the cluster is already accounted for in the clusters * used for the base metadata cluster and inode tables cluster. * Normally all of these blocks are contiguous, so the special * case handling shouldn't be necessary except for *very* * unusual file system layouts. */ if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) { block_cluster = EXT4_B2C(sbi, ext4_block_bitmap(sb, gdp) - start); if (block_cluster >= base_clusters && (block_cluster < itbl_cluster_start || block_cluster > itbl_cluster_end)) num_clusters++; } if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) { inode_cluster = EXT4_B2C(sbi, ext4_inode_bitmap(sb, gdp) - start); /* * Additional check if inode bitmap is in just accounted * block_cluster */ if (inode_cluster != block_cluster && inode_cluster >= base_clusters && (inode_cluster < itbl_cluster_start || inode_cluster > itbl_cluster_end)) num_clusters++; } return num_clusters; } static unsigned int num_clusters_in_group(struct super_block *sb, ext4_group_t block_group) { unsigned int blocks; if (block_group == ext4_get_groups_count(sb) - 1) { /* * Even though mke2fs always initializes the first and * last group, just in case some other tool was used, * we need to make sure we calculate the right free * blocks. */ blocks = ext4_blocks_count(EXT4_SB(sb)->s_es) - ext4_group_first_block_no(sb, block_group); } else blocks = EXT4_BLOCKS_PER_GROUP(sb); return EXT4_NUM_B2C(EXT4_SB(sb), blocks); } /* Initializes an uninitialized block bitmap */ static int ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, ext4_group_t block_group, struct ext4_group_desc *gdp) { unsigned int bit, bit_max; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t start, tmp; ASSERT(buffer_locked(bh)); if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT | EXT4_GROUP_INFO_IBITMAP_CORRUPT); return -EFSBADCRC; } memset(bh->b_data, 0, sb->s_blocksize); bit_max = ext4_num_base_meta_clusters(sb, block_group); if ((bit_max >> 3) >= bh->b_size) return -EFSCORRUPTED; for (bit = 0; bit < bit_max; bit++) ext4_set_bit(bit, bh->b_data); start = ext4_group_first_block_no(sb, block_group); /* Set bits for block and inode bitmaps, and inode table */ tmp = ext4_block_bitmap(sb, gdp); if (ext4_block_in_group(sb, tmp, block_group)) ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); tmp = ext4_inode_bitmap(sb, gdp); if (ext4_block_in_group(sb, tmp, block_group)) ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); tmp = ext4_inode_table(sb, gdp); for (; tmp < ext4_inode_table(sb, gdp) + sbi->s_itb_per_group; tmp++) { if (ext4_block_in_group(sb, tmp, block_group)) ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); } /* * Also if the number of blocks within the group is less than * the blocksize * 8 ( which is the size of bitmap ), set rest * of the block bitmap to 1 */ ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group), sb->s_blocksize * 8, bh->b_data); return 0; } /* Return the number of free blocks in a block group. It is used when * the block bitmap is uninitialized, so we can't just count the bits * in the bitmap. */ unsigned ext4_free_clusters_after_init(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp) { return num_clusters_in_group(sb, block_group) - ext4_num_overhead_clusters(sb, block_group, gdp); } /* * The free blocks are managed by bitmaps. A file system contains several * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap * block for inodes, N blocks for the inode table and data blocks. * * The file system contains group descriptors which are located after the * super block. Each descriptor contains the number of the bitmap block and * the free blocks count in the block. The descriptors are loaded in memory * when a file system is mounted (see ext4_fill_super). */ /** * ext4_get_group_desc() -- load group descriptor from disk * @sb: super block * @block_group: given block group * @bh: pointer to the buffer head to store the block * group descriptor */ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, ext4_group_t block_group, struct buffer_head **bh) { unsigned int group_desc; unsigned int offset; ext4_group_t ngroups = ext4_get_groups_count(sb); struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh_p; KUNIT_STATIC_STUB_REDIRECT(ext4_get_group_desc, sb, block_group, bh); if (block_group >= ngroups) { ext4_error(sb, "block_group >= groups_count - block_group = %u," " groups_count = %u", block_group, ngroups); return NULL; } group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); bh_p = sbi_array_rcu_deref(sbi, s_group_desc, group_desc); /* * sbi_array_rcu_deref returns with rcu unlocked, this is ok since * the pointer being dereferenced won't be dereferenced again. By * looking at the usage in add_new_gdb() the value isn't modified, * just the pointer, and so it remains valid. */ if (!bh_p) { ext4_error(sb, "Group descriptor not loaded - " "block_group = %u, group_desc = %u, desc = %u", block_group, group_desc, offset); return NULL; } desc = (struct ext4_group_desc *)( (__u8 *)bh_p->b_data + offset * EXT4_DESC_SIZE(sb)); if (bh) *bh = bh_p; return desc; } static ext4_fsblk_t ext4_valid_block_bitmap_padding(struct super_block *sb, ext4_group_t block_group, struct buffer_head *bh) { ext4_grpblk_t next_zero_bit; unsigned long bitmap_size = sb->s_blocksize * 8; unsigned int offset = num_clusters_in_group(sb, block_group); if (bitmap_size <= offset) return 0; next_zero_bit = ext4_find_next_zero_bit(bh->b_data, bitmap_size, offset); return (next_zero_bit < bitmap_size ? next_zero_bit : 0); } struct ext4_group_info *ext4_get_group_info(struct super_block *sb, ext4_group_t group) { struct ext4_group_info **grp_info; long indexv, indexh; if (unlikely(group >= EXT4_SB(sb)->s_groups_count)) return NULL; indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv); return grp_info[indexh]; } /* * Return the block number which was discovered to be invalid, or 0 if * the block bitmap is valid. */ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, struct ext4_group_desc *desc, ext4_group_t block_group, struct buffer_head *bh) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t offset; ext4_grpblk_t next_zero_bit; ext4_grpblk_t max_bit = EXT4_CLUSTERS_PER_GROUP(sb); ext4_fsblk_t blk; ext4_fsblk_t group_first_block; if (ext4_has_feature_flex_bg(sb)) { /* with FLEX_BG, the inode/block bitmaps and itable * blocks may not be in the group at all * so the bitmap validation will be skipped for those groups * or it has to also read the block group where the bitmaps * are located to verify they are set. */ return 0; } group_first_block = ext4_group_first_block_no(sb, block_group); /* check whether block bitmap block number is set */ blk = ext4_block_bitmap(sb, desc); offset = blk - group_first_block; if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || !ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) /* bad block bitmap */ return blk; /* check whether the inode bitmap block number is set */ blk = ext4_inode_bitmap(sb, desc); offset = blk - group_first_block; if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || !ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) /* bad block bitmap */ return blk; /* check whether the inode table block number is set */ blk = ext4_inode_table(sb, desc); offset = blk - group_first_block; if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) >= max_bit) return blk; next_zero_bit = ext4_find_next_zero_bit(bh->b_data, EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) + 1, EXT4_B2C(sbi, offset)); if (next_zero_bit < EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) + 1) /* bad bitmap for inode tables */ return blk; return 0; } static int ext4_validate_block_bitmap(struct super_block *sb, struct ext4_group_desc *desc, ext4_group_t block_group, struct buffer_head *bh) { ext4_fsblk_t blk; struct ext4_group_info *grp; if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) return 0; grp = ext4_get_group_info(sb, block_group); if (buffer_verified(bh)) return 0; if (!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) return -EFSCORRUPTED; ext4_lock_group(sb, block_group); if (buffer_verified(bh)) goto verified; if (unlikely(!ext4_block_bitmap_csum_verify(sb, desc, bh) || ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_CRC))) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EFSBADCRC; } blk = ext4_valid_block_bitmap(sb, desc, block_group, bh); if (unlikely(blk != 0)) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: block %llu: invalid block bitmap", block_group, blk); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EFSCORRUPTED; } blk = ext4_valid_block_bitmap_padding(sb, block_group, bh); if (unlikely(blk != 0)) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: block %llu: padding at end of block bitmap is not set", block_group, blk); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EFSCORRUPTED; } set_buffer_verified(bh); verified: ext4_unlock_group(sb, block_group); return 0; } /** * ext4_read_block_bitmap_nowait() * @sb: super block * @block_group: given block group * @ignore_locked: ignore locked buffers * * Read the bitmap for a given block_group,and validate the * bits for block/inode/inode tables are set in the bitmaps * * Return buffer_head on success or an ERR_PTR in case of failure. */ struct buffer_head * ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group, bool ignore_locked) { struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh; ext4_fsblk_t bitmap_blk; int err; KUNIT_STATIC_STUB_REDIRECT(ext4_read_block_bitmap_nowait, sb, block_group, ignore_locked); desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) return ERR_PTR(-EFSCORRUPTED); bitmap_blk = ext4_block_bitmap(sb, desc); if ((bitmap_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || (bitmap_blk >= ext4_blocks_count(sbi->s_es))) { ext4_error(sb, "Invalid block bitmap block %llu in " "block_group %u", bitmap_blk, block_group); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return ERR_PTR(-EFSCORRUPTED); } bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { ext4_warning(sb, "Cannot get buffer for block bitmap - " "block_group = %u, block_bitmap = %llu", block_group, bitmap_blk); return ERR_PTR(-ENOMEM); } if (ignore_locked && buffer_locked(bh)) { /* buffer under IO already, return if called for prefetching */ put_bh(bh); return NULL; } if (bitmap_uptodate(bh)) goto verify; lock_buffer(bh); if (bitmap_uptodate(bh)) { unlock_buffer(bh); goto verify; } ext4_lock_group(sb, block_group); if (ext4_has_group_desc_csum(sb) && (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { if (block_group == 0) { ext4_unlock_group(sb, block_group); unlock_buffer(bh); ext4_error(sb, "Block bitmap for bg 0 marked " "uninitialized"); err = -EFSCORRUPTED; goto out; } err = ext4_init_block_bitmap(sb, bh, block_group, desc); if (err) { ext4_unlock_group(sb, block_group); unlock_buffer(bh); ext4_error(sb, "Failed to init block bitmap for group " "%u: %d", block_group, err); goto out; } set_bitmap_uptodate(bh); set_buffer_uptodate(bh); set_buffer_verified(bh); ext4_unlock_group(sb, block_group); unlock_buffer(bh); return bh; } ext4_unlock_group(sb, block_group); if (buffer_uptodate(bh)) { /* * if not uninit if bh is uptodate, * bitmap is also uptodate */ set_bitmap_uptodate(bh); unlock_buffer(bh); goto verify; } /* * submit the buffer_head for reading */ set_buffer_new(bh); trace_ext4_read_block_bitmap_load(sb, block_group, ignore_locked); ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO | (ignore_locked ? REQ_RAHEAD : 0), ext4_end_bitmap_read, ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_EIO)); return bh; verify: err = ext4_validate_block_bitmap(sb, desc, block_group, bh); if (err) goto out; return bh; out: put_bh(bh); return ERR_PTR(err); } /* Returns 0 on success, -errno on error */ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, struct buffer_head *bh) { struct ext4_group_desc *desc; KUNIT_STATIC_STUB_REDIRECT(ext4_wait_block_bitmap, sb, block_group, bh); if (!buffer_new(bh)) return 0; desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) return -EFSCORRUPTED; wait_on_buffer(bh); if (!buffer_uptodate(bh)) { ext4_error_err(sb, EIO, "Cannot read block bitmap - " "block_group = %u, block_bitmap = %llu", block_group, (unsigned long long) bh->b_blocknr); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EIO; } clear_buffer_new(bh); /* Panic or remount fs read-only if block bitmap is invalid */ return ext4_validate_block_bitmap(sb, desc, block_group, bh); } struct buffer_head * ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) { struct buffer_head *bh; int err; bh = ext4_read_block_bitmap_nowait(sb, block_group, false); if (IS_ERR(bh)) return bh; err = ext4_wait_block_bitmap(sb, block_group, bh); if (err) { put_bh(bh); return ERR_PTR(err); } return bh; } /** * ext4_has_free_clusters() * @sbi: in-core super block structure. * @nclusters: number of needed blocks * @flags: flags from ext4_mb_new_blocks() * * Check if filesystem has nclusters free & available for allocation. * On success return 1, return 0 on failure. */ static int ext4_has_free_clusters(struct ext4_sb_info *sbi, s64 nclusters, unsigned int flags) { s64 free_clusters, dirty_clusters, rsv, resv_clusters; struct percpu_counter *fcc = &sbi->s_freeclusters_counter; struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter; free_clusters = percpu_counter_read_positive(fcc); dirty_clusters = percpu_counter_read_positive(dcc); resv_clusters = atomic64_read(&sbi->s_resv_clusters); /* * r_blocks_count should always be multiple of the cluster ratio so * we are safe to do a plane bit shift only. */ rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) + resv_clusters; if (free_clusters - (nclusters + rsv + dirty_clusters) < EXT4_FREECLUSTERS_WATERMARK) { free_clusters = percpu_counter_sum_positive(fcc); dirty_clusters = percpu_counter_sum_positive(dcc); } /* Check whether we have space after accounting for current * dirty clusters & root reserved clusters. */ if (free_clusters >= (rsv + nclusters + dirty_clusters)) return 1; /* Hm, nope. Are (enough) root reserved clusters available? */ if (uid_eq(sbi->s_resuid, current_fsuid()) || (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) || (flags & EXT4_MB_USE_ROOT_BLOCKS) || capable(CAP_SYS_RESOURCE)) { if (free_clusters >= (nclusters + dirty_clusters + resv_clusters)) return 1; } /* No free blocks. Let's see if we can dip into reserved pool */ if (flags & EXT4_MB_USE_RESERVED) { if (free_clusters >= (nclusters + dirty_clusters)) return 1; } return 0; } int ext4_claim_free_clusters(struct ext4_sb_info *sbi, s64 nclusters, unsigned int flags) { if (ext4_has_free_clusters(sbi, nclusters, flags)) { percpu_counter_add(&sbi->s_dirtyclusters_counter, nclusters); return 0; } else return -ENOSPC; } /** * ext4_should_retry_alloc() - check if a block allocation should be retried * @sb: superblock * @retries: number of retry attempts made so far * * ext4_should_retry_alloc() is called when ENOSPC is returned while * attempting to allocate blocks. If there's an indication that a pending * journal transaction might free some space and allow another attempt to * succeed, this function will wait for the current or committing transaction * to complete and then return TRUE. */ int ext4_should_retry_alloc(struct super_block *sb, int *retries) { struct ext4_sb_info *sbi = EXT4_SB(sb); if (!sbi->s_journal) return 0; if (++(*retries) > 3) { percpu_counter_inc(&sbi->s_sra_exceeded_retry_limit); return 0; } /* * if there's no indication that blocks are about to be freed it's * possible we just missed a transaction commit that did so */ smp_mb(); if (atomic_read(&sbi->s_mb_free_pending) == 0) { if (test_opt(sb, DISCARD)) { atomic_inc(&sbi->s_retry_alloc_pending); flush_work(&sbi->s_discard_work); atomic_dec(&sbi->s_retry_alloc_pending); } return ext4_has_free_clusters(sbi, 1, 0); } /* * it's possible we've just missed a transaction commit here, * so ignore the returned status */ ext4_debug("%s: retrying operation after ENOSPC\n", sb->s_id); (void) jbd2_journal_force_commit_nested(sbi->s_journal); return 1; } /* * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks * * @handle: handle to this transaction * @inode: file inode * @goal: given target block(filesystem wide) * @count: pointer to total number of clusters needed * @errp: error code * * Return 1st allocated block number on success, *count stores total account * error stores in errp pointer */ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned int flags, unsigned long *count, int *errp) { struct ext4_allocation_request ar; ext4_fsblk_t ret; memset(&ar, 0, sizeof(ar)); /* Fill with neighbour allocated blocks */ ar.inode = inode; ar.goal = goal; ar.len = count ? *count : 1; ar.flags = flags; ret = ext4_mb_new_blocks(handle, &ar, errp); if (count) *count = ar.len; /* * Account for the allocated meta blocks. We will never * fail EDQUOT for metadata, but we do account for it. */ if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) { dquot_alloc_block_nofail(inode, EXT4_C2B(EXT4_SB(inode->i_sb), ar.len)); } return ret; } /** * ext4_count_free_clusters() -- count filesystem free clusters * @sb: superblock * * Adds up the number of free clusters from each block group. */ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) { ext4_fsblk_t desc_count; struct ext4_group_desc *gdp; ext4_group_t i; ext4_group_t ngroups = ext4_get_groups_count(sb); struct ext4_group_info *grp; #ifdef EXT4FS_DEBUG struct ext4_super_block *es; ext4_fsblk_t bitmap_count; unsigned int x; struct buffer_head *bitmap_bh = NULL; es = EXT4_SB(sb)->s_es; desc_count = 0; bitmap_count = 0; gdp = NULL; for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; grp = NULL; if (EXT4_SB(sb)->s_group_info) grp = ext4_get_group_info(sb, i); if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) desc_count += ext4_free_group_clusters(sb, gdp); brelse(bitmap_bh); bitmap_bh = ext4_read_block_bitmap(sb, i); if (IS_ERR(bitmap_bh)) { bitmap_bh = NULL; continue; } x = ext4_count_free(bitmap_bh->b_data, EXT4_CLUSTERS_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", i, ext4_free_group_clusters(sb, gdp), x); bitmap_count += x; } brelse(bitmap_bh); printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu" ", computed = %llu, %llu\n", EXT4_NUM_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)), desc_count, bitmap_count); return bitmap_count; #else desc_count = 0; for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; grp = NULL; if (EXT4_SB(sb)->s_group_info) grp = ext4_get_group_info(sb, i); if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) desc_count += ext4_free_group_clusters(sb, gdp); } return desc_count; #endif } static inline int test_root(ext4_group_t a, int b) { while (1) { if (a < b) return 0; if (a == b) return 1; if ((a % b) != 0) return 0; a = a / b; } } /** * ext4_bg_has_super - number of blocks used by the superblock in group * @sb: superblock for filesystem * @group: group number to check * * Return the number of blocks used by the superblock (primary or backup) * in this group. Currently this will be only 0 or 1. */ int ext4_bg_has_super(struct super_block *sb, ext4_group_t group) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; if (group == 0) return 1; if (ext4_has_feature_sparse_super2(sb)) { if (group == le32_to_cpu(es->s_backup_bgs[0]) || group == le32_to_cpu(es->s_backup_bgs[1])) return 1; return 0; } if ((group <= 1) || !ext4_has_feature_sparse_super(sb)) return 1; if (!(group & 1)) return 0; if (test_root(group, 3) || (test_root(group, 5)) || test_root(group, 7)) return 1; return 0; } static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, ext4_group_t group) { unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb); ext4_group_t first = metagroup * EXT4_DESC_PER_BLOCK(sb); ext4_group_t last = first + EXT4_DESC_PER_BLOCK(sb) - 1; if (group == first || group == first + 1 || group == last) return 1; return 0; } static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, ext4_group_t group) { if (!ext4_bg_has_super(sb, group)) return 0; if (ext4_has_feature_meta_bg(sb)) return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); else return EXT4_SB(sb)->s_gdb_count; } /** * ext4_bg_num_gdb - number of blocks used by the group table in group * @sb: superblock for filesystem * @group: group number to check * * Return the number of blocks used by the group descriptor table * (primary or backup) in this group. In the future there may be a * different number of descriptor blocks in each group. */ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) { unsigned long first_meta_bg = le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb); if (!ext4_has_feature_meta_bg(sb) || metagroup < first_meta_bg) return ext4_bg_num_gdb_nometa(sb, group); return ext4_bg_num_gdb_meta(sb,group); } /* * This function returns the number of file system metadata blocks at * the beginning of a block group, including the reserved gdt blocks. */ unsigned int ext4_num_base_meta_blocks(struct super_block *sb, ext4_group_t block_group) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned num; /* Check for superblock and gdt backups in this group */ num = ext4_bg_has_super(sb, block_group); if (!ext4_has_feature_meta_bg(sb) || block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) * sbi->s_desc_per_block) { if (num) { num += ext4_bg_num_gdb_nometa(sb, block_group); num += le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); } } else { /* For META_BG_BLOCK_GROUPS */ num += ext4_bg_num_gdb_meta(sb, block_group); } return num; } static unsigned int ext4_num_base_meta_clusters(struct super_block *sb, ext4_group_t block_group) { return EXT4_NUM_B2C(EXT4_SB(sb), ext4_num_base_meta_blocks(sb, block_group)); } /** * ext4_inode_to_goal_block - return a hint for block allocation * @inode: inode for block allocation * * Return the ideal location to start allocating blocks for a * newly created inode. */ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); ext4_group_t block_group; ext4_grpblk_t colour; int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); ext4_fsblk_t bg_start; ext4_fsblk_t last_block; block_group = ei->i_block_group; if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { /* * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME * block groups per flexgroup, reserve the first block * group for directories and special files. Regular * files will start at the second block group. This * tends to speed up directory access and improves * fsck times. */ block_group &= ~(flex_size-1); if (S_ISREG(inode->i_mode)) block_group++; } bg_start = ext4_group_first_block_no(inode->i_sb, block_group); last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; /* * If we are doing delayed allocation, we don't need take * colour into account. */ if (test_opt(inode->i_sb, DELALLOC)) return bg_start; if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) colour = (task_pid_nr(current) % 16) * (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); else colour = (task_pid_nr(current) % 16) * ((last_block - bg_start) / 16); return bg_start + colour; }
96 292 1647 3 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TIMEKEEPING_H #define _LINUX_TIMEKEEPING_H #include <linux/errno.h> #include <linux/clocksource_ids.h> #include <linux/ktime.h> /* Included from linux/ktime.h */ void timekeeping_init(void); extern int timekeeping_suspended; /* Architecture timer tick functions: */ extern void legacy_timer_tick(unsigned long ticks); /* * Get and set timeofday */ extern int do_settimeofday64(const struct timespec64 *ts); extern int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz); /* * ktime_get() family - read the current time in a multitude of ways. * * The default time reference is CLOCK_MONOTONIC, starting at * boot time but not counting the time spent in suspend. * For other references, use the functions with "real", "clocktai", * "boottime" and "raw" suffixes. * * To get the time in a different format, use the ones with * "ns", "ts64" and "seconds" suffix. * * See Documentation/core-api/timekeeping.rst for more details. */ /* * timespec64 based interfaces */ extern void ktime_get_raw_ts64(struct timespec64 *ts); extern void ktime_get_ts64(struct timespec64 *ts); extern void ktime_get_real_ts64(struct timespec64 *tv); extern void ktime_get_coarse_ts64(struct timespec64 *ts); extern void ktime_get_coarse_real_ts64(struct timespec64 *ts); extern void ktime_get_clock_ts64(clockid_t id, struct timespec64 *ts); /* Multigrain timestamp interfaces */ extern void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts); extern void ktime_get_real_ts64_mg(struct timespec64 *ts); extern unsigned long timekeeping_get_mg_floor_swaps(void); void getboottime64(struct timespec64 *ts); /* * time64_t base interfaces */ extern time64_t ktime_get_seconds(void); extern time64_t __ktime_get_real_seconds(void); extern time64_t ktime_get_real_seconds(void); /* * ktime_t based interfaces */ enum tk_offsets { TK_OFFS_REAL, TK_OFFS_BOOT, TK_OFFS_TAI, TK_OFFS_MAX, }; extern ktime_t ktime_get(void); extern ktime_t ktime_get_with_offset(enum tk_offsets offs); extern ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs); extern ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs); extern ktime_t ktime_get_raw(void); extern u32 ktime_get_resolution_ns(void); /** * ktime_get_real - get the real (wall-) time in ktime_t format * * Returns: real (wall) time in ktime_t format */ static inline ktime_t ktime_get_real(void) { return ktime_get_with_offset(TK_OFFS_REAL); } static inline ktime_t ktime_get_coarse_real(void) { return ktime_get_coarse_with_offset(TK_OFFS_REAL); } /** * ktime_get_boottime - Get monotonic time since boot in ktime_t format * * This is similar to CLOCK_MONTONIC/ktime_get, but also includes the * time spent in suspend. * * Returns: monotonic time since boot in ktime_t format */ static inline ktime_t ktime_get_boottime(void) { return ktime_get_with_offset(TK_OFFS_BOOT); } static inline ktime_t ktime_get_coarse_boottime(void) { return ktime_get_coarse_with_offset(TK_OFFS_BOOT); } /** * ktime_get_clocktai - Get the TAI time of day in ktime_t format * * Returns: the TAI time of day in ktime_t format */ static inline ktime_t ktime_get_clocktai(void) { return ktime_get_with_offset(TK_OFFS_TAI); } static inline ktime_t ktime_get_coarse_clocktai(void) { return ktime_get_coarse_with_offset(TK_OFFS_TAI); } static inline ktime_t ktime_get_coarse(void) { struct timespec64 ts; ktime_get_coarse_ts64(&ts); return timespec64_to_ktime(ts); } static inline u64 ktime_get_coarse_ns(void) { return ktime_to_ns(ktime_get_coarse()); } static inline u64 ktime_get_coarse_real_ns(void) { return ktime_to_ns(ktime_get_coarse_real()); } static inline u64 ktime_get_coarse_boottime_ns(void) { return ktime_to_ns(ktime_get_coarse_boottime()); } static inline u64 ktime_get_coarse_clocktai_ns(void) { return ktime_to_ns(ktime_get_coarse_clocktai()); } /** * ktime_mono_to_real - Convert monotonic time to clock realtime * @mono: monotonic time to convert * * Returns: time converted to realtime clock */ static inline ktime_t ktime_mono_to_real(ktime_t mono) { return ktime_mono_to_any(mono, TK_OFFS_REAL); } /** * ktime_get_ns - Get the current time in nanoseconds * * Returns: current time converted to nanoseconds */ static inline u64 ktime_get_ns(void) { return ktime_to_ns(ktime_get()); } /** * ktime_get_real_ns - Get the current real/wall time in nanoseconds * * Returns: current real time converted to nanoseconds */ static inline u64 ktime_get_real_ns(void) { return ktime_to_ns(ktime_get_real()); } /** * ktime_get_boottime_ns - Get the monotonic time since boot in nanoseconds * * Returns: current boottime converted to nanoseconds */ static inline u64 ktime_get_boottime_ns(void) { return ktime_to_ns(ktime_get_boottime()); } /** * ktime_get_clocktai_ns - Get the current TAI time of day in nanoseconds * * Returns: current TAI time converted to nanoseconds */ static inline u64 ktime_get_clocktai_ns(void) { return ktime_to_ns(ktime_get_clocktai()); } /** * ktime_get_raw_ns - Get the raw monotonic time in nanoseconds * * Returns: current raw monotonic time converted to nanoseconds */ static inline u64 ktime_get_raw_ns(void) { return ktime_to_ns(ktime_get_raw()); } extern u64 ktime_get_mono_fast_ns(void); extern u64 ktime_get_raw_fast_ns(void); extern u64 ktime_get_boot_fast_ns(void); extern u64 ktime_get_tai_fast_ns(void); extern u64 ktime_get_real_fast_ns(void); /* * timespec64/time64_t interfaces utilizing the ktime based ones * for API completeness, these could be implemented more efficiently * if needed. */ static inline void ktime_get_boottime_ts64(struct timespec64 *ts) { *ts = ktime_to_timespec64(ktime_get_boottime()); } static inline void ktime_get_coarse_boottime_ts64(struct timespec64 *ts) { *ts = ktime_to_timespec64(ktime_get_coarse_boottime()); } static inline time64_t ktime_get_boottime_seconds(void) { return ktime_divns(ktime_get_coarse_boottime(), NSEC_PER_SEC); } static inline void ktime_get_clocktai_ts64(struct timespec64 *ts) { *ts = ktime_to_timespec64(ktime_get_clocktai()); } static inline void ktime_get_coarse_clocktai_ts64(struct timespec64 *ts) { *ts = ktime_to_timespec64(ktime_get_coarse_clocktai()); } static inline time64_t ktime_get_clocktai_seconds(void) { return ktime_divns(ktime_get_coarse_clocktai(), NSEC_PER_SEC); } /* * RTC specific */ extern bool timekeeping_rtc_skipsuspend(void); extern bool timekeeping_rtc_skipresume(void); extern void timekeeping_inject_sleeptime64(const struct timespec64 *delta); /* * Auxiliary clock interfaces */ #ifdef CONFIG_POSIX_AUX_CLOCKS extern bool ktime_get_aux(clockid_t id, ktime_t *kt); extern bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *kt); #else static inline bool ktime_get_aux(clockid_t id, ktime_t *kt) { return false; } static inline bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *kt) { return false; } #endif /** * struct system_time_snapshot - simultaneous raw/real time capture with * counter value * @cycles: Clocksource counter value to produce the system times * @real: Realtime system time * @boot: Boot time * @raw: Monotonic raw system time * @cs_id: Clocksource ID * @clock_was_set_seq: The sequence number of clock-was-set events * @cs_was_changed_seq: The sequence number of clocksource change events */ struct system_time_snapshot { u64 cycles; ktime_t real; ktime_t boot; ktime_t raw; enum clocksource_ids cs_id; unsigned int clock_was_set_seq; u8 cs_was_changed_seq; }; /** * struct system_device_crosststamp - system/device cross-timestamp * (synchronized capture) * @device: Device time * @sys_realtime: Realtime simultaneous with device time * @sys_monoraw: Monotonic raw simultaneous with device time */ struct system_device_crosststamp { ktime_t device; ktime_t sys_realtime; ktime_t sys_monoraw; }; /** * struct system_counterval_t - system counter value with the ID of the * corresponding clocksource * @cycles: System counter value * @cs_id: Clocksource ID corresponding to system counter value. Used by * timekeeping code to verify comparability of two cycle values. * The default ID, CSID_GENERIC, does not identify a specific * clocksource. * @use_nsecs: @cycles is in nanoseconds. */ struct system_counterval_t { u64 cycles; enum clocksource_ids cs_id; bool use_nsecs; }; extern bool ktime_real_to_base_clock(ktime_t treal, enum clocksource_ids base_id, u64 *cycles); extern bool timekeeping_clocksource_has_base(enum clocksource_ids id); /* * Get cross timestamp between system clock and device clock */ extern int get_device_system_crosststamp( int (*get_time_fn)(ktime_t *device_time, struct system_counterval_t *system_counterval, void *ctx), void *ctx, struct system_time_snapshot *history, struct system_device_crosststamp *xtstamp); /* * Simultaneously snapshot realtime and monotonic raw clocks */ extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot); /* * Persistent clock related interfaces */ extern int persistent_clock_is_local; extern void read_persistent_clock64(struct timespec64 *ts); void read_persistent_wall_and_boot_offset(struct timespec64 *wall_clock, struct timespec64 *boot_offset); #ifdef CONFIG_GENERIC_CMOS_UPDATE extern int update_persistent_clock64(struct timespec64 now); #endif #endif
4 2 2 74 74 4 4 4 4 4 27 27 27 1 27 27 33 33 1 4 4 2 8 3 12 27 27 13 4 21 12 12 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 // SPDX-License-Identifier: GPL-2.0 /* * XFRM virtual interface * * Copyright (C) 2018 secunet Security Networks AG * * Author: * Steffen Klassert <steffen.klassert@secunet.com> */ #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/sockios.h> #include <linux/icmp.h> #include <linux/if.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/if_link.h> #include <linux/if_arp.h> #include <linux/icmpv6.h> #include <linux/init.h> #include <linux/route.h> #include <linux/rtnetlink.h> #include <linux/netfilter_ipv6.h> #include <linux/slab.h> #include <linux/hash.h> #include <linux/uaccess.h> #include <linux/atomic.h> #include <net/gso.h> #include <net/icmp.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/ip_tunnels.h> #include <net/addrconf.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/dst_metadata.h> #include <net/netns/generic.h> #include <linux/etherdevice.h> static int xfrmi_dev_init(struct net_device *dev); static void xfrmi_dev_setup(struct net_device *dev); static struct rtnl_link_ops xfrmi_link_ops __read_mostly; static unsigned int xfrmi_net_id __read_mostly; static const struct net_device_ops xfrmi_netdev_ops; #define XFRMI_HASH_BITS 8 #define XFRMI_HASH_SIZE BIT(XFRMI_HASH_BITS) struct xfrmi_net { /* lists for storing interfaces in use */ struct xfrm_if __rcu *xfrmi[XFRMI_HASH_SIZE]; struct xfrm_if __rcu *collect_md_xfrmi; }; static const struct nla_policy xfrm_lwt_policy[LWT_XFRM_MAX + 1] = { [LWT_XFRM_IF_ID] = NLA_POLICY_MIN(NLA_U32, 1), [LWT_XFRM_LINK] = NLA_POLICY_MIN(NLA_U32, 1), }; static void xfrmi_destroy_state(struct lwtunnel_state *lwt) { } static int xfrmi_build_state(struct net *net, struct nlattr *nla, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { struct nlattr *tb[LWT_XFRM_MAX + 1]; struct lwtunnel_state *new_state; struct xfrm_md_info *info; int ret; ret = nla_parse_nested(tb, LWT_XFRM_MAX, nla, xfrm_lwt_policy, extack); if (ret < 0) return ret; if (!tb[LWT_XFRM_IF_ID]) { NL_SET_ERR_MSG(extack, "if_id must be set"); return -EINVAL; } new_state = lwtunnel_state_alloc(sizeof(*info)); if (!new_state) { NL_SET_ERR_MSG(extack, "failed to create encap info"); return -ENOMEM; } new_state->type = LWTUNNEL_ENCAP_XFRM; info = lwt_xfrm_info(new_state); info->if_id = nla_get_u32(tb[LWT_XFRM_IF_ID]); if (tb[LWT_XFRM_LINK]) info->link = nla_get_u32(tb[LWT_XFRM_LINK]); *ts = new_state; return 0; } static int xfrmi_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt) { struct xfrm_md_info *info = lwt_xfrm_info(lwt); if (nla_put_u32(skb, LWT_XFRM_IF_ID, info->if_id) || (info->link && nla_put_u32(skb, LWT_XFRM_LINK, info->link))) return -EMSGSIZE; return 0; } static int xfrmi_encap_nlsize(struct lwtunnel_state *lwtstate) { return nla_total_size(sizeof(u32)) + /* LWT_XFRM_IF_ID */ nla_total_size(sizeof(u32)); /* LWT_XFRM_LINK */ } static int xfrmi_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) { struct xfrm_md_info *a_info = lwt_xfrm_info(a); struct xfrm_md_info *b_info = lwt_xfrm_info(b); return memcmp(a_info, b_info, sizeof(*a_info)); } static const struct lwtunnel_encap_ops xfrmi_encap_ops = { .build_state = xfrmi_build_state, .destroy_state = xfrmi_destroy_state, .fill_encap = xfrmi_fill_encap_info, .get_encap_size = xfrmi_encap_nlsize, .cmp_encap = xfrmi_encap_cmp, .owner = THIS_MODULE, }; #define for_each_xfrmi_rcu(start, xi) \ for (xi = rcu_dereference(start); xi; xi = rcu_dereference(xi->next)) static u32 xfrmi_hash(u32 if_id) { return hash_32(if_id, XFRMI_HASH_BITS); } static struct xfrm_if *xfrmi_lookup(struct net *net, struct xfrm_state *x) { struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); struct xfrm_if *xi; for_each_xfrmi_rcu(xfrmn->xfrmi[xfrmi_hash(x->if_id)], xi) { if (x->if_id == xi->p.if_id && (xi->dev->flags & IFF_UP)) return xi; } xi = rcu_dereference(xfrmn->collect_md_xfrmi); if (xi && (xi->dev->flags & IFF_UP)) return xi; return NULL; } static bool xfrmi_decode_session(struct sk_buff *skb, unsigned short family, struct xfrm_if_decode_session_result *res) { struct net_device *dev; struct xfrm_if *xi; int ifindex = 0; if (!secpath_exists(skb) || !skb->dev) return false; switch (family) { case AF_INET6: ifindex = inet6_sdif(skb); break; case AF_INET: ifindex = inet_sdif(skb); break; } if (ifindex) { struct net *net = xs_net(xfrm_input_state(skb)); dev = dev_get_by_index_rcu(net, ifindex); } else { dev = skb->dev; } if (!dev || !(dev->flags & IFF_UP)) return false; if (dev->netdev_ops != &xfrmi_netdev_ops) return false; xi = netdev_priv(dev); res->net = xi->net; if (xi->p.collect_md) res->if_id = xfrm_input_state(skb)->if_id; else res->if_id = xi->p.if_id; return true; } static void xfrmi_link(struct xfrmi_net *xfrmn, struct xfrm_if *xi) { struct xfrm_if __rcu **xip = &xfrmn->xfrmi[xfrmi_hash(xi->p.if_id)]; rcu_assign_pointer(xi->next , rtnl_dereference(*xip)); rcu_assign_pointer(*xip, xi); } static void xfrmi_unlink(struct xfrmi_net *xfrmn, struct xfrm_if *xi) { struct xfrm_if __rcu **xip; struct xfrm_if *iter; for (xip = &xfrmn->xfrmi[xfrmi_hash(xi->p.if_id)]; (iter = rtnl_dereference(*xip)) != NULL; xip = &iter->next) { if (xi == iter) { rcu_assign_pointer(*xip, xi->next); break; } } } static void xfrmi_dev_free(struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); gro_cells_destroy(&xi->gro_cells); } static int xfrmi_create(struct net *net, struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); int err; dev->rtnl_link_ops = &xfrmi_link_ops; err = register_netdevice(dev); if (err < 0) goto out; if (xi->p.collect_md) rcu_assign_pointer(xfrmn->collect_md_xfrmi, xi); else xfrmi_link(xfrmn, xi); return 0; out: return err; } static struct xfrm_if *xfrmi_locate(struct net *net, struct xfrm_if_parms *p) { struct xfrm_if __rcu **xip; struct xfrm_if *xi; struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); for (xip = &xfrmn->xfrmi[xfrmi_hash(p->if_id)]; (xi = rtnl_dereference(*xip)) != NULL; xip = &xi->next) if (xi->p.if_id == p->if_id) return xi; return NULL; } static void xfrmi_dev_uninit(struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct xfrmi_net *xfrmn = net_generic(xi->net, xfrmi_net_id); if (xi->p.collect_md) RCU_INIT_POINTER(xfrmn->collect_md_xfrmi, NULL); else xfrmi_unlink(xfrmn, xi); } static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) { skb_clear_tstamp(skb); skb->pkt_type = PACKET_HOST; skb->skb_iif = 0; skb->ignore_df = 0; skb_dst_drop(skb); nf_reset_ct(skb); nf_reset_trace(skb); if (!xnet) return; ipvs_reset(skb); secpath_reset(skb); skb_orphan(skb); skb->mark = 0; } static int xfrmi_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type, unsigned short family) { struct sec_path *sp; sp = skb_sec_path(skb); if (sp && (sp->len || sp->olen) && !xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family)) goto discard; XFRM_SPI_SKB_CB(skb)->family = family; if (family == AF_INET) { XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; } else { XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; } return xfrm_input(skb, nexthdr, spi, encap_type); discard: kfree_skb(skb); return 0; } static int xfrmi4_rcv(struct sk_buff *skb) { return xfrmi_input(skb, ip_hdr(skb)->protocol, 0, 0, AF_INET); } static int xfrmi6_rcv(struct sk_buff *skb) { return xfrmi_input(skb, skb_network_header(skb)[IP6CB(skb)->nhoff], 0, 0, AF_INET6); } static int xfrmi4_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET); } static int xfrmi6_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET6); } static int xfrmi_rcv_cb(struct sk_buff *skb, int err) { const struct xfrm_mode *inner_mode; struct net_device *dev; struct xfrm_state *x; struct xfrm_if *xi; bool xnet; int link; if (err && !secpath_exists(skb)) return 0; x = xfrm_input_state(skb); xi = xfrmi_lookup(xs_net(x), x); if (!xi) return 1; link = skb->dev->ifindex; dev = xi->dev; skb->dev = dev; if (err) { DEV_STATS_INC(dev, rx_errors); DEV_STATS_INC(dev, rx_dropped); return 0; } xnet = !net_eq(xi->net, dev_net(skb->dev)); if (xnet) { inner_mode = &x->inner_mode; if (x->sel.family == AF_UNSPEC) { inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); if (inner_mode == NULL) { XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMINSTATEMODEERROR); return -EINVAL; } } if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, inner_mode->family)) return -EPERM; } xfrmi_scrub_packet(skb, xnet); if (xi->p.collect_md) { struct metadata_dst *md_dst; md_dst = metadata_dst_alloc(0, METADATA_XFRM, GFP_ATOMIC); if (!md_dst) return -ENOMEM; md_dst->u.xfrm_info.if_id = x->if_id; md_dst->u.xfrm_info.link = link; skb_dst_set(skb, (struct dst_entry *)md_dst); } dev_sw_netstats_rx_add(dev, skb->len); return 0; } static int xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) { struct xfrm_if *xi = netdev_priv(dev); struct dst_entry *dst = skb_dst(skb); unsigned int length = skb->len; struct net_device *tdev; struct xfrm_state *x; int err = -1; u32 if_id; int mtu; if (xi->p.collect_md) { struct xfrm_md_info *md_info = skb_xfrm_md_info(skb); if (unlikely(!md_info)) return -EINVAL; if_id = md_info->if_id; fl->flowi_oif = md_info->link; if (md_info->dst_orig) { struct dst_entry *tmp_dst = dst; dst = md_info->dst_orig; skb_dst_set(skb, dst); md_info->dst_orig = NULL; dst_release(tmp_dst); } } else { if_id = xi->p.if_id; } dst_hold(dst); dst = xfrm_lookup_with_ifid(xi->net, dst, fl, NULL, 0, if_id); if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; goto tx_err_link_failure; } x = dst->xfrm; if (!x) goto tx_err_link_failure; if (x->if_id != if_id) goto tx_err_link_failure; tdev = dst->dev; if (tdev == dev) { DEV_STATS_INC(dev, collisions); net_warn_ratelimited("%s: Local routing loop detected!\n", dev->name); goto tx_err_dst_release; } mtu = dst_mtu(dst); if ((!skb_is_gso(skb) && skb->len > mtu) || (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))) { skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->protocol == htons(ETH_P_IPV6)) { if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; if (skb->len > 1280) icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); else goto xmit; } else { if (!(ip_hdr(skb)->frag_off & htons(IP_DF))) goto xmit; icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); } dst_release(dst); return -EMSGSIZE; } xmit: xfrmi_scrub_packet(skb, !net_eq(xi->net, dev_net(dev))); skb_dst_set(skb, dst); skb->dev = tdev; err = dst_output(xi->net, skb_to_full_sk(skb), skb); if (net_xmit_eval(err) == 0) { dev_sw_netstats_tx_add(dev, 1, length); } else { DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_aborted_errors); } return 0; tx_err_link_failure: DEV_STATS_INC(dev, tx_carrier_errors); dst_link_failure(skb); tx_err_dst_release: dst_release(dst); return err; } static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct dst_entry *dst = skb_dst(skb); struct flowi fl; int ret; memset(&fl, 0, sizeof(fl)); switch (skb->protocol) { case htons(ETH_P_IPV6): memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET6); if (!dst) { fl.u.ip6.flowi6_oif = dev->ifindex; fl.u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC; dst = ip6_route_output(dev_net(dev), NULL, &fl.u.ip6); if (dst->error) { dst_release(dst); DEV_STATS_INC(dev, tx_carrier_errors); goto tx_err; } skb_dst_set(skb, dst); } break; case htons(ETH_P_IP): memset(IPCB(skb), 0, sizeof(*IPCB(skb))); xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET); if (!dst) { struct rtable *rt; fl.u.ip4.flowi4_oif = dev->ifindex; fl.u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; rt = __ip_route_output_key(dev_net(dev), &fl.u.ip4); if (IS_ERR(rt)) { DEV_STATS_INC(dev, tx_carrier_errors); goto tx_err; } skb_dst_set(skb, &rt->dst); } break; default: goto tx_err; } fl.flowi_oif = xi->p.link; ret = xfrmi_xmit2(skb, dev, &fl); if (ret < 0) goto tx_err; return NETDEV_TX_OK; tx_err: DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return NETDEV_TX_OK; } static int xfrmi4_err(struct sk_buff *skb, u32 info) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct net *net = dev_net(skb->dev); int protocol = iph->protocol; struct ip_comp_hdr *ipch; struct ip_esp_hdr *esph; struct ip_auth_hdr *ah ; struct xfrm_state *x; struct xfrm_if *xi; __be32 spi; switch (protocol) { case IPPROTO_ESP: esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); spi = esph->spi; break; case IPPROTO_AH: ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); spi = ah->spi; break; case IPPROTO_COMP: ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); spi = htonl(ntohs(ipch->cpi)); break; default: return 0; } switch (icmp_hdr(skb)->type) { case ICMP_DEST_UNREACH: if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return 0; break; case ICMP_REDIRECT: break; default: return 0; } x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, spi, protocol, AF_INET); if (!x) return 0; xi = xfrmi_lookup(net, x); if (!xi) { xfrm_state_put(x); return -1; } if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) ipv4_update_pmtu(skb, net, info, 0, protocol); else ipv4_redirect(skb, net, 0, protocol); xfrm_state_put(x); return 0; } static int xfrmi6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; struct net *net = dev_net(skb->dev); int protocol = iph->nexthdr; struct ip_comp_hdr *ipch; struct ip_esp_hdr *esph; struct ip_auth_hdr *ah; struct xfrm_state *x; struct xfrm_if *xi; __be32 spi; switch (protocol) { case IPPROTO_ESP: esph = (struct ip_esp_hdr *)(skb->data + offset); spi = esph->spi; break; case IPPROTO_AH: ah = (struct ip_auth_hdr *)(skb->data + offset); spi = ah->spi; break; case IPPROTO_COMP: ipch = (struct ip_comp_hdr *)(skb->data + offset); spi = htonl(ntohs(ipch->cpi)); break; default: return 0; } if (type != ICMPV6_PKT_TOOBIG && type != NDISC_REDIRECT) return 0; x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, spi, protocol, AF_INET6); if (!x) return 0; xi = xfrmi_lookup(net, x); if (!xi) { xfrm_state_put(x); return -1; } if (type == NDISC_REDIRECT) ip6_redirect(skb, net, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); else ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); xfrm_state_put(x); return 0; } static int xfrmi_change(struct xfrm_if *xi, const struct xfrm_if_parms *p) { if (xi->p.link != p->link) return -EINVAL; xi->p.if_id = p->if_id; return 0; } static int xfrmi_update(struct xfrm_if *xi, struct xfrm_if_parms *p) { struct net *net = xi->net; struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); int err; xfrmi_unlink(xfrmn, xi); synchronize_net(); err = xfrmi_change(xi, p); xfrmi_link(xfrmn, xi); netdev_state_change(xi->dev); return err; } static int xfrmi_get_iflink(const struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); return READ_ONCE(xi->p.link); } static const struct net_device_ops xfrmi_netdev_ops = { .ndo_init = xfrmi_dev_init, .ndo_uninit = xfrmi_dev_uninit, .ndo_start_xmit = xfrmi_xmit, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = xfrmi_get_iflink, }; static void xfrmi_dev_setup(struct net_device *dev) { dev->netdev_ops = &xfrmi_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->type = ARPHRD_NONE; dev->mtu = ETH_DATA_LEN; dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = IP_MAX_MTU; dev->flags = IFF_NOARP; dev->needs_free_netdev = true; dev->priv_destructor = xfrmi_dev_free; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; netif_keep_dst(dev); eth_broadcast_addr(dev->broadcast); } #define XFRMI_FEATURES (NETIF_F_SG | \ NETIF_F_FRAGLIST | \ NETIF_F_GSO_SOFTWARE | \ NETIF_F_HW_CSUM) static int xfrmi_dev_init(struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct net_device *phydev = __dev_get_by_index(xi->net, xi->p.link); int err; err = gro_cells_init(&xi->gro_cells, dev); if (err) return err; dev->lltx = true; dev->features |= XFRMI_FEATURES; dev->hw_features |= XFRMI_FEATURES; if (phydev) { dev->needed_headroom = phydev->needed_headroom; dev->needed_tailroom = phydev->needed_tailroom; if (is_zero_ether_addr(dev->dev_addr)) eth_hw_addr_inherit(dev, phydev); if (is_zero_ether_addr(dev->broadcast)) memcpy(dev->broadcast, phydev->broadcast, dev->addr_len); } else { eth_hw_addr_random(dev); eth_broadcast_addr(dev->broadcast); } return 0; } static int xfrmi_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { return 0; } static void xfrmi_netlink_parms(struct nlattr *data[], struct xfrm_if_parms *parms) { memset(parms, 0, sizeof(*parms)); if (!data) return; if (data[IFLA_XFRM_LINK]) parms->link = nla_get_u32(data[IFLA_XFRM_LINK]); if (data[IFLA_XFRM_IF_ID]) parms->if_id = nla_get_u32(data[IFLA_XFRM_IF_ID]); if (data[IFLA_XFRM_COLLECT_METADATA]) parms->collect_md = true; } static int xfrmi_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct nlattr **data = params->data; struct xfrm_if_parms p = {}; struct xfrm_if *xi; struct net *net; int err; net = params->link_net ? : dev_net(dev); xfrmi_netlink_parms(data, &p); if (p.collect_md) { struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); if (p.link || p.if_id) { NL_SET_ERR_MSG(extack, "link and if_id must be zero"); return -EINVAL; } if (rtnl_dereference(xfrmn->collect_md_xfrmi)) return -EEXIST; } else { if (!p.if_id) { NL_SET_ERR_MSG(extack, "if_id must be non zero"); return -EINVAL; } xi = xfrmi_locate(net, &p); if (xi) return -EEXIST; } xi = netdev_priv(dev); xi->p = p; xi->net = net; xi->dev = dev; err = xfrmi_create(net, dev); return err; } static void xfrmi_dellink(struct net_device *dev, struct list_head *head) { unregister_netdevice_queue(dev, head); } static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct xfrm_if *xi = netdev_priv(dev); struct net *net = xi->net; struct xfrm_if_parms p = {}; xfrmi_netlink_parms(data, &p); if (!p.if_id) { NL_SET_ERR_MSG(extack, "if_id must be non zero"); return -EINVAL; } if (p.collect_md || xi->p.collect_md) { NL_SET_ERR_MSG(extack, "collect_md can't be changed"); return -EINVAL; } xi = xfrmi_locate(net, &p); if (!xi) { xi = netdev_priv(dev); } else { if (xi->dev != dev) return -EEXIST; } return xfrmi_update(xi, &p); } static size_t xfrmi_get_size(const struct net_device *dev) { return /* IFLA_XFRM_LINK */ nla_total_size(4) + /* IFLA_XFRM_IF_ID */ nla_total_size(4) + /* IFLA_XFRM_COLLECT_METADATA */ nla_total_size(0) + 0; } static int xfrmi_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct xfrm_if_parms *parm = &xi->p; if (nla_put_u32(skb, IFLA_XFRM_LINK, parm->link) || nla_put_u32(skb, IFLA_XFRM_IF_ID, parm->if_id) || (xi->p.collect_md && nla_put_flag(skb, IFLA_XFRM_COLLECT_METADATA))) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static struct net *xfrmi_get_link_net(const struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); return READ_ONCE(xi->net); } static const struct nla_policy xfrmi_policy[IFLA_XFRM_MAX + 1] = { [IFLA_XFRM_UNSPEC] = { .strict_start_type = IFLA_XFRM_COLLECT_METADATA }, [IFLA_XFRM_LINK] = { .type = NLA_U32 }, [IFLA_XFRM_IF_ID] = { .type = NLA_U32 }, [IFLA_XFRM_COLLECT_METADATA] = { .type = NLA_FLAG }, }; static struct rtnl_link_ops xfrmi_link_ops __read_mostly = { .kind = "xfrm", .maxtype = IFLA_XFRM_MAX, .policy = xfrmi_policy, .priv_size = sizeof(struct xfrm_if), .setup = xfrmi_dev_setup, .validate = xfrmi_validate, .newlink = xfrmi_newlink, .dellink = xfrmi_dellink, .changelink = xfrmi_changelink, .get_size = xfrmi_get_size, .fill_info = xfrmi_fill_info, .get_link_net = xfrmi_get_link_net, }; static void __net_exit xfrmi_exit_rtnl(struct net *net, struct list_head *dev_to_kill) { struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); struct xfrm_if __rcu **xip; struct xfrm_if *xi; int i; for (i = 0; i < XFRMI_HASH_SIZE; i++) { for (xip = &xfrmn->xfrmi[i]; (xi = rtnl_net_dereference(net, *xip)) != NULL; xip = &xi->next) unregister_netdevice_queue(xi->dev, dev_to_kill); } xi = rtnl_net_dereference(net, xfrmn->collect_md_xfrmi); if (xi) unregister_netdevice_queue(xi->dev, dev_to_kill); } static struct pernet_operations xfrmi_net_ops = { .exit_rtnl = xfrmi_exit_rtnl, .id = &xfrmi_net_id, .size = sizeof(struct xfrmi_net), }; static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = { .handler = xfrmi6_rcv, .input_handler = xfrmi6_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, }; static struct xfrm6_protocol xfrmi_ah6_protocol __read_mostly = { .handler = xfrm6_rcv, .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, }; static struct xfrm6_protocol xfrmi_ipcomp6_protocol __read_mostly = { .handler = xfrm6_rcv, .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, }; #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) static int xfrmi6_rcv_tunnel(struct sk_buff *skb) { const xfrm_address_t *saddr; __be32 spi; saddr = (const xfrm_address_t *)&ipv6_hdr(skb)->saddr; spi = xfrm6_tunnel_spi_lookup(dev_net(skb->dev), saddr); return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi, NULL); } static struct xfrm6_tunnel xfrmi_ipv6_handler __read_mostly = { .handler = xfrmi6_rcv_tunnel, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 2, }; static struct xfrm6_tunnel xfrmi_ip6ip_handler __read_mostly = { .handler = xfrmi6_rcv_tunnel, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 2, }; #endif static struct xfrm4_protocol xfrmi_esp4_protocol __read_mostly = { .handler = xfrmi4_rcv, .input_handler = xfrmi4_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 10, }; static struct xfrm4_protocol xfrmi_ah4_protocol __read_mostly = { .handler = xfrm4_rcv, .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 10, }; static struct xfrm4_protocol xfrmi_ipcomp4_protocol __read_mostly = { .handler = xfrm4_rcv, .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 10, }; #if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) static int xfrmi4_rcv_tunnel(struct sk_buff *skb) { return xfrm4_rcv_spi(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr); } static struct xfrm_tunnel xfrmi_ipip_handler __read_mostly = { .handler = xfrmi4_rcv_tunnel, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 3, }; static struct xfrm_tunnel xfrmi_ipip6_handler __read_mostly = { .handler = xfrmi4_rcv_tunnel, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 2, }; #endif static int __init xfrmi4_init(void) { int err; err = xfrm4_protocol_register(&xfrmi_esp4_protocol, IPPROTO_ESP); if (err < 0) goto xfrm_proto_esp_failed; err = xfrm4_protocol_register(&xfrmi_ah4_protocol, IPPROTO_AH); if (err < 0) goto xfrm_proto_ah_failed; err = xfrm4_protocol_register(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); if (err < 0) goto xfrm_proto_comp_failed; #if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) err = xfrm4_tunnel_register(&xfrmi_ipip_handler, AF_INET); if (err < 0) goto xfrm_tunnel_ipip_failed; err = xfrm4_tunnel_register(&xfrmi_ipip6_handler, AF_INET6); if (err < 0) goto xfrm_tunnel_ipip6_failed; #endif return 0; #if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) xfrm_tunnel_ipip6_failed: xfrm4_tunnel_deregister(&xfrmi_ipip_handler, AF_INET); xfrm_tunnel_ipip_failed: xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); #endif xfrm_proto_comp_failed: xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH); xfrm_proto_ah_failed: xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP); xfrm_proto_esp_failed: return err; } static void xfrmi4_fini(void) { #if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) xfrm4_tunnel_deregister(&xfrmi_ipip6_handler, AF_INET6); xfrm4_tunnel_deregister(&xfrmi_ipip_handler, AF_INET); #endif xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH); xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP); } static int __init xfrmi6_init(void) { int err; err = xfrm6_protocol_register(&xfrmi_esp6_protocol, IPPROTO_ESP); if (err < 0) goto xfrm_proto_esp_failed; err = xfrm6_protocol_register(&xfrmi_ah6_protocol, IPPROTO_AH); if (err < 0) goto xfrm_proto_ah_failed; err = xfrm6_protocol_register(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); if (err < 0) goto xfrm_proto_comp_failed; #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) err = xfrm6_tunnel_register(&xfrmi_ipv6_handler, AF_INET6); if (err < 0) goto xfrm_tunnel_ipv6_failed; err = xfrm6_tunnel_register(&xfrmi_ip6ip_handler, AF_INET); if (err < 0) goto xfrm_tunnel_ip6ip_failed; #endif return 0; #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) xfrm_tunnel_ip6ip_failed: xfrm6_tunnel_deregister(&xfrmi_ipv6_handler, AF_INET6); xfrm_tunnel_ipv6_failed: xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); #endif xfrm_proto_comp_failed: xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH); xfrm_proto_ah_failed: xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP); xfrm_proto_esp_failed: return err; } static void xfrmi6_fini(void) { #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) xfrm6_tunnel_deregister(&xfrmi_ip6ip_handler, AF_INET); xfrm6_tunnel_deregister(&xfrmi_ipv6_handler, AF_INET6); #endif xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH); xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP); } static const struct xfrm_if_cb xfrm_if_cb = { .decode_session = xfrmi_decode_session, }; static int __init xfrmi_init(void) { const char *msg; int err; pr_info("IPsec XFRM device driver\n"); msg = "tunnel device"; err = register_pernet_device(&xfrmi_net_ops); if (err < 0) goto pernet_dev_failed; msg = "xfrm4 protocols"; err = xfrmi4_init(); if (err < 0) goto xfrmi4_failed; msg = "xfrm6 protocols"; err = xfrmi6_init(); if (err < 0) goto xfrmi6_failed; msg = "netlink interface"; err = rtnl_link_register(&xfrmi_link_ops); if (err < 0) goto rtnl_link_failed; err = register_xfrm_interface_bpf(); if (err < 0) goto kfunc_failed; lwtunnel_encap_add_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM); xfrm_if_register_cb(&xfrm_if_cb); return err; kfunc_failed: rtnl_link_unregister(&xfrmi_link_ops); rtnl_link_failed: xfrmi6_fini(); xfrmi6_failed: xfrmi4_fini(); xfrmi4_failed: unregister_pernet_device(&xfrmi_net_ops); pernet_dev_failed: pr_err("xfrmi init: failed to register %s\n", msg); return err; } static void __exit xfrmi_fini(void) { xfrm_if_unregister_cb(); lwtunnel_encap_del_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM); rtnl_link_unregister(&xfrmi_link_ops); xfrmi4_fini(); xfrmi6_fini(); unregister_pernet_device(&xfrmi_net_ops); } module_init(xfrmi_init); module_exit(xfrmi_fini); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("xfrm"); MODULE_ALIAS_NETDEV("xfrm0"); MODULE_AUTHOR("Steffen Klassert"); MODULE_DESCRIPTION("XFRM virtual interface");
14286 2092 14276 2 4035 12857 1577 4035 13433 10715 13419 13424 13470 13422 10714 10711 10745 10717 10702 10714 10743 13426 13470 561 560 564 558 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 // SPDX-License-Identifier: GPL-2.0 #include <linux/debugfs.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/memblock.h> #include <linux/stacktrace.h> #include <linux/page_owner.h> #include <linux/jump_label.h> #include <linux/migrate.h> #include <linux/stackdepot.h> #include <linux/seq_file.h> #include <linux/memcontrol.h> #include <linux/sched/clock.h> #include "internal.h" /* * TODO: teach PAGE_OWNER_STACK_DEPTH (__dump_page_owner and save_stack) * to use off stack temporal storage */ #define PAGE_OWNER_STACK_DEPTH (16) struct page_owner { unsigned short order; short last_migrate_reason; gfp_t gfp_mask; depot_stack_handle_t handle; depot_stack_handle_t free_handle; u64 ts_nsec; u64 free_ts_nsec; char comm[TASK_COMM_LEN]; pid_t pid; pid_t tgid; pid_t free_pid; pid_t free_tgid; }; struct stack { struct stack_record *stack_record; struct stack *next; }; static struct stack dummy_stack; static struct stack failure_stack; static struct stack *stack_list; static DEFINE_SPINLOCK(stack_list_lock); #define STACK_PRINT_FLAG_STACK 0x1 #define STACK_PRINT_FLAG_PAGES 0x2 #define STACK_PRINT_FLAG_HANDLE 0x4 struct stack_print_ctx { struct stack *stack; u8 flags; }; static bool page_owner_enabled __initdata; DEFINE_STATIC_KEY_FALSE(page_owner_inited); static depot_stack_handle_t dummy_handle; static depot_stack_handle_t failure_handle; static depot_stack_handle_t early_handle; static void init_early_allocated_pages(void); static inline void set_current_in_page_owner(void) { /* * Avoid recursion. * * We might need to allocate more memory from page_owner code, so make * sure to signal it in order to avoid recursion. */ current->in_page_owner = 1; } static inline void unset_current_in_page_owner(void) { current->in_page_owner = 0; } static int __init early_page_owner_param(char *buf) { int ret = kstrtobool(buf, &page_owner_enabled); if (page_owner_enabled) stack_depot_request_early_init(); return ret; } early_param("page_owner", early_page_owner_param); static __init bool need_page_owner(void) { return page_owner_enabled; } static __always_inline depot_stack_handle_t create_dummy_stack(void) { unsigned long entries[4]; unsigned int nr_entries; nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0); return stack_depot_save(entries, nr_entries, GFP_KERNEL); } static noinline void register_dummy_stack(void) { dummy_handle = create_dummy_stack(); } static noinline void register_failure_stack(void) { failure_handle = create_dummy_stack(); } static noinline void register_early_stack(void) { early_handle = create_dummy_stack(); } static __init void init_page_owner(void) { if (!page_owner_enabled) return; register_dummy_stack(); register_failure_stack(); register_early_stack(); init_early_allocated_pages(); /* Initialize dummy and failure stacks and link them to stack_list */ dummy_stack.stack_record = __stack_depot_get_stack_record(dummy_handle); failure_stack.stack_record = __stack_depot_get_stack_record(failure_handle); if (dummy_stack.stack_record) refcount_set(&dummy_stack.stack_record->count, 1); if (failure_stack.stack_record) refcount_set(&failure_stack.stack_record->count, 1); dummy_stack.next = &failure_stack; stack_list = &dummy_stack; static_branch_enable(&page_owner_inited); } struct page_ext_operations page_owner_ops = { .size = sizeof(struct page_owner), .need = need_page_owner, .init = init_page_owner, .need_shared_flags = true, }; static inline struct page_owner *get_page_owner(struct page_ext *page_ext) { return page_ext_data(page_ext, &page_owner_ops); } static noinline depot_stack_handle_t save_stack(gfp_t flags) { unsigned long entries[PAGE_OWNER_STACK_DEPTH]; depot_stack_handle_t handle; unsigned int nr_entries; if (current->in_page_owner) return dummy_handle; set_current_in_page_owner(); nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2); handle = stack_depot_save(entries, nr_entries, flags); if (!handle) handle = failure_handle; unset_current_in_page_owner(); return handle; } static void add_stack_record_to_list(struct stack_record *stack_record, gfp_t gfp_mask) { unsigned long flags; struct stack *stack; if (!gfpflags_allow_spinning(gfp_mask)) return; set_current_in_page_owner(); stack = kmalloc_obj(*stack, gfp_nested_mask(gfp_mask)); if (!stack) { unset_current_in_page_owner(); return; } unset_current_in_page_owner(); stack->stack_record = stack_record; stack->next = NULL; spin_lock_irqsave(&stack_list_lock, flags); stack->next = stack_list; /* * This pairs with smp_load_acquire() from function * stack_start(). This guarantees that stack_start() * will see an updated stack_list before starting to * traverse the list. */ smp_store_release(&stack_list, stack); spin_unlock_irqrestore(&stack_list_lock, flags); } static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask, int nr_base_pages) { struct stack_record *stack_record = __stack_depot_get_stack_record(handle); if (!stack_record) return; /* * New stack_record's that do not use STACK_DEPOT_FLAG_GET start * with REFCOUNT_SATURATED to catch spurious increments of their * refcount. * Since we do not use STACK_DEPOT_FLAG_GET API, let us * set a refcount of 1 ourselves. */ if (refcount_read(&stack_record->count) == REFCOUNT_SATURATED) { int old = REFCOUNT_SATURATED; if (atomic_try_cmpxchg_relaxed(&stack_record->count.refs, &old, 1)) /* Add the new stack_record to our list */ add_stack_record_to_list(stack_record, gfp_mask); } refcount_add(nr_base_pages, &stack_record->count); } static void dec_stack_record_count(depot_stack_handle_t handle, int nr_base_pages) { struct stack_record *stack_record = __stack_depot_get_stack_record(handle); if (!stack_record) return; if (refcount_sub_and_test(nr_base_pages, &stack_record->count)) pr_warn("%s: refcount went to 0 for %u handle\n", __func__, handle); } static inline void __update_page_owner_handle(struct page *page, depot_stack_handle_t handle, unsigned short order, gfp_t gfp_mask, short last_migrate_reason, u64 ts_nsec, pid_t pid, pid_t tgid, char *comm) { struct page_ext_iter iter; struct page_ext *page_ext; struct page_owner *page_owner; rcu_read_lock(); for_each_page_ext(page, 1 << order, page_ext, iter) { page_owner = get_page_owner(page_ext); page_owner->handle = handle; page_owner->order = order; page_owner->gfp_mask = gfp_mask; page_owner->last_migrate_reason = last_migrate_reason; page_owner->pid = pid; page_owner->tgid = tgid; page_owner->ts_nsec = ts_nsec; strscpy(page_owner->comm, comm, sizeof(page_owner->comm)); __set_bit(PAGE_EXT_OWNER, &page_ext->flags); __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); } rcu_read_unlock(); } static inline void __update_page_owner_free_handle(struct page *page, depot_stack_handle_t handle, unsigned short order, pid_t pid, pid_t tgid, u64 free_ts_nsec) { struct page_ext_iter iter; struct page_ext *page_ext; struct page_owner *page_owner; rcu_read_lock(); for_each_page_ext(page, 1 << order, page_ext, iter) { page_owner = get_page_owner(page_ext); /* Only __reset_page_owner() wants to clear the bit */ if (handle) { __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); page_owner->free_handle = handle; } page_owner->free_ts_nsec = free_ts_nsec; page_owner->free_pid = current->pid; page_owner->free_tgid = current->tgid; } rcu_read_unlock(); } void __reset_page_owner(struct page *page, unsigned short order) { struct page_ext *page_ext; depot_stack_handle_t handle; depot_stack_handle_t alloc_handle; struct page_owner *page_owner; u64 free_ts_nsec = local_clock(); page_ext = page_ext_get(page); if (unlikely(!page_ext)) return; page_owner = get_page_owner(page_ext); alloc_handle = page_owner->handle; page_ext_put(page_ext); /* * Do not specify GFP_NOWAIT to make gfpflags_allow_spinning() == false * to prevent issues in stack_depot_save(). * This is similar to alloc_pages_nolock() gfp flags, but only used * to signal stack_depot to avoid spin_locks. */ handle = save_stack(__GFP_NOWARN); __update_page_owner_free_handle(page, handle, order, current->pid, current->tgid, free_ts_nsec); if (alloc_handle != early_handle) /* * early_handle is being set as a handle for all those * early allocated pages. See init_pages_in_zone(). * Since their refcount is not being incremented because * the machinery is not ready yet, we cannot decrement * their refcount either. */ dec_stack_record_count(alloc_handle, 1 << order); } noinline void __set_page_owner(struct page *page, unsigned short order, gfp_t gfp_mask) { u64 ts_nsec = local_clock(); depot_stack_handle_t handle; handle = save_stack(gfp_mask); __update_page_owner_handle(page, handle, order, gfp_mask, -1, ts_nsec, current->pid, current->tgid, current->comm); inc_stack_record_count(handle, gfp_mask, 1 << order); } void __folio_set_owner_migrate_reason(struct folio *folio, int reason) { struct page_ext *page_ext = page_ext_get(&folio->page); struct page_owner *page_owner; if (unlikely(!page_ext)) return; page_owner = get_page_owner(page_ext); page_owner->last_migrate_reason = reason; page_ext_put(page_ext); } void __split_page_owner(struct page *page, int old_order, int new_order) { struct page_ext_iter iter; struct page_ext *page_ext; struct page_owner *page_owner; rcu_read_lock(); for_each_page_ext(page, 1 << old_order, page_ext, iter) { page_owner = get_page_owner(page_ext); page_owner->order = new_order; } rcu_read_unlock(); } void __folio_copy_owner(struct folio *newfolio, struct folio *old) { struct page_ext *page_ext; struct page_ext_iter iter; struct page_owner *old_page_owner; struct page_owner *new_page_owner; depot_stack_handle_t migrate_handle; page_ext = page_ext_get(&old->page); if (unlikely(!page_ext)) return; old_page_owner = get_page_owner(page_ext); page_ext_put(page_ext); page_ext = page_ext_get(&newfolio->page); if (unlikely(!page_ext)) return; new_page_owner = get_page_owner(page_ext); page_ext_put(page_ext); migrate_handle = new_page_owner->handle; __update_page_owner_handle(&newfolio->page, old_page_owner->handle, old_page_owner->order, old_page_owner->gfp_mask, old_page_owner->last_migrate_reason, old_page_owner->ts_nsec, old_page_owner->pid, old_page_owner->tgid, old_page_owner->comm); /* * Do not proactively clear PAGE_EXT_OWNER{_ALLOCATED} bits as the folio * will be freed after migration. Keep them until then as they may be * useful. */ __update_page_owner_free_handle(&newfolio->page, 0, old_page_owner->order, old_page_owner->free_pid, old_page_owner->free_tgid, old_page_owner->free_ts_nsec); /* * We linked the original stack to the new folio, we need to do the same * for the new one and the old folio otherwise there will be an imbalance * when subtracting those pages from the stack. */ rcu_read_lock(); for_each_page_ext(&old->page, 1 << new_page_owner->order, page_ext, iter) { old_page_owner = get_page_owner(page_ext); old_page_owner->handle = migrate_handle; } rcu_read_unlock(); } void pagetypeinfo_showmixedcount_print(struct seq_file *m, pg_data_t *pgdat, struct zone *zone) { struct page *page; struct page_ext *page_ext; struct page_owner *page_owner; unsigned long pfn, block_end_pfn; unsigned long end_pfn = zone_end_pfn(zone); unsigned long count[MIGRATE_TYPES] = { 0, }; int pageblock_mt, page_mt; int i; /* Scan block by block. First and last block may be incomplete */ pfn = zone->zone_start_pfn; /* * Walk the zone in pageblock_nr_pages steps. If a page block spans * a zone boundary, it will be double counted between zones. This does * not matter as the mixed block count will still be correct */ for (; pfn < end_pfn; ) { page = pfn_to_online_page(pfn); if (!page) { pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); continue; } block_end_pfn = pageblock_end_pfn(pfn); block_end_pfn = min(block_end_pfn, end_pfn); pageblock_mt = get_pageblock_migratetype(page); for (; pfn < block_end_pfn; pfn++) { /* The pageblock is online, no need to recheck. */ page = pfn_to_page(pfn); if (page_zone(page) != zone) continue; if (PageBuddy(page)) { unsigned long freepage_order; freepage_order = buddy_order_unsafe(page); if (freepage_order <= MAX_PAGE_ORDER) pfn += (1UL << freepage_order) - 1; continue; } if (PageReserved(page)) continue; page_ext = page_ext_get(page); if (unlikely(!page_ext)) continue; if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags)) goto ext_put_continue; page_owner = get_page_owner(page_ext); page_mt = gfp_migratetype(page_owner->gfp_mask); if (pageblock_mt != page_mt) { if (is_migrate_cma(pageblock_mt)) count[MIGRATE_MOVABLE]++; else count[pageblock_mt]++; pfn = block_end_pfn; page_ext_put(page_ext); break; } pfn += (1UL << page_owner->order) - 1; ext_put_continue: page_ext_put(page_ext); } } /* Print counts */ seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); for (i = 0; i < MIGRATE_TYPES; i++) seq_printf(m, "%12lu ", count[i]); seq_putc(m, '\n'); } /* * Looking for memcg information and print it out */ static inline int print_page_owner_memcg(char *kbuf, size_t count, int ret, struct page *page) { #ifdef CONFIG_MEMCG unsigned long memcg_data; struct mem_cgroup *memcg; bool online; char name[80]; rcu_read_lock(); memcg_data = READ_ONCE(page->memcg_data); if (!memcg_data || PageTail(page)) goto out_unlock; if (memcg_data & MEMCG_DATA_OBJEXTS) ret += scnprintf(kbuf + ret, count - ret, "Slab cache page\n"); memcg = page_memcg_check(page); if (!memcg) goto out_unlock; online = css_is_online(&memcg->css); cgroup_name(memcg->css.cgroup, name, sizeof(name)); ret += scnprintf(kbuf + ret, count - ret, "Charged %sto %smemcg %s\n", PageMemcgKmem(page) ? "(via objcg) " : "", online ? "" : "offline ", name); out_unlock: rcu_read_unlock(); #endif /* CONFIG_MEMCG */ return ret; } static ssize_t print_page_owner(char __user *buf, size_t count, unsigned long pfn, struct page *page, struct page_owner *page_owner, depot_stack_handle_t handle) { int ret, pageblock_mt, page_mt; char *kbuf; count = min_t(size_t, count, PAGE_SIZE); kbuf = kmalloc(count, GFP_KERNEL); if (!kbuf) return -ENOMEM; ret = scnprintf(kbuf, count, "Page allocated via order %u, mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu ns\n", page_owner->order, page_owner->gfp_mask, &page_owner->gfp_mask, page_owner->pid, page_owner->tgid, page_owner->comm, page_owner->ts_nsec); /* Print information relevant to grouping pages by mobility */ pageblock_mt = get_pageblock_migratetype(page); page_mt = gfp_migratetype(page_owner->gfp_mask); ret += scnprintf(kbuf + ret, count - ret, "PFN 0x%lx type %s Block %lu type %s Flags %pGp\n", pfn, migratetype_names[page_mt], pfn >> pageblock_order, migratetype_names[pageblock_mt], &page->flags); ret += stack_depot_snprint(handle, kbuf + ret, count - ret, 0); if (ret >= count) goto err; if (page_owner->last_migrate_reason != -1) { ret += scnprintf(kbuf + ret, count - ret, "Page has been migrated, last migrate reason: %s\n", migrate_reason_names[page_owner->last_migrate_reason]); } ret = print_page_owner_memcg(kbuf, count, ret, page); ret += snprintf(kbuf + ret, count - ret, "\n"); if (ret >= count) goto err; if (copy_to_user(buf, kbuf, ret)) ret = -EFAULT; kfree(kbuf); return ret; err: kfree(kbuf); return -ENOMEM; } void __dump_page_owner(const struct page *page) { struct page_ext *page_ext = page_ext_get((void *)page); struct page_owner *page_owner; depot_stack_handle_t handle; gfp_t gfp_mask; int mt; if (unlikely(!page_ext)) { pr_alert("There is not page extension available.\n"); return; } page_owner = get_page_owner(page_ext); gfp_mask = page_owner->gfp_mask; mt = gfp_migratetype(gfp_mask); if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { pr_alert("page_owner info is not present (never set?)\n"); page_ext_put(page_ext); return; } if (test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags)) pr_alert("page_owner tracks the page as allocated\n"); else pr_alert("page_owner tracks the page as freed\n"); pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu, free_ts %llu\n", page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask, page_owner->pid, page_owner->tgid, page_owner->comm, page_owner->ts_nsec, page_owner->free_ts_nsec); handle = READ_ONCE(page_owner->handle); if (!handle) pr_alert("page_owner allocation stack trace missing\n"); else stack_depot_print(handle); handle = READ_ONCE(page_owner->free_handle); if (!handle) { pr_alert("page_owner free stack trace missing\n"); } else { pr_alert("page last free pid %d tgid %d stack trace:\n", page_owner->free_pid, page_owner->free_tgid); stack_depot_print(handle); } if (page_owner->last_migrate_reason != -1) pr_alert("page has been migrated, last migrate reason: %s\n", migrate_reason_names[page_owner->last_migrate_reason]); page_ext_put(page_ext); } static ssize_t read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) { unsigned long pfn; struct page *page; struct page_ext *page_ext; struct page_owner *page_owner; depot_stack_handle_t handle; if (!static_branch_unlikely(&page_owner_inited)) return -EINVAL; page = NULL; if (*ppos == 0) pfn = min_low_pfn; else pfn = *ppos; /* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */ while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) pfn++; /* Find an allocated page */ for (; pfn < max_pfn; pfn++) { /* * This temporary page_owner is required so * that we can avoid the context switches while holding * the rcu lock and copying the page owner information to * user through copy_to_user() or GFP_KERNEL allocations. */ struct page_owner page_owner_tmp; /* * If the new page is in a new MAX_ORDER_NR_PAGES area, * validate the area as existing, skip it if not */ if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0 && !pfn_valid(pfn)) { pfn += MAX_ORDER_NR_PAGES - 1; continue; } page = pfn_to_page(pfn); if (PageBuddy(page)) { unsigned long freepage_order = buddy_order_unsafe(page); if (freepage_order <= MAX_PAGE_ORDER) pfn += (1UL << freepage_order) - 1; continue; } page_ext = page_ext_get(page); if (unlikely(!page_ext)) continue; /* * Some pages could be missed by concurrent allocation or free, * because we don't hold the zone lock. */ if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) goto ext_put_continue; /* * Although we do have the info about past allocation of free * pages, it's not relevant for current memory usage. */ if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags)) goto ext_put_continue; page_owner = get_page_owner(page_ext); /* * Don't print "tail" pages of high-order allocations as that * would inflate the stats. */ if (!IS_ALIGNED(pfn, 1 << page_owner->order)) goto ext_put_continue; /* * Access to page_ext->handle isn't synchronous so we should * be careful to access it. */ handle = READ_ONCE(page_owner->handle); if (!handle) goto ext_put_continue; /* Record the next PFN to read in the file offset */ *ppos = pfn + 1; page_owner_tmp = *page_owner; page_ext_put(page_ext); return print_page_owner(buf, count, pfn, page, &page_owner_tmp, handle); ext_put_continue: page_ext_put(page_ext); } return 0; } static loff_t lseek_page_owner(struct file *file, loff_t offset, int orig) { switch (orig) { case SEEK_SET: file->f_pos = offset; break; case SEEK_CUR: file->f_pos += offset; break; default: return -EINVAL; } return file->f_pos; } static void init_pages_in_zone(struct zone *zone) { unsigned long pfn = zone->zone_start_pfn; unsigned long end_pfn = zone_end_pfn(zone); unsigned long count = 0; /* * Walk the zone in pageblock_nr_pages steps. If a page block spans * a zone boundary, it will be double counted between zones. This does * not matter as the mixed block count will still be correct */ for (; pfn < end_pfn; ) { unsigned long block_end_pfn; if (!pfn_valid(pfn)) { pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); continue; } block_end_pfn = pageblock_end_pfn(pfn); block_end_pfn = min(block_end_pfn, end_pfn); for (; pfn < block_end_pfn; pfn++) { struct page *page = pfn_to_page(pfn); struct page_ext *page_ext; if (page_zone(page) != zone) continue; /* * To avoid having to grab zone->lock, be a little * careful when reading buddy page order. The only * danger is that we skip too much and potentially miss * some early allocated pages, which is better than * heavy lock contention. */ if (PageBuddy(page)) { unsigned long order = buddy_order_unsafe(page); if (order > 0 && order <= MAX_PAGE_ORDER) pfn += (1UL << order) - 1; continue; } if (PageReserved(page)) continue; page_ext = page_ext_get(page); if (unlikely(!page_ext)) continue; /* Maybe overlapping zone */ if (test_bit(PAGE_EXT_OWNER, &page_ext->flags)) goto ext_put_continue; /* Found early allocated page */ __update_page_owner_handle(page, early_handle, 0, 0, -1, local_clock(), current->pid, current->tgid, current->comm); count++; ext_put_continue: page_ext_put(page_ext); } cond_resched(); } pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n", zone->zone_pgdat->node_id, zone->name, count); } static void init_early_allocated_pages(void) { struct zone *zone; for_each_populated_zone(zone) init_pages_in_zone(zone); } static const struct file_operations page_owner_fops = { .read = read_page_owner, .llseek = lseek_page_owner, }; static void *stack_start(struct seq_file *m, loff_t *ppos) { struct stack *stack; struct stack_print_ctx *ctx = m->private; if (*ppos == -1UL) return NULL; if (!*ppos) { /* * This pairs with smp_store_release() from function * add_stack_record_to_list(), so we get a consistent * value of stack_list. */ stack = smp_load_acquire(&stack_list); ctx->stack = stack; } else { stack = ctx->stack; } return stack; } static void *stack_next(struct seq_file *m, void *v, loff_t *ppos) { struct stack *stack = v; struct stack_print_ctx *ctx = m->private; stack = stack->next; *ppos = stack ? *ppos + 1 : -1UL; ctx->stack = stack; return stack; } static unsigned long page_owner_pages_threshold; static int stack_print(struct seq_file *m, void *v) { int i, nr_base_pages; struct stack *stack = v; unsigned long *entries; unsigned long nr_entries; struct stack_record *stack_record = stack->stack_record; struct stack_print_ctx *ctx = m->private; if (!stack->stack_record) return 0; nr_base_pages = refcount_read(&stack_record->count) - 1; if (ctx->flags & STACK_PRINT_FLAG_PAGES && (nr_base_pages < 1 || nr_base_pages < page_owner_pages_threshold)) return 0; if (ctx->flags & STACK_PRINT_FLAG_STACK) { nr_entries = stack_record->size; entries = stack_record->entries; for (i = 0; i < nr_entries; i++) seq_printf(m, " %pS\n", (void *)entries[i]); } if (ctx->flags & STACK_PRINT_FLAG_HANDLE) seq_printf(m, "handle: %d\n", stack_record->handle.handle); if (ctx->flags & STACK_PRINT_FLAG_PAGES) seq_printf(m, "nr_base_pages: %d\n", nr_base_pages); seq_putc(m, '\n'); return 0; } static void stack_stop(struct seq_file *m, void *v) { } static const struct seq_operations page_owner_stack_op = { .start = stack_start, .next = stack_next, .stop = stack_stop, .show = stack_print }; static int page_owner_stack_open(struct inode *inode, struct file *file) { int ret = seq_open_private(file, &page_owner_stack_op, sizeof(struct stack_print_ctx)); if (!ret) { struct seq_file *m = file->private_data; struct stack_print_ctx *ctx = m->private; ctx->flags = (uintptr_t) inode->i_private; } return ret; } static const struct file_operations page_owner_stack_fops = { .open = page_owner_stack_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release_private, }; static int page_owner_threshold_get(void *data, u64 *val) { *val = READ_ONCE(page_owner_pages_threshold); return 0; } static int page_owner_threshold_set(void *data, u64 val) { WRITE_ONCE(page_owner_pages_threshold, val); return 0; } DEFINE_SIMPLE_ATTRIBUTE(page_owner_threshold_fops, &page_owner_threshold_get, &page_owner_threshold_set, "%llu"); static int __init pageowner_init(void) { struct dentry *dir; if (!static_branch_unlikely(&page_owner_inited)) { pr_info("page_owner is disabled\n"); return 0; } debugfs_create_file("page_owner", 0400, NULL, NULL, &page_owner_fops); dir = debugfs_create_dir("page_owner_stacks", NULL); debugfs_create_file("show_stacks", 0400, dir, (void *)(STACK_PRINT_FLAG_STACK | STACK_PRINT_FLAG_PAGES), &page_owner_stack_fops); debugfs_create_file("show_handles", 0400, dir, (void *)(STACK_PRINT_FLAG_HANDLE | STACK_PRINT_FLAG_PAGES), &page_owner_stack_fops); debugfs_create_file("show_stacks_handles", 0400, dir, (void *)(STACK_PRINT_FLAG_STACK | STACK_PRINT_FLAG_HANDLE), &page_owner_stack_fops); debugfs_create_file("count_threshold", 0600, dir, NULL, &page_owner_threshold_fops); return 0; } late_initcall(pageowner_init)
4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux I2C core * * Copyright (C) 1995-99 Simon G. Vogl * With some changes from Kyösti Mälkki <kmalkki@cc.hut.fi> * Mux support by Rodolfo Giometti <giometti@enneenne.com> and * Michael Lawnick <michael.lawnick.ext@nsn.com> * * Copyright (C) 2013-2017 Wolfram Sang <wsa@kernel.org> */ #define pr_fmt(fmt) "i2c-core: " fmt #include <dt-bindings/i2c/i2c.h> #include <linux/acpi.h> #include <linux/clk/clk-conf.h> #include <linux/completion.h> #include <linux/debugfs.h> #include <linux/delay.h> #include <linux/err.h> #include <linux/errno.h> #include <linux/gpio/consumer.h> #include <linux/i2c.h> #include <linux/i2c-smbus.h> #include <linux/idr.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/irq.h> #include <linux/jump_label.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/of_device.h> #include <linux/of.h> #include <linux/pinctrl/consumer.h> #include <linux/pinctrl/devinfo.h> #include <linux/pm_domain.h> #include <linux/pm_runtime.h> #include <linux/pm_wakeirq.h> #include <linux/property.h> #include <linux/rwsem.h> #include <linux/slab.h> #include <linux/string_choices.h> #include "i2c-core.h" #define CREATE_TRACE_POINTS #include <trace/events/i2c.h> #define I2C_ADDR_OFFSET_TEN_BIT 0xa000 #define I2C_ADDR_OFFSET_SLAVE 0x1000 #define I2C_ADDR_7BITS_MAX 0x77 #define I2C_ADDR_7BITS_COUNT (I2C_ADDR_7BITS_MAX + 1) #define I2C_ADDR_DEVICE_ID 0x7c /* * core_lock protects i2c_adapter_idr, and guarantees that device detection, * deletion of detected devices are serialized */ static DEFINE_MUTEX(core_lock); static DEFINE_IDR(i2c_adapter_idr); static int i2c_detect(struct i2c_adapter *adapter, struct i2c_driver *driver); static DEFINE_STATIC_KEY_FALSE(i2c_trace_msg_key); static bool is_registered; static struct dentry *i2c_debugfs_root; int i2c_transfer_trace_reg(void) { static_branch_inc(&i2c_trace_msg_key); return 0; } void i2c_transfer_trace_unreg(void) { static_branch_dec(&i2c_trace_msg_key); } const char *i2c_freq_mode_string(u32 bus_freq_hz) { switch (bus_freq_hz) { case I2C_MAX_STANDARD_MODE_FREQ: return "Standard Mode (100 kHz)"; case I2C_MAX_FAST_MODE_FREQ: return "Fast Mode (400 kHz)"; case I2C_MAX_FAST_MODE_PLUS_FREQ: return "Fast Mode Plus (1.0 MHz)"; case I2C_MAX_TURBO_MODE_FREQ: return "Turbo Mode (1.4 MHz)"; case I2C_MAX_HIGH_SPEED_MODE_FREQ: return "High Speed Mode (3.4 MHz)"; case I2C_MAX_ULTRA_FAST_MODE_FREQ: return "Ultra Fast Mode (5.0 MHz)"; default: return "Unknown Mode"; } } EXPORT_SYMBOL_GPL(i2c_freq_mode_string); const struct i2c_device_id *i2c_match_id(const struct i2c_device_id *id, const struct i2c_client *client) { if (!(id && client)) return NULL; while (id->name[0]) { if (strcmp(client->name, id->name) == 0) return id; id++; } return NULL; } EXPORT_SYMBOL_GPL(i2c_match_id); const void *i2c_get_match_data(const struct i2c_client *client) { struct i2c_driver *driver = to_i2c_driver(client->dev.driver); const struct i2c_device_id *match; const void *data; data = device_get_match_data(&client->dev); if (!data) { match = i2c_match_id(driver->id_table, client); if (!match) return NULL; data = (const void *)match->driver_data; } return data; } EXPORT_SYMBOL(i2c_get_match_data); static int i2c_device_match(struct device *dev, const struct device_driver *drv) { struct i2c_client *client = i2c_verify_client(dev); const struct i2c_driver *driver; /* Attempt an OF style match */ if (i2c_of_match_device(drv->of_match_table, client)) return 1; /* Then ACPI style match */ if (acpi_driver_match_device(dev, drv)) return 1; driver = to_i2c_driver(drv); /* Finally an I2C match */ if (i2c_match_id(driver->id_table, client)) return 1; return 0; } static int i2c_device_uevent(const struct device *dev, struct kobj_uevent_env *env) { const struct i2c_client *client = to_i2c_client(dev); int rc; rc = of_device_uevent_modalias(dev, env); if (rc != -ENODEV) return rc; rc = acpi_device_uevent_modalias(dev, env); if (rc != -ENODEV) return rc; return add_uevent_var(env, "MODALIAS=%s%s", I2C_MODULE_PREFIX, client->name); } /* i2c bus recovery routines */ static int get_scl_gpio_value(struct i2c_adapter *adap) { return gpiod_get_value_cansleep(adap->bus_recovery_info->scl_gpiod); } static void set_scl_gpio_value(struct i2c_adapter *adap, int val) { gpiod_set_value_cansleep(adap->bus_recovery_info->scl_gpiod, val); } static int get_sda_gpio_value(struct i2c_adapter *adap) { return gpiod_get_value_cansleep(adap->bus_recovery_info->sda_gpiod); } static void set_sda_gpio_value(struct i2c_adapter *adap, int val) { gpiod_set_value_cansleep(adap->bus_recovery_info->sda_gpiod, val); } static int i2c_generic_bus_free(struct i2c_adapter *adap) { struct i2c_bus_recovery_info *bri = adap->bus_recovery_info; int ret = -EOPNOTSUPP; if (bri->get_bus_free) ret = bri->get_bus_free(adap); else if (bri->get_sda) ret = bri->get_sda(adap); if (ret < 0) return ret; return ret ? 0 : -EBUSY; } /* * We are generating clock pulses. ndelay() determines durating of clk pulses. * We will generate clock with rate 100 KHz and so duration of both clock levels * is: delay in ns = (10^6 / 100) / 2 */ #define RECOVERY_NDELAY 5000 #define RECOVERY_CLK_CNT 9 int i2c_generic_scl_recovery(struct i2c_adapter *adap) { struct i2c_bus_recovery_info *bri = adap->bus_recovery_info; int i = 0, scl = 1, ret = 0; if (bri->prepare_recovery) bri->prepare_recovery(adap); if (bri->pinctrl) pinctrl_select_state(bri->pinctrl, bri->pins_gpio); /* * If we can set SDA, we will always create a STOP to ensure additional * pulses will do no harm. This is achieved by letting SDA follow SCL * half a cycle later. Check the 'incomplete_write_byte' fault injector * for details. Note that we must honour tsu:sto, 4us, but lets use 5us * here for simplicity. */ bri->set_scl(adap, scl); ndelay(RECOVERY_NDELAY); if (bri->set_sda) bri->set_sda(adap, scl); ndelay(RECOVERY_NDELAY / 2); /* * By this time SCL is high, as we need to give 9 falling-rising edges */ while (i++ < RECOVERY_CLK_CNT * 2) { if (scl) { /* SCL shouldn't be low here */ if (!bri->get_scl(adap)) { dev_err(&adap->dev, "SCL is stuck low, exit recovery\n"); ret = -EBUSY; break; } } scl = !scl; bri->set_scl(adap, scl); /* Creating STOP again, see above */ if (scl) { /* Honour minimum tsu:sto */ ndelay(RECOVERY_NDELAY); } else { /* Honour minimum tf and thd:dat */ ndelay(RECOVERY_NDELAY / 2); } if (bri->set_sda) bri->set_sda(adap, scl); ndelay(RECOVERY_NDELAY / 2); if (scl) { ret = i2c_generic_bus_free(adap); if (ret == 0) break; } } /* If we can't check bus status, assume recovery worked */ if (ret == -EOPNOTSUPP) ret = 0; if (bri->unprepare_recovery) bri->unprepare_recovery(adap); if (bri->pinctrl) pinctrl_select_state(bri->pinctrl, bri->pins_default); return ret; } EXPORT_SYMBOL_GPL(i2c_generic_scl_recovery); int i2c_recover_bus(struct i2c_adapter *adap) { if (!adap->bus_recovery_info) return -EBUSY; dev_dbg(&adap->dev, "Trying i2c bus recovery\n"); return adap->bus_recovery_info->recover_bus(adap); } EXPORT_SYMBOL_GPL(i2c_recover_bus); static void i2c_gpio_init_pinctrl_recovery(struct i2c_adapter *adap) { struct i2c_bus_recovery_info *bri = adap->bus_recovery_info; struct device *dev = &adap->dev; struct pinctrl *p = bri->pinctrl ?: dev_pinctrl(dev->parent); bri->pinctrl = p; /* * we can't change states without pinctrl, so remove the states if * populated */ if (!p) { bri->pins_default = NULL; bri->pins_gpio = NULL; return; } if (!bri->pins_default) { bri->pins_default = pinctrl_lookup_state(p, PINCTRL_STATE_DEFAULT); if (IS_ERR(bri->pins_default)) { dev_dbg(dev, PINCTRL_STATE_DEFAULT " state not found for GPIO recovery\n"); bri->pins_default = NULL; } } if (!bri->pins_gpio) { bri->pins_gpio = pinctrl_lookup_state(p, "gpio"); if (IS_ERR(bri->pins_gpio)) bri->pins_gpio = pinctrl_lookup_state(p, "recovery"); if (IS_ERR(bri->pins_gpio)) { dev_dbg(dev, "no gpio or recovery state found for GPIO recovery\n"); bri->pins_gpio = NULL; } } /* for pinctrl state changes, we need all the information */ if (bri->pins_default && bri->pins_gpio) { dev_info(dev, "using pinctrl states for GPIO recovery"); } else { bri->pinctrl = NULL; bri->pins_default = NULL; bri->pins_gpio = NULL; } } static int i2c_gpio_init_generic_recovery(struct i2c_adapter *adap) { struct i2c_bus_recovery_info *bri = adap->bus_recovery_info; struct device *dev = &adap->dev; struct gpio_desc *gpiod; int ret = 0; /* * don't touch the recovery information if the driver is not using * generic SCL recovery */ if (bri->recover_bus && bri->recover_bus != i2c_generic_scl_recovery) return 0; /* * pins might be taken as GPIO, so we should inform pinctrl about * this and move the state to GPIO */ if (bri->pinctrl) pinctrl_select_state(bri->pinctrl, bri->pins_gpio); /* * if there is incomplete or no recovery information, see if generic * GPIO recovery is available */ if (!bri->scl_gpiod) { gpiod = devm_gpiod_get(dev, "scl", GPIOD_OUT_HIGH_OPEN_DRAIN); if (PTR_ERR(gpiod) == -EPROBE_DEFER) { ret = -EPROBE_DEFER; goto cleanup_pinctrl_state; } if (!IS_ERR(gpiod)) { bri->scl_gpiod = gpiod; bri->recover_bus = i2c_generic_scl_recovery; dev_info(dev, "using generic GPIOs for recovery\n"); } } /* SDA GPIOD line is optional, so we care about DEFER only */ if (!bri->sda_gpiod) { /* * We have SCL. Pull SCL low and wait a bit so that SDA glitches * have no effect. */ gpiod_direction_output(bri->scl_gpiod, 0); udelay(10); gpiod = devm_gpiod_get(dev, "sda", GPIOD_IN); /* Wait a bit in case of a SDA glitch, and then release SCL. */ udelay(10); gpiod_direction_output(bri->scl_gpiod, 1); if (PTR_ERR(gpiod) == -EPROBE_DEFER) { ret = -EPROBE_DEFER; goto cleanup_pinctrl_state; } if (!IS_ERR(gpiod)) bri->sda_gpiod = gpiod; } cleanup_pinctrl_state: /* change the state of the pins back to their default state */ if (bri->pinctrl) pinctrl_select_state(bri->pinctrl, bri->pins_default); return ret; } static int i2c_gpio_init_recovery(struct i2c_adapter *adap) { i2c_gpio_init_pinctrl_recovery(adap); return i2c_gpio_init_generic_recovery(adap); } static int i2c_init_recovery(struct i2c_adapter *adap) { struct i2c_bus_recovery_info *bri = adap->bus_recovery_info; bool is_error_level = true; char *err_str; if (!bri) return 0; if (i2c_gpio_init_recovery(adap) == -EPROBE_DEFER) return -EPROBE_DEFER; if (!bri->recover_bus) { err_str = "no suitable method provided"; is_error_level = false; goto err; } if (bri->scl_gpiod && bri->recover_bus == i2c_generic_scl_recovery) { bri->get_scl = get_scl_gpio_value; bri->set_scl = set_scl_gpio_value; if (bri->sda_gpiod) { bri->get_sda = get_sda_gpio_value; /* FIXME: add proper flag instead of '0' once available */ if (gpiod_get_direction(bri->sda_gpiod) == 0) bri->set_sda = set_sda_gpio_value; } } else if (bri->recover_bus == i2c_generic_scl_recovery) { /* Generic SCL recovery */ if (!bri->set_scl || !bri->get_scl) { err_str = "no {get|set}_scl() found"; goto err; } if (!bri->set_sda && !bri->get_sda) { err_str = "either get_sda() or set_sda() needed"; goto err; } } return 0; err: if (is_error_level) dev_err(&adap->dev, "Not using recovery: %s\n", err_str); else dev_dbg(&adap->dev, "Not using recovery: %s\n", err_str); adap->bus_recovery_info = NULL; return -EINVAL; } static int i2c_smbus_host_notify_to_irq(const struct i2c_client *client) { struct i2c_adapter *adap = client->adapter; unsigned int irq; if (!adap->host_notify_domain) return -ENXIO; if (client->flags & I2C_CLIENT_TEN) return -EINVAL; irq = irq_create_mapping(adap->host_notify_domain, client->addr); return irq > 0 ? irq : -ENXIO; } static int i2c_device_probe(struct device *dev) { struct fwnode_handle *fwnode = dev_fwnode(dev); struct i2c_client *client = i2c_verify_client(dev); struct i2c_driver *driver; bool do_power_on; int status; if (!client) return 0; client->irq = client->init_irq; if (!client->irq) { int irq = -ENOENT; if (client->flags & I2C_CLIENT_HOST_NOTIFY) { dev_dbg(dev, "Using Host Notify IRQ\n"); /* Keep adapter active when Host Notify is required */ pm_runtime_get_sync(&client->adapter->dev); irq = i2c_smbus_host_notify_to_irq(client); } else if (is_of_node(fwnode)) { irq = fwnode_irq_get_byname(fwnode, "irq"); if (irq == -EINVAL || irq == -ENODATA) irq = fwnode_irq_get(fwnode, 0); } else if (is_acpi_device_node(fwnode)) { bool wake_capable; irq = i2c_acpi_get_irq(client, &wake_capable); if (irq > 0 && wake_capable) client->flags |= I2C_CLIENT_WAKE; } if (irq == -EPROBE_DEFER) { status = dev_err_probe(dev, irq, "can't get irq\n"); goto put_sync_adapter; } if (irq < 0) irq = 0; client->irq = irq; } driver = to_i2c_driver(dev->driver); /* * An I2C ID table is not mandatory, if and only if, a suitable OF * or ACPI ID table is supplied for the probing device. */ if (!driver->id_table && !acpi_driver_match_device(dev, dev->driver) && !i2c_of_match_device(dev->driver->of_match_table, client)) { status = -ENODEV; goto put_sync_adapter; } if (client->flags & I2C_CLIENT_WAKE) { int wakeirq; wakeirq = fwnode_irq_get_byname(fwnode, "wakeup"); if (wakeirq == -EPROBE_DEFER) { status = dev_err_probe(dev, wakeirq, "can't get wakeirq\n"); goto put_sync_adapter; } device_init_wakeup(&client->dev, true); if (wakeirq > 0 && wakeirq != client->irq) status = dev_pm_set_dedicated_wake_irq(dev, wakeirq); else if (client->irq > 0) status = dev_pm_set_wake_irq(dev, client->irq); else status = 0; if (status) dev_warn(&client->dev, "failed to set up wakeup irq\n"); } dev_dbg(dev, "probe\n"); status = of_clk_set_defaults(to_of_node(fwnode), false); if (status < 0) goto err_clear_wakeup_irq; do_power_on = !i2c_acpi_waive_d0_probe(dev); status = dev_pm_domain_attach(&client->dev, PD_FLAG_DETACH_POWER_OFF | (do_power_on ? PD_FLAG_ATTACH_POWER_ON : 0)); if (status) goto err_clear_wakeup_irq; client->devres_group_id = devres_open_group(&client->dev, NULL, GFP_KERNEL); if (!client->devres_group_id) { status = -ENOMEM; goto err_clear_wakeup_irq; } client->debugfs = debugfs_create_dir(dev_name(&client->dev), client->adapter->debugfs); if (driver->probe) status = driver->probe(client); else status = -EINVAL; /* * Note that we are not closing the devres group opened above so * even resources that were attached to the device after probe is * run are released when i2c_device_remove() is executed. This is * needed as some drivers would allocate additional resources, * for example when updating firmware. */ if (status) goto err_release_driver_resources; return 0; err_release_driver_resources: debugfs_remove_recursive(client->debugfs); devres_release_group(&client->dev, client->devres_group_id); err_clear_wakeup_irq: dev_pm_clear_wake_irq(&client->dev); device_init_wakeup(&client->dev, false); put_sync_adapter: if (client->flags & I2C_CLIENT_HOST_NOTIFY) pm_runtime_put_sync(&client->adapter->dev); return status; } static void i2c_device_remove(struct device *dev) { struct i2c_client *client = to_i2c_client(dev); struct i2c_driver *driver; driver = to_i2c_driver(dev->driver); if (driver->remove) { dev_dbg(dev, "remove\n"); driver->remove(client); } debugfs_remove_recursive(client->debugfs); devres_release_group(&client->dev, client->devres_group_id); dev_pm_clear_wake_irq(&client->dev); device_init_wakeup(&client->dev, false); client->irq = 0; if (client->flags & I2C_CLIENT_HOST_NOTIFY) pm_runtime_put(&client->adapter->dev); } static void i2c_device_shutdown(struct device *dev) { struct i2c_client *client = i2c_verify_client(dev); struct i2c_driver *driver; if (!client || !dev->driver) return; driver = to_i2c_driver(dev->driver); if (driver->shutdown) driver->shutdown(client); else if (client->irq > 0) disable_irq(client->irq); } static void i2c_client_dev_release(struct device *dev) { kfree(to_i2c_client(dev)); } static ssize_t name_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%s\n", dev->type == &i2c_client_type ? to_i2c_client(dev)->name : to_i2c_adapter(dev)->name); } static DEVICE_ATTR_RO(name); static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, char *buf) { struct i2c_client *client = to_i2c_client(dev); int len; len = of_device_modalias(dev, buf, PAGE_SIZE); if (len != -ENODEV) return len; len = acpi_device_modalias(dev, buf, PAGE_SIZE - 1); if (len != -ENODEV) return len; return sprintf(buf, "%s%s\n", I2C_MODULE_PREFIX, client->name); } static DEVICE_ATTR_RO(modalias); static struct attribute *i2c_dev_attrs[] = { &dev_attr_name.attr, /* modalias helps coldplug: modprobe $(cat .../modalias) */ &dev_attr_modalias.attr, NULL }; ATTRIBUTE_GROUPS(i2c_dev); const struct bus_type i2c_bus_type = { .name = "i2c", .match = i2c_device_match, .probe = i2c_device_probe, .remove = i2c_device_remove, .shutdown = i2c_device_shutdown, }; EXPORT_SYMBOL_GPL(i2c_bus_type); const struct device_type i2c_client_type = { .groups = i2c_dev_groups, .uevent = i2c_device_uevent, .release = i2c_client_dev_release, }; EXPORT_SYMBOL_GPL(i2c_client_type); /** * i2c_verify_client - return parameter as i2c_client, or NULL * @dev: device, probably from some driver model iterator * * When traversing the driver model tree, perhaps using driver model * iterators like @device_for_each_child(), you can't assume very much * about the nodes you find. Use this function to avoid oopses caused * by wrongly treating some non-I2C device as an i2c_client. */ struct i2c_client *i2c_verify_client(struct device *dev) { return (dev->type == &i2c_client_type) ? to_i2c_client(dev) : NULL; } EXPORT_SYMBOL(i2c_verify_client); /* Return a unique address which takes the flags of the client into account */ static unsigned short i2c_encode_flags_to_addr(struct i2c_client *client) { unsigned short addr = client->addr; /* For some client flags, add an arbitrary offset to avoid collisions */ if (client->flags & I2C_CLIENT_TEN) addr |= I2C_ADDR_OFFSET_TEN_BIT; if (client->flags & I2C_CLIENT_SLAVE) addr |= I2C_ADDR_OFFSET_SLAVE; return addr; } /* This is a permissive address validity check, I2C address map constraints * are purposely not enforced, except for the general call address. */ static int i2c_check_addr_validity(unsigned int addr, unsigned short flags) { if (flags & I2C_CLIENT_TEN) { /* 10-bit address, all values are valid */ if (addr > 0x3ff) return -EINVAL; } else { /* 7-bit address, reject the general call address */ if (addr == 0x00 || addr > 0x7f) return -EINVAL; } return 0; } /* And this is a strict address validity check, used when probing. If a * device uses a reserved address, then it shouldn't be probed. 7-bit * addressing is assumed, 10-bit address devices are rare and should be * explicitly enumerated. */ int i2c_check_7bit_addr_validity_strict(unsigned short addr) { /* * Reserved addresses per I2C specification: * 0x00 General call address / START byte * 0x01 CBUS address * 0x02 Reserved for different bus format * 0x03 Reserved for future purposes * 0x04-0x07 Hs-mode master code * 0x78-0x7b 10-bit slave addressing * 0x7c-0x7f Reserved for future purposes */ if (addr < 0x08 || addr > 0x77) return -EINVAL; return 0; } static int __i2c_check_addr_busy(struct device *dev, void *addrp) { struct i2c_client *client = i2c_verify_client(dev); int addr = *(int *)addrp; if (client && i2c_encode_flags_to_addr(client) == addr) return -EBUSY; return 0; } /* walk up mux tree */ static int i2c_check_mux_parents(struct i2c_adapter *adapter, int addr) { struct i2c_adapter *parent = i2c_parent_is_i2c_adapter(adapter); int result; result = device_for_each_child(&adapter->dev, &addr, __i2c_check_addr_busy); if (!result && parent) result = i2c_check_mux_parents(parent, addr); return result; } /* recurse down mux tree */ static int i2c_check_mux_children(struct device *dev, void *addrp) { int result; if (dev->type == &i2c_adapter_type) result = device_for_each_child(dev, addrp, i2c_check_mux_children); else result = __i2c_check_addr_busy(dev, addrp); return result; } static int i2c_check_addr_busy(struct i2c_adapter *adapter, int addr) { struct i2c_adapter *parent = i2c_parent_is_i2c_adapter(adapter); int result = 0; if (parent) result = i2c_check_mux_parents(parent, addr); if (!result) result = device_for_each_child(&adapter->dev, &addr, i2c_check_mux_children); return result; } /** * i2c_adapter_lock_bus - Get exclusive access to an I2C bus segment * @adapter: Target I2C bus segment * @flags: I2C_LOCK_ROOT_ADAPTER locks the root i2c adapter, I2C_LOCK_SEGMENT * locks only this branch in the adapter tree */ static void i2c_adapter_lock_bus(struct i2c_adapter *adapter, unsigned int flags) { rt_mutex_lock_nested(&adapter->bus_lock, i2c_adapter_depth(adapter)); } /** * i2c_adapter_trylock_bus - Try to get exclusive access to an I2C bus segment * @adapter: Target I2C bus segment * @flags: I2C_LOCK_ROOT_ADAPTER trylocks the root i2c adapter, I2C_LOCK_SEGMENT * trylocks only this branch in the adapter tree */ static int i2c_adapter_trylock_bus(struct i2c_adapter *adapter, unsigned int flags) { return rt_mutex_trylock(&adapter->bus_lock); } /** * i2c_adapter_unlock_bus - Release exclusive access to an I2C bus segment * @adapter: Target I2C bus segment * @flags: I2C_LOCK_ROOT_ADAPTER unlocks the root i2c adapter, I2C_LOCK_SEGMENT * unlocks only this branch in the adapter tree */ static void i2c_adapter_unlock_bus(struct i2c_adapter *adapter, unsigned int flags) { rt_mutex_unlock(&adapter->bus_lock); } static void i2c_dev_set_name(struct i2c_adapter *adap, struct i2c_client *client, struct i2c_board_info const *info) { struct acpi_device *adev = ACPI_COMPANION(&client->dev); if (info && info->dev_name) { dev_set_name(&client->dev, "i2c-%s", info->dev_name); return; } if (adev) { dev_set_name(&client->dev, "i2c-%s", acpi_dev_name(adev)); return; } dev_set_name(&client->dev, "%d-%04x", i2c_adapter_id(adap), i2c_encode_flags_to_addr(client)); } int i2c_dev_irq_from_resources(const struct resource *resources, unsigned int num_resources) { struct irq_data *irqd; int i; for (i = 0; i < num_resources; i++) { const struct resource *r = &resources[i]; if (resource_type(r) != IORESOURCE_IRQ) continue; if (r->flags & IORESOURCE_BITS) { irqd = irq_get_irq_data(r->start); if (!irqd) break; irqd_set_trigger_type(irqd, r->flags & IORESOURCE_BITS); } return r->start; } return 0; } /* * Serialize device instantiation in case it can be instantiated explicitly * and by auto-detection */ static int i2c_lock_addr(struct i2c_adapter *adap, unsigned short addr, unsigned short flags) { if (!(flags & I2C_CLIENT_TEN) && test_and_set_bit(addr, adap->addrs_in_instantiation)) return -EBUSY; return 0; } static void i2c_unlock_addr(struct i2c_adapter *adap, unsigned short addr, unsigned short flags) { if (!(flags & I2C_CLIENT_TEN)) clear_bit(addr, adap->addrs_in_instantiation); } /** * i2c_new_client_device - instantiate an i2c device * @adap: the adapter managing the device * @info: describes one I2C device; bus_num is ignored * Context: can sleep * * Create an i2c device. Binding is handled through driver model * probe()/remove() methods. A driver may be bound to this device when we * return from this function, or any later moment (e.g. maybe hotplugging will * load the driver module). This call is not appropriate for use by mainboard * initialization logic, which usually runs during an arch_initcall() long * before any i2c_adapter could exist. * * This returns the new i2c client, which may be saved for later use with * i2c_unregister_device(); or an ERR_PTR to describe the error. */ struct i2c_client * i2c_new_client_device(struct i2c_adapter *adap, struct i2c_board_info const *info) { struct fwnode_handle *fwnode = info->fwnode; struct i2c_client *client; bool need_put = false; int status; client = kzalloc_obj(*client); if (!client) return ERR_PTR(-ENOMEM); client->adapter = adap; client->dev.platform_data = info->platform_data; client->flags = info->flags; client->addr = info->addr; client->init_irq = info->irq; if (!client->init_irq) client->init_irq = i2c_dev_irq_from_resources(info->resources, info->num_resources); strscpy(client->name, info->type, sizeof(client->name)); status = i2c_check_addr_validity(client->addr, client->flags); if (status) { dev_err(&adap->dev, "Invalid %d-bit I2C address 0x%02hx\n", client->flags & I2C_CLIENT_TEN ? 10 : 7, client->addr); goto out_err_silent; } status = i2c_lock_addr(adap, client->addr, client->flags); if (status) goto out_err_silent; /* Check for address business */ status = i2c_check_addr_busy(adap, i2c_encode_flags_to_addr(client)); if (status) goto out_err; client->dev.parent = &client->adapter->dev; client->dev.bus = &i2c_bus_type; client->dev.type = &i2c_client_type; device_enable_async_suspend(&client->dev); device_set_node(&client->dev, fwnode_handle_get(fwnode)); if (info->swnode) { status = device_add_software_node(&client->dev, info->swnode); if (status) { dev_err(&adap->dev, "Failed to add software node to client %s: %d\n", client->name, status); goto out_err_put_fwnode; } } i2c_dev_set_name(adap, client, info); status = device_register(&client->dev); if (status) goto out_remove_swnode; dev_dbg(&adap->dev, "client [%s] registered with bus id %s\n", client->name, dev_name(&client->dev)); i2c_unlock_addr(adap, client->addr, client->flags); return client; out_remove_swnode: device_remove_software_node(&client->dev); need_put = true; out_err_put_fwnode: fwnode_handle_put(fwnode); out_err: dev_err(&adap->dev, "Failed to register i2c client %s at 0x%02x (%d)\n", client->name, client->addr, status); i2c_unlock_addr(adap, client->addr, client->flags); out_err_silent: if (need_put) put_device(&client->dev); else kfree(client); return ERR_PTR(status); } EXPORT_SYMBOL_GPL(i2c_new_client_device); /** * i2c_unregister_device - reverse effect of i2c_new_*_device() * @client: value returned from i2c_new_*_device() * Context: can sleep */ void i2c_unregister_device(struct i2c_client *client) { struct fwnode_handle *fwnode; if (IS_ERR_OR_NULL(client)) return; fwnode = dev_fwnode(&client->dev); if (is_of_node(fwnode)) of_node_clear_flag(to_of_node(fwnode), OF_POPULATED); else if (is_acpi_device_node(fwnode)) acpi_device_clear_enumerated(to_acpi_device_node(fwnode)); /* * If the primary fwnode is a software node it is free-ed by * device_remove_software_node() below, avoid double-free. */ if (!is_software_node(fwnode)) fwnode_handle_put(fwnode); device_remove_software_node(&client->dev); device_unregister(&client->dev); } EXPORT_SYMBOL_GPL(i2c_unregister_device); /** * i2c_find_device_by_fwnode() - find an i2c_client for the fwnode * @fwnode: &struct fwnode_handle corresponding to the &struct i2c_client * * Look up and return the &struct i2c_client corresponding to the @fwnode. * If no client can be found, or @fwnode is NULL, this returns NULL. * * The user must call put_device(&client->dev) once done with the i2c client. */ struct i2c_client *i2c_find_device_by_fwnode(struct fwnode_handle *fwnode) { struct i2c_client *client; struct device *dev; if (IS_ERR_OR_NULL(fwnode)) return NULL; dev = bus_find_device_by_fwnode(&i2c_bus_type, fwnode); if (!dev) return NULL; client = i2c_verify_client(dev); if (!client) put_device(dev); return client; } EXPORT_SYMBOL(i2c_find_device_by_fwnode); static const struct i2c_device_id dummy_id[] = { { "dummy", }, { "smbus_host_notify", }, { } }; static int dummy_probe(struct i2c_client *client) { return 0; } static struct i2c_driver dummy_driver = { .driver.name = "dummy", .probe = dummy_probe, .id_table = dummy_id, }; /** * i2c_new_dummy_device - return a new i2c device bound to a dummy driver * @adapter: the adapter managing the device * @address: seven bit address to be used * Context: can sleep * * This returns an I2C client bound to the "dummy" driver, intended for use * with devices that consume multiple addresses. Examples of such chips * include various EEPROMS (like 24c04 and 24c08 models). * * These dummy devices have two main uses. First, most I2C and SMBus calls * except i2c_transfer() need a client handle; the dummy will be that handle. * And second, this prevents the specified address from being bound to a * different driver. * * This returns the new i2c client, which should be saved for later use with * i2c_unregister_device(); or an ERR_PTR to describe the error. */ struct i2c_client *i2c_new_dummy_device(struct i2c_adapter *adapter, u16 address) { struct i2c_board_info info = { I2C_BOARD_INFO("dummy", address), }; return i2c_new_client_device(adapter, &info); } EXPORT_SYMBOL_GPL(i2c_new_dummy_device); static void devm_i2c_release_dummy(void *client) { i2c_unregister_device(client); } /** * devm_i2c_new_dummy_device - return a new i2c device bound to a dummy driver * @dev: device the managed resource is bound to * @adapter: the adapter managing the device * @address: seven bit address to be used * Context: can sleep * * This is the device-managed version of @i2c_new_dummy_device. It returns the * new i2c client or an ERR_PTR in case of an error. */ struct i2c_client *devm_i2c_new_dummy_device(struct device *dev, struct i2c_adapter *adapter, u16 address) { struct i2c_client *client; int ret; client = i2c_new_dummy_device(adapter, address); if (IS_ERR(client)) return client; ret = devm_add_action_or_reset(dev, devm_i2c_release_dummy, client); if (ret) return ERR_PTR(ret); return client; } EXPORT_SYMBOL_GPL(devm_i2c_new_dummy_device); /** * i2c_new_ancillary_device - Helper to get the instantiated secondary address * and create the associated device * @client: Handle to the primary client * @name: Handle to specify which secondary address to get * @default_addr: Used as a fallback if no secondary address was specified * Context: can sleep * * I2C clients can be composed of multiple I2C slaves bound together in a single * component. The I2C client driver then binds to the master I2C slave and needs * to create I2C dummy clients to communicate with all the other slaves. * * This function creates and returns an I2C dummy client whose I2C address is * retrieved from the platform firmware based on the given slave name. If no * address is specified by the firmware default_addr is used. * * On DT-based platforms the address is retrieved from the "reg" property entry * cell whose "reg-names" value matches the slave name. * * This returns the new i2c client, which should be saved for later use with * i2c_unregister_device(); or an ERR_PTR to describe the error. */ struct i2c_client *i2c_new_ancillary_device(struct i2c_client *client, const char *name, u16 default_addr) { struct device_node *np = client->dev.of_node; u32 addr = default_addr; int i; i = of_property_match_string(np, "reg-names", name); if (i >= 0) of_property_read_u32_index(np, "reg", i, &addr); dev_dbg(&client->adapter->dev, "Address for %s : 0x%x\n", name, addr); return i2c_new_dummy_device(client->adapter, addr); } EXPORT_SYMBOL_GPL(i2c_new_ancillary_device); /* ------------------------------------------------------------------------- */ /* I2C bus adapters -- one roots each I2C or SMBUS segment */ static void i2c_adapter_dev_release(struct device *dev) { struct i2c_adapter *adap = to_i2c_adapter(dev); complete(&adap->dev_released); } unsigned int i2c_adapter_depth(struct i2c_adapter *adapter) { unsigned int depth = 0; struct device *parent; for (parent = adapter->dev.parent; parent; parent = parent->parent) if (parent->type == &i2c_adapter_type) depth++; WARN_ONCE(depth >= MAX_LOCKDEP_SUBCLASSES, "adapter depth exceeds lockdep subclass limit\n"); return depth; } EXPORT_SYMBOL_GPL(i2c_adapter_depth); /* * Let users instantiate I2C devices through sysfs. This can be used when * platform initialization code doesn't contain the proper data for * whatever reason. Also useful for drivers that do device detection and * detection fails, either because the device uses an unexpected address, * or this is a compatible device with different ID register values. * * Parameter checking may look overzealous, but we really don't want * the user to provide incorrect parameters. */ static ssize_t new_device_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct i2c_adapter *adap = to_i2c_adapter(dev); struct i2c_board_info info; struct i2c_client *client; char *blank, end; int res; memset(&info, 0, sizeof(struct i2c_board_info)); blank = strchr(buf, ' '); if (!blank) { dev_err(dev, "%s: Missing parameters\n", "new_device"); return -EINVAL; } if (blank - buf > I2C_NAME_SIZE - 1) { dev_err(dev, "%s: Invalid device name\n", "new_device"); return -EINVAL; } memcpy(info.type, buf, blank - buf); /* Parse remaining parameters, reject extra parameters */ res = sscanf(++blank, "%hi%c", &info.addr, &end); if (res < 1) { dev_err(dev, "%s: Can't parse I2C address\n", "new_device"); return -EINVAL; } if (res > 1 && end != '\n') { dev_err(dev, "%s: Extra parameters\n", "new_device"); return -EINVAL; } if ((info.addr & I2C_ADDR_OFFSET_TEN_BIT) == I2C_ADDR_OFFSET_TEN_BIT) { info.addr &= ~I2C_ADDR_OFFSET_TEN_BIT; info.flags |= I2C_CLIENT_TEN; } if (info.addr & I2C_ADDR_OFFSET_SLAVE) { info.addr &= ~I2C_ADDR_OFFSET_SLAVE; info.flags |= I2C_CLIENT_SLAVE; } client = i2c_new_client_device(adap, &info); if (IS_ERR(client)) return PTR_ERR(client); /* Keep track of the added device */ mutex_lock(&adap->userspace_clients_lock); list_add_tail(&client->detected, &adap->userspace_clients); mutex_unlock(&adap->userspace_clients_lock); dev_info(dev, "%s: Instantiated device %s at 0x%02hx\n", "new_device", info.type, info.addr); return count; } static DEVICE_ATTR_WO(new_device); /* * And of course let the users delete the devices they instantiated, if * they got it wrong. This interface can only be used to delete devices * instantiated by i2c_sysfs_new_device above. This guarantees that we * don't delete devices to which some kernel code still has references. * * Parameter checking may look overzealous, but we really don't want * the user to delete the wrong device. */ static ssize_t delete_device_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct i2c_adapter *adap = to_i2c_adapter(dev); struct i2c_client *client, *next; unsigned short addr; char end; int res; /* Parse parameters, reject extra parameters */ res = sscanf(buf, "%hi%c", &addr, &end); if (res < 1) { dev_err(dev, "%s: Can't parse I2C address\n", "delete_device"); return -EINVAL; } if (res > 1 && end != '\n') { dev_err(dev, "%s: Extra parameters\n", "delete_device"); return -EINVAL; } /* Make sure the device was added through sysfs */ res = -ENOENT; mutex_lock_nested(&adap->userspace_clients_lock, i2c_adapter_depth(adap)); list_for_each_entry_safe(client, next, &adap->userspace_clients, detected) { if (i2c_encode_flags_to_addr(client) == addr) { dev_info(dev, "%s: Deleting device %s at 0x%02hx\n", "delete_device", client->name, client->addr); list_del(&client->detected); i2c_unregister_device(client); res = count; break; } } mutex_unlock(&adap->userspace_clients_lock); if (res < 0) dev_err(dev, "%s: Can't find device in list\n", "delete_device"); return res; } static DEVICE_ATTR_IGNORE_LOCKDEP(delete_device, S_IWUSR, NULL, delete_device_store); static struct attribute *i2c_adapter_attrs[] = { &dev_attr_name.attr, &dev_attr_new_device.attr, &dev_attr_delete_device.attr, NULL }; ATTRIBUTE_GROUPS(i2c_adapter); const struct device_type i2c_adapter_type = { .groups = i2c_adapter_groups, .release = i2c_adapter_dev_release, }; EXPORT_SYMBOL_GPL(i2c_adapter_type); /** * i2c_verify_adapter - return parameter as i2c_adapter or NULL * @dev: device, probably from some driver model iterator * * When traversing the driver model tree, perhaps using driver model * iterators like @device_for_each_child(), you can't assume very much * about the nodes you find. Use this function to avoid oopses caused * by wrongly treating some non-I2C device as an i2c_adapter. */ struct i2c_adapter *i2c_verify_adapter(struct device *dev) { return (dev->type == &i2c_adapter_type) ? to_i2c_adapter(dev) : NULL; } EXPORT_SYMBOL(i2c_verify_adapter); static void i2c_scan_static_board_info(struct i2c_adapter *adapter) { struct i2c_devinfo *devinfo; down_read(&__i2c_board_lock); list_for_each_entry(devinfo, &__i2c_board_list, list) { if (devinfo->busnum == adapter->nr && IS_ERR(i2c_new_client_device(adapter, &devinfo->board_info))) dev_err(&adapter->dev, "Can't create device at 0x%02x\n", devinfo->board_info.addr); } up_read(&__i2c_board_lock); } static int i2c_do_add_adapter(struct i2c_driver *driver, struct i2c_adapter *adap) { /* Detect supported devices on that bus, and instantiate them */ i2c_detect(adap, driver); return 0; } static int __process_new_adapter(struct device_driver *d, void *data) { return i2c_do_add_adapter(to_i2c_driver(d), data); } static const struct i2c_lock_operations i2c_adapter_lock_ops = { .lock_bus = i2c_adapter_lock_bus, .trylock_bus = i2c_adapter_trylock_bus, .unlock_bus = i2c_adapter_unlock_bus, }; static void i2c_host_notify_irq_teardown(struct i2c_adapter *adap) { struct irq_domain *domain = adap->host_notify_domain; irq_hw_number_t hwirq; if (!domain) return; for (hwirq = 0 ; hwirq < I2C_ADDR_7BITS_COUNT ; hwirq++) irq_dispose_mapping(irq_find_mapping(domain, hwirq)); irq_domain_remove(domain); adap->host_notify_domain = NULL; } static int i2c_host_notify_irq_map(struct irq_domain *h, unsigned int virq, irq_hw_number_t hw_irq_num) { irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq); return 0; } static const struct irq_domain_ops i2c_host_notify_irq_ops = { .map = i2c_host_notify_irq_map, }; static int i2c_setup_host_notify_irq_domain(struct i2c_adapter *adap) { struct irq_domain *domain; if (!i2c_check_functionality(adap, I2C_FUNC_SMBUS_HOST_NOTIFY)) return 0; domain = irq_domain_create_linear(dev_fwnode(adap->dev.parent), I2C_ADDR_7BITS_COUNT, &i2c_host_notify_irq_ops, adap); if (!domain) return -ENOMEM; adap->host_notify_domain = domain; return 0; } /** * i2c_handle_smbus_host_notify - Forward a Host Notify event to the correct * I2C client. * @adap: the adapter * @addr: the I2C address of the notifying device * Context: can't sleep * * Helper function to be called from an I2C bus driver's interrupt * handler. It will schedule the Host Notify IRQ. */ int i2c_handle_smbus_host_notify(struct i2c_adapter *adap, unsigned short addr) { int irq; if (!adap) return -EINVAL; dev_dbg(&adap->dev, "Detected HostNotify from address 0x%02x", addr); irq = irq_find_mapping(adap->host_notify_domain, addr); if (irq <= 0) return -ENXIO; generic_handle_irq_safe(irq); return 0; } EXPORT_SYMBOL_GPL(i2c_handle_smbus_host_notify); static int i2c_register_adapter(struct i2c_adapter *adap) { int res = -EINVAL; /* Can't register until after driver model init */ if (WARN_ON(!is_registered)) { res = -EAGAIN; goto out_list; } /* Sanity checks */ if (WARN(!adap->name[0], "i2c adapter has no name")) goto out_list; if (!adap->algo) { pr_err("adapter '%s': no algo supplied!\n", adap->name); goto out_list; } if (!adap->lock_ops) adap->lock_ops = &i2c_adapter_lock_ops; adap->locked_flags = 0; rt_mutex_init(&adap->bus_lock); rt_mutex_init(&adap->mux_lock); mutex_init(&adap->userspace_clients_lock); INIT_LIST_HEAD(&adap->userspace_clients); /* Set default timeout to 1 second if not already set */ if (adap->timeout == 0) adap->timeout = HZ; /* register soft irqs for Host Notify */ res = i2c_setup_host_notify_irq_domain(adap); if (res) { pr_err("adapter '%s': can't create Host Notify IRQs (%d)\n", adap->name, res); goto out_list; } dev_set_name(&adap->dev, "i2c-%d", adap->nr); adap->dev.bus = &i2c_bus_type; adap->dev.type = &i2c_adapter_type; device_initialize(&adap->dev); /* * This adapter can be used as a parent immediately after device_add(), * setup runtime-pm (especially ignore-children) before hand. */ device_enable_async_suspend(&adap->dev); pm_runtime_no_callbacks(&adap->dev); pm_suspend_ignore_children(&adap->dev, true); pm_runtime_enable(&adap->dev); res = device_add(&adap->dev); if (res) { pr_err("adapter '%s': can't register device (%d)\n", adap->name, res); put_device(&adap->dev); goto out_list; } adap->debugfs = debugfs_create_dir(dev_name(&adap->dev), i2c_debugfs_root); res = i2c_setup_smbus_alert(adap); if (res) goto out_reg; res = i2c_init_recovery(adap); if (res == -EPROBE_DEFER) goto out_reg; dev_dbg(&adap->dev, "adapter [%s] registered\n", adap->name); /* create pre-declared device nodes */ of_i2c_register_devices(adap); i2c_acpi_install_space_handler(adap); i2c_acpi_register_devices(adap); if (adap->nr < __i2c_first_dynamic_bus_num) i2c_scan_static_board_info(adap); /* Notify drivers */ mutex_lock(&core_lock); bus_for_each_drv(&i2c_bus_type, NULL, adap, __process_new_adapter); mutex_unlock(&core_lock); return 0; out_reg: debugfs_remove_recursive(adap->debugfs); init_completion(&adap->dev_released); device_unregister(&adap->dev); wait_for_completion(&adap->dev_released); out_list: mutex_lock(&core_lock); idr_remove(&i2c_adapter_idr, adap->nr); mutex_unlock(&core_lock); return res; } /** * __i2c_add_numbered_adapter - i2c_add_numbered_adapter where nr is never -1 * @adap: the adapter to register (with adap->nr initialized) * Context: can sleep * * See i2c_add_numbered_adapter() for details. */ static int __i2c_add_numbered_adapter(struct i2c_adapter *adap) { int id; mutex_lock(&core_lock); id = idr_alloc(&i2c_adapter_idr, adap, adap->nr, adap->nr + 1, GFP_KERNEL); mutex_unlock(&core_lock); if (WARN(id < 0, "couldn't get idr")) return id == -ENOSPC ? -EBUSY : id; return i2c_register_adapter(adap); } /** * i2c_add_adapter - declare i2c adapter, use dynamic bus number * @adapter: the adapter to add * Context: can sleep * * This routine is used to declare an I2C adapter when its bus number * doesn't matter or when its bus number is specified by an dt alias. * Examples of bases when the bus number doesn't matter: I2C adapters * dynamically added by USB links or PCI plugin cards. * * When this returns zero, a new bus number was allocated and stored * in adap->nr, and the specified adapter became available for clients. * Otherwise, a negative errno value is returned. */ int i2c_add_adapter(struct i2c_adapter *adapter) { struct device *dev = &adapter->dev; int id; id = of_alias_get_id(dev->of_node, "i2c"); if (id >= 0) { adapter->nr = id; return __i2c_add_numbered_adapter(adapter); } mutex_lock(&core_lock); id = idr_alloc(&i2c_adapter_idr, adapter, __i2c_first_dynamic_bus_num, 0, GFP_KERNEL); mutex_unlock(&core_lock); if (WARN(id < 0, "couldn't get idr")) return id; adapter->nr = id; return i2c_register_adapter(adapter); } EXPORT_SYMBOL(i2c_add_adapter); /** * i2c_add_numbered_adapter - declare i2c adapter, use static bus number * @adap: the adapter to register (with adap->nr initialized) * Context: can sleep * * This routine is used to declare an I2C adapter when its bus number * matters. For example, use it for I2C adapters from system-on-chip CPUs, * or otherwise built in to the system's mainboard, and where i2c_board_info * is used to properly configure I2C devices. * * If the requested bus number is set to -1, then this function will behave * identically to i2c_add_adapter, and will dynamically assign a bus number. * * If no devices have pre-been declared for this bus, then be sure to * register the adapter before any dynamically allocated ones. Otherwise * the required bus ID may not be available. * * When this returns zero, the specified adapter became available for * clients using the bus number provided in adap->nr. Also, the table * of I2C devices pre-declared using i2c_register_board_info() is scanned, * and the appropriate driver model device nodes are created. Otherwise, a * negative errno value is returned. */ int i2c_add_numbered_adapter(struct i2c_adapter *adap) { if (adap->nr == -1) /* -1 means dynamically assign bus id */ return i2c_add_adapter(adap); return __i2c_add_numbered_adapter(adap); } EXPORT_SYMBOL_GPL(i2c_add_numbered_adapter); static void i2c_do_del_adapter(struct i2c_driver *driver, struct i2c_adapter *adapter) { struct i2c_client *client, *_n; /* Remove the devices we created ourselves as the result of hardware * probing (using a driver's detect method) */ list_for_each_entry_safe(client, _n, &driver->clients, detected) { if (client->adapter == adapter) { dev_dbg(&adapter->dev, "Removing %s at 0x%x\n", client->name, client->addr); list_del(&client->detected); i2c_unregister_device(client); } } } static int __unregister_client(struct device *dev, void *dummy) { struct i2c_client *client = i2c_verify_client(dev); if (client && strcmp(client->name, "dummy")) i2c_unregister_device(client); return 0; } static int __unregister_dummy(struct device *dev, void *dummy) { struct i2c_client *client = i2c_verify_client(dev); i2c_unregister_device(client); return 0; } static int __process_removed_adapter(struct device_driver *d, void *data) { i2c_do_del_adapter(to_i2c_driver(d), data); return 0; } /** * i2c_del_adapter - unregister I2C adapter * @adap: the adapter being unregistered * Context: can sleep * * This unregisters an I2C adapter which was previously registered * by @i2c_add_adapter or @i2c_add_numbered_adapter. */ void i2c_del_adapter(struct i2c_adapter *adap) { struct i2c_adapter *found; struct i2c_client *client, *next; /* First make sure that this adapter was ever added */ mutex_lock(&core_lock); found = idr_find(&i2c_adapter_idr, adap->nr); mutex_unlock(&core_lock); if (found != adap) { pr_debug("attempting to delete unregistered adapter [%s]\n", adap->name); return; } i2c_acpi_remove_space_handler(adap); /* Tell drivers about this removal */ mutex_lock(&core_lock); bus_for_each_drv(&i2c_bus_type, NULL, adap, __process_removed_adapter); mutex_unlock(&core_lock); /* Remove devices instantiated from sysfs */ mutex_lock_nested(&adap->userspace_clients_lock, i2c_adapter_depth(adap)); list_for_each_entry_safe(client, next, &adap->userspace_clients, detected) { dev_dbg(&adap->dev, "Removing %s at 0x%x\n", client->name, client->addr); list_del(&client->detected); i2c_unregister_device(client); } mutex_unlock(&adap->userspace_clients_lock); /* Detach any active clients. This can't fail, thus we do not * check the returned value. This is a two-pass process, because * we can't remove the dummy devices during the first pass: they * could have been instantiated by real devices wishing to clean * them up properly, so we give them a chance to do that first. */ device_for_each_child(&adap->dev, NULL, __unregister_client); device_for_each_child(&adap->dev, NULL, __unregister_dummy); /* device name is gone after device_unregister */ dev_dbg(&adap->dev, "adapter [%s] unregistered\n", adap->name); pm_runtime_disable(&adap->dev); i2c_host_notify_irq_teardown(adap); debugfs_remove_recursive(adap->debugfs); /* wait until all references to the device are gone * * FIXME: This is old code and should ideally be replaced by an * alternative which results in decoupling the lifetime of the struct * device from the i2c_adapter, like spi or netdev do. Any solution * should be thoroughly tested with DEBUG_KOBJECT_RELEASE enabled! */ init_completion(&adap->dev_released); device_unregister(&adap->dev); wait_for_completion(&adap->dev_released); /* free bus id */ mutex_lock(&core_lock); idr_remove(&i2c_adapter_idr, adap->nr); mutex_unlock(&core_lock); /* Clear the device structure in case this adapter is ever going to be added again */ memset(&adap->dev, 0, sizeof(adap->dev)); } EXPORT_SYMBOL(i2c_del_adapter); static void devm_i2c_del_adapter(void *adapter) { i2c_del_adapter(adapter); } /** * devm_i2c_add_adapter - device-managed variant of i2c_add_adapter() * @dev: managing device for adding this I2C adapter * @adapter: the adapter to add * Context: can sleep * * Add adapter with dynamic bus number, same with i2c_add_adapter() * but the adapter will be auto deleted on driver detach. */ int devm_i2c_add_adapter(struct device *dev, struct i2c_adapter *adapter) { int ret; ret = i2c_add_adapter(adapter); if (ret) return ret; return devm_add_action_or_reset(dev, devm_i2c_del_adapter, adapter); } EXPORT_SYMBOL_GPL(devm_i2c_add_adapter); static int i2c_dev_or_parent_fwnode_match(struct device *dev, const void *data) { if (device_match_fwnode(dev, data)) return 1; if (dev->parent && device_match_fwnode(dev->parent, data)) return 1; return 0; } /** * i2c_find_adapter_by_fwnode() - find an i2c_adapter for the fwnode * @fwnode: &struct fwnode_handle corresponding to the &struct i2c_adapter * * Look up and return the &struct i2c_adapter corresponding to the @fwnode. * If no adapter can be found, or @fwnode is NULL, this returns NULL. * * The user must call put_device(&adapter->dev) once done with the i2c adapter. */ struct i2c_adapter *i2c_find_adapter_by_fwnode(struct fwnode_handle *fwnode) { struct i2c_adapter *adapter; struct device *dev; if (IS_ERR_OR_NULL(fwnode)) return NULL; dev = bus_find_device(&i2c_bus_type, NULL, fwnode, i2c_dev_or_parent_fwnode_match); if (!dev) return NULL; adapter = i2c_verify_adapter(dev); if (!adapter) put_device(dev); return adapter; } EXPORT_SYMBOL(i2c_find_adapter_by_fwnode); /** * i2c_get_adapter_by_fwnode() - find an i2c_adapter for the fwnode * @fwnode: &struct fwnode_handle corresponding to the &struct i2c_adapter * * Look up and return the &struct i2c_adapter corresponding to the @fwnode, * and increment the adapter module's use count. If no adapter can be found, * or @fwnode is NULL, this returns NULL. * * The user must call i2c_put_adapter(adapter) once done with the i2c adapter. * Note that this is different from i2c_find_adapter_by_node(). */ struct i2c_adapter *i2c_get_adapter_by_fwnode(struct fwnode_handle *fwnode) { struct i2c_adapter *adapter; adapter = i2c_find_adapter_by_fwnode(fwnode); if (!adapter) return NULL; if (!try_module_get(adapter->owner)) { put_device(&adapter->dev); adapter = NULL; } return adapter; } EXPORT_SYMBOL(i2c_get_adapter_by_fwnode); static void i2c_parse_timing(struct device *dev, char *prop_name, u32 *cur_val_p, u32 def_val, bool use_def) { int ret; ret = device_property_read_u32(dev, prop_name, cur_val_p); if (ret && use_def) *cur_val_p = def_val; dev_dbg(dev, "%s: %u\n", prop_name, *cur_val_p); } /** * i2c_parse_fw_timings - get I2C related timing parameters from firmware * @dev: The device to scan for I2C timing properties * @t: the i2c_timings struct to be filled with values * @use_defaults: bool to use sane defaults derived from the I2C specification * when properties are not found, otherwise don't update * * Scan the device for the generic I2C properties describing timing parameters * for the signal and fill the given struct with the results. If a property was * not found and use_defaults was true, then maximum timings are assumed which * are derived from the I2C specification. If use_defaults is not used, the * results will be as before, so drivers can apply their own defaults before * calling this helper. The latter is mainly intended for avoiding regressions * of existing drivers which want to switch to this function. New drivers * almost always should use the defaults. */ void i2c_parse_fw_timings(struct device *dev, struct i2c_timings *t, bool use_defaults) { bool u = use_defaults; u32 d; i2c_parse_timing(dev, "clock-frequency", &t->bus_freq_hz, I2C_MAX_STANDARD_MODE_FREQ, u); d = t->bus_freq_hz <= I2C_MAX_STANDARD_MODE_FREQ ? 1000 : t->bus_freq_hz <= I2C_MAX_FAST_MODE_FREQ ? 300 : 120; i2c_parse_timing(dev, "i2c-scl-rising-time-ns", &t->scl_rise_ns, d, u); d = t->bus_freq_hz <= I2C_MAX_FAST_MODE_FREQ ? 300 : 120; i2c_parse_timing(dev, "i2c-scl-falling-time-ns", &t->scl_fall_ns, d, u); i2c_parse_timing(dev, "i2c-scl-internal-delay-ns", &t->scl_int_delay_ns, 0, u); i2c_parse_timing(dev, "i2c-sda-falling-time-ns", &t->sda_fall_ns, t->scl_fall_ns, u); i2c_parse_timing(dev, "i2c-sda-hold-time-ns", &t->sda_hold_ns, 0, u); i2c_parse_timing(dev, "i2c-digital-filter-width-ns", &t->digital_filter_width_ns, 0, u); i2c_parse_timing(dev, "i2c-analog-filter-cutoff-frequency", &t->analog_filter_cutoff_freq_hz, 0, u); } EXPORT_SYMBOL_GPL(i2c_parse_fw_timings); /* ------------------------------------------------------------------------- */ int i2c_for_each_dev(void *data, int (*fn)(struct device *dev, void *data)) { int res; mutex_lock(&core_lock); res = bus_for_each_dev(&i2c_bus_type, NULL, data, fn); mutex_unlock(&core_lock); return res; } EXPORT_SYMBOL_GPL(i2c_for_each_dev); static int __process_new_driver(struct device *dev, void *data) { if (dev->type != &i2c_adapter_type) return 0; return i2c_do_add_adapter(data, to_i2c_adapter(dev)); } /* * An i2c_driver is used with one or more i2c_client (device) nodes to access * i2c slave chips, on a bus instance associated with some i2c_adapter. */ int i2c_register_driver(struct module *owner, struct i2c_driver *driver) { int res; /* Can't register until after driver model init */ if (WARN_ON(!is_registered)) return -EAGAIN; /* add the driver to the list of i2c drivers in the driver core */ driver->driver.owner = owner; driver->driver.bus = &i2c_bus_type; INIT_LIST_HEAD(&driver->clients); /* When registration returns, the driver core * will have called probe() for all matching-but-unbound devices. */ res = driver_register(&driver->driver); if (res) return res; pr_debug("driver [%s] registered\n", driver->driver.name); /* Walk the adapters that are already present */ i2c_for_each_dev(driver, __process_new_driver); return 0; } EXPORT_SYMBOL(i2c_register_driver); static int __process_removed_driver(struct device *dev, void *data) { if (dev->type == &i2c_adapter_type) i2c_do_del_adapter(data, to_i2c_adapter(dev)); return 0; } /** * i2c_del_driver - unregister I2C driver * @driver: the driver being unregistered * Context: can sleep */ void i2c_del_driver(struct i2c_driver *driver) { i2c_for_each_dev(driver, __process_removed_driver); driver_unregister(&driver->driver); pr_debug("driver [%s] unregistered\n", driver->driver.name); } EXPORT_SYMBOL(i2c_del_driver); /* ------------------------------------------------------------------------- */ struct i2c_cmd_arg { unsigned cmd; void *arg; }; static int i2c_cmd(struct device *dev, void *_arg) { struct i2c_client *client = i2c_verify_client(dev); struct i2c_cmd_arg *arg = _arg; struct i2c_driver *driver; if (!client || !client->dev.driver) return 0; driver = to_i2c_driver(client->dev.driver); if (driver->command) driver->command(client, arg->cmd, arg->arg); return 0; } void i2c_clients_command(struct i2c_adapter *adap, unsigned int cmd, void *arg) { struct i2c_cmd_arg cmd_arg; cmd_arg.cmd = cmd; cmd_arg.arg = arg; device_for_each_child(&adap->dev, &cmd_arg, i2c_cmd); } EXPORT_SYMBOL(i2c_clients_command); static int __init i2c_init(void) { int retval; retval = of_alias_get_highest_id("i2c"); down_write(&__i2c_board_lock); if (retval >= __i2c_first_dynamic_bus_num) __i2c_first_dynamic_bus_num = retval + 1; up_write(&__i2c_board_lock); retval = bus_register(&i2c_bus_type); if (retval) return retval; is_registered = true; i2c_debugfs_root = debugfs_create_dir("i2c", NULL); retval = i2c_add_driver(&dummy_driver); if (retval) goto class_err; if (IS_ENABLED(CONFIG_OF_DYNAMIC)) WARN_ON(of_reconfig_notifier_register(&i2c_of_notifier)); if (IS_ENABLED(CONFIG_ACPI)) WARN_ON(acpi_reconfig_notifier_register(&i2c_acpi_notifier)); return 0; class_err: is_registered = false; bus_unregister(&i2c_bus_type); return retval; } static void __exit i2c_exit(void) { if (IS_ENABLED(CONFIG_ACPI)) WARN_ON(acpi_reconfig_notifier_unregister(&i2c_acpi_notifier)); if (IS_ENABLED(CONFIG_OF_DYNAMIC)) WARN_ON(of_reconfig_notifier_unregister(&i2c_of_notifier)); i2c_del_driver(&dummy_driver); debugfs_remove_recursive(i2c_debugfs_root); bus_unregister(&i2c_bus_type); tracepoint_synchronize_unregister(); } /* We must initialize early, because some subsystems register i2c drivers * in subsys_initcall() code, but are linked (and initialized) before i2c. */ postcore_initcall(i2c_init); module_exit(i2c_exit); /* ---------------------------------------------------- * the functional interface to the i2c busses. * ---------------------------------------------------- */ /* Check if val is exceeding the quirk IFF quirk is non 0 */ #define i2c_quirk_exceeded(val, quirk) ((quirk) && ((val) > (quirk))) static int i2c_quirk_error(struct i2c_adapter *adap, struct i2c_msg *msg, char *err_msg) { dev_err_ratelimited(&adap->dev, "adapter quirk: %s (addr 0x%04x, size %u, %s)\n", err_msg, msg->addr, msg->len, str_read_write(msg->flags & I2C_M_RD)); return -EOPNOTSUPP; } static int i2c_check_for_quirks(struct i2c_adapter *adap, struct i2c_msg *msgs, int num) { const struct i2c_adapter_quirks *q = adap->quirks; int max_num = q->max_num_msgs, i; bool do_len_check = true; if (q->flags & I2C_AQ_COMB) { max_num = 2; /* special checks for combined messages */ if (num == 2) { if (q->flags & I2C_AQ_COMB_WRITE_FIRST && msgs[0].flags & I2C_M_RD) return i2c_quirk_error(adap, &msgs[0], "1st comb msg must be write"); if (q->flags & I2C_AQ_COMB_READ_SECOND && !(msgs[1].flags & I2C_M_RD)) return i2c_quirk_error(adap, &msgs[1], "2nd comb msg must be read"); if (q->flags & I2C_AQ_COMB_SAME_ADDR && msgs[0].addr != msgs[1].addr) return i2c_quirk_error(adap, &msgs[0], "comb msg only to same addr"); if (i2c_quirk_exceeded(msgs[0].len, q->max_comb_1st_msg_len)) return i2c_quirk_error(adap, &msgs[0], "msg too long"); if (i2c_quirk_exceeded(msgs[1].len, q->max_comb_2nd_msg_len)) return i2c_quirk_error(adap, &msgs[1], "msg too long"); do_len_check = false; } } if (i2c_quirk_exceeded(num, max_num)) return i2c_quirk_error(adap, &msgs[0], "too many messages"); for (i = 0; i < num; i++) { u16 len = msgs[i].len; if (msgs[i].flags & I2C_M_RD) { if (do_len_check && i2c_quirk_exceeded(len, q->max_read_len)) return i2c_quirk_error(adap, &msgs[i], "msg too long"); if (q->flags & I2C_AQ_NO_ZERO_LEN_READ && len == 0) return i2c_quirk_error(adap, &msgs[i], "no zero length"); } else { if (do_len_check && i2c_quirk_exceeded(len, q->max_write_len)) return i2c_quirk_error(adap, &msgs[i], "msg too long"); if (q->flags & I2C_AQ_NO_ZERO_LEN_WRITE && len == 0) return i2c_quirk_error(adap, &msgs[i], "no zero length"); } } return 0; } /** * __i2c_transfer - unlocked flavor of i2c_transfer * @adap: Handle to I2C bus * @msgs: One or more messages to execute before STOP is issued to * terminate the operation; each message begins with a START. * @num: Number of messages to be executed. * * Returns negative errno, else the number of messages executed. * * Adapter lock must be held when calling this function. No debug logging * takes place. */ int __i2c_transfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num) { unsigned long orig_jiffies; int ret, try; if (!adap->algo->master_xfer) { dev_dbg(&adap->dev, "I2C level transfers not supported\n"); return -EOPNOTSUPP; } if (WARN_ON(!msgs || num < 1)) return -EINVAL; ret = __i2c_check_suspended(adap); if (ret) return ret; if (adap->quirks && i2c_check_for_quirks(adap, msgs, num)) return -EOPNOTSUPP; /* * i2c_trace_msg_key gets enabled when tracepoint i2c_transfer gets * enabled. This is an efficient way of keeping the for-loop from * being executed when not needed. */ if (static_branch_unlikely(&i2c_trace_msg_key)) { int i; for (i = 0; i < num; i++) if (msgs[i].flags & I2C_M_RD) trace_i2c_read(adap, &msgs[i], i); else trace_i2c_write(adap, &msgs[i], i); } /* Retry automatically on arbitration loss */ orig_jiffies = jiffies; for (ret = 0, try = 0; try <= adap->retries; try++) { if (i2c_in_atomic_xfer_mode() && adap->algo->master_xfer_atomic) ret = adap->algo->master_xfer_atomic(adap, msgs, num); else ret = adap->algo->master_xfer(adap, msgs, num); if (ret != -EAGAIN) break; if (time_after(jiffies, orig_jiffies + adap->timeout)) break; } if (static_branch_unlikely(&i2c_trace_msg_key)) { int i; for (i = 0; i < ret; i++) if (msgs[i].flags & I2C_M_RD) trace_i2c_reply(adap, &msgs[i], i); trace_i2c_result(adap, num, ret); } return ret; } EXPORT_SYMBOL(__i2c_transfer); /** * i2c_transfer - execute a single or combined I2C message * @adap: Handle to I2C bus * @msgs: One or more messages to execute before STOP is issued to * terminate the operation; each message begins with a START. * @num: Number of messages to be executed. * * Returns negative errno, else the number of messages executed. * * Note that there is no requirement that each message be sent to * the same slave address, although that is the most common model. */ int i2c_transfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num) { int ret; /* REVISIT the fault reporting model here is weak: * * - When we get an error after receiving N bytes from a slave, * there is no way to report "N". * * - When we get a NAK after transmitting N bytes to a slave, * there is no way to report "N" ... or to let the master * continue executing the rest of this combined message, if * that's the appropriate response. * * - When for example "num" is two and we successfully complete * the first message but get an error part way through the * second, it's unclear whether that should be reported as * one (discarding status on the second message) or errno * (discarding status on the first one). */ ret = __i2c_lock_bus_helper(adap); if (ret) return ret; ret = __i2c_transfer(adap, msgs, num); i2c_unlock_bus(adap, I2C_LOCK_SEGMENT); return ret; } EXPORT_SYMBOL(i2c_transfer); /** * i2c_transfer_buffer_flags - issue a single I2C message transferring data * to/from a buffer * @client: Handle to slave device * @buf: Where the data is stored * @count: How many bytes to transfer, must be less than 64k since msg.len is u16 * @flags: The flags to be used for the message, e.g. I2C_M_RD for reads * * Returns negative errno, or else the number of bytes transferred. */ int i2c_transfer_buffer_flags(const struct i2c_client *client, char *buf, int count, u16 flags) { int ret; struct i2c_msg msg = { .addr = client->addr, .flags = flags | (client->flags & I2C_M_TEN), .len = count, .buf = buf, }; ret = i2c_transfer(client->adapter, &msg, 1); /* * If everything went ok (i.e. 1 msg transferred), return #bytes * transferred, else error code. */ return (ret == 1) ? count : ret; } EXPORT_SYMBOL(i2c_transfer_buffer_flags); /** * i2c_get_device_id - get manufacturer, part id and die revision of a device * @client: The device to query * @id: The queried information * * Returns negative errno on error, zero on success. */ int i2c_get_device_id(const struct i2c_client *client, struct i2c_device_identity *id) { struct i2c_adapter *adap = client->adapter; union i2c_smbus_data raw_id; int ret; if (!i2c_check_functionality(adap, I2C_FUNC_SMBUS_READ_I2C_BLOCK)) return -EOPNOTSUPP; raw_id.block[0] = 3; ret = i2c_smbus_xfer(adap, I2C_ADDR_DEVICE_ID, 0, I2C_SMBUS_READ, client->addr << 1, I2C_SMBUS_I2C_BLOCK_DATA, &raw_id); if (ret) return ret; id->manufacturer_id = (raw_id.block[1] << 4) | (raw_id.block[2] >> 4); id->part_id = ((raw_id.block[2] & 0xf) << 5) | (raw_id.block[3] >> 3); id->die_revision = raw_id.block[3] & 0x7; return 0; } EXPORT_SYMBOL_GPL(i2c_get_device_id); /** * i2c_client_get_device_id - get the driver match table entry of a device * @client: the device to query. The device must be bound to a driver * * Returns a pointer to the matching entry if found, NULL otherwise. */ const struct i2c_device_id *i2c_client_get_device_id(const struct i2c_client *client) { const struct i2c_driver *drv = to_i2c_driver(client->dev.driver); return i2c_match_id(drv->id_table, client); } EXPORT_SYMBOL_GPL(i2c_client_get_device_id); /* ---------------------------------------------------- * the i2c address scanning function * Will not work for 10-bit addresses! * ---------------------------------------------------- */ /* * Legacy default probe function, mostly relevant for SMBus. The default * probe method is a quick write, but it is known to corrupt the 24RF08 * EEPROMs due to a state machine bug, and could also irreversibly * write-protect some EEPROMs, so for address ranges 0x30-0x37 and 0x50-0x5f, * we use a short byte read instead. Also, some bus drivers don't implement * quick write, so we fallback to a byte read in that case too. * On x86, there is another special case for FSC hardware monitoring chips, * which want regular byte reads (address 0x73.) Fortunately, these are the * only known chips using this I2C address on PC hardware. * Returns 1 if probe succeeded, 0 if not. */ static int i2c_default_probe(struct i2c_adapter *adap, unsigned short addr) { int err; union i2c_smbus_data dummy; #ifdef CONFIG_X86 if (addr == 0x73 && (adap->class & I2C_CLASS_HWMON) && i2c_check_functionality(adap, I2C_FUNC_SMBUS_READ_BYTE_DATA)) err = i2c_smbus_xfer(adap, addr, 0, I2C_SMBUS_READ, 0, I2C_SMBUS_BYTE_DATA, &dummy); else #endif if (!((addr & ~0x07) == 0x30 || (addr & ~0x0f) == 0x50) && i2c_check_functionality(adap, I2C_FUNC_SMBUS_QUICK)) err = i2c_smbus_xfer(adap, addr, 0, I2C_SMBUS_WRITE, 0, I2C_SMBUS_QUICK, NULL); else if (i2c_check_functionality(adap, I2C_FUNC_SMBUS_READ_BYTE)) err = i2c_smbus_xfer(adap, addr, 0, I2C_SMBUS_READ, 0, I2C_SMBUS_BYTE, &dummy); else { dev_warn(&adap->dev, "No suitable probing method supported for address 0x%02X\n", addr); err = -EOPNOTSUPP; } return err >= 0; } static int i2c_detect_address(struct i2c_client *temp_client, struct i2c_driver *driver) { struct i2c_board_info info; struct i2c_adapter *adapter = temp_client->adapter; int addr = temp_client->addr; int err; /* Make sure the address is valid */ err = i2c_check_7bit_addr_validity_strict(addr); if (err) { dev_warn(&adapter->dev, "Invalid probe address 0x%02x\n", addr); return err; } /* Skip if already in use (7 bit, no need to encode flags) */ if (i2c_check_addr_busy(adapter, addr)) return 0; /* Make sure there is something at this address */ if (!i2c_default_probe(adapter, addr)) return 0; /* Finally call the custom detection function */ memset(&info, 0, sizeof(struct i2c_board_info)); info.addr = addr; err = driver->detect(temp_client, &info); if (err) { /* -ENODEV is returned if the detection fails. We catch it here as this isn't an error. */ return err == -ENODEV ? 0 : err; } /* Consistency check */ if (info.type[0] == '\0') { dev_err(&adapter->dev, "%s detection function provided no name for 0x%x\n", driver->driver.name, addr); } else { struct i2c_client *client; /* Detection succeeded, instantiate the device */ if (adapter->class & I2C_CLASS_DEPRECATED) dev_warn(&adapter->dev, "This adapter will soon drop class based instantiation of devices. " "Please make sure client 0x%02x gets instantiated by other means. " "Check 'Documentation/i2c/instantiating-devices.rst' for details.\n", info.addr); dev_dbg(&adapter->dev, "Creating %s at 0x%02x\n", info.type, info.addr); client = i2c_new_client_device(adapter, &info); if (!IS_ERR(client)) list_add_tail(&client->detected, &driver->clients); else dev_err(&adapter->dev, "Failed creating %s at 0x%02x\n", info.type, info.addr); } return 0; } static int i2c_detect(struct i2c_adapter *adapter, struct i2c_driver *driver) { const unsigned short *address_list; struct i2c_client *temp_client; int i, err = 0; address_list = driver->address_list; if (!driver->detect || !address_list) return 0; /* Warn that the adapter lost class based instantiation */ if (adapter->class == I2C_CLASS_DEPRECATED) { dev_dbg(&adapter->dev, "This adapter dropped support for I2C classes and won't auto-detect %s devices anymore. " "If you need it, check 'Documentation/i2c/instantiating-devices.rst' for alternatives.\n", driver->driver.name); return 0; } /* Stop here if the classes do not match */ if (!(adapter->class & driver->class)) return 0; /* Set up a temporary client to help detect callback */ temp_client = kzalloc_obj(*temp_client); if (!temp_client) return -ENOMEM; temp_client->adapter = adapter; for (i = 0; address_list[i] != I2C_CLIENT_END; i += 1) { dev_dbg(&adapter->dev, "found normal entry for adapter %d, addr 0x%02x\n", i2c_adapter_id(adapter), address_list[i]); temp_client->addr = address_list[i]; err = i2c_detect_address(temp_client, driver); if (unlikely(err)) break; } kfree(temp_client); return err; } int i2c_probe_func_quick_read(struct i2c_adapter *adap, unsigned short addr) { return i2c_smbus_xfer(adap, addr, 0, I2C_SMBUS_READ, 0, I2C_SMBUS_QUICK, NULL) >= 0; } EXPORT_SYMBOL_GPL(i2c_probe_func_quick_read); struct i2c_client * i2c_new_scanned_device(struct i2c_adapter *adap, struct i2c_board_info *info, unsigned short const *addr_list, int (*probe)(struct i2c_adapter *adap, unsigned short addr)) { int i; if (!probe) probe = i2c_default_probe; for (i = 0; addr_list[i] != I2C_CLIENT_END; i++) { /* Check address validity */ if (i2c_check_7bit_addr_validity_strict(addr_list[i]) < 0) { dev_warn(&adap->dev, "Invalid 7-bit address 0x%02x\n", addr_list[i]); continue; } /* Check address availability (7 bit, no need to encode flags) */ if (i2c_check_addr_busy(adap, addr_list[i])) { dev_dbg(&adap->dev, "Address 0x%02x already in use, not probing\n", addr_list[i]); continue; } /* Test address responsiveness */ if (probe(adap, addr_list[i])) break; } if (addr_list[i] == I2C_CLIENT_END) { dev_dbg(&adap->dev, "Probing failed, no device found\n"); return ERR_PTR(-ENODEV); } info->addr = addr_list[i]; return i2c_new_client_device(adap, info); } EXPORT_SYMBOL_GPL(i2c_new_scanned_device); struct i2c_adapter *i2c_get_adapter(int nr) { struct i2c_adapter *adapter; mutex_lock(&core_lock); adapter = idr_find(&i2c_adapter_idr, nr); if (!adapter) goto exit; if (try_module_get(adapter->owner)) get_device(&adapter->dev); else adapter = NULL; exit: mutex_unlock(&core_lock); return adapter; } EXPORT_SYMBOL(i2c_get_adapter); void i2c_put_adapter(struct i2c_adapter *adap) { if (!adap) return; module_put(adap->owner); /* Should be last, otherwise we risk use-after-free with 'adap' */ put_device(&adap->dev); } EXPORT_SYMBOL(i2c_put_adapter); /** * i2c_get_dma_safe_msg_buf() - get a DMA safe buffer for the given i2c_msg * @msg: the message to be checked * @threshold: the minimum number of bytes for which using DMA makes sense. * Should at least be 1. * * Return: NULL if a DMA safe buffer was not obtained. Use msg->buf with PIO. * Or a valid pointer to be used with DMA. After use, release it by * calling i2c_put_dma_safe_msg_buf(). * * This function must only be called from process context! */ u8 *i2c_get_dma_safe_msg_buf(struct i2c_msg *msg, unsigned int threshold) { /* also skip 0-length msgs for bogus thresholds of 0 */ if (!threshold) pr_debug("DMA buffer for addr=0x%02x with length 0 is bogus\n", msg->addr); if (msg->len < threshold || msg->len == 0) return NULL; if (msg->flags & I2C_M_DMA_SAFE) return msg->buf; pr_debug("using bounce buffer for addr=0x%02x, len=%d\n", msg->addr, msg->len); if (msg->flags & I2C_M_RD) return kzalloc(msg->len, GFP_KERNEL); else return kmemdup(msg->buf, msg->len, GFP_KERNEL); } EXPORT_SYMBOL_GPL(i2c_get_dma_safe_msg_buf); /** * i2c_put_dma_safe_msg_buf - release DMA safe buffer and sync with i2c_msg * @buf: the buffer obtained from i2c_get_dma_safe_msg_buf(). May be NULL. * @msg: the message which the buffer corresponds to * @xferred: bool saying if the message was transferred */ void i2c_put_dma_safe_msg_buf(u8 *buf, struct i2c_msg *msg, bool xferred) { if (!buf || buf == msg->buf) return; if (xferred && msg->flags & I2C_M_RD) memcpy(msg->buf, buf, msg->len); kfree(buf); } EXPORT_SYMBOL_GPL(i2c_put_dma_safe_msg_buf); MODULE_AUTHOR("Simon G. Vogl <simon@tk.uni-linz.ac.at>"); MODULE_DESCRIPTION("I2C-Bus main module"); MODULE_LICENSE("GPL");
432 391 108 78 30 107 382 42 769 18 18 19 3 13 18 18 212 213 212 832 294 151 453 35 119 404 404 453 391 12 474 474 5 5 5 5 5 5 3 9 4 18 2 17 5 23 22 4 14 1 5 1 1 5 5 11 5 3 19 2 17 3 13 9 3 4 3 4 4 3 34 34 5 5 20 25 2 2 4 2 13 2 188 1 4 2 1 179 17 3 10 6 22 13 17 11 14 2 37 13 44 84 90 13 1 170 2 157 2 11 7 1 154 8 21 12 5 4 1 1 4 168 15 149 3 16 16 3 13 5 10 11 2 11 9 11 16 258 257 137 145 138 138 2 24 124 6 27 107 12 103 9 105 2 2 144 3 144 3 141 117 27 27 1 1 27 27 4 144 143 128 64 7 1 2 4 2 7 10 2 1 7 7 365 253 213 175 54 142 19 1126 167 1127 621 1040 161 261 185 45 62 133 16 15 16 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * IPv4 Forwarding Information Base: FIB frontend. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #include <linux/module.h> #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/capability.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_addr.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/cache.h> #include <linux/init.h> #include <linux/list.h> #include <linux/slab.h> #include <net/flow.h> #include <net/inet_dscp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> #include <net/tcp.h> #include <net/sock.h> #include <net/arp.h> #include <net/ip_fib.h> #include <net/nexthop.h> #include <net/rtnetlink.h> #include <net/xfrm.h> #include <net/l3mdev.h> #include <net/lwtunnel.h> #include <trace/events/fib.h> #ifndef CONFIG_IP_MULTIPLE_TABLES static int __net_init fib4_rules_init(struct net *net) { struct fib_table *local_table, *main_table; main_table = fib_trie_table(RT_TABLE_MAIN, NULL); if (!main_table) return -ENOMEM; local_table = fib_trie_table(RT_TABLE_LOCAL, main_table); if (!local_table) goto fail; hlist_add_head_rcu(&local_table->tb_hlist, &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]); hlist_add_head_rcu(&main_table->tb_hlist, &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]); return 0; fail: fib_free_table(main_table); return -ENOMEM; } #else struct fib_table *fib_new_table(struct net *net, u32 id) { struct fib_table *tb, *alias = NULL; unsigned int h; if (id == 0) id = RT_TABLE_MAIN; tb = fib_get_table(net, id); if (tb) return tb; if (id == RT_TABLE_LOCAL && !net->ipv4.fib_has_custom_rules) alias = fib_new_table(net, RT_TABLE_MAIN); tb = fib_trie_table(id, alias); if (!tb) return NULL; switch (id) { case RT_TABLE_MAIN: rcu_assign_pointer(net->ipv4.fib_main, tb); break; case RT_TABLE_DEFAULT: rcu_assign_pointer(net->ipv4.fib_default, tb); break; default: break; } h = id & (FIB_TABLE_HASHSZ - 1); hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]); return tb; } EXPORT_SYMBOL_GPL(fib_new_table); /* caller must hold either rtnl or rcu read lock */ struct fib_table *fib_get_table(struct net *net, u32 id) { struct fib_table *tb; struct hlist_head *head; unsigned int h; if (id == 0) id = RT_TABLE_MAIN; h = id & (FIB_TABLE_HASHSZ - 1); head = &net->ipv4.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb_hlist, lockdep_rtnl_is_held()) { if (tb->tb_id == id) return tb; } return NULL; } #endif /* CONFIG_IP_MULTIPLE_TABLES */ static void fib_replace_table(struct net *net, struct fib_table *old, struct fib_table *new) { #ifdef CONFIG_IP_MULTIPLE_TABLES switch (new->tb_id) { case RT_TABLE_MAIN: rcu_assign_pointer(net->ipv4.fib_main, new); break; case RT_TABLE_DEFAULT: rcu_assign_pointer(net->ipv4.fib_default, new); break; default: break; } #endif /* replace the old table in the hlist */ hlist_replace_rcu(&old->tb_hlist, &new->tb_hlist); } int fib_unmerge(struct net *net) { struct fib_table *old, *new, *main_table; /* attempt to fetch local table if it has been allocated */ old = fib_get_table(net, RT_TABLE_LOCAL); if (!old) return 0; new = fib_trie_unmerge(old); if (!new) return -ENOMEM; /* table is already unmerged */ if (new == old) return 0; /* replace merged table with clean table */ fib_replace_table(net, old, new); fib_free_table(old); /* attempt to fetch main table if it has been allocated */ main_table = fib_get_table(net, RT_TABLE_MAIN); if (!main_table) return 0; /* flush local entries from main table */ fib_table_flush_external(main_table); return 0; } void fib_flush(struct net *net) { int flushed = 0; unsigned int h; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct hlist_node *tmp; struct fib_table *tb; hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) flushed += fib_table_flush(net, tb, false); } if (flushed) rt_cache_flush(net); } /* * Find address type as if only "dev" was present in the system. If * on_dev is NULL then all interfaces are taken into consideration. */ static inline unsigned int __inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr, u32 tb_id) { struct flowi4 fl4 = { .daddr = addr }; struct fib_result res; unsigned int ret = RTN_BROADCAST; struct fib_table *table; if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr)) return RTN_BROADCAST; if (ipv4_is_multicast(addr)) return RTN_MULTICAST; rcu_read_lock(); table = fib_get_table(net, tb_id); if (table) { ret = RTN_UNICAST; if (!fib_table_lookup(table, &fl4, &res, FIB_LOOKUP_NOREF)) { struct fib_nh_common *nhc = fib_info_nhc(res.fi, 0); if (!dev || dev == nhc->nhc_dev) ret = res.type; } } rcu_read_unlock(); return ret; } unsigned int inet_addr_type_table(struct net *net, __be32 addr, u32 tb_id) { return __inet_dev_addr_type(net, NULL, addr, tb_id); } EXPORT_SYMBOL(inet_addr_type_table); unsigned int inet_addr_type(struct net *net, __be32 addr) { return __inet_dev_addr_type(net, NULL, addr, RT_TABLE_LOCAL); } EXPORT_SYMBOL(inet_addr_type); unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr) { u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL; return __inet_dev_addr_type(net, dev, addr, rt_table); } EXPORT_SYMBOL(inet_dev_addr_type); /* inet_addr_type with dev == NULL but using the table from a dev * if one is associated */ unsigned int inet_addr_type_dev_table(struct net *net, const struct net_device *dev, __be32 addr) { u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL; return __inet_dev_addr_type(net, NULL, addr, rt_table); } EXPORT_SYMBOL(inet_addr_type_dev_table); __be32 fib_compute_spec_dst(struct sk_buff *skb) { struct net_device *dev = skb->dev; struct in_device *in_dev; struct fib_result res; struct rtable *rt; struct net *net; int scope; rt = skb_rtable(skb); if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) == RTCF_LOCAL) return ip_hdr(skb)->daddr; in_dev = __in_dev_get_rcu(dev); net = dev_net(dev); scope = RT_SCOPE_UNIVERSE; if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) { bool vmark = in_dev && IN_DEV_SRC_VMARK(in_dev); struct flowi4 fl4 = { .flowi4_iif = LOOPBACK_IFINDEX, .flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev), .daddr = ip_hdr(skb)->saddr, .flowi4_dscp = ip4h_dscp(ip_hdr(skb)), .flowi4_scope = scope, .flowi4_mark = vmark ? skb->mark : 0, }; if (!fib_lookup(net, &fl4, &res, 0)) return fib_result_prefsrc(net, &res); } else { scope = RT_SCOPE_LINK; } return inet_select_addr(dev, ip_hdr(skb)->saddr, scope); } bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev) { bool dev_match = false; #ifdef CONFIG_IP_ROUTE_MULTIPATH if (unlikely(fi->nh)) { dev_match = nexthop_uses_dev(fi->nh, dev); } else { int ret; for (ret = 0; ret < fib_info_num_path(fi); ret++) { const struct fib_nh_common *nhc = fib_info_nhc(fi, ret); if (nhc_l3mdev_matches_dev(nhc, dev)) { dev_match = true; break; } } } #else if (fib_info_nhc(fi, 0)->nhc_dev == dev) dev_match = true; #endif return dev_match; } EXPORT_SYMBOL_GPL(fib_info_nh_uses_dev); /* Given (packet source, input interface) and optional (dst, oif, tos): * - (main) check, that source is valid i.e. not broadcast or our local * address. * - figure out what "logical" interface this packet arrived * and calculate "specific destination" address. * - check, that packet arrived from expected physical interface. * called with rcu_read_lock() */ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, dscp_t dscp, int oif, struct net_device *dev, int rpf, struct in_device *idev, u32 *itag) { struct net *net = dev_net(dev); enum skb_drop_reason reason; struct flow_keys flkeys; int ret, no_addr; struct fib_result res; struct flowi4 fl4; bool dev_match; fl4.flowi4_oif = 0; fl4.flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev); fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX; fl4.daddr = src; fl4.saddr = dst; fl4.flowi4_dscp = dscp; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_tun_key.tun_id = 0; fl4.flowi4_flags = 0; fl4.flowi4_uid = sock_net_uid(net, NULL); fl4.flowi4_multipath_hash = 0; no_addr = idev->ifa_list == NULL; fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0; if (!fib4_rules_early_flow_dissect(net, skb, &fl4, &flkeys)) { fl4.flowi4_proto = 0; fl4.fl4_sport = 0; fl4.fl4_dport = 0; } else { swap(fl4.fl4_sport, fl4.fl4_dport); } if (fib_lookup(net, &fl4, &res, 0)) goto last_resort; if (res.type != RTN_UNICAST) { if (res.type != RTN_LOCAL) { reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto e_inval; } else if (!IN_DEV_ACCEPT_LOCAL(idev)) { reason = SKB_DROP_REASON_IP_LOCAL_SOURCE; goto e_inval; } } fib_combine_itag(itag, &res); dev_match = fib_info_nh_uses_dev(res.fi, dev); /* This is not common, loopback packets retain skb_dst so normally they * would not even hit this slow path. */ dev_match = dev_match || (res.type == RTN_LOCAL && dev == net->loopback_dev); if (dev_match) { ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST; return ret; } if (no_addr) goto last_resort; if (rpf == 1) goto e_rpf; fl4.flowi4_oif = dev->ifindex; ret = 0; if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) { if (res.type == RTN_UNICAST) ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST; } return ret; last_resort: if (rpf) goto e_rpf; *itag = 0; return 0; e_inval: return -reason; e_rpf: return -SKB_DROP_REASON_IP_RPFILTER; } /* Ignore rp_filter for packets protected by IPsec. */ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, dscp_t dscp, int oif, struct net_device *dev, struct in_device *idev, u32 *itag) { int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); struct net *net = dev_net(dev); if (!r && !fib_num_tclassid_users(net) && (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) { if (IN_DEV_ACCEPT_LOCAL(idev)) goto ok; /* with custom local routes in place, checking local addresses * only will be too optimistic, with custom rules, checking * local addresses only can be too strict, e.g. due to vrf */ if (net->ipv4.fib_has_custom_local_routes || fib4_has_custom_rules(net)) goto full_check; /* Within the same container, it is regarded as a martian source, * and the same host but different containers are not. */ if (inet_lookup_ifaddr_rcu(net, src)) return -SKB_DROP_REASON_IP_LOCAL_SOURCE; ok: *itag = 0; return 0; } full_check: return __fib_validate_source(skb, src, dst, dscp, oif, dev, r, idev, itag); } static inline __be32 sk_extract_addr(struct sockaddr *addr) { return ((struct sockaddr_in *) addr)->sin_addr.s_addr; } static int put_rtax(struct nlattr *mx, int len, int type, u32 value) { struct nlattr *nla; nla = (struct nlattr *) ((char *) mx + len); nla->nla_type = type; nla->nla_len = nla_attr_size(4); *(u32 *) nla_data(nla) = value; return len + nla_total_size(4); } static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, struct fib_config *cfg) { __be32 addr; int plen; memset(cfg, 0, sizeof(*cfg)); cfg->fc_nlinfo.nl_net = net; if (rt->rt_dst.sa_family != AF_INET) return -EAFNOSUPPORT; /* * Check mask for validity: * a) it must be contiguous. * b) destination must have all host bits clear. * c) if application forgot to set correct family (AF_INET), * reject request unless it is absolutely clear i.e. * both family and mask are zero. */ plen = 32; addr = sk_extract_addr(&rt->rt_dst); if (!(rt->rt_flags & RTF_HOST)) { __be32 mask = sk_extract_addr(&rt->rt_genmask); if (rt->rt_genmask.sa_family != AF_INET) { if (mask || rt->rt_genmask.sa_family) return -EAFNOSUPPORT; } if (bad_mask(mask, addr)) return -EINVAL; plen = inet_mask_len(mask); } cfg->fc_dst_len = plen; cfg->fc_dst = addr; if (cmd != SIOCDELRT) { cfg->fc_nlflags = NLM_F_CREATE; cfg->fc_protocol = RTPROT_BOOT; } if (rt->rt_metric) cfg->fc_priority = rt->rt_metric - 1; if (rt->rt_flags & RTF_REJECT) { cfg->fc_scope = RT_SCOPE_HOST; cfg->fc_type = RTN_UNREACHABLE; return 0; } cfg->fc_scope = RT_SCOPE_NOWHERE; cfg->fc_type = RTN_UNICAST; if (rt->rt_dev) { char *colon; struct net_device *dev; char devname[IFNAMSIZ]; if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1)) return -EFAULT; devname[IFNAMSIZ-1] = 0; colon = strchr(devname, ':'); if (colon) *colon = 0; dev = __dev_get_by_name(net, devname); if (!dev) return -ENODEV; cfg->fc_oif = dev->ifindex; cfg->fc_table = l3mdev_fib_table(dev); if (colon) { const struct in_ifaddr *ifa; struct in_device *in_dev; in_dev = __in_dev_get_rtnl_net(dev); if (!in_dev) return -ENODEV; *colon = ':'; in_dev_for_each_ifa_rtnl_net(net, ifa, in_dev) { if (strcmp(ifa->ifa_label, devname) == 0) break; } if (!ifa) return -ENODEV; cfg->fc_prefsrc = ifa->ifa_local; } } addr = sk_extract_addr(&rt->rt_gateway); if (rt->rt_gateway.sa_family == AF_INET && addr) { unsigned int addr_type; cfg->fc_gw4 = addr; cfg->fc_gw_family = AF_INET; addr_type = inet_addr_type_table(net, addr, cfg->fc_table); if (rt->rt_flags & RTF_GATEWAY && addr_type == RTN_UNICAST) cfg->fc_scope = RT_SCOPE_UNIVERSE; } if (!cfg->fc_table) cfg->fc_table = RT_TABLE_MAIN; if (cmd == SIOCDELRT) return 0; if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw_family) return -EINVAL; if (cfg->fc_scope == RT_SCOPE_NOWHERE) cfg->fc_scope = RT_SCOPE_LINK; if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) { struct nlattr *mx; int len = 0; mx = kcalloc(3, nla_total_size(4), GFP_KERNEL); if (!mx) return -ENOMEM; if (rt->rt_flags & RTF_MTU) len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40); if (rt->rt_flags & RTF_WINDOW) len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window); if (rt->rt_flags & RTF_IRTT) len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3); cfg->fc_mx = mx; cfg->fc_mx_len = len; } return 0; } /* * Handle IP routing ioctl calls. * These are used to manipulate the routing tables */ int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt) { struct fib_config cfg; int err; switch (cmd) { case SIOCADDRT: /* Add a route */ case SIOCDELRT: /* Delete a route */ if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; rtnl_net_lock(net); err = rtentry_to_fib_config(net, cmd, rt, &cfg); if (err == 0) { struct fib_table *tb; if (cmd == SIOCDELRT) { tb = fib_get_table(net, cfg.fc_table); if (tb) err = fib_table_delete(net, tb, &cfg, NULL); else err = -ESRCH; } else { tb = fib_new_table(net, cfg.fc_table); if (tb) err = fib_table_insert(net, tb, &cfg, NULL); else err = -ENOBUFS; } /* allocated by rtentry_to_fib_config() */ kfree(cfg.fc_mx); } rtnl_net_unlock(net); return err; } return -EINVAL; } const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 }, [RTA_DST] = { .type = NLA_U32 }, [RTA_SRC] = { .type = NLA_U32 }, [RTA_IIF] = { .type = NLA_U32 }, [RTA_OIF] = { .type = NLA_U32 }, [RTA_GATEWAY] = { .type = NLA_U32 }, [RTA_PRIORITY] = { .type = NLA_U32 }, [RTA_PREFSRC] = { .type = NLA_U32 }, [RTA_METRICS] = { .type = NLA_NESTED }, [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, [RTA_FLOW] = { .type = NLA_U32 }, [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, [RTA_ENCAP] = { .type = NLA_NESTED }, [RTA_UID] = { .type = NLA_U32 }, [RTA_MARK] = { .type = NLA_U32 }, [RTA_TABLE] = { .type = NLA_U32 }, [RTA_IP_PROTO] = { .type = NLA_U8 }, [RTA_SPORT] = { .type = NLA_U16 }, [RTA_DPORT] = { .type = NLA_U16 }, [RTA_NH_ID] = { .type = NLA_U32 }, }; int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla, struct netlink_ext_ack *extack) { struct rtvia *via; int alen; if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) { NL_SET_ERR_MSG(extack, "Invalid attribute length for RTA_VIA"); return -EINVAL; } via = nla_data(nla); alen = nla_len(nla) - offsetof(struct rtvia, rtvia_addr); switch (via->rtvia_family) { case AF_INET: if (alen != sizeof(__be32)) { NL_SET_ERR_MSG(extack, "Invalid IPv4 address in RTA_VIA"); return -EINVAL; } cfg->fc_gw_family = AF_INET; cfg->fc_gw4 = *((__be32 *)via->rtvia_addr); break; case AF_INET6: #if IS_ENABLED(CONFIG_IPV6) if (alen != sizeof(struct in6_addr)) { NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_VIA"); return -EINVAL; } cfg->fc_gw_family = AF_INET6; cfg->fc_gw6 = *((struct in6_addr *)via->rtvia_addr); #else NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel"); return -EINVAL; #endif break; default: NL_SET_ERR_MSG(extack, "Unsupported address family in RTA_VIA"); return -EINVAL; } return 0; } static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, struct fib_config *cfg, struct netlink_ext_ack *extack) { bool has_gw = false, has_via = false; struct nlattr *attr; int err, remaining; struct rtmsg *rtm; err = nlmsg_validate_deprecated(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy, extack); if (err < 0) goto errout; memset(cfg, 0, sizeof(*cfg)); rtm = nlmsg_data(nlh); if (!inet_validate_dscp(rtm->rtm_tos)) { NL_SET_ERR_MSG(extack, "Invalid dsfield (tos): ECN bits must be 0"); err = -EINVAL; goto errout; } cfg->fc_dscp = inet_dsfield_to_dscp(rtm->rtm_tos); cfg->fc_dst_len = rtm->rtm_dst_len; cfg->fc_table = rtm->rtm_table; cfg->fc_protocol = rtm->rtm_protocol; cfg->fc_scope = rtm->rtm_scope; cfg->fc_type = rtm->rtm_type; cfg->fc_flags = rtm->rtm_flags; cfg->fc_nlflags = nlh->nlmsg_flags; cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; cfg->fc_nlinfo.nlh = nlh; cfg->fc_nlinfo.nl_net = net; if (cfg->fc_type > RTN_MAX) { NL_SET_ERR_MSG(extack, "Invalid route type"); err = -EINVAL; goto errout; } nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) { switch (nla_type(attr)) { case RTA_DST: cfg->fc_dst = nla_get_be32(attr); break; case RTA_OIF: cfg->fc_oif = nla_get_u32(attr); break; case RTA_GATEWAY: has_gw = true; cfg->fc_gw4 = nla_get_be32(attr); if (cfg->fc_gw4) cfg->fc_gw_family = AF_INET; break; case RTA_VIA: has_via = true; err = fib_gw_from_via(cfg, attr, extack); if (err) goto errout; break; case RTA_PRIORITY: cfg->fc_priority = nla_get_u32(attr); break; case RTA_PREFSRC: cfg->fc_prefsrc = nla_get_be32(attr); break; case RTA_METRICS: cfg->fc_mx = nla_data(attr); cfg->fc_mx_len = nla_len(attr); break; case RTA_MULTIPATH: err = lwtunnel_valid_encap_type_attr(nla_data(attr), nla_len(attr), extack); if (err < 0) goto errout; cfg->fc_mp = nla_data(attr); cfg->fc_mp_len = nla_len(attr); break; case RTA_FLOW: cfg->fc_flow = nla_get_u32(attr); break; case RTA_TABLE: cfg->fc_table = nla_get_u32(attr); break; case RTA_ENCAP: cfg->fc_encap = attr; break; case RTA_ENCAP_TYPE: cfg->fc_encap_type = nla_get_u16(attr); err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); if (err < 0) goto errout; break; case RTA_NH_ID: cfg->fc_nh_id = nla_get_u32(attr); break; } } if (cfg->fc_dst_len > 32) { NL_SET_ERR_MSG(extack, "Invalid prefix length"); err = -EINVAL; goto errout; } if (cfg->fc_dst_len < 32 && (ntohl(cfg->fc_dst) << cfg->fc_dst_len)) { NL_SET_ERR_MSG(extack, "Invalid prefix for given prefix length"); err = -EINVAL; goto errout; } if (cfg->fc_nh_id) { if (cfg->fc_oif || cfg->fc_gw_family || cfg->fc_encap || cfg->fc_mp) { NL_SET_ERR_MSG(extack, "Nexthop specification and nexthop id are mutually exclusive"); err = -EINVAL; goto errout; } } if (has_gw && has_via) { NL_SET_ERR_MSG(extack, "Nexthop configuration can not contain both GATEWAY and VIA"); err = -EINVAL; goto errout; } if (!cfg->fc_table) cfg->fc_table = RT_TABLE_MAIN; return 0; errout: return err; } static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct fib_config cfg; struct fib_table *tb; int err; err = rtm_to_fib_config(net, skb, nlh, &cfg, extack); if (err < 0) goto errout; rtnl_net_lock(net); if (cfg.fc_nh_id && !nexthop_find_by_id(net, cfg.fc_nh_id)) { NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); err = -EINVAL; goto unlock; } tb = fib_get_table(net, cfg.fc_table); if (!tb) { NL_SET_ERR_MSG(extack, "FIB table does not exist"); err = -ESRCH; goto unlock; } err = fib_table_delete(net, tb, &cfg, extack); unlock: rtnl_net_unlock(net); errout: return err; } static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct fib_config cfg; struct fib_table *tb; int err; err = rtm_to_fib_config(net, skb, nlh, &cfg, extack); if (err < 0) goto errout; rtnl_net_lock(net); tb = fib_new_table(net, cfg.fc_table); if (!tb) { err = -ENOBUFS; goto unlock; } err = fib_table_insert(net, tb, &cfg, extack); if (!err && cfg.fc_type == RTN_LOCAL) net->ipv4.fib_has_custom_local_routes = true; unlock: rtnl_net_unlock(net); errout: return err; } int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, struct fib_dump_filter *filter, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; struct nlattr *tb[RTA_MAX + 1]; struct rtmsg *rtm; int err, i; if (filter->rtnl_held) ASSERT_RTNL(); rtm = nlmsg_payload(nlh, sizeof(*rtm)); if (!rtm) { NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request"); return -EINVAL; } if (rtm->rtm_dst_len || rtm->rtm_src_len || rtm->rtm_tos || rtm->rtm_scope) { NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request"); return -EINVAL; } if (rtm->rtm_flags & ~(RTM_F_CLONED | RTM_F_PREFIX)) { NL_SET_ERR_MSG(extack, "Invalid flags for FIB dump request"); return -EINVAL; } if (rtm->rtm_flags & RTM_F_CLONED) filter->dump_routes = false; else filter->dump_exceptions = false; filter->flags = rtm->rtm_flags; filter->protocol = rtm->rtm_protocol; filter->rt_type = rtm->rtm_type; filter->table_id = rtm->rtm_table; err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, extack); if (err < 0) return err; for (i = 0; i <= RTA_MAX; ++i) { int ifindex; if (!tb[i]) continue; switch (i) { case RTA_TABLE: filter->table_id = nla_get_u32(tb[i]); break; case RTA_OIF: ifindex = nla_get_u32(tb[i]); if (filter->rtnl_held) filter->dev = __dev_get_by_index(net, ifindex); else filter->dev = dev_get_by_index_rcu(net, ifindex); if (!filter->dev) return -ENODEV; break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request"); return -EINVAL; } } if (filter->flags || filter->protocol || filter->rt_type || filter->table_id || filter->dev) { filter->filter_set = 1; cb->answer_flags = NLM_F_DUMP_FILTERED; } return 0; } EXPORT_SYMBOL_GPL(ip_valid_fib_dump_req); static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { struct fib_dump_filter filter = { .dump_routes = true, .dump_exceptions = true, .rtnl_held = false, }; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); unsigned int h, s_h; unsigned int e = 0, s_e; struct fib_table *tb; struct hlist_head *head; int dumped = 0, err = 0; rcu_read_lock(); if (cb->strict_check) { err = ip_valid_fib_dump_req(net, nlh, &filter, cb); if (err < 0) goto unlock; } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) { struct rtmsg *rtm = nlmsg_data(nlh); filter.flags = rtm->rtm_flags & (RTM_F_PREFIX | RTM_F_CLONED); } /* ipv4 does not use prefix flag */ if (filter.flags & RTM_F_PREFIX) goto unlock; if (filter.table_id) { tb = fib_get_table(net, filter.table_id); if (!tb) { if (rtnl_msg_family(cb->nlh) != PF_INET) goto unlock; NL_SET_ERR_MSG(cb->extack, "ipv4: FIB table does not exist"); err = -ENOENT; goto unlock; } err = fib_table_dump(tb, skb, cb, &filter); goto unlock; } s_h = cb->args[0]; s_e = cb->args[1]; err = 0; for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) { e = 0; head = &net->ipv4.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb_hlist) { if (e < s_e) goto next; if (dumped) memset(&cb->args[2], 0, sizeof(cb->args) - 2 * sizeof(cb->args[0])); err = fib_table_dump(tb, skb, cb, &filter); if (err < 0) goto out; dumped = 1; next: e++; } } out: cb->args[1] = e; cb->args[0] = h; unlock: rcu_read_unlock(); return err; } /* Prepare and feed intra-kernel routing request. * Really, it should be netlink message, but :-( netlink * can be not configured, so that we feed it directly * to fib engine. It is legal, because all events occur * only when netlink is already locked. */ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa, u32 rt_priority) { struct net *net = dev_net(ifa->ifa_dev->dev); u32 tb_id = l3mdev_fib_table(ifa->ifa_dev->dev); struct fib_table *tb; struct fib_config cfg = { .fc_protocol = RTPROT_KERNEL, .fc_type = type, .fc_dst = dst, .fc_dst_len = dst_len, .fc_priority = rt_priority, .fc_prefsrc = ifa->ifa_local, .fc_oif = ifa->ifa_dev->dev->ifindex, .fc_nlflags = NLM_F_CREATE | NLM_F_APPEND, .fc_nlinfo = { .nl_net = net, }, }; if (!tb_id) tb_id = (type == RTN_UNICAST) ? RT_TABLE_MAIN : RT_TABLE_LOCAL; tb = fib_new_table(net, tb_id); if (!tb) return; cfg.fc_table = tb->tb_id; if (type != RTN_LOCAL) cfg.fc_scope = RT_SCOPE_LINK; else cfg.fc_scope = RT_SCOPE_HOST; if (cmd == RTM_NEWROUTE) fib_table_insert(net, tb, &cfg, NULL); else fib_table_delete(net, tb, &cfg, NULL); } void fib_add_ifaddr(struct in_ifaddr *ifa) { struct in_device *in_dev = ifa->ifa_dev; struct net_device *dev = in_dev->dev; struct in_ifaddr *prim = ifa; __be32 mask = ifa->ifa_mask; __be32 addr = ifa->ifa_local; __be32 prefix = ifa->ifa_address & mask; if (ifa->ifa_flags & IFA_F_SECONDARY) { prim = inet_ifa_byprefix(in_dev, prefix, mask); if (!prim) { pr_warn("%s: bug: prim == NULL\n", __func__); return; } } fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim, 0); if (!(dev->flags & IFF_UP)) return; /* Add broadcast address, if it is explicitly assigned. */ if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) { fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim, 0); arp_invalidate(dev, ifa->ifa_broadcast, false); } if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) && (prefix != addr || ifa->ifa_prefixlen < 32)) { if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE)) fib_magic(RTM_NEWROUTE, dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim, ifa->ifa_rt_priority); /* Add the network broadcast address, when it makes sense */ if (ifa->ifa_prefixlen < 31) { fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask, 32, prim, 0); arp_invalidate(dev, prefix | ~mask, false); } } } void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric) { __be32 prefix = ifa->ifa_address & ifa->ifa_mask; struct in_device *in_dev = ifa->ifa_dev; struct net_device *dev = in_dev->dev; if (!(dev->flags & IFF_UP) || ifa->ifa_flags & (IFA_F_SECONDARY | IFA_F_NOPREFIXROUTE) || ipv4_is_zeronet(prefix) || (prefix == ifa->ifa_local && ifa->ifa_prefixlen == 32)) return; /* add the new */ fib_magic(RTM_NEWROUTE, dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, prefix, ifa->ifa_prefixlen, ifa, new_metric); /* delete the old */ fib_magic(RTM_DELROUTE, dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, prefix, ifa->ifa_prefixlen, ifa, ifa->ifa_rt_priority); } /* Delete primary or secondary address. * Optionally, on secondary address promotion consider the addresses * from subnet iprim as deleted, even if they are in device list. * In this case the secondary ifa can be in device list. */ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) { struct in_device *in_dev = ifa->ifa_dev; struct net_device *dev = in_dev->dev; struct in_ifaddr *ifa1; struct in_ifaddr *prim = ifa, *prim1 = NULL; __be32 brd = ifa->ifa_address | ~ifa->ifa_mask; __be32 any = ifa->ifa_address & ifa->ifa_mask; #define LOCAL_OK 1 #define BRD_OK 2 #define BRD0_OK 4 #define BRD1_OK 8 unsigned int ok = 0; int subnet = 0; /* Primary network */ int gone = 1; /* Address is missing */ int same_prefsrc = 0; /* Another primary with same IP */ if (ifa->ifa_flags & IFA_F_SECONDARY) { prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); if (!prim) { /* if the device has been deleted, we don't perform * address promotion */ if (!in_dev->dead) pr_warn("%s: bug: prim == NULL\n", __func__); return; } if (iprim && iprim != prim) { pr_warn("%s: bug: iprim != prim\n", __func__); return; } } else if (!ipv4_is_zeronet(any) && (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) { if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE)) fib_magic(RTM_DELROUTE, dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, any, ifa->ifa_prefixlen, prim, 0); subnet = 1; } if (in_dev->dead) goto no_promotions; /* Deletion is more complicated than add. * We should take care of not to delete too much :-) * * Scan address list to be sure that addresses are really gone. */ rcu_read_lock(); in_dev_for_each_ifa_rcu(ifa1, in_dev) { if (ifa1 == ifa) { /* promotion, keep the IP */ gone = 0; continue; } /* Ignore IFAs from our subnet */ if (iprim && ifa1->ifa_mask == iprim->ifa_mask && inet_ifa_match(ifa1->ifa_address, iprim)) continue; /* Ignore ifa1 if it uses different primary IP (prefsrc) */ if (ifa1->ifa_flags & IFA_F_SECONDARY) { /* Another address from our subnet? */ if (ifa1->ifa_mask == prim->ifa_mask && inet_ifa_match(ifa1->ifa_address, prim)) prim1 = prim; else { /* We reached the secondaries, so * same_prefsrc should be determined. */ if (!same_prefsrc) continue; /* Search new prim1 if ifa1 is not * using the current prim1 */ if (!prim1 || ifa1->ifa_mask != prim1->ifa_mask || !inet_ifa_match(ifa1->ifa_address, prim1)) prim1 = inet_ifa_byprefix(in_dev, ifa1->ifa_address, ifa1->ifa_mask); if (!prim1) continue; if (prim1->ifa_local != prim->ifa_local) continue; } } else { if (prim->ifa_local != ifa1->ifa_local) continue; prim1 = ifa1; if (prim != prim1) same_prefsrc = 1; } if (ifa->ifa_local == ifa1->ifa_local) ok |= LOCAL_OK; if (ifa->ifa_broadcast == ifa1->ifa_broadcast) ok |= BRD_OK; if (brd == ifa1->ifa_broadcast) ok |= BRD1_OK; if (any == ifa1->ifa_broadcast) ok |= BRD0_OK; /* primary has network specific broadcasts */ if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) { __be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask; __be32 any1 = ifa1->ifa_address & ifa1->ifa_mask; if (!ipv4_is_zeronet(any1)) { if (ifa->ifa_broadcast == brd1 || ifa->ifa_broadcast == any1) ok |= BRD_OK; if (brd == brd1 || brd == any1) ok |= BRD1_OK; if (any == brd1 || any == any1) ok |= BRD0_OK; } } } rcu_read_unlock(); no_promotions: if (!(ok & BRD_OK)) fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim, 0); if (subnet && ifa->ifa_prefixlen < 31) { if (!(ok & BRD1_OK)) fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim, 0); if (!(ok & BRD0_OK)) fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim, 0); } if (!(ok & LOCAL_OK)) { unsigned int addr_type; fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim, 0); /* Check, that this local address finally disappeared. */ addr_type = inet_addr_type_dev_table(dev_net(dev), dev, ifa->ifa_local); if (gone && addr_type != RTN_LOCAL) { /* And the last, but not the least thing. * We must flush stray FIB entries. * * First of all, we scan fib_info list searching * for stray nexthop entries, then ignite fib_flush. */ if (fib_sync_down_addr(dev, ifa->ifa_local)) fib_flush(dev_net(dev)); } } #undef LOCAL_OK #undef BRD_OK #undef BRD0_OK #undef BRD1_OK } static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn) { struct fib_result res; struct flowi4 fl4 = { .flowi4_mark = frn->fl_mark, .daddr = frn->fl_addr, .flowi4_dscp = inet_dsfield_to_dscp(frn->fl_tos), .flowi4_scope = frn->fl_scope, }; struct fib_table *tb; rcu_read_lock(); tb = fib_get_table(net, frn->tb_id_in); frn->err = -ENOENT; if (tb) { local_bh_disable(); frn->tb_id = tb->tb_id; frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); if (!frn->err) { frn->prefixlen = res.prefixlen; frn->nh_sel = res.nh_sel; frn->type = res.type; frn->scope = res.scope; } local_bh_enable(); } rcu_read_unlock(); } static void nl_fib_input(struct sk_buff *skb) { struct net *net; struct fib_result_nl *frn; struct nlmsghdr *nlh; u32 portid; net = sock_net(skb->sk); nlh = nlmsg_hdr(skb); if (skb->len < nlmsg_total_size(sizeof(*frn)) || skb->len < nlh->nlmsg_len || nlmsg_len(nlh) < sizeof(*frn)) return; skb = netlink_skb_clone(skb, GFP_KERNEL); if (!skb) return; nlh = nlmsg_hdr(skb); frn = nlmsg_data(nlh); nl_fib_lookup(net, frn); portid = NETLINK_CB(skb).portid; /* netlink portid */ NETLINK_CB(skb).portid = 0; /* from kernel */ NETLINK_CB(skb).dst_group = 0; /* unicast */ nlmsg_unicast(net->ipv4.fibnl, skb, portid); } static int __net_init nl_fib_lookup_init(struct net *net) { struct sock *sk; struct netlink_kernel_cfg cfg = { .input = nl_fib_input, }; sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg); if (!sk) return -EAFNOSUPPORT; net->ipv4.fibnl = sk; return 0; } static void nl_fib_lookup_exit(struct net *net) { netlink_kernel_release(net->ipv4.fibnl); net->ipv4.fibnl = NULL; } static void fib_disable_ip(struct net_device *dev, unsigned long event, bool force) { if (fib_sync_down_dev(dev, event, force)) fib_flush(dev_net(dev)); else rt_cache_flush(dev_net(dev)); arp_ifdown(dev); } static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct in_ifaddr *ifa = ptr; struct net_device *dev = ifa->ifa_dev->dev; struct net *net = dev_net(dev); switch (event) { case NETDEV_UP: fib_add_ifaddr(ifa); #ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev, RTNH_F_DEAD); #endif atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(net); break; case NETDEV_DOWN: fib_del_ifaddr(ifa, NULL); atomic_inc(&net->ipv4.dev_addr_genid); if (!ifa->ifa_dev->ifa_list) { /* Last address was deleted from this interface. * Disable IP. */ fib_disable_ip(dev, event, true); } else { rt_cache_flush(net); } break; } return NOTIFY_DONE; } static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netdev_notifier_changeupper_info *upper_info = ptr; struct netdev_notifier_info_ext *info_ext = ptr; struct in_device *in_dev; struct net *net = dev_net(dev); struct in_ifaddr *ifa; unsigned int flags; if (event == NETDEV_UNREGISTER) { fib_disable_ip(dev, event, true); rt_flush_dev(dev); return NOTIFY_DONE; } in_dev = __in_dev_get_rtnl(dev); if (!in_dev) return NOTIFY_DONE; switch (event) { case NETDEV_UP: in_dev_for_each_ifa_rtnl(ifa, in_dev) { fib_add_ifaddr(ifa); } #ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev, RTNH_F_DEAD); #endif atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(net); break; case NETDEV_DOWN: fib_disable_ip(dev, event, false); break; case NETDEV_CHANGE: flags = netif_get_flags(dev); if (flags & (IFF_RUNNING | IFF_LOWER_UP)) fib_sync_up(dev, RTNH_F_LINKDOWN); else fib_sync_down_dev(dev, event, false); rt_cache_flush(net); break; case NETDEV_CHANGEMTU: fib_sync_mtu(dev, info_ext->ext.mtu); rt_cache_flush(net); break; case NETDEV_CHANGEUPPER: upper_info = ptr; /* flush all routes if dev is linked to or unlinked from * an L3 master device (e.g., VRF) */ if (upper_info->upper_dev && netif_is_l3_master(upper_info->upper_dev)) fib_disable_ip(dev, NETDEV_DOWN, true); break; } return NOTIFY_DONE; } static struct notifier_block fib_inetaddr_notifier = { .notifier_call = fib_inetaddr_event, }; static struct notifier_block fib_netdev_notifier = { .notifier_call = fib_netdev_event, }; static int __net_init ip_fib_net_init(struct net *net) { int err; size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ; err = fib4_notifier_init(net); if (err) return err; #ifdef CONFIG_IP_ROUTE_MULTIPATH /* Default to 3-tuple */ net->ipv4.sysctl_fib_multipath_hash_fields = FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK; #endif /* Avoid false sharing : Use at least a full cache line */ size = max_t(size_t, size, L1_CACHE_BYTES); net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL); if (!net->ipv4.fib_table_hash) { err = -ENOMEM; goto err_table_hash_alloc; } err = fib4_rules_init(net); if (err < 0) goto err_rules_init; return 0; err_rules_init: kfree(net->ipv4.fib_table_hash); err_table_hash_alloc: fib4_notifier_exit(net); return err; } static void ip_fib_net_exit(struct net *net) { int i; ASSERT_RTNL_NET(net); #ifdef CONFIG_IP_MULTIPLE_TABLES RCU_INIT_POINTER(net->ipv4.fib_main, NULL); RCU_INIT_POINTER(net->ipv4.fib_default, NULL); #endif /* Destroy the tables in reverse order to guarantee that the * local table, ID 255, is destroyed before the main table, ID * 254. This is necessary as the local table may contain * references to data contained in the main table. */ for (i = FIB_TABLE_HASHSZ - 1; i >= 0; i--) { struct hlist_head *head = &net->ipv4.fib_table_hash[i]; struct hlist_node *tmp; struct fib_table *tb; hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) { hlist_del(&tb->tb_hlist); fib_table_flush(net, tb, true); fib_free_table(tb); } } #ifdef CONFIG_IP_MULTIPLE_TABLES fib4_rules_exit(net); #endif kfree(net->ipv4.fib_table_hash); fib4_notifier_exit(net); } static int __net_init fib_net_init(struct net *net) { int error; #ifdef CONFIG_IP_ROUTE_CLASSID atomic_set(&net->ipv4.fib_num_tclassid_users, 0); #endif error = ip_fib_net_init(net); if (error < 0) goto out; error = fib4_semantics_init(net); if (error) goto out_semantics; error = nl_fib_lookup_init(net); if (error < 0) goto out_nlfl; error = fib_proc_init(net); if (error < 0) goto out_proc; out: return error; out_proc: nl_fib_lookup_exit(net); out_nlfl: fib4_semantics_exit(net); out_semantics: rtnl_net_lock(net); ip_fib_net_exit(net); rtnl_net_unlock(net); goto out; } static void __net_exit fib_net_exit(struct net *net) { fib_proc_exit(net); nl_fib_lookup_exit(net); } static void __net_exit fib_net_exit_batch(struct list_head *net_list) { struct net *net; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) { __rtnl_net_lock(net); ip_fib_net_exit(net); __rtnl_net_unlock(net); } rtnl_unlock(); list_for_each_entry(net, net_list, exit_list) fib4_semantics_exit(net); } static struct pernet_operations fib_net_ops = { .init = fib_net_init, .exit = fib_net_exit, .exit_batch = fib_net_exit_batch, }; static const struct rtnl_msg_handler fib_rtnl_msg_handlers[] __initconst = { {.protocol = PF_INET, .msgtype = RTM_NEWROUTE, .doit = inet_rtm_newroute, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_DELROUTE, .doit = inet_rtm_delroute, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_GETROUTE, .dumpit = inet_dump_fib, .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, }; void __init ip_fib_init(void) { fib_trie_init(); register_pernet_subsys(&fib_net_ops); register_netdevice_notifier(&fib_netdev_notifier); register_inetaddr_notifier(&fib_inetaddr_notifier); rtnl_register_many(fib_rtnl_msg_handlers); }
15 15 13 15 15 14 15 13 15 1 15 14 15 15 15 1 15 15 14 15 15 15 15 15 15 15 14 15 15 15 1 14 13 13 13 15 15 15 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 /* * Non-physical true random number generator based on timing jitter -- * Jitter RNG standalone code. * * Copyright Stephan Mueller <smueller@chronox.de>, 2015 - 2023 * * Design * ====== * * See https://www.chronox.de/jent.html * * License * ======= * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, and the entire permission notice in its entirety, * including the disclaimer of warranties. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior * written permission. * * ALTERNATIVELY, this product may be distributed under the terms of * the GNU General Public License, in which case the provisions of the GPL2 are * required INSTEAD OF the above restrictions. (This clause is * necessary due to a potential bad interaction between the GPL and * the restrictions contained in a BSD-style copyright.) * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ /* * This Jitterentropy RNG is based on the jitterentropy library * version 3.4.0 provided at https://www.chronox.de/jent.html */ #ifdef __OPTIMIZE__ #error "The CPU Jitter random number generator must not be compiled with optimizations. See documentation. Use the compiler switch -O0 for compiling jitterentropy.c." #endif typedef unsigned long long __u64; typedef long long __s64; typedef unsigned int __u32; typedef unsigned char u8; #define NULL ((void *) 0) /* The entropy pool */ struct rand_data { /* SHA3-256 is used as conditioner */ #define DATA_SIZE_BITS 256 /* all data values that are vital to maintain the security * of the RNG are marked as SENSITIVE. A user must not * access that information while the RNG executes its loops to * calculate the next random value. */ void *hash_state; /* SENSITIVE hash state entropy pool */ __u64 prev_time; /* SENSITIVE Previous time stamp */ __u64 last_delta; /* SENSITIVE stuck test */ __s64 last_delta2; /* SENSITIVE stuck test */ unsigned int flags; /* Flags used to initialize */ unsigned int osr; /* Oversample rate */ #define JENT_MEMORY_ACCESSLOOPS 128 #define JENT_MEMORY_SIZE \ (CONFIG_CRYPTO_JITTERENTROPY_MEMORY_BLOCKS * \ CONFIG_CRYPTO_JITTERENTROPY_MEMORY_BLOCKSIZE) unsigned char *mem; /* Memory access location with size of * memblocks * memblocksize */ unsigned int memlocation; /* Pointer to byte in *mem */ unsigned int memblocks; /* Number of memory blocks in *mem */ unsigned int memblocksize; /* Size of one memory block in bytes */ unsigned int memaccessloops; /* Number of memory accesses per random * bit generation */ /* Repetition Count Test */ unsigned int rct_count; /* Number of stuck values */ /* Adaptive Proportion Test cutoff values */ unsigned int apt_cutoff; /* Intermittent health test failure */ unsigned int apt_cutoff_permanent; /* Permanent health test failure */ #define JENT_APT_WINDOW_SIZE 512 /* Data window size */ /* LSB of time stamp to process */ #define JENT_APT_LSB 16 #define JENT_APT_WORD_MASK (JENT_APT_LSB - 1) unsigned int apt_observations; /* Number of collected observations */ unsigned int apt_count; /* APT counter */ unsigned int apt_base; /* APT base reference */ unsigned int health_failure; /* Record health failure */ unsigned int apt_base_set:1; /* APT base reference set? */ }; /* Flags that can be used to initialize the RNG */ #define JENT_DISABLE_MEMORY_ACCESS (1<<2) /* Disable memory access for more * entropy, saves MEMORY_SIZE RAM for * entropy collector */ /* -- error codes for init function -- */ #define JENT_ENOTIME 1 /* Timer service not available */ #define JENT_ECOARSETIME 2 /* Timer too coarse for RNG */ #define JENT_ENOMONOTONIC 3 /* Timer is not monotonic increasing */ #define JENT_EVARVAR 5 /* Timer does not produce variations of * variations (2nd derivation of time is * zero). */ #define JENT_ESTUCK 8 /* Too many stuck results during init. */ #define JENT_EHEALTH 9 /* Health test failed during initialization */ #define JENT_ERCT 10 /* RCT failed during initialization */ #define JENT_EHASH 11 /* Hash self test failed */ #define JENT_EMEM 12 /* Can't allocate memory for initialization */ #define JENT_RCT_FAILURE 1 /* Failure in RCT health test. */ #define JENT_APT_FAILURE 2 /* Failure in APT health test. */ #define JENT_PERMANENT_FAILURE_SHIFT 16 #define JENT_PERMANENT_FAILURE(x) (x << JENT_PERMANENT_FAILURE_SHIFT) #define JENT_RCT_FAILURE_PERMANENT JENT_PERMANENT_FAILURE(JENT_RCT_FAILURE) #define JENT_APT_FAILURE_PERMANENT JENT_PERMANENT_FAILURE(JENT_APT_FAILURE) /* * The output n bits can receive more than n bits of min entropy, of course, * but the fixed output of the conditioning function can only asymptotically * approach the output size bits of min entropy, not attain that bound. Random * maps will tend to have output collisions, which reduces the creditable * output entropy (that is what SP 800-90B Section 3.1.5.1.2 attempts to bound). * * The value "64" is justified in Appendix A.4 of the current 90C draft, * and aligns with NIST's in "epsilon" definition in this document, which is * that a string can be considered "full entropy" if you can bound the min * entropy in each bit of output to at least 1-epsilon, where epsilon is * required to be <= 2^(-32). */ #define JENT_ENTROPY_SAFETY_FACTOR 64 #include <linux/array_size.h> #include <linux/fips.h> #include <linux/minmax.h> #include "jitterentropy.h" /*************************************************************************** * Adaptive Proportion Test * * This test complies with SP800-90B section 4.4.2. ***************************************************************************/ /* * See the SP 800-90B comment #10b for the corrected cutoff for the SP 800-90B * APT. * https://www.untruth.org/~josh/sp80090b/UL%20SP800-90B-final%20comments%20v1.9%2020191212.pdf * In the syntax of R, this is C = 2 + qbinom(1 − 2^(−30), 511, 2^(-1/osr)). * (The original formula wasn't correct because the first symbol must * necessarily have been observed, so there is no chance of observing 0 of these * symbols.) * * For the alpha < 2^-53, R cannot be used as it uses a float data type without * arbitrary precision. A SageMath script is used to calculate those cutoff * values. * * For any value above 14, this yields the maximal allowable value of 512 * (by FIPS 140-2 IG 7.19 Resolution # 16, we cannot choose a cutoff value that * renders the test unable to fail). */ static const unsigned int jent_apt_cutoff_lookup[15] = { 325, 422, 459, 477, 488, 494, 499, 502, 505, 507, 508, 509, 510, 511, 512 }; static const unsigned int jent_apt_cutoff_permanent_lookup[15] = { 355, 447, 479, 494, 502, 507, 510, 512, 512, 512, 512, 512, 512, 512, 512 }; static void jent_apt_init(struct rand_data *ec, unsigned int osr) { /* * Establish the apt_cutoff based on the presumed entropy rate of * 1/osr. */ if (osr >= ARRAY_SIZE(jent_apt_cutoff_lookup)) { ec->apt_cutoff = jent_apt_cutoff_lookup[ ARRAY_SIZE(jent_apt_cutoff_lookup) - 1]; ec->apt_cutoff_permanent = jent_apt_cutoff_permanent_lookup[ ARRAY_SIZE(jent_apt_cutoff_permanent_lookup) - 1]; } else { ec->apt_cutoff = jent_apt_cutoff_lookup[osr - 1]; ec->apt_cutoff_permanent = jent_apt_cutoff_permanent_lookup[osr - 1]; } } /* * Reset the APT counter * * @ec [in] Reference to entropy collector */ static void jent_apt_reset(struct rand_data *ec, unsigned int delta_masked) { /* Reset APT counter */ ec->apt_count = 0; ec->apt_base = delta_masked; ec->apt_observations = 0; } /* * Insert a new entropy event into APT * * @ec [in] Reference to entropy collector * @delta_masked [in] Masked time delta to process */ static void jent_apt_insert(struct rand_data *ec, unsigned int delta_masked) { /* Initialize the base reference */ if (!ec->apt_base_set) { ec->apt_base = delta_masked; ec->apt_base_set = 1; return; } if (delta_masked == ec->apt_base) { ec->apt_count++; /* Note, ec->apt_count starts with one. */ if (ec->apt_count >= ec->apt_cutoff_permanent) ec->health_failure |= JENT_APT_FAILURE_PERMANENT; else if (ec->apt_count >= ec->apt_cutoff) ec->health_failure |= JENT_APT_FAILURE; } ec->apt_observations++; if (ec->apt_observations >= JENT_APT_WINDOW_SIZE) jent_apt_reset(ec, delta_masked); } /*************************************************************************** * Stuck Test and its use as Repetition Count Test * * The Jitter RNG uses an enhanced version of the Repetition Count Test * (RCT) specified in SP800-90B section 4.4.1. Instead of counting identical * back-to-back values, the input to the RCT is the counting of the stuck * values during the generation of one Jitter RNG output block. * * The RCT is applied with an alpha of 2^{-30} compliant to FIPS 140-2 IG 9.8. * * During the counting operation, the Jitter RNG always calculates the RCT * cut-off value of C. If that value exceeds the allowed cut-off value, * the Jitter RNG output block will be calculated completely but discarded at * the end. The caller of the Jitter RNG is informed with an error code. ***************************************************************************/ /* * Repetition Count Test as defined in SP800-90B section 4.4.1 * * @ec [in] Reference to entropy collector * @stuck [in] Indicator whether the value is stuck */ static void jent_rct_insert(struct rand_data *ec, int stuck) { if (stuck) { ec->rct_count++; /* * The cutoff value is based on the following consideration: * alpha = 2^-30 or 2^-60 as recommended in SP800-90B. * In addition, we require an entropy value H of 1/osr as this * is the minimum entropy required to provide full entropy. * Note, we collect (DATA_SIZE_BITS + ENTROPY_SAFETY_FACTOR)*osr * deltas for inserting them into the entropy pool which should * then have (close to) DATA_SIZE_BITS bits of entropy in the * conditioned output. * * Note, ec->rct_count (which equals to value B in the pseudo * code of SP800-90B section 4.4.1) starts with zero. Hence * we need to subtract one from the cutoff value as calculated * following SP800-90B. Thus C = ceil(-log_2(alpha)/H) = 30*osr * or 60*osr. */ if ((unsigned int)ec->rct_count >= (60 * ec->osr)) { ec->rct_count = -1; ec->health_failure |= JENT_RCT_FAILURE_PERMANENT; } else if ((unsigned int)ec->rct_count >= (30 * ec->osr)) { ec->rct_count = -1; ec->health_failure |= JENT_RCT_FAILURE; } } else { /* Reset RCT */ ec->rct_count = 0; } } static inline __u64 jent_delta(__u64 prev, __u64 next) { #define JENT_UINT64_MAX (__u64)(~((__u64) 0)) return (prev < next) ? (next - prev) : (JENT_UINT64_MAX - prev + 1 + next); } /* * Stuck test by checking the: * 1st derivative of the jitter measurement (time delta) * 2nd derivative of the jitter measurement (delta of time deltas) * 3rd derivative of the jitter measurement (delta of delta of time deltas) * * All values must always be non-zero. * * @ec [in] Reference to entropy collector * @current_delta [in] Jitter time delta * * @return * 0 jitter measurement not stuck (good bit) * 1 jitter measurement stuck (reject bit) */ static int jent_stuck(struct rand_data *ec, __u64 current_delta) { __u64 delta2 = jent_delta(ec->last_delta, current_delta); __u64 delta3 = jent_delta(ec->last_delta2, delta2); ec->last_delta = current_delta; ec->last_delta2 = delta2; /* * Insert the result of the comparison of two back-to-back time * deltas. */ jent_apt_insert(ec, current_delta); if (!current_delta || !delta2 || !delta3) { /* RCT with a stuck bit */ jent_rct_insert(ec, 1); return 1; } /* RCT with a non-stuck bit */ jent_rct_insert(ec, 0); return 0; } /* * Report any health test failures * * @ec [in] Reference to entropy collector * * @return a bitmask indicating which tests failed * 0 No health test failure * 1 RCT failure * 2 APT failure * 1<<JENT_PERMANENT_FAILURE_SHIFT RCT permanent failure * 2<<JENT_PERMANENT_FAILURE_SHIFT APT permanent failure */ static unsigned int jent_health_failure(struct rand_data *ec) { /* Test is only enabled in FIPS mode */ if (!fips_enabled) return 0; return ec->health_failure; } /*************************************************************************** * Noise sources ***************************************************************************/ /* * Update of the loop count used for the next round of * an entropy collection. * * Input: * @bits is the number of low bits of the timer to consider * @min is the number of bits we shift the timer value to the right at * the end to make sure we have a guaranteed minimum value * * @return Newly calculated loop counter */ static __u64 jent_loop_shuffle(unsigned int bits, unsigned int min) { __u64 time = 0; __u64 shuffle = 0; unsigned int i = 0; unsigned int mask = (1<<bits) - 1; jent_get_nstime(&time); /* * We fold the time value as much as possible to ensure that as many * bits of the time stamp are included as possible. */ for (i = 0; ((DATA_SIZE_BITS + bits - 1) / bits) > i; i++) { shuffle ^= time & mask; time = time >> bits; } /* * We add a lower boundary value to ensure we have a minimum * RNG loop count. */ return (shuffle + (1<<min)); } /* * CPU Jitter noise source -- this is the noise source based on the CPU * execution time jitter * * This function injects the individual bits of the time value into the * entropy pool using a hash. * * ec [in] entropy collector * time [in] time stamp to be injected * stuck [in] Is the time stamp identified as stuck? * * Output: * updated hash context in the entropy collector or error code */ static int jent_condition_data(struct rand_data *ec, __u64 time, int stuck) { #define SHA3_HASH_LOOP (1<<3) struct { int rct_count; unsigned int apt_observations; unsigned int apt_count; unsigned int apt_base; } addtl = { ec->rct_count, ec->apt_observations, ec->apt_count, ec->apt_base }; return jent_hash_time(ec->hash_state, time, (u8 *)&addtl, sizeof(addtl), SHA3_HASH_LOOP, stuck); } /* * Memory Access noise source -- this is a noise source based on variations in * memory access times * * This function performs memory accesses which will add to the timing * variations due to an unknown amount of CPU wait states that need to be * added when accessing memory. The memory size should be larger than the L1 * caches as outlined in the documentation and the associated testing. * * The L1 cache has a very high bandwidth, albeit its access rate is usually * slower than accessing CPU registers. Therefore, L1 accesses only add minimal * variations as the CPU has hardly to wait. Starting with L2, significant * variations are added because L2 typically does not belong to the CPU any more * and therefore a wider range of CPU wait states is necessary for accesses. * L3 and real memory accesses have even a wider range of wait states. However, * to reliably access either L3 or memory, the ec->mem memory must be quite * large which is usually not desirable. * * @ec [in] Reference to the entropy collector with the memory access data -- if * the reference to the memory block to be accessed is NULL, this noise * source is disabled * @loop_cnt [in] if a value not equal to 0 is set, use the given value * number of loops to perform the LFSR */ static void jent_memaccess(struct rand_data *ec, __u64 loop_cnt) { unsigned int wrap = 0; __u64 i = 0; #define MAX_ACC_LOOP_BIT 7 #define MIN_ACC_LOOP_BIT 0 __u64 acc_loop_cnt = jent_loop_shuffle(MAX_ACC_LOOP_BIT, MIN_ACC_LOOP_BIT); if (NULL == ec || NULL == ec->mem) return; wrap = ec->memblocksize * ec->memblocks; /* * testing purposes -- allow test app to set the counter, not * needed during runtime */ if (loop_cnt) acc_loop_cnt = loop_cnt; for (i = 0; i < (ec->memaccessloops + acc_loop_cnt); i++) { unsigned char *tmpval = ec->mem + ec->memlocation; /* * memory access: just add 1 to one byte, * wrap at 255 -- memory access implies read * from and write to memory location */ *tmpval = (*tmpval + 1) & 0xff; /* * Addition of memblocksize - 1 to pointer * with wrap around logic to ensure that every * memory location is hit evenly */ ec->memlocation = ec->memlocation + ec->memblocksize - 1; ec->memlocation = ec->memlocation % wrap; } } /*************************************************************************** * Start of entropy processing logic ***************************************************************************/ /* * This is the heart of the entropy generation: calculate time deltas and * use the CPU jitter in the time deltas. The jitter is injected into the * entropy pool. * * WARNING: ensure that ->prev_time is primed before using the output * of this function! This can be done by calling this function * and not using its result. * * @ec [in] Reference to entropy collector * * @return result of stuck test */ static int jent_measure_jitter(struct rand_data *ec, __u64 *ret_current_delta) { __u64 time = 0; __u64 current_delta = 0; int stuck; /* Invoke one noise source before time measurement to add variations */ jent_memaccess(ec, 0); /* * Get time stamp and calculate time delta to previous * invocation to measure the timing variations */ jent_get_nstime(&time); current_delta = jent_delta(ec->prev_time, time); ec->prev_time = time; /* Check whether we have a stuck measurement. */ stuck = jent_stuck(ec, current_delta); /* Now call the next noise sources which also injects the data */ if (jent_condition_data(ec, current_delta, stuck)) stuck = 1; /* return the raw entropy value */ if (ret_current_delta) *ret_current_delta = current_delta; return stuck; } /* * Generator of one 64 bit random number * Function fills rand_data->hash_state * * @ec [in] Reference to entropy collector */ static void jent_gen_entropy(struct rand_data *ec) { unsigned int k = 0, safety_factor = 0; if (fips_enabled) safety_factor = JENT_ENTROPY_SAFETY_FACTOR; /* priming of the ->prev_time value */ jent_measure_jitter(ec, NULL); while (!jent_health_failure(ec)) { /* If a stuck measurement is received, repeat measurement */ if (jent_measure_jitter(ec, NULL)) continue; /* * We multiply the loop value with ->osr to obtain the * oversampling rate requested by the caller */ if (++k >= ((DATA_SIZE_BITS + safety_factor) * ec->osr)) break; } } /* * Entry function: Obtain entropy for the caller. * * This function invokes the entropy gathering logic as often to generate * as many bytes as requested by the caller. The entropy gathering logic * creates 64 bit per invocation. * * This function truncates the last 64 bit entropy value output to the exact * size specified by the caller. * * @ec [in] Reference to entropy collector * @data [in] pointer to buffer for storing random data -- buffer must already * exist * @len [in] size of the buffer, specifying also the requested number of random * in bytes * * @return 0 when request is fulfilled or an error * * The following error codes can occur: * -1 entropy_collector is NULL or the generation failed * -2 Intermittent health failure * -3 Permanent health failure */ int jent_read_entropy(struct rand_data *ec, unsigned char *data, unsigned int len) { unsigned char *p = data; if (!ec) return -1; while (len > 0) { unsigned int tocopy, health_test_result; jent_gen_entropy(ec); health_test_result = jent_health_failure(ec); if (health_test_result > JENT_PERMANENT_FAILURE_SHIFT) { /* * At this point, the Jitter RNG instance is considered * as a failed instance. There is no rerun of the * startup test any more, because the caller * is assumed to not further use this instance. */ return -3; } else if (health_test_result) { /* * Perform startup health tests and return permanent * error if it fails. */ if (jent_entropy_init(0, 0, NULL, ec)) { /* Mark the permanent error */ ec->health_failure &= JENT_RCT_FAILURE_PERMANENT | JENT_APT_FAILURE_PERMANENT; return -3; } return -2; } tocopy = min(DATA_SIZE_BITS / 8, len); if (jent_read_random_block(ec->hash_state, p, tocopy)) return -1; len -= tocopy; p += tocopy; } return 0; } /*************************************************************************** * Initialization logic ***************************************************************************/ struct rand_data *jent_entropy_collector_alloc(unsigned int osr, unsigned int flags, void *hash_state) { struct rand_data *entropy_collector; entropy_collector = jent_zalloc(sizeof(struct rand_data)); if (!entropy_collector) return NULL; if (!(flags & JENT_DISABLE_MEMORY_ACCESS)) { /* Allocate memory for adding variations based on memory * access */ entropy_collector->mem = jent_kvzalloc(JENT_MEMORY_SIZE); if (!entropy_collector->mem) { jent_zfree(entropy_collector); return NULL; } entropy_collector->memblocksize = CONFIG_CRYPTO_JITTERENTROPY_MEMORY_BLOCKSIZE; entropy_collector->memblocks = CONFIG_CRYPTO_JITTERENTROPY_MEMORY_BLOCKS; entropy_collector->memaccessloops = JENT_MEMORY_ACCESSLOOPS; } /* verify and set the oversampling rate */ if (osr == 0) osr = 1; /* H_submitter = 1 / osr */ entropy_collector->osr = osr; entropy_collector->flags = flags; entropy_collector->hash_state = hash_state; /* Initialize the APT */ jent_apt_init(entropy_collector, osr); /* fill the data pad with non-zero values */ jent_gen_entropy(entropy_collector); return entropy_collector; } void jent_entropy_collector_free(struct rand_data *entropy_collector) { jent_kvzfree(entropy_collector->mem, JENT_MEMORY_SIZE); entropy_collector->mem = NULL; jent_zfree(entropy_collector); } int jent_entropy_init(unsigned int osr, unsigned int flags, void *hash_state, struct rand_data *p_ec) { /* * If caller provides an allocated ec, reuse it which implies that the * health test entropy data is used to further still the available * entropy pool. */ struct rand_data *ec = p_ec; int i, time_backwards = 0, ret = 0, ec_free = 0; unsigned int health_test_result; if (!ec) { ec = jent_entropy_collector_alloc(osr, flags, hash_state); if (!ec) return JENT_EMEM; ec_free = 1; } else { /* Reset the APT */ jent_apt_reset(ec, 0); /* Ensure that a new APT base is obtained */ ec->apt_base_set = 0; /* Reset the RCT */ ec->rct_count = 0; /* Reset intermittent, leave permanent health test result */ ec->health_failure &= (~JENT_RCT_FAILURE); ec->health_failure &= (~JENT_APT_FAILURE); } /* We could perform statistical tests here, but the problem is * that we only have a few loop counts to do testing. These * loop counts may show some slight skew and we produce * false positives. * * Moreover, only old systems show potentially problematic * jitter entropy that could potentially be caught here. But * the RNG is intended for hardware that is available or widely * used, but not old systems that are long out of favor. Thus, * no statistical tests. */ /* * We could add a check for system capabilities such as clock_getres or * check for CONFIG_X86_TSC, but it does not make much sense as the * following sanity checks verify that we have a high-resolution * timer. */ /* * TESTLOOPCOUNT needs some loops to identify edge systems. 100 is * definitely too little. * * SP800-90B requires at least 1024 initial test cycles. */ #define TESTLOOPCOUNT 1024 #define CLEARCACHE 100 for (i = 0; (TESTLOOPCOUNT + CLEARCACHE) > i; i++) { __u64 start_time = 0, end_time = 0, delta = 0; /* Invoke core entropy collection logic */ jent_measure_jitter(ec, &delta); end_time = ec->prev_time; start_time = ec->prev_time - delta; /* test whether timer works */ if (!start_time || !end_time) { ret = JENT_ENOTIME; goto out; } /* * test whether timer is fine grained enough to provide * delta even when called shortly after each other -- this * implies that we also have a high resolution timer */ if (!delta || (end_time == start_time)) { ret = JENT_ECOARSETIME; goto out; } /* * up to here we did not modify any variable that will be * evaluated later, but we already performed some work. Thus we * already have had an impact on the caches, branch prediction, * etc. with the goal to clear it to get the worst case * measurements. */ if (i < CLEARCACHE) continue; /* test whether we have an increasing timer */ if (!(end_time > start_time)) time_backwards++; } /* * we allow up to three times the time running backwards. * CLOCK_REALTIME is affected by adjtime and NTP operations. Thus, * if such an operation just happens to interfere with our test, it * should not fail. The value of 3 should cover the NTP case being * performed during our test run. */ if (time_backwards > 3) { ret = JENT_ENOMONOTONIC; goto out; } /* Did we encounter a health test failure? */ health_test_result = jent_health_failure(ec); if (health_test_result) { ret = (health_test_result & JENT_RCT_FAILURE) ? JENT_ERCT : JENT_EHEALTH; goto out; } out: if (ec_free) jent_entropy_collector_free(ec); return ret; }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef __PSP_PSP_H #define __PSP_PSP_H #include <linux/list.h> #include <linux/lockdep.h> #include <linux/mutex.h> #include <net/netns/generic.h> #include <net/psp.h> #include <net/sock.h> extern struct xarray psp_devs; extern struct mutex psp_devs_lock; void psp_dev_free(struct psp_dev *psd); int psp_dev_check_access(struct psp_dev *psd, struct net *net); void psp_nl_notify_dev(struct psp_dev *psd, u32 cmd); struct psp_assoc *psp_assoc_create(struct psp_dev *psd); struct psp_dev *psp_dev_get_for_sock(struct sock *sk); void psp_dev_tx_key_del(struct psp_dev *psd, struct psp_assoc *pas); int psp_sock_assoc_set_rx(struct sock *sk, struct psp_assoc *pas, struct psp_key_parsed *key, struct netlink_ext_ack *extack); int psp_sock_assoc_set_tx(struct sock *sk, struct psp_dev *psd, u32 version, struct psp_key_parsed *key, struct netlink_ext_ack *extack); void psp_assocs_key_rotated(struct psp_dev *psd); static inline void psp_dev_get(struct psp_dev *psd) { refcount_inc(&psd->refcnt); } static inline bool psp_dev_tryget(struct psp_dev *psd) { return refcount_inc_not_zero(&psd->refcnt); } static inline void psp_dev_put(struct psp_dev *psd) { if (refcount_dec_and_test(&psd->refcnt)) psp_dev_free(psd); } static inline bool psp_dev_is_registered(struct psp_dev *psd) { lockdep_assert_held(&psd->lock); return !!psd->ops; } #endif /* __PSP_PSP_H */
15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 14 14 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 14 15 15 15 15 15 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ /* Fredy Neeser */ /* Greg Joyce <greg@opengridcomputing.com> */ /* Copyright (c) 2008-2019, IBM Corporation */ /* Copyright (c) 2017, Open Grid Computing, Inc. */ #include <linux/errno.h> #include <linux/types.h> #include <linux/net.h> #include <linux/inetdevice.h> #include <net/addrconf.h> #include <linux/workqueue.h> #include <net/sock.h> #include <net/tcp.h> #include <linux/inet.h> #include <linux/tcp.h> #include <trace/events/sock.h> #include <rdma/iw_cm.h> #include <rdma/ib_verbs.h> #include <rdma/ib_user_verbs.h> #include "siw.h" #include "siw_cm.h" /* * Set to any combination of * MPA_V2_RDMA_NO_RTR, MPA_V2_RDMA_READ_RTR, MPA_V2_RDMA_WRITE_RTR */ static __be16 rtr_type = MPA_V2_RDMA_READ_RTR | MPA_V2_RDMA_WRITE_RTR; static const bool relaxed_ird_negotiation = true; static void siw_cm_llp_state_change(struct sock *s); static void siw_cm_llp_data_ready(struct sock *s); static void siw_cm_llp_write_space(struct sock *s); static void siw_cm_llp_error_report(struct sock *s); static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason, int status); #ifdef CONFIG_DEBUG_LOCK_ALLOC /* * lockdep can detect false positive circular dependencies * when there are user-space socket API users or in kernel * users switching between a tcp and rdma transport. * Maybe also switching between siw and rxe may cause * problems as per default sockets are only classified * by family and not by ip protocol. And there might * be different locks used between the application * and the low level sockets. * * Problems were seen with ksmbd.ko and cifs.ko, * switching transports, use git blame to find * more details. */ static struct lock_class_key siw_sk_key[2]; static struct lock_class_key siw_slock_key[2]; #endif /* CONFIG_DEBUG_LOCK_ALLOC */ static inline void siw_reclassify_socket(struct socket *sock) { #ifdef CONFIG_DEBUG_LOCK_ALLOC struct sock *sk = sock->sk; if (WARN_ON_ONCE(!sock_allow_reclassification(sk))) return; switch (sk->sk_family) { case AF_INET: sock_lock_init_class_and_name(sk, "slock-AF_INET-RDMA-SIW", &siw_slock_key[0], "sk_lock-AF_INET-RDMA-SIW", &siw_sk_key[0]); break; case AF_INET6: sock_lock_init_class_and_name(sk, "slock-AF_INET6-RDMA-SIW", &siw_slock_key[1], "sk_lock-AF_INET6-RDMA-SIW", &siw_sk_key[1]); break; default: WARN_ON_ONCE(1); } #endif /* CONFIG_DEBUG_LOCK_ALLOC */ } static void siw_sk_assign_cm_upcalls(struct sock *sk) { struct siw_cep *cep = sk_to_cep(sk); write_lock_bh(&sk->sk_callback_lock); cep->sk_state_change = sk->sk_state_change; cep->sk_data_ready = sk->sk_data_ready; cep->sk_write_space = sk->sk_write_space; cep->sk_error_report = sk->sk_error_report; sk->sk_state_change = siw_cm_llp_state_change; sk->sk_data_ready = siw_cm_llp_data_ready; sk->sk_write_space = siw_cm_llp_write_space; sk->sk_error_report = siw_cm_llp_error_report; write_unlock_bh(&sk->sk_callback_lock); } static void siw_sk_restore_upcalls(struct sock *sk, struct siw_cep *cep) { sk->sk_state_change = cep->sk_state_change; sk->sk_data_ready = cep->sk_data_ready; sk->sk_write_space = cep->sk_write_space; sk->sk_error_report = cep->sk_error_report; sk->sk_user_data = NULL; } static void siw_qp_socket_assoc(struct siw_cep *cep, struct siw_qp *qp) { struct socket *s = cep->sock; struct sock *sk = s->sk; write_lock_bh(&sk->sk_callback_lock); qp->attrs.sk = s; sk->sk_data_ready = siw_qp_llp_data_ready; sk->sk_write_space = siw_qp_llp_write_space; write_unlock_bh(&sk->sk_callback_lock); } static void siw_socket_disassoc(struct socket *s) { struct sock *sk = s->sk; struct siw_cep *cep; if (sk) { write_lock_bh(&sk->sk_callback_lock); cep = sk_to_cep(sk); if (cep) { siw_sk_restore_upcalls(sk, cep); siw_cep_put(cep); } else { pr_warn("siw: cannot restore sk callbacks: no ep\n"); } write_unlock_bh(&sk->sk_callback_lock); } else { pr_warn("siw: cannot restore sk callbacks: no sk\n"); } } static void siw_rtr_data_ready(struct sock *sk) { struct siw_cep *cep; struct siw_qp *qp = NULL; read_descriptor_t rd_desc; trace_sk_data_ready(sk); read_lock(&sk->sk_callback_lock); cep = sk_to_cep(sk); if (!cep) { WARN(1, "No connection endpoint\n"); goto out; } qp = sk_to_qp(sk); memset(&rd_desc, 0, sizeof(rd_desc)); rd_desc.arg.data = qp; rd_desc.count = 1; tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data); /* * Check if first frame was successfully processed. * Signal connection full establishment if yes. * Failed data processing would have already scheduled * connection drop. */ if (!qp->rx_stream.rx_suspend) siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0); out: read_unlock(&sk->sk_callback_lock); if (qp) siw_qp_socket_assoc(cep, qp); } static void siw_sk_assign_rtr_upcalls(struct siw_cep *cep) { struct sock *sk = cep->sock->sk; write_lock_bh(&sk->sk_callback_lock); sk->sk_data_ready = siw_rtr_data_ready; sk->sk_write_space = siw_qp_llp_write_space; write_unlock_bh(&sk->sk_callback_lock); } static void siw_cep_socket_assoc(struct siw_cep *cep, struct socket *s) { cep->sock = s; siw_cep_get(cep); s->sk->sk_user_data = cep; siw_sk_assign_cm_upcalls(s->sk); } static struct siw_cep *siw_cep_alloc(struct siw_device *sdev) { struct siw_cep *cep = kzalloc_obj(*cep); unsigned long flags; if (!cep) return NULL; INIT_LIST_HEAD(&cep->listenq); INIT_LIST_HEAD(&cep->devq); INIT_LIST_HEAD(&cep->work_freelist); kref_init(&cep->ref); cep->state = SIW_EPSTATE_IDLE; init_waitqueue_head(&cep->waitq); spin_lock_init(&cep->lock); cep->sdev = sdev; cep->enhanced_rdma_conn_est = false; spin_lock_irqsave(&sdev->lock, flags); list_add_tail(&cep->devq, &sdev->cep_list); spin_unlock_irqrestore(&sdev->lock, flags); siw_dbg_cep(cep, "new endpoint\n"); return cep; } static void siw_cm_free_work(struct siw_cep *cep) { struct list_head *w, *tmp; struct siw_cm_work *work; list_for_each_safe(w, tmp, &cep->work_freelist) { work = list_entry(w, struct siw_cm_work, list); list_del(&work->list); kfree(work); } } static void siw_cancel_mpatimer(struct siw_cep *cep) { spin_lock_bh(&cep->lock); if (cep->mpa_timer) { if (cancel_delayed_work(&cep->mpa_timer->work)) { siw_cep_put(cep); kfree(cep->mpa_timer); /* not needed again */ } cep->mpa_timer = NULL; } spin_unlock_bh(&cep->lock); } static void siw_put_work(struct siw_cm_work *work) { INIT_LIST_HEAD(&work->list); spin_lock_bh(&work->cep->lock); list_add(&work->list, &work->cep->work_freelist); spin_unlock_bh(&work->cep->lock); } static void siw_cep_set_inuse(struct siw_cep *cep) { unsigned long flags; retry: spin_lock_irqsave(&cep->lock, flags); if (cep->in_use) { spin_unlock_irqrestore(&cep->lock, flags); wait_event_interruptible(cep->waitq, !cep->in_use); if (signal_pending(current)) flush_signals(current); goto retry; } else { cep->in_use = 1; spin_unlock_irqrestore(&cep->lock, flags); } } static void siw_cep_set_free(struct siw_cep *cep) { unsigned long flags; spin_lock_irqsave(&cep->lock, flags); cep->in_use = 0; spin_unlock_irqrestore(&cep->lock, flags); wake_up(&cep->waitq); } static void __siw_cep_dealloc(struct kref *ref) { struct siw_cep *cep = container_of(ref, struct siw_cep, ref); struct siw_device *sdev = cep->sdev; unsigned long flags; WARN_ON(cep->listen_cep); /* kfree(NULL) is safe */ kfree(cep->mpa.pdata); spin_lock_bh(&cep->lock); if (!list_empty(&cep->work_freelist)) siw_cm_free_work(cep); spin_unlock_bh(&cep->lock); spin_lock_irqsave(&sdev->lock, flags); list_del(&cep->devq); spin_unlock_irqrestore(&sdev->lock, flags); siw_dbg_cep(cep, "free endpoint\n"); kfree(cep); } static struct siw_cm_work *siw_get_work(struct siw_cep *cep) { struct siw_cm_work *work = NULL; spin_lock_bh(&cep->lock); if (!list_empty(&cep->work_freelist)) { work = list_entry(cep->work_freelist.next, struct siw_cm_work, list); list_del_init(&work->list); } spin_unlock_bh(&cep->lock); return work; } static int siw_cm_alloc_work(struct siw_cep *cep, int num) { struct siw_cm_work *work; while (num--) { work = kmalloc_obj(*work); if (!work) { if (!(list_empty(&cep->work_freelist))) siw_cm_free_work(cep); return -ENOMEM; } work->cep = cep; INIT_LIST_HEAD(&work->list); list_add(&work->list, &cep->work_freelist); } return 0; } /* * siw_cm_upcall() * * Upcall to IWCM to inform about async connection events */ static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason, int status) { struct iw_cm_event event; struct iw_cm_id *id; memset(&event, 0, sizeof(event)); event.status = status; event.event = reason; if (reason == IW_CM_EVENT_CONNECT_REQUEST) { event.provider_data = cep; id = cep->listen_cep->cm_id; } else { id = cep->cm_id; } /* Signal IRD and ORD */ if (reason == IW_CM_EVENT_ESTABLISHED || reason == IW_CM_EVENT_CONNECT_REPLY) { /* Signal negotiated IRD/ORD values we will use */ event.ird = cep->ird; event.ord = cep->ord; } else if (reason == IW_CM_EVENT_CONNECT_REQUEST) { event.ird = cep->ord; event.ord = cep->ird; } /* Signal private data and address information */ if (reason == IW_CM_EVENT_CONNECT_REQUEST || reason == IW_CM_EVENT_CONNECT_REPLY) { u16 pd_len = be16_to_cpu(cep->mpa.hdr.params.pd_len); if (pd_len) { /* * hand over MPA private data */ event.private_data_len = pd_len; event.private_data = cep->mpa.pdata; /* Hide MPA V2 IRD/ORD control */ if (cep->enhanced_rdma_conn_est) { event.private_data_len -= sizeof(struct mpa_v2_data); event.private_data += sizeof(struct mpa_v2_data); } } getname_local(cep->sock, &event.local_addr); getname_peer(cep->sock, &event.remote_addr); } siw_dbg_cep(cep, "[QP %u]: reason=%d, status=%d\n", cep->qp ? qp_id(cep->qp) : UINT_MAX, reason, status); return id->event_handler(id, &event); } static void siw_free_cm_id(struct siw_cep *cep) { if (!cep->cm_id) return; cep->cm_id->rem_ref(cep->cm_id); cep->cm_id = NULL; } static void siw_destroy_cep_sock(struct siw_cep *cep) { if (cep->sock) { siw_socket_disassoc(cep->sock); sock_release(cep->sock); cep->sock = NULL; } } /* * siw_qp_cm_drop() * * Drops established LLP connection if present and not already * scheduled for dropping. Called from user context, SQ workqueue * or receive IRQ. Caller signals if socket can be immediately * closed (basically, if not in IRQ). */ void siw_qp_cm_drop(struct siw_qp *qp, int schedule) { struct siw_cep *cep = qp->cep; qp->rx_stream.rx_suspend = 1; qp->tx_ctx.tx_suspend = 1; if (!qp->cep) return; if (schedule) { siw_cm_queue_work(cep, SIW_CM_WORK_CLOSE_LLP); } else { siw_cep_set_inuse(cep); if (cep->state == SIW_EPSTATE_CLOSED) { siw_dbg_cep(cep, "already closed\n"); goto out; } siw_dbg_cep(cep, "immediate close, state %d\n", cep->state); siw_send_terminate(qp); if (cep->cm_id) { switch (cep->state) { case SIW_EPSTATE_AWAIT_MPAREP: siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL); break; case SIW_EPSTATE_RDMA_MODE: siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); break; case SIW_EPSTATE_IDLE: case SIW_EPSTATE_LISTENING: case SIW_EPSTATE_CONNECTING: case SIW_EPSTATE_AWAIT_MPAREQ: case SIW_EPSTATE_RECVD_MPAREQ: case SIW_EPSTATE_CLOSED: default: break; } siw_free_cm_id(cep); siw_cep_put(cep); } cep->state = SIW_EPSTATE_CLOSED; siw_destroy_cep_sock(cep); if (cep->qp) { cep->qp = NULL; siw_qp_put(qp); } out: siw_cep_set_free(cep); } } void siw_cep_put(struct siw_cep *cep) { WARN_ON(kref_read(&cep->ref) < 1); kref_put(&cep->ref, __siw_cep_dealloc); } static void siw_cep_set_free_and_put(struct siw_cep *cep) { siw_cep_set_free(cep); siw_cep_put(cep); } void siw_cep_get(struct siw_cep *cep) { kref_get(&cep->ref); } /* * Expects params->pd_len in host byte order */ static int siw_send_mpareqrep(struct siw_cep *cep, const void *pdata, u8 pd_len) { struct socket *s = cep->sock; struct mpa_rr *rr = &cep->mpa.hdr; struct kvec iov[3]; struct msghdr msg; int rv; int iovec_num = 0; int mpa_len; memset(&msg, 0, sizeof(msg)); iov[iovec_num].iov_base = rr; iov[iovec_num].iov_len = sizeof(*rr); mpa_len = sizeof(*rr); if (cep->enhanced_rdma_conn_est) { iovec_num++; iov[iovec_num].iov_base = &cep->mpa.v2_ctrl; iov[iovec_num].iov_len = sizeof(cep->mpa.v2_ctrl); mpa_len += sizeof(cep->mpa.v2_ctrl); } if (pd_len) { iovec_num++; iov[iovec_num].iov_base = (char *)pdata; iov[iovec_num].iov_len = pd_len; mpa_len += pd_len; } if (cep->enhanced_rdma_conn_est) pd_len += sizeof(cep->mpa.v2_ctrl); rr->params.pd_len = cpu_to_be16(pd_len); rv = kernel_sendmsg(s, &msg, iov, iovec_num + 1, mpa_len); return rv < 0 ? rv : 0; } /* * Receive MPA Request/Reply header. * * Returns 0 if complete MPA Request/Reply header including * eventual private data was received. Returns -EAGAIN if * header was partially received or negative error code otherwise. * * Context: May be called in process context only */ static int siw_recv_mpa_rr(struct siw_cep *cep) { struct mpa_rr *hdr = &cep->mpa.hdr; struct socket *s = cep->sock; u16 pd_len; int rcvd, to_rcv; if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) { rcvd = ksock_recv(s, (char *)hdr + cep->mpa.bytes_rcvd, sizeof(struct mpa_rr) - cep->mpa.bytes_rcvd, 0); if (rcvd <= 0) return -ECONNABORTED; cep->mpa.bytes_rcvd += rcvd; if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) return -EAGAIN; if (be16_to_cpu(hdr->params.pd_len) > MPA_MAX_PRIVDATA) return -EPROTO; } pd_len = be16_to_cpu(hdr->params.pd_len); /* * At least the MPA Request/Reply header (frame not including * private data) has been received. * Receive (or continue receiving) any private data. */ to_rcv = pd_len - (cep->mpa.bytes_rcvd - sizeof(struct mpa_rr)); if (!to_rcv) { /* * We must have hdr->params.pd_len == 0 and thus received a * complete MPA Request/Reply frame. * Check against peer protocol violation. */ u32 word; rcvd = ksock_recv(s, (char *)&word, sizeof(word), MSG_DONTWAIT); if (rcvd == -EAGAIN) return 0; if (rcvd == 0) { siw_dbg_cep(cep, "peer EOF\n"); return -EPIPE; } if (rcvd < 0) { siw_dbg_cep(cep, "error: %d\n", rcvd); return rcvd; } siw_dbg_cep(cep, "peer sent extra data: %d\n", rcvd); return -EPROTO; } /* * At this point, we must have hdr->params.pd_len != 0. * A private data buffer gets allocated if hdr->params.pd_len != 0. */ if (!cep->mpa.pdata) { cep->mpa.pdata = kmalloc(pd_len + 4, GFP_KERNEL); if (!cep->mpa.pdata) return -ENOMEM; } rcvd = ksock_recv( s, cep->mpa.pdata + cep->mpa.bytes_rcvd - sizeof(struct mpa_rr), to_rcv + 4, MSG_DONTWAIT); if (rcvd < 0) return rcvd; if (rcvd > to_rcv) return -EPROTO; cep->mpa.bytes_rcvd += rcvd; if (to_rcv == rcvd) { siw_dbg_cep(cep, "%d bytes private data received\n", pd_len); return 0; } return -EAGAIN; } /* * siw_proc_mpareq() * * Read MPA Request from socket and signal new connection to IWCM * if success. Caller must hold lock on corresponding listening CEP. */ static int siw_proc_mpareq(struct siw_cep *cep) { struct mpa_rr *req; int version, rv; u16 pd_len; rv = siw_recv_mpa_rr(cep); if (rv) return rv; req = &cep->mpa.hdr; version = __mpa_rr_revision(req->params.bits); pd_len = be16_to_cpu(req->params.pd_len); if (version > MPA_REVISION_2) /* allow for 0, 1, and 2 only */ return -EPROTO; if (memcmp(req->key, MPA_KEY_REQ, 16)) return -EPROTO; /* Prepare for sending MPA reply */ memcpy(req->key, MPA_KEY_REP, 16); if (version == MPA_REVISION_2 && (req->params.bits & MPA_RR_FLAG_ENHANCED)) { /* * MPA version 2 must signal IRD/ORD values and P2P mode * in private data if header flag MPA_RR_FLAG_ENHANCED * is set. */ if (pd_len < sizeof(struct mpa_v2_data)) goto reject_conn; cep->enhanced_rdma_conn_est = true; } /* MPA Markers: currently not supported. Marker TX to be added. */ if (req->params.bits & MPA_RR_FLAG_MARKERS) goto reject_conn; if (req->params.bits & MPA_RR_FLAG_CRC) { /* * RFC 5044, page 27: CRC MUST be used if peer requests it. * siw specific: 'mpa_crc_strict' parameter to reject * connection with CRC if local CRC off enforced by * 'mpa_crc_strict' module parameter. */ if (!mpa_crc_required && mpa_crc_strict) goto reject_conn; /* Enable CRC if requested by module parameter */ if (mpa_crc_required) req->params.bits |= MPA_RR_FLAG_CRC; } if (cep->enhanced_rdma_conn_est) { struct mpa_v2_data *v2 = (struct mpa_v2_data *)cep->mpa.pdata; /* * Peer requested ORD becomes requested local IRD, * peer requested IRD becomes requested local ORD. * IRD and ORD get limited by global maximum values. */ cep->ord = ntohs(v2->ird) & MPA_IRD_ORD_MASK; cep->ord = min(cep->ord, SIW_MAX_ORD_QP); cep->ird = ntohs(v2->ord) & MPA_IRD_ORD_MASK; cep->ird = min(cep->ird, SIW_MAX_IRD_QP); /* May get overwritten by locally negotiated values */ cep->mpa.v2_ctrl.ird = htons(cep->ird); cep->mpa.v2_ctrl.ord = htons(cep->ord); /* * Support for peer sent zero length Write or Read to * let local side enter RTS. Writes are preferred. * Sends would require pre-posting a Receive and are * not supported. * Propose zero length Write if none of Read and Write * is indicated. */ if (v2->ird & MPA_V2_PEER_TO_PEER) { cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER; if (v2->ord & MPA_V2_RDMA_WRITE_RTR) cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR; else if (v2->ord & MPA_V2_RDMA_READ_RTR) cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_READ_RTR; else cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR; } } cep->state = SIW_EPSTATE_RECVD_MPAREQ; /* Keep reference until IWCM accepts/rejects */ siw_cep_get(cep); rv = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REQUEST, 0); if (rv) siw_cep_put(cep); return rv; reject_conn: siw_dbg_cep(cep, "reject: crc %d:%d:%d, m %d:%d\n", req->params.bits & MPA_RR_FLAG_CRC ? 1 : 0, mpa_crc_required, mpa_crc_strict, req->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0); req->params.bits &= ~MPA_RR_FLAG_MARKERS; req->params.bits |= MPA_RR_FLAG_REJECT; if (!mpa_crc_required && mpa_crc_strict) req->params.bits &= ~MPA_RR_FLAG_CRC; if (pd_len) kfree(cep->mpa.pdata); cep->mpa.pdata = NULL; siw_send_mpareqrep(cep, NULL, 0); return -EOPNOTSUPP; } static int siw_proc_mpareply(struct siw_cep *cep) { struct siw_qp_attrs qp_attrs; enum siw_qp_attr_mask qp_attr_mask; struct siw_qp *qp = cep->qp; struct mpa_rr *rep; int rv; u16 rep_ord; u16 rep_ird; bool ird_insufficient = false; enum mpa_v2_ctrl mpa_p2p_mode = MPA_V2_RDMA_NO_RTR; rv = siw_recv_mpa_rr(cep); if (rv) goto out_err; siw_cancel_mpatimer(cep); rep = &cep->mpa.hdr; if (__mpa_rr_revision(rep->params.bits) > MPA_REVISION_2) { /* allow for 0, 1, and 2 only */ rv = -EPROTO; goto out_err; } if (memcmp(rep->key, MPA_KEY_REP, 16)) { siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, LLP_ETYPE_MPA, LLP_ECODE_INVALID_REQ_RESP, 0); siw_send_terminate(qp); rv = -EPROTO; goto out_err; } if (rep->params.bits & MPA_RR_FLAG_REJECT) { siw_dbg_cep(cep, "got mpa reject\n"); siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET); return -ECONNRESET; } if (try_gso && rep->params.bits & MPA_RR_FLAG_GSO_EXP) { siw_dbg_cep(cep, "peer allows GSO on TX\n"); qp->tx_ctx.gso_seg_limit = 0; } if ((rep->params.bits & MPA_RR_FLAG_MARKERS) || (mpa_crc_required && !(rep->params.bits & MPA_RR_FLAG_CRC)) || (mpa_crc_strict && !mpa_crc_required && (rep->params.bits & MPA_RR_FLAG_CRC))) { siw_dbg_cep(cep, "reply unsupp: crc %d:%d:%d, m %d:%d\n", rep->params.bits & MPA_RR_FLAG_CRC ? 1 : 0, mpa_crc_required, mpa_crc_strict, rep->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0); siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNREFUSED); return -EINVAL; } if (cep->enhanced_rdma_conn_est) { struct mpa_v2_data *v2; if (__mpa_rr_revision(rep->params.bits) < MPA_REVISION_2 || !(rep->params.bits & MPA_RR_FLAG_ENHANCED)) { /* * Protocol failure: The responder MUST reply with * MPA version 2 and MUST set MPA_RR_FLAG_ENHANCED. */ siw_dbg_cep(cep, "mpa reply error: vers %d, enhcd %d\n", __mpa_rr_revision(rep->params.bits), rep->params.bits & MPA_RR_FLAG_ENHANCED ? 1 : 0); siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET); return -EINVAL; } v2 = (struct mpa_v2_data *)cep->mpa.pdata; rep_ird = ntohs(v2->ird) & MPA_IRD_ORD_MASK; rep_ord = ntohs(v2->ord) & MPA_IRD_ORD_MASK; if (cep->ird < rep_ord && (relaxed_ird_negotiation == false || rep_ord > cep->sdev->attrs.max_ird)) { siw_dbg_cep(cep, "ird %d, rep_ord %d, max_ord %d\n", cep->ird, rep_ord, cep->sdev->attrs.max_ord); ird_insufficient = true; } if (cep->ord > rep_ird && relaxed_ird_negotiation == false) { siw_dbg_cep(cep, "ord %d, rep_ird %d\n", cep->ord, rep_ird); ird_insufficient = true; } /* * Always report negotiated peer values to user, * even if IRD/ORD negotiation failed */ cep->ird = rep_ord; cep->ord = rep_ird; if (ird_insufficient) { /* * If the initiator IRD is insuffient for the * responder ORD, send a TERM. */ siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, LLP_ETYPE_MPA, LLP_ECODE_INSUFFICIENT_IRD, 0); siw_send_terminate(qp); rv = -ENOMEM; goto out_err; } if (cep->mpa.v2_ctrl_req.ird & MPA_V2_PEER_TO_PEER) mpa_p2p_mode = cep->mpa.v2_ctrl_req.ord & (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR); /* * Check if we requested P2P mode, and if peer agrees */ if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) { if ((mpa_p2p_mode & v2->ord) == 0) { /* * We requested RTR mode(s), but the peer * did not pick any mode we support. */ siw_dbg_cep(cep, "rtr mode: req %2x, got %2x\n", mpa_p2p_mode, v2->ord & (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR)); siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, LLP_ETYPE_MPA, LLP_ECODE_NO_MATCHING_RTR, 0); siw_send_terminate(qp); rv = -EPROTO; goto out_err; } mpa_p2p_mode = v2->ord & (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR); } } memset(&qp_attrs, 0, sizeof(qp_attrs)); if (rep->params.bits & MPA_RR_FLAG_CRC) qp_attrs.flags = SIW_MPA_CRC; qp_attrs.irq_size = cep->ird; qp_attrs.orq_size = cep->ord; qp_attrs.sk = cep->sock; qp_attrs.state = SIW_QP_STATE_RTS; qp_attr_mask = SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE | SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD | SIW_QP_ATTR_MPA; /* Move socket RX/TX under QP control */ down_write(&qp->state_lock); if (qp->attrs.state > SIW_QP_STATE_RTR) { rv = -EINVAL; up_write(&qp->state_lock); goto out_err; } rv = siw_qp_modify(qp, &qp_attrs, qp_attr_mask); siw_qp_socket_assoc(cep, qp); up_write(&qp->state_lock); /* Send extra RDMA frame to trigger peer RTS if negotiated */ if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) { rv = siw_qp_mpa_rts(qp, mpa_p2p_mode); if (rv) goto out_err; } if (!rv) { rv = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, 0); if (!rv) cep->state = SIW_EPSTATE_RDMA_MODE; return 0; } out_err: if (rv != -EAGAIN) siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL); return rv; } /* * siw_accept_newconn - accept an incoming pending connection * */ static void siw_accept_newconn(struct siw_cep *cep) { struct socket *s = cep->sock; struct socket *new_s = NULL; struct siw_cep *new_cep = NULL; int rv = 0; /* debug only. should disappear */ if (cep->state != SIW_EPSTATE_LISTENING) goto error; new_cep = siw_cep_alloc(cep->sdev); if (!new_cep) goto error; /* * 4: Allocate a sufficient number of work elements * to allow concurrent handling of local + peer close * events, MPA header processing + MPA timeout. */ if (siw_cm_alloc_work(new_cep, 4) != 0) goto error; /* * Copy saved socket callbacks from listening CEP * and assign new socket with new CEP */ new_cep->sk_state_change = cep->sk_state_change; new_cep->sk_data_ready = cep->sk_data_ready; new_cep->sk_write_space = cep->sk_write_space; new_cep->sk_error_report = cep->sk_error_report; rv = kernel_accept(s, &new_s, O_NONBLOCK); if (rv != 0) { /* * Connection already aborted by peer..? */ siw_dbg_cep(cep, "kernel_accept() error: %d\n", rv); goto error; } new_cep->sock = new_s; siw_cep_get(new_cep); new_s->sk->sk_user_data = new_cep; if (siw_tcp_nagle == false) tcp_sock_set_nodelay(new_s->sk); new_cep->state = SIW_EPSTATE_AWAIT_MPAREQ; rv = siw_cm_queue_work(new_cep, SIW_CM_WORK_MPATIMEOUT); if (rv) goto error; /* * See siw_proc_mpareq() etc. for the use of new_cep->listen_cep. */ new_cep->listen_cep = cep; siw_cep_get(cep); if (atomic_read(&new_s->sk->sk_rmem_alloc)) { /* * MPA REQ already queued */ siw_dbg_cep(cep, "immediate mpa request\n"); siw_cep_set_inuse(new_cep); rv = siw_proc_mpareq(new_cep); if (rv != -EAGAIN) { siw_cep_put(cep); new_cep->listen_cep = NULL; if (rv) { siw_cancel_mpatimer(new_cep); siw_cep_set_free(new_cep); goto error; } } siw_cep_set_free(new_cep); } return; error: if (new_cep) siw_cep_put(new_cep); if (new_s) { siw_socket_disassoc(new_s); sock_release(new_s); new_cep->sock = NULL; } siw_dbg_cep(cep, "error %d\n", rv); } static void siw_cm_work_handler(struct work_struct *w) { struct siw_cm_work *work; struct siw_cep *cep; int release_cep = 0, rv = 0; work = container_of(w, struct siw_cm_work, work.work); cep = work->cep; siw_dbg_cep(cep, "[QP %u]: work type: %d, state %d\n", cep->qp ? qp_id(cep->qp) : UINT_MAX, work->type, cep->state); siw_cep_set_inuse(cep); switch (work->type) { case SIW_CM_WORK_ACCEPT: siw_accept_newconn(cep); break; case SIW_CM_WORK_READ_MPAHDR: if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) { if (cep->listen_cep) { siw_cep_set_inuse(cep->listen_cep); if (cep->listen_cep->state == SIW_EPSTATE_LISTENING) rv = siw_proc_mpareq(cep); else rv = -EFAULT; siw_cep_set_free(cep->listen_cep); if (rv != -EAGAIN) { siw_cep_put(cep->listen_cep); cep->listen_cep = NULL; if (rv) siw_cep_put(cep); } } } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) { rv = siw_proc_mpareply(cep); } else { /* * CEP already moved out of MPA handshake. * any connection management already done. * silently ignore the mpa packet. */ if (cep->state == SIW_EPSTATE_RDMA_MODE) { cep->sock->sk->sk_data_ready(cep->sock->sk); siw_dbg_cep(cep, "already in RDMA mode"); } else { siw_dbg_cep(cep, "out of state: %d\n", cep->state); } } if (rv && rv != -EAGAIN) release_cep = 1; break; case SIW_CM_WORK_CLOSE_LLP: /* * QP scheduled LLP close */ if (cep->qp) siw_send_terminate(cep->qp); if (cep->cm_id) siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); release_cep = 1; break; case SIW_CM_WORK_PEER_CLOSE: if (cep->cm_id) { if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) { /* * MPA reply not received, but connection drop */ siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET); } else if (cep->state == SIW_EPSTATE_RDMA_MODE) { /* * NOTE: IW_CM_EVENT_DISCONNECT is given just * to transition IWCM into CLOSING. */ siw_cm_upcall(cep, IW_CM_EVENT_DISCONNECT, 0); siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); } /* * for other states there is no connection * known to the IWCM. */ } else { if (cep->state == SIW_EPSTATE_RECVD_MPAREQ) { /* * Wait for the ulp/CM to call accept/reject */ siw_dbg_cep(cep, "mpa req recvd, wait for ULP\n"); } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) { /* * Socket close before MPA request received. */ if (cep->listen_cep) { siw_dbg_cep(cep, "no mpareq: drop listener\n"); siw_cep_put(cep->listen_cep); cep->listen_cep = NULL; } } } release_cep = 1; break; case SIW_CM_WORK_MPATIMEOUT: cep->mpa_timer = NULL; if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) { /* * MPA request timed out: * Hide any partially received private data and signal * timeout */ cep->mpa.hdr.params.pd_len = 0; if (cep->cm_id) siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ETIMEDOUT); release_cep = 1; } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) { /* * No MPA request received after peer TCP stream setup. */ if (cep->listen_cep) { siw_cep_put(cep->listen_cep); cep->listen_cep = NULL; } release_cep = 1; } break; default: WARN(1, "Undefined CM work type: %d\n", work->type); } if (release_cep) { siw_dbg_cep(cep, "release: timer=%s, QP[%u]\n", cep->mpa_timer ? "y" : "n", cep->qp ? qp_id(cep->qp) : UINT_MAX); siw_cancel_mpatimer(cep); cep->state = SIW_EPSTATE_CLOSED; if (cep->qp) { struct siw_qp *qp = cep->qp; /* * Serialize a potential race with application * closing the QP and calling siw_qp_cm_drop() */ siw_qp_get(qp); siw_cep_set_free(cep); siw_qp_llp_close(qp); siw_qp_put(qp); siw_cep_set_inuse(cep); cep->qp = NULL; siw_qp_put(qp); } if (cep->sock) { siw_socket_disassoc(cep->sock); sock_release(cep->sock); cep->sock = NULL; } if (cep->cm_id) { siw_free_cm_id(cep); siw_cep_put(cep); } } siw_cep_set_free(cep); siw_put_work(work); siw_cep_put(cep); } static struct workqueue_struct *siw_cm_wq; int siw_cm_queue_work(struct siw_cep *cep, enum siw_work_type type) { struct siw_cm_work *work = siw_get_work(cep); unsigned long delay = 0; if (!work) { siw_dbg_cep(cep, "failed with no work available\n"); return -ENOMEM; } work->type = type; work->cep = cep; siw_cep_get(cep); INIT_DELAYED_WORK(&work->work, siw_cm_work_handler); if (type == SIW_CM_WORK_MPATIMEOUT) { cep->mpa_timer = work; if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) delay = MPAREQ_TIMEOUT; else delay = MPAREP_TIMEOUT; } siw_dbg_cep(cep, "[QP %u]: work type: %d, timeout %lu\n", cep->qp ? qp_id(cep->qp) : -1, type, delay); queue_delayed_work(siw_cm_wq, &work->work, delay); return 0; } static void siw_cm_llp_data_ready(struct sock *sk) { struct siw_cep *cep; trace_sk_data_ready(sk); read_lock(&sk->sk_callback_lock); cep = sk_to_cep(sk); if (!cep) goto out; siw_dbg_cep(cep, "cep state: %d, socket state %d\n", cep->state, sk->sk_state); if (sk->sk_state != TCP_ESTABLISHED) goto out; switch (cep->state) { case SIW_EPSTATE_RDMA_MODE: case SIW_EPSTATE_LISTENING: break; case SIW_EPSTATE_AWAIT_MPAREQ: case SIW_EPSTATE_AWAIT_MPAREP: siw_cm_queue_work(cep, SIW_CM_WORK_READ_MPAHDR); break; default: siw_dbg_cep(cep, "unexpected data, state %d\n", cep->state); break; } out: read_unlock(&sk->sk_callback_lock); } static void siw_cm_llp_write_space(struct sock *sk) { struct siw_cep *cep = sk_to_cep(sk); if (cep) siw_dbg_cep(cep, "state: %d\n", cep->state); } static void siw_cm_llp_error_report(struct sock *sk) { struct siw_cep *cep = sk_to_cep(sk); if (cep) { siw_dbg_cep(cep, "error %d, socket state: %d, cep state: %d\n", sk->sk_err, sk->sk_state, cep->state); cep->sk_error_report(sk); } } static void siw_cm_llp_state_change(struct sock *sk) { struct siw_cep *cep; void (*orig_state_change)(struct sock *s); read_lock(&sk->sk_callback_lock); cep = sk_to_cep(sk); if (!cep) { /* endpoint already disassociated */ read_unlock(&sk->sk_callback_lock); return; } orig_state_change = cep->sk_state_change; siw_dbg_cep(cep, "state: %d\n", cep->state); switch (sk->sk_state) { case TCP_ESTABLISHED: /* * handle accepting socket as special case where only * new connection is possible */ siw_cm_queue_work(cep, SIW_CM_WORK_ACCEPT); break; case TCP_CLOSE: case TCP_CLOSE_WAIT: if (cep->qp) cep->qp->tx_ctx.tx_suspend = 1; siw_cm_queue_work(cep, SIW_CM_WORK_PEER_CLOSE); break; default: siw_dbg_cep(cep, "unexpected socket state %d\n", sk->sk_state); } read_unlock(&sk->sk_callback_lock); orig_state_change(sk); } static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr, struct sockaddr *raddr, bool afonly) { int rv, flags = 0; size_t size = laddr->sa_family == AF_INET ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); /* * Make address available again asap. */ sock_set_reuseaddr(s->sk); if (afonly) { rv = ip6_sock_set_v6only(s->sk); if (rv) return rv; } rv = s->ops->bind(s, (struct sockaddr_unsized *)laddr, size); if (rv < 0) return rv; rv = s->ops->connect(s, (struct sockaddr_unsized *)raddr, size, flags); return rv < 0 ? rv : 0; } int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params) { struct siw_device *sdev = to_siw_dev(id->device); struct siw_qp *qp; struct siw_cep *cep = NULL; struct socket *s = NULL; struct sockaddr *laddr = (struct sockaddr *)&id->local_addr, *raddr = (struct sockaddr *)&id->remote_addr; bool p2p_mode = peer_to_peer, v4 = true; u16 pd_len = params->private_data_len; int version = mpa_version, rv; if (pd_len > MPA_MAX_PRIVDATA) return -EINVAL; if (params->ird > sdev->attrs.max_ird || params->ord > sdev->attrs.max_ord) return -ENOMEM; if (laddr->sa_family == AF_INET6) v4 = false; else if (laddr->sa_family != AF_INET) return -EAFNOSUPPORT; /* * Respect any iwarp port mapping: Use mapped remote address * if valid. Local address must not be mapped, since siw * uses kernel TCP stack. */ if ((v4 && to_sockaddr_in(id->remote_addr).sin_port != 0) || to_sockaddr_in6(id->remote_addr).sin6_port != 0) raddr = (struct sockaddr *)&id->m_remote_addr; qp = siw_qp_id2obj(sdev, params->qpn); if (!qp) { WARN(1, "[QP %u] does not exist\n", params->qpn); rv = -EINVAL; goto error; } siw_dbg_qp(qp, "pd_len %d, laddr %pISp, raddr %pISp\n", pd_len, laddr, raddr); rv = sock_create(v4 ? AF_INET : AF_INET6, SOCK_STREAM, IPPROTO_TCP, &s); if (rv < 0) goto error; siw_reclassify_socket(s); /* * NOTE: For simplification, connect() is called in blocking * mode. Might be reconsidered for async connection setup at * TCP level. */ rv = kernel_bindconnect(s, laddr, raddr, id->afonly); if (rv != 0) { siw_dbg_qp(qp, "kernel_bindconnect: error %d\n", rv); goto error; } if (siw_tcp_nagle == false) tcp_sock_set_nodelay(s->sk); cep = siw_cep_alloc(sdev); if (!cep) { rv = -ENOMEM; goto error; } siw_cep_set_inuse(cep); /* Associate QP with CEP */ siw_cep_get(cep); qp->cep = cep; /* siw_qp_get(qp) already done by QP lookup */ cep->qp = qp; id->add_ref(id); cep->cm_id = id; /* * 4: Allocate a sufficient number of work elements * to allow concurrent handling of local + peer close * events, MPA header processing + MPA timeout. */ rv = siw_cm_alloc_work(cep, 4); if (rv != 0) { rv = -ENOMEM; goto error; } cep->ird = params->ird; cep->ord = params->ord; if (p2p_mode && cep->ord == 0) cep->ord = 1; cep->state = SIW_EPSTATE_CONNECTING; /* * Associate CEP with socket */ siw_cep_socket_assoc(cep, s); cep->state = SIW_EPSTATE_AWAIT_MPAREP; /* * Set MPA Request bits: CRC if required, no MPA Markers, * MPA Rev. according to module parameter 'mpa_version', Key 'Request'. */ cep->mpa.hdr.params.bits = 0; if (version > MPA_REVISION_2) { pr_warn("Setting MPA version to %u\n", MPA_REVISION_2); version = MPA_REVISION_2; /* Adjust also module parameter */ mpa_version = MPA_REVISION_2; } __mpa_rr_set_revision(&cep->mpa.hdr.params.bits, version); if (try_gso) cep->mpa.hdr.params.bits |= MPA_RR_FLAG_GSO_EXP; if (mpa_crc_required) cep->mpa.hdr.params.bits |= MPA_RR_FLAG_CRC; /* * If MPA version == 2: * o Include ORD and IRD. * o Indicate peer-to-peer mode, if required by module * parameter 'peer_to_peer'. */ if (version == MPA_REVISION_2) { cep->enhanced_rdma_conn_est = true; cep->mpa.hdr.params.bits |= MPA_RR_FLAG_ENHANCED; cep->mpa.v2_ctrl.ird = htons(cep->ird); cep->mpa.v2_ctrl.ord = htons(cep->ord); if (p2p_mode) { cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER; cep->mpa.v2_ctrl.ord |= rtr_type; } /* Remember own P2P mode requested */ cep->mpa.v2_ctrl_req.ird = cep->mpa.v2_ctrl.ird; cep->mpa.v2_ctrl_req.ord = cep->mpa.v2_ctrl.ord; } memcpy(cep->mpa.hdr.key, MPA_KEY_REQ, 16); rv = siw_send_mpareqrep(cep, params->private_data, pd_len); /* * Reset private data. */ cep->mpa.hdr.params.pd_len = 0; if (rv >= 0) { rv = siw_cm_queue_work(cep, SIW_CM_WORK_MPATIMEOUT); if (!rv) { siw_dbg_cep(cep, "[QP %u]: exit\n", qp_id(qp)); siw_cep_set_free(cep); return 0; } } error: siw_dbg(id->device, "failed: %d\n", rv); if (cep) { siw_socket_disassoc(s); sock_release(s); cep->sock = NULL; cep->qp = NULL; cep->cm_id = NULL; id->rem_ref(id); qp->cep = NULL; siw_cep_put(cep); cep->state = SIW_EPSTATE_CLOSED; siw_cep_set_free_and_put(cep); } else if (s) { sock_release(s); } if (qp) siw_qp_put(qp); return rv; } /* * siw_accept - Let SoftiWARP accept an RDMA connection request * * @id: New connection management id to be used for accepted * connection request * @params: Connection parameters provided by ULP for accepting connection * * Transition QP to RTS state, associate new CM id @id with accepted CEP * and get prepared for TCP input by installing socket callbacks. * Then send MPA Reply and generate the "connection established" event. * Socket callbacks must be installed before sending MPA Reply, because * the latter may cause a first RDMA message to arrive from the RDMA Initiator * side very quickly, at which time the socket callbacks must be ready. */ int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params) { struct siw_device *sdev = to_siw_dev(id->device); struct siw_cep *cep = (struct siw_cep *)id->provider_data; struct siw_qp *qp; struct siw_qp_attrs qp_attrs; int rv = -EINVAL, max_priv_data = MPA_MAX_PRIVDATA; bool wait_for_peer_rts = false; siw_cep_set_inuse(cep); siw_cep_put(cep); /* Free lingering inbound private data */ if (cep->mpa.hdr.params.pd_len) { cep->mpa.hdr.params.pd_len = 0; kfree(cep->mpa.pdata); cep->mpa.pdata = NULL; } siw_cancel_mpatimer(cep); if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) { siw_dbg_cep(cep, "out of state\n"); rv = -ECONNRESET; goto free_cep; } qp = siw_qp_id2obj(sdev, params->qpn); if (!qp) { WARN(1, "[QP %d] does not exist\n", params->qpn); goto free_cep; } down_write(&qp->state_lock); if (qp->attrs.state > SIW_QP_STATE_RTR) goto error_unlock; siw_dbg_cep(cep, "[QP %d]\n", params->qpn); if (try_gso && cep->mpa.hdr.params.bits & MPA_RR_FLAG_GSO_EXP) { siw_dbg_cep(cep, "peer allows GSO on TX\n"); qp->tx_ctx.gso_seg_limit = 0; } if (params->ord > sdev->attrs.max_ord || params->ird > sdev->attrs.max_ird) { siw_dbg_cep( cep, "[QP %u]: ord %d (max %d), ird %d (max %d)\n", qp_id(qp), params->ord, sdev->attrs.max_ord, params->ird, sdev->attrs.max_ird); goto error_unlock; } if (cep->enhanced_rdma_conn_est) max_priv_data -= sizeof(struct mpa_v2_data); if (params->private_data_len > max_priv_data) { siw_dbg_cep( cep, "[QP %u]: private data length: %d (max %d)\n", qp_id(qp), params->private_data_len, max_priv_data); goto error_unlock; } if (cep->enhanced_rdma_conn_est) { if (params->ord > cep->ord) { if (relaxed_ird_negotiation) { params->ord = cep->ord; } else { cep->ird = params->ird; cep->ord = params->ord; goto error_unlock; } } if (params->ird < cep->ird) { if (relaxed_ird_negotiation && cep->ird <= sdev->attrs.max_ird) params->ird = cep->ird; else { rv = -ENOMEM; goto error_unlock; } } if (cep->mpa.v2_ctrl.ord & (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR)) wait_for_peer_rts = true; /* * Signal back negotiated IRD and ORD values */ cep->mpa.v2_ctrl.ord = htons(params->ord & MPA_IRD_ORD_MASK) | (cep->mpa.v2_ctrl.ord & ~MPA_V2_MASK_IRD_ORD); cep->mpa.v2_ctrl.ird = htons(params->ird & MPA_IRD_ORD_MASK) | (cep->mpa.v2_ctrl.ird & ~MPA_V2_MASK_IRD_ORD); } cep->ird = params->ird; cep->ord = params->ord; cep->cm_id = id; id->add_ref(id); memset(&qp_attrs, 0, sizeof(qp_attrs)); qp_attrs.orq_size = cep->ord; qp_attrs.irq_size = cep->ird; qp_attrs.sk = cep->sock; if (cep->mpa.hdr.params.bits & MPA_RR_FLAG_CRC) qp_attrs.flags = SIW_MPA_CRC; qp_attrs.state = SIW_QP_STATE_RTS; siw_dbg_cep(cep, "[QP%u]: moving to rts\n", qp_id(qp)); /* Associate QP with CEP */ siw_cep_get(cep); qp->cep = cep; /* siw_qp_get(qp) already done by QP lookup */ cep->qp = qp; cep->state = SIW_EPSTATE_RDMA_MODE; /* Move socket RX/TX under QP control */ rv = siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE | SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD | SIW_QP_ATTR_MPA); up_write(&qp->state_lock); if (rv) goto error; siw_dbg_cep(cep, "[QP %u]: send mpa reply, %d byte pdata\n", qp_id(qp), params->private_data_len); rv = siw_send_mpareqrep(cep, params->private_data, params->private_data_len); if (rv != 0) goto error; if (wait_for_peer_rts) { siw_sk_assign_rtr_upcalls(cep); } else { siw_qp_socket_assoc(cep, qp); rv = siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0); if (rv) goto error; } siw_cep_set_free(cep); return 0; error_unlock: up_write(&qp->state_lock); error: siw_destroy_cep_sock(cep); cep->state = SIW_EPSTATE_CLOSED; siw_free_cm_id(cep); if (qp->cep) { siw_cep_put(cep); qp->cep = NULL; } cep->qp = NULL; siw_qp_put(qp); free_cep: siw_cep_set_free_and_put(cep); return rv; } /* * siw_reject() * * Local connection reject case. Send private data back to peer, * close connection and dereference connection id. */ int siw_reject(struct iw_cm_id *id, const void *pdata, u8 pd_len) { struct siw_cep *cep = (struct siw_cep *)id->provider_data; siw_cep_set_inuse(cep); siw_cep_put(cep); siw_cancel_mpatimer(cep); if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) { siw_dbg_cep(cep, "out of state\n"); siw_cep_set_free_and_put(cep); /* put last reference */ return -ECONNRESET; } siw_dbg_cep(cep, "cep->state %d, pd_len %d\n", cep->state, pd_len); if (__mpa_rr_revision(cep->mpa.hdr.params.bits) >= MPA_REVISION_1) { cep->mpa.hdr.params.bits |= MPA_RR_FLAG_REJECT; /* reject */ siw_send_mpareqrep(cep, pdata, pd_len); } siw_destroy_cep_sock(cep); cep->state = SIW_EPSTATE_CLOSED; siw_cep_set_free_and_put(cep); return 0; } /* * siw_create_listen - Create resources for a listener's IWCM ID @id * * Starts listen on the socket address id->local_addr. * */ int siw_create_listen(struct iw_cm_id *id, int backlog) { struct socket *s; struct siw_cep *cep = NULL; struct net_device *ndev = NULL; struct siw_device *sdev = to_siw_dev(id->device); int addr_family = id->local_addr.ss_family; int rv = 0; if (addr_family != AF_INET && addr_family != AF_INET6) return -EAFNOSUPPORT; rv = sock_create(addr_family, SOCK_STREAM, IPPROTO_TCP, &s); if (rv < 0) return rv; siw_reclassify_socket(s); /* * Allow binding local port when still in TIME_WAIT from last close. */ sock_set_reuseaddr(s->sk); if (addr_family == AF_INET) { struct sockaddr_in *laddr = &to_sockaddr_in(id->local_addr); /* For wildcard addr, limit binding to current device only */ if (ipv4_is_zeronet(laddr->sin_addr.s_addr)) { ndev = ib_device_get_netdev(id->device, SIW_PORT); if (ndev) { s->sk->sk_bound_dev_if = ndev->ifindex; } else { rv = -ENODEV; goto error; } } rv = s->ops->bind(s, (struct sockaddr_unsized *)laddr, sizeof(struct sockaddr_in)); } else { struct sockaddr_in6 *laddr = &to_sockaddr_in6(id->local_addr); if (id->afonly) { rv = ip6_sock_set_v6only(s->sk); if (rv) { siw_dbg(id->device, "ip6_sock_set_v6only erro: %d\n", rv); goto error; } } /* For wildcard addr, limit binding to current device only */ if (ipv6_addr_any(&laddr->sin6_addr)) { ndev = ib_device_get_netdev(id->device, SIW_PORT); if (ndev) { s->sk->sk_bound_dev_if = ndev->ifindex; } else { rv = -ENODEV; goto error; } } rv = s->ops->bind(s, (struct sockaddr_unsized *)laddr, sizeof(struct sockaddr_in6)); } if (rv) { siw_dbg(id->device, "socket bind error: %d\n", rv); goto error; } cep = siw_cep_alloc(sdev); if (!cep) { rv = -ENOMEM; goto error; } siw_cep_socket_assoc(cep, s); rv = siw_cm_alloc_work(cep, backlog); if (rv) { siw_dbg(id->device, "alloc_work error %d, backlog %d\n", rv, backlog); goto error; } rv = s->ops->listen(s, backlog); if (rv) { siw_dbg(id->device, "listen error %d\n", rv); goto error; } cep->cm_id = id; id->add_ref(id); /* * In case of a wildcard rdma_listen on a multi-homed device, * a listener's IWCM id is associated with more than one listening CEP. * * We currently use id->provider_data in three different ways: * * o For a listener's IWCM id, id->provider_data points to * the list_head of the list of listening CEPs. * Uses: siw_create_listen(), siw_destroy_listen() * * o For each accepted passive-side IWCM id, id->provider_data * points to the CEP itself. This is a consequence of * - siw_cm_upcall() setting event.provider_data = cep and * - the IWCM's cm_conn_req_handler() setting provider_data of the * new passive-side IWCM id equal to event.provider_data * Uses: siw_accept(), siw_reject() * * o For an active-side IWCM id, id->provider_data is not used at all. * */ if (!id->provider_data) { id->provider_data = kmalloc_obj(struct list_head); if (!id->provider_data) { rv = -ENOMEM; goto error; } INIT_LIST_HEAD((struct list_head *)id->provider_data); } list_add_tail(&cep->listenq, (struct list_head *)id->provider_data); cep->state = SIW_EPSTATE_LISTENING; dev_put(ndev); siw_dbg(id->device, "Listen at laddr %pISp\n", &id->local_addr); return 0; error: siw_dbg(id->device, "failed: %d\n", rv); if (cep) { siw_cep_set_inuse(cep); siw_free_cm_id(cep); cep->sock = NULL; siw_socket_disassoc(s); cep->state = SIW_EPSTATE_CLOSED; siw_cep_set_free_and_put(cep); } sock_release(s); dev_put(ndev); return rv; } static void siw_drop_listeners(struct iw_cm_id *id) { struct list_head *p, *tmp; /* * In case of a wildcard rdma_listen on a multi-homed device, * a listener's IWCM id is associated with more than one listening CEP. */ list_for_each_safe(p, tmp, (struct list_head *)id->provider_data) { struct siw_cep *cep = list_entry(p, struct siw_cep, listenq); list_del(p); siw_dbg_cep(cep, "drop cep, state %d\n", cep->state); siw_cep_set_inuse(cep); siw_free_cm_id(cep); if (cep->sock) { siw_socket_disassoc(cep->sock); sock_release(cep->sock); cep->sock = NULL; } cep->state = SIW_EPSTATE_CLOSED; siw_cep_set_free_and_put(cep); } } int siw_destroy_listen(struct iw_cm_id *id) { if (!id->provider_data) { siw_dbg(id->device, "no cep(s)\n"); return 0; } siw_drop_listeners(id); kfree(id->provider_data); id->provider_data = NULL; return 0; } int siw_cm_init(void) { /* * create_single_workqueue for strict ordering */ siw_cm_wq = create_singlethread_workqueue("siw_cm_wq"); if (!siw_cm_wq) return -ENOMEM; return 0; } void siw_cm_exit(void) { if (siw_cm_wq) destroy_workqueue(siw_cm_wq); }
1 21 47 2 71 1 53 26 1 8 4 3 10 79 79 68 11 71 36 4 31 26 5 21 9 1 1 12 88 5 1 103 6 58 56 114 114 99 15 106 97 17 4 13 21 24 41 41 4 4 8 7 7 1 1 3 3 3 33 2 2 6 10 4 9 28 28 17 11 28 124 1 2 4 1 98 18 66 91 2 2 1 4 1 81 50 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 // SPDX-License-Identifier: GPL-2.0-only /* * (C) 2012-2013 by Pablo Neira Ayuso <pablo@netfilter.org> * * This software has been sponsored by Sophos Astaro <http://www.sophos.com> */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nf_tables.h> #include <linux/netfilter/nf_tables_compat.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter_bridge/ebtables.h> #include <linux/netfilter_arp/arp_tables.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_log.h> /* Used for matches where *info is larger than X byte */ #define NFT_MATCH_LARGE_THRESH 192 struct nft_xt_match_priv { void *info; }; static int nft_compat_chain_validate_dependency(const struct nft_ctx *ctx, const char *tablename) { enum nft_chain_types type = NFT_CHAIN_T_DEFAULT; const struct nft_chain *chain = ctx->chain; const struct nft_base_chain *basechain; if (!tablename || !nft_is_base_chain(chain)) return 0; basechain = nft_base_chain(chain); if (strcmp(tablename, "nat") == 0) { if (ctx->family != NFPROTO_BRIDGE) type = NFT_CHAIN_T_NAT; if (basechain->type->type != type) return -EINVAL; } return 0; } union nft_entry { struct ipt_entry e4; struct ip6t_entry e6; struct ebt_entry ebt; struct arpt_entry arp; }; static inline void nft_compat_set_par(struct xt_action_param *par, const struct nft_pktinfo *pkt, const void *xt, const void *xt_info) { par->state = pkt->state; par->thoff = nft_thoff(pkt); par->fragoff = pkt->fragoff; par->target = xt; par->targinfo = xt_info; par->hotdrop = false; } static void nft_target_eval_xt(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { void *info = nft_expr_priv(expr); struct xt_target *target = expr->ops->data; struct sk_buff *skb = pkt->skb; struct xt_action_param xt; int ret; nft_compat_set_par(&xt, pkt, target, info); ret = target->target(skb, &xt); if (xt.hotdrop) ret = NF_DROP; switch (ret) { case XT_CONTINUE: regs->verdict.code = NFT_CONTINUE; break; default: regs->verdict.code = ret; break; } } static void nft_target_eval_bridge(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { void *info = nft_expr_priv(expr); struct xt_target *target = expr->ops->data; struct sk_buff *skb = pkt->skb; struct xt_action_param xt; int ret; nft_compat_set_par(&xt, pkt, target, info); ret = target->target(skb, &xt); if (xt.hotdrop) ret = NF_DROP; switch (ret) { case EBT_ACCEPT: regs->verdict.code = NF_ACCEPT; break; case EBT_DROP: regs->verdict.code = NF_DROP; break; case EBT_CONTINUE: regs->verdict.code = NFT_CONTINUE; break; case EBT_RETURN: regs->verdict.code = NFT_RETURN; break; default: regs->verdict.code = ret; break; } } static const struct nla_policy nft_target_policy[NFTA_TARGET_MAX + 1] = { [NFTA_TARGET_NAME] = { .type = NLA_NUL_STRING, .len = XT_EXTENSION_MAXNAMELEN, }, [NFTA_TARGET_REV] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_TARGET_INFO] = { .type = NLA_BINARY }, }; static void nft_target_set_tgchk_param(struct xt_tgchk_param *par, const struct nft_ctx *ctx, struct xt_target *target, void *info, union nft_entry *entry, u16 proto, bool inv) { par->net = ctx->net; par->table = ctx->table->name; switch (ctx->family) { case AF_INET: entry->e4.ip.proto = proto; entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0; break; case AF_INET6: if (proto) entry->e6.ipv6.flags |= IP6T_F_PROTO; entry->e6.ipv6.proto = proto; entry->e6.ipv6.invflags = inv ? IP6T_INV_PROTO : 0; break; case NFPROTO_BRIDGE: entry->ebt.ethproto = (__force __be16)proto; entry->ebt.invflags = inv ? EBT_IPROTO : 0; break; case NFPROTO_ARP: break; } par->entryinfo = entry; par->target = target; par->targinfo = info; if (nft_is_base_chain(ctx->chain)) { const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); const struct nf_hook_ops *ops = &basechain->ops; par->hook_mask = 1 << ops->hooknum; } else { par->hook_mask = 0; } par->family = ctx->family; par->nft_compat = true; } static void target_compat_from_user(struct xt_target *t, void *in, void *out) { int pad; memcpy(out, in, t->targetsize); pad = XT_ALIGN(t->targetsize) - t->targetsize; if (pad > 0) memset(out + t->targetsize, 0, pad); } static const struct nla_policy nft_rule_compat_policy[NFTA_RULE_COMPAT_MAX + 1] = { [NFTA_RULE_COMPAT_PROTO] = { .type = NLA_U32 }, [NFTA_RULE_COMPAT_FLAGS] = { .type = NLA_U32 }, }; static int nft_parse_compat(const struct nlattr *attr, u16 *proto, bool *inv) { struct nlattr *tb[NFTA_RULE_COMPAT_MAX+1]; u32 l4proto; u32 flags; int err; err = nla_parse_nested_deprecated(tb, NFTA_RULE_COMPAT_MAX, attr, nft_rule_compat_policy, NULL); if (err < 0) return err; if (!tb[NFTA_RULE_COMPAT_PROTO] || !tb[NFTA_RULE_COMPAT_FLAGS]) return -EINVAL; flags = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_FLAGS])); if (flags & NFT_RULE_COMPAT_F_UNUSED || flags & ~NFT_RULE_COMPAT_F_MASK) return -EINVAL; if (flags & NFT_RULE_COMPAT_F_INV) *inv = true; l4proto = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_PROTO])); if (l4proto > U16_MAX) return -EINVAL; *proto = l4proto; return 0; } static void nft_compat_wait_for_destructors(struct net *net) { /* xtables matches or targets can have side effects, e.g. * creation/destruction of /proc files. * The xt ->destroy functions are run asynchronously from * work queue. If we have pending invocations we thus * need to wait for those to finish. */ nf_tables_trans_destroy_flush_work(net); } static int nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { void *info = nft_expr_priv(expr); struct xt_target *target = expr->ops->data; struct xt_tgchk_param par; size_t size = XT_ALIGN(nla_len(tb[NFTA_TARGET_INFO])); u16 proto = 0; bool inv = false; union nft_entry e = {}; int ret; target_compat_from_user(target, nla_data(tb[NFTA_TARGET_INFO]), info); if (ctx->nla[NFTA_RULE_COMPAT]) { ret = nft_parse_compat(ctx->nla[NFTA_RULE_COMPAT], &proto, &inv); if (ret < 0) return ret; } nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv); nft_compat_wait_for_destructors(ctx->net); ret = xt_check_target(&par, size, proto, inv); if (ret < 0) { if (ret == -ENOENT) { const char *modname = NULL; if (strcmp(target->name, "LOG") == 0) modname = "nf_log_syslog"; else if (strcmp(target->name, "NFLOG") == 0) modname = "nfnetlink_log"; if (modname && nft_request_module(ctx->net, "%s", modname) == -EAGAIN) return -EAGAIN; } return ret; } /* The standard target cannot be used */ if (!target->target) return -EINVAL; return 0; } static void __nft_mt_tg_destroy(struct module *me, const struct nft_expr *expr) { module_put(me); kfree(expr->ops); } static void nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct xt_target *target = expr->ops->data; void *info = nft_expr_priv(expr); struct module *me = target->me; struct xt_tgdtor_param par; par.net = ctx->net; par.target = target; par.targinfo = info; par.family = ctx->family; if (par.target->destroy != NULL) par.target->destroy(&par); __nft_mt_tg_destroy(me, expr); } static int nft_extension_dump_info(struct sk_buff *skb, int attr, const void *info, unsigned int size, unsigned int user_size) { unsigned int info_size, aligned_size = XT_ALIGN(size); struct nlattr *nla; nla = nla_reserve(skb, attr, aligned_size); if (!nla) return -1; info_size = user_size ? : size; memcpy(nla_data(nla), info, info_size); memset(nla_data(nla) + info_size, 0, aligned_size - info_size); return 0; } static int nft_target_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct xt_target *target = expr->ops->data; void *info = nft_expr_priv(expr); if (nla_put_string(skb, NFTA_TARGET_NAME, target->name) || nla_put_be32(skb, NFTA_TARGET_REV, htonl(target->revision)) || nft_extension_dump_info(skb, NFTA_TARGET_INFO, info, target->targetsize, target->usersize)) goto nla_put_failure; return 0; nla_put_failure: return -1; } static int nft_target_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct xt_target *target = expr->ops->data; unsigned int hook_mask = 0; int ret; if (ctx->family != NFPROTO_IPV4 && ctx->family != NFPROTO_IPV6 && ctx->family != NFPROTO_INET && ctx->family != NFPROTO_BRIDGE && ctx->family != NFPROTO_ARP) return -EOPNOTSUPP; ret = nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING)); if (ret) return ret; if (nft_is_base_chain(ctx->chain)) { const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); const struct nf_hook_ops *ops = &basechain->ops; hook_mask = 1 << ops->hooknum; if (target->hooks && !(hook_mask & target->hooks)) return -EINVAL; ret = nft_compat_chain_validate_dependency(ctx, target->table); if (ret < 0) return ret; } return 0; } static void __nft_match_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt, void *info) { struct xt_match *match = expr->ops->data; struct sk_buff *skb = pkt->skb; struct xt_action_param xt; bool ret; nft_compat_set_par(&xt, pkt, match, info); ret = match->match(skb, &xt); if (xt.hotdrop) { regs->verdict.code = NF_DROP; return; } switch (ret ? 1 : 0) { case 1: regs->verdict.code = NFT_CONTINUE; break; case 0: regs->verdict.code = NFT_BREAK; break; } } static void nft_match_large_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_xt_match_priv *priv = nft_expr_priv(expr); __nft_match_eval(expr, regs, pkt, priv->info); } static void nft_match_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { __nft_match_eval(expr, regs, pkt, nft_expr_priv(expr)); } static const struct nla_policy nft_match_policy[NFTA_MATCH_MAX + 1] = { [NFTA_MATCH_NAME] = { .type = NLA_NUL_STRING, .len = XT_EXTENSION_MAXNAMELEN }, [NFTA_MATCH_REV] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_MATCH_INFO] = { .type = NLA_BINARY }, }; /* struct xt_mtchk_param and xt_tgchk_param look very similar */ static void nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx, struct xt_match *match, void *info, union nft_entry *entry, u16 proto, bool inv) { par->net = ctx->net; par->table = ctx->table->name; switch (ctx->family) { case AF_INET: entry->e4.ip.proto = proto; entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0; break; case AF_INET6: if (proto) entry->e6.ipv6.flags |= IP6T_F_PROTO; entry->e6.ipv6.proto = proto; entry->e6.ipv6.invflags = inv ? IP6T_INV_PROTO : 0; break; case NFPROTO_BRIDGE: entry->ebt.ethproto = (__force __be16)proto; entry->ebt.invflags = inv ? EBT_IPROTO : 0; break; case NFPROTO_ARP: break; } par->entryinfo = entry; par->match = match; par->matchinfo = info; if (nft_is_base_chain(ctx->chain)) { const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); const struct nf_hook_ops *ops = &basechain->ops; par->hook_mask = 1 << ops->hooknum; } else { par->hook_mask = 0; } par->family = ctx->family; par->nft_compat = true; } static void match_compat_from_user(struct xt_match *m, void *in, void *out) { int pad; memcpy(out, in, m->matchsize); pad = XT_ALIGN(m->matchsize) - m->matchsize; if (pad > 0) memset(out + m->matchsize, 0, pad); } static int __nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[], void *info) { struct xt_match *match = expr->ops->data; struct xt_mtchk_param par; size_t size = XT_ALIGN(nla_len(tb[NFTA_MATCH_INFO])); u16 proto = 0; bool inv = false; union nft_entry e = {}; int ret; match_compat_from_user(match, nla_data(tb[NFTA_MATCH_INFO]), info); if (ctx->nla[NFTA_RULE_COMPAT]) { ret = nft_parse_compat(ctx->nla[NFTA_RULE_COMPAT], &proto, &inv); if (ret < 0) return ret; } nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv); nft_compat_wait_for_destructors(ctx->net); return xt_check_match(&par, size, proto, inv); } static int nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { return __nft_match_init(ctx, expr, tb, nft_expr_priv(expr)); } static int nft_match_large_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_xt_match_priv *priv = nft_expr_priv(expr); struct xt_match *m = expr->ops->data; int ret; priv->info = kmalloc(XT_ALIGN(m->matchsize), GFP_KERNEL_ACCOUNT); if (!priv->info) return -ENOMEM; ret = __nft_match_init(ctx, expr, tb, priv->info); if (ret) kfree(priv->info); return ret; } static void __nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr, void *info) { struct xt_match *match = expr->ops->data; struct module *me = match->me; struct xt_mtdtor_param par; par.net = ctx->net; par.match = match; par.matchinfo = info; par.family = ctx->family; if (par.match->destroy != NULL) par.match->destroy(&par); __nft_mt_tg_destroy(me, expr); } static void nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { __nft_match_destroy(ctx, expr, nft_expr_priv(expr)); } static void nft_match_large_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_xt_match_priv *priv = nft_expr_priv(expr); __nft_match_destroy(ctx, expr, priv->info); kfree(priv->info); } static int __nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr, void *info) { struct xt_match *match = expr->ops->data; if (nla_put_string(skb, NFTA_MATCH_NAME, match->name) || nla_put_be32(skb, NFTA_MATCH_REV, htonl(match->revision)) || nft_extension_dump_info(skb, NFTA_MATCH_INFO, info, match->matchsize, match->usersize)) goto nla_put_failure; return 0; nla_put_failure: return -1; } static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { return __nft_match_dump(skb, expr, nft_expr_priv(expr)); } static int nft_match_large_dump(struct sk_buff *skb, const struct nft_expr *e, bool reset) { struct nft_xt_match_priv *priv = nft_expr_priv(e); return __nft_match_dump(skb, e, priv->info); } static int nft_match_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct xt_match *match = expr->ops->data; unsigned int hook_mask = 0; int ret; if (ctx->family != NFPROTO_IPV4 && ctx->family != NFPROTO_IPV6 && ctx->family != NFPROTO_INET && ctx->family != NFPROTO_BRIDGE && ctx->family != NFPROTO_ARP) return -EOPNOTSUPP; ret = nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING)); if (ret) return ret; if (nft_is_base_chain(ctx->chain)) { const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); const struct nf_hook_ops *ops = &basechain->ops; hook_mask = 1 << ops->hooknum; if (match->hooks && !(hook_mask & match->hooks)) return -EINVAL; ret = nft_compat_chain_validate_dependency(ctx, match->table); if (ret < 0) return ret; } return 0; } static int nfnl_compat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, int event, u16 family, const char *name, int rev, int target) { struct nlmsghdr *nlh; unsigned int flags = portid ? NLM_F_MULTI : 0; event = nfnl_msg_type(NFNL_SUBSYS_NFT_COMPAT, event); nlh = nfnl_msg_put(skb, portid, seq, event, flags, family, NFNETLINK_V0, 0); if (!nlh) goto nlmsg_failure; if (nla_put_string(skb, NFTA_COMPAT_NAME, name) || nla_put_be32(skb, NFTA_COMPAT_REV, htonl(rev)) || nla_put_be32(skb, NFTA_COMPAT_TYPE, htonl(target))) goto nla_put_failure; nlmsg_end(skb, nlh); return skb->len; nlmsg_failure: nla_put_failure: nlmsg_cancel(skb, nlh); return -1; } static int nfnl_compat_get_rcu(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const tb[]) { u8 family = info->nfmsg->nfgen_family; const char *name, *fmt; struct sk_buff *skb2; int ret = 0, target; u32 rev; if (tb[NFTA_COMPAT_NAME] == NULL || tb[NFTA_COMPAT_REV] == NULL || tb[NFTA_COMPAT_TYPE] == NULL) return -EINVAL; name = nla_data(tb[NFTA_COMPAT_NAME]); rev = ntohl(nla_get_be32(tb[NFTA_COMPAT_REV])); /* x_tables api checks for 'target == 1' to mean target, * everything else means 'match'. * In x_tables world, the number is set by kernel, not * userspace. */ target = nla_get_be32(tb[NFTA_COMPAT_TYPE]) == htonl(1); switch(family) { case AF_INET: fmt = "ipt_%s"; break; case AF_INET6: fmt = "ip6t_%s"; break; case NFPROTO_BRIDGE: fmt = "ebt_%s"; break; case NFPROTO_ARP: fmt = "arpt_%s"; break; default: pr_err("nft_compat: unsupported protocol %d\n", family); return -EINVAL; } if (!try_module_get(THIS_MODULE)) return -EINVAL; rcu_read_unlock(); try_then_request_module(xt_find_revision(family, name, rev, target, &ret), fmt, name); if (ret < 0) goto out_put; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (skb2 == NULL) { ret = -ENOMEM; goto out_put; } /* include the best revision for this extension in the message */ if (nfnl_compat_fill_info(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFNL_MSG_TYPE(info->nlh->nlmsg_type), NFNL_MSG_COMPAT_GET, family, name, ret, target) <= 0) { kfree_skb(skb2); goto out_put; } ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); out_put: rcu_read_lock(); module_put(THIS_MODULE); return ret; } static const struct nla_policy nfnl_compat_policy_get[NFTA_COMPAT_MAX+1] = { [NFTA_COMPAT_NAME] = { .type = NLA_NUL_STRING, .len = NFT_COMPAT_NAME_MAX-1 }, [NFTA_COMPAT_REV] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_COMPAT_TYPE] = { .type = NLA_U32 }, }; static const struct nfnl_callback nfnl_nft_compat_cb[NFNL_MSG_COMPAT_MAX] = { [NFNL_MSG_COMPAT_GET] = { .call = nfnl_compat_get_rcu, .type = NFNL_CB_RCU, .attr_count = NFTA_COMPAT_MAX, .policy = nfnl_compat_policy_get }, }; static const struct nfnetlink_subsystem nfnl_compat_subsys = { .name = "nft-compat", .subsys_id = NFNL_SUBSYS_NFT_COMPAT, .cb_count = NFNL_MSG_COMPAT_MAX, .cb = nfnl_nft_compat_cb, }; static struct nft_expr_type nft_match_type; static bool nft_match_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { const struct xt_match *match = expr->ops->data; return strcmp(match->name, "comment") == 0; } static const struct nft_expr_ops * nft_match_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_expr_ops *ops; struct xt_match *match; unsigned int matchsize; char *mt_name; u32 rev, family; int err; if (tb[NFTA_MATCH_NAME] == NULL || tb[NFTA_MATCH_REV] == NULL || tb[NFTA_MATCH_INFO] == NULL) return ERR_PTR(-EINVAL); mt_name = nla_data(tb[NFTA_MATCH_NAME]); rev = ntohl(nla_get_be32(tb[NFTA_MATCH_REV])); family = ctx->family; match = xt_request_find_match(family, mt_name, rev); if (IS_ERR(match)) return ERR_PTR(-ENOENT); if (match->matchsize > nla_len(tb[NFTA_MATCH_INFO])) { err = -EINVAL; goto err; } ops = kzalloc_obj(struct nft_expr_ops, GFP_KERNEL_ACCOUNT); if (!ops) { err = -ENOMEM; goto err; } ops->type = &nft_match_type; ops->eval = nft_match_eval; ops->init = nft_match_init; ops->destroy = nft_match_destroy; ops->dump = nft_match_dump; ops->validate = nft_match_validate; ops->data = match; ops->reduce = nft_match_reduce; matchsize = NFT_EXPR_SIZE(XT_ALIGN(match->matchsize)); if (matchsize > NFT_MATCH_LARGE_THRESH) { matchsize = NFT_EXPR_SIZE(sizeof(struct nft_xt_match_priv)); ops->eval = nft_match_large_eval; ops->init = nft_match_large_init; ops->destroy = nft_match_large_destroy; ops->dump = nft_match_large_dump; } ops->size = matchsize; return ops; err: module_put(match->me); return ERR_PTR(err); } static void nft_match_release_ops(const struct nft_expr_ops *ops) { struct xt_match *match = ops->data; module_put(match->me); kfree(ops); } static struct nft_expr_type nft_match_type __read_mostly = { .name = "match", .select_ops = nft_match_select_ops, .release_ops = nft_match_release_ops, .policy = nft_match_policy, .maxattr = NFTA_MATCH_MAX, .owner = THIS_MODULE, }; static struct nft_expr_type nft_target_type; static const struct nft_expr_ops * nft_target_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_expr_ops *ops; struct xt_target *target; char *tg_name; u32 rev, family; int err; if (tb[NFTA_TARGET_NAME] == NULL || tb[NFTA_TARGET_REV] == NULL || tb[NFTA_TARGET_INFO] == NULL) return ERR_PTR(-EINVAL); tg_name = nla_data(tb[NFTA_TARGET_NAME]); rev = ntohl(nla_get_be32(tb[NFTA_TARGET_REV])); family = ctx->family; if (strcmp(tg_name, XT_ERROR_TARGET) == 0 || strcmp(tg_name, XT_STANDARD_TARGET) == 0 || strcmp(tg_name, "standard") == 0) return ERR_PTR(-EINVAL); target = xt_request_find_target(family, tg_name, rev); if (IS_ERR(target)) return ERR_PTR(-ENOENT); if (!target->target) { err = -EINVAL; goto err; } if (target->targetsize > nla_len(tb[NFTA_TARGET_INFO])) { err = -EINVAL; goto err; } ops = kzalloc_obj(struct nft_expr_ops, GFP_KERNEL_ACCOUNT); if (!ops) { err = -ENOMEM; goto err; } ops->type = &nft_target_type; ops->size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize)); ops->init = nft_target_init; ops->destroy = nft_target_destroy; ops->dump = nft_target_dump; ops->validate = nft_target_validate; ops->data = target; ops->reduce = NFT_REDUCE_READONLY; if (family == NFPROTO_BRIDGE) ops->eval = nft_target_eval_bridge; else ops->eval = nft_target_eval_xt; return ops; err: module_put(target->me); return ERR_PTR(err); } static void nft_target_release_ops(const struct nft_expr_ops *ops) { struct xt_target *target = ops->data; module_put(target->me); kfree(ops); } static struct nft_expr_type nft_target_type __read_mostly = { .name = "target", .select_ops = nft_target_select_ops, .release_ops = nft_target_release_ops, .policy = nft_target_policy, .maxattr = NFTA_TARGET_MAX, .owner = THIS_MODULE, }; static int __init nft_compat_module_init(void) { int ret; ret = nft_register_expr(&nft_match_type); if (ret < 0) return ret; ret = nft_register_expr(&nft_target_type); if (ret < 0) goto err_match; ret = nfnetlink_subsys_register(&nfnl_compat_subsys); if (ret < 0) { pr_err("nft_compat: cannot register with nfnetlink.\n"); goto err_target; } return ret; err_target: nft_unregister_expr(&nft_target_type); err_match: nft_unregister_expr(&nft_match_type); return ret; } static void __exit nft_compat_module_exit(void) { nfnetlink_subsys_unregister(&nfnl_compat_subsys); nft_unregister_expr(&nft_target_type); nft_unregister_expr(&nft_match_type); } MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFT_COMPAT); module_init(nft_compat_module_init); module_exit(nft_compat_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_ALIAS_NFT_EXPR("match"); MODULE_ALIAS_NFT_EXPR("target"); MODULE_DESCRIPTION("x_tables over nftables support");
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 /* SPDX-License-Identifier: GPL-2.0 */ /* * Declarations of X.25 Packet Layer type objects. * * History * nov/17/96 Jonathan Naylor Initial version. * mar/20/00 Daniela Squassoni Disabling/enabling of facilities * negotiation. */ #ifndef _X25_H #define _X25_H #include <linux/x25.h> #include <linux/slab.h> #include <linux/refcount.h> #include <net/sock.h> #define X25_ADDR_LEN 16 #define X25_MAX_L2_LEN 18 /* 802.2 LLC */ #define X25_STD_MIN_LEN 3 #define X25_EXT_MIN_LEN 4 #define X25_GFI_SEQ_MASK 0x30 #define X25_GFI_STDSEQ 0x10 #define X25_GFI_EXTSEQ 0x20 #define X25_Q_BIT 0x80 #define X25_D_BIT 0x40 #define X25_STD_M_BIT 0x10 #define X25_EXT_M_BIT 0x01 #define X25_CALL_REQUEST 0x0B #define X25_CALL_ACCEPTED 0x0F #define X25_CLEAR_REQUEST 0x13 #define X25_CLEAR_CONFIRMATION 0x17 #define X25_DATA 0x00 #define X25_INTERRUPT 0x23 #define X25_INTERRUPT_CONFIRMATION 0x27 #define X25_RR 0x01 #define X25_RNR 0x05 #define X25_REJ 0x09 #define X25_RESET_REQUEST 0x1B #define X25_RESET_CONFIRMATION 0x1F #define X25_REGISTRATION_REQUEST 0xF3 #define X25_REGISTRATION_CONFIRMATION 0xF7 #define X25_RESTART_REQUEST 0xFB #define X25_RESTART_CONFIRMATION 0xFF #define X25_DIAGNOSTIC 0xF1 #define X25_ILLEGAL 0xFD /* Define the various conditions that may exist */ #define X25_COND_ACK_PENDING 0x01 #define X25_COND_OWN_RX_BUSY 0x02 #define X25_COND_PEER_RX_BUSY 0x04 /* Define Link State constants. */ enum { X25_STATE_0, /* Ready */ X25_STATE_1, /* Awaiting Call Accepted */ X25_STATE_2, /* Awaiting Clear Confirmation */ X25_STATE_3, /* Data Transfer */ X25_STATE_4, /* Awaiting Reset Confirmation */ X25_STATE_5 /* Call Accepted / Call Connected pending */ }; enum { X25_LINK_STATE_0, X25_LINK_STATE_1, X25_LINK_STATE_2, X25_LINK_STATE_3 }; #define X25_DEFAULT_T20 (180 * HZ) /* Default T20 value */ #define X25_DEFAULT_T21 (200 * HZ) /* Default T21 value */ #define X25_DEFAULT_T22 (180 * HZ) /* Default T22 value */ #define X25_DEFAULT_T23 (180 * HZ) /* Default T23 value */ #define X25_DEFAULT_T2 (3 * HZ) /* Default ack holdback value */ #define X25_DEFAULT_WINDOW_SIZE 2 /* Default Window Size */ #define X25_DEFAULT_PACKET_SIZE X25_PS128 /* Default Packet Size */ #define X25_DEFAULT_THROUGHPUT 0x0A /* Default Throughput */ #define X25_DEFAULT_REVERSE 0x00 /* Default Reverse Charging */ #define X25_SMODULUS 8 #define X25_EMODULUS 128 /* * X.25 Facilities constants. */ #define X25_FAC_CLASS_MASK 0xC0 #define X25_FAC_CLASS_A 0x00 #define X25_FAC_CLASS_B 0x40 #define X25_FAC_CLASS_C 0x80 #define X25_FAC_CLASS_D 0xC0 #define X25_FAC_REVERSE 0x01 /* also fast select */ #define X25_FAC_THROUGHPUT 0x02 #define X25_FAC_PACKET_SIZE 0x42 #define X25_FAC_WINDOW_SIZE 0x43 #define X25_MAX_FAC_LEN 60 #define X25_MAX_CUD_LEN 128 #define X25_FAC_CALLING_AE 0xCB #define X25_FAC_CALLED_AE 0xC9 #define X25_MARKER 0x00 #define X25_DTE_SERVICES 0x0F #define X25_MAX_AE_LEN 40 /* Max num of semi-octets in AE - OSI Nw */ #define X25_MAX_DTE_FACIL_LEN 21 /* Max length of DTE facility params */ /* Bitset in x25_sock->flags for misc flags */ #define X25_Q_BIT_FLAG 0 #define X25_INTERRUPT_FLAG 1 #define X25_ACCPT_APPRV_FLAG 2 /** * struct x25_route - x25 routing entry * @node - entry in x25_list_lock * @address - Start of address range * @sigdigits - Number of sig digits * @dev - More than one for MLP * @refcnt - reference counter */ struct x25_route { struct list_head node; struct x25_address address; unsigned int sigdigits; struct net_device *dev; refcount_t refcnt; }; struct x25_neigh { struct list_head node; struct net_device *dev; unsigned int state; unsigned int extended; struct sk_buff_head queue; unsigned long t20; struct timer_list t20timer; unsigned long global_facil_mask; refcount_t refcnt; }; struct x25_sock { struct sock sk; struct x25_address source_addr, dest_addr; struct x25_neigh *neighbour; unsigned int lci, cudmatchlength; unsigned char state, condition; unsigned short vs, vr, va, vl; unsigned long t2, t21, t22, t23; unsigned short fraglen; unsigned long flags; struct sk_buff_head ack_queue; struct sk_buff_head fragment_queue; struct sk_buff_head interrupt_in_queue; struct sk_buff_head interrupt_out_queue; struct timer_list timer; struct x25_causediag causediag; struct x25_facilities facilities; struct x25_dte_facilities dte_facilities; struct x25_calluserdata calluserdata; unsigned long vc_facil_mask; /* inc_call facilities mask */ }; struct x25_forward { struct list_head node; unsigned int lci; struct net_device *dev1; struct net_device *dev2; atomic_t refcnt; }; #define x25_sk(ptr) container_of_const(ptr, struct x25_sock, sk) /* af_x25.c */ extern int sysctl_x25_restart_request_timeout; extern int sysctl_x25_call_request_timeout; extern int sysctl_x25_reset_request_timeout; extern int sysctl_x25_clear_request_timeout; extern int sysctl_x25_ack_holdback_timeout; extern int sysctl_x25_forward; int x25_parse_address_block(struct sk_buff *skb, struct x25_address *called_addr, struct x25_address *calling_addr); int x25_addr_ntoa(unsigned char *, struct x25_address *, struct x25_address *); int x25_addr_aton(unsigned char *, struct x25_address *, struct x25_address *); struct sock *x25_find_socket(unsigned int, struct x25_neigh *); void x25_destroy_socket_from_timer(struct sock *); int x25_rx_call_request(struct sk_buff *, struct x25_neigh *, unsigned int); void x25_kill_by_neigh(struct x25_neigh *); /* x25_dev.c */ void x25_send_frame(struct sk_buff *, struct x25_neigh *); int x25_lapb_receive_frame(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); void x25_establish_link(struct x25_neigh *); /* x25_facilities.c */ int x25_parse_facilities(struct sk_buff *, struct x25_facilities *, struct x25_dte_facilities *, unsigned long *); int x25_create_facilities(unsigned char *, struct x25_facilities *, struct x25_dte_facilities *, unsigned long); int x25_negotiate_facilities(struct sk_buff *, struct sock *, struct x25_facilities *, struct x25_dte_facilities *); void x25_limit_facilities(struct x25_facilities *, struct x25_neigh *); /* x25_forward.c */ void x25_clear_forward_by_lci(unsigned int lci); void x25_clear_forward_by_dev(struct net_device *); int x25_forward_data(int, struct x25_neigh *, struct sk_buff *); int x25_forward_call(struct x25_address *, struct x25_neigh *, struct sk_buff *, int); /* x25_in.c */ int x25_process_rx_frame(struct sock *, struct sk_buff *); int x25_backlog_rcv(struct sock *, struct sk_buff *); /* x25_link.c */ void x25_link_control(struct sk_buff *, struct x25_neigh *, unsigned short); void x25_link_device_up(struct net_device *); void x25_link_device_down(struct net_device *); void x25_link_established(struct x25_neigh *); void x25_link_terminated(struct x25_neigh *); void x25_transmit_clear_request(struct x25_neigh *, unsigned int, unsigned char); void x25_transmit_link(struct sk_buff *, struct x25_neigh *); int x25_subscr_ioctl(unsigned int, void __user *); struct x25_neigh *x25_get_neigh(struct net_device *); void x25_link_free(void); /* x25_neigh.c */ static __inline__ void x25_neigh_hold(struct x25_neigh *nb) { refcount_inc(&nb->refcnt); } static __inline__ void x25_neigh_put(struct x25_neigh *nb) { if (refcount_dec_and_test(&nb->refcnt)) kfree(nb); } /* x25_out.c */ int x25_output(struct sock *, struct sk_buff *); void x25_kick(struct sock *); void x25_enquiry_response(struct sock *); /* x25_route.c */ struct x25_route *x25_get_route(struct x25_address *addr); struct net_device *x25_dev_get(char *); void x25_route_device_down(struct net_device *dev); int x25_route_ioctl(unsigned int, void __user *); void x25_route_free(void); static __inline__ void x25_route_hold(struct x25_route *rt) { refcount_inc(&rt->refcnt); } static __inline__ void x25_route_put(struct x25_route *rt) { if (refcount_dec_and_test(&rt->refcnt)) kfree(rt); } /* x25_subr.c */ void x25_clear_queues(struct sock *); void x25_frames_acked(struct sock *, unsigned short); void x25_requeue_frames(struct sock *); int x25_validate_nr(struct sock *, unsigned short); void x25_write_internal(struct sock *, int); int x25_decode(struct sock *, struct sk_buff *, int *, int *, int *, int *, int *); void x25_disconnect(struct sock *, int, unsigned char, unsigned char); /* x25_timer.c */ void x25_init_timers(struct sock *sk); void x25_start_heartbeat(struct sock *); void x25_start_t2timer(struct sock *); void x25_start_t21timer(struct sock *); void x25_start_t22timer(struct sock *); void x25_start_t23timer(struct sock *); void x25_stop_heartbeat(struct sock *); void x25_stop_timer(struct sock *); unsigned long x25_display_timer(struct sock *); void x25_check_rbuf(struct sock *); /* sysctl_net_x25.c */ #ifdef CONFIG_SYSCTL int x25_register_sysctl(void); void x25_unregister_sysctl(void); #else static inline int x25_register_sysctl(void) { return 0; }; static inline void x25_unregister_sysctl(void) {}; #endif /* CONFIG_SYSCTL */ struct x25_skb_cb { unsigned int flags; }; #define X25_SKB_CB(s) ((struct x25_skb_cb *) ((s)->cb)) extern struct hlist_head x25_list; extern rwlock_t x25_list_lock; extern struct list_head x25_route_list; extern rwlock_t x25_route_list_lock; extern struct list_head x25_forward_list; extern rwlock_t x25_forward_list_lock; extern struct list_head x25_neigh_list; extern rwlock_t x25_neigh_list_lock; int x25_proc_init(void); void x25_proc_exit(void); #endif
4 6 6 6 6 1 1 4 1 1 2 5 1 2 2 5 5 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2010-2011 EIA Electronics, // Pieter Beyens <pieter.beyens@eia.be> // Copyright (c) 2010-2011 EIA Electronics, // Kurt Van Dijck <kurt.van.dijck@eia.be> // Copyright (c) 2018 Protonic, // Robin van der Gracht <robin@protonic.nl> // Copyright (c) 2017-2019 Pengutronix, // Marc Kleine-Budde <kernel@pengutronix.de> // Copyright (c) 2017-2019 Pengutronix, // Oleksij Rempel <kernel@pengutronix.de> #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/can/can-ml.h> #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/errqueue.h> #include <linux/if_arp.h> #include <net/can.h> #include "j1939-priv.h" #define J1939_MIN_NAMELEN CAN_REQUIRED_SIZE(struct sockaddr_can, can_addr.j1939) /* conversion function between struct sock::sk_priority from linux and * j1939 priority field */ static inline priority_t j1939_prio(u32 sk_priority) { sk_priority = min(sk_priority, 7U); return 7 - sk_priority; } static inline u32 j1939_to_sk_priority(priority_t prio) { return 7 - prio; } /* function to see if pgn is to be evaluated */ static inline bool j1939_pgn_is_valid(pgn_t pgn) { return pgn <= J1939_PGN_MAX; } /* test function to avoid non-zero DA placeholder for pdu1 pgn's */ static inline bool j1939_pgn_is_clean_pdu(pgn_t pgn) { if (j1939_pgn_is_pdu1(pgn)) return !(pgn & 0xff); else return true; } static inline void j1939_sock_pending_add(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); atomic_inc(&jsk->skb_pending); } static int j1939_sock_pending_get(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); return atomic_read(&jsk->skb_pending); } void j1939_sock_pending_del(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); /* atomic_dec_return returns the new value */ if (!atomic_dec_return(&jsk->skb_pending)) wake_up(&jsk->waitq); /* no pending SKB's */ } static void j1939_jsk_add(struct j1939_priv *priv, struct j1939_sock *jsk) { jsk->state |= J1939_SOCK_BOUND; j1939_priv_get(priv); write_lock_bh(&priv->j1939_socks_lock); list_add_tail(&jsk->list, &priv->j1939_socks); write_unlock_bh(&priv->j1939_socks_lock); } static void j1939_jsk_del(struct j1939_priv *priv, struct j1939_sock *jsk) { write_lock_bh(&priv->j1939_socks_lock); list_del_init(&jsk->list); write_unlock_bh(&priv->j1939_socks_lock); j1939_priv_put(priv); jsk->state &= ~J1939_SOCK_BOUND; } static bool j1939_sk_queue_session(struct j1939_session *session) { struct j1939_sock *jsk = j1939_sk(session->sk); bool empty; spin_lock_bh(&jsk->sk_session_queue_lock); empty = list_empty(&jsk->sk_session_queue); j1939_session_get(session); list_add_tail(&session->sk_session_queue_entry, &jsk->sk_session_queue); spin_unlock_bh(&jsk->sk_session_queue_lock); j1939_sock_pending_add(&jsk->sk); return empty; } static struct j1939_session *j1939_sk_get_incomplete_session(struct j1939_sock *jsk) { struct j1939_session *session = NULL; spin_lock_bh(&jsk->sk_session_queue_lock); if (!list_empty(&jsk->sk_session_queue)) { session = list_last_entry(&jsk->sk_session_queue, struct j1939_session, sk_session_queue_entry); if (session->total_queued_size == session->total_message_size) session = NULL; else j1939_session_get(session); } spin_unlock_bh(&jsk->sk_session_queue_lock); return session; } static void j1939_sk_queue_drop_all(struct j1939_priv *priv, struct j1939_sock *jsk, int err) { struct j1939_session *session, *tmp; netdev_dbg(priv->ndev, "%s: err: %i\n", __func__, err); spin_lock_bh(&jsk->sk_session_queue_lock); list_for_each_entry_safe(session, tmp, &jsk->sk_session_queue, sk_session_queue_entry) { list_del_init(&session->sk_session_queue_entry); session->err = err; j1939_session_put(session); } spin_unlock_bh(&jsk->sk_session_queue_lock); } static void j1939_sk_queue_activate_next_locked(struct j1939_session *session) { struct j1939_sock *jsk; struct j1939_session *first; int err; /* RX-Session don't have a socket (yet) */ if (!session->sk) return; jsk = j1939_sk(session->sk); lockdep_assert_held(&jsk->sk_session_queue_lock); err = session->err; first = list_first_entry_or_null(&jsk->sk_session_queue, struct j1939_session, sk_session_queue_entry); /* Some else has already activated the next session */ if (first != session) return; activate_next: list_del_init(&first->sk_session_queue_entry); j1939_session_put(first); first = list_first_entry_or_null(&jsk->sk_session_queue, struct j1939_session, sk_session_queue_entry); if (!first) return; if (j1939_session_activate(first)) { netdev_warn_once(first->priv->ndev, "%s: 0x%p: Identical session is already activated.\n", __func__, first); first->err = -EBUSY; goto activate_next; } else { /* Give receiver some time (arbitrary chosen) to recover */ int time_ms = 0; if (err) time_ms = 10 + get_random_u32_below(16); j1939_tp_schedule_txtimer(first, time_ms); } } void j1939_sk_queue_activate_next(struct j1939_session *session) { struct j1939_sock *jsk; if (!session->sk) return; jsk = j1939_sk(session->sk); spin_lock_bh(&jsk->sk_session_queue_lock); j1939_sk_queue_activate_next_locked(session); spin_unlock_bh(&jsk->sk_session_queue_lock); } static bool j1939_sk_match_dst(struct j1939_sock *jsk, const struct j1939_sk_buff_cb *skcb) { if ((jsk->state & J1939_SOCK_PROMISC)) return true; /* Destination address filter */ if (jsk->addr.src_name && skcb->addr.dst_name) { if (jsk->addr.src_name != skcb->addr.dst_name) return false; } else { /* receive (all sockets) if * - all packages that match our bind() address * - all broadcast on a socket if SO_BROADCAST * is set */ if (j1939_address_is_unicast(skcb->addr.da)) { if (jsk->addr.sa != skcb->addr.da) return false; } else if (!sock_flag(&jsk->sk, SOCK_BROADCAST)) { /* receiving broadcast without SO_BROADCAST * flag is not allowed */ return false; } } /* Source address filter */ if (jsk->state & J1939_SOCK_CONNECTED) { /* receive (all sockets) if * - all packages that match our connect() name or address */ if (jsk->addr.dst_name && skcb->addr.src_name) { if (jsk->addr.dst_name != skcb->addr.src_name) return false; } else { if (jsk->addr.da != skcb->addr.sa) return false; } } /* PGN filter */ if (j1939_pgn_is_valid(jsk->pgn_rx_filter) && jsk->pgn_rx_filter != skcb->addr.pgn) return false; return true; } /* matches skb control buffer (addr) with a j1939 filter */ static bool j1939_sk_match_filter(struct j1939_sock *jsk, const struct j1939_sk_buff_cb *skcb) { const struct j1939_filter *f; int nfilter; spin_lock_bh(&jsk->filters_lock); f = jsk->filters; nfilter = jsk->nfilters; if (!nfilter) /* receive all when no filters are assigned */ goto filter_match_found; for (; nfilter; ++f, --nfilter) { if ((skcb->addr.pgn & f->pgn_mask) != f->pgn) continue; if ((skcb->addr.sa & f->addr_mask) != f->addr) continue; if ((skcb->addr.src_name & f->name_mask) != f->name) continue; goto filter_match_found; } spin_unlock_bh(&jsk->filters_lock); return false; filter_match_found: spin_unlock_bh(&jsk->filters_lock); return true; } static bool j1939_sk_recv_match_one(struct j1939_sock *jsk, const struct j1939_sk_buff_cb *skcb) { if (!(jsk->state & J1939_SOCK_BOUND)) return false; if (!j1939_sk_match_dst(jsk, skcb)) return false; if (!j1939_sk_match_filter(jsk, skcb)) return false; return true; } static void j1939_sk_recv_one(struct j1939_sock *jsk, struct sk_buff *oskb) { const struct j1939_sk_buff_cb *oskcb = j1939_skb_to_cb(oskb); struct j1939_sk_buff_cb *skcb; enum skb_drop_reason reason; struct sk_buff *skb; if (oskb->sk == &jsk->sk) return; if (!j1939_sk_recv_match_one(jsk, oskcb)) return; skb = skb_clone(oskb, GFP_ATOMIC); if (!skb) { pr_warn("skb clone failed\n"); return; } can_skb_set_owner(skb, oskb->sk); skcb = j1939_skb_to_cb(skb); skcb->msg_flags &= ~(MSG_DONTROUTE); if (skb->sk) skcb->msg_flags |= MSG_DONTROUTE; if (sock_queue_rcv_skb_reason(&jsk->sk, skb, &reason) < 0) sk_skb_reason_drop(&jsk->sk, skb, reason); } bool j1939_sk_recv_match(struct j1939_priv *priv, struct j1939_sk_buff_cb *skcb) { struct j1939_sock *jsk; bool match = false; read_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { match = j1939_sk_recv_match_one(jsk, skcb); if (match) break; } read_unlock_bh(&priv->j1939_socks_lock); return match; } void j1939_sk_recv(struct j1939_priv *priv, struct sk_buff *skb) { struct j1939_sock *jsk; read_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { j1939_sk_recv_one(jsk, skb); } read_unlock_bh(&priv->j1939_socks_lock); } static void j1939_sk_sock_destruct(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); /* This function will be called by the generic networking code, when * the socket is ultimately closed (sk->sk_destruct). * * The race between * - processing a received CAN frame * (can_receive -> j1939_can_recv) * and accessing j1939_priv * ... and ... * - closing a socket * (j1939_can_rx_unregister -> can_rx_unregister) * and calling the final j1939_priv_put() * * is avoided by calling the final j1939_priv_put() from this * RCU deferred cleanup call. */ if (jsk->priv) { j1939_priv_put(jsk->priv); jsk->priv = NULL; } /* call generic CAN sock destruct */ can_sock_destruct(sk); } static int j1939_sk_init(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); /* Ensure that "sk" is first member in "struct j1939_sock", so that we * can skip it during memset(). */ BUILD_BUG_ON(offsetof(struct j1939_sock, sk) != 0); memset((void *)jsk + sizeof(jsk->sk), 0x0, sizeof(*jsk) - sizeof(jsk->sk)); INIT_LIST_HEAD(&jsk->list); init_waitqueue_head(&jsk->waitq); jsk->sk.sk_priority = j1939_to_sk_priority(6); jsk->sk.sk_reuse = 1; /* per default */ jsk->addr.sa = J1939_NO_ADDR; jsk->addr.da = J1939_NO_ADDR; jsk->addr.pgn = J1939_NO_PGN; jsk->pgn_rx_filter = J1939_NO_PGN; atomic_set(&jsk->skb_pending, 0); spin_lock_init(&jsk->sk_session_queue_lock); INIT_LIST_HEAD(&jsk->sk_session_queue); spin_lock_init(&jsk->filters_lock); /* j1939_sk_sock_destruct() depends on SOCK_RCU_FREE flag */ sock_set_flag(sk, SOCK_RCU_FREE); sk->sk_destruct = j1939_sk_sock_destruct; sk->sk_protocol = CAN_J1939; return 0; } static int j1939_sk_sanity_check(struct sockaddr_can *addr, int len) { if (!addr) return -EDESTADDRREQ; if (len < J1939_MIN_NAMELEN) return -EINVAL; if (addr->can_family != AF_CAN) return -EINVAL; if (!addr->can_ifindex) return -ENODEV; if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn) && !j1939_pgn_is_clean_pdu(addr->can_addr.j1939.pgn)) return -EINVAL; return 0; } static int j1939_sk_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int len) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct j1939_sock *jsk = j1939_sk(sock->sk); struct j1939_priv *priv; struct sock *sk; struct net *net; int ret = 0; ret = j1939_sk_sanity_check(addr, len); if (ret) return ret; lock_sock(sock->sk); priv = jsk->priv; sk = sock->sk; net = sock_net(sk); /* Already bound to an interface? */ if (jsk->state & J1939_SOCK_BOUND) { /* A re-bind() to a different interface is not * supported. */ if (jsk->ifindex != addr->can_ifindex) { ret = -EINVAL; goto out_release_sock; } /* drop old references */ j1939_jsk_del(priv, jsk); j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa); } else { struct can_ml_priv *can_ml; struct net_device *ndev; ndev = dev_get_by_index(net, addr->can_ifindex); if (!ndev) { ret = -ENODEV; goto out_release_sock; } if (ndev->reg_state != NETREG_REGISTERED) { dev_put(ndev); ret = -ENODEV; goto out_release_sock; } can_ml = can_get_ml_priv(ndev); if (!can_ml) { dev_put(ndev); ret = -ENODEV; goto out_release_sock; } if (!(ndev->flags & IFF_UP)) { dev_put(ndev); ret = -ENETDOWN; goto out_release_sock; } priv = j1939_netdev_start(ndev); dev_put(ndev); if (IS_ERR(priv)) { ret = PTR_ERR(priv); goto out_release_sock; } jsk->ifindex = addr->can_ifindex; /* the corresponding j1939_priv_put() is called via * sk->sk_destruct, which points to j1939_sk_sock_destruct() */ j1939_priv_get(priv); jsk->priv = priv; } /* set default transmit pgn */ if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn)) jsk->pgn_rx_filter = addr->can_addr.j1939.pgn; jsk->addr.src_name = addr->can_addr.j1939.name; jsk->addr.sa = addr->can_addr.j1939.addr; /* get new references */ ret = j1939_local_ecu_get(priv, jsk->addr.src_name, jsk->addr.sa); if (ret) { j1939_netdev_stop(priv); jsk->priv = NULL; synchronize_rcu(); j1939_priv_put(priv); goto out_release_sock; } j1939_jsk_add(priv, jsk); out_release_sock: /* fall through */ release_sock(sock->sk); return ret; } static int j1939_sk_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int len, int flags) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct j1939_sock *jsk = j1939_sk(sock->sk); int ret = 0; ret = j1939_sk_sanity_check(addr, len); if (ret) return ret; lock_sock(sock->sk); /* bind() before connect() is mandatory */ if (!(jsk->state & J1939_SOCK_BOUND)) { ret = -EINVAL; goto out_release_sock; } /* A connect() to a different interface is not supported. */ if (jsk->ifindex != addr->can_ifindex) { ret = -EINVAL; goto out_release_sock; } if (!addr->can_addr.j1939.name && addr->can_addr.j1939.addr == J1939_NO_ADDR && !sock_flag(&jsk->sk, SOCK_BROADCAST)) { /* broadcast, but SO_BROADCAST not set */ ret = -EACCES; goto out_release_sock; } jsk->addr.dst_name = addr->can_addr.j1939.name; jsk->addr.da = addr->can_addr.j1939.addr; if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn)) jsk->addr.pgn = addr->can_addr.j1939.pgn; jsk->state |= J1939_SOCK_CONNECTED; out_release_sock: /* fall through */ release_sock(sock->sk); return ret; } static void j1939_sk_sock2sockaddr_can(struct sockaddr_can *addr, const struct j1939_sock *jsk, int peer) { /* There are two holes (2 bytes and 3 bytes) to clear to avoid * leaking kernel information to user space. */ memset(addr, 0, J1939_MIN_NAMELEN); addr->can_family = AF_CAN; addr->can_ifindex = jsk->ifindex; addr->can_addr.j1939.pgn = jsk->addr.pgn; if (peer) { addr->can_addr.j1939.name = jsk->addr.dst_name; addr->can_addr.j1939.addr = jsk->addr.da; } else { addr->can_addr.j1939.name = jsk->addr.src_name; addr->can_addr.j1939.addr = jsk->addr.sa; } } static int j1939_sk_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); int ret = 0; lock_sock(sk); if (peer && !(jsk->state & J1939_SOCK_CONNECTED)) { ret = -EADDRNOTAVAIL; goto failure; } j1939_sk_sock2sockaddr_can(addr, jsk, peer); ret = J1939_MIN_NAMELEN; failure: release_sock(sk); return ret; } static int j1939_sk_release(struct socket *sock) { struct sock *sk = sock->sk; struct j1939_sock *jsk; if (!sk) return 0; lock_sock(sk); jsk = j1939_sk(sk); if (jsk->state & J1939_SOCK_BOUND) { struct j1939_priv *priv = jsk->priv; if (wait_event_interruptible(jsk->waitq, !j1939_sock_pending_get(&jsk->sk))) { j1939_cancel_active_session(priv, sk); j1939_sk_queue_drop_all(priv, jsk, ESHUTDOWN); } j1939_jsk_del(priv, jsk); j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa); j1939_netdev_stop(priv); } kfree(jsk->filters); sock_orphan(sk); sock->sk = NULL; release_sock(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); sock_put(sk); return 0; } static int j1939_sk_setsockopt_flag(struct j1939_sock *jsk, sockptr_t optval, unsigned int optlen, int flag) { int tmp; if (optlen != sizeof(tmp)) return -EINVAL; if (copy_from_sockptr(&tmp, optval, optlen)) return -EFAULT; lock_sock(&jsk->sk); if (tmp) jsk->state |= flag; else jsk->state &= ~flag; release_sock(&jsk->sk); return tmp; } static int j1939_sk_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); int tmp, count = 0, ret = 0; struct j1939_filter *filters = NULL, *ofilters; if (level != SOL_CAN_J1939) return -EINVAL; switch (optname) { case SO_J1939_FILTER: if (!sockptr_is_null(optval) && optlen != 0) { struct j1939_filter *f; int c; if (optlen % sizeof(*filters) != 0) return -EINVAL; if (optlen > J1939_FILTER_MAX * sizeof(struct j1939_filter)) return -EINVAL; count = optlen / sizeof(*filters); filters = memdup_sockptr(optval, optlen); if (IS_ERR(filters)) return PTR_ERR(filters); for (f = filters, c = count; c; f++, c--) { f->name &= f->name_mask; f->pgn &= f->pgn_mask; f->addr &= f->addr_mask; } } lock_sock(&jsk->sk); spin_lock_bh(&jsk->filters_lock); ofilters = jsk->filters; jsk->filters = filters; jsk->nfilters = count; spin_unlock_bh(&jsk->filters_lock); release_sock(&jsk->sk); kfree(ofilters); return 0; case SO_J1939_PROMISC: return j1939_sk_setsockopt_flag(jsk, optval, optlen, J1939_SOCK_PROMISC); case SO_J1939_ERRQUEUE: ret = j1939_sk_setsockopt_flag(jsk, optval, optlen, J1939_SOCK_ERRQUEUE); if (ret < 0) return ret; if (!(jsk->state & J1939_SOCK_ERRQUEUE)) skb_queue_purge(&sk->sk_error_queue); return ret; case SO_J1939_SEND_PRIO: if (optlen != sizeof(tmp)) return -EINVAL; if (copy_from_sockptr(&tmp, optval, optlen)) return -EFAULT; if (tmp < 0 || tmp > 7) return -EDOM; if (tmp < 2 && !capable(CAP_NET_ADMIN)) return -EPERM; lock_sock(&jsk->sk); jsk->sk.sk_priority = j1939_to_sk_priority(tmp); release_sock(&jsk->sk); return 0; default: return -ENOPROTOOPT; } } static int j1939_sk_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); int ret, ulen; /* set defaults for using 'int' properties */ int tmp = 0; int len = sizeof(tmp); void *val = &tmp; if (level != SOL_CAN_J1939) return -EINVAL; if (get_user(ulen, optlen)) return -EFAULT; if (ulen < 0) return -EINVAL; lock_sock(&jsk->sk); switch (optname) { case SO_J1939_PROMISC: tmp = (jsk->state & J1939_SOCK_PROMISC) ? 1 : 0; break; case SO_J1939_ERRQUEUE: tmp = (jsk->state & J1939_SOCK_ERRQUEUE) ? 1 : 0; break; case SO_J1939_SEND_PRIO: tmp = j1939_prio(jsk->sk.sk_priority); break; default: ret = -ENOPROTOOPT; goto no_copy; } /* copy to user, based on 'len' & 'val' * but most sockopt's are 'int' properties, and have 'len' & 'val' * left unchanged, but instead modified 'tmp' */ if (len > ulen) ret = -EFAULT; else if (put_user(len, optlen)) ret = -EFAULT; else if (copy_to_user(optval, val, len)) ret = -EFAULT; else ret = 0; no_copy: release_sock(&jsk->sk); return ret; } static int j1939_sk_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; struct j1939_sk_buff_cb *skcb; int ret = 0; if (flags & ~(MSG_DONTWAIT | MSG_ERRQUEUE | MSG_CMSG_COMPAT)) return -EINVAL; if (flags & MSG_ERRQUEUE) return sock_recv_errqueue(sock->sk, msg, size, SOL_CAN_J1939, SCM_J1939_ERRQUEUE); skb = skb_recv_datagram(sk, flags, &ret); if (!skb) return ret; if (size < skb->len) msg->msg_flags |= MSG_TRUNC; else size = skb->len; ret = memcpy_to_msg(msg, skb->data, size); if (ret < 0) { skb_free_datagram(sk, skb); return ret; } skcb = j1939_skb_to_cb(skb); if (j1939_address_is_valid(skcb->addr.da)) put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_DEST_ADDR, sizeof(skcb->addr.da), &skcb->addr.da); if (skcb->addr.dst_name) put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_DEST_NAME, sizeof(skcb->addr.dst_name), &skcb->addr.dst_name); put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_PRIO, sizeof(skcb->priority), &skcb->priority); if (msg->msg_name) { struct sockaddr_can *paddr = msg->msg_name; msg->msg_namelen = J1939_MIN_NAMELEN; memset(msg->msg_name, 0, msg->msg_namelen); paddr->can_family = AF_CAN; paddr->can_ifindex = skb->skb_iif; paddr->can_addr.j1939.name = skcb->addr.src_name; paddr->can_addr.j1939.addr = skcb->addr.sa; paddr->can_addr.j1939.pgn = skcb->addr.pgn; } sock_recv_cmsgs(msg, sk, skb); msg->msg_flags |= skcb->msg_flags; skb_free_datagram(sk, skb); return size; } static struct sk_buff *j1939_sk_alloc_skb(struct net_device *ndev, struct sock *sk, struct msghdr *msg, size_t size, int *errcode) { struct j1939_sock *jsk = j1939_sk(sk); struct j1939_sk_buff_cb *skcb; struct sk_buff *skb; struct can_skb_ext *csx; int ret; skb = sock_alloc_send_skb(sk, size + sizeof(struct can_frame) - sizeof(((struct can_frame *)NULL)->data), msg->msg_flags & MSG_DONTWAIT, &ret); if (!skb) goto failure; csx = can_skb_ext_add(skb); if (!csx) { kfree_skb(skb); ret = -ENOMEM; goto failure; } csx->can_iif = ndev->ifindex; skb_reserve(skb, offsetof(struct can_frame, data)); ret = memcpy_from_msg(skb_put(skb, size), msg, size); if (ret < 0) goto free_skb; skb->dev = ndev; skcb = j1939_skb_to_cb(skb); memset(skcb, 0, sizeof(*skcb)); skcb->addr = jsk->addr; skcb->priority = j1939_prio(READ_ONCE(sk->sk_priority)); if (msg->msg_name) { struct sockaddr_can *addr = msg->msg_name; if (addr->can_addr.j1939.name || addr->can_addr.j1939.addr != J1939_NO_ADDR) { skcb->addr.dst_name = addr->can_addr.j1939.name; skcb->addr.da = addr->can_addr.j1939.addr; } if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn)) skcb->addr.pgn = addr->can_addr.j1939.pgn; } *errcode = ret; return skb; free_skb: kfree_skb(skb); failure: *errcode = ret; return NULL; } static size_t j1939_sk_opt_stats_get_size(enum j1939_sk_errqueue_type type) { switch (type) { case J1939_ERRQUEUE_RX_RTS: return nla_total_size(sizeof(u32)) + /* J1939_NLA_TOTAL_SIZE */ nla_total_size(sizeof(u32)) + /* J1939_NLA_PGN */ nla_total_size(sizeof(u64)) + /* J1939_NLA_SRC_NAME */ nla_total_size(sizeof(u64)) + /* J1939_NLA_DEST_NAME */ nla_total_size(sizeof(u8)) + /* J1939_NLA_SRC_ADDR */ nla_total_size(sizeof(u8)) + /* J1939_NLA_DEST_ADDR */ 0; default: return nla_total_size(sizeof(u32)) + /* J1939_NLA_BYTES_ACKED */ 0; } } static struct sk_buff * j1939_sk_get_timestamping_opt_stats(struct j1939_session *session, enum j1939_sk_errqueue_type type) { struct sk_buff *stats; u32 size; stats = alloc_skb(j1939_sk_opt_stats_get_size(type), GFP_ATOMIC); if (!stats) return NULL; if (session->skcb.addr.type == J1939_SIMPLE) size = session->total_message_size; else size = min(session->pkt.tx_acked * 7, session->total_message_size); switch (type) { case J1939_ERRQUEUE_RX_RTS: nla_put_u32(stats, J1939_NLA_TOTAL_SIZE, session->total_message_size); nla_put_u32(stats, J1939_NLA_PGN, session->skcb.addr.pgn); nla_put_u64_64bit(stats, J1939_NLA_SRC_NAME, session->skcb.addr.src_name, J1939_NLA_PAD); nla_put_u64_64bit(stats, J1939_NLA_DEST_NAME, session->skcb.addr.dst_name, J1939_NLA_PAD); nla_put_u8(stats, J1939_NLA_SRC_ADDR, session->skcb.addr.sa); nla_put_u8(stats, J1939_NLA_DEST_ADDR, session->skcb.addr.da); break; default: nla_put_u32(stats, J1939_NLA_BYTES_ACKED, size); } return stats; } static void __j1939_sk_errqueue(struct j1939_session *session, struct sock *sk, enum j1939_sk_errqueue_type type) { struct j1939_priv *priv = session->priv; struct j1939_sock *jsk; struct sock_exterr_skb *serr; struct sk_buff *skb; char *state = "UNK"; u32 tsflags; int err; jsk = j1939_sk(sk); if (!(jsk->state & J1939_SOCK_ERRQUEUE)) return; tsflags = READ_ONCE(sk->sk_tsflags); switch (type) { case J1939_ERRQUEUE_TX_ACK: if (!(tsflags & SOF_TIMESTAMPING_TX_ACK)) return; break; case J1939_ERRQUEUE_TX_SCHED: if (!(tsflags & SOF_TIMESTAMPING_TX_SCHED)) return; break; case J1939_ERRQUEUE_TX_ABORT: break; case J1939_ERRQUEUE_RX_RTS: fallthrough; case J1939_ERRQUEUE_RX_DPO: fallthrough; case J1939_ERRQUEUE_RX_ABORT: if (!(tsflags & SOF_TIMESTAMPING_RX_SOFTWARE)) return; break; default: netdev_err(priv->ndev, "Unknown errqueue type %i\n", type); } skb = j1939_sk_get_timestamping_opt_stats(session, type); if (!skb) return; skb->tstamp = ktime_get_real(); BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb)); serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); switch (type) { case J1939_ERRQUEUE_TX_ACK: serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = SCM_TSTAMP_ACK; state = "TX ACK"; break; case J1939_ERRQUEUE_TX_SCHED: serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = SCM_TSTAMP_SCHED; state = "TX SCH"; break; case J1939_ERRQUEUE_TX_ABORT: serr->ee.ee_errno = session->err; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_info = J1939_EE_INFO_TX_ABORT; state = "TX ABT"; break; case J1939_ERRQUEUE_RX_RTS: serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_info = J1939_EE_INFO_RX_RTS; state = "RX RTS"; break; case J1939_ERRQUEUE_RX_DPO: serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_info = J1939_EE_INFO_RX_DPO; state = "RX DPO"; break; case J1939_ERRQUEUE_RX_ABORT: serr->ee.ee_errno = session->err; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_info = J1939_EE_INFO_RX_ABORT; state = "RX ABT"; break; } serr->opt_stats = true; if (tsflags & SOF_TIMESTAMPING_OPT_ID) serr->ee.ee_data = session->tskey; netdev_dbg(session->priv->ndev, "%s: 0x%p tskey: %i, state: %s\n", __func__, session, session->tskey, state); err = sock_queue_err_skb(sk, skb); if (err) kfree_skb(skb); }; void j1939_sk_errqueue(struct j1939_session *session, enum j1939_sk_errqueue_type type) { struct j1939_priv *priv = session->priv; struct j1939_sock *jsk; if (session->sk) { /* send TX notifications to the socket of origin */ __j1939_sk_errqueue(session, session->sk, type); return; } /* spread RX notifications to all sockets subscribed to this session */ read_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { if (j1939_sk_recv_match_one(jsk, &session->skcb)) __j1939_sk_errqueue(session, &jsk->sk, type); } read_unlock_bh(&priv->j1939_socks_lock); }; void j1939_sk_send_loop_abort(struct sock *sk, int err) { struct j1939_sock *jsk = j1939_sk(sk); if (jsk->state & J1939_SOCK_ERRQUEUE) return; sk->sk_err = err; sk_error_report(sk); } static int j1939_sk_send_loop(struct j1939_priv *priv, struct sock *sk, struct msghdr *msg, size_t size) { struct j1939_sock *jsk = j1939_sk(sk); struct j1939_session *session = j1939_sk_get_incomplete_session(jsk); struct sk_buff *skb; size_t segment_size, todo_size; int ret = 0; if (session && session->total_message_size != session->total_queued_size + size) { j1939_session_put(session); return -EIO; } todo_size = size; do { struct j1939_sk_buff_cb *skcb; segment_size = min_t(size_t, J1939_MAX_TP_PACKET_SIZE, todo_size); /* Allocate skb for one segment */ skb = j1939_sk_alloc_skb(priv->ndev, sk, msg, segment_size, &ret); if (ret) break; skcb = j1939_skb_to_cb(skb); if (!session) { /* at this point the size should be full size * of the session */ skcb->offset = 0; session = j1939_tp_send(priv, skb, size); if (IS_ERR(session)) { ret = PTR_ERR(session); goto kfree_skb; } if (j1939_sk_queue_session(session)) { /* try to activate session if we a * fist in the queue */ if (!j1939_session_activate(session)) { j1939_tp_schedule_txtimer(session, 0); } else { ret = -EBUSY; session->err = ret; j1939_sk_queue_drop_all(priv, jsk, EBUSY); break; } } } else { skcb->offset = session->total_queued_size; j1939_session_skb_queue(session, skb); } todo_size -= segment_size; session->total_queued_size += segment_size; } while (todo_size); switch (ret) { case 0: /* OK */ if (todo_size) netdev_warn(priv->ndev, "no error found and not completely queued?! %zu\n", todo_size); ret = size; break; case -ERESTARTSYS: ret = -EINTR; fallthrough; case -EAGAIN: /* OK */ if (todo_size != size) ret = size - todo_size; break; default: /* ERROR */ break; } if (session) j1939_session_put(session); return ret; kfree_skb: kfree_skb(skb); return ret; } static int j1939_sk_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); struct j1939_priv *priv; int ifindex; int ret; lock_sock(sock->sk); /* various socket state tests */ if (!(jsk->state & J1939_SOCK_BOUND)) { ret = -EBADFD; goto sendmsg_done; } priv = jsk->priv; ifindex = jsk->ifindex; if (!jsk->addr.src_name && jsk->addr.sa == J1939_NO_ADDR) { /* no source address assigned yet */ ret = -EBADFD; goto sendmsg_done; } /* deal with provided destination address info */ if (msg->msg_name) { struct sockaddr_can *addr = msg->msg_name; if (msg->msg_namelen < J1939_MIN_NAMELEN) { ret = -EINVAL; goto sendmsg_done; } if (addr->can_family != AF_CAN) { ret = -EINVAL; goto sendmsg_done; } if (addr->can_ifindex && addr->can_ifindex != ifindex) { ret = -EBADFD; goto sendmsg_done; } if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn) && !j1939_pgn_is_clean_pdu(addr->can_addr.j1939.pgn)) { ret = -EINVAL; goto sendmsg_done; } if (!addr->can_addr.j1939.name && addr->can_addr.j1939.addr == J1939_NO_ADDR && !sock_flag(sk, SOCK_BROADCAST)) { /* broadcast, but SO_BROADCAST not set */ ret = -EACCES; goto sendmsg_done; } } else { if (!jsk->addr.dst_name && jsk->addr.da == J1939_NO_ADDR && !sock_flag(sk, SOCK_BROADCAST)) { /* broadcast, but SO_BROADCAST not set */ ret = -EACCES; goto sendmsg_done; } } ret = j1939_sk_send_loop(priv, sk, msg, size); sendmsg_done: release_sock(sock->sk); return ret; } void j1939_sk_netdev_event_netdown(struct j1939_priv *priv) { struct j1939_sock *jsk; int error_code = ENETDOWN; read_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { jsk->sk.sk_err = error_code; if (!sock_flag(&jsk->sk, SOCK_DEAD)) sk_error_report(&jsk->sk); j1939_sk_queue_drop_all(priv, jsk, error_code); } read_unlock_bh(&priv->j1939_socks_lock); } void j1939_sk_netdev_event_unregister(struct j1939_priv *priv) { struct sock *sk; struct j1939_sock *jsk; bool wait_rcu = false; rescan: /* The caller is holding a ref on this "priv" via j1939_priv_get_by_ndev(). */ read_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { /* Skip if j1939_jsk_add() is not called on this socket. */ if (!(jsk->state & J1939_SOCK_BOUND)) continue; sk = &jsk->sk; sock_hold(sk); read_unlock_bh(&priv->j1939_socks_lock); /* Check if j1939_jsk_del() is not yet called on this socket after holding * socket's lock, for both j1939_sk_bind() and j1939_sk_release() call * j1939_jsk_del() with socket's lock held. */ lock_sock(sk); if (jsk->state & J1939_SOCK_BOUND) { /* Neither j1939_sk_bind() nor j1939_sk_release() called j1939_jsk_del(). * Make this socket no longer bound, by pretending as if j1939_sk_bind() * dropped old references but did not get new references. */ j1939_jsk_del(priv, jsk); j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa); j1939_netdev_stop(priv); /* Call j1939_priv_put() now and prevent j1939_sk_sock_destruct() from * calling the corresponding j1939_priv_put(). * * j1939_sk_sock_destruct() is supposed to call j1939_priv_put() after * an RCU grace period. But since the caller is holding a ref on this * "priv", we can defer synchronize_rcu() until immediately before * the caller calls j1939_priv_put(). */ j1939_priv_put(priv); jsk->priv = NULL; wait_rcu = true; } release_sock(sk); sock_put(sk); goto rescan; } read_unlock_bh(&priv->j1939_socks_lock); if (wait_rcu) synchronize_rcu(); } static int j1939_sk_no_ioctlcmd(struct socket *sock, unsigned int cmd, unsigned long arg) { /* no ioctls for socket layer -> hand it down to NIC layer */ return -ENOIOCTLCMD; } static const struct proto_ops j1939_ops = { .family = PF_CAN, .release = j1939_sk_release, .bind = j1939_sk_bind, .connect = j1939_sk_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = j1939_sk_getname, .poll = datagram_poll, .ioctl = j1939_sk_no_ioctlcmd, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = j1939_sk_setsockopt, .getsockopt = j1939_sk_getsockopt, .sendmsg = j1939_sk_sendmsg, .recvmsg = j1939_sk_recvmsg, .mmap = sock_no_mmap, }; static struct proto j1939_proto __read_mostly = { .name = "CAN_J1939", .owner = THIS_MODULE, .obj_size = sizeof(struct j1939_sock), .init = j1939_sk_init, }; const struct can_proto j1939_can_proto = { .type = SOCK_DGRAM, .protocol = CAN_J1939, .ops = &j1939_ops, .prot = &j1939_proto, };
18 18 18 25 22 3 23 2 25 25 23 2 43 23 7 7 43 43 10 4 10 10 10 2 10 8 8 8 3 3 10 10 8 8 1 1 4 4 7 7 7 1 7 1 7 6 6 6 6 6 3 6 11 10 43 43 29 43 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 // SPDX-License-Identifier: GPL-2.0 #include <linux/jhash.h> #include <linux/netfilter.h> #include <linux/rcupdate.h> #include <linux/rhashtable.h> #include <linux/vmalloc.h> #include <net/genetlink.h> #include <net/netns/generic.h> #include <uapi/linux/genetlink.h> #include "ila.h" struct ila_xlat_params { struct ila_params ip; int ifindex; }; struct ila_map { struct ila_xlat_params xp; struct rhash_head node; struct ila_map __rcu *next; struct rcu_head rcu; }; #define MAX_LOCKS 1024 #define LOCKS_PER_CPU 10 static int alloc_ila_locks(struct ila_net *ilan) { return alloc_bucket_spinlocks(&ilan->xlat.locks, &ilan->xlat.locks_mask, MAX_LOCKS, LOCKS_PER_CPU, GFP_KERNEL); } static u32 hashrnd __read_mostly; static __always_inline void __ila_hash_secret_init(void) { net_get_random_once(&hashrnd, sizeof(hashrnd)); } static inline u32 ila_locator_hash(struct ila_locator loc) { u32 *v = (u32 *)loc.v32; __ila_hash_secret_init(); return jhash_2words(v[0], v[1], hashrnd); } static inline spinlock_t *ila_get_lock(struct ila_net *ilan, struct ila_locator loc) { return &ilan->xlat.locks[ila_locator_hash(loc) & ilan->xlat.locks_mask]; } static inline int ila_cmp_wildcards(struct ila_map *ila, struct ila_addr *iaddr, int ifindex) { return (ila->xp.ifindex && ila->xp.ifindex != ifindex); } static inline int ila_cmp_params(struct ila_map *ila, struct ila_xlat_params *xp) { return (ila->xp.ifindex != xp->ifindex); } static int ila_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) { const struct ila_map *ila = obj; return (ila->xp.ip.locator_match.v64 != *(__be64 *)arg->key); } static inline int ila_order(struct ila_map *ila) { int score = 0; if (ila->xp.ifindex) score += 1 << 1; return score; } static const struct rhashtable_params rht_params = { .nelem_hint = 1024, .head_offset = offsetof(struct ila_map, node), .key_offset = offsetof(struct ila_map, xp.ip.locator_match), .key_len = sizeof(u64), /* identifier */ .max_size = 1048576, .min_size = 256, .automatic_shrinking = true, .obj_cmpfn = ila_cmpfn, }; static int parse_nl_config(struct genl_info *info, struct ila_xlat_params *xp) { memset(xp, 0, sizeof(*xp)); if (info->attrs[ILA_ATTR_LOCATOR]) xp->ip.locator.v64 = (__force __be64)nla_get_u64( info->attrs[ILA_ATTR_LOCATOR]); if (info->attrs[ILA_ATTR_LOCATOR_MATCH]) xp->ip.locator_match.v64 = (__force __be64)nla_get_u64( info->attrs[ILA_ATTR_LOCATOR_MATCH]); xp->ip.csum_mode = nla_get_u8_default(info->attrs[ILA_ATTR_CSUM_MODE], ILA_CSUM_NO_ACTION); xp->ip.ident_type = nla_get_u8_default(info->attrs[ILA_ATTR_IDENT_TYPE], ILA_ATYPE_USE_FORMAT); if (info->attrs[ILA_ATTR_IFINDEX]) xp->ifindex = nla_get_s32(info->attrs[ILA_ATTR_IFINDEX]); return 0; } /* Must be called with rcu readlock */ static inline struct ila_map *ila_lookup_wildcards(struct ila_addr *iaddr, int ifindex, struct ila_net *ilan) { struct ila_map *ila; ila = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &iaddr->loc, rht_params); while (ila) { if (!ila_cmp_wildcards(ila, iaddr, ifindex)) return ila; ila = rcu_access_pointer(ila->next); } return NULL; } /* Must be called with rcu readlock */ static inline struct ila_map *ila_lookup_by_params(struct ila_xlat_params *xp, struct ila_net *ilan) { struct ila_map *ila; ila = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &xp->ip.locator_match, rht_params); while (ila) { if (!ila_cmp_params(ila, xp)) return ila; ila = rcu_access_pointer(ila->next); } return NULL; } static inline void ila_release(struct ila_map *ila) { kfree_rcu(ila, rcu); } static void ila_free_node(struct ila_map *ila) { struct ila_map *next; /* Assume rcu_readlock held */ while (ila) { next = rcu_access_pointer(ila->next); ila_release(ila); ila = next; } } static void ila_free_cb(void *ptr, void *arg) { ila_free_node((struct ila_map *)ptr); } static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila); static unsigned int ila_nf_input(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { ila_xlat_addr(skb, false); return NF_ACCEPT; } static const struct nf_hook_ops ila_nf_hook_ops[] = { { .hook = ila_nf_input, .pf = NFPROTO_IPV6, .hooknum = NF_INET_PRE_ROUTING, .priority = -1, }, }; static DEFINE_MUTEX(ila_mutex); static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp) { struct ila_net *ilan = net_generic(net, ila_net_id); struct ila_map *ila, *head; spinlock_t *lock = ila_get_lock(ilan, xp->ip.locator_match); int err = 0, order; if (!READ_ONCE(ilan->xlat.hooks_registered)) { /* We defer registering net hooks in the namespace until the * first mapping is added. */ mutex_lock(&ila_mutex); if (!ilan->xlat.hooks_registered) { err = nf_register_net_hooks(net, ila_nf_hook_ops, ARRAY_SIZE(ila_nf_hook_ops)); if (!err) WRITE_ONCE(ilan->xlat.hooks_registered, true); } mutex_unlock(&ila_mutex); if (err) return err; } ila = kzalloc_obj(*ila); if (!ila) return -ENOMEM; ila_init_saved_csum(&xp->ip); ila->xp = *xp; order = ila_order(ila); spin_lock(lock); head = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &xp->ip.locator_match, rht_params); if (!head) { /* New entry for the rhash_table */ err = rhashtable_lookup_insert_fast(&ilan->xlat.rhash_table, &ila->node, rht_params); } else { struct ila_map *tila = head, *prev = NULL; do { if (!ila_cmp_params(tila, xp)) { err = -EEXIST; goto out; } if (order > ila_order(tila)) break; prev = tila; tila = rcu_dereference_protected(tila->next, lockdep_is_held(lock)); } while (tila); if (prev) { /* Insert in sub list of head */ RCU_INIT_POINTER(ila->next, tila); rcu_assign_pointer(prev->next, ila); } else { /* Make this ila new head */ RCU_INIT_POINTER(ila->next, head); err = rhashtable_replace_fast(&ilan->xlat.rhash_table, &head->node, &ila->node, rht_params); if (err) goto out; } } out: spin_unlock(lock); if (err) kfree(ila); return err; } static int ila_del_mapping(struct net *net, struct ila_xlat_params *xp) { struct ila_net *ilan = net_generic(net, ila_net_id); struct ila_map *ila, *head, *prev; spinlock_t *lock = ila_get_lock(ilan, xp->ip.locator_match); int err = -ENOENT; spin_lock(lock); head = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &xp->ip.locator_match, rht_params); ila = head; prev = NULL; while (ila) { if (ila_cmp_params(ila, xp)) { prev = ila; ila = rcu_dereference_protected(ila->next, lockdep_is_held(lock)); continue; } err = 0; if (prev) { /* Not head, just delete from list */ rcu_assign_pointer(prev->next, ila->next); } else { /* It is the head. If there is something in the * sublist we need to make a new head. */ head = rcu_dereference_protected(ila->next, lockdep_is_held(lock)); if (head) { /* Put first entry in the sublist into the * table */ err = rhashtable_replace_fast( &ilan->xlat.rhash_table, &ila->node, &head->node, rht_params); if (err) goto out; } else { /* Entry no longer used */ err = rhashtable_remove_fast( &ilan->xlat.rhash_table, &ila->node, rht_params); } } ila_release(ila); break; } out: spin_unlock(lock); return err; } int ila_xlat_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct ila_xlat_params p; int err; err = parse_nl_config(info, &p); if (err) return err; return ila_add_mapping(net, &p); } int ila_xlat_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct ila_xlat_params xp; int err; err = parse_nl_config(info, &xp); if (err) return err; ila_del_mapping(net, &xp); return 0; } static inline spinlock_t *lock_from_ila_map(struct ila_net *ilan, struct ila_map *ila) { return ila_get_lock(ilan, ila->xp.ip.locator_match); } int ila_xlat_nl_cmd_flush(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct ila_net *ilan = net_generic(net, ila_net_id); struct rhashtable_iter iter; struct ila_map *ila; spinlock_t *lock; int ret = 0; rhashtable_walk_enter(&ilan->xlat.rhash_table, &iter); rhashtable_walk_start(&iter); for (;;) { ila = rhashtable_walk_next(&iter); if (IS_ERR(ila)) { if (PTR_ERR(ila) == -EAGAIN) continue; ret = PTR_ERR(ila); goto done; } else if (!ila) { break; } lock = lock_from_ila_map(ilan, ila); spin_lock(lock); ret = rhashtable_remove_fast(&ilan->xlat.rhash_table, &ila->node, rht_params); if (!ret) ila_free_node(ila); spin_unlock(lock); if (ret) break; } done: rhashtable_walk_stop(&iter); rhashtable_walk_exit(&iter); return ret; } static int ila_fill_info(struct ila_map *ila, struct sk_buff *msg) { if (nla_put_u64_64bit(msg, ILA_ATTR_LOCATOR, (__force u64)ila->xp.ip.locator.v64, ILA_ATTR_PAD) || nla_put_u64_64bit(msg, ILA_ATTR_LOCATOR_MATCH, (__force u64)ila->xp.ip.locator_match.v64, ILA_ATTR_PAD) || nla_put_s32(msg, ILA_ATTR_IFINDEX, ila->xp.ifindex) || nla_put_u8(msg, ILA_ATTR_CSUM_MODE, ila->xp.ip.csum_mode) || nla_put_u8(msg, ILA_ATTR_IDENT_TYPE, ila->xp.ip.ident_type)) return -1; return 0; } static int ila_dump_info(struct ila_map *ila, u32 portid, u32 seq, u32 flags, struct sk_buff *skb, u8 cmd) { void *hdr; hdr = genlmsg_put(skb, portid, seq, &ila_nl_family, flags, cmd); if (!hdr) return -ENOMEM; if (ila_fill_info(ila, skb) < 0) goto nla_put_failure; genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } int ila_xlat_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct ila_net *ilan = net_generic(net, ila_net_id); struct sk_buff *msg; struct ila_xlat_params xp; struct ila_map *ila; int ret; ret = parse_nl_config(info, &xp); if (ret) return ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; rcu_read_lock(); ret = -ESRCH; ila = ila_lookup_by_params(&xp, ilan); if (ila) { ret = ila_dump_info(ila, info->snd_portid, info->snd_seq, 0, msg, info->genlhdr->cmd); } rcu_read_unlock(); if (ret < 0) goto out_free; return genlmsg_reply(msg, info); out_free: nlmsg_free(msg); return ret; } struct ila_dump_iter { struct rhashtable_iter rhiter; int skip; }; int ila_xlat_nl_dump_start(struct netlink_callback *cb) { struct net *net = sock_net(cb->skb->sk); struct ila_net *ilan = net_generic(net, ila_net_id); struct ila_dump_iter *iter; iter = kmalloc_obj(*iter); if (!iter) return -ENOMEM; rhashtable_walk_enter(&ilan->xlat.rhash_table, &iter->rhiter); iter->skip = 0; cb->args[0] = (long)iter; return 0; } int ila_xlat_nl_dump_done(struct netlink_callback *cb) { struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0]; rhashtable_walk_exit(&iter->rhiter); kfree(iter); return 0; } int ila_xlat_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0]; struct rhashtable_iter *rhiter = &iter->rhiter; int skip = iter->skip; struct ila_map *ila; int ret; rhashtable_walk_start(rhiter); /* Get first entry */ ila = rhashtable_walk_peek(rhiter); if (ila && !IS_ERR(ila) && skip) { /* Skip over visited entries */ while (ila && skip) { /* Skip over any ila entries in this list that we * have already dumped. */ ila = rcu_access_pointer(ila->next); skip--; } } skip = 0; for (;;) { if (IS_ERR(ila)) { ret = PTR_ERR(ila); if (ret == -EAGAIN) { /* Table has changed and iter has reset. Return * -EAGAIN to the application even if we have * written data to the skb. The application * needs to deal with this. */ goto out_ret; } else { break; } } else if (!ila) { ret = 0; break; } while (ila) { ret = ila_dump_info(ila, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, skb, ILA_CMD_GET); if (ret) goto out; skip++; ila = rcu_access_pointer(ila->next); } skip = 0; ila = rhashtable_walk_next(rhiter); } out: iter->skip = skip; ret = (skb->len ? : ret); out_ret: rhashtable_walk_stop(rhiter); return ret; } int ila_xlat_init_net(struct net *net) { struct ila_net *ilan = net_generic(net, ila_net_id); int err; err = alloc_ila_locks(ilan); if (err) return err; err = rhashtable_init(&ilan->xlat.rhash_table, &rht_params); if (err) { free_bucket_spinlocks(ilan->xlat.locks); return err; } return 0; } void ila_xlat_pre_exit_net(struct net *net) { struct ila_net *ilan = net_generic(net, ila_net_id); if (ilan->xlat.hooks_registered) nf_unregister_net_hooks(net, ila_nf_hook_ops, ARRAY_SIZE(ila_nf_hook_ops)); } void ila_xlat_exit_net(struct net *net) { struct ila_net *ilan = net_generic(net, ila_net_id); rhashtable_free_and_destroy(&ilan->xlat.rhash_table, ila_free_cb, NULL); free_bucket_spinlocks(ilan->xlat.locks); } static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila) { struct ila_map *ila; struct ipv6hdr *ip6h = ipv6_hdr(skb); struct net *net = dev_net(skb->dev); struct ila_net *ilan = net_generic(net, ila_net_id); struct ila_addr *iaddr = ila_a2i(&ip6h->daddr); /* Assumes skb contains a valid IPv6 header that is pulled */ /* No check here that ILA type in the mapping matches what is in the * address. We assume that whatever sender gaves us can be translated. * The checksum mode however is relevant. */ rcu_read_lock(); ila = ila_lookup_wildcards(iaddr, skb->dev->ifindex, ilan); if (ila) ila_update_ipv6_locator(skb, &ila->xp.ip, sir2ila); rcu_read_unlock(); return 0; }
15 58 59 60 57 60 47 1 60 1 31 59 59 59 47 1 1 57 29 29 29 24 15 31 30 14 29 31 31 31 57 57 57 57 60 38 57 60 60 31 60 60 59 60 59 1 60 60 60 59 17 58 60 59 60 60 60 60 59 60 31 60 33 59 59 59 59 60 60 33 1 37 57 30 1 60 1 1 1 58 34 60 58 34 58 31 30 1 59 60 59 60 60 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 // SPDX-License-Identifier: GPL-2.0 /* * Generic ring buffer * * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> */ #include <linux/sched/isolation.h> #include <linux/trace_recursion.h> #include <linux/trace_events.h> #include <linux/ring_buffer.h> #include <linux/trace_clock.h> #include <linux/sched/clock.h> #include <linux/cacheflush.h> #include <linux/trace_seq.h> #include <linux/spinlock.h> #include <linux/irq_work.h> #include <linux/security.h> #include <linux/uaccess.h> #include <linux/hardirq.h> #include <linux/kthread.h> /* for self test */ #include <linux/module.h> #include <linux/percpu.h> #include <linux/mutex.h> #include <linux/delay.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/hash.h> #include <linux/list.h> #include <linux/cpu.h> #include <linux/oom.h> #include <linux/mm.h> #include <asm/local64.h> #include <asm/local.h> #include <asm/setup.h> #include "trace.h" /* * The "absolute" timestamp in the buffer is only 59 bits. * If a clock has the 5 MSBs set, it needs to be saved and * reinserted. */ #define TS_MSB (0xf8ULL << 56) #define ABS_TS_MASK (~TS_MSB) static void update_pages_handler(struct work_struct *work); #define RING_BUFFER_META_MAGIC 0xBADFEED struct ring_buffer_meta { int magic; int struct_sizes; unsigned long total_size; unsigned long buffers_offset; }; struct ring_buffer_cpu_meta { unsigned long first_buffer; unsigned long head_buffer; unsigned long commit_buffer; __u32 subbuf_size; __u32 nr_subbufs; int buffers[]; }; /* * The ring buffer header is special. We must manually up keep it. */ int ring_buffer_print_entry_header(struct trace_seq *s) { trace_seq_puts(s, "# compressed entry header\n"); trace_seq_puts(s, "\ttype_len : 5 bits\n"); trace_seq_puts(s, "\ttime_delta : 27 bits\n"); trace_seq_puts(s, "\tarray : 32 bits\n"); trace_seq_putc(s, '\n'); trace_seq_printf(s, "\tpadding : type == %d\n", RINGBUF_TYPE_PADDING); trace_seq_printf(s, "\ttime_extend : type == %d\n", RINGBUF_TYPE_TIME_EXTEND); trace_seq_printf(s, "\ttime_stamp : type == %d\n", RINGBUF_TYPE_TIME_STAMP); trace_seq_printf(s, "\tdata max type_len == %d\n", RINGBUF_TYPE_DATA_TYPE_LEN_MAX); return !trace_seq_has_overflowed(s); } /* * The ring buffer is made up of a list of pages. A separate list of pages is * allocated for each CPU. A writer may only write to a buffer that is * associated with the CPU it is currently executing on. A reader may read * from any per cpu buffer. * * The reader is special. For each per cpu buffer, the reader has its own * reader page. When a reader has read the entire reader page, this reader * page is swapped with another page in the ring buffer. * * Now, as long as the writer is off the reader page, the reader can do what * ever it wants with that page. The writer will never write to that page * again (as long as it is out of the ring buffer). * * Here's some silly ASCII art. * * +------+ * |reader| RING BUFFER * |page | * +------+ +---+ +---+ +---+ * | |-->| |-->| | * +---+ +---+ +---+ * ^ | * | | * +---------------+ * * * +------+ * |reader| RING BUFFER * |page |------------------v * +------+ +---+ +---+ +---+ * | |-->| |-->| | * +---+ +---+ +---+ * ^ | * | | * +---------------+ * * * +------+ * |reader| RING BUFFER * |page |------------------v * +------+ +---+ +---+ +---+ * ^ | |-->| |-->| | * | +---+ +---+ +---+ * | | * | | * +------------------------------+ * * * +------+ * |buffer| RING BUFFER * |page |------------------v * +------+ +---+ +---+ +---+ * ^ | | | |-->| | * | New +---+ +---+ +---+ * | Reader------^ | * | page | * +------------------------------+ * * * After we make this swap, the reader can hand this page off to the splice * code and be done with it. It can even allocate a new page if it needs to * and swap that into the ring buffer. * * We will be using cmpxchg soon to make all this lockless. * */ /* Used for individual buffers (after the counter) */ #define RB_BUFFER_OFF (1 << 20) #define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) #define RB_ALIGNMENT 4U #define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) #define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ #ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS # define RB_FORCE_8BYTE_ALIGNMENT 0 # define RB_ARCH_ALIGNMENT RB_ALIGNMENT #else # define RB_FORCE_8BYTE_ALIGNMENT 1 # define RB_ARCH_ALIGNMENT 8U #endif #define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT) /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ #define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX enum { RB_LEN_TIME_EXTEND = 8, RB_LEN_TIME_STAMP = 8, }; #define skip_time_extend(event) \ ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND)) #define extended_time(event) \ (event->type_len >= RINGBUF_TYPE_TIME_EXTEND) static inline bool rb_null_event(struct ring_buffer_event *event) { return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; } static void rb_event_set_padding(struct ring_buffer_event *event) { /* padding has a NULL time_delta */ event->type_len = RINGBUF_TYPE_PADDING; event->time_delta = 0; } static unsigned rb_event_data_length(struct ring_buffer_event *event) { unsigned length; if (event->type_len) length = event->type_len * RB_ALIGNMENT; else length = event->array[0]; return length + RB_EVNT_HDR_SIZE; } /* * Return the length of the given event. Will return * the length of the time extend if the event is a * time extend. */ static inline unsigned rb_event_length(struct ring_buffer_event *event) { switch (event->type_len) { case RINGBUF_TYPE_PADDING: if (rb_null_event(event)) /* undefined */ return -1; return event->array[0] + RB_EVNT_HDR_SIZE; case RINGBUF_TYPE_TIME_EXTEND: return RB_LEN_TIME_EXTEND; case RINGBUF_TYPE_TIME_STAMP: return RB_LEN_TIME_STAMP; case RINGBUF_TYPE_DATA: return rb_event_data_length(event); default: WARN_ON_ONCE(1); } /* not hit */ return 0; } /* * Return total length of time extend and data, * or just the event length for all other events. */ static inline unsigned rb_event_ts_length(struct ring_buffer_event *event) { unsigned len = 0; if (extended_time(event)) { /* time extends include the data event after it */ len = RB_LEN_TIME_EXTEND; event = skip_time_extend(event); } return len + rb_event_length(event); } /** * ring_buffer_event_length - return the length of the event * @event: the event to get the length of * * Returns the size of the data load of a data event. * If the event is something other than a data event, it * returns the size of the event itself. With the exception * of a TIME EXTEND, where it still returns the size of the * data load of the data event after it. */ unsigned ring_buffer_event_length(struct ring_buffer_event *event) { unsigned length; if (extended_time(event)) event = skip_time_extend(event); length = rb_event_length(event); if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) return length; length -= RB_EVNT_HDR_SIZE; if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0])) length -= sizeof(event->array[0]); return length; } EXPORT_SYMBOL_GPL(ring_buffer_event_length); /* inline for ring buffer fast paths */ static __always_inline void * rb_event_data(struct ring_buffer_event *event) { if (extended_time(event)) event = skip_time_extend(event); WARN_ON_ONCE(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); /* If length is in len field, then array[0] has the data */ if (event->type_len) return (void *)&event->array[0]; /* Otherwise length is in array[0] and array[1] has the data */ return (void *)&event->array[1]; } /** * ring_buffer_event_data - return the data of the event * @event: the event to get the data from */ void *ring_buffer_event_data(struct ring_buffer_event *event) { return rb_event_data(event); } EXPORT_SYMBOL_GPL(ring_buffer_event_data); #define for_each_buffer_cpu(buffer, cpu) \ for_each_cpu(cpu, buffer->cpumask) #define for_each_online_buffer_cpu(buffer, cpu) \ for_each_cpu_and(cpu, buffer->cpumask, cpu_online_mask) #define TS_SHIFT 27 #define TS_MASK ((1ULL << TS_SHIFT) - 1) #define TS_DELTA_TEST (~TS_MASK) static u64 rb_event_time_stamp(struct ring_buffer_event *event) { u64 ts; ts = event->array[0]; ts <<= TS_SHIFT; ts += event->time_delta; return ts; } /* Flag when events were overwritten */ #define RB_MISSED_EVENTS (1 << 31) /* Missed count stored at end */ #define RB_MISSED_STORED (1 << 30) #define RB_MISSED_MASK (3 << 30) struct buffer_data_page { u64 time_stamp; /* page time stamp */ local_t commit; /* write committed index */ unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */ }; struct buffer_data_read_page { unsigned order; /* order of the page */ struct buffer_data_page *data; /* actual data, stored in this page */ }; /* * Note, the buffer_page list must be first. The buffer pages * are allocated in cache lines, which means that each buffer * page will be at the beginning of a cache line, and thus * the least significant bits will be zero. We use this to * add flags in the list struct pointers, to make the ring buffer * lockless. */ struct buffer_page { struct list_head list; /* list of buffer pages */ local_t write; /* index for next write */ unsigned read; /* index for next read */ local_t entries; /* entries on this page */ unsigned long real_end; /* real end of data */ unsigned order; /* order of the page */ u32 id:30; /* ID for external mapping */ u32 range:1; /* Mapped via a range */ struct buffer_data_page *page; /* Actual data page */ }; /* * The buffer page counters, write and entries, must be reset * atomically when crossing page boundaries. To synchronize this * update, two counters are inserted into the number. One is * the actual counter for the write position or count on the page. * * The other is a counter of updaters. Before an update happens * the update partition of the counter is incremented. This will * allow the updater to update the counter atomically. * * The counter is 20 bits, and the state data is 12. */ #define RB_WRITE_MASK 0xfffff #define RB_WRITE_INTCNT (1 << 20) static void rb_init_page(struct buffer_data_page *bpage) { local_set(&bpage->commit, 0); } static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage) { return local_read(&bpage->page->commit); } static void free_buffer_page(struct buffer_page *bpage) { /* Range pages are not to be freed */ if (!bpage->range) free_pages((unsigned long)bpage->page, bpage->order); kfree(bpage); } /* * For best performance, allocate cpu buffer data cache line sized * and per CPU. */ #define alloc_cpu_buffer(cpu) (struct ring_buffer_per_cpu *) \ kzalloc_node(ALIGN(sizeof(struct ring_buffer_per_cpu), \ cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); #define alloc_cpu_page(cpu) (struct buffer_page *) \ kzalloc_node(ALIGN(sizeof(struct buffer_page), \ cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); static struct buffer_data_page *alloc_cpu_data(int cpu, int order) { struct buffer_data_page *dpage; struct page *page; gfp_t mflags; /* * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails * gracefully without invoking oom-killer and the system is not * destabilized. */ mflags = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_COMP | __GFP_ZERO; page = alloc_pages_node(cpu_to_node(cpu), mflags, order); if (!page) return NULL; dpage = page_address(page); rb_init_page(dpage); return dpage; } /* * We need to fit the time_stamp delta into 27 bits. */ static inline bool test_time_stamp(u64 delta) { return !!(delta & TS_DELTA_TEST); } struct rb_irq_work { struct irq_work work; wait_queue_head_t waiters; wait_queue_head_t full_waiters; atomic_t seq; bool waiters_pending; bool full_waiters_pending; bool wakeup_full; }; /* * Structure to hold event state and handle nested events. */ struct rb_event_info { u64 ts; u64 delta; u64 before; u64 after; unsigned long length; struct buffer_page *tail_page; int add_timestamp; }; /* * Used for the add_timestamp * NONE * EXTEND - wants a time extend * ABSOLUTE - the buffer requests all events to have absolute time stamps * FORCE - force a full time stamp. */ enum { RB_ADD_STAMP_NONE = 0, RB_ADD_STAMP_EXTEND = BIT(1), RB_ADD_STAMP_ABSOLUTE = BIT(2), RB_ADD_STAMP_FORCE = BIT(3) }; /* * Used for which event context the event is in. * TRANSITION = 0 * NMI = 1 * IRQ = 2 * SOFTIRQ = 3 * NORMAL = 4 * * See trace_recursive_lock() comment below for more details. */ enum { RB_CTX_TRANSITION, RB_CTX_NMI, RB_CTX_IRQ, RB_CTX_SOFTIRQ, RB_CTX_NORMAL, RB_CTX_MAX }; struct rb_time_struct { local64_t time; }; typedef struct rb_time_struct rb_time_t; #define MAX_NEST 5 /* * head_page == tail_page && head == tail then buffer is empty. */ struct ring_buffer_per_cpu { int cpu; atomic_t record_disabled; atomic_t resize_disabled; struct trace_buffer *buffer; raw_spinlock_t reader_lock; /* serialize readers */ arch_spinlock_t lock; struct lock_class_key lock_key; struct buffer_data_page *free_page; unsigned long nr_pages; unsigned int current_context; struct list_head *pages; /* pages generation counter, incremented when the list changes */ unsigned long cnt; struct buffer_page *head_page; /* read from head */ struct buffer_page *tail_page; /* write to tail */ struct buffer_page *commit_page; /* committed pages */ struct buffer_page *reader_page; unsigned long lost_events; unsigned long last_overrun; unsigned long nest; local_t entries_bytes; local_t entries; local_t overrun; local_t commit_overrun; local_t dropped_events; local_t committing; local_t commits; local_t pages_touched; local_t pages_lost; local_t pages_read; long last_pages_touch; size_t shortest_full; unsigned long read; unsigned long read_bytes; rb_time_t write_stamp; rb_time_t before_stamp; u64 event_stamp[MAX_NEST]; u64 read_stamp; /* pages removed since last reset */ unsigned long pages_removed; unsigned int mapped; unsigned int user_mapped; /* user space mapping */ struct mutex mapping_lock; unsigned long *subbuf_ids; /* ID to subbuf VA */ struct trace_buffer_meta *meta_page; struct ring_buffer_cpu_meta *ring_meta; /* ring buffer pages to update, > 0 to add, < 0 to remove */ long nr_pages_to_update; struct list_head new_pages; /* new pages to add */ struct work_struct update_pages_work; struct completion update_done; struct rb_irq_work irq_work; }; struct trace_buffer { unsigned flags; int cpus; atomic_t record_disabled; atomic_t resizing; cpumask_var_t cpumask; struct lock_class_key *reader_lock_key; struct mutex mutex; struct ring_buffer_per_cpu **buffers; struct hlist_node node; u64 (*clock)(void); struct rb_irq_work irq_work; bool time_stamp_abs; unsigned long range_addr_start; unsigned long range_addr_end; struct ring_buffer_meta *meta; unsigned int subbuf_size; unsigned int subbuf_order; unsigned int max_data_size; }; struct ring_buffer_iter { struct ring_buffer_per_cpu *cpu_buffer; unsigned long head; unsigned long next_event; struct buffer_page *head_page; struct buffer_page *cache_reader_page; unsigned long cache_read; unsigned long cache_pages_removed; u64 read_stamp; u64 page_stamp; struct ring_buffer_event *event; size_t event_size; int missed_events; }; int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s) { struct buffer_data_page field; trace_seq_printf(s, "\tfield: u64 timestamp;\t" "offset:0;\tsize:%u;\tsigned:%u;\n", (unsigned int)sizeof(field.time_stamp), (unsigned int)is_signed_type(u64)); trace_seq_printf(s, "\tfield: local_t commit;\t" "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), commit), (unsigned int)sizeof(field.commit), (unsigned int)is_signed_type(long)); trace_seq_printf(s, "\tfield: int overwrite;\t" "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), commit), 1, (unsigned int)is_signed_type(long)); trace_seq_printf(s, "\tfield: char data;\t" "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), data), (unsigned int)buffer->subbuf_size, (unsigned int)is_signed_type(char)); return !trace_seq_has_overflowed(s); } static inline void rb_time_read(rb_time_t *t, u64 *ret) { *ret = local64_read(&t->time); } static void rb_time_set(rb_time_t *t, u64 val) { local64_set(&t->time, val); } /* * Enable this to make sure that the event passed to * ring_buffer_event_time_stamp() is not committed and also * is on the buffer that it passed in. */ //#define RB_VERIFY_EVENT #ifdef RB_VERIFY_EVENT static struct list_head *rb_list_head(struct list_head *list); static void verify_event(struct ring_buffer_per_cpu *cpu_buffer, void *event) { struct buffer_page *page = cpu_buffer->commit_page; struct buffer_page *tail_page = READ_ONCE(cpu_buffer->tail_page); struct list_head *next; long commit, write; unsigned long addr = (unsigned long)event; bool done = false; int stop = 0; /* Make sure the event exists and is not committed yet */ do { if (page == tail_page || WARN_ON_ONCE(stop++ > 100)) done = true; commit = local_read(&page->page->commit); write = local_read(&page->write); if (addr >= (unsigned long)&page->page->data[commit] && addr < (unsigned long)&page->page->data[write]) return; next = rb_list_head(page->list.next); page = list_entry(next, struct buffer_page, list); } while (!done); WARN_ON_ONCE(1); } #else static inline void verify_event(struct ring_buffer_per_cpu *cpu_buffer, void *event) { } #endif /* * The absolute time stamp drops the 5 MSBs and some clocks may * require them. The rb_fix_abs_ts() will take a previous full * time stamp, and add the 5 MSB of that time stamp on to the * saved absolute time stamp. Then they are compared in case of * the unlikely event that the latest time stamp incremented * the 5 MSB. */ static inline u64 rb_fix_abs_ts(u64 abs, u64 save_ts) { if (save_ts & TS_MSB) { abs |= save_ts & TS_MSB; /* Check for overflow */ if (unlikely(abs < save_ts)) abs += 1ULL << 59; } return abs; } static inline u64 rb_time_stamp(struct trace_buffer *buffer); /** * ring_buffer_event_time_stamp - return the event's current time stamp * @buffer: The buffer that the event is on * @event: the event to get the time stamp of * * Note, this must be called after @event is reserved, and before it is * committed to the ring buffer. And must be called from the same * context where the event was reserved (normal, softirq, irq, etc). * * Returns the time stamp associated with the current event. * If the event has an extended time stamp, then that is used as * the time stamp to return. * In the highly unlikely case that the event was nested more than * the max nesting, then the write_stamp of the buffer is returned, * otherwise current time is returned, but that really neither of * the last two cases should ever happen. */ u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *event) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[smp_processor_id()]; unsigned int nest; u64 ts; /* If the event includes an absolute time, then just use that */ if (event->type_len == RINGBUF_TYPE_TIME_STAMP) { ts = rb_event_time_stamp(event); return rb_fix_abs_ts(ts, cpu_buffer->tail_page->page->time_stamp); } nest = local_read(&cpu_buffer->committing); verify_event(cpu_buffer, event); if (WARN_ON_ONCE(!nest)) goto fail; /* Read the current saved nesting level time stamp */ if (likely(--nest < MAX_NEST)) return cpu_buffer->event_stamp[nest]; /* Shouldn't happen, warn if it does */ WARN_ONCE(1, "nest (%d) greater than max", nest); fail: rb_time_read(&cpu_buffer->write_stamp, &ts); return ts; } /** * ring_buffer_nr_dirty_pages - get the number of used pages in the ring buffer * @buffer: The ring_buffer to get the number of pages from * @cpu: The cpu of the ring_buffer to get the number of pages from * * Returns the number of pages that have content in the ring buffer. */ size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu) { size_t read; size_t lost; size_t cnt; read = local_read(&buffer->buffers[cpu]->pages_read); lost = local_read(&buffer->buffers[cpu]->pages_lost); cnt = local_read(&buffer->buffers[cpu]->pages_touched); if (WARN_ON_ONCE(cnt < lost)) return 0; cnt -= lost; /* The reader can read an empty page, but not more than that */ if (cnt < read) { WARN_ON_ONCE(read > cnt + 1); return 0; } return cnt - read; } static __always_inline bool full_hit(struct trace_buffer *buffer, int cpu, int full) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; size_t nr_pages; size_t dirty; nr_pages = cpu_buffer->nr_pages; if (!nr_pages || !full) return true; /* * Add one as dirty will never equal nr_pages, as the sub-buffer * that the writer is on is not counted as dirty. * This is needed if "buffer_percent" is set to 100. */ dirty = ring_buffer_nr_dirty_pages(buffer, cpu) + 1; return (dirty * 100) >= (full * nr_pages); } /* * rb_wake_up_waiters - wake up tasks waiting for ring buffer input * * Schedules a delayed work to wake up any task that is blocked on the * ring buffer waiters queue. */ static void rb_wake_up_waiters(struct irq_work *work) { struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work); /* For waiters waiting for the first wake up */ (void)atomic_fetch_inc_release(&rbwork->seq); wake_up_all(&rbwork->waiters); if (rbwork->full_waiters_pending || rbwork->wakeup_full) { /* Only cpu_buffer sets the above flags */ struct ring_buffer_per_cpu *cpu_buffer = container_of(rbwork, struct ring_buffer_per_cpu, irq_work); /* Called from interrupt context */ raw_spin_lock(&cpu_buffer->reader_lock); rbwork->wakeup_full = false; rbwork->full_waiters_pending = false; /* Waking up all waiters, they will reset the shortest full */ cpu_buffer->shortest_full = 0; raw_spin_unlock(&cpu_buffer->reader_lock); wake_up_all(&rbwork->full_waiters); } } /** * ring_buffer_wake_waiters - wake up any waiters on this ring buffer * @buffer: The ring buffer to wake waiters on * @cpu: The CPU buffer to wake waiters on * * In the case of a file that represents a ring buffer is closing, * it is prudent to wake up any waiters that are on this. */ void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; struct rb_irq_work *rbwork; if (!buffer) return; if (cpu == RING_BUFFER_ALL_CPUS) { /* Wake up individual ones too. One level recursion */ for_each_buffer_cpu(buffer, cpu) ring_buffer_wake_waiters(buffer, cpu); rbwork = &buffer->irq_work; } else { if (WARN_ON_ONCE(!buffer->buffers)) return; if (WARN_ON_ONCE(cpu >= nr_cpu_ids)) return; cpu_buffer = buffer->buffers[cpu]; /* The CPU buffer may not have been initialized yet */ if (!cpu_buffer) return; rbwork = &cpu_buffer->irq_work; } /* This can be called in any context */ irq_work_queue(&rbwork->work); } static bool rb_watermark_hit(struct trace_buffer *buffer, int cpu, int full) { struct ring_buffer_per_cpu *cpu_buffer; bool ret = false; /* Reads of all CPUs always waits for any data */ if (cpu == RING_BUFFER_ALL_CPUS) return !ring_buffer_empty(buffer); cpu_buffer = buffer->buffers[cpu]; if (!ring_buffer_empty_cpu(buffer, cpu)) { unsigned long flags; bool pagebusy; if (!full) return true; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; ret = !pagebusy && full_hit(buffer, cpu, full); if (!ret && (!cpu_buffer->shortest_full || cpu_buffer->shortest_full > full)) { cpu_buffer->shortest_full = full; } raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } return ret; } static inline bool rb_wait_cond(struct rb_irq_work *rbwork, struct trace_buffer *buffer, int cpu, int full, ring_buffer_cond_fn cond, void *data) { if (rb_watermark_hit(buffer, cpu, full)) return true; if (cond(data)) return true; /* * The events can happen in critical sections where * checking a work queue can cause deadlocks. * After adding a task to the queue, this flag is set * only to notify events to try to wake up the queue * using irq_work. * * We don't clear it even if the buffer is no longer * empty. The flag only causes the next event to run * irq_work to do the work queue wake up. The worse * that can happen if we race with !trace_empty() is that * an event will cause an irq_work to try to wake up * an empty queue. * * There's no reason to protect this flag either, as * the work queue and irq_work logic will do the necessary * synchronization for the wake ups. The only thing * that is necessary is that the wake up happens after * a task has been queued. It's OK for spurious wake ups. */ if (full) rbwork->full_waiters_pending = true; else rbwork->waiters_pending = true; return false; } struct rb_wait_data { struct rb_irq_work *irq_work; int seq; }; /* * The default wait condition for ring_buffer_wait() is to just to exit the * wait loop the first time it is woken up. */ static bool rb_wait_once(void *data) { struct rb_wait_data *rdata = data; struct rb_irq_work *rbwork = rdata->irq_work; return atomic_read_acquire(&rbwork->seq) != rdata->seq; } /** * ring_buffer_wait - wait for input to the ring buffer * @buffer: buffer to wait on * @cpu: the cpu buffer to wait on * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS * @cond: condition function to break out of wait (NULL to run once) * @data: the data to pass to @cond. * * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon * as data is added to any of the @buffer's cpu buffers. Otherwise * it will wait for data to be added to a specific cpu buffer. */ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full, ring_buffer_cond_fn cond, void *data) { struct ring_buffer_per_cpu *cpu_buffer; struct wait_queue_head *waitq; struct rb_irq_work *rbwork; struct rb_wait_data rdata; int ret = 0; /* * Depending on what the caller is waiting for, either any * data in any cpu buffer, or a specific buffer, put the * caller on the appropriate wait queue. */ if (cpu == RING_BUFFER_ALL_CPUS) { rbwork = &buffer->irq_work; /* Full only makes sense on per cpu reads */ full = 0; } else { if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -ENODEV; cpu_buffer = buffer->buffers[cpu]; rbwork = &cpu_buffer->irq_work; } if (full) waitq = &rbwork->full_waiters; else waitq = &rbwork->waiters; /* Set up to exit loop as soon as it is woken */ if (!cond) { cond = rb_wait_once; rdata.irq_work = rbwork; rdata.seq = atomic_read_acquire(&rbwork->seq); data = &rdata; } ret = wait_event_interruptible((*waitq), rb_wait_cond(rbwork, buffer, cpu, full, cond, data)); return ret; } /** * ring_buffer_poll_wait - poll on buffer input * @buffer: buffer to wait on * @cpu: the cpu buffer to wait on * @filp: the file descriptor * @poll_table: The poll descriptor * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS * * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon * as data is added to any of the @buffer's cpu buffers. Otherwise * it will wait for data to be added to a specific cpu buffer. * * Returns EPOLLIN | EPOLLRDNORM if data exists in the buffers, * zero otherwise. */ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, struct file *filp, poll_table *poll_table, int full) { struct ring_buffer_per_cpu *cpu_buffer; struct rb_irq_work *rbwork; if (cpu == RING_BUFFER_ALL_CPUS) { rbwork = &buffer->irq_work; full = 0; } else { if (!cpumask_test_cpu(cpu, buffer->cpumask)) return EPOLLERR; cpu_buffer = buffer->buffers[cpu]; rbwork = &cpu_buffer->irq_work; } if (full) { poll_wait(filp, &rbwork->full_waiters, poll_table); if (rb_watermark_hit(buffer, cpu, full)) return EPOLLIN | EPOLLRDNORM; /* * Only allow full_waiters_pending update to be seen after * the shortest_full is set (in rb_watermark_hit). If the * writer sees the full_waiters_pending flag set, it will * compare the amount in the ring buffer to shortest_full. * If the amount in the ring buffer is greater than the * shortest_full percent, it will call the irq_work handler * to wake up this list. The irq_handler will reset shortest_full * back to zero. That's done under the reader_lock, but * the below smp_mb() makes sure that the update to * full_waiters_pending doesn't leak up into the above. */ smp_mb(); rbwork->full_waiters_pending = true; return 0; } poll_wait(filp, &rbwork->waiters, poll_table); rbwork->waiters_pending = true; /* * There's a tight race between setting the waiters_pending and * checking if the ring buffer is empty. Once the waiters_pending bit * is set, the next event will wake the task up, but we can get stuck * if there's only a single event in. * * FIXME: Ideally, we need a memory barrier on the writer side as well, * but adding a memory barrier to all events will cause too much of a * performance hit in the fast path. We only need a memory barrier when * the buffer goes from empty to having content. But as this race is * extremely small, and it's not a problem if another event comes in, we * will fix it later. */ smp_mb(); if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) return EPOLLIN | EPOLLRDNORM; return 0; } /* buffer may be either ring_buffer or ring_buffer_per_cpu */ #define RB_WARN_ON(b, cond) \ ({ \ int _____ret = unlikely(cond); \ if (_____ret) { \ if (__same_type(*(b), struct ring_buffer_per_cpu)) { \ struct ring_buffer_per_cpu *__b = \ (void *)b; \ atomic_inc(&__b->buffer->record_disabled); \ } else \ atomic_inc(&b->record_disabled); \ WARN_ON(1); \ } \ _____ret; \ }) /* Up this if you want to test the TIME_EXTENTS and normalization */ #define DEBUG_SHIFT 0 static inline u64 rb_time_stamp(struct trace_buffer *buffer) { u64 ts; /* Skip retpolines :-( */ if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && likely(buffer->clock == trace_clock_local)) ts = trace_clock_local(); else ts = buffer->clock(); /* shift to debug/test normalization and TIME_EXTENTS */ return ts << DEBUG_SHIFT; } u64 ring_buffer_time_stamp(struct trace_buffer *buffer) { u64 time; preempt_disable_notrace(); time = rb_time_stamp(buffer); preempt_enable_notrace(); return time; } EXPORT_SYMBOL_GPL(ring_buffer_time_stamp); void ring_buffer_normalize_time_stamp(struct trace_buffer *buffer, int cpu, u64 *ts) { /* Just stupid testing the normalize function and deltas */ *ts >>= DEBUG_SHIFT; } EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); /* * Making the ring buffer lockless makes things tricky. * Although writes only happen on the CPU that they are on, * and they only need to worry about interrupts. Reads can * happen on any CPU. * * The reader page is always off the ring buffer, but when the * reader finishes with a page, it needs to swap its page with * a new one from the buffer. The reader needs to take from * the head (writes go to the tail). But if a writer is in overwrite * mode and wraps, it must push the head page forward. * * Here lies the problem. * * The reader must be careful to replace only the head page, and * not another one. As described at the top of the file in the * ASCII art, the reader sets its old page to point to the next * page after head. It then sets the page after head to point to * the old reader page. But if the writer moves the head page * during this operation, the reader could end up with the tail. * * We use cmpxchg to help prevent this race. We also do something * special with the page before head. We set the LSB to 1. * * When the writer must push the page forward, it will clear the * bit that points to the head page, move the head, and then set * the bit that points to the new head page. * * We also don't want an interrupt coming in and moving the head * page on another writer. Thus we use the second LSB to catch * that too. Thus: * * head->list->prev->next bit 1 bit 0 * ------- ------- * Normal page 0 0 * Points to head page 0 1 * New head page 1 0 * * Note we can not trust the prev pointer of the head page, because: * * +----+ +-----+ +-----+ * | |------>| T |---X--->| N | * | |<------| | | | * +----+ +-----+ +-----+ * ^ ^ | * | +-----+ | | * +----------| R |----------+ | * | |<-----------+ * +-----+ * * Key: ---X--> HEAD flag set in pointer * T Tail page * R Reader page * N Next page * * (see __rb_reserve_next() to see where this happens) * * What the above shows is that the reader just swapped out * the reader page with a page in the buffer, but before it * could make the new header point back to the new page added * it was preempted by a writer. The writer moved forward onto * the new page added by the reader and is about to move forward * again. * * You can see, it is legitimate for the previous pointer of * the head (or any page) not to point back to itself. But only * temporarily. */ #define RB_PAGE_NORMAL 0UL #define RB_PAGE_HEAD 1UL #define RB_PAGE_UPDATE 2UL #define RB_FLAG_MASK 3UL /* PAGE_MOVED is not part of the mask */ #define RB_PAGE_MOVED 4UL /* * rb_list_head - remove any bit */ static struct list_head *rb_list_head(struct list_head *list) { unsigned long val = (unsigned long)list; return (struct list_head *)(val & ~RB_FLAG_MASK); } /* * rb_is_head_page - test if the given page is the head page * * Because the reader may move the head_page pointer, we can * not trust what the head page is (it may be pointing to * the reader page). But if the next page is a header page, * its flags will be non zero. */ static inline int rb_is_head_page(struct buffer_page *page, struct list_head *list) { unsigned long val; val = (unsigned long)list->next; if ((val & ~RB_FLAG_MASK) != (unsigned long)&page->list) return RB_PAGE_MOVED; return val & RB_FLAG_MASK; } /* * rb_is_reader_page * * The unique thing about the reader page, is that, if the * writer is ever on it, the previous pointer never points * back to the reader page. */ static bool rb_is_reader_page(struct buffer_page *page) { struct list_head *list = page->list.prev; return rb_list_head(list->next) != &page->list; } /* * rb_set_list_to_head - set a list_head to be pointing to head. */ static void rb_set_list_to_head(struct list_head *list) { unsigned long *ptr; ptr = (unsigned long *)&list->next; *ptr |= RB_PAGE_HEAD; *ptr &= ~RB_PAGE_UPDATE; } /* * rb_head_page_activate - sets up head page */ static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *head; head = cpu_buffer->head_page; if (!head) return; /* * Set the previous list pointer to have the HEAD flag. */ rb_set_list_to_head(head->list.prev); if (cpu_buffer->ring_meta) { struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; meta->head_buffer = (unsigned long)head->page; } } static void rb_list_head_clear(struct list_head *list) { unsigned long *ptr = (unsigned long *)&list->next; *ptr &= ~RB_FLAG_MASK; } /* * rb_head_page_deactivate - clears head page ptr (for free list) */ static void rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer) { struct list_head *hd; /* Go through the whole list and clear any pointers found. */ rb_list_head_clear(cpu_buffer->pages); list_for_each(hd, cpu_buffer->pages) rb_list_head_clear(hd); } static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *head, struct buffer_page *prev, int old_flag, int new_flag) { struct list_head *list; unsigned long val = (unsigned long)&head->list; unsigned long ret; list = &prev->list; val &= ~RB_FLAG_MASK; ret = cmpxchg((unsigned long *)&list->next, val | old_flag, val | new_flag); /* check if the reader took the page */ if ((ret & ~RB_FLAG_MASK) != val) return RB_PAGE_MOVED; return ret & RB_FLAG_MASK; } static int rb_head_page_set_update(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *head, struct buffer_page *prev, int old_flag) { return rb_head_page_set(cpu_buffer, head, prev, old_flag, RB_PAGE_UPDATE); } static int rb_head_page_set_head(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *head, struct buffer_page *prev, int old_flag) { return rb_head_page_set(cpu_buffer, head, prev, old_flag, RB_PAGE_HEAD); } static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *head, struct buffer_page *prev, int old_flag) { return rb_head_page_set(cpu_buffer, head, prev, old_flag, RB_PAGE_NORMAL); } static inline void rb_inc_page(struct buffer_page **bpage) { struct list_head *p = rb_list_head((*bpage)->list.next); *bpage = list_entry(p, struct buffer_page, list); } static inline void rb_dec_page(struct buffer_page **bpage) { struct list_head *p = rb_list_head((*bpage)->list.prev); *bpage = list_entry(p, struct buffer_page, list); } static struct buffer_page * rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *head; struct buffer_page *page; struct list_head *list; int i; if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page)) return NULL; /* sanity check */ list = cpu_buffer->pages; if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev->next) != list)) return NULL; page = head = cpu_buffer->head_page; /* * It is possible that the writer moves the header behind * where we started, and we miss in one loop. * A second loop should grab the header, but we'll do * three loops just because I'm paranoid. */ for (i = 0; i < 3; i++) { do { if (rb_is_head_page(page, page->list.prev)) { cpu_buffer->head_page = page; return page; } rb_inc_page(&page); } while (page != head); } RB_WARN_ON(cpu_buffer, 1); return NULL; } static bool rb_head_page_replace(struct buffer_page *old, struct buffer_page *new) { unsigned long *ptr = (unsigned long *)&old->list.prev->next; unsigned long val; val = *ptr & ~RB_FLAG_MASK; val |= RB_PAGE_HEAD; return try_cmpxchg(ptr, &val, (unsigned long)&new->list); } /* * rb_tail_page_update - move the tail page forward */ static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *tail_page, struct buffer_page *next_page) { unsigned long old_entries; unsigned long old_write; /* * The tail page now needs to be moved forward. * * We need to reset the tail page, but without messing * with possible erasing of data brought in by interrupts * that have moved the tail page and are currently on it. * * We add a counter to the write field to denote this. */ old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write); old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries); /* * Just make sure we have seen our old_write and synchronize * with any interrupts that come in. */ barrier(); /* * If the tail page is still the same as what we think * it is, then it is up to us to update the tail * pointer. */ if (tail_page == READ_ONCE(cpu_buffer->tail_page)) { /* Zero the write counter */ unsigned long val = old_write & ~RB_WRITE_MASK; unsigned long eval = old_entries & ~RB_WRITE_MASK; /* * This will only succeed if an interrupt did * not come in and change it. In which case, we * do not want to modify it. * * We add (void) to let the compiler know that we do not care * about the return value of these functions. We use the * cmpxchg to only update if an interrupt did not already * do it for us. If the cmpxchg fails, we don't care. */ (void)local_cmpxchg(&next_page->write, old_write, val); (void)local_cmpxchg(&next_page->entries, old_entries, eval); /* * No need to worry about races with clearing out the commit. * it only can increment when a commit takes place. But that * only happens in the outer most nested commit. */ local_set(&next_page->page->commit, 0); /* Either we update tail_page or an interrupt does */ if (try_cmpxchg(&cpu_buffer->tail_page, &tail_page, next_page)) local_inc(&cpu_buffer->pages_touched); } } static void rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *bpage) { unsigned long val = (unsigned long)bpage; RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK); } static bool rb_check_links(struct ring_buffer_per_cpu *cpu_buffer, struct list_head *list) { if (RB_WARN_ON(cpu_buffer, rb_list_head(rb_list_head(list->next)->prev) != list)) return false; if (RB_WARN_ON(cpu_buffer, rb_list_head(rb_list_head(list->prev)->next) != list)) return false; return true; } /** * rb_check_pages - integrity check of buffer pages * @cpu_buffer: CPU buffer with pages to test * * As a safety measure we check to make sure the data pages have not * been corrupted. */ static void rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) { struct list_head *head, *tmp; unsigned long buffer_cnt; unsigned long flags; int nr_loops = 0; /* * Walk the linked list underpinning the ring buffer and validate all * its next and prev links. * * The check acquires the reader_lock to avoid concurrent processing * with code that could be modifying the list. However, the lock cannot * be held for the entire duration of the walk, as this would make the * time when interrupts are disabled non-deterministic, dependent on the * ring buffer size. Therefore, the code releases and re-acquires the * lock after checking each page. The ring_buffer_per_cpu.cnt variable * is then used to detect if the list was modified while the lock was * not held, in which case the check needs to be restarted. * * The code attempts to perform the check at most three times before * giving up. This is acceptable because this is only a self-validation * to detect problems early on. In practice, the list modification * operations are fairly spaced, and so this check typically succeeds at * most on the second try. */ again: if (++nr_loops > 3) return; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); head = rb_list_head(cpu_buffer->pages); if (!rb_check_links(cpu_buffer, head)) goto out_locked; buffer_cnt = cpu_buffer->cnt; tmp = head; raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); while (true) { raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); if (buffer_cnt != cpu_buffer->cnt) { /* The list was updated, try again. */ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); goto again; } tmp = rb_list_head(tmp->next); if (tmp == head) /* The iteration circled back, all is done. */ goto out_locked; if (!rb_check_links(cpu_buffer, tmp)) goto out_locked; raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } out_locked: raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } /* * Take an address, add the meta data size as well as the array of * array subbuffer indexes, then align it to a subbuffer size. * * This is used to help find the next per cpu subbuffer within a mapped range. */ static unsigned long rb_range_align_subbuf(unsigned long addr, int subbuf_size, int nr_subbufs) { addr += sizeof(struct ring_buffer_cpu_meta) + sizeof(int) * nr_subbufs; return ALIGN(addr, subbuf_size); } /* * Return the ring_buffer_meta for a given @cpu. */ static void *rb_range_meta(struct trace_buffer *buffer, int nr_pages, int cpu) { int subbuf_size = buffer->subbuf_size + BUF_PAGE_HDR_SIZE; struct ring_buffer_cpu_meta *meta; struct ring_buffer_meta *bmeta; unsigned long ptr; int nr_subbufs; bmeta = buffer->meta; if (!bmeta) return NULL; ptr = (unsigned long)bmeta + bmeta->buffers_offset; meta = (struct ring_buffer_cpu_meta *)ptr; /* When nr_pages passed in is zero, the first meta has already been initialized */ if (!nr_pages) { nr_subbufs = meta->nr_subbufs; } else { /* Include the reader page */ nr_subbufs = nr_pages + 1; } /* * The first chunk may not be subbuffer aligned, where as * the rest of the chunks are. */ if (cpu) { ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs); ptr += subbuf_size * nr_subbufs; /* We can use multiplication to find chunks greater than 1 */ if (cpu > 1) { unsigned long size; unsigned long p; /* Save the beginning of this CPU chunk */ p = ptr; ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs); ptr += subbuf_size * nr_subbufs; /* Now all chunks after this are the same size */ size = ptr - p; ptr += size * (cpu - 2); } } return (void *)ptr; } /* Return the start of subbufs given the meta pointer */ static void *rb_subbufs_from_meta(struct ring_buffer_cpu_meta *meta) { int subbuf_size = meta->subbuf_size; unsigned long ptr; ptr = (unsigned long)meta; ptr = rb_range_align_subbuf(ptr, subbuf_size, meta->nr_subbufs); return (void *)ptr; } /* * Return a specific sub-buffer for a given @cpu defined by @idx. */ static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx) { struct ring_buffer_cpu_meta *meta; unsigned long ptr; int subbuf_size; meta = rb_range_meta(cpu_buffer->buffer, 0, cpu_buffer->cpu); if (!meta) return NULL; if (WARN_ON_ONCE(idx >= meta->nr_subbufs)) return NULL; subbuf_size = meta->subbuf_size; /* Map this buffer to the order that's in meta->buffers[] */ idx = meta->buffers[idx]; ptr = (unsigned long)rb_subbufs_from_meta(meta); ptr += subbuf_size * idx; if (ptr + subbuf_size > cpu_buffer->buffer->range_addr_end) return NULL; return (void *)ptr; } /* * See if the existing memory contains a valid meta section. * if so, use that, otherwise initialize it. */ static bool rb_meta_init(struct trace_buffer *buffer, int scratch_size) { unsigned long ptr = buffer->range_addr_start; struct ring_buffer_meta *bmeta; unsigned long total_size; int struct_sizes; bmeta = (struct ring_buffer_meta *)ptr; buffer->meta = bmeta; total_size = buffer->range_addr_end - buffer->range_addr_start; struct_sizes = sizeof(struct ring_buffer_cpu_meta); struct_sizes |= sizeof(*bmeta) << 16; /* The first buffer will start word size after the meta page */ ptr += sizeof(*bmeta); ptr = ALIGN(ptr, sizeof(long)); ptr += scratch_size; if (bmeta->magic != RING_BUFFER_META_MAGIC) { pr_info("Ring buffer boot meta mismatch of magic\n"); goto init; } if (bmeta->struct_sizes != struct_sizes) { pr_info("Ring buffer boot meta mismatch of struct size\n"); goto init; } if (bmeta->total_size != total_size) { pr_info("Ring buffer boot meta mismatch of total size\n"); goto init; } if (bmeta->buffers_offset > bmeta->total_size) { pr_info("Ring buffer boot meta mismatch of offset outside of total size\n"); goto init; } if (bmeta->buffers_offset != (void *)ptr - (void *)bmeta) { pr_info("Ring buffer boot meta mismatch of first buffer offset\n"); goto init; } return true; init: bmeta->magic = RING_BUFFER_META_MAGIC; bmeta->struct_sizes = struct_sizes; bmeta->total_size = total_size; bmeta->buffers_offset = (void *)ptr - (void *)bmeta; /* Zero out the scratch pad */ memset((void *)bmeta + sizeof(*bmeta), 0, bmeta->buffers_offset - sizeof(*bmeta)); return false; } /* * See if the existing memory contains valid ring buffer data. * As the previous kernel must be the same as this kernel, all * the calculations (size of buffers and number of buffers) * must be the same. */ static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu, struct trace_buffer *buffer, int nr_pages, unsigned long *subbuf_mask) { int subbuf_size = PAGE_SIZE; struct buffer_data_page *subbuf; unsigned long buffers_start; unsigned long buffers_end; int i; if (!subbuf_mask) return false; buffers_start = meta->first_buffer; buffers_end = meta->first_buffer + (subbuf_size * meta->nr_subbufs); /* Is the head and commit buffers within the range of buffers? */ if (meta->head_buffer < buffers_start || meta->head_buffer >= buffers_end) { pr_info("Ring buffer boot meta [%d] head buffer out of range\n", cpu); return false; } if (meta->commit_buffer < buffers_start || meta->commit_buffer >= buffers_end) { pr_info("Ring buffer boot meta [%d] commit buffer out of range\n", cpu); return false; } subbuf = rb_subbufs_from_meta(meta); bitmap_clear(subbuf_mask, 0, meta->nr_subbufs); /* Is the meta buffers and the subbufs themselves have correct data? */ for (i = 0; i < meta->nr_subbufs; i++) { if (meta->buffers[i] < 0 || meta->buffers[i] >= meta->nr_subbufs) { pr_info("Ring buffer boot meta [%d] array out of range\n", cpu); return false; } if ((unsigned)local_read(&subbuf->commit) > subbuf_size) { pr_info("Ring buffer boot meta [%d] buffer invalid commit\n", cpu); return false; } if (test_bit(meta->buffers[i], subbuf_mask)) { pr_info("Ring buffer boot meta [%d] array has duplicates\n", cpu); return false; } set_bit(meta->buffers[i], subbuf_mask); subbuf = (void *)subbuf + subbuf_size; } return true; } static int rb_meta_subbuf_idx(struct ring_buffer_cpu_meta *meta, void *subbuf); static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu, unsigned long long *timestamp, u64 *delta_ptr) { struct ring_buffer_event *event; u64 ts, delta; int events = 0; int len; int e; *delta_ptr = 0; *timestamp = 0; ts = dpage->time_stamp; for (e = 0; e < tail; e += len) { event = (struct ring_buffer_event *)(dpage->data + e); len = rb_event_length(event); if (len <= 0 || len > tail - e) return -1; switch (event->type_len) { case RINGBUF_TYPE_TIME_EXTEND: delta = rb_event_time_stamp(event); ts += delta; break; case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); delta = rb_fix_abs_ts(delta, ts); if (delta < ts) { *delta_ptr = delta; *timestamp = ts; return -1; } ts = delta; break; case RINGBUF_TYPE_PADDING: if (event->time_delta == 1) break; fallthrough; case RINGBUF_TYPE_DATA: events++; ts += event->time_delta; break; default: return -1; } } *timestamp = ts; return events; } static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu) { unsigned long long ts; u64 delta; int tail; tail = local_read(&dpage->commit); return rb_read_data_buffer(dpage, tail, cpu, &ts, &delta); } /* If the meta data has been validated, now validate the events */ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) { struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; struct buffer_page *head_page, *orig_head; unsigned long entry_bytes = 0; unsigned long entries = 0; int ret; u64 ts; int i; if (!meta || !meta->head_buffer) return; orig_head = head_page = cpu_buffer->head_page; /* Do the reader page first */ ret = rb_validate_buffer(cpu_buffer->reader_page->page, cpu_buffer->cpu); if (ret < 0) { pr_info("Ring buffer reader page is invalid\n"); goto invalid; } entries += ret; entry_bytes += local_read(&cpu_buffer->reader_page->page->commit); local_set(&cpu_buffer->reader_page->entries, ret); ts = head_page->page->time_stamp; /* * Try to rewind the head so that we can read the pages which already * read in the previous boot. */ if (head_page == cpu_buffer->tail_page) goto skip_rewind; rb_dec_page(&head_page); for (i = 0; i < meta->nr_subbufs + 1; i++, rb_dec_page(&head_page)) { /* Rewind until tail (writer) page. */ if (head_page == cpu_buffer->tail_page) break; /* Ensure the page has older data than head. */ if (ts < head_page->page->time_stamp) break; ts = head_page->page->time_stamp; /* Ensure the page has correct timestamp and some data. */ if (!ts || rb_page_commit(head_page) == 0) break; /* Stop rewind if the page is invalid. */ ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu); if (ret < 0) break; /* Recover the number of entries and update stats. */ local_set(&head_page->entries, ret); if (ret) local_inc(&cpu_buffer->pages_touched); entries += ret; entry_bytes += rb_page_commit(head_page); } if (i) pr_info("Ring buffer [%d] rewound %d pages\n", cpu_buffer->cpu, i); /* The last rewound page must be skipped. */ if (head_page != orig_head) rb_inc_page(&head_page); /* * If the ring buffer was rewound, then inject the reader page * into the location just before the original head page. */ if (head_page != orig_head) { struct buffer_page *bpage = orig_head; rb_dec_page(&bpage); /* * Insert the reader_page before the original head page. * Since the list encode RB_PAGE flags, general list * operations should be avoided. */ cpu_buffer->reader_page->list.next = &orig_head->list; cpu_buffer->reader_page->list.prev = orig_head->list.prev; orig_head->list.prev = &cpu_buffer->reader_page->list; bpage->list.next = &cpu_buffer->reader_page->list; /* Make the head_page the reader page */ cpu_buffer->reader_page = head_page; bpage = head_page; rb_inc_page(&head_page); head_page->list.prev = bpage->list.prev; rb_dec_page(&bpage); bpage->list.next = &head_page->list; rb_set_list_to_head(&bpage->list); cpu_buffer->pages = &head_page->list; cpu_buffer->head_page = head_page; meta->head_buffer = (unsigned long)head_page->page; /* Reset all the indexes */ bpage = cpu_buffer->reader_page; meta->buffers[0] = rb_meta_subbuf_idx(meta, bpage->page); bpage->id = 0; for (i = 1, bpage = head_page; i < meta->nr_subbufs; i++, rb_inc_page(&bpage)) { meta->buffers[i] = rb_meta_subbuf_idx(meta, bpage->page); bpage->id = i; } /* We'll restart verifying from orig_head */ head_page = orig_head; } skip_rewind: /* If the commit_buffer is the reader page, update the commit page */ if (meta->commit_buffer == (unsigned long)cpu_buffer->reader_page->page) { cpu_buffer->commit_page = cpu_buffer->reader_page; /* Nothing more to do, the only page is the reader page */ goto done; } /* Iterate until finding the commit page */ for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) { /* Reader page has already been done */ if (head_page == cpu_buffer->reader_page) continue; ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu); if (ret < 0) { pr_info("Ring buffer meta [%d] invalid buffer page\n", cpu_buffer->cpu); goto invalid; } /* If the buffer has content, update pages_touched */ if (ret) local_inc(&cpu_buffer->pages_touched); entries += ret; entry_bytes += local_read(&head_page->page->commit); local_set(&head_page->entries, ret); if (head_page == cpu_buffer->commit_page) break; } if (head_page != cpu_buffer->commit_page) { pr_info("Ring buffer meta [%d] commit page not found\n", cpu_buffer->cpu); goto invalid; } done: local_set(&cpu_buffer->entries, entries); local_set(&cpu_buffer->entries_bytes, entry_bytes); pr_info("Ring buffer meta [%d] is from previous boot!\n", cpu_buffer->cpu); return; invalid: /* The content of the buffers are invalid, reset the meta data */ meta->head_buffer = 0; meta->commit_buffer = 0; /* Reset the reader page */ local_set(&cpu_buffer->reader_page->entries, 0); local_set(&cpu_buffer->reader_page->page->commit, 0); /* Reset all the subbuffers */ for (i = 0; i < meta->nr_subbufs - 1; i++, rb_inc_page(&head_page)) { local_set(&head_page->entries, 0); local_set(&head_page->page->commit, 0); } } static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages, int scratch_size) { struct ring_buffer_cpu_meta *meta; unsigned long *subbuf_mask; unsigned long delta; void *subbuf; bool valid = false; int cpu; int i; /* Create a mask to test the subbuf array */ subbuf_mask = bitmap_alloc(nr_pages + 1, GFP_KERNEL); /* If subbuf_mask fails to allocate, then rb_meta_valid() will return false */ if (rb_meta_init(buffer, scratch_size)) valid = true; for (cpu = 0; cpu < nr_cpu_ids; cpu++) { void *next_meta; meta = rb_range_meta(buffer, nr_pages, cpu); if (valid && rb_cpu_meta_valid(meta, cpu, buffer, nr_pages, subbuf_mask)) { /* Make the mappings match the current address */ subbuf = rb_subbufs_from_meta(meta); delta = (unsigned long)subbuf - meta->first_buffer; meta->first_buffer += delta; meta->head_buffer += delta; meta->commit_buffer += delta; continue; } if (cpu < nr_cpu_ids - 1) next_meta = rb_range_meta(buffer, nr_pages, cpu + 1); else next_meta = (void *)buffer->range_addr_end; memset(meta, 0, next_meta - (void *)meta); meta->nr_subbufs = nr_pages + 1; meta->subbuf_size = PAGE_SIZE; subbuf = rb_subbufs_from_meta(meta); meta->first_buffer = (unsigned long)subbuf; /* * The buffers[] array holds the order of the sub-buffers * that are after the meta data. The sub-buffers may * be swapped out when read and inserted into a different * location of the ring buffer. Although their addresses * remain the same, the buffers[] array contains the * index into the sub-buffers holding their actual order. */ for (i = 0; i < meta->nr_subbufs; i++) { meta->buffers[i] = i; rb_init_page(subbuf); subbuf += meta->subbuf_size; } } bitmap_free(subbuf_mask); } static void *rbm_start(struct seq_file *m, loff_t *pos) { struct ring_buffer_per_cpu *cpu_buffer = m->private; struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; unsigned long val; if (!meta) return NULL; if (*pos > meta->nr_subbufs) return NULL; val = *pos; val++; return (void *)val; } static void *rbm_next(struct seq_file *m, void *v, loff_t *pos) { (*pos)++; return rbm_start(m, pos); } static int rbm_show(struct seq_file *m, void *v) { struct ring_buffer_per_cpu *cpu_buffer = m->private; struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; unsigned long val = (unsigned long)v; if (val == 1) { seq_printf(m, "head_buffer: %d\n", rb_meta_subbuf_idx(meta, (void *)meta->head_buffer)); seq_printf(m, "commit_buffer: %d\n", rb_meta_subbuf_idx(meta, (void *)meta->commit_buffer)); seq_printf(m, "subbuf_size: %d\n", meta->subbuf_size); seq_printf(m, "nr_subbufs: %d\n", meta->nr_subbufs); return 0; } val -= 2; seq_printf(m, "buffer[%ld]: %d\n", val, meta->buffers[val]); return 0; } static void rbm_stop(struct seq_file *m, void *p) { } static const struct seq_operations rb_meta_seq_ops = { .start = rbm_start, .next = rbm_next, .show = rbm_show, .stop = rbm_stop, }; int ring_buffer_meta_seq_init(struct file *file, struct trace_buffer *buffer, int cpu) { struct seq_file *m; int ret; ret = seq_open(file, &rb_meta_seq_ops); if (ret) return ret; m = file->private_data; m->private = buffer->buffers[cpu]; return 0; } /* Map the buffer_pages to the previous head and commit pages */ static void rb_meta_buffer_update(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *bpage) { struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; if (meta->head_buffer == (unsigned long)bpage->page) cpu_buffer->head_page = bpage; if (meta->commit_buffer == (unsigned long)bpage->page) { cpu_buffer->commit_page = bpage; cpu_buffer->tail_page = bpage; } } static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, long nr_pages, struct list_head *pages) { struct trace_buffer *buffer = cpu_buffer->buffer; struct ring_buffer_cpu_meta *meta = NULL; struct buffer_page *bpage, *tmp; bool user_thread = current->mm != NULL; long i; /* * Check if the available memory is there first. * Note, si_mem_available() only gives us a rough estimate of available * memory. It may not be accurate. But we don't care, we just want * to prevent doing any allocation when it is obvious that it is * not going to succeed. */ i = si_mem_available(); if (i < nr_pages) return -ENOMEM; /* * If a user thread allocates too much, and si_mem_available() * reports there's enough memory, even though there is not. * Make sure the OOM killer kills this thread. This can happen * even with RETRY_MAYFAIL because another task may be doing * an allocation after this task has taken all memory. * This is the task the OOM killer needs to take out during this * loop, even if it was triggered by an allocation somewhere else. */ if (user_thread) set_current_oom_origin(); if (buffer->range_addr_start) meta = rb_range_meta(buffer, nr_pages, cpu_buffer->cpu); for (i = 0; i < nr_pages; i++) { bpage = alloc_cpu_page(cpu_buffer->cpu); if (!bpage) goto free_pages; rb_check_bpage(cpu_buffer, bpage); /* * Append the pages as for mapped buffers we want to keep * the order */ list_add_tail(&bpage->list, pages); if (meta) { /* A range was given. Use that for the buffer page */ bpage->page = rb_range_buffer(cpu_buffer, i + 1); if (!bpage->page) goto free_pages; /* If this is valid from a previous boot */ if (meta->head_buffer) rb_meta_buffer_update(cpu_buffer, bpage); bpage->range = 1; bpage->id = i + 1; } else { int order = cpu_buffer->buffer->subbuf_order; bpage->page = alloc_cpu_data(cpu_buffer->cpu, order); if (!bpage->page) goto free_pages; } bpage->order = cpu_buffer->buffer->subbuf_order; if (user_thread && fatal_signal_pending(current)) goto free_pages; } if (user_thread) clear_current_oom_origin(); return 0; free_pages: list_for_each_entry_safe(bpage, tmp, pages, list) { list_del_init(&bpage->list); free_buffer_page(bpage); } if (user_thread) clear_current_oom_origin(); return -ENOMEM; } static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) { LIST_HEAD(pages); WARN_ON(!nr_pages); if (__rb_allocate_pages(cpu_buffer, nr_pages, &pages)) return -ENOMEM; /* * The ring buffer page list is a circular list that does not * start and end with a list head. All page list items point to * other pages. */ cpu_buffer->pages = pages.next; list_del(&pages); cpu_buffer->nr_pages = nr_pages; rb_check_pages(cpu_buffer); return 0; } static struct ring_buffer_per_cpu * rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) { struct ring_buffer_per_cpu *cpu_buffer __free(kfree) = alloc_cpu_buffer(cpu); struct ring_buffer_cpu_meta *meta; struct buffer_page *bpage; int ret; if (!cpu_buffer) return NULL; cpu_buffer->cpu = cpu; cpu_buffer->buffer = buffer; raw_spin_lock_init(&cpu_buffer->reader_lock); lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler); init_completion(&cpu_buffer->update_done); init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); init_waitqueue_head(&cpu_buffer->irq_work.waiters); init_waitqueue_head(&cpu_buffer->irq_work.full_waiters); mutex_init(&cpu_buffer->mapping_lock); bpage = alloc_cpu_page(cpu); if (!bpage) return NULL; rb_check_bpage(cpu_buffer, bpage); cpu_buffer->reader_page = bpage; if (buffer->range_addr_start) { /* * Range mapped buffers have the same restrictions as memory * mapped ones do. */ cpu_buffer->mapped = 1; cpu_buffer->ring_meta = rb_range_meta(buffer, nr_pages, cpu); bpage->page = rb_range_buffer(cpu_buffer, 0); if (!bpage->page) goto fail_free_reader; if (cpu_buffer->ring_meta->head_buffer) rb_meta_buffer_update(cpu_buffer, bpage); bpage->range = 1; } else { int order = cpu_buffer->buffer->subbuf_order; bpage->page = alloc_cpu_data(cpu, order); if (!bpage->page) goto fail_free_reader; } INIT_LIST_HEAD(&cpu_buffer->reader_page->list); INIT_LIST_HEAD(&cpu_buffer->new_pages); ret = rb_allocate_pages(cpu_buffer, nr_pages); if (ret < 0) goto fail_free_reader; rb_meta_validate_events(cpu_buffer); /* If the boot meta was valid then this has already been updated */ meta = cpu_buffer->ring_meta; if (!meta || !meta->head_buffer || !cpu_buffer->head_page || !cpu_buffer->commit_page || !cpu_buffer->tail_page) { if (meta && meta->head_buffer && (cpu_buffer->head_page || cpu_buffer->commit_page || cpu_buffer->tail_page)) { pr_warn("Ring buffer meta buffers not all mapped\n"); if (!cpu_buffer->head_page) pr_warn(" Missing head_page\n"); if (!cpu_buffer->commit_page) pr_warn(" Missing commit_page\n"); if (!cpu_buffer->tail_page) pr_warn(" Missing tail_page\n"); } cpu_buffer->head_page = list_entry(cpu_buffer->pages, struct buffer_page, list); cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; rb_head_page_activate(cpu_buffer); if (cpu_buffer->ring_meta) meta->commit_buffer = meta->head_buffer; } else { /* The valid meta buffer still needs to activate the head page */ rb_head_page_activate(cpu_buffer); } return_ptr(cpu_buffer); fail_free_reader: free_buffer_page(cpu_buffer->reader_page); return NULL; } static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) { struct list_head *head = cpu_buffer->pages; struct buffer_page *bpage, *tmp; irq_work_sync(&cpu_buffer->irq_work.work); free_buffer_page(cpu_buffer->reader_page); if (head) { rb_head_page_deactivate(cpu_buffer); list_for_each_entry_safe(bpage, tmp, head, list) { list_del_init(&bpage->list); free_buffer_page(bpage); } bpage = list_entry(head, struct buffer_page, list); free_buffer_page(bpage); } free_page((unsigned long)cpu_buffer->free_page); kfree(cpu_buffer); } static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, int order, unsigned long start, unsigned long end, unsigned long scratch_size, struct lock_class_key *key) { struct trace_buffer *buffer __free(kfree) = NULL; long nr_pages; int subbuf_size; int bsize; int cpu; int ret; /* keep it in its own cache line */ buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), GFP_KERNEL); if (!buffer) return NULL; if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) return NULL; buffer->subbuf_order = order; subbuf_size = (PAGE_SIZE << order); buffer->subbuf_size = subbuf_size - BUF_PAGE_HDR_SIZE; /* Max payload is buffer page size - header (8bytes) */ buffer->max_data_size = buffer->subbuf_size - (sizeof(u32) * 2); buffer->flags = flags; buffer->clock = trace_clock_local; buffer->reader_lock_key = key; init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters); init_waitqueue_head(&buffer->irq_work.waiters); buffer->cpus = nr_cpu_ids; bsize = sizeof(void *) * nr_cpu_ids; buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()), GFP_KERNEL); if (!buffer->buffers) goto fail_free_cpumask; /* If start/end are specified, then that overrides size */ if (start && end) { unsigned long buffers_start; unsigned long ptr; int n; /* Make sure that start is word aligned */ start = ALIGN(start, sizeof(long)); /* scratch_size needs to be aligned too */ scratch_size = ALIGN(scratch_size, sizeof(long)); /* Subtract the buffer meta data and word aligned */ buffers_start = start + sizeof(struct ring_buffer_cpu_meta); buffers_start = ALIGN(buffers_start, sizeof(long)); buffers_start += scratch_size; /* Calculate the size for the per CPU data */ size = end - buffers_start; size = size / nr_cpu_ids; /* * The number of sub-buffers (nr_pages) is determined by the * total size allocated minus the meta data size. * Then that is divided by the number of per CPU buffers * needed, plus account for the integer array index that * will be appended to the meta data. */ nr_pages = (size - sizeof(struct ring_buffer_cpu_meta)) / (subbuf_size + sizeof(int)); /* Need at least two pages plus the reader page */ if (nr_pages < 3) goto fail_free_buffers; again: /* Make sure that the size fits aligned */ for (n = 0, ptr = buffers_start; n < nr_cpu_ids; n++) { ptr += sizeof(struct ring_buffer_cpu_meta) + sizeof(int) * nr_pages; ptr = ALIGN(ptr, subbuf_size); ptr += subbuf_size * nr_pages; } if (ptr > end) { if (nr_pages <= 3) goto fail_free_buffers; nr_pages--; goto again; } /* nr_pages should not count the reader page */ nr_pages--; buffer->range_addr_start = start; buffer->range_addr_end = end; rb_range_meta_init(buffer, nr_pages, scratch_size); } else { /* need at least two pages */ nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size); if (nr_pages < 2) nr_pages = 2; } cpu = raw_smp_processor_id(); cpumask_set_cpu(cpu, buffer->cpumask); buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu); if (!buffer->buffers[cpu]) goto fail_free_buffers; ret = cpuhp_state_add_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); if (ret < 0) goto fail_free_buffers; mutex_init(&buffer->mutex); return_ptr(buffer); fail_free_buffers: for_each_buffer_cpu(buffer, cpu) { if (buffer->buffers[cpu]) rb_free_cpu_buffer(buffer->buffers[cpu]); } kfree(buffer->buffers); fail_free_cpumask: free_cpumask_var(buffer->cpumask); return NULL; } /** * __ring_buffer_alloc - allocate a new ring_buffer * @size: the size in bytes per cpu that is needed. * @flags: attributes to set for the ring buffer. * @key: ring buffer reader_lock_key. * * Currently the only flag that is available is the RB_FL_OVERWRITE * flag. This flag means that the buffer will overwrite old data * when the buffer wraps. If this flag is not set, the buffer will * drop data when the tail hits the head. */ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key) { /* Default buffer page size - one system page */ return alloc_buffer(size, flags, 0, 0, 0, 0, key); } EXPORT_SYMBOL_GPL(__ring_buffer_alloc); /** * __ring_buffer_alloc_range - allocate a new ring_buffer from existing memory * @size: the size in bytes per cpu that is needed. * @flags: attributes to set for the ring buffer. * @order: sub-buffer order * @start: start of allocated range * @range_size: size of allocated range * @scratch_size: size of scratch area (for preallocated memory buffers) * @key: ring buffer reader_lock_key. * * Currently the only flag that is available is the RB_FL_OVERWRITE * flag. This flag means that the buffer will overwrite old data * when the buffer wraps. If this flag is not set, the buffer will * drop data when the tail hits the head. */ struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flags, int order, unsigned long start, unsigned long range_size, unsigned long scratch_size, struct lock_class_key *key) { return alloc_buffer(size, flags, order, start, start + range_size, scratch_size, key); } void *ring_buffer_meta_scratch(struct trace_buffer *buffer, unsigned int *size) { struct ring_buffer_meta *meta; void *ptr; if (!buffer || !buffer->meta) return NULL; meta = buffer->meta; ptr = (void *)ALIGN((unsigned long)meta + sizeof(*meta), sizeof(long)); if (size) *size = (void *)meta + meta->buffers_offset - ptr; return ptr; } /** * ring_buffer_free - free a ring buffer. * @buffer: the buffer to free. */ void ring_buffer_free(struct trace_buffer *buffer) { int cpu; cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); irq_work_sync(&buffer->irq_work.work); for_each_buffer_cpu(buffer, cpu) rb_free_cpu_buffer(buffer->buffers[cpu]); kfree(buffer->buffers); free_cpumask_var(buffer->cpumask); kfree(buffer); } EXPORT_SYMBOL_GPL(ring_buffer_free); void ring_buffer_set_clock(struct trace_buffer *buffer, u64 (*clock)(void)) { buffer->clock = clock; } void ring_buffer_set_time_stamp_abs(struct trace_buffer *buffer, bool abs) { buffer->time_stamp_abs = abs; } bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer) { return buffer->time_stamp_abs; } static inline unsigned long rb_page_entries(struct buffer_page *bpage) { return local_read(&bpage->entries) & RB_WRITE_MASK; } static inline unsigned long rb_page_write(struct buffer_page *bpage) { return local_read(&bpage->write) & RB_WRITE_MASK; } static bool rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) { struct list_head *tail_page, *to_remove, *next_page; struct buffer_page *to_remove_page, *tmp_iter_page; struct buffer_page *last_page, *first_page; unsigned long nr_removed; unsigned long head_bit; int page_entries; head_bit = 0; raw_spin_lock_irq(&cpu_buffer->reader_lock); atomic_inc(&cpu_buffer->record_disabled); /* * We don't race with the readers since we have acquired the reader * lock. We also don't race with writers after disabling recording. * This makes it easy to figure out the first and the last page to be * removed from the list. We unlink all the pages in between including * the first and last pages. This is done in a busy loop so that we * lose the least number of traces. * The pages are freed after we restart recording and unlock readers. */ tail_page = &cpu_buffer->tail_page->list; /* * tail page might be on reader page, we remove the next page * from the ring buffer */ if (cpu_buffer->tail_page == cpu_buffer->reader_page) tail_page = rb_list_head(tail_page->next); to_remove = tail_page; /* start of pages to remove */ first_page = list_entry(rb_list_head(to_remove->next), struct buffer_page, list); for (nr_removed = 0; nr_removed < nr_pages; nr_removed++) { to_remove = rb_list_head(to_remove)->next; head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD; } /* Read iterators need to reset themselves when some pages removed */ cpu_buffer->pages_removed += nr_removed; next_page = rb_list_head(to_remove)->next; /* * Now we remove all pages between tail_page and next_page. * Make sure that we have head_bit value preserved for the * next page */ tail_page->next = (struct list_head *)((unsigned long)next_page | head_bit); next_page = rb_list_head(next_page); next_page->prev = tail_page; /* make sure pages points to a valid page in the ring buffer */ cpu_buffer->pages = next_page; cpu_buffer->cnt++; /* update head page */ if (head_bit) cpu_buffer->head_page = list_entry(next_page, struct buffer_page, list); /* pages are removed, resume tracing and then free the pages */ atomic_dec(&cpu_buffer->record_disabled); raw_spin_unlock_irq(&cpu_buffer->reader_lock); RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)); /* last buffer page to remove */ last_page = list_entry(rb_list_head(to_remove), struct buffer_page, list); tmp_iter_page = first_page; do { cond_resched(); to_remove_page = tmp_iter_page; rb_inc_page(&tmp_iter_page); /* update the counters */ page_entries = rb_page_entries(to_remove_page); if (page_entries) { /* * If something was added to this page, it was full * since it is not the tail page. So we deduct the * bytes consumed in ring buffer from here. * Increment overrun to account for the lost events. */ local_add(page_entries, &cpu_buffer->overrun); local_sub(rb_page_commit(to_remove_page), &cpu_buffer->entries_bytes); local_inc(&cpu_buffer->pages_lost); } /* * We have already removed references to this list item, just * free up the buffer_page and its page */ free_buffer_page(to_remove_page); nr_removed--; } while (to_remove_page != last_page); RB_WARN_ON(cpu_buffer, nr_removed); return nr_removed == 0; } static bool rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) { struct list_head *pages = &cpu_buffer->new_pages; unsigned long flags; bool success; int retries; /* Can be called at early boot up, where interrupts must not been enabled */ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); /* * We are holding the reader lock, so the reader page won't be swapped * in the ring buffer. Now we are racing with the writer trying to * move head page and the tail page. * We are going to adapt the reader page update process where: * 1. We first splice the start and end of list of new pages between * the head page and its previous page. * 2. We cmpxchg the prev_page->next to point from head page to the * start of new pages list. * 3. Finally, we update the head->prev to the end of new list. * * We will try this process 10 times, to make sure that we don't keep * spinning. */ retries = 10; success = false; while (retries--) { struct list_head *head_page, *prev_page; struct list_head *last_page, *first_page; struct list_head *head_page_with_bit; struct buffer_page *hpage = rb_set_head_page(cpu_buffer); if (!hpage) break; head_page = &hpage->list; prev_page = head_page->prev; first_page = pages->next; last_page = pages->prev; head_page_with_bit = (struct list_head *) ((unsigned long)head_page | RB_PAGE_HEAD); last_page->next = head_page_with_bit; first_page->prev = prev_page; /* caution: head_page_with_bit gets updated on cmpxchg failure */ if (try_cmpxchg(&prev_page->next, &head_page_with_bit, first_page)) { /* * yay, we replaced the page pointer to our new list, * now, we just have to update to head page's prev * pointer to point to end of list */ head_page->prev = last_page; cpu_buffer->cnt++; success = true; break; } } if (success) INIT_LIST_HEAD(pages); /* * If we weren't successful in adding in new pages, warn and stop * tracing */ RB_WARN_ON(cpu_buffer, !success); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); /* free pages if they weren't inserted */ if (!success) { struct buffer_page *bpage, *tmp; list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, list) { list_del_init(&bpage->list); free_buffer_page(bpage); } } return success; } static void rb_update_pages(struct ring_buffer_per_cpu *cpu_buffer) { bool success; if (cpu_buffer->nr_pages_to_update > 0) success = rb_insert_pages(cpu_buffer); else success = rb_remove_pages(cpu_buffer, -cpu_buffer->nr_pages_to_update); if (success) cpu_buffer->nr_pages += cpu_buffer->nr_pages_to_update; } static void update_pages_handler(struct work_struct *work) { struct ring_buffer_per_cpu *cpu_buffer = container_of(work, struct ring_buffer_per_cpu, update_pages_work); rb_update_pages(cpu_buffer); complete(&cpu_buffer->update_done); } /** * ring_buffer_resize - resize the ring buffer * @buffer: the buffer to resize. * @size: the new size. * @cpu_id: the cpu buffer to resize * * Minimum size is 2 * buffer->subbuf_size. * * Returns 0 on success and < 0 on failure. */ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, int cpu_id) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long nr_pages; int cpu, err; /* * Always succeed at resizing a non-existent buffer: */ if (!buffer) return 0; /* Make sure the requested buffer exists */ if (cpu_id != RING_BUFFER_ALL_CPUS && !cpumask_test_cpu(cpu_id, buffer->cpumask)) return 0; nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size); /* we need a minimum of two pages */ if (nr_pages < 2) nr_pages = 2; /* * Keep CPUs from coming online while resizing to synchronize * with new per CPU buffers being created. */ guard(cpus_read_lock)(); /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); atomic_inc(&buffer->resizing); if (cpu_id == RING_BUFFER_ALL_CPUS) { /* * Don't succeed if resizing is disabled, as a reader might be * manipulating the ring buffer and is expecting a sane state while * this is true. */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; if (atomic_read(&cpu_buffer->resize_disabled)) { err = -EBUSY; goto out_err_unlock; } } /* calculate the pages to update */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; cpu_buffer->nr_pages_to_update = nr_pages - cpu_buffer->nr_pages; /* * nothing more to do for removing pages or no update */ if (cpu_buffer->nr_pages_to_update <= 0) continue; /* * to add pages, make sure all new pages can be * allocated without receiving ENOMEM */ INIT_LIST_HEAD(&cpu_buffer->new_pages); if (__rb_allocate_pages(cpu_buffer, cpu_buffer->nr_pages_to_update, &cpu_buffer->new_pages)) { /* not enough memory for new pages */ err = -ENOMEM; goto out_err; } cond_resched(); } /* * Fire off all the required work handlers * We can't schedule on offline CPUs, but it's not necessary * since we can change their buffer sizes without any race. */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; if (!cpu_buffer->nr_pages_to_update) continue; /* Can't run something on an offline CPU. */ if (!cpu_online(cpu)) { rb_update_pages(cpu_buffer); cpu_buffer->nr_pages_to_update = 0; } else { /* Run directly if possible. */ migrate_disable(); if (cpu != smp_processor_id()) { migrate_enable(); schedule_work_on(cpu, &cpu_buffer->update_pages_work); } else { update_pages_handler(&cpu_buffer->update_pages_work); migrate_enable(); } } } /* wait for all the updates to complete */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; if (!cpu_buffer->nr_pages_to_update) continue; if (cpu_online(cpu)) wait_for_completion(&cpu_buffer->update_done); cpu_buffer->nr_pages_to_update = 0; } } else { cpu_buffer = buffer->buffers[cpu_id]; if (nr_pages == cpu_buffer->nr_pages) goto out; /* * Don't succeed if resizing is disabled, as a reader might be * manipulating the ring buffer and is expecting a sane state while * this is true. */ if (atomic_read(&cpu_buffer->resize_disabled)) { err = -EBUSY; goto out_err_unlock; } cpu_buffer->nr_pages_to_update = nr_pages - cpu_buffer->nr_pages; INIT_LIST_HEAD(&cpu_buffer->new_pages); if (cpu_buffer->nr_pages_to_update > 0 && __rb_allocate_pages(cpu_buffer, cpu_buffer->nr_pages_to_update, &cpu_buffer->new_pages)) { err = -ENOMEM; goto out_err; } /* Can't run something on an offline CPU. */ if (!cpu_online(cpu_id)) rb_update_pages(cpu_buffer); else { /* Run directly if possible. */ migrate_disable(); if (cpu_id == smp_processor_id()) { rb_update_pages(cpu_buffer); migrate_enable(); } else { migrate_enable(); schedule_work_on(cpu_id, &cpu_buffer->update_pages_work); wait_for_completion(&cpu_buffer->update_done); } } cpu_buffer->nr_pages_to_update = 0; } out: /* * The ring buffer resize can happen with the ring buffer * enabled, so that the update disturbs the tracing as little * as possible. But if the buffer is disabled, we do not need * to worry about that, and we can take the time to verify * that the buffer is not corrupt. */ if (atomic_read(&buffer->record_disabled)) { atomic_inc(&buffer->record_disabled); /* * Even though the buffer was disabled, we must make sure * that it is truly disabled before calling rb_check_pages. * There could have been a race between checking * record_disable and incrementing it. */ synchronize_rcu(); for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; rb_check_pages(cpu_buffer); } atomic_dec(&buffer->record_disabled); } atomic_dec(&buffer->resizing); mutex_unlock(&buffer->mutex); return 0; out_err: for_each_buffer_cpu(buffer, cpu) { struct buffer_page *bpage, *tmp; cpu_buffer = buffer->buffers[cpu]; cpu_buffer->nr_pages_to_update = 0; if (list_empty(&cpu_buffer->new_pages)) continue; list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, list) { list_del_init(&bpage->list); free_buffer_page(bpage); cond_resched(); } } out_err_unlock: atomic_dec(&buffer->resizing); mutex_unlock(&buffer->mutex); return err; } EXPORT_SYMBOL_GPL(ring_buffer_resize); void ring_buffer_change_overwrite(struct trace_buffer *buffer, int val) { mutex_lock(&buffer->mutex); if (val) buffer->flags |= RB_FL_OVERWRITE; else buffer->flags &= ~RB_FL_OVERWRITE; mutex_unlock(&buffer->mutex); } EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite); static __always_inline void *__rb_page_index(struct buffer_page *bpage, unsigned index) { return bpage->page->data + index; } static __always_inline struct ring_buffer_event * rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer) { return __rb_page_index(cpu_buffer->reader_page, cpu_buffer->reader_page->read); } static struct ring_buffer_event * rb_iter_head_event(struct ring_buffer_iter *iter) { struct ring_buffer_event *event; struct buffer_page *iter_head_page = iter->head_page; unsigned long commit; unsigned length; if (iter->head != iter->next_event) return iter->event; /* * When the writer goes across pages, it issues a cmpxchg which * is a mb(), which will synchronize with the rmb here. * (see rb_tail_page_update() and __rb_reserve_next()) */ commit = rb_page_commit(iter_head_page); smp_rmb(); /* An event needs to be at least 8 bytes in size */ if (iter->head > commit - 8) goto reset; event = __rb_page_index(iter_head_page, iter->head); length = rb_event_length(event); /* * READ_ONCE() doesn't work on functions and we don't want the * compiler doing any crazy optimizations with length. */ barrier(); if ((iter->head + length) > commit || length > iter->event_size) /* Writer corrupted the read? */ goto reset; memcpy(iter->event, event, length); /* * If the page stamp is still the same after this rmb() then the * event was safely copied without the writer entering the page. */ smp_rmb(); /* Make sure the page didn't change since we read this */ if (iter->page_stamp != iter_head_page->page->time_stamp || commit > rb_page_commit(iter_head_page)) goto reset; iter->next_event = iter->head + length; return iter->event; reset: /* Reset to the beginning */ iter->page_stamp = iter->read_stamp = iter->head_page->page->time_stamp; iter->head = 0; iter->next_event = 0; iter->missed_events = 1; return NULL; } /* Size is determined by what has been committed */ static __always_inline unsigned rb_page_size(struct buffer_page *bpage) { return rb_page_commit(bpage) & ~RB_MISSED_MASK; } static __always_inline unsigned rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer) { return rb_page_commit(cpu_buffer->commit_page); } static __always_inline unsigned rb_event_index(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { unsigned long addr = (unsigned long)event; addr &= (PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1; return addr - BUF_PAGE_HDR_SIZE; } static void rb_inc_iter(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; /* * The iterator could be on the reader page (it starts there). * But the head could have moved, since the reader was * found. Check for this case and assign the iterator * to the head page instead of next. */ if (iter->head_page == cpu_buffer->reader_page) iter->head_page = rb_set_head_page(cpu_buffer); else rb_inc_page(&iter->head_page); iter->page_stamp = iter->read_stamp = iter->head_page->page->time_stamp; iter->head = 0; iter->next_event = 0; } /* Return the index into the sub-buffers for a given sub-buffer */ static int rb_meta_subbuf_idx(struct ring_buffer_cpu_meta *meta, void *subbuf) { void *subbuf_array; subbuf_array = (void *)meta + sizeof(int) * meta->nr_subbufs; subbuf_array = (void *)ALIGN((unsigned long)subbuf_array, meta->subbuf_size); return (subbuf - subbuf_array) / meta->subbuf_size; } static void rb_update_meta_head(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *next_page) { struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; unsigned long old_head = (unsigned long)next_page->page; unsigned long new_head; rb_inc_page(&next_page); new_head = (unsigned long)next_page->page; /* * Only move it forward once, if something else came in and * moved it forward, then we don't want to touch it. */ (void)cmpxchg(&meta->head_buffer, old_head, new_head); } static void rb_update_meta_reader(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *reader) { struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; void *old_reader = cpu_buffer->reader_page->page; void *new_reader = reader->page; int id; id = reader->id; cpu_buffer->reader_page->id = id; reader->id = 0; meta->buffers[0] = rb_meta_subbuf_idx(meta, new_reader); meta->buffers[id] = rb_meta_subbuf_idx(meta, old_reader); /* The head pointer is the one after the reader */ rb_update_meta_head(cpu_buffer, reader); } /* * rb_handle_head_page - writer hit the head page * * Returns: +1 to retry page * 0 to continue * -1 on error */ static int rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *tail_page, struct buffer_page *next_page) { struct buffer_page *new_head; int entries; int type; int ret; entries = rb_page_entries(next_page); /* * The hard part is here. We need to move the head * forward, and protect against both readers on * other CPUs and writers coming in via interrupts. */ type = rb_head_page_set_update(cpu_buffer, next_page, tail_page, RB_PAGE_HEAD); /* * type can be one of four: * NORMAL - an interrupt already moved it for us * HEAD - we are the first to get here. * UPDATE - we are the interrupt interrupting * a current move. * MOVED - a reader on another CPU moved the next * pointer to its reader page. Give up * and try again. */ switch (type) { case RB_PAGE_HEAD: /* * We changed the head to UPDATE, thus * it is our responsibility to update * the counters. */ local_add(entries, &cpu_buffer->overrun); local_sub(rb_page_commit(next_page), &cpu_buffer->entries_bytes); local_inc(&cpu_buffer->pages_lost); if (cpu_buffer->ring_meta) rb_update_meta_head(cpu_buffer, next_page); /* * The entries will be zeroed out when we move the * tail page. */ /* still more to do */ break; case RB_PAGE_UPDATE: /* * This is an interrupt that interrupt the * previous update. Still more to do. */ break; case RB_PAGE_NORMAL: /* * An interrupt came in before the update * and processed this for us. * Nothing left to do. */ return 1; case RB_PAGE_MOVED: /* * The reader is on another CPU and just did * a swap with our next_page. * Try again. */ return 1; default: RB_WARN_ON(cpu_buffer, 1); /* WTF??? */ return -1; } /* * Now that we are here, the old head pointer is * set to UPDATE. This will keep the reader from * swapping the head page with the reader page. * The reader (on another CPU) will spin till * we are finished. * * We just need to protect against interrupts * doing the job. We will set the next pointer * to HEAD. After that, we set the old pointer * to NORMAL, but only if it was HEAD before. * otherwise we are an interrupt, and only * want the outer most commit to reset it. */ new_head = next_page; rb_inc_page(&new_head); ret = rb_head_page_set_head(cpu_buffer, new_head, next_page, RB_PAGE_NORMAL); /* * Valid returns are: * HEAD - an interrupt came in and already set it. * NORMAL - One of two things: * 1) We really set it. * 2) A bunch of interrupts came in and moved * the page forward again. */ switch (ret) { case RB_PAGE_HEAD: case RB_PAGE_NORMAL: /* OK */ break; default: RB_WARN_ON(cpu_buffer, 1); return -1; } /* * It is possible that an interrupt came in, * set the head up, then more interrupts came in * and moved it again. When we get back here, * the page would have been set to NORMAL but we * just set it back to HEAD. * * How do you detect this? Well, if that happened * the tail page would have moved. */ if (ret == RB_PAGE_NORMAL) { struct buffer_page *buffer_tail_page; buffer_tail_page = READ_ONCE(cpu_buffer->tail_page); /* * If the tail had moved passed next, then we need * to reset the pointer. */ if (buffer_tail_page != tail_page && buffer_tail_page != next_page) rb_head_page_set_normal(cpu_buffer, new_head, next_page, RB_PAGE_HEAD); } /* * If this was the outer most commit (the one that * changed the original pointer from HEAD to UPDATE), * then it is up to us to reset it to NORMAL. */ if (type == RB_PAGE_HEAD) { ret = rb_head_page_set_normal(cpu_buffer, next_page, tail_page, RB_PAGE_UPDATE); if (RB_WARN_ON(cpu_buffer, ret != RB_PAGE_UPDATE)) return -1; } return 0; } static inline void rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, unsigned long tail, struct rb_event_info *info) { unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size); struct buffer_page *tail_page = info->tail_page; struct ring_buffer_event *event; unsigned long length = info->length; /* * Only the event that crossed the page boundary * must fill the old tail_page with padding. */ if (tail >= bsize) { /* * If the page was filled, then we still need * to update the real_end. Reset it to zero * and the reader will ignore it. */ if (tail == bsize) tail_page->real_end = 0; local_sub(length, &tail_page->write); return; } event = __rb_page_index(tail_page, tail); /* * Save the original length to the meta data. * This will be used by the reader to add lost event * counter. */ tail_page->real_end = tail; /* * If this event is bigger than the minimum size, then * we need to be careful that we don't subtract the * write counter enough to allow another writer to slip * in on this page. * We put in a discarded commit instead, to make sure * that this space is not used again, and this space will * not be accounted into 'entries_bytes'. * * If we are less than the minimum size, we don't need to * worry about it. */ if (tail > (bsize - RB_EVNT_MIN_SIZE)) { /* No room for any events */ /* Mark the rest of the page with padding */ rb_event_set_padding(event); /* Make sure the padding is visible before the write update */ smp_wmb(); /* Set the write back to the previous setting */ local_sub(length, &tail_page->write); return; } /* Put in a discarded event */ event->array[0] = (bsize - tail) - RB_EVNT_HDR_SIZE; event->type_len = RINGBUF_TYPE_PADDING; /* time delta must be non zero */ event->time_delta = 1; /* account for padding bytes */ local_add(bsize - tail, &cpu_buffer->entries_bytes); /* Make sure the padding is visible before the tail_page->write update */ smp_wmb(); /* Set write to end of buffer */ length = (tail + length) - bsize; local_sub(length, &tail_page->write); } static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer); /* * This is the slow path, force gcc not to inline it. */ static noinline struct ring_buffer_event * rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, unsigned long tail, struct rb_event_info *info) { struct buffer_page *tail_page = info->tail_page; struct buffer_page *commit_page = cpu_buffer->commit_page; struct trace_buffer *buffer = cpu_buffer->buffer; struct buffer_page *next_page; int ret; next_page = tail_page; rb_inc_page(&next_page); /* * If for some reason, we had an interrupt storm that made * it all the way around the buffer, bail, and warn * about it. */ if (unlikely(next_page == commit_page)) { local_inc(&cpu_buffer->commit_overrun); goto out_reset; } /* * This is where the fun begins! * * We are fighting against races between a reader that * could be on another CPU trying to swap its reader * page with the buffer head. * * We are also fighting against interrupts coming in and * moving the head or tail on us as well. * * If the next page is the head page then we have filled * the buffer, unless the commit page is still on the * reader page. */ if (rb_is_head_page(next_page, &tail_page->list)) { /* * If the commit is not on the reader page, then * move the header page. */ if (!rb_is_reader_page(cpu_buffer->commit_page)) { /* * If we are not in overwrite mode, * this is easy, just stop here. */ if (!(buffer->flags & RB_FL_OVERWRITE)) { local_inc(&cpu_buffer->dropped_events); goto out_reset; } ret = rb_handle_head_page(cpu_buffer, tail_page, next_page); if (ret < 0) goto out_reset; if (ret) goto out_again; } else { /* * We need to be careful here too. The * commit page could still be on the reader * page. We could have a small buffer, and * have filled up the buffer with events * from interrupts and such, and wrapped. * * Note, if the tail page is also on the * reader_page, we let it move out. */ if (unlikely((cpu_buffer->commit_page != cpu_buffer->tail_page) && (cpu_buffer->commit_page == cpu_buffer->reader_page))) { local_inc(&cpu_buffer->commit_overrun); goto out_reset; } } } rb_tail_page_update(cpu_buffer, tail_page, next_page); out_again: rb_reset_tail(cpu_buffer, tail, info); /* Commit what we have for now. */ rb_end_commit(cpu_buffer); /* rb_end_commit() decs committing */ local_inc(&cpu_buffer->committing); /* fail and let the caller try again */ return ERR_PTR(-EAGAIN); out_reset: /* reset write */ rb_reset_tail(cpu_buffer, tail, info); return NULL; } /* Slow path */ static struct ring_buffer_event * rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event, u64 delta, bool abs) { if (abs) event->type_len = RINGBUF_TYPE_TIME_STAMP; else event->type_len = RINGBUF_TYPE_TIME_EXTEND; /* Not the first event on the page, or not delta? */ if (abs || rb_event_index(cpu_buffer, event)) { event->time_delta = delta & TS_MASK; event->array[0] = delta >> TS_SHIFT; } else { /* nope, just zero it */ event->time_delta = 0; event->array[0] = 0; } return skip_time_extend(event); } #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK static inline bool sched_clock_stable(void) { return true; } #endif static void rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info) { u64 write_stamp; WARN_ONCE(1, "Delta way too big! %llu ts=%llu before=%llu after=%llu write stamp=%llu\n%s", (unsigned long long)info->delta, (unsigned long long)info->ts, (unsigned long long)info->before, (unsigned long long)info->after, (unsigned long long)({rb_time_read(&cpu_buffer->write_stamp, &write_stamp); write_stamp;}), sched_clock_stable() ? "" : "If you just came from a suspend/resume,\n" "please switch to the trace global clock:\n" " echo global > /sys/kernel/tracing/trace_clock\n" "or add trace_clock=global to the kernel command line\n"); } static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event **event, struct rb_event_info *info, u64 *delta, unsigned int *length) { bool abs = info->add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE); if (unlikely(info->delta > (1ULL << 59))) { /* * Some timers can use more than 59 bits, and when a timestamp * is added to the buffer, it will lose those bits. */ if (abs && (info->ts & TS_MSB)) { info->delta &= ABS_TS_MASK; /* did the clock go backwards */ } else if (info->before == info->after && info->before > info->ts) { /* not interrupted */ static int once; /* * This is possible with a recalibrating of the TSC. * Do not produce a call stack, but just report it. */ if (!once) { once++; pr_warn("Ring buffer clock went backwards: %llu -> %llu\n", info->before, info->ts); } } else rb_check_timestamp(cpu_buffer, info); if (!abs) info->delta = 0; } *event = rb_add_time_stamp(cpu_buffer, *event, info->delta, abs); *length -= RB_LEN_TIME_EXTEND; *delta = 0; } /** * rb_update_event - update event type and data * @cpu_buffer: The per cpu buffer of the @event * @event: the event to update * @info: The info to update the @event with (contains length and delta) * * Update the type and data fields of the @event. The length * is the actual size that is written to the ring buffer, * and with this, we can determine what to place into the * data field. */ static void rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event, struct rb_event_info *info) { unsigned length = info->length; u64 delta = info->delta; unsigned int nest = local_read(&cpu_buffer->committing) - 1; if (!WARN_ON_ONCE(nest >= MAX_NEST)) cpu_buffer->event_stamp[nest] = info->ts; /* * If we need to add a timestamp, then we * add it to the start of the reserved space. */ if (unlikely(info->add_timestamp)) rb_add_timestamp(cpu_buffer, &event, info, &delta, &length); event->time_delta = delta; length -= RB_EVNT_HDR_SIZE; if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { event->type_len = 0; event->array[0] = length; } else event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); } static unsigned rb_calculate_event_length(unsigned length) { struct ring_buffer_event event; /* Used only for sizeof array */ /* zero length can cause confusions */ if (!length) length++; if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) length += sizeof(event.array[0]); length += RB_EVNT_HDR_SIZE; length = ALIGN(length, RB_ARCH_ALIGNMENT); /* * In case the time delta is larger than the 27 bits for it * in the header, we need to add a timestamp. If another * event comes in when trying to discard this one to increase * the length, then the timestamp will be added in the allocated * space of this event. If length is bigger than the size needed * for the TIME_EXTEND, then padding has to be used. The events * length must be either RB_LEN_TIME_EXTEND, or greater than or equal * to RB_LEN_TIME_EXTEND + 8, as 8 is the minimum size for padding. * As length is a multiple of 4, we only need to worry if it * is 12 (RB_LEN_TIME_EXTEND + 4). */ if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT) length += RB_ALIGNMENT; return length; } static inline bool rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { unsigned long new_index, old_index; struct buffer_page *bpage; unsigned long addr; new_index = rb_event_index(cpu_buffer, event); old_index = new_index + rb_event_ts_length(event); addr = (unsigned long)event; addr &= ~((PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1); bpage = READ_ONCE(cpu_buffer->tail_page); /* * Make sure the tail_page is still the same and * the next write location is the end of this event */ if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { unsigned long write_mask = local_read(&bpage->write) & ~RB_WRITE_MASK; unsigned long event_length = rb_event_length(event); /* * For the before_stamp to be different than the write_stamp * to make sure that the next event adds an absolute * value and does not rely on the saved write stamp, which * is now going to be bogus. * * By setting the before_stamp to zero, the next event * is not going to use the write_stamp and will instead * create an absolute timestamp. This means there's no * reason to update the wirte_stamp! */ rb_time_set(&cpu_buffer->before_stamp, 0); /* * If an event were to come in now, it would see that the * write_stamp and the before_stamp are different, and assume * that this event just added itself before updating * the write stamp. The interrupting event will fix the * write stamp for us, and use an absolute timestamp. */ /* * This is on the tail page. It is possible that * a write could come in and move the tail page * and write to the next page. That is fine * because we just shorten what is on this page. */ old_index += write_mask; new_index += write_mask; /* caution: old_index gets updated on cmpxchg failure */ if (local_try_cmpxchg(&bpage->write, &old_index, new_index)) { /* update counters */ local_sub(event_length, &cpu_buffer->entries_bytes); return true; } } /* could not discard */ return false; } static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) { local_inc(&cpu_buffer->committing); local_inc(&cpu_buffer->commits); } static __always_inline void rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) { unsigned long max_count; /* * We only race with interrupts and NMIs on this CPU. * If we own the commit event, then we can commit * all others that interrupted us, since the interruptions * are in stack format (they finish before they come * back to us). This allows us to do a simple loop to * assign the commit to the tail. */ again: max_count = cpu_buffer->nr_pages * 100; while (cpu_buffer->commit_page != READ_ONCE(cpu_buffer->tail_page)) { if (RB_WARN_ON(cpu_buffer, !(--max_count))) return; if (RB_WARN_ON(cpu_buffer, rb_is_reader_page(cpu_buffer->tail_page))) return; /* * No need for a memory barrier here, as the update * of the tail_page did it for this page. */ local_set(&cpu_buffer->commit_page->page->commit, rb_page_write(cpu_buffer->commit_page)); rb_inc_page(&cpu_buffer->commit_page); if (cpu_buffer->ring_meta) { struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; meta->commit_buffer = (unsigned long)cpu_buffer->commit_page->page; } /* add barrier to keep gcc from optimizing too much */ barrier(); } while (rb_commit_index(cpu_buffer) != rb_page_write(cpu_buffer->commit_page)) { /* Make sure the readers see the content of what is committed. */ smp_wmb(); local_set(&cpu_buffer->commit_page->page->commit, rb_page_write(cpu_buffer->commit_page)); RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->commit_page->page->commit) & ~RB_WRITE_MASK); barrier(); } /* again, keep gcc from optimizing */ barrier(); /* * If an interrupt came in just after the first while loop * and pushed the tail page forward, we will be left with * a dangling commit that will never go forward. */ if (unlikely(cpu_buffer->commit_page != READ_ONCE(cpu_buffer->tail_page))) goto again; } static __always_inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) { unsigned long commits; if (RB_WARN_ON(cpu_buffer, !local_read(&cpu_buffer->committing))) return; again: commits = local_read(&cpu_buffer->commits); /* synchronize with interrupts */ barrier(); if (local_read(&cpu_buffer->committing) == 1) rb_set_commit_to_write(cpu_buffer); local_dec(&cpu_buffer->committing); /* synchronize with interrupts */ barrier(); /* * Need to account for interrupts coming in between the * updating of the commit page and the clearing of the * committing counter. */ if (unlikely(local_read(&cpu_buffer->commits) != commits) && !local_read(&cpu_buffer->committing)) { local_inc(&cpu_buffer->committing); goto again; } } static inline void rb_event_discard(struct ring_buffer_event *event) { if (extended_time(event)) event = skip_time_extend(event); /* array[0] holds the actual length for the discarded event */ event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; event->type_len = RINGBUF_TYPE_PADDING; /* time delta must be non zero */ if (!event->time_delta) event->time_delta = 1; } static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer) { local_inc(&cpu_buffer->entries); rb_end_commit(cpu_buffer); } static bool rb_irq_work_queue(struct rb_irq_work *irq_work) { int cpu; /* irq_work_queue_on() is not NMI-safe */ if (unlikely(in_nmi())) return irq_work_queue(&irq_work->work); /* * If CPU isolation is not active, cpu is always the current * CPU, and the following is equivallent to irq_work_queue(). */ cpu = housekeeping_any_cpu(HK_TYPE_KERNEL_NOISE); return irq_work_queue_on(&irq_work->work, cpu); } static __always_inline void rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) { if (buffer->irq_work.waiters_pending) { buffer->irq_work.waiters_pending = false; /* irq_work_queue() supplies it's own memory barriers */ rb_irq_work_queue(&buffer->irq_work); } if (cpu_buffer->irq_work.waiters_pending) { cpu_buffer->irq_work.waiters_pending = false; /* irq_work_queue() supplies it's own memory barriers */ rb_irq_work_queue(&cpu_buffer->irq_work); } if (cpu_buffer->last_pages_touch == local_read(&cpu_buffer->pages_touched)) return; if (cpu_buffer->reader_page == cpu_buffer->commit_page) return; if (!cpu_buffer->irq_work.full_waiters_pending) return; cpu_buffer->last_pages_touch = local_read(&cpu_buffer->pages_touched); if (!full_hit(buffer, cpu_buffer->cpu, cpu_buffer->shortest_full)) return; cpu_buffer->irq_work.wakeup_full = true; cpu_buffer->irq_work.full_waiters_pending = false; /* irq_work_queue() supplies it's own memory barriers */ rb_irq_work_queue(&cpu_buffer->irq_work); } #ifdef CONFIG_RING_BUFFER_RECORD_RECURSION # define do_ring_buffer_record_recursion() \ do_ftrace_record_recursion(_THIS_IP_, _RET_IP_) #else # define do_ring_buffer_record_recursion() do { } while (0) #endif /* * The lock and unlock are done within a preempt disable section. * The current_context per_cpu variable can only be modified * by the current task between lock and unlock. But it can * be modified more than once via an interrupt. To pass this * information from the lock to the unlock without having to * access the 'in_interrupt()' functions again (which do show * a bit of overhead in something as critical as function tracing, * we use a bitmask trick. * * bit 1 = NMI context * bit 2 = IRQ context * bit 3 = SoftIRQ context * bit 4 = normal context. * * This works because this is the order of contexts that can * preempt other contexts. A SoftIRQ never preempts an IRQ * context. * * When the context is determined, the corresponding bit is * checked and set (if it was set, then a recursion of that context * happened). * * On unlock, we need to clear this bit. To do so, just subtract * 1 from the current_context and AND it to itself. * * (binary) * 101 - 1 = 100 * 101 & 100 = 100 (clearing bit zero) * * 1010 - 1 = 1001 * 1010 & 1001 = 1000 (clearing bit 1) * * The least significant bit can be cleared this way, and it * just so happens that it is the same bit corresponding to * the current context. * * Now the TRANSITION bit breaks the above slightly. The TRANSITION bit * is set when a recursion is detected at the current context, and if * the TRANSITION bit is already set, it will fail the recursion. * This is needed because there's a lag between the changing of * interrupt context and updating the preempt count. In this case, * a false positive will be found. To handle this, one extra recursion * is allowed, and this is done by the TRANSITION bit. If the TRANSITION * bit is already set, then it is considered a recursion and the function * ends. Otherwise, the TRANSITION bit is set, and that bit is returned. * * On the trace_recursive_unlock(), the TRANSITION bit will be the first * to be cleared. Even if it wasn't the context that set it. That is, * if an interrupt comes in while NORMAL bit is set and the ring buffer * is called before preempt_count() is updated, since the check will * be on the NORMAL bit, the TRANSITION bit will then be set. If an * NMI then comes in, it will set the NMI bit, but when the NMI code * does the trace_recursive_unlock() it will clear the TRANSITION bit * and leave the NMI bit set. But this is fine, because the interrupt * code that set the TRANSITION bit will then clear the NMI bit when it * calls trace_recursive_unlock(). If another NMI comes in, it will * set the TRANSITION bit and continue. * * Note: The TRANSITION bit only handles a single transition between context. */ static __always_inline bool trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) { unsigned int val = cpu_buffer->current_context; int bit = interrupt_context_level(); bit = RB_CTX_NORMAL - bit; if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) { /* * It is possible that this was called by transitioning * between interrupt context, and preempt_count() has not * been updated yet. In this case, use the TRANSITION bit. */ bit = RB_CTX_TRANSITION; if (val & (1 << (bit + cpu_buffer->nest))) { do_ring_buffer_record_recursion(); return true; } } val |= (1 << (bit + cpu_buffer->nest)); cpu_buffer->current_context = val; return false; } static __always_inline void trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) { cpu_buffer->current_context &= cpu_buffer->current_context - (1 << cpu_buffer->nest); } /* The recursive locking above uses 5 bits */ #define NESTED_BITS 5 /** * ring_buffer_nest_start - Allow to trace while nested * @buffer: The ring buffer to modify * * The ring buffer has a safety mechanism to prevent recursion. * But there may be a case where a trace needs to be done while * tracing something else. In this case, calling this function * will allow this function to nest within a currently active * ring_buffer_lock_reserve(). * * Call this function before calling another ring_buffer_lock_reserve() and * call ring_buffer_nest_end() after the nested ring_buffer_unlock_commit(). */ void ring_buffer_nest_start(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; int cpu; /* Enabled by ring_buffer_nest_end() */ preempt_disable_notrace(); cpu = raw_smp_processor_id(); cpu_buffer = buffer->buffers[cpu]; /* This is the shift value for the above recursive locking */ cpu_buffer->nest += NESTED_BITS; } /** * ring_buffer_nest_end - Allow to trace while nested * @buffer: The ring buffer to modify * * Must be called after ring_buffer_nest_start() and after the * ring_buffer_unlock_commit(). */ void ring_buffer_nest_end(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; int cpu; /* disabled by ring_buffer_nest_start() */ cpu = raw_smp_processor_id(); cpu_buffer = buffer->buffers[cpu]; /* This is the shift value for the above recursive locking */ cpu_buffer->nest -= NESTED_BITS; preempt_enable_notrace(); } /** * ring_buffer_unlock_commit - commit a reserved * @buffer: The buffer to commit to * * This commits the data to the ring buffer, and releases any locks held. * * Must be paired with ring_buffer_lock_reserve. */ int ring_buffer_unlock_commit(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; int cpu = raw_smp_processor_id(); cpu_buffer = buffer->buffers[cpu]; rb_commit(cpu_buffer); rb_wakeups(buffer, cpu_buffer); trace_recursive_unlock(cpu_buffer); preempt_enable_notrace(); return 0; } EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); /* Special value to validate all deltas on a page. */ #define CHECK_FULL_PAGE 1L #ifdef CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS static const char *show_irq_str(int bits) { static const char * type[] = { ".", // 0 "s", // 1 "h", // 2 "Hs", // 3 "n", // 4 "Ns", // 5 "Nh", // 6 "NHs", // 7 }; return type[bits]; } /* Assume this is a trace event */ static const char *show_flags(struct ring_buffer_event *event) { struct trace_entry *entry; int bits = 0; if (rb_event_data_length(event) - RB_EVNT_HDR_SIZE < sizeof(*entry)) return "X"; entry = ring_buffer_event_data(event); if (entry->flags & TRACE_FLAG_SOFTIRQ) bits |= 1; if (entry->flags & TRACE_FLAG_HARDIRQ) bits |= 2; if (entry->flags & TRACE_FLAG_NMI) bits |= 4; return show_irq_str(bits); } static const char *show_irq(struct ring_buffer_event *event) { struct trace_entry *entry; if (rb_event_data_length(event) - RB_EVNT_HDR_SIZE < sizeof(*entry)) return ""; entry = ring_buffer_event_data(event); if (entry->flags & TRACE_FLAG_IRQS_OFF) return "d"; return ""; } static const char *show_interrupt_level(void) { unsigned long pc = preempt_count(); unsigned char level = 0; if (pc & SOFTIRQ_OFFSET) level |= 1; if (pc & HARDIRQ_MASK) level |= 2; if (pc & NMI_MASK) level |= 4; return show_irq_str(level); } static void dump_buffer_page(struct buffer_data_page *bpage, struct rb_event_info *info, unsigned long tail) { struct ring_buffer_event *event; u64 ts, delta; int e; ts = bpage->time_stamp; pr_warn(" [%lld] PAGE TIME STAMP\n", ts); for (e = 0; e < tail; e += rb_event_length(event)) { event = (struct ring_buffer_event *)(bpage->data + e); switch (event->type_len) { case RINGBUF_TYPE_TIME_EXTEND: delta = rb_event_time_stamp(event); ts += delta; pr_warn(" 0x%x: [%lld] delta:%lld TIME EXTEND\n", e, ts, delta); break; case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); ts = rb_fix_abs_ts(delta, ts); pr_warn(" 0x%x: [%lld] absolute:%lld TIME STAMP\n", e, ts, delta); break; case RINGBUF_TYPE_PADDING: ts += event->time_delta; pr_warn(" 0x%x: [%lld] delta:%d PADDING\n", e, ts, event->time_delta); break; case RINGBUF_TYPE_DATA: ts += event->time_delta; pr_warn(" 0x%x: [%lld] delta:%d %s%s\n", e, ts, event->time_delta, show_flags(event), show_irq(event)); break; default: break; } } pr_warn("expected end:0x%lx last event actually ended at:0x%x\n", tail, e); } static DEFINE_PER_CPU(atomic_t, checking); static atomic_t ts_dump; #define buffer_warn_return(fmt, ...) \ do { \ /* If another report is happening, ignore this one */ \ if (atomic_inc_return(&ts_dump) != 1) { \ atomic_dec(&ts_dump); \ goto out; \ } \ atomic_inc(&cpu_buffer->record_disabled); \ pr_warn(fmt, ##__VA_ARGS__); \ dump_buffer_page(bpage, info, tail); \ atomic_dec(&ts_dump); \ /* There's some cases in boot up that this can happen */ \ if (WARN_ON_ONCE(system_state != SYSTEM_BOOTING)) \ /* Do not re-enable checking */ \ return; \ } while (0) /* * Check if the current event time stamp matches the deltas on * the buffer page. */ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info, unsigned long tail) { struct buffer_data_page *bpage; u64 ts, delta; bool full = false; int ret; bpage = info->tail_page->page; if (tail == CHECK_FULL_PAGE) { full = true; tail = local_read(&bpage->commit); } else if (info->add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)) { /* Ignore events with absolute time stamps */ return; } /* * Do not check the first event (skip possible extends too). * Also do not check if previous events have not been committed. */ if (tail <= 8 || tail > local_read(&bpage->commit)) return; /* * If this interrupted another event, */ if (atomic_inc_return(this_cpu_ptr(&checking)) != 1) goto out; ret = rb_read_data_buffer(bpage, tail, cpu_buffer->cpu, &ts, &delta); if (ret < 0) { if (delta < ts) { buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n", cpu_buffer->cpu, ts, delta); goto out; } } if ((full && ts > info->ts) || (!full && ts + info->delta != info->ts)) { buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\n", cpu_buffer->cpu, ts + info->delta, info->ts, info->delta, info->before, info->after, full ? " (full)" : "", show_interrupt_level()); } out: atomic_dec(this_cpu_ptr(&checking)); } #else static inline void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info, unsigned long tail) { } #endif /* CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS */ static struct ring_buffer_event * __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info) { struct ring_buffer_event *event; struct buffer_page *tail_page; unsigned long tail, write, w; /* Don't let the compiler play games with cpu_buffer->tail_page */ tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page); /*A*/ w = local_read(&tail_page->write) & RB_WRITE_MASK; barrier(); rb_time_read(&cpu_buffer->before_stamp, &info->before); rb_time_read(&cpu_buffer->write_stamp, &info->after); barrier(); info->ts = rb_time_stamp(cpu_buffer->buffer); if ((info->add_timestamp & RB_ADD_STAMP_ABSOLUTE)) { info->delta = info->ts; } else { /* * If interrupting an event time update, we may need an * absolute timestamp. * Don't bother if this is the start of a new page (w == 0). */ if (!w) { /* Use the sub-buffer timestamp */ info->delta = 0; } else if (unlikely(info->before != info->after)) { info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND; info->length += RB_LEN_TIME_EXTEND; } else { info->delta = info->ts - info->after; if (unlikely(test_time_stamp(info->delta))) { info->add_timestamp |= RB_ADD_STAMP_EXTEND; info->length += RB_LEN_TIME_EXTEND; } } } /*B*/ rb_time_set(&cpu_buffer->before_stamp, info->ts); /*C*/ write = local_add_return(info->length, &tail_page->write); /* set write to only the index of the write */ write &= RB_WRITE_MASK; tail = write - info->length; /* See if we shot pass the end of this buffer page */ if (unlikely(write > cpu_buffer->buffer->subbuf_size)) { check_buffer(cpu_buffer, info, CHECK_FULL_PAGE); return rb_move_tail(cpu_buffer, tail, info); } if (likely(tail == w)) { /* Nothing interrupted us between A and C */ /*D*/ rb_time_set(&cpu_buffer->write_stamp, info->ts); /* * If something came in between C and D, the write stamp * may now not be in sync. But that's fine as the before_stamp * will be different and then next event will just be forced * to use an absolute timestamp. */ if (likely(!(info->add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)))) /* This did not interrupt any time update */ info->delta = info->ts - info->after; else /* Just use full timestamp for interrupting event */ info->delta = info->ts; check_buffer(cpu_buffer, info, tail); } else { u64 ts; /* SLOW PATH - Interrupted between A and C */ /* Save the old before_stamp */ rb_time_read(&cpu_buffer->before_stamp, &info->before); /* * Read a new timestamp and update the before_stamp to make * the next event after this one force using an absolute * timestamp. This is in case an interrupt were to come in * between E and F. */ ts = rb_time_stamp(cpu_buffer->buffer); rb_time_set(&cpu_buffer->before_stamp, ts); barrier(); /*E*/ rb_time_read(&cpu_buffer->write_stamp, &info->after); barrier(); /*F*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) && info->after == info->before && info->after < ts) { /* * Nothing came after this event between C and F, it is * safe to use info->after for the delta as it * matched info->before and is still valid. */ info->delta = ts - info->after; } else { /* * Interrupted between C and F: * Lost the previous events time stamp. Just set the * delta to zero, and this will be the same time as * the event this event interrupted. And the events that * came after this will still be correct (as they would * have built their delta on the previous event. */ info->delta = 0; } info->ts = ts; info->add_timestamp &= ~RB_ADD_STAMP_FORCE; } /* * If this is the first commit on the page, then it has the same * timestamp as the page itself. */ if (unlikely(!tail && !(info->add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)))) info->delta = 0; /* We reserved something on the buffer */ event = __rb_page_index(tail_page, tail); rb_update_event(cpu_buffer, event, info); local_inc(&tail_page->entries); /* * If this is the first commit on the page, then update * its timestamp. */ if (unlikely(!tail)) tail_page->page->time_stamp = info->ts; /* account for these added bytes */ local_add(info->length, &cpu_buffer->entries_bytes); return event; } static __always_inline struct ring_buffer_event * rb_reserve_next_event(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer, unsigned long length) { struct ring_buffer_event *event; struct rb_event_info info; int nr_loops = 0; int add_ts_default; /* * ring buffer does cmpxchg as well as atomic64 operations * (which some archs use locking for atomic64), make sure this * is safe in NMI context */ if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) || IS_ENABLED(CONFIG_GENERIC_ATOMIC64)) && (unlikely(in_nmi()))) { return NULL; } rb_start_commit(cpu_buffer); /* The commit page can not change after this */ #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP /* * Due to the ability to swap a cpu buffer from a buffer * it is possible it was swapped before we committed. * (committing stops a swap). We check for it here and * if it happened, we have to fail the write. */ barrier(); if (unlikely(READ_ONCE(cpu_buffer->buffer) != buffer)) { local_dec(&cpu_buffer->committing); local_dec(&cpu_buffer->commits); return NULL; } #endif info.length = rb_calculate_event_length(length); if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) { add_ts_default = RB_ADD_STAMP_ABSOLUTE; info.length += RB_LEN_TIME_EXTEND; if (info.length > cpu_buffer->buffer->max_data_size) goto out_fail; } else { add_ts_default = RB_ADD_STAMP_NONE; } again: info.add_timestamp = add_ts_default; info.delta = 0; /* * We allow for interrupts to reenter here and do a trace. * If one does, it will cause this original code to loop * back here. Even with heavy interrupts happening, this * should only happen a few times in a row. If this happens * 1000 times in a row, there must be either an interrupt * storm or we have something buggy. * Bail! */ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) goto out_fail; event = __rb_reserve_next(cpu_buffer, &info); if (unlikely(PTR_ERR(event) == -EAGAIN)) { if (info.add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND)) info.length -= RB_LEN_TIME_EXTEND; goto again; } if (likely(event)) return event; out_fail: rb_end_commit(cpu_buffer); return NULL; } /** * ring_buffer_lock_reserve - reserve a part of the buffer * @buffer: the ring buffer to reserve from * @length: the length of the data to reserve (excluding event header) * * Returns a reserved event on the ring buffer to copy directly to. * The user of this interface will need to get the body to write into * and can use the ring_buffer_event_data() interface. * * The length is the length of the data needed, not the event length * which also includes the event header. * * Must be paired with ring_buffer_unlock_commit, unless NULL is returned. * If NULL is returned, then nothing has been allocated or locked. */ struct ring_buffer_event * ring_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long length) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; int cpu; /* If we are tracing schedule, we don't want to recurse */ preempt_disable_notrace(); if (unlikely(atomic_read(&buffer->record_disabled))) goto out; cpu = raw_smp_processor_id(); if (unlikely(!cpumask_test_cpu(cpu, buffer->cpumask))) goto out; cpu_buffer = buffer->buffers[cpu]; if (unlikely(atomic_read(&cpu_buffer->record_disabled))) goto out; if (unlikely(length > buffer->max_data_size)) goto out; if (unlikely(trace_recursive_lock(cpu_buffer))) goto out; event = rb_reserve_next_event(buffer, cpu_buffer, length); if (!event) goto out_unlock; return event; out_unlock: trace_recursive_unlock(cpu_buffer); out: preempt_enable_notrace(); return NULL; } EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); /* * Decrement the entries to the page that an event is on. * The event does not even need to exist, only the pointer * to the page it is on. This may only be called before the commit * takes place. */ static inline void rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { unsigned long addr = (unsigned long)event; struct buffer_page *bpage = cpu_buffer->commit_page; struct buffer_page *start; addr &= ~((PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1); /* Do the likely case first */ if (likely(bpage->page == (void *)addr)) { local_dec(&bpage->entries); return; } /* * Because the commit page may be on the reader page we * start with the next page and check the end loop there. */ rb_inc_page(&bpage); start = bpage; do { if (bpage->page == (void *)addr) { local_dec(&bpage->entries); return; } rb_inc_page(&bpage); } while (bpage != start); /* commit not part of this buffer?? */ RB_WARN_ON(cpu_buffer, 1); } /** * ring_buffer_discard_commit - discard an event that has not been committed * @buffer: the ring buffer * @event: non committed event to discard * * Sometimes an event that is in the ring buffer needs to be ignored. * This function lets the user discard an event in the ring buffer * and then that event will not be read later. * * This function only works if it is called before the item has been * committed. It will try to free the event from the ring buffer * if another event has not been added behind it. * * If another event has been added behind it, it will set the event * up as discarded, and perform the commit. * * If this function is called, do not call ring_buffer_unlock_commit on * the event. */ void ring_buffer_discard_commit(struct trace_buffer *buffer, struct ring_buffer_event *event) { struct ring_buffer_per_cpu *cpu_buffer; int cpu; /* The event is discarded regardless */ rb_event_discard(event); cpu = smp_processor_id(); cpu_buffer = buffer->buffers[cpu]; /* * This must only be called if the event has not been * committed yet. Thus we can assume that preemption * is still disabled. */ RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); rb_decrement_entry(cpu_buffer, event); rb_try_to_discard(cpu_buffer, event); rb_end_commit(cpu_buffer); trace_recursive_unlock(cpu_buffer); preempt_enable_notrace(); } EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); /** * ring_buffer_write - write data to the buffer without reserving * @buffer: The ring buffer to write to. * @length: The length of the data being written (excluding the event header) * @data: The data to write to the buffer. * * This is like ring_buffer_lock_reserve and ring_buffer_unlock_commit as * one function. If you already have the data to write to the buffer, it * may be easier to simply call this function. * * Note, like ring_buffer_lock_reserve, the length is the length of the data * and not the length of the event which would hold the header. */ int ring_buffer_write(struct trace_buffer *buffer, unsigned long length, void *data) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; void *body; int ret = -EBUSY; int cpu; guard(preempt_notrace)(); if (atomic_read(&buffer->record_disabled)) return -EBUSY; cpu = raw_smp_processor_id(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -EBUSY; cpu_buffer = buffer->buffers[cpu]; if (atomic_read(&cpu_buffer->record_disabled)) return -EBUSY; if (length > buffer->max_data_size) return -EBUSY; if (unlikely(trace_recursive_lock(cpu_buffer))) return -EBUSY; event = rb_reserve_next_event(buffer, cpu_buffer, length); if (!event) goto out_unlock; body = rb_event_data(event); memcpy(body, data, length); rb_commit(cpu_buffer); rb_wakeups(buffer, cpu_buffer); ret = 0; out_unlock: trace_recursive_unlock(cpu_buffer); return ret; } EXPORT_SYMBOL_GPL(ring_buffer_write); /* * The total entries in the ring buffer is the running counter * of entries entered into the ring buffer, minus the sum of * the entries read from the ring buffer and the number of * entries that were overwritten. */ static inline unsigned long rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) { return local_read(&cpu_buffer->entries) - (local_read(&cpu_buffer->overrun) + cpu_buffer->read); } static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) { return !rb_num_of_entries(cpu_buffer); } /** * ring_buffer_record_disable - stop all writes into the buffer * @buffer: The ring buffer to stop writes to. * * This prevents all writes to the buffer. Any attempt to write * to the buffer after this will fail and return NULL. * * The caller should call synchronize_rcu() after this. */ void ring_buffer_record_disable(struct trace_buffer *buffer) { atomic_inc(&buffer->record_disabled); } EXPORT_SYMBOL_GPL(ring_buffer_record_disable); /** * ring_buffer_record_enable - enable writes to the buffer * @buffer: The ring buffer to enable writes * * Note, multiple disables will need the same number of enables * to truly enable the writing (much like preempt_disable). */ void ring_buffer_record_enable(struct trace_buffer *buffer) { atomic_dec(&buffer->record_disabled); } EXPORT_SYMBOL_GPL(ring_buffer_record_enable); /** * ring_buffer_record_off - stop all writes into the buffer * @buffer: The ring buffer to stop writes to. * * This prevents all writes to the buffer. Any attempt to write * to the buffer after this will fail and return NULL. * * This is different than ring_buffer_record_disable() as * it works like an on/off switch, where as the disable() version * must be paired with a enable(). */ void ring_buffer_record_off(struct trace_buffer *buffer) { unsigned int rd; unsigned int new_rd; rd = atomic_read(&buffer->record_disabled); do { new_rd = rd | RB_BUFFER_OFF; } while (!atomic_try_cmpxchg(&buffer->record_disabled, &rd, new_rd)); } EXPORT_SYMBOL_GPL(ring_buffer_record_off); /** * ring_buffer_record_on - restart writes into the buffer * @buffer: The ring buffer to start writes to. * * This enables all writes to the buffer that was disabled by * ring_buffer_record_off(). * * This is different than ring_buffer_record_enable() as * it works like an on/off switch, where as the enable() version * must be paired with a disable(). */ void ring_buffer_record_on(struct trace_buffer *buffer) { unsigned int rd; unsigned int new_rd; rd = atomic_read(&buffer->record_disabled); do { new_rd = rd & ~RB_BUFFER_OFF; } while (!atomic_try_cmpxchg(&buffer->record_disabled, &rd, new_rd)); } EXPORT_SYMBOL_GPL(ring_buffer_record_on); /** * ring_buffer_record_is_on - return true if the ring buffer can write * @buffer: The ring buffer to see if write is enabled * * Returns true if the ring buffer is in a state that it accepts writes. */ bool ring_buffer_record_is_on(struct trace_buffer *buffer) { return !atomic_read(&buffer->record_disabled); } /** * ring_buffer_record_is_set_on - return true if the ring buffer is set writable * @buffer: The ring buffer to see if write is set enabled * * Returns true if the ring buffer is set writable by ring_buffer_record_on(). * Note that this does NOT mean it is in a writable state. * * It may return true when the ring buffer has been disabled by * ring_buffer_record_disable(), as that is a temporary disabling of * the ring buffer. */ bool ring_buffer_record_is_set_on(struct trace_buffer *buffer) { return !(atomic_read(&buffer->record_disabled) & RB_BUFFER_OFF); } /** * ring_buffer_record_is_on_cpu - return true if the ring buffer can write * @buffer: The ring buffer to see if write is enabled * @cpu: The CPU to test if the ring buffer can write too * * Returns true if the ring buffer is in a state that it accepts writes * for a particular CPU. */ bool ring_buffer_record_is_on_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; cpu_buffer = buffer->buffers[cpu]; return ring_buffer_record_is_set_on(buffer) && !atomic_read(&cpu_buffer->record_disabled); } /** * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer * @buffer: The ring buffer to stop writes to. * @cpu: The CPU buffer to stop * * This prevents all writes to the buffer. Any attempt to write * to the buffer after this will fail and return NULL. * * The caller should call synchronize_rcu() after this. */ void ring_buffer_record_disable_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; cpu_buffer = buffer->buffers[cpu]; atomic_inc(&cpu_buffer->record_disabled); } EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu); /** * ring_buffer_record_enable_cpu - enable writes to the buffer * @buffer: The ring buffer to enable writes * @cpu: The CPU to enable. * * Note, multiple disables will need the same number of enables * to truly enable the writing (much like preempt_disable). */ void ring_buffer_record_enable_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; cpu_buffer = buffer->buffers[cpu]; atomic_dec(&cpu_buffer->record_disabled); } EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); /** * ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer * @buffer: The ring buffer * @cpu: The per CPU buffer to read from. */ u64 ring_buffer_oldest_event_ts(struct trace_buffer *buffer, int cpu) { unsigned long flags; struct ring_buffer_per_cpu *cpu_buffer; struct buffer_page *bpage; u64 ret = 0; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); /* * if the tail is on reader_page, oldest time stamp is on the reader * page */ if (cpu_buffer->tail_page == cpu_buffer->reader_page) bpage = cpu_buffer->reader_page; else bpage = rb_set_head_page(cpu_buffer); if (bpage) ret = bpage->page->time_stamp; raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); return ret; } EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts); /** * ring_buffer_bytes_cpu - get the number of bytes unconsumed in a cpu buffer * @buffer: The ring buffer * @cpu: The per CPU buffer to read from. */ unsigned long ring_buffer_bytes_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; ret = local_read(&cpu_buffer->entries_bytes) - cpu_buffer->read_bytes; return ret; } EXPORT_SYMBOL_GPL(ring_buffer_bytes_cpu); /** * ring_buffer_entries_cpu - get the number of entries in a cpu buffer * @buffer: The ring buffer * @cpu: The per CPU buffer to get the entries from. */ unsigned long ring_buffer_entries_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; return rb_num_of_entries(cpu_buffer); } EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); /** * ring_buffer_overrun_cpu - get the number of overruns caused by the ring * buffer wrapping around (only if RB_FL_OVERWRITE is on). * @buffer: The ring buffer * @cpu: The per CPU buffer to get the number of overruns from */ unsigned long ring_buffer_overrun_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; ret = local_read(&cpu_buffer->overrun); return ret; } EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); /** * ring_buffer_commit_overrun_cpu - get the number of overruns caused by * commits failing due to the buffer wrapping around while there are uncommitted * events, such as during an interrupt storm. * @buffer: The ring buffer * @cpu: The per CPU buffer to get the number of overruns from */ unsigned long ring_buffer_commit_overrun_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; ret = local_read(&cpu_buffer->commit_overrun); return ret; } EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu); /** * ring_buffer_dropped_events_cpu - get the number of dropped events caused by * the ring buffer filling up (only if RB_FL_OVERWRITE is off). * @buffer: The ring buffer * @cpu: The per CPU buffer to get the number of overruns from */ unsigned long ring_buffer_dropped_events_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; ret = local_read(&cpu_buffer->dropped_events); return ret; } EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu); /** * ring_buffer_read_events_cpu - get the number of events successfully read * @buffer: The ring buffer * @cpu: The per CPU buffer to get the number of events read */ unsigned long ring_buffer_read_events_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; return cpu_buffer->read; } EXPORT_SYMBOL_GPL(ring_buffer_read_events_cpu); /** * ring_buffer_entries - get the number of entries in a buffer * @buffer: The ring buffer * * Returns the total number of entries in the ring buffer * (all CPU entries) */ unsigned long ring_buffer_entries(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long entries = 0; int cpu; /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; entries += rb_num_of_entries(cpu_buffer); } return entries; } EXPORT_SYMBOL_GPL(ring_buffer_entries); /** * ring_buffer_overruns - get the number of overruns in buffer * @buffer: The ring buffer * * Returns the total number of overruns in the ring buffer * (all CPU entries) */ unsigned long ring_buffer_overruns(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long overruns = 0; int cpu; /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; overruns += local_read(&cpu_buffer->overrun); } return overruns; } EXPORT_SYMBOL_GPL(ring_buffer_overruns); static void rb_iter_reset(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; /* Iterator usage is expected to have record disabled */ iter->head_page = cpu_buffer->reader_page; iter->head = cpu_buffer->reader_page->read; iter->next_event = iter->head; iter->cache_reader_page = iter->head_page; iter->cache_read = cpu_buffer->read; iter->cache_pages_removed = cpu_buffer->pages_removed; if (iter->head) { iter->read_stamp = cpu_buffer->read_stamp; iter->page_stamp = cpu_buffer->reader_page->page->time_stamp; } else { iter->read_stamp = iter->head_page->page->time_stamp; iter->page_stamp = iter->read_stamp; } } /** * ring_buffer_iter_reset - reset an iterator * @iter: The iterator to reset * * Resets the iterator, so that it will start from the beginning * again. */ void ring_buffer_iter_reset(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; if (!iter) return; cpu_buffer = iter->cpu_buffer; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); rb_iter_reset(iter); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } EXPORT_SYMBOL_GPL(ring_buffer_iter_reset); /** * ring_buffer_iter_empty - check if an iterator has no more to read * @iter: The iterator to check */ int ring_buffer_iter_empty(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer; struct buffer_page *reader; struct buffer_page *head_page; struct buffer_page *commit_page; struct buffer_page *curr_commit_page; unsigned commit; u64 curr_commit_ts; u64 commit_ts; cpu_buffer = iter->cpu_buffer; reader = cpu_buffer->reader_page; head_page = cpu_buffer->head_page; commit_page = READ_ONCE(cpu_buffer->commit_page); commit_ts = commit_page->page->time_stamp; /* * When the writer goes across pages, it issues a cmpxchg which * is a mb(), which will synchronize with the rmb here. * (see rb_tail_page_update()) */ smp_rmb(); commit = rb_page_commit(commit_page); /* We want to make sure that the commit page doesn't change */ smp_rmb(); /* Make sure commit page didn't change */ curr_commit_page = READ_ONCE(cpu_buffer->commit_page); curr_commit_ts = READ_ONCE(curr_commit_page->page->time_stamp); /* If the commit page changed, then there's more data */ if (curr_commit_page != commit_page || curr_commit_ts != commit_ts) return 0; /* Still racy, as it may return a false positive, but that's OK */ return ((iter->head_page == commit_page && iter->head >= commit) || (iter->head_page == reader && commit_page == head_page && head_page->read == commit && iter->head == rb_page_size(cpu_buffer->reader_page))); } EXPORT_SYMBOL_GPL(ring_buffer_iter_empty); static void rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { u64 delta; switch (event->type_len) { case RINGBUF_TYPE_PADDING: return; case RINGBUF_TYPE_TIME_EXTEND: delta = rb_event_time_stamp(event); cpu_buffer->read_stamp += delta; return; case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); delta = rb_fix_abs_ts(delta, cpu_buffer->read_stamp); cpu_buffer->read_stamp = delta; return; case RINGBUF_TYPE_DATA: cpu_buffer->read_stamp += event->time_delta; return; default: RB_WARN_ON(cpu_buffer, 1); } } static void rb_update_iter_read_stamp(struct ring_buffer_iter *iter, struct ring_buffer_event *event) { u64 delta; switch (event->type_len) { case RINGBUF_TYPE_PADDING: return; case RINGBUF_TYPE_TIME_EXTEND: delta = rb_event_time_stamp(event); iter->read_stamp += delta; return; case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); delta = rb_fix_abs_ts(delta, iter->read_stamp); iter->read_stamp = delta; return; case RINGBUF_TYPE_DATA: iter->read_stamp += event->time_delta; return; default: RB_WARN_ON(iter->cpu_buffer, 1); } } static struct buffer_page * rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = NULL; unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size); unsigned long overwrite; unsigned long flags; int nr_loops = 0; bool ret; local_irq_save(flags); arch_spin_lock(&cpu_buffer->lock); again: /* * This should normally only loop twice. But because the * start of the reader inserts an empty page, it causes * a case where we will loop three times. There should be no * reason to loop four times (that I know of). */ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) { reader = NULL; goto out; } reader = cpu_buffer->reader_page; /* If there's more to read, return this page */ if (cpu_buffer->reader_page->read < rb_page_size(reader)) goto out; /* Never should we have an index greater than the size */ if (RB_WARN_ON(cpu_buffer, cpu_buffer->reader_page->read > rb_page_size(reader))) goto out; /* check if we caught up to the tail */ reader = NULL; if (cpu_buffer->commit_page == cpu_buffer->reader_page) goto out; /* Don't bother swapping if the ring buffer is empty */ if (rb_num_of_entries(cpu_buffer) == 0) goto out; /* * Reset the reader page to size zero. */ local_set(&cpu_buffer->reader_page->write, 0); local_set(&cpu_buffer->reader_page->entries, 0); cpu_buffer->reader_page->real_end = 0; spin: /* * Splice the empty reader page into the list around the head. */ reader = rb_set_head_page(cpu_buffer); if (!reader) goto out; cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next); cpu_buffer->reader_page->list.prev = reader->list.prev; /* * cpu_buffer->pages just needs to point to the buffer, it * has no specific buffer page to point to. Lets move it out * of our way so we don't accidentally swap it. */ cpu_buffer->pages = reader->list.prev; /* The reader page will be pointing to the new head */ rb_set_list_to_head(&cpu_buffer->reader_page->list); /* * We want to make sure we read the overruns after we set up our * pointers to the next object. The writer side does a * cmpxchg to cross pages which acts as the mb on the writer * side. Note, the reader will constantly fail the swap * while the writer is updating the pointers, so this * guarantees that the overwrite recorded here is the one we * want to compare with the last_overrun. */ smp_mb(); overwrite = local_read(&(cpu_buffer->overrun)); /* * Here's the tricky part. * * We need to move the pointer past the header page. * But we can only do that if a writer is not currently * moving it. The page before the header page has the * flag bit '1' set if it is pointing to the page we want. * but if the writer is in the process of moving it * then it will be '2' or already moved '0'. */ ret = rb_head_page_replace(reader, cpu_buffer->reader_page); /* * If we did not convert it, then we must try again. */ if (!ret) goto spin; if (cpu_buffer->ring_meta) rb_update_meta_reader(cpu_buffer, reader); /* * Yay! We succeeded in replacing the page. * * Now make the new head point back to the reader page. */ rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list; rb_inc_page(&cpu_buffer->head_page); cpu_buffer->cnt++; local_inc(&cpu_buffer->pages_read); /* Finally update the reader page to the new head */ cpu_buffer->reader_page = reader; cpu_buffer->reader_page->read = 0; if (overwrite != cpu_buffer->last_overrun) { cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun; cpu_buffer->last_overrun = overwrite; } goto again; out: /* Update the read_stamp on the first event */ if (reader && reader->read == 0) cpu_buffer->read_stamp = reader->page->time_stamp; arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); /* * The writer has preempt disable, wait for it. But not forever * Although, 1 second is pretty much "forever" */ #define USECS_WAIT 1000000 for (nr_loops = 0; nr_loops < USECS_WAIT; nr_loops++) { /* If the write is past the end of page, a writer is still updating it */ if (likely(!reader || rb_page_write(reader) <= bsize)) break; udelay(1); /* Get the latest version of the reader write value */ smp_rmb(); } /* The writer is not moving forward? Something is wrong */ if (RB_WARN_ON(cpu_buffer, nr_loops == USECS_WAIT)) reader = NULL; /* * Make sure we see any padding after the write update * (see rb_reset_tail()). * * In addition, a writer may be writing on the reader page * if the page has not been fully filled, so the read barrier * is also needed to make sure we see the content of what is * committed by the writer (see rb_set_commit_to_write()). */ smp_rmb(); return reader; } static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) { struct ring_buffer_event *event; struct buffer_page *reader; unsigned length; reader = rb_get_reader_page(cpu_buffer); /* This function should not be called when buffer is empty */ if (RB_WARN_ON(cpu_buffer, !reader)) return; event = rb_reader_event(cpu_buffer); if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX) cpu_buffer->read++; rb_update_read_stamp(cpu_buffer, event); length = rb_event_length(event); cpu_buffer->reader_page->read += length; cpu_buffer->read_bytes += length; } static void rb_advance_iter(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer; cpu_buffer = iter->cpu_buffer; /* If head == next_event then we need to jump to the next event */ if (iter->head == iter->next_event) { /* If the event gets overwritten again, there's nothing to do */ if (rb_iter_head_event(iter) == NULL) return; } iter->head = iter->next_event; /* * Check if we are at the end of the buffer. */ if (iter->next_event >= rb_page_size(iter->head_page)) { /* discarded commits can make the page empty */ if (iter->head_page == cpu_buffer->commit_page) return; rb_inc_iter(iter); return; } rb_update_iter_read_stamp(iter, iter->event); } static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer) { return cpu_buffer->lost_events; } static struct ring_buffer_event * rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, unsigned long *lost_events) { struct ring_buffer_event *event; struct buffer_page *reader; int nr_loops = 0; if (ts) *ts = 0; again: /* * We repeat when a time extend is encountered. * Since the time extend is always attached to a data event, * we should never loop more than once. * (We never hit the following condition more than twice). */ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) return NULL; reader = rb_get_reader_page(cpu_buffer); if (!reader) return NULL; event = rb_reader_event(cpu_buffer); switch (event->type_len) { case RINGBUF_TYPE_PADDING: if (rb_null_event(event)) RB_WARN_ON(cpu_buffer, 1); /* * Because the writer could be discarding every * event it creates (which would probably be bad) * if we were to go back to "again" then we may never * catch up, and will trigger the warn on, or lock * the box. Return the padding, and we will release * the current locks, and try again. */ return event; case RINGBUF_TYPE_TIME_EXTEND: /* Internal data, OK to advance */ rb_advance_reader(cpu_buffer); goto again; case RINGBUF_TYPE_TIME_STAMP: if (ts) { *ts = rb_event_time_stamp(event); *ts = rb_fix_abs_ts(*ts, reader->page->time_stamp); ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } /* Internal data, OK to advance */ rb_advance_reader(cpu_buffer); goto again; case RINGBUF_TYPE_DATA: if (ts && !(*ts)) { *ts = cpu_buffer->read_stamp + event->time_delta; ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } if (lost_events) *lost_events = rb_lost_events(cpu_buffer); return event; default: RB_WARN_ON(cpu_buffer, 1); } return NULL; } EXPORT_SYMBOL_GPL(ring_buffer_peek); static struct ring_buffer_event * rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) { struct trace_buffer *buffer; struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; int nr_loops = 0; if (ts) *ts = 0; cpu_buffer = iter->cpu_buffer; buffer = cpu_buffer->buffer; /* * Check if someone performed a consuming read to the buffer * or removed some pages from the buffer. In these cases, * iterator was invalidated and we need to reset it. */ if (unlikely(iter->cache_read != cpu_buffer->read || iter->cache_reader_page != cpu_buffer->reader_page || iter->cache_pages_removed != cpu_buffer->pages_removed)) rb_iter_reset(iter); again: if (ring_buffer_iter_empty(iter)) return NULL; /* * As the writer can mess with what the iterator is trying * to read, just give up if we fail to get an event after * three tries. The iterator is not as reliable when reading * the ring buffer with an active write as the consumer is. * Do not warn if the three failures is reached. */ if (++nr_loops > 3) return NULL; if (rb_per_cpu_empty(cpu_buffer)) return NULL; if (iter->head >= rb_page_size(iter->head_page)) { rb_inc_iter(iter); goto again; } event = rb_iter_head_event(iter); if (!event) goto again; switch (event->type_len) { case RINGBUF_TYPE_PADDING: if (rb_null_event(event)) { rb_inc_iter(iter); goto again; } rb_advance_iter(iter); return event; case RINGBUF_TYPE_TIME_EXTEND: /* Internal data, OK to advance */ rb_advance_iter(iter); goto again; case RINGBUF_TYPE_TIME_STAMP: if (ts) { *ts = rb_event_time_stamp(event); *ts = rb_fix_abs_ts(*ts, iter->head_page->page->time_stamp); ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } /* Internal data, OK to advance */ rb_advance_iter(iter); goto again; case RINGBUF_TYPE_DATA: if (ts && !(*ts)) { *ts = iter->read_stamp + event->time_delta; ring_buffer_normalize_time_stamp(buffer, cpu_buffer->cpu, ts); } return event; default: RB_WARN_ON(cpu_buffer, 1); } return NULL; } EXPORT_SYMBOL_GPL(ring_buffer_iter_peek); static inline bool rb_reader_lock(struct ring_buffer_per_cpu *cpu_buffer) { if (likely(!in_nmi())) { raw_spin_lock(&cpu_buffer->reader_lock); return true; } /* * If an NMI die dumps out the content of the ring buffer * trylock must be used to prevent a deadlock if the NMI * preempted a task that holds the ring buffer locks. If * we get the lock then all is fine, if not, then continue * to do the read, but this can corrupt the ring buffer, * so it must be permanently disabled from future writes. * Reading from NMI is a oneshot deal. */ if (raw_spin_trylock(&cpu_buffer->reader_lock)) return true; /* Continue without locking, but disable the ring buffer */ atomic_inc(&cpu_buffer->record_disabled); return false; } static inline void rb_reader_unlock(struct ring_buffer_per_cpu *cpu_buffer, bool locked) { if (likely(locked)) raw_spin_unlock(&cpu_buffer->reader_lock); } /** * ring_buffer_peek - peek at the next event to be read * @buffer: The ring buffer to read * @cpu: The cpu to peak at * @ts: The timestamp counter of this event. * @lost_events: a variable to store if events were lost (may be NULL) * * This will return the event that will be read next, but does * not consume the data. */ struct ring_buffer_event * ring_buffer_peek(struct trace_buffer *buffer, int cpu, u64 *ts, unsigned long *lost_events) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; unsigned long flags; bool dolock; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; again: local_irq_save(flags); dolock = rb_reader_lock(cpu_buffer); event = rb_buffer_peek(cpu_buffer, ts, lost_events); if (event && event->type_len == RINGBUF_TYPE_PADDING) rb_advance_reader(cpu_buffer); rb_reader_unlock(cpu_buffer, dolock); local_irq_restore(flags); if (event && event->type_len == RINGBUF_TYPE_PADDING) goto again; return event; } /** ring_buffer_iter_dropped - report if there are dropped events * @iter: The ring buffer iterator * * Returns true if there was dropped events since the last peek. */ bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter) { bool ret = iter->missed_events != 0; iter->missed_events = 0; return ret; } EXPORT_SYMBOL_GPL(ring_buffer_iter_dropped); /** * ring_buffer_iter_peek - peek at the next event to be read * @iter: The ring buffer iterator * @ts: The timestamp counter of this event. * * This will return the event that will be read next, but does * not increment the iterator. */ struct ring_buffer_event * ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; struct ring_buffer_event *event; unsigned long flags; again: raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_iter_peek(iter, ts); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); if (event && event->type_len == RINGBUF_TYPE_PADDING) goto again; return event; } /** * ring_buffer_consume - return an event and consume it * @buffer: The ring buffer to get the next event from * @cpu: the cpu to read the buffer from * @ts: a variable to store the timestamp (may be NULL) * @lost_events: a variable to store if events were lost (may be NULL) * * Returns the next event in the ring buffer, and that event is consumed. * Meaning, that sequential reads will keep returning a different event, * and eventually empty the ring buffer if the producer is slower. */ struct ring_buffer_event * ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts, unsigned long *lost_events) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event = NULL; unsigned long flags; bool dolock; again: /* might be called in atomic */ preempt_disable(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) goto out; cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); dolock = rb_reader_lock(cpu_buffer); event = rb_buffer_peek(cpu_buffer, ts, lost_events); if (event) { cpu_buffer->lost_events = 0; rb_advance_reader(cpu_buffer); } rb_reader_unlock(cpu_buffer, dolock); local_irq_restore(flags); out: preempt_enable(); if (event && event->type_len == RINGBUF_TYPE_PADDING) goto again; return event; } EXPORT_SYMBOL_GPL(ring_buffer_consume); /** * ring_buffer_read_start - start a non consuming read of the buffer * @buffer: The ring buffer to read from * @cpu: The cpu buffer to iterate over * @flags: gfp flags to use for memory allocation * * This creates an iterator to allow non-consuming iteration through * the buffer. If the buffer is disabled for writing, it will produce * the same information each time, but if the buffer is still writing * then the first hit of a write will cause the iteration to stop. * * Must be paired with ring_buffer_read_finish. */ struct ring_buffer_iter * ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_iter *iter; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; iter = kzalloc_obj(*iter, flags); if (!iter) return NULL; /* Holds the entire event: data and meta data */ iter->event_size = buffer->subbuf_size; iter->event = kmalloc(iter->event_size, flags); if (!iter->event) { kfree(iter); return NULL; } cpu_buffer = buffer->buffers[cpu]; iter->cpu_buffer = cpu_buffer; atomic_inc(&cpu_buffer->resize_disabled); guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock); arch_spin_lock(&cpu_buffer->lock); rb_iter_reset(iter); arch_spin_unlock(&cpu_buffer->lock); return iter; } EXPORT_SYMBOL_GPL(ring_buffer_read_start); /** * ring_buffer_read_finish - finish reading the iterator of the buffer * @iter: The iterator retrieved by ring_buffer_start * * This re-enables resizing of the buffer, and frees the iterator. */ void ring_buffer_read_finish(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; /* Use this opportunity to check the integrity of the ring buffer. */ rb_check_pages(cpu_buffer); atomic_dec(&cpu_buffer->resize_disabled); kfree(iter->event); kfree(iter); } EXPORT_SYMBOL_GPL(ring_buffer_read_finish); /** * ring_buffer_iter_advance - advance the iterator to the next location * @iter: The ring buffer iterator * * Move the location of the iterator such that the next read will * be the next location of the iterator. */ void ring_buffer_iter_advance(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; unsigned long flags; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); rb_advance_iter(iter); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } EXPORT_SYMBOL_GPL(ring_buffer_iter_advance); /** * ring_buffer_size - return the size of the ring buffer (in bytes) * @buffer: The ring buffer. * @cpu: The CPU to get ring buffer size from. */ unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu) { if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; return buffer->subbuf_size * buffer->buffers[cpu]->nr_pages; } EXPORT_SYMBOL_GPL(ring_buffer_size); /** * ring_buffer_max_event_size - return the max data size of an event * @buffer: The ring buffer. * * Returns the maximum size an event can be. */ unsigned long ring_buffer_max_event_size(struct trace_buffer *buffer) { /* If abs timestamp is requested, events have a timestamp too */ if (ring_buffer_time_stamp_abs(buffer)) return buffer->max_data_size - RB_LEN_TIME_EXTEND; return buffer->max_data_size; } EXPORT_SYMBOL_GPL(ring_buffer_max_event_size); static void rb_clear_buffer_page(struct buffer_page *page) { local_set(&page->write, 0); local_set(&page->entries, 0); rb_init_page(page->page); page->read = 0; } /* * When the buffer is memory mapped to user space, each sub buffer * has a unique id that is used by the meta data to tell the user * where the current reader page is. * * For a normal allocated ring buffer, the id is saved in the buffer page * id field, and updated via this function. * * But for a fixed memory mapped buffer, the id is already assigned for * fixed memory ordering in the memory layout and can not be used. Instead * the index of where the page lies in the memory layout is used. * * For the normal pages, set the buffer page id with the passed in @id * value and return that. * * For fixed memory mapped pages, get the page index in the memory layout * and return that as the id. */ static int rb_page_id(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *bpage, int id) { /* * For boot buffers, the id is the index, * otherwise, set the buffer page with this id */ if (cpu_buffer->ring_meta) id = rb_meta_subbuf_idx(cpu_buffer->ring_meta, bpage->page); else bpage->id = id; return id; } static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) { struct trace_buffer_meta *meta = cpu_buffer->meta_page; if (!meta) return; meta->reader.read = cpu_buffer->reader_page->read; meta->reader.id = rb_page_id(cpu_buffer, cpu_buffer->reader_page, cpu_buffer->reader_page->id); meta->reader.lost_events = cpu_buffer->lost_events; meta->entries = local_read(&cpu_buffer->entries); meta->overrun = local_read(&cpu_buffer->overrun); meta->read = cpu_buffer->read; /* Some archs do not have data cache coherency between kernel and user-space */ flush_kernel_vmap_range(cpu_buffer->meta_page, PAGE_SIZE); } static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *page; rb_head_page_deactivate(cpu_buffer); cpu_buffer->head_page = list_entry(cpu_buffer->pages, struct buffer_page, list); rb_clear_buffer_page(cpu_buffer->head_page); list_for_each_entry(page, cpu_buffer->pages, list) { rb_clear_buffer_page(page); } cpu_buffer->tail_page = cpu_buffer->head_page; cpu_buffer->commit_page = cpu_buffer->head_page; INIT_LIST_HEAD(&cpu_buffer->reader_page->list); INIT_LIST_HEAD(&cpu_buffer->new_pages); rb_clear_buffer_page(cpu_buffer->reader_page); local_set(&cpu_buffer->entries_bytes, 0); local_set(&cpu_buffer->overrun, 0); local_set(&cpu_buffer->commit_overrun, 0); local_set(&cpu_buffer->dropped_events, 0); local_set(&cpu_buffer->entries, 0); local_set(&cpu_buffer->committing, 0); local_set(&cpu_buffer->commits, 0); local_set(&cpu_buffer->pages_touched, 0); local_set(&cpu_buffer->pages_lost, 0); local_set(&cpu_buffer->pages_read, 0); cpu_buffer->last_pages_touch = 0; cpu_buffer->shortest_full = 0; cpu_buffer->read = 0; cpu_buffer->read_bytes = 0; rb_time_set(&cpu_buffer->write_stamp, 0); rb_time_set(&cpu_buffer->before_stamp, 0); memset(cpu_buffer->event_stamp, 0, sizeof(cpu_buffer->event_stamp)); cpu_buffer->lost_events = 0; cpu_buffer->last_overrun = 0; rb_head_page_activate(cpu_buffer); cpu_buffer->pages_removed = 0; if (cpu_buffer->mapped) { rb_update_meta_page(cpu_buffer); if (cpu_buffer->ring_meta) { struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; meta->commit_buffer = meta->head_buffer; } } } /* Must have disabled the cpu buffer then done a synchronize_rcu */ static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) { guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock); if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) return; arch_spin_lock(&cpu_buffer->lock); rb_reset_cpu(cpu_buffer); arch_spin_unlock(&cpu_buffer->lock); } /** * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer * @buffer: The ring buffer to reset a per cpu buffer of * @cpu: The CPU buffer to be reset */ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); atomic_inc(&cpu_buffer->resize_disabled); atomic_inc(&cpu_buffer->record_disabled); /* Make sure all commits have finished */ synchronize_rcu(); reset_disabled_cpu_buffer(cpu_buffer); atomic_dec(&cpu_buffer->record_disabled); atomic_dec(&cpu_buffer->resize_disabled); mutex_unlock(&buffer->mutex); } EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); /* Flag to ensure proper resetting of atomic variables */ #define RESET_BIT (1 << 30) /** * ring_buffer_reset_online_cpus - reset a ring buffer per CPU buffer * @buffer: The ring buffer to reset a per cpu buffer of */ void ring_buffer_reset_online_cpus(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; int cpu; /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); for_each_online_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; atomic_add(RESET_BIT, &cpu_buffer->resize_disabled); atomic_inc(&cpu_buffer->record_disabled); } /* Make sure all commits have finished */ synchronize_rcu(); for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; /* * If a CPU came online during the synchronize_rcu(), then * ignore it. */ if (!(atomic_read(&cpu_buffer->resize_disabled) & RESET_BIT)) continue; reset_disabled_cpu_buffer(cpu_buffer); atomic_dec(&cpu_buffer->record_disabled); atomic_sub(RESET_BIT, &cpu_buffer->resize_disabled); } mutex_unlock(&buffer->mutex); } /** * ring_buffer_reset - reset a ring buffer * @buffer: The ring buffer to reset all cpu buffers */ void ring_buffer_reset(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; int cpu; /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; atomic_inc(&cpu_buffer->resize_disabled); atomic_inc(&cpu_buffer->record_disabled); } /* Make sure all commits have finished */ synchronize_rcu(); for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; reset_disabled_cpu_buffer(cpu_buffer); atomic_dec(&cpu_buffer->record_disabled); atomic_dec(&cpu_buffer->resize_disabled); } mutex_unlock(&buffer->mutex); } EXPORT_SYMBOL_GPL(ring_buffer_reset); /** * ring_buffer_empty - is the ring buffer empty? * @buffer: The ring buffer to test */ bool ring_buffer_empty(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; bool dolock; bool ret; int cpu; /* yes this is racy, but if you don't like the race, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); dolock = rb_reader_lock(cpu_buffer); ret = rb_per_cpu_empty(cpu_buffer); rb_reader_unlock(cpu_buffer, dolock); local_irq_restore(flags); if (!ret) return false; } return true; } EXPORT_SYMBOL_GPL(ring_buffer_empty); /** * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty? * @buffer: The ring buffer * @cpu: The CPU buffer to test */ bool ring_buffer_empty_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; bool dolock; bool ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return true; cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); dolock = rb_reader_lock(cpu_buffer); ret = rb_per_cpu_empty(cpu_buffer); rb_reader_unlock(cpu_buffer, dolock); local_irq_restore(flags); return ret; } EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP /** * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers * @buffer_a: One buffer to swap with * @buffer_b: The other buffer to swap with * @cpu: the CPU of the buffers to swap * * This function is useful for tracers that want to take a "snapshot" * of a CPU buffer and has another back up buffer lying around. * it is expected that the tracer handles the cpu buffer not being * used at the moment. */ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, struct trace_buffer *buffer_b, int cpu) { struct ring_buffer_per_cpu *cpu_buffer_a; struct ring_buffer_per_cpu *cpu_buffer_b; int ret = -EINVAL; if (!cpumask_test_cpu(cpu, buffer_a->cpumask) || !cpumask_test_cpu(cpu, buffer_b->cpumask)) return -EINVAL; cpu_buffer_a = buffer_a->buffers[cpu]; cpu_buffer_b = buffer_b->buffers[cpu]; /* It's up to the callers to not try to swap mapped buffers */ if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) return -EBUSY; /* At least make sure the two buffers are somewhat the same */ if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) return -EINVAL; if (buffer_a->subbuf_order != buffer_b->subbuf_order) return -EINVAL; if (atomic_read(&buffer_a->record_disabled)) return -EAGAIN; if (atomic_read(&buffer_b->record_disabled)) return -EAGAIN; if (atomic_read(&cpu_buffer_a->record_disabled)) return -EAGAIN; if (atomic_read(&cpu_buffer_b->record_disabled)) return -EAGAIN; /* * We can't do a synchronize_rcu here because this * function can be called in atomic context. * Normally this will be called from the same CPU as cpu. * If not it's up to the caller to protect this. */ atomic_inc(&cpu_buffer_a->record_disabled); atomic_inc(&cpu_buffer_b->record_disabled); ret = -EBUSY; if (local_read(&cpu_buffer_a->committing)) goto out_dec; if (local_read(&cpu_buffer_b->committing)) goto out_dec; /* * When resize is in progress, we cannot swap it because * it will mess the state of the cpu buffer. */ if (atomic_read(&buffer_a->resizing)) goto out_dec; if (atomic_read(&buffer_b->resizing)) goto out_dec; buffer_a->buffers[cpu] = cpu_buffer_b; buffer_b->buffers[cpu] = cpu_buffer_a; cpu_buffer_b->buffer = buffer_a; cpu_buffer_a->buffer = buffer_b; ret = 0; out_dec: atomic_dec(&cpu_buffer_a->record_disabled); atomic_dec(&cpu_buffer_b->record_disabled); return ret; } EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); #endif /* CONFIG_RING_BUFFER_ALLOW_SWAP */ /** * ring_buffer_alloc_read_page - allocate a page to read from buffer * @buffer: the buffer to allocate for. * @cpu: the cpu buffer to allocate. * * This function is used in conjunction with ring_buffer_read_page. * When reading a full page from the ring buffer, these functions * can be used to speed up the process. The calling function should * allocate a few pages first with this function. Then when it * needs to get pages from the ring buffer, it passes the result * of this function into ring_buffer_read_page, which will swap * the page that was allocated, with the read page of the buffer. * * Returns: * The page allocated, or ERR_PTR */ struct buffer_data_read_page * ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; struct buffer_data_read_page *bpage = NULL; unsigned long flags; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return ERR_PTR(-ENODEV); bpage = kzalloc_obj(*bpage); if (!bpage) return ERR_PTR(-ENOMEM); bpage->order = buffer->subbuf_order; cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); arch_spin_lock(&cpu_buffer->lock); if (cpu_buffer->free_page) { bpage->data = cpu_buffer->free_page; cpu_buffer->free_page = NULL; } arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); if (bpage->data) { rb_init_page(bpage->data); } else { bpage->data = alloc_cpu_data(cpu, cpu_buffer->buffer->subbuf_order); if (!bpage->data) { kfree(bpage); return ERR_PTR(-ENOMEM); } } return bpage; } EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page); /** * ring_buffer_free_read_page - free an allocated read page * @buffer: the buffer the page was allocate for * @cpu: the cpu buffer the page came from * @data_page: the page to free * * Free a page allocated from ring_buffer_alloc_read_page. */ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, struct buffer_data_read_page *data_page) { struct ring_buffer_per_cpu *cpu_buffer; struct buffer_data_page *bpage = data_page->data; struct page *page = virt_to_page(bpage); unsigned long flags; if (!buffer || !buffer->buffers || !buffer->buffers[cpu]) return; cpu_buffer = buffer->buffers[cpu]; /* * If the page is still in use someplace else, or order of the page * is different from the subbuffer order of the buffer - * we can't reuse it */ if (page_ref_count(page) > 1 || data_page->order != buffer->subbuf_order) goto out; local_irq_save(flags); arch_spin_lock(&cpu_buffer->lock); if (!cpu_buffer->free_page) { cpu_buffer->free_page = bpage; bpage = NULL; } arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); out: free_pages((unsigned long)bpage, data_page->order); kfree(data_page); } EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); /** * ring_buffer_read_page - extract a page from the ring buffer * @buffer: buffer to extract from * @data_page: the page to use allocated from ring_buffer_alloc_read_page * @len: amount to extract * @cpu: the cpu of the buffer to extract * @full: should the extraction only happen when the page is full. * * This function will pull out a page from the ring buffer and consume it. * @data_page must be the address of the variable that was returned * from ring_buffer_alloc_read_page. This is because the page might be used * to swap with a page in the ring buffer. * * for example: * rpage = ring_buffer_alloc_read_page(buffer, cpu); * if (IS_ERR(rpage)) * return PTR_ERR(rpage); * ret = ring_buffer_read_page(buffer, rpage, len, cpu, 0); * if (ret >= 0) * process_page(ring_buffer_read_page_data(rpage), ret); * ring_buffer_free_read_page(buffer, cpu, rpage); * * When @full is set, the function will not return true unless * the writer is off the reader page. * * Note: it is up to the calling functions to handle sleeps and wakeups. * The ring buffer can be used anywhere in the kernel and can not * blindly call wake_up. The layer that uses the ring buffer must be * responsible for that. * * Returns: * >=0 if data has been transferred, returns the offset of consumed data. * <0 if no data has been transferred. */ int ring_buffer_read_page(struct trace_buffer *buffer, struct buffer_data_read_page *data_page, size_t len, int cpu, int full) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; struct buffer_data_page *bpage; struct buffer_page *reader; unsigned long missed_events; unsigned int commit; unsigned int read; u64 save_timestamp; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -1; /* * If len is not big enough to hold the page header, then * we can not copy anything. */ if (len <= BUF_PAGE_HDR_SIZE) return -1; len -= BUF_PAGE_HDR_SIZE; if (!data_page || !data_page->data) return -1; if (data_page->order != buffer->subbuf_order) return -1; bpage = data_page->data; if (!bpage) return -1; guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock); reader = rb_get_reader_page(cpu_buffer); if (!reader) return -1; event = rb_reader_event(cpu_buffer); read = reader->read; commit = rb_page_size(reader); /* Check if any events were dropped */ missed_events = cpu_buffer->lost_events; /* * If this page has been partially read or * if len is not big enough to read the rest of the page or * a writer is still on the page, then * we must copy the data from the page to the buffer. * Otherwise, we can simply swap the page with the one passed in. */ if (read || (len < (commit - read)) || cpu_buffer->reader_page == cpu_buffer->commit_page || cpu_buffer->mapped) { struct buffer_data_page *rpage = cpu_buffer->reader_page->page; unsigned int rpos = read; unsigned int pos = 0; unsigned int size; /* * If a full page is expected, this can still be returned * if there's been a previous partial read and the * rest of the page can be read and the commit page is off * the reader page. */ if (full && (!read || (len < (commit - read)) || cpu_buffer->reader_page == cpu_buffer->commit_page)) return -1; if (len > (commit - read)) len = (commit - read); /* Always keep the time extend and data together */ size = rb_event_ts_length(event); if (len < size) return -1; /* save the current timestamp, since the user will need it */ save_timestamp = cpu_buffer->read_stamp; /* Need to copy one event at a time */ do { /* We need the size of one event, because * rb_advance_reader only advances by one event, * whereas rb_event_ts_length may include the size of * one or two events. * We have already ensured there's enough space if this * is a time extend. */ size = rb_event_length(event); memcpy(bpage->data + pos, rpage->data + rpos, size); len -= size; rb_advance_reader(cpu_buffer); rpos = reader->read; pos += size; if (rpos >= commit) break; event = rb_reader_event(cpu_buffer); /* Always keep the time extend and data together */ size = rb_event_ts_length(event); } while (len >= size); /* update bpage */ local_set(&bpage->commit, pos); bpage->time_stamp = save_timestamp; /* we copied everything to the beginning */ read = 0; } else { /* update the entry counter */ cpu_buffer->read += rb_page_entries(reader); cpu_buffer->read_bytes += rb_page_size(reader); /* swap the pages */ rb_init_page(bpage); bpage = reader->page; reader->page = data_page->data; local_set(&reader->write, 0); local_set(&reader->entries, 0); reader->read = 0; data_page->data = bpage; /* * Use the real_end for the data size, * This gives us a chance to store the lost events * on the page. */ if (reader->real_end) local_set(&bpage->commit, reader->real_end); } cpu_buffer->lost_events = 0; commit = local_read(&bpage->commit); /* * Set a flag in the commit field if we lost events */ if (missed_events) { /* If there is room at the end of the page to save the * missed events, then record it there. */ if (buffer->subbuf_size - commit >= sizeof(missed_events)) { memcpy(&bpage->data[commit], &missed_events, sizeof(missed_events)); local_add(RB_MISSED_STORED, &bpage->commit); commit += sizeof(missed_events); } local_add(RB_MISSED_EVENTS, &bpage->commit); } /* * This page may be off to user land. Zero it out here. */ if (commit < buffer->subbuf_size) memset(&bpage->data[commit], 0, buffer->subbuf_size - commit); return read; } EXPORT_SYMBOL_GPL(ring_buffer_read_page); /** * ring_buffer_read_page_data - get pointer to the data in the page. * @page: the page to get the data from * * Returns pointer to the actual data in this page. */ void *ring_buffer_read_page_data(struct buffer_data_read_page *page) { return page->data; } EXPORT_SYMBOL_GPL(ring_buffer_read_page_data); /** * ring_buffer_subbuf_size_get - get size of the sub buffer. * @buffer: the buffer to get the sub buffer size from * * Returns size of the sub buffer, in bytes. */ int ring_buffer_subbuf_size_get(struct trace_buffer *buffer) { return buffer->subbuf_size + BUF_PAGE_HDR_SIZE; } EXPORT_SYMBOL_GPL(ring_buffer_subbuf_size_get); /** * ring_buffer_subbuf_order_get - get order of system sub pages in one buffer page. * @buffer: The ring_buffer to get the system sub page order from * * By default, one ring buffer sub page equals to one system page. This parameter * is configurable, per ring buffer. The size of the ring buffer sub page can be * extended, but must be an order of system page size. * * Returns the order of buffer sub page size, in system pages: * 0 means the sub buffer size is 1 system page and so forth. * In case of an error < 0 is returned. */ int ring_buffer_subbuf_order_get(struct trace_buffer *buffer) { if (!buffer) return -EINVAL; return buffer->subbuf_order; } EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_get); /** * ring_buffer_subbuf_order_set - set the size of ring buffer sub page. * @buffer: The ring_buffer to set the new page size. * @order: Order of the system pages in one sub buffer page * * By default, one ring buffer pages equals to one system page. This API can be * used to set new size of the ring buffer page. The size must be order of * system page size, that's why the input parameter @order is the order of * system pages that are allocated for one ring buffer page: * 0 - 1 system page * 1 - 2 system pages * 3 - 4 system pages * ... * * Returns 0 on success or < 0 in case of an error. */ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) { struct ring_buffer_per_cpu *cpu_buffer; struct buffer_page *bpage, *tmp; int old_order, old_size; int nr_pages; int psize; int err; int cpu; if (!buffer || order < 0) return -EINVAL; if (buffer->subbuf_order == order) return 0; psize = (1 << order) * PAGE_SIZE; if (psize <= BUF_PAGE_HDR_SIZE) return -EINVAL; /* Size of a subbuf cannot be greater than the write counter */ if (psize > RB_WRITE_MASK + 1) return -EINVAL; old_order = buffer->subbuf_order; old_size = buffer->subbuf_size; /* prevent another thread from changing buffer sizes */ guard(mutex)(&buffer->mutex); atomic_inc(&buffer->record_disabled); /* Make sure all commits have finished */ synchronize_rcu(); buffer->subbuf_order = order; buffer->subbuf_size = psize - BUF_PAGE_HDR_SIZE; /* Make sure all new buffers are allocated, before deleting the old ones */ for_each_buffer_cpu(buffer, cpu) { if (!cpumask_test_cpu(cpu, buffer->cpumask)) continue; cpu_buffer = buffer->buffers[cpu]; if (cpu_buffer->mapped) { err = -EBUSY; goto error; } /* Update the number of pages to match the new size */ nr_pages = old_size * buffer->buffers[cpu]->nr_pages; nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size); /* we need a minimum of two pages */ if (nr_pages < 2) nr_pages = 2; cpu_buffer->nr_pages_to_update = nr_pages; /* Include the reader page */ nr_pages++; /* Allocate the new size buffer */ INIT_LIST_HEAD(&cpu_buffer->new_pages); if (__rb_allocate_pages(cpu_buffer, nr_pages, &cpu_buffer->new_pages)) { /* not enough memory for new pages */ err = -ENOMEM; goto error; } } for_each_buffer_cpu(buffer, cpu) { struct buffer_data_page *old_free_data_page; struct list_head old_pages; unsigned long flags; if (!cpumask_test_cpu(cpu, buffer->cpumask)) continue; cpu_buffer = buffer->buffers[cpu]; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); /* Clear the head bit to make the link list normal to read */ rb_head_page_deactivate(cpu_buffer); /* * Collect buffers from the cpu_buffer pages list and the * reader_page on old_pages, so they can be freed later when not * under a spinlock. The pages list is a linked list with no * head, adding old_pages turns it into a regular list with * old_pages being the head. */ list_add(&old_pages, cpu_buffer->pages); list_add(&cpu_buffer->reader_page->list, &old_pages); /* One page was allocated for the reader page */ cpu_buffer->reader_page = list_entry(cpu_buffer->new_pages.next, struct buffer_page, list); list_del_init(&cpu_buffer->reader_page->list); /* Install the new pages, remove the head from the list */ cpu_buffer->pages = cpu_buffer->new_pages.next; list_del_init(&cpu_buffer->new_pages); cpu_buffer->cnt++; cpu_buffer->head_page = list_entry(cpu_buffer->pages, struct buffer_page, list); cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; cpu_buffer->nr_pages = cpu_buffer->nr_pages_to_update; cpu_buffer->nr_pages_to_update = 0; old_free_data_page = cpu_buffer->free_page; cpu_buffer->free_page = NULL; rb_head_page_activate(cpu_buffer); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); /* Free old sub buffers */ list_for_each_entry_safe(bpage, tmp, &old_pages, list) { list_del_init(&bpage->list); free_buffer_page(bpage); } free_pages((unsigned long)old_free_data_page, old_order); rb_check_pages(cpu_buffer); } atomic_dec(&buffer->record_disabled); return 0; error: buffer->subbuf_order = old_order; buffer->subbuf_size = old_size; atomic_dec(&buffer->record_disabled); for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; if (!cpu_buffer->nr_pages_to_update) continue; list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, list) { list_del_init(&bpage->list); free_buffer_page(bpage); } } return err; } EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set); static int rb_alloc_meta_page(struct ring_buffer_per_cpu *cpu_buffer) { struct page *page; if (cpu_buffer->meta_page) return 0; page = alloc_page(GFP_USER | __GFP_ZERO); if (!page) return -ENOMEM; cpu_buffer->meta_page = page_to_virt(page); return 0; } static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer) { unsigned long addr = (unsigned long)cpu_buffer->meta_page; free_page(addr); cpu_buffer->meta_page = NULL; } static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, unsigned long *subbuf_ids) { struct trace_buffer_meta *meta = cpu_buffer->meta_page; unsigned int nr_subbufs = cpu_buffer->nr_pages + 1; struct buffer_page *first_subbuf, *subbuf; int cnt = 0; int id = 0; id = rb_page_id(cpu_buffer, cpu_buffer->reader_page, id); subbuf_ids[id++] = (unsigned long)cpu_buffer->reader_page->page; cnt++; first_subbuf = subbuf = rb_set_head_page(cpu_buffer); do { id = rb_page_id(cpu_buffer, subbuf, id); if (WARN_ON(id >= nr_subbufs)) break; subbuf_ids[id] = (unsigned long)subbuf->page; rb_inc_page(&subbuf); id++; cnt++; } while (subbuf != first_subbuf); WARN_ON(cnt != nr_subbufs); /* install subbuf ID to kern VA translation */ cpu_buffer->subbuf_ids = subbuf_ids; meta->meta_struct_len = sizeof(*meta); meta->nr_subbufs = nr_subbufs; meta->subbuf_size = cpu_buffer->buffer->subbuf_size + BUF_PAGE_HDR_SIZE; meta->meta_page_size = meta->subbuf_size; rb_update_meta_page(cpu_buffer); } static struct ring_buffer_per_cpu * rb_get_mapped_buffer(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return ERR_PTR(-EINVAL); cpu_buffer = buffer->buffers[cpu]; mutex_lock(&cpu_buffer->mapping_lock); if (!cpu_buffer->user_mapped) { mutex_unlock(&cpu_buffer->mapping_lock); return ERR_PTR(-ENODEV); } return cpu_buffer; } static void rb_put_mapped_buffer(struct ring_buffer_per_cpu *cpu_buffer) { mutex_unlock(&cpu_buffer->mapping_lock); } /* * Fast-path for rb_buffer_(un)map(). Called whenever the meta-page doesn't need * to be set-up or torn-down. */ static int __rb_inc_dec_mapped(struct ring_buffer_per_cpu *cpu_buffer, bool inc) { unsigned long flags; lockdep_assert_held(&cpu_buffer->mapping_lock); /* mapped is always greater or equal to user_mapped */ if (WARN_ON(cpu_buffer->mapped < cpu_buffer->user_mapped)) return -EINVAL; if (inc && cpu_buffer->mapped == UINT_MAX) return -EBUSY; if (WARN_ON(!inc && cpu_buffer->user_mapped == 0)) return -EINVAL; mutex_lock(&cpu_buffer->buffer->mutex); raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); if (inc) { cpu_buffer->user_mapped++; cpu_buffer->mapped++; } else { cpu_buffer->user_mapped--; cpu_buffer->mapped--; } raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); mutex_unlock(&cpu_buffer->buffer->mutex); return 0; } /* * +--------------+ pgoff == 0 * | meta page | * +--------------+ pgoff == 1 * | subbuffer 0 | * | | * +--------------+ pgoff == (1 + (1 << subbuf_order)) * | subbuffer 1 | * | | * ... */ #ifdef CONFIG_MMU static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, struct vm_area_struct *vma) { unsigned long nr_subbufs, nr_pages, nr_vma_pages, pgoff = vma->vm_pgoff; unsigned int subbuf_pages, subbuf_order; struct page **pages __free(kfree) = NULL; int p = 0, s = 0; int err; /* Refuse MP_PRIVATE or writable mappings */ if (vma->vm_flags & VM_WRITE || vma->vm_flags & VM_EXEC || !(vma->vm_flags & VM_MAYSHARE)) return -EPERM; subbuf_order = cpu_buffer->buffer->subbuf_order; subbuf_pages = 1 << subbuf_order; if (subbuf_order && pgoff % subbuf_pages) return -EINVAL; /* * Make sure the mapping cannot become writable later. Also tell the VM * to not touch these pages (VM_DONTCOPY | VM_DONTEXPAND). */ vm_flags_mod(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP, VM_MAYWRITE); lockdep_assert_held(&cpu_buffer->mapping_lock); nr_subbufs = cpu_buffer->nr_pages + 1; /* + reader-subbuf */ nr_pages = ((nr_subbufs + 1) << subbuf_order); /* + meta-page */ if (nr_pages <= pgoff) return -EINVAL; nr_pages -= pgoff; nr_vma_pages = vma_pages(vma); if (!nr_vma_pages || nr_vma_pages > nr_pages) return -EINVAL; nr_pages = nr_vma_pages; pages = kzalloc_objs(*pages, nr_pages); if (!pages) return -ENOMEM; if (!pgoff) { unsigned long meta_page_padding; pages[p++] = virt_to_page(cpu_buffer->meta_page); /* * Pad with the zero-page to align the meta-page with the * sub-buffers. */ meta_page_padding = subbuf_pages - 1; while (meta_page_padding-- && p < nr_pages) { unsigned long __maybe_unused zero_addr = vma->vm_start + (PAGE_SIZE * p); pages[p++] = ZERO_PAGE(zero_addr); } } else { /* Skip the meta-page */ pgoff -= subbuf_pages; s += pgoff / subbuf_pages; } while (p < nr_pages) { struct page *page; int off = 0; if (WARN_ON_ONCE(s >= nr_subbufs)) return -EINVAL; page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]); for (; off < (1 << (subbuf_order)); off++, page++) { if (p >= nr_pages) break; pages[p++] = page; } s++; } err = vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); return err; } #else static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, struct vm_area_struct *vma) { return -EOPNOTSUPP; } #endif int ring_buffer_map(struct trace_buffer *buffer, int cpu, struct vm_area_struct *vma) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags, *subbuf_ids; int err; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -EINVAL; cpu_buffer = buffer->buffers[cpu]; guard(mutex)(&cpu_buffer->mapping_lock); if (cpu_buffer->user_mapped) { err = __rb_map_vma(cpu_buffer, vma); if (!err) err = __rb_inc_dec_mapped(cpu_buffer, true); return err; } /* prevent another thread from changing buffer/sub-buffer sizes */ guard(mutex)(&buffer->mutex); err = rb_alloc_meta_page(cpu_buffer); if (err) return err; /* subbuf_ids include the reader while nr_pages does not */ subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, sizeof(*subbuf_ids), GFP_KERNEL); if (!subbuf_ids) { rb_free_meta_page(cpu_buffer); return -ENOMEM; } atomic_inc(&cpu_buffer->resize_disabled); /* * Lock all readers to block any subbuf swap until the subbuf IDs are * assigned. */ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); rb_setup_ids_meta_page(cpu_buffer, subbuf_ids); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); err = __rb_map_vma(cpu_buffer, vma); if (!err) { raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); /* This is the first time it is mapped by user */ cpu_buffer->mapped++; cpu_buffer->user_mapped = 1; raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } else { kfree(cpu_buffer->subbuf_ids); cpu_buffer->subbuf_ids = NULL; rb_free_meta_page(cpu_buffer); atomic_dec(&cpu_buffer->resize_disabled); } return err; } /* * This is called when a VMA is duplicated (e.g., on fork()) to increment * the user_mapped counter without remapping pages. */ void ring_buffer_map_dup(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; if (WARN_ON(!cpumask_test_cpu(cpu, buffer->cpumask))) return; cpu_buffer = buffer->buffers[cpu]; guard(mutex)(&cpu_buffer->mapping_lock); if (cpu_buffer->user_mapped) __rb_inc_dec_mapped(cpu_buffer, true); else WARN(1, "Unexpected buffer stat, it should be mapped"); } int ring_buffer_unmap(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -EINVAL; cpu_buffer = buffer->buffers[cpu]; guard(mutex)(&cpu_buffer->mapping_lock); if (!cpu_buffer->user_mapped) { return -ENODEV; } else if (cpu_buffer->user_mapped > 1) { __rb_inc_dec_mapped(cpu_buffer, false); return 0; } guard(mutex)(&buffer->mutex); raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); /* This is the last user space mapping */ if (!WARN_ON_ONCE(cpu_buffer->mapped < cpu_buffer->user_mapped)) cpu_buffer->mapped--; cpu_buffer->user_mapped = 0; raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); kfree(cpu_buffer->subbuf_ids); cpu_buffer->subbuf_ids = NULL; rb_free_meta_page(cpu_buffer); atomic_dec(&cpu_buffer->resize_disabled); return 0; } int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; struct buffer_page *reader; unsigned long missed_events; unsigned long reader_size; unsigned long flags; cpu_buffer = rb_get_mapped_buffer(buffer, cpu); if (IS_ERR(cpu_buffer)) return (int)PTR_ERR(cpu_buffer); raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); consume: if (rb_per_cpu_empty(cpu_buffer)) goto out; reader_size = rb_page_size(cpu_buffer->reader_page); /* * There are data to be read on the current reader page, we can * return to the caller. But before that, we assume the latter will read * everything. Let's update the kernel reader accordingly. */ if (cpu_buffer->reader_page->read < reader_size) { while (cpu_buffer->reader_page->read < reader_size) rb_advance_reader(cpu_buffer); goto out; } /* Did the reader catch up with the writer? */ if (cpu_buffer->reader_page == cpu_buffer->commit_page) goto out; reader = rb_get_reader_page(cpu_buffer); if (WARN_ON(!reader)) goto out; /* Check if any events were dropped */ missed_events = cpu_buffer->lost_events; if (missed_events) { if (cpu_buffer->reader_page != cpu_buffer->commit_page) { struct buffer_data_page *bpage = reader->page; unsigned int commit; /* * Use the real_end for the data size, * This gives us a chance to store the lost events * on the page. */ if (reader->real_end) local_set(&bpage->commit, reader->real_end); /* * If there is room at the end of the page to save the * missed events, then record it there. */ commit = rb_page_size(reader); if (buffer->subbuf_size - commit >= sizeof(missed_events)) { memcpy(&bpage->data[commit], &missed_events, sizeof(missed_events)); local_add(RB_MISSED_STORED, &bpage->commit); } local_add(RB_MISSED_EVENTS, &bpage->commit); } else if (!WARN_ONCE(cpu_buffer->reader_page == cpu_buffer->tail_page, "Reader on commit with %ld missed events", missed_events)) { /* * There shouldn't be any missed events if the tail_page * is on the reader page. But if the tail page is not on the * reader page and the commit_page is, that would mean that * there's a commit_overrun (an interrupt preempted an * addition of an event and then filled the buffer * with new events). In this case it's not an * error, but it should still be reported. * * TODO: Add missed events to the page for user space to know. */ pr_info("Ring buffer [%d] commit overrun lost %ld events at timestamp:%lld\n", cpu, missed_events, cpu_buffer->reader_page->page->time_stamp); } } cpu_buffer->lost_events = 0; goto consume; out: /* Some archs do not have data cache coherency between kernel and user-space */ flush_kernel_vmap_range(cpu_buffer->reader_page->page, buffer->subbuf_size + BUF_PAGE_HDR_SIZE); rb_update_meta_page(cpu_buffer); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); rb_put_mapped_buffer(cpu_buffer); return 0; } /* * We only allocate new buffers, never free them if the CPU goes down. * If we were to free the buffer, then the user would lose any trace that was in * the buffer. */ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node) { struct trace_buffer *buffer; long nr_pages_same; int cpu_i; unsigned long nr_pages; buffer = container_of(node, struct trace_buffer, node); if (cpumask_test_cpu(cpu, buffer->cpumask)) return 0; nr_pages = 0; nr_pages_same = 1; /* check if all cpu sizes are same */ for_each_buffer_cpu(buffer, cpu_i) { /* fill in the size from first enabled cpu */ if (nr_pages == 0) nr_pages = buffer->buffers[cpu_i]->nr_pages; if (nr_pages != buffer->buffers[cpu_i]->nr_pages) { nr_pages_same = 0; break; } } /* allocate minimum pages, user can later expand it */ if (!nr_pages_same) nr_pages = 2; buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu); if (!buffer->buffers[cpu]) { WARN(1, "failed to allocate ring buffer on CPU %u\n", cpu); return -ENOMEM; } smp_wmb(); cpumask_set_cpu(cpu, buffer->cpumask); return 0; } #ifdef CONFIG_RING_BUFFER_STARTUP_TEST /* * This is a basic integrity check of the ring buffer. * Late in the boot cycle this test will run when configured in. * It will kick off a thread per CPU that will go into a loop * writing to the per cpu ring buffer various sizes of data. * Some of the data will be large items, some small. * * Another thread is created that goes into a spin, sending out * IPIs to the other CPUs to also write into the ring buffer. * this is to test the nesting ability of the buffer. * * Basic stats are recorded and reported. If something in the * ring buffer should happen that's not expected, a big warning * is displayed and all ring buffers are disabled. */ static struct task_struct *rb_threads[NR_CPUS] __initdata; struct rb_test_data { struct trace_buffer *buffer; unsigned long events; unsigned long bytes_written; unsigned long bytes_alloc; unsigned long bytes_dropped; unsigned long events_nested; unsigned long bytes_written_nested; unsigned long bytes_alloc_nested; unsigned long bytes_dropped_nested; int min_size_nested; int max_size_nested; int max_size; int min_size; int cpu; int cnt; }; static struct rb_test_data rb_data[NR_CPUS] __initdata; /* 1 meg per cpu */ #define RB_TEST_BUFFER_SIZE 1048576 static char rb_string[] __initdata = "abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()?+\\" "?+|:';\",.<>/?abcdefghijklmnopqrstuvwxyz1234567890" "!@#$%^&*()?+\\?+|:';\",.<>/?abcdefghijklmnopqrstuv"; static bool rb_test_started __initdata; struct rb_item { int size; char str[]; }; static __init int rb_write_something(struct rb_test_data *data, bool nested) { struct ring_buffer_event *event; struct rb_item *item; bool started; int event_len; int size; int len; int cnt; /* Have nested writes different that what is written */ cnt = data->cnt + (nested ? 27 : 0); /* Multiply cnt by ~e, to make some unique increment */ size = (cnt * 68 / 25) % (sizeof(rb_string) - 1); len = size + sizeof(struct rb_item); started = rb_test_started; /* read rb_test_started before checking buffer enabled */ smp_rmb(); event = ring_buffer_lock_reserve(data->buffer, len); if (!event) { /* Ignore dropped events before test starts. */ if (started) { if (nested) data->bytes_dropped_nested += len; else data->bytes_dropped += len; } return len; } event_len = ring_buffer_event_length(event); if (RB_WARN_ON(data->buffer, event_len < len)) goto out; item = ring_buffer_event_data(event); item->size = size; memcpy(item->str, rb_string, size); if (nested) { data->bytes_alloc_nested += event_len; data->bytes_written_nested += len; data->events_nested++; if (!data->min_size_nested || len < data->min_size_nested) data->min_size_nested = len; if (len > data->max_size_nested) data->max_size_nested = len; } else { data->bytes_alloc += event_len; data->bytes_written += len; data->events++; if (!data->min_size || len < data->min_size) data->max_size = len; if (len > data->max_size) data->max_size = len; } out: ring_buffer_unlock_commit(data->buffer); return 0; } static __init int rb_test(void *arg) { struct rb_test_data *data = arg; while (!kthread_should_stop()) { rb_write_something(data, false); data->cnt++; set_current_state(TASK_INTERRUPTIBLE); /* Now sleep between a min of 100-300us and a max of 1ms */ usleep_range(((data->cnt % 3) + 1) * 100, 1000); } return 0; } static __init void rb_ipi(void *ignore) { struct rb_test_data *data; int cpu = smp_processor_id(); data = &rb_data[cpu]; rb_write_something(data, true); } static __init int rb_hammer_test(void *arg) { while (!kthread_should_stop()) { /* Send an IPI to all cpus to write data! */ smp_call_function(rb_ipi, NULL, 1); /* No sleep, but for non preempt, let others run */ schedule(); } return 0; } static __init int test_ringbuffer(void) { struct task_struct *rb_hammer; struct trace_buffer *buffer; int cpu; int ret = 0; if (security_locked_down(LOCKDOWN_TRACEFS)) { pr_warn("Lockdown is enabled, skipping ring buffer tests\n"); return 0; } pr_info("Running ring buffer tests...\n"); buffer = ring_buffer_alloc(RB_TEST_BUFFER_SIZE, RB_FL_OVERWRITE); if (WARN_ON(!buffer)) return 0; /* Disable buffer so that threads can't write to it yet */ ring_buffer_record_off(buffer); for_each_online_cpu(cpu) { rb_data[cpu].buffer = buffer; rb_data[cpu].cpu = cpu; rb_data[cpu].cnt = cpu; rb_threads[cpu] = kthread_run_on_cpu(rb_test, &rb_data[cpu], cpu, "rbtester/%u"); if (WARN_ON(IS_ERR(rb_threads[cpu]))) { pr_cont("FAILED\n"); ret = PTR_ERR(rb_threads[cpu]); goto out_free; } } /* Now create the rb hammer! */ rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer"); if (WARN_ON(IS_ERR(rb_hammer))) { pr_cont("FAILED\n"); ret = PTR_ERR(rb_hammer); goto out_free; } ring_buffer_record_on(buffer); /* * Show buffer is enabled before setting rb_test_started. * Yes there's a small race window where events could be * dropped and the thread won't catch it. But when a ring * buffer gets enabled, there will always be some kind of * delay before other CPUs see it. Thus, we don't care about * those dropped events. We care about events dropped after * the threads see that the buffer is active. */ smp_wmb(); rb_test_started = true; set_current_state(TASK_INTERRUPTIBLE); /* Just run for 10 seconds */ schedule_timeout(10 * HZ); kthread_stop(rb_hammer); out_free: for_each_online_cpu(cpu) { if (!rb_threads[cpu]) break; kthread_stop(rb_threads[cpu]); } if (ret) { ring_buffer_free(buffer); return ret; } /* Report! */ pr_info("finished\n"); for_each_online_cpu(cpu) { struct ring_buffer_event *event; struct rb_test_data *data = &rb_data[cpu]; struct rb_item *item; unsigned long total_events; unsigned long total_dropped; unsigned long total_written; unsigned long total_alloc; unsigned long total_read = 0; unsigned long total_size = 0; unsigned long total_len = 0; unsigned long total_lost = 0; unsigned long lost; int big_event_size; int small_event_size; ret = -1; total_events = data->events + data->events_nested; total_written = data->bytes_written + data->bytes_written_nested; total_alloc = data->bytes_alloc + data->bytes_alloc_nested; total_dropped = data->bytes_dropped + data->bytes_dropped_nested; big_event_size = data->max_size + data->max_size_nested; small_event_size = data->min_size + data->min_size_nested; pr_info("CPU %d:\n", cpu); pr_info(" events: %ld\n", total_events); pr_info(" dropped bytes: %ld\n", total_dropped); pr_info(" alloced bytes: %ld\n", total_alloc); pr_info(" written bytes: %ld\n", total_written); pr_info(" biggest event: %d\n", big_event_size); pr_info(" smallest event: %d\n", small_event_size); if (RB_WARN_ON(buffer, total_dropped)) break; ret = 0; while ((event = ring_buffer_consume(buffer, cpu, NULL, &lost))) { total_lost += lost; item = ring_buffer_event_data(event); total_len += ring_buffer_event_length(event); total_size += item->size + sizeof(struct rb_item); if (memcmp(&item->str[0], rb_string, item->size) != 0) { pr_info("FAILED!\n"); pr_info("buffer had: %.*s\n", item->size, item->str); pr_info("expected: %.*s\n", item->size, rb_string); RB_WARN_ON(buffer, 1); ret = -1; break; } total_read++; } if (ret) break; ret = -1; pr_info(" read events: %ld\n", total_read); pr_info(" lost events: %ld\n", total_lost); pr_info(" total events: %ld\n", total_lost + total_read); pr_info(" recorded len bytes: %ld\n", total_len); pr_info(" recorded size bytes: %ld\n", total_size); if (total_lost) { pr_info(" With dropped events, record len and size may not match\n" " alloced and written from above\n"); } else { if (RB_WARN_ON(buffer, total_len != total_alloc || total_size != total_written)) break; } if (RB_WARN_ON(buffer, total_lost + total_read != total_events)) break; ret = 0; } if (!ret) pr_info("Ring buffer PASSED!\n"); ring_buffer_free(buffer); return 0; } late_initcall(test_ringbuffer); #endif /* CONFIG_RING_BUFFER_STARTUP_TEST */
19415 19411 12331 12321 262 42 262 2 261 237 19504 19477 263 263 2 263 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 // SPDX-License-Identifier: GPL-2.0 /* * trace context switch * * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com> * */ #include <linux/module.h> #include <linux/kallsyms.h> #include <linux/uaccess.h> #include <linux/kmemleak.h> #include <linux/ftrace.h> #include <trace/events/sched.h> #include "trace.h" #define RECORD_CMDLINE 1 #define RECORD_TGID 2 static int sched_cmdline_ref; static int sched_tgid_ref; static DEFINE_MUTEX(sched_register_mutex); static void probe_sched_switch(void *ignore, bool preempt, struct task_struct *prev, struct task_struct *next, unsigned int prev_state) { int flags; flags = (RECORD_TGID * !!sched_tgid_ref) + (RECORD_CMDLINE * !!sched_cmdline_ref); if (!flags) return; tracing_record_taskinfo_sched_switch(prev, next, flags); } static void probe_sched_wakeup(void *ignore, struct task_struct *wakee) { int flags; flags = (RECORD_TGID * !!sched_tgid_ref) + (RECORD_CMDLINE * !!sched_cmdline_ref); if (!flags) return; tracing_record_taskinfo_sched_switch(current, wakee, flags); } static int tracing_sched_register(void) { int ret; ret = register_trace_sched_wakeup(probe_sched_wakeup, NULL); if (ret) { pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup\n"); return ret; } ret = register_trace_sched_wakeup_new(probe_sched_wakeup, NULL); if (ret) { pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup_new\n"); goto fail_deprobe; } ret = register_trace_sched_switch(probe_sched_switch, NULL); if (ret) { pr_info("sched trace: Couldn't activate tracepoint" " probe to kernel_sched_switch\n"); goto fail_deprobe_wake_new; } return ret; fail_deprobe_wake_new: unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL); fail_deprobe: unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); return ret; } static void tracing_sched_unregister(void) { unregister_trace_sched_switch(probe_sched_switch, NULL); unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL); unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); } static void tracing_start_sched_switch(int ops) { bool sched_register; mutex_lock(&sched_register_mutex); sched_register = (!sched_cmdline_ref && !sched_tgid_ref); switch (ops) { case RECORD_CMDLINE: sched_cmdline_ref++; break; case RECORD_TGID: sched_tgid_ref++; break; } if (sched_register && (sched_cmdline_ref || sched_tgid_ref)) tracing_sched_register(); mutex_unlock(&sched_register_mutex); } static void tracing_stop_sched_switch(int ops) { mutex_lock(&sched_register_mutex); switch (ops) { case RECORD_CMDLINE: sched_cmdline_ref--; break; case RECORD_TGID: sched_tgid_ref--; break; } if (!sched_cmdline_ref && !sched_tgid_ref) tracing_sched_unregister(); mutex_unlock(&sched_register_mutex); } void tracing_start_cmdline_record(void) { tracing_start_sched_switch(RECORD_CMDLINE); } void tracing_stop_cmdline_record(void) { tracing_stop_sched_switch(RECORD_CMDLINE); } void tracing_start_tgid_record(void) { tracing_start_sched_switch(RECORD_TGID); } void tracing_stop_tgid_record(void) { tracing_stop_sched_switch(RECORD_TGID); } /* * The tgid_map array maps from pid to tgid; i.e. the value stored at index i * is the tgid last observed corresponding to pid=i. */ static int *tgid_map; /* The maximum valid index into tgid_map. */ static size_t tgid_map_max; #define SAVED_CMDLINES_DEFAULT 128 #define NO_CMDLINE_MAP UINT_MAX /* * Preemption must be disabled before acquiring trace_cmdline_lock. * The various trace_arrays' max_lock must be acquired in a context * where interrupt is disabled. */ static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; struct saved_cmdlines_buffer { unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; unsigned *map_cmdline_to_pid; unsigned cmdline_num; int cmdline_idx; char saved_cmdlines[]; }; static struct saved_cmdlines_buffer *savedcmd; /* Holds the size of a cmdline and pid element */ #define SAVED_CMDLINE_MAP_ELEMENT_SIZE(s) \ (TASK_COMM_LEN + sizeof((s)->map_cmdline_to_pid[0])) static inline char *get_saved_cmdlines(int idx) { return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN]; } static inline void set_cmdline(int idx, const char *cmdline) { strscpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); } static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) { int order = get_order(sizeof(*s) + s->cmdline_num * TASK_COMM_LEN); kmemleak_free(s); free_pages((unsigned long)s, order); } static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val) { struct saved_cmdlines_buffer *s; struct page *page; int orig_size, size; int order; /* Figure out how much is needed to hold the given number of cmdlines */ orig_size = sizeof(*s) + val * SAVED_CMDLINE_MAP_ELEMENT_SIZE(s); order = get_order(orig_size); size = 1 << (order + PAGE_SHIFT); page = alloc_pages(GFP_KERNEL, order); if (!page) return NULL; s = page_address(page); kmemleak_alloc(s, size, 1, GFP_KERNEL); memset(s, 0, sizeof(*s)); /* Round up to actual allocation */ val = (size - sizeof(*s)) / SAVED_CMDLINE_MAP_ELEMENT_SIZE(s); s->cmdline_num = val; /* Place map_cmdline_to_pid array right after saved_cmdlines */ s->map_cmdline_to_pid = (unsigned *)&s->saved_cmdlines[val * TASK_COMM_LEN]; memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(s->map_pid_to_cmdline)); memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP, val * sizeof(*s->map_cmdline_to_pid)); return s; } int trace_create_savedcmd(void) { savedcmd = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT); return savedcmd ? 0 : -ENOMEM; } int trace_save_cmdline(struct task_struct *tsk) { unsigned tpid, idx; /* treat recording of idle task as a success */ if (!tsk->pid) return 1; BUILD_BUG_ON(!is_power_of_2(PID_MAX_DEFAULT)); tpid = tsk->pid & (PID_MAX_DEFAULT - 1); /* * It's not the end of the world if we don't get * the lock, but we also don't want to spin * nor do we want to disable interrupts, * so if we miss here, then better luck next time. * * This is called within the scheduler and wake up, so interrupts * had better been disabled and run queue lock been held. */ lockdep_assert_preemption_disabled(); if (!arch_spin_trylock(&trace_cmdline_lock)) return 0; idx = savedcmd->map_pid_to_cmdline[tpid]; if (idx == NO_CMDLINE_MAP) { idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num; savedcmd->map_pid_to_cmdline[tpid] = idx; savedcmd->cmdline_idx = idx; } savedcmd->map_cmdline_to_pid[idx] = tsk->pid; set_cmdline(idx, tsk->comm); arch_spin_unlock(&trace_cmdline_lock); return 1; } static void __trace_find_cmdline(int pid, char comm[]) { unsigned map; int tpid; if (!pid) { strcpy(comm, "<idle>"); return; } if (WARN_ON_ONCE(pid < 0)) { strcpy(comm, "<XXX>"); return; } tpid = pid & (PID_MAX_DEFAULT - 1); map = savedcmd->map_pid_to_cmdline[tpid]; if (map != NO_CMDLINE_MAP) { tpid = savedcmd->map_cmdline_to_pid[map]; if (tpid == pid) { strscpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN); return; } } strcpy(comm, "<...>"); } void trace_find_cmdline(int pid, char comm[]) { preempt_disable(); arch_spin_lock(&trace_cmdline_lock); __trace_find_cmdline(pid, comm); arch_spin_unlock(&trace_cmdline_lock); preempt_enable(); } static int *trace_find_tgid_ptr(int pid) { /* * Pairs with the smp_store_release in set_tracer_flag() to ensure that * if we observe a non-NULL tgid_map then we also observe the correct * tgid_map_max. */ int *map = smp_load_acquire(&tgid_map); if (unlikely(!map || pid > tgid_map_max)) return NULL; return &map[pid]; } int trace_find_tgid(int pid) { int *ptr = trace_find_tgid_ptr(pid); return ptr ? *ptr : 0; } static int trace_save_tgid(struct task_struct *tsk) { int *ptr; /* treat recording of idle task as a success */ if (!tsk->pid) return 1; ptr = trace_find_tgid_ptr(tsk->pid); if (!ptr) return 0; *ptr = tsk->tgid; return 1; } static bool tracing_record_taskinfo_skip(int flags) { if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID)))) return true; if (!__this_cpu_read(trace_taskinfo_save)) return true; return false; } /** * tracing_record_taskinfo - record the task info of a task * * @task: task to record * @flags: TRACE_RECORD_CMDLINE for recording comm * TRACE_RECORD_TGID for recording tgid */ void tracing_record_taskinfo(struct task_struct *task, int flags) { bool done; if (tracing_record_taskinfo_skip(flags)) return; /* * Record as much task information as possible. If some fail, continue * to try to record the others. */ done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(task); done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(task); /* If recording any information failed, retry again soon. */ if (!done) return; __this_cpu_write(trace_taskinfo_save, false); } /** * tracing_record_taskinfo_sched_switch - record task info for sched_switch * * @prev: previous task during sched_switch * @next: next task during sched_switch * @flags: TRACE_RECORD_CMDLINE for recording comm * TRACE_RECORD_TGID for recording tgid */ void tracing_record_taskinfo_sched_switch(struct task_struct *prev, struct task_struct *next, int flags) { bool done; if (tracing_record_taskinfo_skip(flags)) return; /* * Record as much task information as possible. If some fail, continue * to try to record the others. */ done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(prev); done &= !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(next); done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(prev); done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(next); /* If recording any information failed, retry again soon. */ if (!done) return; __this_cpu_write(trace_taskinfo_save, false); } /* Helpers to record a specific task information */ void tracing_record_cmdline(struct task_struct *task) { tracing_record_taskinfo(task, TRACE_RECORD_CMDLINE); } void tracing_record_tgid(struct task_struct *task) { tracing_record_taskinfo(task, TRACE_RECORD_TGID); } int trace_alloc_tgid_map(void) { int *map; if (tgid_map) return 0; tgid_map_max = init_pid_ns.pid_max; map = kvzalloc_objs(*tgid_map, tgid_map_max + 1); if (!map) return -ENOMEM; /* * Pairs with smp_load_acquire() in * trace_find_tgid_ptr() to ensure that if it observes * the tgid_map we just allocated then it also observes * the corresponding tgid_map_max value. */ smp_store_release(&tgid_map, map); return 0; } static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos) { int pid = ++(*pos); return trace_find_tgid_ptr(pid); } static void *saved_tgids_start(struct seq_file *m, loff_t *pos) { int pid = *pos; return trace_find_tgid_ptr(pid); } static void saved_tgids_stop(struct seq_file *m, void *v) { } static int saved_tgids_show(struct seq_file *m, void *v) { int *entry = (int *)v; int pid = entry - tgid_map; int tgid = *entry; if (tgid == 0) return SEQ_SKIP; seq_printf(m, "%d %d\n", pid, tgid); return 0; } static const struct seq_operations tracing_saved_tgids_seq_ops = { .start = saved_tgids_start, .stop = saved_tgids_stop, .next = saved_tgids_next, .show = saved_tgids_show, }; static int tracing_saved_tgids_open(struct inode *inode, struct file *filp) { int ret; ret = tracing_check_open_get_tr(NULL); if (ret) return ret; return seq_open(filp, &tracing_saved_tgids_seq_ops); } const struct file_operations tracing_saved_tgids_fops = { .open = tracing_saved_tgids_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos) { unsigned int *ptr = v; if (*pos || m->count) ptr++; (*pos)++; for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num]; ptr++) { if (*ptr == -1 || *ptr == NO_CMDLINE_MAP) continue; return ptr; } return NULL; } static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos) { void *v; loff_t l = 0; preempt_disable(); arch_spin_lock(&trace_cmdline_lock); v = &savedcmd->map_cmdline_to_pid[0]; while (l <= *pos) { v = saved_cmdlines_next(m, v, &l); if (!v) return NULL; } return v; } static void saved_cmdlines_stop(struct seq_file *m, void *v) { arch_spin_unlock(&trace_cmdline_lock); preempt_enable(); } static int saved_cmdlines_show(struct seq_file *m, void *v) { char buf[TASK_COMM_LEN]; unsigned int *pid = v; __trace_find_cmdline(*pid, buf); seq_printf(m, "%d %s\n", *pid, buf); return 0; } static const struct seq_operations tracing_saved_cmdlines_seq_ops = { .start = saved_cmdlines_start, .next = saved_cmdlines_next, .stop = saved_cmdlines_stop, .show = saved_cmdlines_show, }; static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp) { int ret; ret = tracing_check_open_get_tr(NULL); if (ret) return ret; return seq_open(filp, &tracing_saved_cmdlines_seq_ops); } const struct file_operations tracing_saved_cmdlines_fops = { .open = tracing_saved_cmdlines_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; static ssize_t tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { char buf[64]; int r; preempt_disable(); arch_spin_lock(&trace_cmdline_lock); r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num); arch_spin_unlock(&trace_cmdline_lock); preempt_enable(); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } void trace_free_saved_cmdlines_buffer(void) { free_saved_cmdlines_buffer(savedcmd); } static int tracing_resize_saved_cmdlines(unsigned int val) { struct saved_cmdlines_buffer *s, *savedcmd_temp; s = allocate_cmdlines_buffer(val); if (!s) return -ENOMEM; preempt_disable(); arch_spin_lock(&trace_cmdline_lock); savedcmd_temp = savedcmd; savedcmd = s; arch_spin_unlock(&trace_cmdline_lock); preempt_enable(); free_saved_cmdlines_buffer(savedcmd_temp); return 0; } static ssize_t tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { unsigned long val; int ret; ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; /* must have at least 1 entry or less than PID_MAX_DEFAULT */ if (!val || val > PID_MAX_DEFAULT) return -EINVAL; ret = tracing_resize_saved_cmdlines((unsigned int)val); if (ret < 0) return ret; *ppos += cnt; return cnt; } const struct file_operations tracing_saved_cmdlines_size_fops = { .open = tracing_open_generic, .read = tracing_saved_cmdlines_size_read, .write = tracing_saved_cmdlines_size_write, };
9 5 4 9 5 5 5 2 1 1 1 9 9 11 2 9 9 1 8 8 8 4 1 3 3 3 3 3 3 3 3 5 3 1 1 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 // SPDX-License-Identifier: GPL-2.0-or-later /* GTP according to GSM TS 09.60 / 3GPP TS 29.060 * * (C) 2012-2014 by sysmocom - s.f.m.c. GmbH * (C) 2016 by Pablo Neira Ayuso <pablo@netfilter.org> * * Author: Harald Welte <hwelte@sysmocom.de> * Pablo Neira Ayuso <pablo@netfilter.org> * Andreas Schultz <aschultz@travelping.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/udp.h> #include <linux/rculist.h> #include <linux/jhash.h> #include <linux/if_tunnel.h> #include <linux/net.h> #include <linux/file.h> #include <linux/gtp.h> #include <net/flow.h> #include <net/inet_dscp.h> #include <net/net_namespace.h> #include <net/protocol.h> #include <net/inet_sock.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/udp.h> #include <net/udp_tunnel.h> #include <net/icmp.h> #include <net/xfrm.h> #include <net/genetlink.h> #include <net/netns/generic.h> #include <net/gtp.h> /* An active session for the subscriber. */ struct pdp_ctx { struct hlist_node hlist_tid; struct hlist_node hlist_addr; union { struct { u64 tid; u16 flow; } v0; struct { u32 i_tei; u32 o_tei; } v1; } u; u8 gtp_version; u16 af; union { struct in_addr addr; struct in6_addr addr6; } ms; union { struct in_addr addr; struct in6_addr addr6; } peer; struct sock *sk; struct net_device *dev; atomic_t tx_seq; struct rcu_head rcu_head; }; /* One instance of the GTP device. */ struct gtp_dev { struct list_head list; struct sock *sk0; struct sock *sk1u; u8 sk_created; struct net_device *dev; struct net *net; unsigned int role; unsigned int hash_size; struct hlist_head *tid_hash; struct hlist_head *addr_hash; u8 restart_count; }; struct echo_info { u16 af; u8 gtp_version; union { struct in_addr addr; } ms; union { struct in_addr addr; } peer; }; static unsigned int gtp_net_id __read_mostly; struct gtp_net { struct list_head gtp_dev_list; }; static u32 gtp_h_initval; static struct genl_family gtp_genl_family; enum gtp_multicast_groups { GTP_GENL_MCGRP, }; static const struct genl_multicast_group gtp_genl_mcgrps[] = { [GTP_GENL_MCGRP] = { .name = GTP_GENL_MCGRP_NAME }, }; static void pdp_context_delete(struct pdp_ctx *pctx); static inline u32 gtp0_hashfn(u64 tid) { u32 *tid32 = (u32 *) &tid; return jhash_2words(tid32[0], tid32[1], gtp_h_initval); } static inline u32 gtp1u_hashfn(u32 tid) { return jhash_1word(tid, gtp_h_initval); } static inline u32 ipv4_hashfn(__be32 ip) { return jhash_1word((__force u32)ip, gtp_h_initval); } static u32 ipv6_hashfn(const struct in6_addr *ip6) { return jhash_2words((__force u32)ip6->s6_addr32[0], (__force u32)ip6->s6_addr32[1], gtp_h_initval); } /* Resolve a PDP context structure based on the 64bit TID. */ static struct pdp_ctx *gtp0_pdp_find(struct gtp_dev *gtp, u64 tid, u16 family) { struct hlist_head *head; struct pdp_ctx *pdp; head = &gtp->tid_hash[gtp0_hashfn(tid) % gtp->hash_size]; hlist_for_each_entry_rcu(pdp, head, hlist_tid) { if (pdp->af == family && pdp->gtp_version == GTP_V0 && pdp->u.v0.tid == tid) return pdp; } return NULL; } /* Resolve a PDP context structure based on the 32bit TEI. */ static struct pdp_ctx *gtp1_pdp_find(struct gtp_dev *gtp, u32 tid, u16 family) { struct hlist_head *head; struct pdp_ctx *pdp; head = &gtp->tid_hash[gtp1u_hashfn(tid) % gtp->hash_size]; hlist_for_each_entry_rcu(pdp, head, hlist_tid) { if (pdp->af == family && pdp->gtp_version == GTP_V1 && pdp->u.v1.i_tei == tid) return pdp; } return NULL; } /* Resolve a PDP context based on IPv4 address of MS. */ static struct pdp_ctx *ipv4_pdp_find(struct gtp_dev *gtp, __be32 ms_addr) { struct hlist_head *head; struct pdp_ctx *pdp; head = &gtp->addr_hash[ipv4_hashfn(ms_addr) % gtp->hash_size]; hlist_for_each_entry_rcu(pdp, head, hlist_addr) { if (pdp->af == AF_INET && pdp->ms.addr.s_addr == ms_addr) return pdp; } return NULL; } /* 3GPP TS 29.060: PDN Connection: the association between a MS represented by * [...] one IPv6 *prefix* and a PDN represented by an APN. * * Then, 3GPP TS 29.061, Section 11.2.1.3 says: The size of the prefix shall be * according to the maximum prefix length for a global IPv6 address as * specified in the IPv6 Addressing Architecture, see RFC 4291. * * Finally, RFC 4291 section 2.5.4 states: All Global Unicast addresses other * than those that start with binary 000 have a 64-bit interface ID field * (i.e., n + m = 64). */ static bool ipv6_pdp_addr_equal(const struct in6_addr *a, const struct in6_addr *b) { return a->s6_addr32[0] == b->s6_addr32[0] && a->s6_addr32[1] == b->s6_addr32[1]; } static struct pdp_ctx *ipv6_pdp_find(struct gtp_dev *gtp, const struct in6_addr *ms_addr) { struct hlist_head *head; struct pdp_ctx *pdp; head = &gtp->addr_hash[ipv6_hashfn(ms_addr) % gtp->hash_size]; hlist_for_each_entry_rcu(pdp, head, hlist_addr) { if (pdp->af == AF_INET6 && ipv6_pdp_addr_equal(&pdp->ms.addr6, ms_addr)) return pdp; } return NULL; } static bool gtp_check_ms_ipv4(struct sk_buff *skb, struct pdp_ctx *pctx, unsigned int hdrlen, unsigned int role) { struct iphdr *iph; if (!pskb_may_pull(skb, hdrlen + sizeof(struct iphdr))) return false; iph = (struct iphdr *)(skb->data + hdrlen); if (role == GTP_ROLE_SGSN) return iph->daddr == pctx->ms.addr.s_addr; else return iph->saddr == pctx->ms.addr.s_addr; } static bool gtp_check_ms_ipv6(struct sk_buff *skb, struct pdp_ctx *pctx, unsigned int hdrlen, unsigned int role) { struct ipv6hdr *ip6h; int ret; if (!pskb_may_pull(skb, hdrlen + sizeof(struct ipv6hdr))) return false; ip6h = (struct ipv6hdr *)(skb->data + hdrlen); if ((ipv6_addr_type(&ip6h->saddr) & IPV6_ADDR_LINKLOCAL) || (ipv6_addr_type(&ip6h->daddr) & IPV6_ADDR_LINKLOCAL)) return false; if (role == GTP_ROLE_SGSN) { ret = ipv6_pdp_addr_equal(&ip6h->daddr, &pctx->ms.addr6); } else { ret = ipv6_pdp_addr_equal(&ip6h->saddr, &pctx->ms.addr6); } return ret; } /* Check if the inner IP address in this packet is assigned to any * existing mobile subscriber. */ static bool gtp_check_ms(struct sk_buff *skb, struct pdp_ctx *pctx, unsigned int hdrlen, unsigned int role, __u16 inner_proto) { switch (inner_proto) { case ETH_P_IP: return gtp_check_ms_ipv4(skb, pctx, hdrlen, role); case ETH_P_IPV6: return gtp_check_ms_ipv6(skb, pctx, hdrlen, role); } return false; } static int gtp_inner_proto(struct sk_buff *skb, unsigned int hdrlen, __u16 *inner_proto) { __u8 *ip_version, _ip_version; ip_version = skb_header_pointer(skb, hdrlen, sizeof(*ip_version), &_ip_version); if (!ip_version) return -1; switch (*ip_version & 0xf0) { case 0x40: *inner_proto = ETH_P_IP; break; case 0x60: *inner_proto = ETH_P_IPV6; break; default: return -1; } return 0; } static int gtp_rx(struct pdp_ctx *pctx, struct sk_buff *skb, unsigned int hdrlen, unsigned int role, __u16 inner_proto) { if (!gtp_check_ms(skb, pctx, hdrlen, role, inner_proto)) { netdev_dbg(pctx->dev, "No PDP ctx for this MS\n"); return 1; } /* Get rid of the GTP + UDP headers. */ if (iptunnel_pull_header(skb, hdrlen, htons(inner_proto), !net_eq(sock_net(pctx->sk), dev_net(pctx->dev)))) { pctx->dev->stats.rx_length_errors++; goto err; } netdev_dbg(pctx->dev, "forwarding packet from GGSN to uplink\n"); /* Now that the UDP and the GTP header have been removed, set up the * new network header. This is required by the upper layer to * calculate the transport header. */ skb_reset_network_header(skb); skb_reset_mac_header(skb); skb->dev = pctx->dev; dev_sw_netstats_rx_add(pctx->dev, skb->len); __netif_rx(skb); return 0; err: pctx->dev->stats.rx_dropped++; return -1; } static struct rtable *ip4_route_output_gtp(struct flowi4 *fl4, const struct sock *sk, __be32 daddr, __be32 saddr) { memset(fl4, 0, sizeof(*fl4)); fl4->flowi4_oif = sk->sk_bound_dev_if; fl4->daddr = daddr; fl4->saddr = saddr; fl4->flowi4_dscp = inet_sk_dscp(inet_sk(sk)); fl4->flowi4_scope = ip_sock_rt_scope(sk); fl4->flowi4_proto = sk->sk_protocol; return ip_route_output_key(sock_net(sk), fl4); } static struct rt6_info *ip6_route_output_gtp(struct net *net, struct flowi6 *fl6, const struct sock *sk, const struct in6_addr *daddr, struct in6_addr *saddr) { struct dst_entry *dst; memset(fl6, 0, sizeof(*fl6)); fl6->flowi6_oif = sk->sk_bound_dev_if; fl6->daddr = *daddr; fl6->saddr = *saddr; fl6->flowi6_proto = sk->sk_protocol; dst = ipv6_stub->ipv6_dst_lookup_flow(net, sk, fl6, NULL); if (IS_ERR(dst)) return ERR_PTR(-ENETUNREACH); return (struct rt6_info *)dst; } /* GSM TS 09.60. 7.3 * In all Path Management messages: * - TID: is not used and shall be set to 0. * - Flow Label is not used and shall be set to 0 * In signalling messages: * - number: this field is not yet used in signalling messages. * It shall be set to 255 by the sender and shall be ignored * by the receiver * Returns true if the echo req was correct, false otherwise. */ static bool gtp0_validate_echo_hdr(struct gtp0_header *gtp0) { return !(gtp0->tid || (gtp0->flags ^ 0x1e) || gtp0->number != 0xff || gtp0->flow); } /* msg_type has to be GTP_ECHO_REQ or GTP_ECHO_RSP */ static void gtp0_build_echo_msg(struct gtp0_header *hdr, __u8 msg_type) { int len_pkt, len_hdr; hdr->flags = 0x1e; /* v0, GTP-non-prime. */ hdr->type = msg_type; /* GSM TS 09.60. 7.3 In all Path Management Flow Label and TID * are not used and shall be set to 0. */ hdr->flow = 0; hdr->tid = 0; hdr->number = 0xff; hdr->spare[0] = 0xff; hdr->spare[1] = 0xff; hdr->spare[2] = 0xff; len_pkt = sizeof(struct gtp0_packet); len_hdr = sizeof(struct gtp0_header); if (msg_type == GTP_ECHO_RSP) hdr->length = htons(len_pkt - len_hdr); else hdr->length = 0; } static int gtp0_send_echo_resp_ip(struct gtp_dev *gtp, struct sk_buff *skb) { struct iphdr *iph = ip_hdr(skb); struct flowi4 fl4; struct rtable *rt; /* find route to the sender, * src address becomes dst address and vice versa. */ rt = ip4_route_output_gtp(&fl4, gtp->sk0, iph->saddr, iph->daddr); if (IS_ERR(rt)) { netdev_dbg(gtp->dev, "no route for echo response from %pI4\n", &iph->saddr); return -1; } udp_tunnel_xmit_skb(rt, gtp->sk0, skb, fl4.saddr, fl4.daddr, iph->tos, ip4_dst_hoplimit(&rt->dst), 0, htons(GTP0_PORT), htons(GTP0_PORT), !net_eq(sock_net(gtp->sk1u), dev_net(gtp->dev)), false, 0); return 0; } static int gtp0_send_echo_resp(struct gtp_dev *gtp, struct sk_buff *skb) { struct gtp0_packet *gtp_pkt; struct gtp0_header *gtp0; __be16 seq; gtp0 = (struct gtp0_header *)(skb->data + sizeof(struct udphdr)); if (!gtp0_validate_echo_hdr(gtp0)) return -1; seq = gtp0->seq; /* pull GTP and UDP headers */ skb_pull_data(skb, sizeof(struct gtp0_header) + sizeof(struct udphdr)); gtp_pkt = skb_push(skb, sizeof(struct gtp0_packet)); memset(gtp_pkt, 0, sizeof(struct gtp0_packet)); gtp0_build_echo_msg(&gtp_pkt->gtp0_h, GTP_ECHO_RSP); /* GSM TS 09.60. 7.3 The Sequence Number in a signalling response * message shall be copied from the signalling request message * that the GSN is replying to. */ gtp_pkt->gtp0_h.seq = seq; gtp_pkt->ie.tag = GTPIE_RECOVERY; gtp_pkt->ie.val = gtp->restart_count; switch (gtp->sk0->sk_family) { case AF_INET: if (gtp0_send_echo_resp_ip(gtp, skb) < 0) return -1; break; case AF_INET6: return -1; } return 0; } static int gtp_genl_fill_echo(struct sk_buff *skb, u32 snd_portid, u32 snd_seq, int flags, u32 type, struct echo_info echo) { void *genlh; genlh = genlmsg_put(skb, snd_portid, snd_seq, &gtp_genl_family, flags, type); if (!genlh) goto failure; if (nla_put_u32(skb, GTPA_VERSION, echo.gtp_version) || nla_put_be32(skb, GTPA_PEER_ADDRESS, echo.peer.addr.s_addr) || nla_put_be32(skb, GTPA_MS_ADDRESS, echo.ms.addr.s_addr)) goto failure; genlmsg_end(skb, genlh); return 0; failure: genlmsg_cancel(skb, genlh); return -EMSGSIZE; } static void gtp0_handle_echo_resp_ip(struct sk_buff *skb, struct echo_info *echo) { struct iphdr *iph = ip_hdr(skb); echo->ms.addr.s_addr = iph->daddr; echo->peer.addr.s_addr = iph->saddr; echo->gtp_version = GTP_V0; } static int gtp0_handle_echo_resp(struct gtp_dev *gtp, struct sk_buff *skb) { struct gtp0_header *gtp0; struct echo_info echo; struct sk_buff *msg; int ret; gtp0 = (struct gtp0_header *)(skb->data + sizeof(struct udphdr)); if (!gtp0_validate_echo_hdr(gtp0)) return -1; switch (gtp->sk0->sk_family) { case AF_INET: gtp0_handle_echo_resp_ip(skb, &echo); break; case AF_INET6: return -1; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return -ENOMEM; ret = gtp_genl_fill_echo(msg, 0, 0, 0, GTP_CMD_ECHOREQ, echo); if (ret < 0) { nlmsg_free(msg); return ret; } return genlmsg_multicast_netns(&gtp_genl_family, dev_net(gtp->dev), msg, 0, GTP_GENL_MCGRP, GFP_ATOMIC); } static int gtp_proto_to_family(__u16 proto) { switch (proto) { case ETH_P_IP: return AF_INET; case ETH_P_IPV6: return AF_INET6; default: WARN_ON_ONCE(1); break; } return AF_UNSPEC; } /* 1 means pass up to the stack, -1 means drop and 0 means decapsulated. */ static int gtp0_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb) { unsigned int hdrlen = sizeof(struct udphdr) + sizeof(struct gtp0_header); struct gtp0_header *gtp0; struct pdp_ctx *pctx; __u16 inner_proto; if (!pskb_may_pull(skb, hdrlen)) return -1; gtp0 = (struct gtp0_header *)(skb->data + sizeof(struct udphdr)); if ((gtp0->flags >> 5) != GTP_V0) return 1; /* If the sockets were created in kernel, it means that * there is no daemon running in userspace which would * handle echo request. */ if (gtp0->type == GTP_ECHO_REQ && gtp->sk_created) return gtp0_send_echo_resp(gtp, skb); if (gtp0->type == GTP_ECHO_RSP && gtp->sk_created) return gtp0_handle_echo_resp(gtp, skb); if (gtp0->type != GTP_TPDU) return 1; if (gtp_inner_proto(skb, hdrlen, &inner_proto) < 0) { netdev_dbg(gtp->dev, "GTP packet does not encapsulate an IP packet\n"); return -1; } pctx = gtp0_pdp_find(gtp, be64_to_cpu(gtp0->tid), gtp_proto_to_family(inner_proto)); if (!pctx) { netdev_dbg(gtp->dev, "No PDP ctx to decap skb=%p\n", skb); return 1; } return gtp_rx(pctx, skb, hdrlen, gtp->role, inner_proto); } /* msg_type has to be GTP_ECHO_REQ or GTP_ECHO_RSP */ static void gtp1u_build_echo_msg(struct gtp1_header_long *hdr, __u8 msg_type) { int len_pkt, len_hdr; /* S flag must be set to 1 */ hdr->flags = 0x32; /* v1, GTP-non-prime. */ hdr->type = msg_type; /* 3GPP TS 29.281 5.1 - TEID has to be set to 0 */ hdr->tid = 0; /* seq, npdu and next should be counted to the length of the GTP packet * that's why size of gtp1_header should be subtracted, * not size of gtp1_header_long. */ len_hdr = sizeof(struct gtp1_header); if (msg_type == GTP_ECHO_RSP) { len_pkt = sizeof(struct gtp1u_packet); hdr->length = htons(len_pkt - len_hdr); } else { /* GTP_ECHO_REQ does not carry GTP Information Element, * the why gtp1_header_long is used here. */ len_pkt = sizeof(struct gtp1_header_long); hdr->length = htons(len_pkt - len_hdr); } } static int gtp1u_send_echo_resp(struct gtp_dev *gtp, struct sk_buff *skb) { struct gtp1_header_long *gtp1u; struct gtp1u_packet *gtp_pkt; struct rtable *rt; struct flowi4 fl4; struct iphdr *iph; gtp1u = (struct gtp1_header_long *)(skb->data + sizeof(struct udphdr)); /* 3GPP TS 29.281 5.1 - For the Echo Request, Echo Response, * Error Indication and Supported Extension Headers Notification * messages, the S flag shall be set to 1 and TEID shall be set to 0. */ if (!(gtp1u->flags & GTP1_F_SEQ) || gtp1u->tid) return -1; /* pull GTP and UDP headers */ skb_pull_data(skb, sizeof(struct gtp1_header_long) + sizeof(struct udphdr)); gtp_pkt = skb_push(skb, sizeof(struct gtp1u_packet)); memset(gtp_pkt, 0, sizeof(struct gtp1u_packet)); gtp1u_build_echo_msg(&gtp_pkt->gtp1u_h, GTP_ECHO_RSP); /* 3GPP TS 29.281 7.7.2 - The Restart Counter value in the * Recovery information element shall not be used, i.e. it shall * be set to zero by the sender and shall be ignored by the receiver. * The Recovery information element is mandatory due to backwards * compatibility reasons. */ gtp_pkt->ie.tag = GTPIE_RECOVERY; gtp_pkt->ie.val = 0; iph = ip_hdr(skb); /* find route to the sender, * src address becomes dst address and vice versa. */ rt = ip4_route_output_gtp(&fl4, gtp->sk1u, iph->saddr, iph->daddr); if (IS_ERR(rt)) { netdev_dbg(gtp->dev, "no route for echo response from %pI4\n", &iph->saddr); return -1; } udp_tunnel_xmit_skb(rt, gtp->sk1u, skb, fl4.saddr, fl4.daddr, iph->tos, ip4_dst_hoplimit(&rt->dst), 0, htons(GTP1U_PORT), htons(GTP1U_PORT), !net_eq(sock_net(gtp->sk1u), dev_net(gtp->dev)), false, 0); return 0; } static int gtp1u_handle_echo_resp(struct gtp_dev *gtp, struct sk_buff *skb) { struct gtp1_header_long *gtp1u; struct echo_info echo; struct sk_buff *msg; struct iphdr *iph; int ret; gtp1u = (struct gtp1_header_long *)(skb->data + sizeof(struct udphdr)); /* 3GPP TS 29.281 5.1 - For the Echo Request, Echo Response, * Error Indication and Supported Extension Headers Notification * messages, the S flag shall be set to 1 and TEID shall be set to 0. */ if (!(gtp1u->flags & GTP1_F_SEQ) || gtp1u->tid) return -1; iph = ip_hdr(skb); echo.ms.addr.s_addr = iph->daddr; echo.peer.addr.s_addr = iph->saddr; echo.gtp_version = GTP_V1; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return -ENOMEM; ret = gtp_genl_fill_echo(msg, 0, 0, 0, GTP_CMD_ECHOREQ, echo); if (ret < 0) { nlmsg_free(msg); return ret; } return genlmsg_multicast_netns(&gtp_genl_family, dev_net(gtp->dev), msg, 0, GTP_GENL_MCGRP, GFP_ATOMIC); } static int gtp_parse_exthdrs(struct sk_buff *skb, unsigned int *hdrlen) { struct gtp_ext_hdr *gtp_exthdr, _gtp_exthdr; unsigned int offset = *hdrlen; __u8 *next_type, _next_type; /* From 29.060: "The Extension Header Length field specifies the length * of the particular Extension header in 4 octets units." * * This length field includes length field size itself (1 byte), * payload (variable length) and next type (1 byte). The extension * header is aligned to to 4 bytes. */ do { gtp_exthdr = skb_header_pointer(skb, offset, sizeof(*gtp_exthdr), &_gtp_exthdr); if (!gtp_exthdr || !gtp_exthdr->len) return -1; offset += gtp_exthdr->len * 4; /* From 29.060: "If no such Header follows, then the value of * the Next Extension Header Type shall be 0." */ next_type = skb_header_pointer(skb, offset - 1, sizeof(_next_type), &_next_type); if (!next_type) return -1; } while (*next_type != 0); *hdrlen = offset; return 0; } static int gtp1u_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb) { unsigned int hdrlen = sizeof(struct udphdr) + sizeof(struct gtp1_header); struct gtp1_header *gtp1; struct pdp_ctx *pctx; __u16 inner_proto; if (!pskb_may_pull(skb, hdrlen)) return -1; gtp1 = (struct gtp1_header *)(skb->data + sizeof(struct udphdr)); if ((gtp1->flags >> 5) != GTP_V1) return 1; /* If the sockets were created in kernel, it means that * there is no daemon running in userspace which would * handle echo request. */ if (gtp1->type == GTP_ECHO_REQ && gtp->sk_created) return gtp1u_send_echo_resp(gtp, skb); if (gtp1->type == GTP_ECHO_RSP && gtp->sk_created) return gtp1u_handle_echo_resp(gtp, skb); if (gtp1->type != GTP_TPDU) return 1; /* From 29.060: "This field shall be present if and only if any one or * more of the S, PN and E flags are set.". * * If any of the bit is set, then the remaining ones also have to be * set. */ if (gtp1->flags & GTP1_F_MASK) hdrlen += 4; /* Make sure the header is larger enough, including extensions. */ if (!pskb_may_pull(skb, hdrlen)) return -1; if (gtp_inner_proto(skb, hdrlen, &inner_proto) < 0) { netdev_dbg(gtp->dev, "GTP packet does not encapsulate an IP packet\n"); return -1; } gtp1 = (struct gtp1_header *)(skb->data + sizeof(struct udphdr)); pctx = gtp1_pdp_find(gtp, ntohl(gtp1->tid), gtp_proto_to_family(inner_proto)); if (!pctx) { netdev_dbg(gtp->dev, "No PDP ctx to decap skb=%p\n", skb); return 1; } if (gtp1->flags & GTP1_F_EXTHDR && gtp_parse_exthdrs(skb, &hdrlen) < 0) return -1; return gtp_rx(pctx, skb, hdrlen, gtp->role, inner_proto); } static void __gtp_encap_destroy(struct sock *sk) { struct gtp_dev *gtp; lock_sock(sk); gtp = sk->sk_user_data; if (gtp) { if (gtp->sk0 == sk) gtp->sk0 = NULL; else gtp->sk1u = NULL; WRITE_ONCE(udp_sk(sk)->encap_type, 0); rcu_assign_sk_user_data(sk, NULL); release_sock(sk); sock_put(sk); return; } release_sock(sk); } static void gtp_encap_destroy(struct sock *sk) { rtnl_lock(); __gtp_encap_destroy(sk); rtnl_unlock(); } static void gtp_encap_disable_sock(struct sock *sk) { if (!sk) return; __gtp_encap_destroy(sk); } static void gtp_encap_disable(struct gtp_dev *gtp) { if (gtp->sk_created) { udp_tunnel_sock_release(gtp->sk0->sk_socket); udp_tunnel_sock_release(gtp->sk1u->sk_socket); gtp->sk_created = false; gtp->sk0 = NULL; gtp->sk1u = NULL; } else { gtp_encap_disable_sock(gtp->sk0); gtp_encap_disable_sock(gtp->sk1u); } } /* UDP encapsulation receive handler. See net/ipv4/udp.c. * Return codes: 0: success, <0: error, >0: pass up to userspace UDP socket. */ static int gtp_encap_recv(struct sock *sk, struct sk_buff *skb) { struct gtp_dev *gtp; int ret = 0; gtp = rcu_dereference_sk_user_data(sk); if (!gtp) return 1; netdev_dbg(gtp->dev, "encap_recv sk=%p\n", sk); switch (READ_ONCE(udp_sk(sk)->encap_type)) { case UDP_ENCAP_GTP0: netdev_dbg(gtp->dev, "received GTP0 packet\n"); ret = gtp0_udp_encap_recv(gtp, skb); break; case UDP_ENCAP_GTP1U: netdev_dbg(gtp->dev, "received GTP1U packet\n"); ret = gtp1u_udp_encap_recv(gtp, skb); break; default: ret = -1; /* Shouldn't happen. */ } switch (ret) { case 1: netdev_dbg(gtp->dev, "pass up to the process\n"); break; case 0: break; case -1: netdev_dbg(gtp->dev, "GTP packet has been dropped\n"); kfree_skb(skb); ret = 0; break; } return ret; } static void gtp_dev_uninit(struct net_device *dev) { struct gtp_dev *gtp = netdev_priv(dev); gtp_encap_disable(gtp); } static inline void gtp0_push_header(struct sk_buff *skb, struct pdp_ctx *pctx) { int payload_len = skb->len; struct gtp0_header *gtp0; gtp0 = skb_push(skb, sizeof(*gtp0)); gtp0->flags = 0x1e; /* v0, GTP-non-prime. */ gtp0->type = GTP_TPDU; gtp0->length = htons(payload_len); gtp0->seq = htons((atomic_inc_return(&pctx->tx_seq) - 1) % 0xffff); gtp0->flow = htons(pctx->u.v0.flow); gtp0->number = 0xff; gtp0->spare[0] = gtp0->spare[1] = gtp0->spare[2] = 0xff; gtp0->tid = cpu_to_be64(pctx->u.v0.tid); } static inline void gtp1_push_header(struct sk_buff *skb, struct pdp_ctx *pctx) { int payload_len = skb->len; struct gtp1_header *gtp1; gtp1 = skb_push(skb, sizeof(*gtp1)); /* Bits 8 7 6 5 4 3 2 1 * +--+--+--+--+--+--+--+--+ * |version |PT| 0| E| S|PN| * +--+--+--+--+--+--+--+--+ * 0 0 1 1 1 0 0 0 */ gtp1->flags = 0x30; /* v1, GTP-non-prime. */ gtp1->type = GTP_TPDU; gtp1->length = htons(payload_len); gtp1->tid = htonl(pctx->u.v1.o_tei); /* TODO: Support for extension header, sequence number and N-PDU. * Update the length field if any of them is available. */ } struct gtp_pktinfo { struct sock *sk; union { struct flowi4 fl4; struct flowi6 fl6; }; union { struct rtable *rt; struct rt6_info *rt6; }; struct pdp_ctx *pctx; struct net_device *dev; __u8 tos; __be16 gtph_port; }; static void gtp_push_header(struct sk_buff *skb, struct gtp_pktinfo *pktinfo) { switch (pktinfo->pctx->gtp_version) { case GTP_V0: pktinfo->gtph_port = htons(GTP0_PORT); gtp0_push_header(skb, pktinfo->pctx); break; case GTP_V1: pktinfo->gtph_port = htons(GTP1U_PORT); gtp1_push_header(skb, pktinfo->pctx); break; } } static inline void gtp_set_pktinfo_ipv4(struct gtp_pktinfo *pktinfo, struct sock *sk, __u8 tos, struct pdp_ctx *pctx, struct rtable *rt, struct flowi4 *fl4, struct net_device *dev) { pktinfo->sk = sk; pktinfo->tos = tos; pktinfo->pctx = pctx; pktinfo->rt = rt; pktinfo->fl4 = *fl4; pktinfo->dev = dev; } static void gtp_set_pktinfo_ipv6(struct gtp_pktinfo *pktinfo, struct sock *sk, __u8 tos, struct pdp_ctx *pctx, struct rt6_info *rt6, struct flowi6 *fl6, struct net_device *dev) { pktinfo->sk = sk; pktinfo->tos = tos; pktinfo->pctx = pctx; pktinfo->rt6 = rt6; pktinfo->fl6 = *fl6; pktinfo->dev = dev; } static int gtp_build_skb_outer_ip4(struct sk_buff *skb, struct net_device *dev, struct gtp_pktinfo *pktinfo, struct pdp_ctx *pctx, __u8 tos, __be16 frag_off) { struct rtable *rt; struct flowi4 fl4; __be16 df; int mtu; rt = ip4_route_output_gtp(&fl4, pctx->sk, pctx->peer.addr.s_addr, inet_sk(pctx->sk)->inet_saddr); if (IS_ERR(rt)) { netdev_dbg(dev, "no route to SSGN %pI4\n", &pctx->peer.addr.s_addr); dev->stats.tx_carrier_errors++; goto err; } if (rt->dst.dev == dev) { netdev_dbg(dev, "circular route to SSGN %pI4\n", &pctx->peer.addr.s_addr); dev->stats.collisions++; goto err_rt; } /* This is similar to tnl_update_pmtu(). */ df = frag_off; if (df) { mtu = dst_mtu(&rt->dst) - dev->hard_header_len - sizeof(struct iphdr) - sizeof(struct udphdr); switch (pctx->gtp_version) { case GTP_V0: mtu -= sizeof(struct gtp0_header); break; case GTP_V1: mtu -= sizeof(struct gtp1_header); break; } } else { mtu = dst_mtu(&rt->dst); } skb_dst_update_pmtu_no_confirm(skb, mtu); if (frag_off & htons(IP_DF) && ((!skb_is_gso(skb) && skb->len > mtu) || (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu)))) { netdev_dbg(dev, "packet too big, fragmentation needed\n"); icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); goto err_rt; } gtp_set_pktinfo_ipv4(pktinfo, pctx->sk, tos, pctx, rt, &fl4, dev); gtp_push_header(skb, pktinfo); return 0; err_rt: ip_rt_put(rt); err: return -EBADMSG; } static int gtp_build_skb_outer_ip6(struct net *net, struct sk_buff *skb, struct net_device *dev, struct gtp_pktinfo *pktinfo, struct pdp_ctx *pctx, __u8 tos) { struct dst_entry *dst; struct rt6_info *rt; struct flowi6 fl6; int mtu; rt = ip6_route_output_gtp(net, &fl6, pctx->sk, &pctx->peer.addr6, &inet6_sk(pctx->sk)->saddr); if (IS_ERR(rt)) { netdev_dbg(dev, "no route to SSGN %pI6\n", &pctx->peer.addr6); dev->stats.tx_carrier_errors++; goto err; } dst = &rt->dst; if (rt->dst.dev == dev) { netdev_dbg(dev, "circular route to SSGN %pI6\n", &pctx->peer.addr6); dev->stats.collisions++; goto err_rt; } mtu = dst_mtu(&rt->dst) - dev->hard_header_len - sizeof(struct ipv6hdr) - sizeof(struct udphdr); switch (pctx->gtp_version) { case GTP_V0: mtu -= sizeof(struct gtp0_header); break; case GTP_V1: mtu -= sizeof(struct gtp1_header); break; } skb_dst_update_pmtu_no_confirm(skb, mtu); if ((!skb_is_gso(skb) && skb->len > mtu) || (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))) { netdev_dbg(dev, "packet too big, fragmentation needed\n"); icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); goto err_rt; } gtp_set_pktinfo_ipv6(pktinfo, pctx->sk, tos, pctx, rt, &fl6, dev); gtp_push_header(skb, pktinfo); return 0; err_rt: dst_release(dst); err: return -EBADMSG; } static int gtp_build_skb_ip4(struct sk_buff *skb, struct net_device *dev, struct gtp_pktinfo *pktinfo) { struct gtp_dev *gtp = netdev_priv(dev); struct net *net = gtp->net; struct pdp_ctx *pctx; struct iphdr *iph; int ret; /* Read the IP destination address and resolve the PDP context. * Prepend PDP header with TEI/TID from PDP ctx. */ iph = ip_hdr(skb); if (gtp->role == GTP_ROLE_SGSN) pctx = ipv4_pdp_find(gtp, iph->saddr); else pctx = ipv4_pdp_find(gtp, iph->daddr); if (!pctx) { netdev_dbg(dev, "no PDP ctx found for %pI4, skip\n", &iph->daddr); return -ENOENT; } netdev_dbg(dev, "found PDP context %p\n", pctx); switch (pctx->sk->sk_family) { case AF_INET: ret = gtp_build_skb_outer_ip4(skb, dev, pktinfo, pctx, iph->tos, iph->frag_off); break; case AF_INET6: ret = gtp_build_skb_outer_ip6(net, skb, dev, pktinfo, pctx, iph->tos); break; default: ret = -1; WARN_ON_ONCE(1); break; } if (ret < 0) return ret; netdev_dbg(dev, "gtp -> IP src: %pI4 dst: %pI4\n", &iph->saddr, &iph->daddr); return 0; } static int gtp_build_skb_ip6(struct sk_buff *skb, struct net_device *dev, struct gtp_pktinfo *pktinfo) { struct gtp_dev *gtp = netdev_priv(dev); struct net *net = gtp->net; struct pdp_ctx *pctx; struct ipv6hdr *ip6h; __u8 tos; int ret; /* Read the IP destination address and resolve the PDP context. * Prepend PDP header with TEI/TID from PDP ctx. */ ip6h = ipv6_hdr(skb); if (gtp->role == GTP_ROLE_SGSN) pctx = ipv6_pdp_find(gtp, &ip6h->saddr); else pctx = ipv6_pdp_find(gtp, &ip6h->daddr); if (!pctx) { netdev_dbg(dev, "no PDP ctx found for %pI6, skip\n", &ip6h->daddr); return -ENOENT; } netdev_dbg(dev, "found PDP context %p\n", pctx); tos = ipv6_get_dsfield(ip6h); switch (pctx->sk->sk_family) { case AF_INET: ret = gtp_build_skb_outer_ip4(skb, dev, pktinfo, pctx, tos, 0); break; case AF_INET6: ret = gtp_build_skb_outer_ip6(net, skb, dev, pktinfo, pctx, tos); break; default: ret = -1; WARN_ON_ONCE(1); break; } if (ret < 0) return ret; netdev_dbg(dev, "gtp -> IP src: %pI6 dst: %pI6\n", &ip6h->saddr, &ip6h->daddr); return 0; } static netdev_tx_t gtp_dev_xmit(struct sk_buff *skb, struct net_device *dev) { unsigned int proto = ntohs(skb->protocol); struct gtp_pktinfo pktinfo; int err; /* Ensure there is sufficient headroom. */ if (skb_cow_head(skb, dev->needed_headroom)) goto tx_err; if (!pskb_inet_may_pull(skb)) goto tx_err; skb_reset_inner_headers(skb); /* PDP context lookups in gtp_build_skb_*() need rcu read-side lock. */ rcu_read_lock(); switch (proto) { case ETH_P_IP: err = gtp_build_skb_ip4(skb, dev, &pktinfo); break; case ETH_P_IPV6: err = gtp_build_skb_ip6(skb, dev, &pktinfo); break; default: err = -EOPNOTSUPP; break; } rcu_read_unlock(); if (err < 0) goto tx_err; switch (pktinfo.pctx->sk->sk_family) { case AF_INET: udp_tunnel_xmit_skb(pktinfo.rt, pktinfo.sk, skb, pktinfo.fl4.saddr, pktinfo.fl4.daddr, pktinfo.tos, ip4_dst_hoplimit(&pktinfo.rt->dst), 0, pktinfo.gtph_port, pktinfo.gtph_port, !net_eq(sock_net(pktinfo.pctx->sk), dev_net(dev)), false, 0); break; case AF_INET6: #if IS_ENABLED(CONFIG_IPV6) udp_tunnel6_xmit_skb(&pktinfo.rt6->dst, pktinfo.sk, skb, dev, &pktinfo.fl6.saddr, &pktinfo.fl6.daddr, pktinfo.tos, ip6_dst_hoplimit(&pktinfo.rt->dst), 0, pktinfo.gtph_port, pktinfo.gtph_port, false, 0); #else goto tx_err; #endif break; } return NETDEV_TX_OK; tx_err: dev->stats.tx_errors++; dev_kfree_skb(skb); return NETDEV_TX_OK; } static const struct net_device_ops gtp_netdev_ops = { .ndo_uninit = gtp_dev_uninit, .ndo_start_xmit = gtp_dev_xmit, }; static const struct device_type gtp_type = { .name = "gtp", }; #define GTP_TH_MAXLEN (sizeof(struct udphdr) + sizeof(struct gtp0_header)) #define GTP_IPV4_MAXLEN (sizeof(struct iphdr) + GTP_TH_MAXLEN) static void gtp_link_setup(struct net_device *dev) { struct gtp_dev *gtp = netdev_priv(dev); dev->netdev_ops = &gtp_netdev_ops; dev->needs_free_netdev = true; SET_NETDEV_DEVTYPE(dev, &gtp_type); dev->hard_header_len = 0; dev->addr_len = 0; dev->mtu = ETH_DATA_LEN - GTP_IPV4_MAXLEN; /* Zero header length. */ dev->type = ARPHRD_NONE; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->priv_flags |= IFF_NO_QUEUE; dev->lltx = true; netif_keep_dst(dev); dev->needed_headroom = LL_MAX_HEADER + GTP_IPV4_MAXLEN; gtp->dev = dev; } static int gtp_hashtable_new(struct gtp_dev *gtp, int hsize); static int gtp_encap_enable(struct gtp_dev *gtp, struct nlattr *data[]); static void gtp_destructor(struct net_device *dev) { struct gtp_dev *gtp = netdev_priv(dev); kfree(gtp->addr_hash); kfree(gtp->tid_hash); } static int gtp_sock_udp_config(struct udp_port_cfg *udp_conf, const struct nlattr *nla, int family) { udp_conf->family = family; switch (udp_conf->family) { case AF_INET: udp_conf->local_ip.s_addr = nla_get_be32(nla); break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: udp_conf->local_ip6 = nla_get_in6_addr(nla); break; #endif default: return -EOPNOTSUPP; } return 0; } static struct sock *gtp_create_sock(int type, struct gtp_dev *gtp, const struct nlattr *nla, int family) { struct udp_tunnel_sock_cfg tuncfg = {}; struct udp_port_cfg udp_conf = {}; struct net *net = gtp->net; struct socket *sock; int err; if (nla) { err = gtp_sock_udp_config(&udp_conf, nla, family); if (err < 0) return ERR_PTR(err); } else { udp_conf.local_ip.s_addr = htonl(INADDR_ANY); udp_conf.family = AF_INET; } if (type == UDP_ENCAP_GTP0) udp_conf.local_udp_port = htons(GTP0_PORT); else if (type == UDP_ENCAP_GTP1U) udp_conf.local_udp_port = htons(GTP1U_PORT); else return ERR_PTR(-EINVAL); err = udp_sock_create(net, &udp_conf, &sock); if (err) return ERR_PTR(err); tuncfg.sk_user_data = gtp; tuncfg.encap_type = type; tuncfg.encap_rcv = gtp_encap_recv; tuncfg.encap_destroy = NULL; setup_udp_tunnel_sock(net, sock, &tuncfg); return sock->sk; } static int gtp_create_sockets(struct gtp_dev *gtp, const struct nlattr *nla, int family) { struct sock *sk1u; struct sock *sk0; sk0 = gtp_create_sock(UDP_ENCAP_GTP0, gtp, nla, family); if (IS_ERR(sk0)) return PTR_ERR(sk0); sk1u = gtp_create_sock(UDP_ENCAP_GTP1U, gtp, nla, family); if (IS_ERR(sk1u)) { udp_tunnel_sock_release(sk0->sk_socket); return PTR_ERR(sk1u); } gtp->sk_created = true; gtp->sk0 = sk0; gtp->sk1u = sk1u; return 0; } #define GTP_TH_MAXLEN (sizeof(struct udphdr) + sizeof(struct gtp0_header)) #define GTP_IPV6_MAXLEN (sizeof(struct ipv6hdr) + GTP_TH_MAXLEN) static int gtp_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct net *link_net = rtnl_newlink_link_net(params); struct nlattr **data = params->data; unsigned int role = GTP_ROLE_GGSN; struct gtp_dev *gtp; struct gtp_net *gn; int hashsize, err; #if !IS_ENABLED(CONFIG_IPV6) if (data[IFLA_GTP_LOCAL6]) return -EAFNOSUPPORT; #endif gtp = netdev_priv(dev); if (!data[IFLA_GTP_PDP_HASHSIZE]) { hashsize = 1024; } else { hashsize = nla_get_u32(data[IFLA_GTP_PDP_HASHSIZE]); if (!hashsize) hashsize = 1024; } if (data[IFLA_GTP_ROLE]) { role = nla_get_u32(data[IFLA_GTP_ROLE]); if (role > GTP_ROLE_SGSN) return -EINVAL; } gtp->role = role; gtp->restart_count = nla_get_u8_default(data[IFLA_GTP_RESTART_COUNT], 0); gtp->net = link_net; err = gtp_hashtable_new(gtp, hashsize); if (err < 0) return err; if (data[IFLA_GTP_CREATE_SOCKETS]) { if (data[IFLA_GTP_LOCAL6]) err = gtp_create_sockets(gtp, data[IFLA_GTP_LOCAL6], AF_INET6); else err = gtp_create_sockets(gtp, data[IFLA_GTP_LOCAL], AF_INET); } else { err = gtp_encap_enable(gtp, data); } if (err < 0) goto out_hashtable; if ((gtp->sk0 && gtp->sk0->sk_family == AF_INET6) || (gtp->sk1u && gtp->sk1u->sk_family == AF_INET6)) { dev->mtu = ETH_DATA_LEN - GTP_IPV6_MAXLEN; dev->needed_headroom = LL_MAX_HEADER + GTP_IPV6_MAXLEN; } err = register_netdevice(dev); if (err < 0) { netdev_dbg(dev, "failed to register new netdev %d\n", err); goto out_encap; } gn = net_generic(link_net, gtp_net_id); list_add(&gtp->list, &gn->gtp_dev_list); dev->priv_destructor = gtp_destructor; netdev_dbg(dev, "registered new GTP interface\n"); return 0; out_encap: gtp_encap_disable(gtp); out_hashtable: kfree(gtp->addr_hash); kfree(gtp->tid_hash); return err; } static void gtp_dellink(struct net_device *dev, struct list_head *head) { struct gtp_dev *gtp = netdev_priv(dev); struct hlist_node *next; struct pdp_ctx *pctx; int i; for (i = 0; i < gtp->hash_size; i++) hlist_for_each_entry_safe(pctx, next, &gtp->tid_hash[i], hlist_tid) pdp_context_delete(pctx); list_del(&gtp->list); unregister_netdevice_queue(dev, head); } static const struct nla_policy gtp_policy[IFLA_GTP_MAX + 1] = { [IFLA_GTP_FD0] = { .type = NLA_U32 }, [IFLA_GTP_FD1] = { .type = NLA_U32 }, [IFLA_GTP_PDP_HASHSIZE] = { .type = NLA_U32 }, [IFLA_GTP_ROLE] = { .type = NLA_U32 }, [IFLA_GTP_CREATE_SOCKETS] = { .type = NLA_U8 }, [IFLA_GTP_RESTART_COUNT] = { .type = NLA_U8 }, [IFLA_GTP_LOCAL] = { .type = NLA_U32 }, [IFLA_GTP_LOCAL6] = { .len = sizeof(struct in6_addr) }, }; static int gtp_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (!data) return -EINVAL; return 0; } static size_t gtp_get_size(const struct net_device *dev) { return nla_total_size(sizeof(__u32)) + /* IFLA_GTP_PDP_HASHSIZE */ nla_total_size(sizeof(__u32)) + /* IFLA_GTP_ROLE */ nla_total_size(sizeof(__u8)); /* IFLA_GTP_RESTART_COUNT */ } static int gtp_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct gtp_dev *gtp = netdev_priv(dev); if (nla_put_u32(skb, IFLA_GTP_PDP_HASHSIZE, gtp->hash_size)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_GTP_ROLE, gtp->role)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_GTP_RESTART_COUNT, gtp->restart_count)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static struct rtnl_link_ops gtp_link_ops __read_mostly = { .kind = "gtp", .maxtype = IFLA_GTP_MAX, .policy = gtp_policy, .priv_size = sizeof(struct gtp_dev), .setup = gtp_link_setup, .validate = gtp_validate, .newlink = gtp_newlink, .dellink = gtp_dellink, .get_size = gtp_get_size, .fill_info = gtp_fill_info, }; static int gtp_hashtable_new(struct gtp_dev *gtp, int hsize) { int i; gtp->addr_hash = kmalloc_objs(struct hlist_head, hsize, GFP_KERNEL | __GFP_NOWARN); if (gtp->addr_hash == NULL) return -ENOMEM; gtp->tid_hash = kmalloc_objs(struct hlist_head, hsize, GFP_KERNEL | __GFP_NOWARN); if (gtp->tid_hash == NULL) goto err1; gtp->hash_size = hsize; for (i = 0; i < hsize; i++) { INIT_HLIST_HEAD(&gtp->addr_hash[i]); INIT_HLIST_HEAD(&gtp->tid_hash[i]); } return 0; err1: kfree(gtp->addr_hash); return -ENOMEM; } static struct sock *gtp_encap_enable_socket(int fd, int type, struct gtp_dev *gtp) { struct udp_tunnel_sock_cfg tuncfg = {NULL}; struct socket *sock; struct sock *sk; int err; pr_debug("enable gtp on %d, %d\n", fd, type); sock = sockfd_lookup(fd, &err); if (!sock) { pr_debug("gtp socket fd=%d not found\n", fd); return ERR_PTR(err); } sk = sock->sk; if (sk->sk_protocol != IPPROTO_UDP || sk->sk_type != SOCK_DGRAM || (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)) { pr_debug("socket fd=%d not UDP\n", fd); sk = ERR_PTR(-EINVAL); goto out_sock; } if (sk->sk_family == AF_INET6 && !sk->sk_ipv6only) { sk = ERR_PTR(-EADDRNOTAVAIL); goto out_sock; } lock_sock(sk); if (sk->sk_user_data) { sk = ERR_PTR(-EBUSY); goto out_rel_sock; } sock_hold(sk); tuncfg.sk_user_data = gtp; tuncfg.encap_type = type; tuncfg.encap_rcv = gtp_encap_recv; tuncfg.encap_destroy = gtp_encap_destroy; setup_udp_tunnel_sock(sock_net(sock->sk), sock, &tuncfg); out_rel_sock: release_sock(sock->sk); out_sock: sockfd_put(sock); return sk; } static int gtp_encap_enable(struct gtp_dev *gtp, struct nlattr *data[]) { struct sock *sk1u = NULL; struct sock *sk0 = NULL; if (!data[IFLA_GTP_FD0] && !data[IFLA_GTP_FD1]) return -EINVAL; if (data[IFLA_GTP_FD0]) { int fd0 = nla_get_u32(data[IFLA_GTP_FD0]); if (fd0 >= 0) { sk0 = gtp_encap_enable_socket(fd0, UDP_ENCAP_GTP0, gtp); if (IS_ERR(sk0)) return PTR_ERR(sk0); } } if (data[IFLA_GTP_FD1]) { int fd1 = nla_get_u32(data[IFLA_GTP_FD1]); if (fd1 >= 0) { sk1u = gtp_encap_enable_socket(fd1, UDP_ENCAP_GTP1U, gtp); if (IS_ERR(sk1u)) { gtp_encap_disable_sock(sk0); return PTR_ERR(sk1u); } } } gtp->sk0 = sk0; gtp->sk1u = sk1u; if (sk0 && sk1u && sk0->sk_family != sk1u->sk_family) { gtp_encap_disable_sock(sk0); gtp_encap_disable_sock(sk1u); return -EINVAL; } return 0; } static struct gtp_dev *gtp_find_dev(struct net *src_net, struct nlattr *nla[]) { struct gtp_dev *gtp = NULL; struct net_device *dev; struct net *net; /* Examine the link attributes and figure out which network namespace * we are talking about. */ if (nla[GTPA_NET_NS_FD]) net = get_net_ns_by_fd(nla_get_u32(nla[GTPA_NET_NS_FD])); else net = get_net(src_net); if (IS_ERR(net)) return NULL; /* Check if there's an existing gtpX device to configure */ dev = dev_get_by_index_rcu(net, nla_get_u32(nla[GTPA_LINK])); if (dev && dev->netdev_ops == &gtp_netdev_ops) gtp = netdev_priv(dev); put_net(net); return gtp; } static void gtp_pdp_fill(struct pdp_ctx *pctx, struct genl_info *info) { pctx->gtp_version = nla_get_u32(info->attrs[GTPA_VERSION]); switch (pctx->gtp_version) { case GTP_V0: /* According to TS 09.60, sections 7.5.1 and 7.5.2, the flow * label needs to be the same for uplink and downlink packets, * so let's annotate this. */ pctx->u.v0.tid = nla_get_u64(info->attrs[GTPA_TID]); pctx->u.v0.flow = nla_get_u16(info->attrs[GTPA_FLOW]); break; case GTP_V1: pctx->u.v1.i_tei = nla_get_u32(info->attrs[GTPA_I_TEI]); pctx->u.v1.o_tei = nla_get_u32(info->attrs[GTPA_O_TEI]); break; default: break; } } static void ip_pdp_peer_fill(struct pdp_ctx *pctx, struct genl_info *info) { if (info->attrs[GTPA_PEER_ADDRESS]) { pctx->peer.addr.s_addr = nla_get_be32(info->attrs[GTPA_PEER_ADDRESS]); } else if (info->attrs[GTPA_PEER_ADDR6]) { pctx->peer.addr6 = nla_get_in6_addr(info->attrs[GTPA_PEER_ADDR6]); } } static void ipv4_pdp_fill(struct pdp_ctx *pctx, struct genl_info *info) { ip_pdp_peer_fill(pctx, info); pctx->ms.addr.s_addr = nla_get_be32(info->attrs[GTPA_MS_ADDRESS]); gtp_pdp_fill(pctx, info); } static bool ipv6_pdp_fill(struct pdp_ctx *pctx, struct genl_info *info) { ip_pdp_peer_fill(pctx, info); pctx->ms.addr6 = nla_get_in6_addr(info->attrs[GTPA_MS_ADDR6]); if (pctx->ms.addr6.s6_addr32[2] || pctx->ms.addr6.s6_addr32[3]) return false; gtp_pdp_fill(pctx, info); return true; } static struct pdp_ctx *gtp_pdp_add(struct gtp_dev *gtp, struct sock *sk, struct genl_info *info) { struct pdp_ctx *pctx, *pctx_tid = NULL; struct net_device *dev = gtp->dev; u32 hash_ms, hash_tid = 0; struct in6_addr ms_addr6; unsigned int version; bool found = false; __be32 ms_addr; int family; version = nla_get_u32(info->attrs[GTPA_VERSION]); family = nla_get_u8_default(info->attrs[GTPA_FAMILY], AF_INET); #if !IS_ENABLED(CONFIG_IPV6) if (family == AF_INET6) return ERR_PTR(-EAFNOSUPPORT); #endif if (!info->attrs[GTPA_PEER_ADDRESS] && !info->attrs[GTPA_PEER_ADDR6]) return ERR_PTR(-EINVAL); if ((info->attrs[GTPA_PEER_ADDRESS] && sk->sk_family == AF_INET6) || (info->attrs[GTPA_PEER_ADDR6] && sk->sk_family == AF_INET)) return ERR_PTR(-EAFNOSUPPORT); switch (family) { case AF_INET: if (!info->attrs[GTPA_MS_ADDRESS] || info->attrs[GTPA_MS_ADDR6]) return ERR_PTR(-EINVAL); ms_addr = nla_get_be32(info->attrs[GTPA_MS_ADDRESS]); hash_ms = ipv4_hashfn(ms_addr) % gtp->hash_size; pctx = ipv4_pdp_find(gtp, ms_addr); break; case AF_INET6: if (!info->attrs[GTPA_MS_ADDR6] || info->attrs[GTPA_MS_ADDRESS]) return ERR_PTR(-EINVAL); ms_addr6 = nla_get_in6_addr(info->attrs[GTPA_MS_ADDR6]); hash_ms = ipv6_hashfn(&ms_addr6) % gtp->hash_size; pctx = ipv6_pdp_find(gtp, &ms_addr6); break; default: return ERR_PTR(-EAFNOSUPPORT); } if (pctx) found = true; if (version == GTP_V0) pctx_tid = gtp0_pdp_find(gtp, nla_get_u64(info->attrs[GTPA_TID]), family); else if (version == GTP_V1) pctx_tid = gtp1_pdp_find(gtp, nla_get_u32(info->attrs[GTPA_I_TEI]), family); if (pctx_tid) found = true; if (found) { if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) return ERR_PTR(-EEXIST); if (info->nlhdr->nlmsg_flags & NLM_F_REPLACE) return ERR_PTR(-EOPNOTSUPP); if (pctx && pctx_tid) return ERR_PTR(-EEXIST); if (!pctx) pctx = pctx_tid; switch (pctx->af) { case AF_INET: ipv4_pdp_fill(pctx, info); break; case AF_INET6: if (!ipv6_pdp_fill(pctx, info)) return ERR_PTR(-EADDRNOTAVAIL); break; } if (pctx->gtp_version == GTP_V0) netdev_dbg(dev, "GTPv0-U: update tunnel id = %llx (pdp %p)\n", pctx->u.v0.tid, pctx); else if (pctx->gtp_version == GTP_V1) netdev_dbg(dev, "GTPv1-U: update tunnel id = %x/%x (pdp %p)\n", pctx->u.v1.i_tei, pctx->u.v1.o_tei, pctx); return pctx; } pctx = kmalloc_obj(*pctx, GFP_ATOMIC); if (pctx == NULL) return ERR_PTR(-ENOMEM); sock_hold(sk); pctx->sk = sk; pctx->dev = gtp->dev; pctx->af = family; switch (pctx->af) { case AF_INET: if (!info->attrs[GTPA_MS_ADDRESS]) { sock_put(sk); kfree(pctx); return ERR_PTR(-EINVAL); } ipv4_pdp_fill(pctx, info); break; case AF_INET6: if (!info->attrs[GTPA_MS_ADDR6]) { sock_put(sk); kfree(pctx); return ERR_PTR(-EINVAL); } if (!ipv6_pdp_fill(pctx, info)) { sock_put(sk); kfree(pctx); return ERR_PTR(-EADDRNOTAVAIL); } break; } atomic_set(&pctx->tx_seq, 0); switch (pctx->gtp_version) { case GTP_V0: /* TS 09.60: "The flow label identifies unambiguously a GTP * flow.". We use the tid for this instead, I cannot find a * situation in which this doesn't unambiguosly identify the * PDP context. */ hash_tid = gtp0_hashfn(pctx->u.v0.tid) % gtp->hash_size; break; case GTP_V1: hash_tid = gtp1u_hashfn(pctx->u.v1.i_tei) % gtp->hash_size; break; } hlist_add_head_rcu(&pctx->hlist_addr, &gtp->addr_hash[hash_ms]); hlist_add_head_rcu(&pctx->hlist_tid, &gtp->tid_hash[hash_tid]); switch (pctx->gtp_version) { case GTP_V0: netdev_dbg(dev, "GTPv0-U: new PDP ctx id=%llx ssgn=%pI4 ms=%pI4 (pdp=%p)\n", pctx->u.v0.tid, &pctx->peer.addr, &pctx->ms.addr, pctx); break; case GTP_V1: netdev_dbg(dev, "GTPv1-U: new PDP ctx id=%x/%x ssgn=%pI4 ms=%pI4 (pdp=%p)\n", pctx->u.v1.i_tei, pctx->u.v1.o_tei, &pctx->peer.addr, &pctx->ms.addr, pctx); break; } return pctx; } static void pdp_context_free(struct rcu_head *head) { struct pdp_ctx *pctx = container_of(head, struct pdp_ctx, rcu_head); sock_put(pctx->sk); kfree(pctx); } static void pdp_context_delete(struct pdp_ctx *pctx) { hlist_del_rcu(&pctx->hlist_tid); hlist_del_rcu(&pctx->hlist_addr); call_rcu(&pctx->rcu_head, pdp_context_free); } static int gtp_tunnel_notify(struct pdp_ctx *pctx, u8 cmd, gfp_t allocation); static int gtp_genl_new_pdp(struct sk_buff *skb, struct genl_info *info) { unsigned int version; struct pdp_ctx *pctx; struct gtp_dev *gtp; struct sock *sk; int err; if (!info->attrs[GTPA_VERSION] || !info->attrs[GTPA_LINK]) return -EINVAL; version = nla_get_u32(info->attrs[GTPA_VERSION]); switch (version) { case GTP_V0: if (!info->attrs[GTPA_TID] || !info->attrs[GTPA_FLOW]) return -EINVAL; break; case GTP_V1: if (!info->attrs[GTPA_I_TEI] || !info->attrs[GTPA_O_TEI]) return -EINVAL; break; default: return -EINVAL; } rtnl_lock(); gtp = gtp_find_dev(sock_net(skb->sk), info->attrs); if (!gtp) { err = -ENODEV; goto out_unlock; } if (version == GTP_V0) sk = gtp->sk0; else if (version == GTP_V1) sk = gtp->sk1u; else sk = NULL; if (!sk) { err = -ENODEV; goto out_unlock; } pctx = gtp_pdp_add(gtp, sk, info); if (IS_ERR(pctx)) { err = PTR_ERR(pctx); } else { gtp_tunnel_notify(pctx, GTP_CMD_NEWPDP, GFP_KERNEL); err = 0; } out_unlock: rtnl_unlock(); return err; } static struct pdp_ctx *gtp_find_pdp_by_link(struct net *net, struct nlattr *nla[]) { struct gtp_dev *gtp; int family; family = nla_get_u8_default(nla[GTPA_FAMILY], AF_INET); gtp = gtp_find_dev(net, nla); if (!gtp) return ERR_PTR(-ENODEV); if (nla[GTPA_MS_ADDRESS]) { __be32 ip = nla_get_be32(nla[GTPA_MS_ADDRESS]); if (family != AF_INET) return ERR_PTR(-EINVAL); return ipv4_pdp_find(gtp, ip); } else if (nla[GTPA_MS_ADDR6]) { struct in6_addr addr = nla_get_in6_addr(nla[GTPA_MS_ADDR6]); if (family != AF_INET6) return ERR_PTR(-EINVAL); if (addr.s6_addr32[2] || addr.s6_addr32[3]) return ERR_PTR(-EADDRNOTAVAIL); return ipv6_pdp_find(gtp, &addr); } else if (nla[GTPA_VERSION]) { u32 gtp_version = nla_get_u32(nla[GTPA_VERSION]); if (gtp_version == GTP_V0 && nla[GTPA_TID]) { return gtp0_pdp_find(gtp, nla_get_u64(nla[GTPA_TID]), family); } else if (gtp_version == GTP_V1 && nla[GTPA_I_TEI]) { return gtp1_pdp_find(gtp, nla_get_u32(nla[GTPA_I_TEI]), family); } } return ERR_PTR(-EINVAL); } static struct pdp_ctx *gtp_find_pdp(struct net *net, struct nlattr *nla[]) { struct pdp_ctx *pctx; if (nla[GTPA_LINK]) pctx = gtp_find_pdp_by_link(net, nla); else pctx = ERR_PTR(-EINVAL); if (!pctx) pctx = ERR_PTR(-ENOENT); return pctx; } static int gtp_genl_del_pdp(struct sk_buff *skb, struct genl_info *info) { struct pdp_ctx *pctx; int err = 0; if (!info->attrs[GTPA_VERSION]) return -EINVAL; rcu_read_lock(); pctx = gtp_find_pdp(sock_net(skb->sk), info->attrs); if (IS_ERR(pctx)) { err = PTR_ERR(pctx); goto out_unlock; } if (pctx->gtp_version == GTP_V0) netdev_dbg(pctx->dev, "GTPv0-U: deleting tunnel id = %llx (pdp %p)\n", pctx->u.v0.tid, pctx); else if (pctx->gtp_version == GTP_V1) netdev_dbg(pctx->dev, "GTPv1-U: deleting tunnel id = %x/%x (pdp %p)\n", pctx->u.v1.i_tei, pctx->u.v1.o_tei, pctx); gtp_tunnel_notify(pctx, GTP_CMD_DELPDP, GFP_ATOMIC); pdp_context_delete(pctx); out_unlock: rcu_read_unlock(); return err; } static int gtp_genl_fill_info(struct sk_buff *skb, u32 snd_portid, u32 snd_seq, int flags, u32 type, struct pdp_ctx *pctx) { void *genlh; genlh = genlmsg_put(skb, snd_portid, snd_seq, &gtp_genl_family, flags, type); if (genlh == NULL) goto nlmsg_failure; if (nla_put_u32(skb, GTPA_VERSION, pctx->gtp_version) || nla_put_u32(skb, GTPA_LINK, pctx->dev->ifindex) || nla_put_u8(skb, GTPA_FAMILY, pctx->af)) goto nla_put_failure; switch (pctx->af) { case AF_INET: if (nla_put_be32(skb, GTPA_MS_ADDRESS, pctx->ms.addr.s_addr)) goto nla_put_failure; break; case AF_INET6: if (nla_put_in6_addr(skb, GTPA_MS_ADDR6, &pctx->ms.addr6)) goto nla_put_failure; break; } switch (pctx->sk->sk_family) { case AF_INET: if (nla_put_be32(skb, GTPA_PEER_ADDRESS, pctx->peer.addr.s_addr)) goto nla_put_failure; break; case AF_INET6: if (nla_put_in6_addr(skb, GTPA_PEER_ADDR6, &pctx->peer.addr6)) goto nla_put_failure; break; } switch (pctx->gtp_version) { case GTP_V0: if (nla_put_u64_64bit(skb, GTPA_TID, pctx->u.v0.tid, GTPA_PAD) || nla_put_u16(skb, GTPA_FLOW, pctx->u.v0.flow)) goto nla_put_failure; break; case GTP_V1: if (nla_put_u32(skb, GTPA_I_TEI, pctx->u.v1.i_tei) || nla_put_u32(skb, GTPA_O_TEI, pctx->u.v1.o_tei)) goto nla_put_failure; break; } genlmsg_end(skb, genlh); return 0; nlmsg_failure: nla_put_failure: genlmsg_cancel(skb, genlh); return -EMSGSIZE; } static int gtp_tunnel_notify(struct pdp_ctx *pctx, u8 cmd, gfp_t allocation) { struct sk_buff *msg; int ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, allocation); if (!msg) return -ENOMEM; ret = gtp_genl_fill_info(msg, 0, 0, 0, cmd, pctx); if (ret < 0) { nlmsg_free(msg); return ret; } ret = genlmsg_multicast_netns(&gtp_genl_family, dev_net(pctx->dev), msg, 0, GTP_GENL_MCGRP, GFP_ATOMIC); return ret; } static int gtp_genl_get_pdp(struct sk_buff *skb, struct genl_info *info) { struct pdp_ctx *pctx = NULL; struct sk_buff *skb2; int err; if (!info->attrs[GTPA_VERSION]) return -EINVAL; rcu_read_lock(); pctx = gtp_find_pdp(sock_net(skb->sk), info->attrs); if (IS_ERR(pctx)) { err = PTR_ERR(pctx); goto err_unlock; } skb2 = genlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC); if (skb2 == NULL) { err = -ENOMEM; goto err_unlock; } err = gtp_genl_fill_info(skb2, NETLINK_CB(skb).portid, info->snd_seq, 0, info->nlhdr->nlmsg_type, pctx); if (err < 0) goto err_unlock_free; rcu_read_unlock(); return genlmsg_unicast(genl_info_net(info), skb2, info->snd_portid); err_unlock_free: kfree_skb(skb2); err_unlock: rcu_read_unlock(); return err; } static int gtp_genl_dump_pdp(struct sk_buff *skb, struct netlink_callback *cb) { struct gtp_dev *last_gtp = (struct gtp_dev *)cb->args[2], *gtp; int i, j, bucket = cb->args[0], skip = cb->args[1]; struct net *net = sock_net(skb->sk); struct net_device *dev; struct pdp_ctx *pctx; if (cb->args[4]) return 0; rcu_read_lock(); for_each_netdev_rcu(net, dev) { if (dev->rtnl_link_ops != &gtp_link_ops) continue; gtp = netdev_priv(dev); if (last_gtp && last_gtp != gtp) continue; else last_gtp = NULL; for (i = bucket; i < gtp->hash_size; i++) { j = 0; hlist_for_each_entry_rcu(pctx, &gtp->tid_hash[i], hlist_tid) { if (j >= skip && gtp_genl_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh->nlmsg_type, pctx)) { cb->args[0] = i; cb->args[1] = j; cb->args[2] = (unsigned long)gtp; goto out; } j++; } skip = 0; } bucket = 0; } cb->args[4] = 1; out: rcu_read_unlock(); return skb->len; } static int gtp_genl_send_echo_req(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *skb_to_send; __be32 src_ip, dst_ip; unsigned int version; struct gtp_dev *gtp; struct flowi4 fl4; struct rtable *rt; struct sock *sk; __be16 port; int len; if (!info->attrs[GTPA_VERSION] || !info->attrs[GTPA_LINK] || !info->attrs[GTPA_PEER_ADDRESS] || !info->attrs[GTPA_MS_ADDRESS]) return -EINVAL; version = nla_get_u32(info->attrs[GTPA_VERSION]); dst_ip = nla_get_be32(info->attrs[GTPA_PEER_ADDRESS]); src_ip = nla_get_be32(info->attrs[GTPA_MS_ADDRESS]); gtp = gtp_find_dev(sock_net(skb->sk), info->attrs); if (!gtp) return -ENODEV; if (!gtp->sk_created) return -EOPNOTSUPP; if (!(gtp->dev->flags & IFF_UP)) return -ENETDOWN; if (version == GTP_V0) { struct gtp0_header *gtp0_h; len = LL_RESERVED_SPACE(gtp->dev) + sizeof(struct gtp0_header) + sizeof(struct iphdr) + sizeof(struct udphdr); skb_to_send = netdev_alloc_skb_ip_align(gtp->dev, len); if (!skb_to_send) return -ENOMEM; sk = gtp->sk0; port = htons(GTP0_PORT); gtp0_h = skb_push(skb_to_send, sizeof(struct gtp0_header)); memset(gtp0_h, 0, sizeof(struct gtp0_header)); gtp0_build_echo_msg(gtp0_h, GTP_ECHO_REQ); } else if (version == GTP_V1) { struct gtp1_header_long *gtp1u_h; len = LL_RESERVED_SPACE(gtp->dev) + sizeof(struct gtp1_header_long) + sizeof(struct iphdr) + sizeof(struct udphdr); skb_to_send = netdev_alloc_skb_ip_align(gtp->dev, len); if (!skb_to_send) return -ENOMEM; sk = gtp->sk1u; port = htons(GTP1U_PORT); gtp1u_h = skb_push(skb_to_send, sizeof(struct gtp1_header_long)); memset(gtp1u_h, 0, sizeof(struct gtp1_header_long)); gtp1u_build_echo_msg(gtp1u_h, GTP_ECHO_REQ); } else { return -ENODEV; } rt = ip4_route_output_gtp(&fl4, sk, dst_ip, src_ip); if (IS_ERR(rt)) { netdev_dbg(gtp->dev, "no route for echo request to %pI4\n", &dst_ip); kfree_skb(skb_to_send); return -ENODEV; } udp_tunnel_xmit_skb(rt, sk, skb_to_send, fl4.saddr, fl4.daddr, inet_dscp_to_dsfield(fl4.flowi4_dscp), ip4_dst_hoplimit(&rt->dst), 0, port, port, !net_eq(sock_net(sk), dev_net(gtp->dev)), false, 0); return 0; } static const struct nla_policy gtp_genl_policy[GTPA_MAX + 1] = { [GTPA_LINK] = { .type = NLA_U32, }, [GTPA_VERSION] = { .type = NLA_U32, }, [GTPA_TID] = { .type = NLA_U64, }, [GTPA_PEER_ADDRESS] = { .type = NLA_U32, }, [GTPA_MS_ADDRESS] = { .type = NLA_U32, }, [GTPA_FLOW] = { .type = NLA_U16, }, [GTPA_NET_NS_FD] = { .type = NLA_U32, }, [GTPA_I_TEI] = { .type = NLA_U32, }, [GTPA_O_TEI] = { .type = NLA_U32, }, [GTPA_PEER_ADDR6] = { .len = sizeof(struct in6_addr), }, [GTPA_MS_ADDR6] = { .len = sizeof(struct in6_addr), }, [GTPA_FAMILY] = { .type = NLA_U8, }, }; static const struct genl_small_ops gtp_genl_ops[] = { { .cmd = GTP_CMD_NEWPDP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = gtp_genl_new_pdp, .flags = GENL_ADMIN_PERM, }, { .cmd = GTP_CMD_DELPDP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = gtp_genl_del_pdp, .flags = GENL_ADMIN_PERM, }, { .cmd = GTP_CMD_GETPDP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = gtp_genl_get_pdp, .dumpit = gtp_genl_dump_pdp, .flags = GENL_ADMIN_PERM, }, { .cmd = GTP_CMD_ECHOREQ, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = gtp_genl_send_echo_req, .flags = GENL_ADMIN_PERM, }, }; static struct genl_family gtp_genl_family __ro_after_init = { .name = "gtp", .version = 0, .hdrsize = 0, .maxattr = GTPA_MAX, .policy = gtp_genl_policy, .netnsok = true, .module = THIS_MODULE, .small_ops = gtp_genl_ops, .n_small_ops = ARRAY_SIZE(gtp_genl_ops), .resv_start_op = GTP_CMD_ECHOREQ + 1, .mcgrps = gtp_genl_mcgrps, .n_mcgrps = ARRAY_SIZE(gtp_genl_mcgrps), }; static int __net_init gtp_net_init(struct net *net) { struct gtp_net *gn = net_generic(net, gtp_net_id); INIT_LIST_HEAD(&gn->gtp_dev_list); return 0; } static void __net_exit gtp_net_exit_rtnl(struct net *net, struct list_head *dev_to_kill) { struct gtp_net *gn = net_generic(net, gtp_net_id); struct gtp_dev *gtp, *gtp_next; list_for_each_entry_safe(gtp, gtp_next, &gn->gtp_dev_list, list) gtp_dellink(gtp->dev, dev_to_kill); } static struct pernet_operations gtp_net_ops = { .init = gtp_net_init, .exit_rtnl = gtp_net_exit_rtnl, .id = &gtp_net_id, .size = sizeof(struct gtp_net), }; static int __init gtp_init(void) { int err; get_random_bytes(&gtp_h_initval, sizeof(gtp_h_initval)); err = register_pernet_subsys(&gtp_net_ops); if (err < 0) goto error_out; err = rtnl_link_register(&gtp_link_ops); if (err < 0) goto unreg_pernet_subsys; err = genl_register_family(&gtp_genl_family); if (err < 0) goto unreg_rtnl_link; pr_info("GTP module loaded (pdp ctx size %zd bytes)\n", sizeof(struct pdp_ctx)); return 0; unreg_rtnl_link: rtnl_link_unregister(&gtp_link_ops); unreg_pernet_subsys: unregister_pernet_subsys(&gtp_net_ops); error_out: pr_err("error loading GTP module loaded\n"); return err; } late_initcall(gtp_init); static void __exit gtp_fini(void) { genl_unregister_family(&gtp_genl_family); rtnl_link_unregister(&gtp_link_ops); unregister_pernet_subsys(&gtp_net_ops); pr_info("GTP module unloaded\n"); } module_exit(gtp_fini); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <hwelte@sysmocom.de>"); MODULE_DESCRIPTION("Interface driver for GTP encapsulated traffic"); MODULE_ALIAS_RTNL_LINK("gtp"); MODULE_ALIAS_GENL_FAMILY("gtp");
429 429 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MM_TYPES_H #define _LINUX_MM_TYPES_H #include <linux/mm_types_task.h> #include <linux/auxvec.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/rbtree.h> #include <linux/maple_tree.h> #include <linux/rwsem.h> #include <linux/completion.h> #include <linux/cpumask.h> #include <linux/uprobes.h> #include <linux/rcupdate.h> #include <linux/page-flags-layout.h> #include <linux/workqueue.h> #include <linux/seqlock.h> #include <linux/percpu_counter.h> #include <linux/types.h> #include <linux/rseq_types.h> #include <linux/bitmap.h> #include <asm/mmu.h> #ifndef AT_VECTOR_SIZE_ARCH #define AT_VECTOR_SIZE_ARCH 0 #endif #define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) struct address_space; struct futex_private_hash; struct mem_cgroup; typedef struct { unsigned long f; } memdesc_flags_t; /* * Each physical page in the system has a struct page associated with * it to keep track of whatever it is we are using the page for at the * moment. Note that we have no way to track which tasks are using * a page, though if it is a pagecache page, rmap structures can tell us * who is mapping it. * * If you allocate the page using alloc_pages(), you can use some of the * space in struct page for your own purposes. The five words in the main * union are available, except for bit 0 of the first word which must be * kept clear. Many users use this word to store a pointer to an object * which is guaranteed to be aligned. If you use the same storage as * page->mapping, you must restore it to NULL before freeing the page. * * The mapcount field must not be used for own purposes. * * If you want to use the refcount field, it must be used in such a way * that other CPUs temporarily incrementing and then decrementing the * refcount does not cause problems. On receiving the page from * alloc_pages(), the refcount will be positive. * * If you allocate pages of order > 0, you can use some of the fields * in each subpage, but you may need to restore some of their values * afterwards. * * SLUB uses cmpxchg_double() to atomically update its freelist and counters. * That requires that freelist & counters in struct slab be adjacent and * double-word aligned. Because struct slab currently just reinterprets the * bits of struct page, we align all struct pages to double-word boundaries, * and ensure that 'freelist' is aligned within struct slab. */ #ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE #define _struct_page_alignment __aligned(2 * sizeof(unsigned long)) #else #define _struct_page_alignment __aligned(sizeof(unsigned long)) #endif struct page { memdesc_flags_t flags; /* Atomic flags, some possibly * updated asynchronously */ /* * Five words (20/40 bytes) are available in this union. * WARNING: bit 0 of the first word is used for PageTail(). That * means the other users of this union MUST NOT use the bit to * avoid collision and false-positive PageTail(). */ union { struct { /* Page cache and anonymous pages */ /** * @lru: Pageout list, eg. active_list protected by * lruvec->lru_lock. Sometimes used as a generic list * by the page owner. */ union { struct list_head lru; /* Or, free page */ struct list_head buddy_list; struct list_head pcp_list; struct llist_node pcp_llist; }; struct address_space *mapping; union { pgoff_t __folio_index; /* Our offset within mapping. */ unsigned long share; /* share count for fsdax */ }; /** * @private: Mapping-private opaque data. * Usually used for buffer_heads if PagePrivate. * Used for swp_entry_t if swapcache flag set. * Indicates order in the buddy system if PageBuddy * or on pcp_llist. */ unsigned long private; }; struct { /* page_pool used by netstack */ /** * @pp_magic: magic value to avoid recycling non * page_pool allocated pages. */ unsigned long pp_magic; struct page_pool *pp; unsigned long _pp_mapping_pad; unsigned long dma_addr; atomic_long_t pp_ref_count; }; struct { /* Tail pages of compound page */ unsigned long compound_head; /* Bit zero is set */ }; struct { /* ZONE_DEVICE pages */ /* * The first word is used for compound_head or folio * pgmap */ void *_unused_pgmap_compound_head; void *zone_device_data; /* * ZONE_DEVICE private pages are counted as being * mapped so the next 3 words hold the mapping, index, * and private fields from the source anonymous or * page cache page while the page is migrated to device * private memory. * ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also * use the mapping, index, and private fields when * pmem backed DAX files are mapped. */ }; /** @rcu_head: You can use this to free a page by RCU. */ struct rcu_head rcu_head; }; union { /* This union is 4 bytes in size. */ /* * For head pages of typed folios, the value stored here * allows for determining what this page is used for. The * tail pages of typed folios will not store a type * (page_type == _mapcount == -1). * * See page-flags.h for a list of page types which are currently * stored here. * * Owners of typed folios may reuse the lower 16 bit of the * head page page_type field after setting the page type, * but must reset these 16 bit to -1 before clearing the * page type. */ unsigned int page_type; /* * For pages that are part of non-typed folios for which mappings * are tracked via the RMAP, encodes the number of times this page * is directly referenced by a page table. * * Note that the mapcount is always initialized to -1, so that * transitions both from it and to it can be tracked, using * atomic_inc_and_test() and atomic_add_negative(-1). */ atomic_t _mapcount; }; /* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */ atomic_t _refcount; #ifdef CONFIG_MEMCG unsigned long memcg_data; #elif defined(CONFIG_SLAB_OBJ_EXT) unsigned long _unused_slab_obj_exts; #endif /* * On machines where all RAM is mapped into kernel address space, * we can simply calculate the virtual address. On machines with * highmem some memory is mapped into kernel virtual memory * dynamically, so we need a place to store that address. * Note that this field could be 16 bits on x86 ... ;) * * Architectures with slow multiplication can define * WANT_PAGE_VIRTUAL in asm/page.h */ #if defined(WANT_PAGE_VIRTUAL) void *virtual; /* Kernel virtual address (NULL if not kmapped, ie. highmem) */ #endif /* WANT_PAGE_VIRTUAL */ #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS int _last_cpupid; #endif #ifdef CONFIG_KMSAN /* * KMSAN metadata for this page: * - shadow page: every bit indicates whether the corresponding * bit of the original page is initialized (0) or not (1); * - origin page: every 4 bytes contain an id of the stack trace * where the uninitialized value was created. */ struct page *kmsan_shadow; struct page *kmsan_origin; #endif } _struct_page_alignment; /* * struct encoded_page - a nonexistent type marking this pointer * * An 'encoded_page' pointer is a pointer to a regular 'struct page', but * with the low bits of the pointer indicating extra context-dependent * information. Only used in mmu_gather handling, and this acts as a type * system check on that use. * * We only really have two guaranteed bits in general, although you could * play with 'struct page' alignment (see CONFIG_HAVE_ALIGNED_STRUCT_PAGE) * for more. * * Use the supplied helper functions to endcode/decode the pointer and bits. */ struct encoded_page; #define ENCODED_PAGE_BITS 3ul /* Perform rmap removal after we have flushed the TLB. */ #define ENCODED_PAGE_BIT_DELAY_RMAP 1ul /* * The next item in an encoded_page array is the "nr_pages" argument, specifying * the number of consecutive pages starting from this page, that all belong to * the same folio. For example, "nr_pages" corresponds to the number of folio * references that must be dropped. If this bit is not set, "nr_pages" is * implicitly 1. */ #define ENCODED_PAGE_BIT_NR_PAGES_NEXT 2ul static __always_inline struct encoded_page *encode_page(struct page *page, unsigned long flags) { BUILD_BUG_ON(flags > ENCODED_PAGE_BITS); return (struct encoded_page *)(flags | (unsigned long)page); } static inline unsigned long encoded_page_flags(struct encoded_page *page) { return ENCODED_PAGE_BITS & (unsigned long)page; } static inline struct page *encoded_page_ptr(struct encoded_page *page) { return (struct page *)(~ENCODED_PAGE_BITS & (unsigned long)page); } static __always_inline struct encoded_page *encode_nr_pages(unsigned long nr) { VM_WARN_ON_ONCE((nr << 2) >> 2 != nr); return (struct encoded_page *)(nr << 2); } static __always_inline unsigned long encoded_nr_pages(struct encoded_page *page) { return ((unsigned long)page) >> 2; } /* * A swap entry has to fit into a "unsigned long", as the entry is hidden * in the "index" field of the swapper address space. */ typedef struct { unsigned long val; } swp_entry_t; /** * typedef softleaf_t - Describes a page table software leaf entry, abstracted * from its architecture-specific encoding. * * Page table leaf entries are those which do not reference any descendent page * tables but rather either reference a data page, are an empty (or 'none' * entry), or contain a non-present entry. * * If referencing another page table or a data page then the page table entry is * pertinent to hardware - that is it tells the hardware how to decode the page * table entry. * * Otherwise it is a software-defined leaf page table entry, which this type * describes. See leafops.h and specifically @softleaf_type for a list of all * possible kinds of software leaf entry. * * A softleaf_t entry is abstracted from the hardware page table entry, so is * not architecture-specific. * * NOTE: While we transition from the confusing swp_entry_t type used for this * purpose, we simply alias this type. This will be removed once the * transition is complete. */ typedef swp_entry_t softleaf_t; #if defined(CONFIG_MEMCG) || defined(CONFIG_SLAB_OBJ_EXT) /* We have some extra room after the refcount in tail pages. */ #define NR_PAGES_IN_LARGE_FOLIO #endif /* * On 32bit, we can cut the required metadata in half, because: * (a) PID_MAX_LIMIT implicitly limits the number of MMs we could ever have, * so we can limit MM IDs to 15 bit (32767). * (b) We don't expect folios where even a single complete PTE mapping by * one MM would exceed 15 bits (order-15). */ #ifdef CONFIG_64BIT typedef int mm_id_mapcount_t; #define MM_ID_MAPCOUNT_MAX INT_MAX typedef unsigned int mm_id_t; #else /* !CONFIG_64BIT */ typedef short mm_id_mapcount_t; #define MM_ID_MAPCOUNT_MAX SHRT_MAX typedef unsigned short mm_id_t; #endif /* CONFIG_64BIT */ /* We implicitly use the dummy ID for init-mm etc. where we never rmap pages. */ #define MM_ID_DUMMY 0 #define MM_ID_MIN (MM_ID_DUMMY + 1) /* * We leave the highest bit of each MM id unused, so we can store a flag * in the highest bit of each folio->_mm_id[]. */ #define MM_ID_BITS ((sizeof(mm_id_t) * BITS_PER_BYTE) - 1) #define MM_ID_MASK ((1U << MM_ID_BITS) - 1) #define MM_ID_MAX MM_ID_MASK /* * In order to use bit_spin_lock(), which requires an unsigned long, we * operate on folio->_mm_ids when working on flags. */ #define FOLIO_MM_IDS_LOCK_BITNUM MM_ID_BITS #define FOLIO_MM_IDS_LOCK_BIT BIT(FOLIO_MM_IDS_LOCK_BITNUM) #define FOLIO_MM_IDS_SHARED_BITNUM (2 * MM_ID_BITS + 1) #define FOLIO_MM_IDS_SHARED_BIT BIT(FOLIO_MM_IDS_SHARED_BITNUM) /** * struct folio - Represents a contiguous set of bytes. * @flags: Identical to the page flags. * @lru: Least Recently Used list; tracks how recently this folio was used. * @mlock_count: Number of times this folio has been pinned by mlock(). * @mapping: The file this page belongs to, or refers to the anon_vma for * anonymous memory. * @index: Offset within the file, in units of pages. For anonymous memory, * this is the index from the beginning of the mmap. * @share: number of DAX mappings that reference this folio. See * dax_associate_entry. * @private: Filesystem per-folio data (see folio_attach_private()). * @swap: Used for swp_entry_t if folio_test_swapcache(). * @_mapcount: Do not access this member directly. Use folio_mapcount() to * find out how many times this folio is mapped by userspace. * @_refcount: Do not access this member directly. Use folio_ref_count() * to find how many references there are to this folio. * @memcg_data: Memory Control Group data. * @pgmap: Metadata for ZONE_DEVICE mappings * @virtual: Virtual address in the kernel direct map. * @_last_cpupid: IDs of last CPU and last process that accessed the folio. * @_entire_mapcount: Do not use directly, call folio_entire_mapcount(). * @_large_mapcount: Do not use directly, call folio_mapcount(). * @_nr_pages_mapped: Do not use outside of rmap and debug code. * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). * @_nr_pages: Do not use directly, call folio_nr_pages(). * @_mm_id: Do not use outside of rmap code. * @_mm_ids: Do not use outside of rmap code. * @_mm_id_mapcount: Do not use outside of rmap code. * @_hugetlb_subpool: Do not use directly, use accessor in hugetlb.h. * @_hugetlb_cgroup: Do not use directly, use accessor in hugetlb_cgroup.h. * @_hugetlb_cgroup_rsvd: Do not use directly, use accessor in hugetlb_cgroup.h. * @_hugetlb_hwpoison: Do not use directly, call raw_hwp_list_head(). * @_deferred_list: Folios to be split under memory pressure. * @_unused_slab_obj_exts: Placeholder to match obj_exts in struct slab. * * A folio is a physically, virtually and logically contiguous set * of bytes. It is a power-of-two in size, and it is aligned to that * same power-of-two. It is at least as large as %PAGE_SIZE. If it is * in the page cache, it is at a file offset which is a multiple of that * power-of-two. It may be mapped into userspace at an address which is * at an arbitrary page offset, but its kernel virtual address is aligned * to its size. */ struct folio { /* private: don't document the anon union */ union { struct { /* public: */ memdesc_flags_t flags; union { struct list_head lru; /* private: avoid cluttering the output */ /* For the Unevictable "LRU list" slot */ struct { /* Avoid compound_head */ void *__filler; /* public: */ unsigned int mlock_count; /* private: */ }; /* public: */ struct dev_pagemap *pgmap; }; struct address_space *mapping; union { pgoff_t index; unsigned long share; }; union { void *private; swp_entry_t swap; }; atomic_t _mapcount; atomic_t _refcount; #ifdef CONFIG_MEMCG unsigned long memcg_data; #elif defined(CONFIG_SLAB_OBJ_EXT) unsigned long _unused_slab_obj_exts; #endif #if defined(WANT_PAGE_VIRTUAL) void *virtual; #endif #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS int _last_cpupid; #endif /* private: the union with struct page is transitional */ }; struct page page; }; union { struct { unsigned long _flags_1; unsigned long _head_1; union { struct { /* public: */ atomic_t _large_mapcount; atomic_t _nr_pages_mapped; #ifdef CONFIG_64BIT atomic_t _entire_mapcount; atomic_t _pincount; #endif /* CONFIG_64BIT */ mm_id_mapcount_t _mm_id_mapcount[2]; union { mm_id_t _mm_id[2]; unsigned long _mm_ids; }; /* private: the union with struct page is transitional */ }; unsigned long _usable_1[4]; }; atomic_t _mapcount_1; atomic_t _refcount_1; /* public: */ #ifdef NR_PAGES_IN_LARGE_FOLIO unsigned int _nr_pages; #endif /* NR_PAGES_IN_LARGE_FOLIO */ /* private: the union with struct page is transitional */ }; struct page __page_1; }; union { struct { unsigned long _flags_2; unsigned long _head_2; /* public: */ struct list_head _deferred_list; #ifndef CONFIG_64BIT atomic_t _entire_mapcount; atomic_t _pincount; #endif /* !CONFIG_64BIT */ /* private: the union with struct page is transitional */ }; struct page __page_2; }; union { struct { unsigned long _flags_3; unsigned long _head_3; /* public: */ void *_hugetlb_subpool; void *_hugetlb_cgroup; void *_hugetlb_cgroup_rsvd; void *_hugetlb_hwpoison; /* private: the union with struct page is transitional */ }; struct page __page_3; }; }; #define FOLIO_MATCH(pg, fl) \ static_assert(offsetof(struct page, pg) == offsetof(struct folio, fl)) FOLIO_MATCH(flags, flags); FOLIO_MATCH(lru, lru); FOLIO_MATCH(mapping, mapping); FOLIO_MATCH(compound_head, lru); FOLIO_MATCH(__folio_index, index); FOLIO_MATCH(private, private); FOLIO_MATCH(_mapcount, _mapcount); FOLIO_MATCH(_refcount, _refcount); #ifdef CONFIG_MEMCG FOLIO_MATCH(memcg_data, memcg_data); #endif #if defined(WANT_PAGE_VIRTUAL) FOLIO_MATCH(virtual, virtual); #endif #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS FOLIO_MATCH(_last_cpupid, _last_cpupid); #endif #undef FOLIO_MATCH #define FOLIO_MATCH(pg, fl) \ static_assert(offsetof(struct folio, fl) == \ offsetof(struct page, pg) + sizeof(struct page)) FOLIO_MATCH(flags, _flags_1); FOLIO_MATCH(compound_head, _head_1); FOLIO_MATCH(_mapcount, _mapcount_1); FOLIO_MATCH(_refcount, _refcount_1); #undef FOLIO_MATCH #define FOLIO_MATCH(pg, fl) \ static_assert(offsetof(struct folio, fl) == \ offsetof(struct page, pg) + 2 * sizeof(struct page)) FOLIO_MATCH(flags, _flags_2); FOLIO_MATCH(compound_head, _head_2); #undef FOLIO_MATCH #define FOLIO_MATCH(pg, fl) \ static_assert(offsetof(struct folio, fl) == \ offsetof(struct page, pg) + 3 * sizeof(struct page)) FOLIO_MATCH(flags, _flags_3); FOLIO_MATCH(compound_head, _head_3); #undef FOLIO_MATCH /** * struct ptdesc - Memory descriptor for page tables. * @pt_flags: enum pt_flags plus zone/node/section. * @pt_rcu_head: For freeing page table pages. * @pt_list: List of used page tables. Used for s390 gmap shadow pages * (which are not linked into the user page tables) and x86 * pgds. * @_pt_pad_1: Padding that aliases with page's compound head. * @pmd_huge_pte: Protected by ptdesc->ptl, used for THPs. * @__page_mapping: Aliases with page->mapping. Unused for page tables. * @pt_index: Used for s390 gmap. * @pt_mm: Used for x86 pgds. * @pt_frag_refcount: For fragmented page table tracking. Powerpc only. * @pt_share_count: Used for HugeTLB PMD page table share count. * @_pt_pad_2: Padding to ensure proper alignment. * @ptl: Lock for the page table. * @__page_type: Same as page->page_type. Unused for page tables. * @__page_refcount: Same as page refcount. * @pt_memcg_data: Memcg data. Tracked for page tables here. * * This struct overlays struct page for now. Do not modify without a good * understanding of the issues. */ struct ptdesc { memdesc_flags_t pt_flags; union { struct rcu_head pt_rcu_head; struct list_head pt_list; struct { unsigned long _pt_pad_1; pgtable_t pmd_huge_pte; }; }; unsigned long __page_mapping; union { pgoff_t pt_index; struct mm_struct *pt_mm; atomic_t pt_frag_refcount; #ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING atomic_t pt_share_count; #endif }; union { unsigned long _pt_pad_2; #if ALLOC_SPLIT_PTLOCKS spinlock_t *ptl; #else spinlock_t ptl; #endif }; unsigned int __page_type; atomic_t __page_refcount; #ifdef CONFIG_MEMCG unsigned long pt_memcg_data; #endif }; #define TABLE_MATCH(pg, pt) \ static_assert(offsetof(struct page, pg) == offsetof(struct ptdesc, pt)) TABLE_MATCH(flags, pt_flags); TABLE_MATCH(compound_head, pt_list); TABLE_MATCH(compound_head, _pt_pad_1); TABLE_MATCH(mapping, __page_mapping); TABLE_MATCH(__folio_index, pt_index); TABLE_MATCH(rcu_head, pt_rcu_head); TABLE_MATCH(page_type, __page_type); TABLE_MATCH(_refcount, __page_refcount); #ifdef CONFIG_MEMCG TABLE_MATCH(memcg_data, pt_memcg_data); #endif #undef TABLE_MATCH static_assert(sizeof(struct ptdesc) <= sizeof(struct page)); #define ptdesc_page(pt) (_Generic((pt), \ const struct ptdesc *: (const struct page *)(pt), \ struct ptdesc *: (struct page *)(pt))) #define ptdesc_folio(pt) (_Generic((pt), \ const struct ptdesc *: (const struct folio *)(pt), \ struct ptdesc *: (struct folio *)(pt))) #define page_ptdesc(p) (_Generic((p), \ const struct page *: (const struct ptdesc *)(p), \ struct page *: (struct ptdesc *)(p))) #ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc) { atomic_set(&ptdesc->pt_share_count, 0); } static inline void ptdesc_pmd_pts_inc(struct ptdesc *ptdesc) { atomic_inc(&ptdesc->pt_share_count); } static inline void ptdesc_pmd_pts_dec(struct ptdesc *ptdesc) { atomic_dec(&ptdesc->pt_share_count); } static inline int ptdesc_pmd_pts_count(const struct ptdesc *ptdesc) { return atomic_read(&ptdesc->pt_share_count); } static inline bool ptdesc_pmd_is_shared(struct ptdesc *ptdesc) { return !!ptdesc_pmd_pts_count(ptdesc); } #else static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc) { } #endif /* * Used for sizing the vmemmap region on some architectures */ #define STRUCT_PAGE_MAX_SHIFT (order_base_2(sizeof(struct page))) /* * page_private can be used on tail pages. However, PagePrivate is only * checked by the VM on the head page. So page_private on the tail pages * should be used for data that's ancillary to the head page (eg attaching * buffer heads to tail pages after attaching buffer heads to the head page) */ #define page_private(page) ((page)->private) static inline void set_page_private(struct page *page, unsigned long private) { page->private = private; } static inline void *folio_get_private(const struct folio *folio) { return folio->private; } typedef unsigned long vm_flags_t; /* * freeptr_t represents a SLUB freelist pointer, which might be encoded * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled. */ typedef struct { unsigned long v; } freeptr_t; /* * A region containing a mapping of a non-memory backed file under NOMMU * conditions. These are held in a global tree and are pinned by the VMAs that * map parts of them. */ struct vm_region { struct rb_node vm_rb; /* link in global region tree */ vm_flags_t vm_flags; /* VMA vm_flags */ unsigned long vm_start; /* start address of region */ unsigned long vm_end; /* region initialised to here */ unsigned long vm_top; /* region allocated to here */ unsigned long vm_pgoff; /* the offset in vm_file corresponding to vm_start */ struct file *vm_file; /* the backing file or NULL */ int vm_usage; /* region usage count (access under nommu_region_sem) */ bool vm_icache_flushed : 1; /* true if the icache has been flushed for * this region */ }; #ifdef CONFIG_USERFAULTFD #define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) { NULL, }) struct vm_userfaultfd_ctx { struct userfaultfd_ctx *ctx; }; #else /* CONFIG_USERFAULTFD */ #define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) {}) struct vm_userfaultfd_ctx {}; #endif /* CONFIG_USERFAULTFD */ struct anon_vma_name { struct kref kref; /* The name needs to be at the end because it is dynamically sized. */ char name[]; }; #ifdef CONFIG_ANON_VMA_NAME /* * mmap_lock should be read-locked when calling anon_vma_name(). Caller should * either keep holding the lock while using the returned pointer or it should * raise anon_vma_name refcount before releasing the lock. */ struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma); struct anon_vma_name *anon_vma_name_alloc(const char *name); void anon_vma_name_free(struct kref *kref); #else /* CONFIG_ANON_VMA_NAME */ static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) { return NULL; } static inline struct anon_vma_name *anon_vma_name_alloc(const char *name) { return NULL; } #endif /* * While __vma_enter_locked() is working to ensure are no read-locks held on a * VMA (either while acquiring a VMA write lock or marking a VMA detached) we * set the VM_REFCNT_EXCLUDE_READERS_FLAG in vma->vm_refcnt to indiciate to * vma_start_read() that the reference count should be left alone. * * See the comment describing vm_refcnt in vm_area_struct for details as to * which values the VMA reference count can be. */ #define VM_REFCNT_EXCLUDE_READERS_BIT (30) #define VM_REFCNT_EXCLUDE_READERS_FLAG (1U << VM_REFCNT_EXCLUDE_READERS_BIT) #define VM_REFCNT_LIMIT (VM_REFCNT_EXCLUDE_READERS_FLAG - 1) struct vma_numab_state { /* * Initialised as time in 'jiffies' after which VMA * should be scanned. Delays first scan of new VMA by at * least sysctl_numa_balancing_scan_delay: */ unsigned long next_scan; /* * Time in jiffies when pids_active[] is reset to * detect phase change behaviour: */ unsigned long pids_active_reset; /* * Approximate tracking of PIDs that trapped a NUMA hinting * fault. May produce false positives due to hash collisions. * * [0] Previous PID tracking * [1] Current PID tracking * * Window moves after next_pid_reset has expired approximately * every VMA_PID_RESET_PERIOD jiffies: */ unsigned long pids_active[2]; /* MM scan sequence ID when scan first started after VMA creation */ int start_scan_seq; /* * MM scan sequence ID when the VMA was last completely scanned. * A VMA is not eligible for scanning if prev_scan_seq == numa_scan_seq */ int prev_scan_seq; }; #ifdef __HAVE_PFNMAP_TRACKING struct pfnmap_track_ctx { struct kref kref; unsigned long pfn; unsigned long size; /* in bytes */ }; #endif /* What action should be taken after an .mmap_prepare call is complete? */ enum mmap_action_type { MMAP_NOTHING, /* Mapping is complete, no further action. */ MMAP_REMAP_PFN, /* Remap PFN range. */ MMAP_IO_REMAP_PFN, /* I/O remap PFN range. */ }; /* * Describes an action an mmap_prepare hook can instruct to be taken to complete * the mapping of a VMA. Specified in vm_area_desc. */ struct mmap_action { union { /* Remap range. */ struct { unsigned long start; unsigned long start_pfn; unsigned long size; pgprot_t pgprot; } remap; }; enum mmap_action_type type; /* * If specified, this hook is invoked after the selected action has been * successfully completed. Note that the VMA write lock still held. * * The absolute minimum ought to be done here. * * Returns 0 on success, or an error code. */ int (*success_hook)(const struct vm_area_struct *vma); /* * If specified, this hook is invoked when an error occurred when * attempting the selected action. * * The hook can return an error code in order to filter the error, but * it is not valid to clear the error here. */ int (*error_hook)(int err); /* * This should be set in rare instances where the operation required * that the rmap should not be able to access the VMA until * completely set up. */ bool hide_from_rmap_until_complete :1; }; /* * Opaque type representing current VMA (vm_area_struct) flag state. Must be * accessed via vma_flags_xxx() helper functions. */ #define NUM_VMA_FLAG_BITS BITS_PER_LONG typedef struct { DECLARE_BITMAP(__vma_flags, NUM_VMA_FLAG_BITS); } vma_flags_t; #define EMPTY_VMA_FLAGS ((vma_flags_t){ }) /* * Describes a VMA that is about to be mmap()'ed. Drivers may choose to * manipulate mutable fields which will cause those fields to be updated in the * resultant VMA. * * Helper functions are not required for manipulating any field. */ struct vm_area_desc { /* Immutable state. */ const struct mm_struct *const mm; struct file *const file; /* May vary from vm_file in stacked callers. */ unsigned long start; unsigned long end; /* Mutable fields. Populated with initial state. */ pgoff_t pgoff; struct file *vm_file; vma_flags_t vma_flags; pgprot_t page_prot; /* Write-only fields. */ const struct vm_operations_struct *vm_ops; void *private_data; /* Take further action? */ struct mmap_action action; }; /* * This struct describes a virtual memory area. There is one of these * per VM-area/task. A VM area is any part of the process virtual memory * space that has a special rule for the page-fault handlers (ie a shared * library, the executable area etc). * * Only explicitly marked struct members may be accessed by RCU readers before * getting a stable reference. * * WARNING: when adding new members, please update vm_area_init_from() to copy * them during vm_area_struct content duplication. */ struct vm_area_struct { /* The first cache line has the info for VMA tree walking. */ union { struct { /* VMA covers [vm_start; vm_end) addresses within mm */ unsigned long vm_start; unsigned long vm_end; }; freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */ }; /* * The address space we belong to. * Unstable RCU readers are allowed to read this. */ struct mm_struct *vm_mm; pgprot_t vm_page_prot; /* Access permissions of this VMA. */ /* * Flags, see mm.h. * To modify use vm_flags_{init|reset|set|clear|mod} functions. * Preferably, use vma_flags_xxx() functions. */ union { /* Temporary while VMA flags are being converted. */ const vm_flags_t vm_flags; vma_flags_t flags; }; #ifdef CONFIG_PER_VMA_LOCK /* * Can only be written (using WRITE_ONCE()) while holding both: * - mmap_lock (in write mode) * - vm_refcnt bit at VM_REFCNT_EXCLUDE_READERS_FLAG is set * Can be read reliably while holding one of: * - mmap_lock (in read or write mode) * - vm_refcnt bit at VM_REFCNT_EXCLUDE_READERS_BIT is set or vm_refcnt > 1 * Can be read unreliably (using READ_ONCE()) for pessimistic bailout * while holding nothing (except RCU to keep the VMA struct allocated). * * This sequence counter is explicitly allowed to overflow; sequence * counter reuse can only lead to occasional unnecessary use of the * slowpath. */ unsigned int vm_lock_seq; #endif /* * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma * list, after a COW of one of the file pages. A MAP_SHARED vma * can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack * or brk vma (with NULL file) can only be in an anon_vma list. */ struct list_head anon_vma_chain; /* Serialized by mmap_lock & * page_table_lock */ struct anon_vma *anon_vma; /* Serialized by page_table_lock */ /* Function pointers to deal with this struct. */ const struct vm_operations_struct *vm_ops; /* Information about our backing store: */ unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE units */ struct file * vm_file; /* File we map to (can be NULL). */ void * vm_private_data; /* was vm_pte (shared mem) */ #ifdef CONFIG_SWAP atomic_long_t swap_readahead_info; #endif #ifndef CONFIG_MMU struct vm_region *vm_region; /* NOMMU mapping region */ #endif #ifdef CONFIG_NUMA struct mempolicy *vm_policy; /* NUMA policy for the VMA */ #endif #ifdef CONFIG_NUMA_BALANCING struct vma_numab_state *numab_state; /* NUMA Balancing state */ #endif #ifdef CONFIG_PER_VMA_LOCK /* * Used to keep track of firstly, whether the VMA is attached, secondly, * if attached, how many read locks are taken, and thirdly, if the * VM_REFCNT_EXCLUDE_READERS_FLAG is set, whether any read locks held * are currently in the process of being excluded. * * This value can be equal to: * * 0 - Detached. IMPORTANT: when the refcnt is zero, readers cannot * increment it. * * 1 - Attached and either unlocked or write-locked. Write locks are * identified via __is_vma_write_locked() which checks for equality of * vma->vm_lock_seq and mm->mm_lock_seq. * * >1, < VM_REFCNT_EXCLUDE_READERS_FLAG - Read-locked or (unlikely) * write-locked with other threads having temporarily incremented the * reference count prior to determining it is write-locked and * decrementing it again. * * VM_REFCNT_EXCLUDE_READERS_FLAG - Detached, pending * __vma_end_exclude_readers() completion which will decrement the * reference count to zero. IMPORTANT - at this stage no further readers * can increment the reference count. It can only be reduced. * * VM_REFCNT_EXCLUDE_READERS_FLAG + 1 - A thread is either write-locking * an attached VMA and has yet to invoke __vma_end_exclude_readers(), * OR a thread is detaching a VMA and is waiting on a single spurious * reader in order to decrement the reference count. IMPORTANT - as * above, no further readers can increment the reference count. * * > VM_REFCNT_EXCLUDE_READERS_FLAG + 1 - A thread is either * write-locking or detaching a VMA is waiting on readers to * exit. IMPORTANT - as above, no further readers can increment the * reference count. * * NOTE: Unstable RCU readers are allowed to read this. */ refcount_t vm_refcnt ____cacheline_aligned_in_smp; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map vmlock_dep_map; #endif #endif /* * For areas with an address space and backing store, * linkage into the address_space->i_mmap interval tree. * */ struct { struct rb_node rb; unsigned long rb_subtree_last; } shared; #ifdef CONFIG_ANON_VMA_NAME /* * For private and shared anonymous mappings, a pointer to a null * terminated string containing the name given to the vma, or NULL if * unnamed. Serialized by mmap_lock. Use anon_vma_name to access. */ struct anon_vma_name *anon_name; #endif struct vm_userfaultfd_ctx vm_userfaultfd_ctx; #ifdef __HAVE_PFNMAP_TRACKING struct pfnmap_track_ctx *pfnmap_track_ctx; #endif } __randomize_layout; /* Clears all bits in the VMA flags bitmap, non-atomically. */ static inline void vma_flags_clear_all(vma_flags_t *flags) { bitmap_zero(flags->__vma_flags, NUM_VMA_FLAG_BITS); } /* * Copy value to the first system word of VMA flags, non-atomically. * * IMPORTANT: This does not overwrite bytes past the first system word. The * caller must account for this. */ static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long value) { unsigned long *bitmap = flags->__vma_flags; bitmap[0] = value; } /* * Copy value to the first system word of VMA flags ONCE, non-atomically. * * IMPORTANT: This does not overwrite bytes past the first system word. The * caller must account for this. */ static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned long value) { unsigned long *bitmap = flags->__vma_flags; WRITE_ONCE(*bitmap, value); } /* Update the first system word of VMA flags setting bits, non-atomically. */ static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value) { unsigned long *bitmap = flags->__vma_flags; *bitmap |= value; } /* Update the first system word of VMA flags clearing bits, non-atomically. */ static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value) { unsigned long *bitmap = flags->__vma_flags; *bitmap &= ~value; } #ifdef CONFIG_NUMA #define vma_policy(vma) ((vma)->vm_policy) #else #define vma_policy(vma) NULL #endif /* * Opaque type representing current mm_struct flag state. Must be accessed via * mm_flags_xxx() helper functions. */ #define NUM_MM_FLAG_BITS (64) typedef struct { DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS); } __private mm_flags_t; struct kioctx_table; struct iommu_mm_data; struct mm_struct { struct { /* * Fields which are often written to are placed in a separate * cache line. */ struct { /** * @mm_count: The number of references to &struct * mm_struct (@mm_users count as 1). * * Use mmgrab()/mmdrop() to modify. When this drops to * 0, the &struct mm_struct is freed. */ atomic_t mm_count; } ____cacheline_aligned_in_smp; struct maple_tree mm_mt; unsigned long mmap_base; /* base of mmap area */ unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */ #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES /* Base addresses for compatible mmap() */ unsigned long mmap_compat_base; unsigned long mmap_compat_legacy_base; #endif unsigned long task_size; /* size of task vm space */ pgd_t * pgd; #ifdef CONFIG_MEMBARRIER /** * @membarrier_state: Flags controlling membarrier behavior. * * This field is close to @pgd to hopefully fit in the same * cache-line, which needs to be touched by switch_mm(). */ atomic_t membarrier_state; #endif /** * @mm_users: The number of users including userspace. * * Use mmget()/mmget_not_zero()/mmput() to modify. When this * drops to 0 (i.e. when the task exits and there are no other * temporary reference holders), we also release a reference on * @mm_count (which may then free the &struct mm_struct if * @mm_count also drops to 0). */ atomic_t mm_users; /* MM CID related storage */ struct mm_mm_cid mm_cid; #ifdef CONFIG_MMU atomic_long_t pgtables_bytes; /* size of all page tables */ #endif int map_count; /* number of VMAs */ spinlock_t page_table_lock; /* Protects page tables and some * counters */ /* * Typically the current mmap_lock's offset is 56 bytes from * the last cacheline boundary, which is very optimal, as * its two hot fields 'count' and 'owner' sit in 2 different * cachelines, and when mmap_lock is highly contended, both * of the 2 fields will be accessed frequently, current layout * will help to reduce cache bouncing. * * So please be careful with adding new fields before * mmap_lock, which can easily push the 2 fields into one * cacheline. */ struct rw_semaphore mmap_lock; struct list_head mmlist; /* List of maybe swapped mm's. These * are globally strung together off * init_mm.mmlist, and are protected * by mmlist_lock */ #ifdef CONFIG_PER_VMA_LOCK struct rcuwait vma_writer_wait; /* * This field has lock-like semantics, meaning it is sometimes * accessed with ACQUIRE/RELEASE semantics. * Roughly speaking, incrementing the sequence number is * equivalent to releasing locks on VMAs; reading the sequence * number can be part of taking a read lock on a VMA. * Incremented every time mmap_lock is write-locked/unlocked. * Initialized to 0, therefore odd values indicate mmap_lock * is write-locked and even values that it's released. * * Can be modified under write mmap_lock using RELEASE * semantics. * Can be read with no other protection when holding write * mmap_lock. * Can be read with ACQUIRE semantics if not holding write * mmap_lock. */ seqcount_t mm_lock_seq; #endif #ifdef CONFIG_FUTEX_PRIVATE_HASH struct mutex futex_hash_lock; struct futex_private_hash __rcu *futex_phash; struct futex_private_hash *futex_phash_new; /* futex-ref */ unsigned long futex_batches; struct rcu_head futex_rcu; atomic_long_t futex_atomic; unsigned int __percpu *futex_ref; #endif unsigned long hiwater_rss; /* High-watermark of RSS usage */ unsigned long hiwater_vm; /* High-water virtual memory usage */ unsigned long total_vm; /* Total pages mapped */ unsigned long locked_vm; /* Pages that have PG_mlocked set */ atomic64_t pinned_vm; /* Refcount permanently increased */ unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */ unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ unsigned long stack_vm; /* VM_STACK */ vm_flags_t def_flags; /** * @write_protect_seq: Locked when any thread is write * protecting pages mapped by this mm to enforce a later COW, * for instance during page table copying for fork(). */ seqcount_t write_protect_seq; spinlock_t arg_lock; /* protect the below fields */ unsigned long start_code, end_code, start_data, end_data; unsigned long start_brk, brk, start_stack; unsigned long arg_start, arg_end, env_start, env_end; unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ #ifdef CONFIG_ARCH_HAS_ELF_CORE_EFLAGS /* the ABI-related flags from the ELF header. Used for core dump */ unsigned long saved_e_flags; #endif struct percpu_counter rss_stat[NR_MM_COUNTERS]; struct linux_binfmt *binfmt; /* Architecture-specific MM context */ mm_context_t context; mm_flags_t flags; /* Must use mm_flags_* hlpers to access */ #ifdef CONFIG_AIO spinlock_t ioctx_lock; struct kioctx_table __rcu *ioctx_table; #endif #ifdef CONFIG_MEMCG /* * "owner" points to a task that is regarded as the canonical * user/owner of this mm. All of the following must be true in * order for it to be changed: * * current == mm->owner * current->mm != mm * new_owner->mm == mm * new_owner->alloc_lock is held */ struct task_struct __rcu *owner; #endif struct user_namespace *user_ns; /* store ref to file /proc/<pid>/exe symlink points to */ struct file __rcu *exe_file; #ifdef CONFIG_MMU_NOTIFIER struct mmu_notifier_subscriptions *notifier_subscriptions; #endif #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS) pgtable_t pmd_huge_pte; /* protected by page_table_lock */ #endif #ifdef CONFIG_NUMA_BALANCING /* * numa_next_scan is the next time that PTEs will be remapped * PROT_NONE to trigger NUMA hinting faults; such faults gather * statistics and migrate pages to new nodes if necessary. */ unsigned long numa_next_scan; /* Restart point for scanning and remapping PTEs. */ unsigned long numa_scan_offset; /* numa_scan_seq prevents two threads remapping PTEs. */ int numa_scan_seq; #endif /* * An operation with batched TLB flushing is going on. Anything * that can move process memory needs to flush the TLB when * moving a PROT_NONE mapped page. */ atomic_t tlb_flush_pending; #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH /* See flush_tlb_batched_pending() */ atomic_t tlb_flush_batched; #endif struct uprobes_state uprobes_state; #ifdef CONFIG_PREEMPT_RT struct rcu_head delayed_drop; #endif #ifdef CONFIG_HUGETLB_PAGE atomic_long_t hugetlb_usage; #endif struct work_struct async_put_work; #ifdef CONFIG_IOMMU_MM_DATA struct iommu_mm_data *iommu_mm; #endif #ifdef CONFIG_KSM /* * Represent how many pages of this process are involved in KSM * merging (not including ksm_zero_pages). */ unsigned long ksm_merging_pages; /* * Represent how many pages are checked for ksm merging * including merged and not merged. */ unsigned long ksm_rmap_items; /* * Represent how many empty pages are merged with kernel zero * pages when enabling KSM use_zero_pages. */ atomic_long_t ksm_zero_pages; #endif /* CONFIG_KSM */ #ifdef CONFIG_LRU_GEN_WALKS_MMU struct { /* this mm_struct is on lru_gen_mm_list */ struct list_head list; /* * Set when switching to this mm_struct, as a hint of * whether it has been used since the last time per-node * page table walkers cleared the corresponding bits. */ unsigned long bitmap; #ifdef CONFIG_MEMCG /* points to the memcg of "owner" above */ struct mem_cgroup *memcg; #endif } lru_gen; #endif /* CONFIG_LRU_GEN_WALKS_MMU */ #ifdef CONFIG_MM_ID mm_id_t mm_id; #endif /* CONFIG_MM_ID */ } __randomize_layout; /* * The mm_cpumask needs to be at the end of mm_struct, because it * is dynamically sized based on nr_cpu_ids. */ char flexible_array[] __aligned(__alignof__(unsigned long)); }; /* Copy value to the first system word of mm flags, non-atomically. */ static inline void __mm_flags_overwrite_word(struct mm_struct *mm, unsigned long value) { *ACCESS_PRIVATE(&mm->flags, __mm_flags) = value; } /* Obtain a read-only view of the mm flags bitmap. */ static inline const unsigned long *__mm_flags_get_bitmap(const struct mm_struct *mm) { return (const unsigned long *)ACCESS_PRIVATE(&mm->flags, __mm_flags); } /* Read the first system word of mm flags, non-atomically. */ static inline unsigned long __mm_flags_get_word(const struct mm_struct *mm) { return *__mm_flags_get_bitmap(mm); } /* * Update the first system word of mm flags ONLY, applying the specified mask to * it, then setting all flags specified by bits. */ static inline void __mm_flags_set_mask_bits_word(struct mm_struct *mm, unsigned long mask, unsigned long bits) { unsigned long *bitmap = ACCESS_PRIVATE(&mm->flags, __mm_flags); set_mask_bits(bitmap, mask, bits); } #define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | \ MT_FLAGS_USE_RCU) extern struct mm_struct init_mm; #define MM_STRUCT_FLEXIBLE_ARRAY_INIT \ { \ [0 ... sizeof(cpumask_t) + MM_CID_STATIC_SIZE - 1] = 0 \ } /* Pointer magic because the dynamic array size confuses some compilers. */ static inline void mm_init_cpumask(struct mm_struct *mm) { unsigned long cpu_bitmap = (unsigned long)mm; cpu_bitmap += offsetof(struct mm_struct, flexible_array); cpumask_clear((struct cpumask *)cpu_bitmap); } /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) { return (struct cpumask *)&mm->flexible_array; } #ifdef CONFIG_LRU_GEN struct lru_gen_mm_list { /* mm_struct list for page table walkers */ struct list_head fifo; /* protects the list above */ spinlock_t lock; }; #endif /* CONFIG_LRU_GEN */ #ifdef CONFIG_LRU_GEN_WALKS_MMU void lru_gen_add_mm(struct mm_struct *mm); void lru_gen_del_mm(struct mm_struct *mm); void lru_gen_migrate_mm(struct mm_struct *mm); static inline void lru_gen_init_mm(struct mm_struct *mm) { INIT_LIST_HEAD(&mm->lru_gen.list); mm->lru_gen.bitmap = 0; #ifdef CONFIG_MEMCG mm->lru_gen.memcg = NULL; #endif } static inline void lru_gen_use_mm(struct mm_struct *mm) { /* * When the bitmap is set, page reclaim knows this mm_struct has been * used since the last time it cleared the bitmap. So it might be worth * walking the page tables of this mm_struct to clear the accessed bit. */ WRITE_ONCE(mm->lru_gen.bitmap, -1); } #else /* !CONFIG_LRU_GEN_WALKS_MMU */ static inline void lru_gen_add_mm(struct mm_struct *mm) { } static inline void lru_gen_del_mm(struct mm_struct *mm) { } static inline void lru_gen_migrate_mm(struct mm_struct *mm) { } static inline void lru_gen_init_mm(struct mm_struct *mm) { } static inline void lru_gen_use_mm(struct mm_struct *mm) { } #endif /* CONFIG_LRU_GEN_WALKS_MMU */ struct vma_iterator { struct ma_state mas; }; #define VMA_ITERATOR(name, __mm, __addr) \ struct vma_iterator name = { \ .mas = { \ .tree = &(__mm)->mm_mt, \ .index = __addr, \ .node = NULL, \ .status = ma_start, \ }, \ } static inline void vma_iter_init(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long addr) { mas_init(&vmi->mas, &mm->mm_mt, addr); } #ifdef CONFIG_SCHED_MM_CID /* * mm_cpus_allowed: Union of all mm's threads allowed CPUs. */ static inline cpumask_t *mm_cpus_allowed(struct mm_struct *mm) { unsigned long bitmap = (unsigned long)mm; bitmap += offsetof(struct mm_struct, flexible_array); /* Skip cpu_bitmap */ bitmap += cpumask_size(); return (struct cpumask *)bitmap; } /* Accessor for struct mm_struct's cidmask. */ static inline unsigned long *mm_cidmask(struct mm_struct *mm) { unsigned long cid_bitmap = (unsigned long)mm_cpus_allowed(mm); /* Skip mm_cpus_allowed */ cid_bitmap += cpumask_size(); return (unsigned long *)cid_bitmap; } void mm_init_cid(struct mm_struct *mm, struct task_struct *p); static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *p) { mm->mm_cid.pcpu = alloc_percpu_noprof(struct mm_cid_pcpu); if (!mm->mm_cid.pcpu) return -ENOMEM; mm_init_cid(mm, p); return 0; } # define mm_alloc_cid(...) alloc_hooks(mm_alloc_cid_noprof(__VA_ARGS__)) static inline void mm_destroy_cid(struct mm_struct *mm) { free_percpu(mm->mm_cid.pcpu); mm->mm_cid.pcpu = NULL; } static inline unsigned int mm_cid_size(void) { /* mm_cpus_allowed(), mm_cidmask(). */ return cpumask_size() + bitmap_size(num_possible_cpus()); } /* Use 2 * NR_CPUS as worse case for static allocation. */ # define MM_CID_STATIC_SIZE (2 * sizeof(cpumask_t)) #else /* CONFIG_SCHED_MM_CID */ static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { } static inline int mm_alloc_cid(struct mm_struct *mm, struct task_struct *p) { return 0; } static inline void mm_destroy_cid(struct mm_struct *mm) { } static inline unsigned int mm_cid_size(void) { return 0; } # define MM_CID_STATIC_SIZE 0 #endif /* CONFIG_SCHED_MM_CID */ struct mmu_gather; extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); void tlb_gather_mmu_vma(struct mmu_gather *tlb, struct vm_area_struct *vma); extern void tlb_finish_mmu(struct mmu_gather *tlb); struct vm_fault; /** * typedef vm_fault_t - Return type for page fault handlers. * * Page fault handlers return a bitmask of %VM_FAULT values. */ typedef __bitwise unsigned int vm_fault_t; /** * enum vm_fault_reason - Page fault handlers return a bitmask of * these values to tell the core VM what happened when handling the * fault. Used to decide whether a process gets delivered SIGBUS or * just gets major/minor fault counters bumped up. * * @VM_FAULT_OOM: Out Of Memory * @VM_FAULT_SIGBUS: Bad access * @VM_FAULT_MAJOR: Page read from storage * @VM_FAULT_HWPOISON: Hit poisoned small page * @VM_FAULT_HWPOISON_LARGE: Hit poisoned large page. Index encoded * in upper bits * @VM_FAULT_SIGSEGV: segmentation fault * @VM_FAULT_NOPAGE: ->fault installed the pte, not return page * @VM_FAULT_LOCKED: ->fault locked the returned page * @VM_FAULT_RETRY: ->fault blocked, must retry * @VM_FAULT_FALLBACK: huge page fault failed, fall back to small * @VM_FAULT_DONE_COW: ->fault has fully handled COW * @VM_FAULT_NEEDDSYNC: ->fault did not modify page tables and needs * fsync() to complete (for synchronous page faults * in DAX) * @VM_FAULT_COMPLETED: ->fault completed, meanwhile mmap lock released * @VM_FAULT_HINDEX_MASK: mask HINDEX value * */ enum vm_fault_reason { VM_FAULT_OOM = (__force vm_fault_t)0x000001, VM_FAULT_SIGBUS = (__force vm_fault_t)0x000002, VM_FAULT_MAJOR = (__force vm_fault_t)0x000004, VM_FAULT_HWPOISON = (__force vm_fault_t)0x000010, VM_FAULT_HWPOISON_LARGE = (__force vm_fault_t)0x000020, VM_FAULT_SIGSEGV = (__force vm_fault_t)0x000040, VM_FAULT_NOPAGE = (__force vm_fault_t)0x000100, VM_FAULT_LOCKED = (__force vm_fault_t)0x000200, VM_FAULT_RETRY = (__force vm_fault_t)0x000400, VM_FAULT_FALLBACK = (__force vm_fault_t)0x000800, VM_FAULT_DONE_COW = (__force vm_fault_t)0x001000, VM_FAULT_NEEDDSYNC = (__force vm_fault_t)0x002000, VM_FAULT_COMPLETED = (__force vm_fault_t)0x004000, VM_FAULT_HINDEX_MASK = (__force vm_fault_t)0x0f0000, }; /* Encode hstate index for a hwpoisoned large page */ #define VM_FAULT_SET_HINDEX(x) ((__force vm_fault_t)((x) << 16)) #define VM_FAULT_GET_HINDEX(x) (((__force unsigned int)(x) >> 16) & 0xf) #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | \ VM_FAULT_SIGSEGV | VM_FAULT_HWPOISON | \ VM_FAULT_HWPOISON_LARGE | VM_FAULT_FALLBACK) #define VM_FAULT_RESULT_TRACE \ { VM_FAULT_OOM, "OOM" }, \ { VM_FAULT_SIGBUS, "SIGBUS" }, \ { VM_FAULT_MAJOR, "MAJOR" }, \ { VM_FAULT_HWPOISON, "HWPOISON" }, \ { VM_FAULT_HWPOISON_LARGE, "HWPOISON_LARGE" }, \ { VM_FAULT_SIGSEGV, "SIGSEGV" }, \ { VM_FAULT_NOPAGE, "NOPAGE" }, \ { VM_FAULT_LOCKED, "LOCKED" }, \ { VM_FAULT_RETRY, "RETRY" }, \ { VM_FAULT_FALLBACK, "FALLBACK" }, \ { VM_FAULT_DONE_COW, "DONE_COW" }, \ { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" }, \ { VM_FAULT_COMPLETED, "COMPLETED" } struct vm_special_mapping { const char *name; /* The name, e.g. "[vdso]". */ /* * If .fault is not provided, this points to a * NULL-terminated array of pages that back the special mapping. * * This must not be NULL unless .fault is provided. */ struct page **pages; /* * If non-NULL, then this is called to resolve page faults * on the special mapping. If used, .pages is not checked. */ vm_fault_t (*fault)(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf); int (*mremap)(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma); void (*close)(const struct vm_special_mapping *sm, struct vm_area_struct *vma); }; enum tlb_flush_reason { TLB_FLUSH_ON_TASK_SWITCH, TLB_REMOTE_SHOOTDOWN, TLB_LOCAL_SHOOTDOWN, TLB_LOCAL_MM_SHOOTDOWN, TLB_REMOTE_SEND_IPI, TLB_REMOTE_WRONG_CPU, }; /** * enum fault_flag - Fault flag definitions. * @FAULT_FLAG_WRITE: Fault was a write fault. * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE. * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked. * @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_lock and wait when retrying. * @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region. * @FAULT_FLAG_TRIED: The fault has been tried once. * @FAULT_FLAG_USER: The fault originated in userspace. * @FAULT_FLAG_REMOTE: The fault is not for current task/mm. * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch. * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals. * @FAULT_FLAG_UNSHARE: The fault is an unsharing request to break COW in a * COW mapping, making sure that an exclusive anon page is * mapped after the fault. * @FAULT_FLAG_ORIG_PTE_VALID: whether the fault has vmf->orig_pte cached. * We should only access orig_pte if this flag set. * @FAULT_FLAG_VMA_LOCK: The fault is handled under VMA lock. * * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify * whether we would allow page faults to retry by specifying these two * fault flags correctly. Currently there can be three legal combinations: * * (a) ALLOW_RETRY and !TRIED: this means the page fault allows retry, and * this is the first try * * (b) ALLOW_RETRY and TRIED: this means the page fault allows retry, and * we've already tried at least once * * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry * * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never * be used. Note that page faults can be allowed to retry for multiple times, * in which case we'll have an initial fault with flags (a) then later on * continuous faults with flags (b). We should always try to detect pending * signals before a retry to make sure the continuous page faults can still be * interrupted if necessary. * * The combination FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE is illegal. * FAULT_FLAG_UNSHARE is ignored and treated like an ordinary read fault when * applied to mappings that are not COW mappings. */ enum fault_flag { FAULT_FLAG_WRITE = 1 << 0, FAULT_FLAG_MKWRITE = 1 << 1, FAULT_FLAG_ALLOW_RETRY = 1 << 2, FAULT_FLAG_RETRY_NOWAIT = 1 << 3, FAULT_FLAG_KILLABLE = 1 << 4, FAULT_FLAG_TRIED = 1 << 5, FAULT_FLAG_USER = 1 << 6, FAULT_FLAG_REMOTE = 1 << 7, FAULT_FLAG_INSTRUCTION = 1 << 8, FAULT_FLAG_INTERRUPTIBLE = 1 << 9, FAULT_FLAG_UNSHARE = 1 << 10, FAULT_FLAG_ORIG_PTE_VALID = 1 << 11, FAULT_FLAG_VMA_LOCK = 1 << 12, }; typedef unsigned int __bitwise zap_flags_t; /* Flags for clear_young_dirty_ptes(). */ typedef int __bitwise cydp_t; /* Clear the access bit */ #define CYDP_CLEAR_YOUNG ((__force cydp_t)BIT(0)) /* Clear the dirty bit */ #define CYDP_CLEAR_DIRTY ((__force cydp_t)BIT(1)) /* * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each * other. Here is what they mean, and how to use them: * * * FIXME: For pages which are part of a filesystem, mappings are subject to the * lifetime enforced by the filesystem and we need guarantees that longterm * users like RDMA and V4L2 only establish mappings which coordinate usage with * the filesystem. Ideas for this coordination include revoking the longterm * pin, delaying writeback, bounce buffer page writeback, etc. As FS DAX was * added after the problem with filesystems was found FS DAX VMAs are * specifically failed. Filesystem pages are still subject to bugs and use of * FOLL_LONGTERM should be avoided on those pages. * * In the CMA case: long term pins in a CMA region would unnecessarily fragment * that region. And so, CMA attempts to migrate the page before pinning, when * FOLL_LONGTERM is specified. * * FOLL_PIN indicates that a special kind of tracking (not just page->_refcount, * but an additional pin counting system) will be invoked. This is intended for * anything that gets a page reference and then touches page data (for example, * Direct IO). This lets the filesystem know that some non-file-system entity is * potentially changing the pages' data. In contrast to FOLL_GET (whose pages * are released via put_page()), FOLL_PIN pages must be released, ultimately, by * a call to unpin_user_page(). * * FOLL_PIN is similar to FOLL_GET: both of these pin pages. They use different * and separate refcounting mechanisms, however, and that means that each has * its own acquire and release mechanisms: * * FOLL_GET: get_user_pages*() to acquire, and put_page() to release. * * FOLL_PIN: pin_user_pages*() to acquire, and unpin_user_pages to release. * * FOLL_PIN and FOLL_GET are mutually exclusive for a given function call. * (The underlying pages may experience both FOLL_GET-based and FOLL_PIN-based * calls applied to them, and that's perfectly OK. This is a constraint on the * callers, not on the pages.) * * FOLL_PIN should be set internally by the pin_user_pages*() APIs, never * directly by the caller. That's in order to help avoid mismatches when * releasing pages: get_user_pages*() pages must be released via put_page(), * while pin_user_pages*() pages must be released via unpin_user_page(). * * Please see Documentation/core-api/pin_user_pages.rst for more information. */ enum { /* check pte is writable */ FOLL_WRITE = 1 << 0, /* do get_page on page */ FOLL_GET = 1 << 1, /* give error on hole if it would be zero */ FOLL_DUMP = 1 << 2, /* get_user_pages read/write w/o permission */ FOLL_FORCE = 1 << 3, /* * if a disk transfer is needed, start the IO and return without waiting * upon it */ FOLL_NOWAIT = 1 << 4, /* do not fault in pages */ FOLL_NOFAULT = 1 << 5, /* check page is hwpoisoned */ FOLL_HWPOISON = 1 << 6, /* don't do file mappings */ FOLL_ANON = 1 << 7, /* * FOLL_LONGTERM indicates that the page will be held for an indefinite * time period _often_ under userspace control. This is in contrast to * iov_iter_get_pages(), whose usages are transient. */ FOLL_LONGTERM = 1 << 8, /* split huge pmd before returning */ FOLL_SPLIT_PMD = 1 << 9, /* allow returning PCI P2PDMA pages */ FOLL_PCI_P2PDMA = 1 << 10, /* allow interrupts from generic signals */ FOLL_INTERRUPTIBLE = 1 << 11, /* * Always honor (trigger) NUMA hinting faults. * * FOLL_WRITE implicitly honors NUMA hinting faults because a * PROT_NONE-mapped page is not writable (exceptions with FOLL_FORCE * apply). get_user_pages_fast_only() always implicitly honors NUMA * hinting faults. */ FOLL_HONOR_NUMA_FAULT = 1 << 12, /* See also internal only FOLL flags in mm/internal.h */ }; /* mm flags */ /* * The first two bits represent core dump modes for set-user-ID, * the modes are SUID_DUMP_* defined in linux/sched/coredump.h */ #define MMF_DUMPABLE_BITS 2 #define MMF_DUMPABLE_MASK (BIT(MMF_DUMPABLE_BITS) - 1) /* coredump filter bits */ #define MMF_DUMP_ANON_PRIVATE 2 #define MMF_DUMP_ANON_SHARED 3 #define MMF_DUMP_MAPPED_PRIVATE 4 #define MMF_DUMP_MAPPED_SHARED 5 #define MMF_DUMP_ELF_HEADERS 6 #define MMF_DUMP_HUGETLB_PRIVATE 7 #define MMF_DUMP_HUGETLB_SHARED 8 #define MMF_DUMP_DAX_PRIVATE 9 #define MMF_DUMP_DAX_SHARED 10 #define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS #define MMF_DUMP_FILTER_BITS 9 #define MMF_DUMP_FILTER_MASK \ ((BIT(MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) #define MMF_DUMP_FILTER_DEFAULT \ (BIT(MMF_DUMP_ANON_PRIVATE) | BIT(MMF_DUMP_ANON_SHARED) | \ BIT(MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF) #ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS # define MMF_DUMP_MASK_DEFAULT_ELF BIT(MMF_DUMP_ELF_HEADERS) #else # define MMF_DUMP_MASK_DEFAULT_ELF 0 #endif /* leave room for more dump flags */ #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ #define MMF_VM_HUGEPAGE 17 /* set when mm is available for khugepaged */ #define MMF_HUGE_ZERO_FOLIO 18 /* mm has ever used the global huge zero folio */ #define MMF_HAS_UPROBES 19 /* has uprobes */ #define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ #define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */ #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ #define MMF_DISABLE_THP_EXCEPT_ADVISED 23 /* no THP except when advised (e.g., VM_HUGEPAGE) */ #define MMF_DISABLE_THP_COMPLETELY 24 /* no THP for all VMAs */ #define MMF_DISABLE_THP_MASK (BIT(MMF_DISABLE_THP_COMPLETELY) | \ BIT(MMF_DISABLE_THP_EXCEPT_ADVISED)) #define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */ #define MMF_MULTIPROCESS 26 /* mm is shared between processes */ /* * MMF_HAS_PINNED: Whether this mm has pinned any pages. This can be either * replaced in the future by mm.pinned_vm when it becomes stable, or grow into * a counter on its own. We're aggresive on this bit for now: even if the * pinned pages were unpinned later on, we'll still keep this bit set for the * lifecycle of this mm, just for simplicity. */ #define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */ #define MMF_HAS_MDWE 28 #define MMF_HAS_MDWE_MASK BIT(MMF_HAS_MDWE) #define MMF_HAS_MDWE_NO_INHERIT 29 #define MMF_VM_MERGE_ANY 30 #define MMF_VM_MERGE_ANY_MASK BIT(MMF_VM_MERGE_ANY) #define MMF_TOPDOWN 31 /* mm searches top down by default */ #define MMF_TOPDOWN_MASK BIT(MMF_TOPDOWN) #define MMF_INIT_LEGACY_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\ MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK) /* Legacy flags must fit within 32 bits. */ static_assert((u64)MMF_INIT_LEGACY_MASK <= (u64)UINT_MAX); /* * Initialise legacy flags according to masks, propagating selected flags on * fork. Further flag manipulation can be performed by the caller. */ static inline unsigned long mmf_init_legacy_flags(unsigned long flags) { if (flags & (1UL << MMF_HAS_MDWE_NO_INHERIT)) flags &= ~((1UL << MMF_HAS_MDWE) | (1UL << MMF_HAS_MDWE_NO_INHERIT)); return flags & MMF_INIT_LEGACY_MASK; } #endif /* _LINUX_MM_TYPES_H */
8 8 8 8 8 8 8 71 71 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 // SPDX-License-Identifier: GPL-2.0-only /* * Landlock - Ptrace and scope hooks * * Copyright © 2017-2020 Mickaël Salaün <mic@digikod.net> * Copyright © 2019-2020 ANSSI * Copyright © 2024-2025 Microsoft Corporation */ #include <asm/current.h> #include <linux/cleanup.h> #include <linux/cred.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/lsm_audit.h> #include <linux/lsm_hooks.h> #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/sched/signal.h> #include <net/af_unix.h> #include <net/sock.h> #include "audit.h" #include "common.h" #include "cred.h" #include "domain.h" #include "fs.h" #include "ruleset.h" #include "setup.h" #include "task.h" /** * domain_scope_le - Checks domain ordering for scoped ptrace * * @parent: Parent domain. * @child: Potential child of @parent. * * Checks if the @parent domain is less or equal to (i.e. an ancestor, which * means a subset of) the @child domain. */ static bool domain_scope_le(const struct landlock_ruleset *const parent, const struct landlock_ruleset *const child) { const struct landlock_hierarchy *walker; /* Quick return for non-landlocked tasks. */ if (!parent) return true; if (!child) return false; for (walker = child->hierarchy; walker; walker = walker->parent) { if (walker == parent->hierarchy) /* @parent is in the scoped hierarchy of @child. */ return true; } /* There is no relationship between @parent and @child. */ return false; } static int domain_ptrace(const struct landlock_ruleset *const parent, const struct landlock_ruleset *const child) { if (domain_scope_le(parent, child)) return 0; return -EPERM; } /** * hook_ptrace_access_check - Determines whether the current process may access * another * * @child: Process to be accessed. * @mode: Mode of attachment. * * If the current task has Landlock rules, then the child must have at least * the same rules. Else denied. * * Determines whether a process may access another, returning 0 if permission * granted, -errno if denied. */ static int hook_ptrace_access_check(struct task_struct *const child, const unsigned int mode) { const struct landlock_cred_security *parent_subject; int err; /* Quick return for non-landlocked tasks. */ parent_subject = landlock_cred(current_cred()); if (!parent_subject) return 0; scoped_guard(rcu) { const struct landlock_ruleset *const child_dom = landlock_get_task_domain(child); err = domain_ptrace(parent_subject->domain, child_dom); } if (!err) return 0; /* * For the ptrace_access_check case, we log the current/parent domain * and the child task. */ if (!(mode & PTRACE_MODE_NOAUDIT)) landlock_log_denial(parent_subject, &(struct landlock_request) { .type = LANDLOCK_REQUEST_PTRACE, .audit = { .type = LSM_AUDIT_DATA_TASK, .u.tsk = child, }, .layer_plus_one = parent_subject->domain->num_layers, }); return err; } /** * hook_ptrace_traceme - Determines whether another process may trace the * current one * * @parent: Task proposed to be the tracer. * * If the parent has Landlock rules, then the current task must have the same * or more rules. Else denied. * * Determines whether the nominated task is permitted to trace the current * process, returning 0 if permission is granted, -errno if denied. */ static int hook_ptrace_traceme(struct task_struct *const parent) { const struct landlock_cred_security *parent_subject; const struct landlock_ruleset *child_dom; int err; child_dom = landlock_get_current_domain(); guard(rcu)(); parent_subject = landlock_cred(__task_cred(parent)); err = domain_ptrace(parent_subject->domain, child_dom); if (!err) return 0; /* * For the ptrace_traceme case, we log the domain which is the cause of * the denial, which means the parent domain instead of the current * domain. This may look unusual because the ptrace_traceme action is a * request to be traced, but the semantic is consistent with * hook_ptrace_access_check(). */ landlock_log_denial(parent_subject, &(struct landlock_request) { .type = LANDLOCK_REQUEST_PTRACE, .audit = { .type = LSM_AUDIT_DATA_TASK, .u.tsk = current, }, .layer_plus_one = parent_subject->domain->num_layers, }); return err; } /** * domain_is_scoped - Check if an interaction from a client/sender to a * server/receiver should be restricted based on scope controls. * * @client: IPC sender domain. * @server: IPC receiver domain. * @scope: The scope restriction criteria. * * Returns: True if @server is in a different domain from @client, and @client * is scoped to access @server (i.e. access should be denied). */ static bool domain_is_scoped(const struct landlock_ruleset *const client, const struct landlock_ruleset *const server, access_mask_t scope) { int client_layer, server_layer; const struct landlock_hierarchy *client_walker, *server_walker; /* Quick return if client has no domain */ if (WARN_ON_ONCE(!client)) return false; client_layer = client->num_layers - 1; client_walker = client->hierarchy; /* * client_layer must be a signed integer with greater capacity * than client->num_layers to ensure the following loop stops. */ BUILD_BUG_ON(sizeof(client_layer) > sizeof(client->num_layers)); server_layer = server ? (server->num_layers - 1) : -1; server_walker = server ? server->hierarchy : NULL; /* * Walks client's parent domains down to the same hierarchy level * as the server's domain, and checks that none of these client's * parent domains are scoped. */ for (; client_layer > server_layer; client_layer--) { if (landlock_get_scope_mask(client, client_layer) & scope) return true; client_walker = client_walker->parent; } /* * Walks server's parent domains down to the same hierarchy level as * the client's domain. */ for (; server_layer > client_layer; server_layer--) server_walker = server_walker->parent; for (; client_layer >= 0; client_layer--) { if (landlock_get_scope_mask(client, client_layer) & scope) { /* * Client and server are at the same level in the * hierarchy. If the client is scoped, the request is * only allowed if this domain is also a server's * ancestor. */ return server_walker != client_walker; } client_walker = client_walker->parent; server_walker = server_walker->parent; } return false; } static bool sock_is_scoped(struct sock *const other, const struct landlock_ruleset *const domain) { const struct landlock_ruleset *dom_other; /* The credentials will not change. */ lockdep_assert_held(&unix_sk(other)->lock); dom_other = landlock_cred(other->sk_socket->file->f_cred)->domain; return domain_is_scoped(domain, dom_other, LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET); } static bool is_abstract_socket(struct sock *const sock) { struct unix_address *addr = unix_sk(sock)->addr; if (!addr) return false; if (addr->len >= offsetof(struct sockaddr_un, sun_path) + 1 && addr->name->sun_path[0] == '\0') return true; return false; } static const struct access_masks unix_scope = { .scope = LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET, }; static int hook_unix_stream_connect(struct sock *const sock, struct sock *const other, struct sock *const newsk) { size_t handle_layer; const struct landlock_cred_security *const subject = landlock_get_applicable_subject(current_cred(), unix_scope, &handle_layer); /* Quick return for non-landlocked tasks. */ if (!subject) return 0; if (!is_abstract_socket(other)) return 0; if (!sock_is_scoped(other, subject->domain)) return 0; landlock_log_denial(subject, &(struct landlock_request) { .type = LANDLOCK_REQUEST_SCOPE_ABSTRACT_UNIX_SOCKET, .audit = { .type = LSM_AUDIT_DATA_NET, .u.net = &(struct lsm_network_audit) { .sk = other, }, }, .layer_plus_one = handle_layer + 1, }); return -EPERM; } static int hook_unix_may_send(struct socket *const sock, struct socket *const other) { size_t handle_layer; const struct landlock_cred_security *const subject = landlock_get_applicable_subject(current_cred(), unix_scope, &handle_layer); if (!subject) return 0; /* * Checks if this datagram socket was already allowed to be connected * to other. */ if (unix_peer(sock->sk) == other->sk) return 0; if (!is_abstract_socket(other->sk)) return 0; if (!sock_is_scoped(other->sk, subject->domain)) return 0; landlock_log_denial(subject, &(struct landlock_request) { .type = LANDLOCK_REQUEST_SCOPE_ABSTRACT_UNIX_SOCKET, .audit = { .type = LSM_AUDIT_DATA_NET, .u.net = &(struct lsm_network_audit) { .sk = other->sk, }, }, .layer_plus_one = handle_layer + 1, }); return -EPERM; } static const struct access_masks signal_scope = { .scope = LANDLOCK_SCOPE_SIGNAL, }; static int hook_task_kill(struct task_struct *const p, struct kernel_siginfo *const info, const int sig, const struct cred *cred) { bool is_scoped; size_t handle_layer; const struct landlock_cred_security *subject; if (!cred) { /* * Always allow sending signals between threads of the same process. * This is required for process credential changes by the Native POSIX * Threads Library and implemented by the set*id(2) wrappers and * libcap(3) with tgkill(2). See nptl(7) and libpsx(3). * * This exception is similar to the __ptrace_may_access() one. */ if (same_thread_group(p, current)) return 0; /* Not dealing with USB IO. */ cred = current_cred(); } subject = landlock_get_applicable_subject(cred, signal_scope, &handle_layer); /* Quick return for non-landlocked tasks. */ if (!subject) return 0; scoped_guard(rcu) { is_scoped = domain_is_scoped(subject->domain, landlock_get_task_domain(p), signal_scope.scope); } if (!is_scoped) return 0; landlock_log_denial(subject, &(struct landlock_request) { .type = LANDLOCK_REQUEST_SCOPE_SIGNAL, .audit = { .type = LSM_AUDIT_DATA_TASK, .u.tsk = p, }, .layer_plus_one = handle_layer + 1, }); return -EPERM; } static int hook_file_send_sigiotask(struct task_struct *tsk, struct fown_struct *fown, int signum) { const struct landlock_cred_security *subject; bool is_scoped = false; /* Lock already held by send_sigio() and send_sigurg(). */ lockdep_assert_held(&fown->lock); subject = &landlock_file(fown->file)->fown_subject; /* * Quick return for unowned socket. * * subject->domain has already been filtered when saved by * hook_file_set_fowner(), so there is no need to call * landlock_get_applicable_subject() here. */ if (!subject->domain) return 0; scoped_guard(rcu) { is_scoped = domain_is_scoped(subject->domain, landlock_get_task_domain(tsk), signal_scope.scope); } if (!is_scoped) return 0; landlock_log_denial(subject, &(struct landlock_request) { .type = LANDLOCK_REQUEST_SCOPE_SIGNAL, .audit = { .type = LSM_AUDIT_DATA_TASK, .u.tsk = tsk, }, #ifdef CONFIG_AUDIT .layer_plus_one = landlock_file(fown->file)->fown_layer + 1, #endif /* CONFIG_AUDIT */ }); return -EPERM; } static struct security_hook_list landlock_hooks[] __ro_after_init = { LSM_HOOK_INIT(ptrace_access_check, hook_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, hook_ptrace_traceme), LSM_HOOK_INIT(unix_stream_connect, hook_unix_stream_connect), LSM_HOOK_INIT(unix_may_send, hook_unix_may_send), LSM_HOOK_INIT(task_kill, hook_task_kill), LSM_HOOK_INIT(file_send_sigiotask, hook_file_send_sigiotask), }; __init void landlock_add_task_hooks(void) { security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks), &landlock_lsmid); }
2 11 9 2 3 1 4 2 11 34 3 6 1 1 2 21 3 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 // SPDX-License-Identifier: GPL-2.0-only /* * File: datagram.c * * Datagram (ISI) Phonet sockets * * Copyright (C) 2008 Nokia Corporation. * * Authors: Sakari Ailus <sakari.ailus@nokia.com> * Rémi Denis-Courmont */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/socket.h> #include <asm/ioctls.h> #include <net/sock.h> #include <linux/phonet.h> #include <linux/export.h> #include <net/phonet/phonet.h> static int pn_backlog_rcv(struct sock *sk, struct sk_buff *skb); /* associated socket ceases to exist */ static void pn_sock_close(struct sock *sk, long timeout) { sk_common_release(sk); } static int pn_ioctl(struct sock *sk, int cmd, int *karg) { struct sk_buff *skb; switch (cmd) { case SIOCINQ: spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); *karg = skb ? skb->len : 0; spin_unlock_bh(&sk->sk_receive_queue.lock); return 0; case SIOCPNADDRESOURCE: case SIOCPNDELRESOURCE: { u32 res = *karg; if (res >= 256) return -EINVAL; if (cmd == SIOCPNADDRESOURCE) return pn_sock_bind_res(sk, res); else return pn_sock_unbind_res(sk, res); } } return -ENOIOCTLCMD; } /* Destroy socket. All references are gone. */ static void pn_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_receive_queue); } static int pn_init(struct sock *sk) { sk->sk_destruct = pn_destruct; return 0; } static int pn_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { DECLARE_SOCKADDR(struct sockaddr_pn *, target, msg->msg_name); struct sk_buff *skb; int err; if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_NOSIGNAL| MSG_CMSG_COMPAT)) return -EOPNOTSUPP; if (target == NULL) return -EDESTADDRREQ; if (msg->msg_namelen < sizeof(struct sockaddr_pn)) return -EINVAL; if (target->spn_family != AF_PHONET) return -EAFNOSUPPORT; skb = sock_alloc_send_skb(sk, MAX_PHONET_HEADER + len, msg->msg_flags & MSG_DONTWAIT, &err); if (skb == NULL) return err; skb_reserve(skb, MAX_PHONET_HEADER); err = memcpy_from_msg((void *)skb_put(skb, len), msg, len); if (err < 0) { kfree_skb(skb); return err; } /* * Fill in the Phonet header and * finally pass the packet forwards. */ err = pn_skb_send(sk, skb, target); /* If ok, return len. */ return (err >= 0) ? len : err; } static int pn_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct sk_buff *skb = NULL; struct sockaddr_pn sa; int rval = -EOPNOTSUPP; int copylen; if (flags & ~(MSG_PEEK|MSG_TRUNC|MSG_DONTWAIT|MSG_NOSIGNAL| MSG_CMSG_COMPAT)) goto out_nofree; skb = skb_recv_datagram(sk, flags, &rval); if (skb == NULL) goto out_nofree; pn_skb_get_src_sockaddr(skb, &sa); copylen = skb->len; if (len < copylen) { msg->msg_flags |= MSG_TRUNC; copylen = len; } rval = skb_copy_datagram_msg(skb, 0, msg, copylen); if (rval) { rval = -EFAULT; goto out; } rval = (flags & MSG_TRUNC) ? skb->len : copylen; if (msg->msg_name != NULL) { __sockaddr_check_size(sizeof(sa)); memcpy(msg->msg_name, &sa, sizeof(sa)); *addr_len = sizeof(sa); } out: skb_free_datagram(sk, skb); out_nofree: return rval; } /* Queue an skb for a sock. */ static int pn_backlog_rcv(struct sock *sk, struct sk_buff *skb) { int err = sock_queue_rcv_skb(sk, skb); if (err < 0) kfree_skb(skb); return err ? NET_RX_DROP : NET_RX_SUCCESS; } /* Module registration */ static struct proto pn_proto = { .close = pn_sock_close, .ioctl = pn_ioctl, .init = pn_init, .sendmsg = pn_sendmsg, .recvmsg = pn_recvmsg, .backlog_rcv = pn_backlog_rcv, .hash = pn_sock_hash, .unhash = pn_sock_unhash, .get_port = pn_sock_get_port, .obj_size = sizeof(struct pn_sock), .owner = THIS_MODULE, .name = "PHONET", }; static const struct phonet_protocol pn_dgram_proto = { .ops = &phonet_dgram_ops, .prot = &pn_proto, .sock_type = SOCK_DGRAM, }; int __init isi_register(void) { return phonet_proto_register(PN_PROTO_PHONET, &pn_dgram_proto); } void __exit isi_unregister(void) { phonet_proto_unregister(PN_PROTO_PHONET, &pn_dgram_proto); }
102 280 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 /* SPDX-License-Identifier: GPL-2.0 */ /* * sysfs.h - definitions for the device driver filesystem * * Copyright (c) 2001,2002 Patrick Mochel * Copyright (c) 2004 Silicon Graphics, Inc. * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * * Please see Documentation/filesystems/sysfs.rst for more information. */ #ifndef _SYSFS_H_ #define _SYSFS_H_ #include <linux/kernfs.h> #include <linux/compiler.h> #include <linux/errno.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/kobject_ns.h> #include <linux/stat.h> #include <linux/atomic.h> struct kobject; struct module; struct bin_attribute; enum kobj_ns_type; struct attribute { const char *name; umode_t mode; #ifdef CONFIG_DEBUG_LOCK_ALLOC bool ignore_lockdep:1; struct lock_class_key *key; struct lock_class_key skey; #endif }; /** * sysfs_attr_init - initialize a dynamically allocated sysfs attribute * @attr: struct attribute to initialize * * Initialize a dynamically allocated struct attribute so we can * make lockdep happy. This is a new requirement for attributes * and initially this is only needed when lockdep is enabled. * Lockdep gives a nice error when your attribute is added to * sysfs if you don't have this. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC #define sysfs_attr_init(attr) \ do { \ static struct lock_class_key __key; \ \ (attr)->key = &__key; \ } while (0) #else #define sysfs_attr_init(attr) do {} while (0) #endif #ifdef CONFIG_CFI #define __SYSFS_FUNCTION_ALTERNATIVE(MEMBERS...) struct { MEMBERS } #else #define __SYSFS_FUNCTION_ALTERNATIVE(MEMBERS...) union { MEMBERS } #endif /** * struct attribute_group - data structure used to declare an attribute group. * @name: Optional: Attribute group name * If specified, the attribute group will be created in a * new subdirectory with this name. Additionally when a * group is named, @is_visible and @is_bin_visible may * return SYSFS_GROUP_INVISIBLE to control visibility of * the directory itself. * @is_visible: Optional: Function to return permissions associated with an * attribute of the group. Will be called repeatedly for * each non-binary attribute in the group. Only read/write * permissions as well as SYSFS_PREALLOC are accepted. Must * return 0 if an attribute is not visible. The returned * value will replace static permissions defined in struct * attribute. Use SYSFS_GROUP_VISIBLE() when assigning this * callback to specify separate _group_visible() and * _attr_visible() handlers. * @is_bin_visible: * Optional: Function to return permissions associated with a * binary attribute of the group. Will be called repeatedly * for each binary attribute in the group. Only read/write * permissions as well as SYSFS_PREALLOC (and the * visibility flags for named groups) are accepted. Must * return 0 if a binary attribute is not visible. The * returned value will replace static permissions defined * in struct bin_attribute. If @is_visible is not set, Use * SYSFS_GROUP_VISIBLE() when assigning this callback to * specify separate _group_visible() and _attr_visible() * handlers. * @bin_size: * Optional: Function to return the size of a binary attribute * of the group. Will be called repeatedly for each binary * attribute in the group. Overwrites the size field embedded * inside the attribute itself. * @attrs: Pointer to NULL terminated list of attributes. * @bin_attrs: Pointer to NULL terminated list of binary attributes. * Either attrs or bin_attrs or both must be provided. */ struct attribute_group { const char *name; __SYSFS_FUNCTION_ALTERNATIVE( umode_t (*is_visible)(struct kobject *, struct attribute *, int); umode_t (*is_visible_const)(struct kobject *, const struct attribute *, int); ); umode_t (*is_bin_visible)(struct kobject *, const struct bin_attribute *, int); size_t (*bin_size)(struct kobject *, const struct bin_attribute *, int); union { struct attribute **attrs; const struct attribute *const *attrs_const; }; const struct bin_attribute *const *bin_attrs; }; #define SYSFS_PREALLOC 010000 #define SYSFS_GROUP_INVISIBLE 020000 /* * DEFINE_SYSFS_GROUP_VISIBLE(name): * A helper macro to pair with the assignment of ".is_visible = * SYSFS_GROUP_VISIBLE(name)", that arranges for the directory * associated with a named attribute_group to optionally be hidden. * This allows for static declaration of attribute_groups, and the * simplification of attribute visibility lifetime that implies, * without polluting sysfs with empty attribute directories. * Ex. * * static umode_t example_attr_visible(struct kobject *kobj, * struct attribute *attr, int n) * { * if (example_attr_condition) * return 0; * else if (ro_attr_condition) * return 0444; * return a->mode; * } * * static bool example_group_visible(struct kobject *kobj) * { * if (example_group_condition) * return false; * return true; * } * * DEFINE_SYSFS_GROUP_VISIBLE(example); * * static struct attribute_group example_group = { * .name = "example", * .is_visible = SYSFS_GROUP_VISIBLE(example), * .attrs = &example_attrs, * }; * * Note that it expects <name>_attr_visible and <name>_group_visible to * be defined. For cases where individual attributes do not need * separate visibility consideration, only entire group visibility at * once, see DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(). */ #define DEFINE_SYSFS_GROUP_VISIBLE(name) \ static inline umode_t sysfs_group_visible_##name( \ struct kobject *kobj, struct attribute *attr, int n) \ { \ if (n == 0 && !name##_group_visible(kobj)) \ return SYSFS_GROUP_INVISIBLE; \ return name##_attr_visible(kobj, attr, n); \ } /* * DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(name): * A helper macro to pair with SYSFS_GROUP_VISIBLE() that like * DEFINE_SYSFS_GROUP_VISIBLE() controls group visibility, but does * not require the implementation of a per-attribute visibility * callback. * Ex. * * static bool example_group_visible(struct kobject *kobj) * { * if (example_group_condition) * return false; * return true; * } * * DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(example); * * static struct attribute_group example_group = { * .name = "example", * .is_visible = SYSFS_GROUP_VISIBLE(example), * .attrs = &example_attrs, * }; */ #define DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(name) \ static inline umode_t sysfs_group_visible_##name( \ struct kobject *kobj, struct attribute *a, int n) \ { \ if (n == 0 && !name##_group_visible(kobj)) \ return SYSFS_GROUP_INVISIBLE; \ return a->mode; \ } /* * Same as DEFINE_SYSFS_GROUP_VISIBLE, but for groups with only binary * attributes. If an attribute_group defines both text and binary * attributes, the group visibility is determined by the function * specified to is_visible() not is_bin_visible() */ #define DEFINE_SYSFS_BIN_GROUP_VISIBLE(name) \ static inline umode_t sysfs_group_visible_##name( \ struct kobject *kobj, const struct bin_attribute *attr, int n) \ { \ if (n == 0 && !name##_group_visible(kobj)) \ return SYSFS_GROUP_INVISIBLE; \ return name##_attr_visible(kobj, attr, n); \ } #define DEFINE_SIMPLE_SYSFS_BIN_GROUP_VISIBLE(name) \ static inline umode_t sysfs_group_visible_##name( \ struct kobject *kobj, const struct bin_attribute *a, int n) \ { \ if (n == 0 && !name##_group_visible(kobj)) \ return SYSFS_GROUP_INVISIBLE; \ return a->mode; \ } #define SYSFS_GROUP_VISIBLE(fn) sysfs_group_visible_##fn /* * Use these macros to make defining attributes easier. * See include/linux/device.h for examples.. */ #define __ATTR(_name, _mode, _show, _store) { \ .attr = {.name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(_mode) }, \ .show = _show, \ .store = _store, \ } #define __ATTR_PREALLOC(_name, _mode, _show, _store) { \ .attr = {.name = __stringify(_name), \ .mode = SYSFS_PREALLOC | VERIFY_OCTAL_PERMISSIONS(_mode) },\ .show = _show, \ .store = _store, \ } #define __ATTR_RO_MODE(_name, _mode) { \ .attr = { .name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(_mode) }, \ .show = _name##_show, \ } #define __ATTR_RO(_name) \ __ATTR_RO_MODE(_name, 0444) #define __ATTR_RW_MODE(_name, _mode) \ __ATTR(_name, _mode, _name##_show, _name##_store) #define __ATTR_WO(_name) \ __ATTR(_name, 0200, NULL, _name##_store) #define __ATTR_RW(_name) __ATTR(_name, 0644, _name##_show, _name##_store) #define __ATTR_NULL { .attr = { .name = NULL } } #ifdef CONFIG_DEBUG_LOCK_ALLOC #define __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) { \ .attr = {.name = __stringify(_name), .mode = _mode, \ .ignore_lockdep = true }, \ .show = _show, \ .store = _store, \ } #else #define __ATTR_IGNORE_LOCKDEP __ATTR #endif #define __ATTRIBUTE_GROUPS(_name) \ static const struct attribute_group *_name##_groups[] = { \ &_name##_group, \ NULL, \ } #define ATTRIBUTE_GROUPS(_name) \ static const struct attribute_group _name##_group = { \ .attrs = _Generic(_name##_attrs, \ struct attribute **: \ _name##_attrs, \ const struct attribute *const *: \ (void *)_name##_attrs \ ), \ }; \ __ATTRIBUTE_GROUPS(_name) #define BIN_ATTRIBUTE_GROUPS(_name) \ static const struct attribute_group _name##_group = { \ .bin_attrs = _name##_attrs, \ }; \ __ATTRIBUTE_GROUPS(_name) struct file; struct vm_area_struct; struct address_space; struct bin_attribute { struct attribute attr; size_t size; void *private; struct address_space *(*f_mapping)(void); ssize_t (*read)(struct file *, struct kobject *, const struct bin_attribute *, char *, loff_t, size_t); ssize_t (*write)(struct file *, struct kobject *, const struct bin_attribute *, char *, loff_t, size_t); loff_t (*llseek)(struct file *, struct kobject *, const struct bin_attribute *, loff_t, int); int (*mmap)(struct file *, struct kobject *, const struct bin_attribute *attr, struct vm_area_struct *vma); }; /** * sysfs_bin_attr_init - initialize a dynamically allocated bin_attribute * @attr: struct bin_attribute to initialize * * Initialize a dynamically allocated struct bin_attribute so we * can make lockdep happy. This is a new requirement for * attributes and initially this is only needed when lockdep is * enabled. Lockdep gives a nice error when your attribute is * added to sysfs if you don't have this. */ #define sysfs_bin_attr_init(bin_attr) sysfs_attr_init(&(bin_attr)->attr) /* macros to create static binary attributes easier */ #define __BIN_ATTR(_name, _mode, _read, _write, _size) { \ .attr = { .name = __stringify(_name), .mode = _mode }, \ .read = _read, \ .write = _write, \ .size = _size, \ } #define __BIN_ATTR_RO(_name, _size) \ __BIN_ATTR(_name, 0444, _name##_read, NULL, _size) #define __BIN_ATTR_WO(_name, _size) \ __BIN_ATTR(_name, 0200, NULL, _name##_write, _size) #define __BIN_ATTR_RW(_name, _size) \ __BIN_ATTR(_name, 0644, _name##_read, _name##_write, _size) #define __BIN_ATTR_NULL __ATTR_NULL #define BIN_ATTR(_name, _mode, _read, _write, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR(_name, _mode, _read, \ _write, _size) #define BIN_ATTR_RO(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_RO(_name, _size) #define BIN_ATTR_WO(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_WO(_name, _size) #define BIN_ATTR_RW(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_RW(_name, _size) #define __BIN_ATTR_ADMIN_RO(_name, _size) \ __BIN_ATTR(_name, 0400, _name##_read, NULL, _size) #define __BIN_ATTR_ADMIN_RW(_name, _size) \ __BIN_ATTR(_name, 0600, _name##_read, _name##_write, _size) #define BIN_ATTR_ADMIN_RO(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_ADMIN_RO(_name, _size) #define BIN_ATTR_ADMIN_RW(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_ADMIN_RW(_name, _size) #define __BIN_ATTR_SIMPLE_RO(_name, _mode) \ __BIN_ATTR(_name, _mode, sysfs_bin_attr_simple_read, NULL, 0) #define BIN_ATTR_SIMPLE_RO(_name) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_SIMPLE_RO(_name, 0444) #define BIN_ATTR_SIMPLE_ADMIN_RO(_name) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_SIMPLE_RO(_name, 0400) struct sysfs_ops { ssize_t (*show)(struct kobject *, struct attribute *, char *); ssize_t (*store)(struct kobject *, struct attribute *, const char *, size_t); }; #ifdef CONFIG_SYSFS int __must_check sysfs_create_dir_ns(struct kobject *kobj, const void *ns); void sysfs_remove_dir(struct kobject *kobj); int __must_check sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, const void *new_ns); int __must_check sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, const void *new_ns); int __must_check sysfs_create_mount_point(struct kobject *parent_kobj, const char *name); void sysfs_remove_mount_point(struct kobject *parent_kobj, const char *name); int __must_check sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns); int __must_check sysfs_create_files(struct kobject *kobj, const struct attribute * const *attr); int __must_check sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, umode_t mode); struct kernfs_node *sysfs_break_active_protection(struct kobject *kobj, const struct attribute *attr); void sysfs_unbreak_active_protection(struct kernfs_node *kn); void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns); bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr); void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *attr); int __must_check sysfs_create_bin_file(struct kobject *kobj, const struct bin_attribute *attr); void sysfs_remove_bin_file(struct kobject *kobj, const struct bin_attribute *attr); int __must_check sysfs_create_link(struct kobject *kobj, struct kobject *target, const char *name); int __must_check sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, const char *name); void sysfs_remove_link(struct kobject *kobj, const char *name); int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *target, const char *old_name, const char *new_name, const void *new_ns); void sysfs_delete_link(struct kobject *dir, struct kobject *targ, const char *name); int __must_check sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp); int __must_check sysfs_create_groups(struct kobject *kobj, const struct attribute_group **groups); int __must_check sysfs_update_groups(struct kobject *kobj, const struct attribute_group **groups); int sysfs_update_group(struct kobject *kobj, const struct attribute_group *grp); void sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp); void sysfs_remove_groups(struct kobject *kobj, const struct attribute_group **groups); int sysfs_add_file_to_group(struct kobject *kobj, const struct attribute *attr, const char *group); void sysfs_remove_file_from_group(struct kobject *kobj, const struct attribute *attr, const char *group); int sysfs_merge_group(struct kobject *kobj, const struct attribute_group *grp); void sysfs_unmerge_group(struct kobject *kobj, const struct attribute_group *grp); int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, struct kobject *target, const char *link_name); void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, const char *link_name); int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, struct kobject *target_kobj, const char *target_name, const char *symlink_name); void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr); int __must_check sysfs_init(void); static inline void sysfs_enable_ns(struct kernfs_node *kn) { return kernfs_enable_ns(kn); } int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid, kgid_t kgid); int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid); int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ, const char *name, kuid_t kuid, kgid_t kgid); int sysfs_groups_change_owner(struct kobject *kobj, const struct attribute_group **groups, kuid_t kuid, kgid_t kgid); int sysfs_group_change_owner(struct kobject *kobj, const struct attribute_group *groups, kuid_t kuid, kgid_t kgid); __printf(2, 3) int sysfs_emit(char *buf, const char *fmt, ...); __printf(3, 4) int sysfs_emit_at(char *buf, int at, const char *fmt, ...); ssize_t sysfs_bin_attr_simple_read(struct file *file, struct kobject *kobj, const struct bin_attribute *attr, char *buf, loff_t off, size_t count); #else /* CONFIG_SYSFS */ static inline int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) { return 0; } static inline void sysfs_remove_dir(struct kobject *kobj) { } static inline int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, const void *new_ns) { return 0; } static inline int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, const void *new_ns) { return 0; } static inline int sysfs_create_mount_point(struct kobject *parent_kobj, const char *name) { return 0; } static inline void sysfs_remove_mount_point(struct kobject *parent_kobj, const char *name) { } static inline int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns) { return 0; } static inline int sysfs_create_files(struct kobject *kobj, const struct attribute * const *attr) { return 0; } static inline int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, umode_t mode) { return 0; } static inline struct kernfs_node * sysfs_break_active_protection(struct kobject *kobj, const struct attribute *attr) { return NULL; } static inline void sysfs_unbreak_active_protection(struct kernfs_node *kn) { } static inline void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns) { } static inline bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr) { return false; } static inline void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *attr) { } static inline int sysfs_create_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { return 0; } static inline void sysfs_remove_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { } static inline int sysfs_create_link(struct kobject *kobj, struct kobject *target, const char *name) { return 0; } static inline int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, const char *name) { return 0; } static inline void sysfs_remove_link(struct kobject *kobj, const char *name) { } static inline int sysfs_rename_link_ns(struct kobject *k, struct kobject *t, const char *old_name, const char *new_name, const void *ns) { return 0; } static inline void sysfs_delete_link(struct kobject *k, struct kobject *t, const char *name) { } static inline int sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp) { return 0; } static inline int sysfs_create_groups(struct kobject *kobj, const struct attribute_group **groups) { return 0; } static inline int sysfs_update_groups(struct kobject *kobj, const struct attribute_group **groups) { return 0; } static inline int sysfs_update_group(struct kobject *kobj, const struct attribute_group *grp) { return 0; } static inline void sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp) { } static inline void sysfs_remove_groups(struct kobject *kobj, const struct attribute_group **groups) { } static inline int sysfs_add_file_to_group(struct kobject *kobj, const struct attribute *attr, const char *group) { return 0; } static inline void sysfs_remove_file_from_group(struct kobject *kobj, const struct attribute *attr, const char *group) { } static inline int sysfs_merge_group(struct kobject *kobj, const struct attribute_group *grp) { return 0; } static inline void sysfs_unmerge_group(struct kobject *kobj, const struct attribute_group *grp) { } static inline int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, struct kobject *target, const char *link_name) { return 0; } static inline void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, const char *link_name) { } static inline int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, struct kobject *target_kobj, const char *target_name, const char *symlink_name) { return 0; } static inline void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr) { } static inline int __must_check sysfs_init(void) { return 0; } static inline void sysfs_enable_ns(struct kernfs_node *kn) { } static inline int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ, const char *name, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_groups_change_owner(struct kobject *kobj, const struct attribute_group **groups, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_group_change_owner(struct kobject *kobj, const struct attribute_group *groups, kuid_t kuid, kgid_t kgid) { return 0; } __printf(2, 3) static inline int sysfs_emit(char *buf, const char *fmt, ...) { return 0; } __printf(3, 4) static inline int sysfs_emit_at(char *buf, int at, const char *fmt, ...) { return 0; } static inline ssize_t sysfs_bin_attr_simple_read(struct file *file, struct kobject *kobj, const struct bin_attribute *attr, char *buf, loff_t off, size_t count) { return 0; } #endif /* CONFIG_SYSFS */ static inline int __must_check sysfs_create_file(struct kobject *kobj, const struct attribute *attr) { return sysfs_create_file_ns(kobj, attr, NULL); } static inline void sysfs_remove_file(struct kobject *kobj, const struct attribute *attr) { sysfs_remove_file_ns(kobj, attr, NULL); } static inline int sysfs_rename_link(struct kobject *kobj, struct kobject *target, const char *old_name, const char *new_name) { return sysfs_rename_link_ns(kobj, target, old_name, new_name, NULL); } static inline void sysfs_notify_dirent(struct kernfs_node *kn) { kernfs_notify(kn); } static inline struct kernfs_node *sysfs_get_dirent(struct kernfs_node *parent, const char *name) { return kernfs_find_and_get(parent, name); } static inline struct kernfs_node *sysfs_get(struct kernfs_node *kn) { kernfs_get(kn); return kn; } static inline void sysfs_put(struct kernfs_node *kn) { kernfs_put(kn); } /* Permissions on a sysfs file: you didn't miss the 0 prefix did you? */ #define VERIFY_OCTAL_PERMISSIONS(perms) \ (BUILD_BUG_ON_ZERO((perms) < 0) + \ BUILD_BUG_ON_ZERO((perms) > 0777) + \ /* USER_READABLE >= GROUP_READABLE >= OTHER_READABLE */ \ BUILD_BUG_ON_ZERO((((perms) >> 6) & 4) < (((perms) >> 3) & 4)) + \ BUILD_BUG_ON_ZERO((((perms) >> 3) & 4) < ((perms) & 4)) + \ /* USER_WRITABLE >= GROUP_WRITABLE */ \ BUILD_BUG_ON_ZERO((((perms) >> 6) & 2) < (((perms) >> 3) & 2)) + \ /* OTHER_WRITABLE? Generally considered a bad idea. */ \ BUILD_BUG_ON_ZERO((perms) & 2) + \ (perms)) #endif /* _SYSFS_H_ */
602 2 2 168 592 601 605 606 603 598 602 603 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 // SPDX-License-Identifier: GPL-2.0-only /* * umh - the kernel usermode helper */ #include <linux/module.h> #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/binfmts.h> #include <linux/syscalls.h> #include <linux/unistd.h> #include <linux/kmod.h> #include <linux/slab.h> #include <linux/completion.h> #include <linux/cred.h> #include <linux/file.h> #include <linux/fs_struct.h> #include <linux/workqueue.h> #include <linux/security.h> #include <linux/mount.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/resource.h> #include <linux/notifier.h> #include <linux/suspend.h> #include <linux/rwsem.h> #include <linux/ptrace.h> #include <linux/async.h> #include <linux/uaccess.h> #include <linux/initrd.h> #include <linux/freezer.h> #include <trace/events/module.h> static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; static DEFINE_SPINLOCK(umh_sysctl_lock); static DECLARE_RWSEM(umhelper_sem); static void call_usermodehelper_freeinfo(struct subprocess_info *info) { if (info->cleanup) (*info->cleanup)(info); kfree(info); } static void umh_complete(struct subprocess_info *sub_info) { struct completion *comp = xchg(&sub_info->complete, NULL); /* * See call_usermodehelper_exec(). If xchg() returns NULL * we own sub_info, the UMH_KILLABLE caller has gone away * or the caller used UMH_NO_WAIT. */ if (comp) complete(comp); else call_usermodehelper_freeinfo(sub_info); } /* * This is the task which runs the usermode application */ static int call_usermodehelper_exec_async(void *data) { struct subprocess_info *sub_info = data; struct cred *new; int retval; spin_lock_irq(&current->sighand->siglock); flush_signal_handlers(current, 1); spin_unlock_irq(&current->sighand->siglock); /* * Initial kernel threads share ther FS with init, in order to * get the init root directory. But we've now created a new * thread that is going to execve a user process and has its own * 'struct fs_struct'. Reset umask to the default. */ current->fs->umask = 0022; /* * Our parent (unbound workqueue) runs with elevated scheduling * priority. Avoid propagating that into the userspace child. */ set_user_nice(current, 0); retval = -ENOMEM; new = prepare_kernel_cred(current); if (!new) goto out; spin_lock(&umh_sysctl_lock); new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); new->cap_inheritable = cap_intersect(usermodehelper_inheritable, new->cap_inheritable); spin_unlock(&umh_sysctl_lock); if (sub_info->init) { retval = sub_info->init(sub_info, new); if (retval) { abort_creds(new); goto out; } } commit_creds(new); wait_for_initramfs(); retval = kernel_execve(sub_info->path, (const char *const *)sub_info->argv, (const char *const *)sub_info->envp); out: sub_info->retval = retval; /* * call_usermodehelper_exec_sync() will call umh_complete * if UHM_WAIT_PROC. */ if (!(sub_info->wait & UMH_WAIT_PROC)) umh_complete(sub_info); if (!retval) return 0; do_exit(0); } /* Handles UMH_WAIT_PROC. */ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) { pid_t pid; /* If SIGCLD is ignored do_wait won't populate the status. */ kernel_sigaction(SIGCHLD, SIG_DFL); pid = user_mode_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD); if (pid < 0) sub_info->retval = pid; else kernel_wait(pid, &sub_info->retval); /* Restore default kernel sig handler */ kernel_sigaction(SIGCHLD, SIG_IGN); umh_complete(sub_info); } /* * We need to create the usermodehelper kernel thread from a task that is affine * to an optimized set of CPUs (or nohz housekeeping ones) such that they * inherit a widest affinity irrespective of call_usermodehelper() callers with * possibly reduced affinity (eg: per-cpu workqueues). We don't want * usermodehelper targets to contend a busy CPU. * * Unbound workqueues provide such wide affinity and allow to block on * UMH_WAIT_PROC requests without blocking pending request (up to some limit). * * Besides, workqueues provide the privilege level that caller might not have * to perform the usermodehelper request. * */ static void call_usermodehelper_exec_work(struct work_struct *work) { struct subprocess_info *sub_info = container_of(work, struct subprocess_info, work); if (sub_info->wait & UMH_WAIT_PROC) { call_usermodehelper_exec_sync(sub_info); } else { pid_t pid; /* * Use CLONE_PARENT to reparent it to kthreadd; we do not * want to pollute current->children, and we need a parent * that always ignores SIGCHLD to ensure auto-reaping. */ pid = user_mode_thread(call_usermodehelper_exec_async, sub_info, CLONE_PARENT | SIGCHLD); if (pid < 0) { sub_info->retval = pid; umh_complete(sub_info); } } } /* * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY * (used for preventing user land processes from being created after the user * land has been frozen during a system-wide hibernation or suspend operation). * Should always be manipulated under umhelper_sem acquired for write. */ static enum umh_disable_depth usermodehelper_disabled = UMH_DISABLED; /* Number of helpers running */ static atomic_t running_helpers = ATOMIC_INIT(0); /* * Wait queue head used by usermodehelper_disable() to wait for all running * helpers to finish. */ static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); /* * Used by usermodehelper_read_lock_wait() to wait for usermodehelper_disabled * to become 'false'. */ static DECLARE_WAIT_QUEUE_HEAD(usermodehelper_disabled_waitq); /* * Time to wait for running_helpers to become zero before the setting of * usermodehelper_disabled in usermodehelper_disable() fails */ #define RUNNING_HELPERS_TIMEOUT (5 * HZ) int usermodehelper_read_trylock(void) { DEFINE_WAIT(wait); int ret = 0; down_read(&umhelper_sem); for (;;) { prepare_to_wait(&usermodehelper_disabled_waitq, &wait, TASK_INTERRUPTIBLE); if (!usermodehelper_disabled) break; if (usermodehelper_disabled == UMH_DISABLED) ret = -EAGAIN; up_read(&umhelper_sem); if (ret) break; schedule(); try_to_freeze(); down_read(&umhelper_sem); } finish_wait(&usermodehelper_disabled_waitq, &wait); return ret; } EXPORT_SYMBOL_GPL(usermodehelper_read_trylock); long usermodehelper_read_lock_wait(long timeout) { DEFINE_WAIT(wait); if (timeout < 0) return -EINVAL; down_read(&umhelper_sem); for (;;) { prepare_to_wait(&usermodehelper_disabled_waitq, &wait, TASK_UNINTERRUPTIBLE); if (!usermodehelper_disabled) break; up_read(&umhelper_sem); timeout = schedule_timeout(timeout); if (!timeout) break; down_read(&umhelper_sem); } finish_wait(&usermodehelper_disabled_waitq, &wait); return timeout; } EXPORT_SYMBOL_GPL(usermodehelper_read_lock_wait); void usermodehelper_read_unlock(void) { up_read(&umhelper_sem); } EXPORT_SYMBOL_GPL(usermodehelper_read_unlock); /** * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled. * @depth: New value to assign to usermodehelper_disabled. * * Change the value of usermodehelper_disabled (under umhelper_sem locked for * writing) and wakeup tasks waiting for it to change. */ void __usermodehelper_set_disable_depth(enum umh_disable_depth depth) { down_write(&umhelper_sem); usermodehelper_disabled = depth; wake_up(&usermodehelper_disabled_waitq); up_write(&umhelper_sem); } /** * __usermodehelper_disable - Prevent new helpers from being started. * @depth: New value to assign to usermodehelper_disabled. * * Set usermodehelper_disabled to @depth and wait for running helpers to exit. */ int __usermodehelper_disable(enum umh_disable_depth depth) { long retval; if (!depth) return -EINVAL; down_write(&umhelper_sem); usermodehelper_disabled = depth; up_write(&umhelper_sem); /* * From now on call_usermodehelper_exec() won't start any new * helpers, so it is sufficient if running_helpers turns out to * be zero at one point (it may be increased later, but that * doesn't matter). */ retval = wait_event_timeout(running_helpers_waitq, atomic_read(&running_helpers) == 0, RUNNING_HELPERS_TIMEOUT); if (retval) return 0; __usermodehelper_set_disable_depth(UMH_ENABLED); return -EAGAIN; } static void helper_lock(void) { atomic_inc(&running_helpers); smp_mb__after_atomic(); } static void helper_unlock(void) { if (atomic_dec_and_test(&running_helpers)) wake_up(&running_helpers_waitq); } /** * call_usermodehelper_setup - prepare to call a usermode helper * @path: path to usermode executable * @argv: arg vector for process * @envp: environment for process * @gfp_mask: gfp mask for memory allocation * @init: an init function * @cleanup: a cleanup function * @data: arbitrary context sensitive data * * Returns either %NULL on allocation failure, or a subprocess_info * structure. This should be passed to call_usermodehelper_exec to * exec the process and free the structure. * * The init function is used to customize the helper process prior to * exec. A non-zero return code causes the process to error out, exit, * and return the failure to the calling process * * The cleanup function is just before the subprocess_info is about to * be freed. This can be used for freeing the argv and envp. The * Function must be runnable in either a process context or the * context in which call_usermodehelper_exec is called. */ struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv, char **envp, gfp_t gfp_mask, int (*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *info), void *data) { struct subprocess_info *sub_info; sub_info = kzalloc_obj(struct subprocess_info, gfp_mask); if (!sub_info) goto out; INIT_WORK(&sub_info->work, call_usermodehelper_exec_work); #ifdef CONFIG_STATIC_USERMODEHELPER sub_info->path = CONFIG_STATIC_USERMODEHELPER_PATH; #else sub_info->path = path; #endif sub_info->argv = argv; sub_info->envp = envp; sub_info->cleanup = cleanup; sub_info->init = init; sub_info->data = data; out: return sub_info; } EXPORT_SYMBOL(call_usermodehelper_setup); /** * call_usermodehelper_exec - start a usermode application * @sub_info: information about the subprocess * @wait: wait for the application to finish and return status. * when UMH_NO_WAIT don't wait at all, but you get no useful error back * when the program couldn't be exec'ed. This makes it safe to call * from interrupt context. * * Runs a user-space application. The application is started * asynchronously if wait is not set, and runs as a child of system workqueues. * (ie. it runs with full root capabilities and optimized affinity). * * Note: successful return value does not guarantee the helper was called at * all. You can't rely on sub_info->{init,cleanup} being called even for * UMH_WAIT_* wait modes as STATIC_USERMODEHELPER_PATH="" turns all helpers * into a successful no-op. */ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) { unsigned int state = TASK_UNINTERRUPTIBLE; DECLARE_COMPLETION_ONSTACK(done); int retval = 0; if (!sub_info->path) { call_usermodehelper_freeinfo(sub_info); return -EINVAL; } helper_lock(); if (usermodehelper_disabled) { retval = -EBUSY; goto out; } /* * If there is no binary for us to call, then just return and get out of * here. This allows us to set STATIC_USERMODEHELPER_PATH to "" and * disable all call_usermodehelper() calls. */ if (strlen(sub_info->path) == 0) goto out; /* * Set the completion pointer only if there is a waiter. * This makes it possible to use umh_complete to free * the data structure in case of UMH_NO_WAIT. */ sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done; sub_info->wait = wait; queue_work(system_unbound_wq, &sub_info->work); if (wait == UMH_NO_WAIT) /* task has freed sub_info */ goto unlock; if (wait & UMH_FREEZABLE) state |= TASK_FREEZABLE; if (wait & UMH_KILLABLE) { retval = wait_for_completion_state(&done, state | TASK_KILLABLE); if (!retval) goto wait_done; /* umh_complete() will see NULL and free sub_info */ if (xchg(&sub_info->complete, NULL)) goto unlock; /* * fallthrough; in case of -ERESTARTSYS now do uninterruptible * wait_for_completion_state(). Since umh_complete() shall call * complete() in a moment if xchg() above returned NULL, this * uninterruptible wait_for_completion_state() will not block * SIGKILL'ed processes for long. */ } wait_for_completion_state(&done, state); wait_done: retval = sub_info->retval; out: call_usermodehelper_freeinfo(sub_info); unlock: helper_unlock(); return retval; } EXPORT_SYMBOL(call_usermodehelper_exec); /** * call_usermodehelper() - prepare and start a usermode application * @path: path to usermode executable * @argv: arg vector for process * @envp: environment for process * @wait: wait for the application to finish and return status. * when UMH_NO_WAIT don't wait at all, but you get no useful error back * when the program couldn't be exec'ed. This makes it safe to call * from interrupt context. * * This function is the equivalent to use call_usermodehelper_setup() and * call_usermodehelper_exec(). */ int call_usermodehelper(const char *path, char **argv, char **envp, int wait) { struct subprocess_info *info; gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; info = call_usermodehelper_setup(path, argv, envp, gfp_mask, NULL, NULL, NULL); if (info == NULL) return -ENOMEM; return call_usermodehelper_exec(info, wait); } EXPORT_SYMBOL(call_usermodehelper); #if defined(CONFIG_SYSCTL) static int proc_cap_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; unsigned long cap_array[2]; kernel_cap_t new_cap, *cap; int err; if (write && (!capable(CAP_SETPCAP) || !capable(CAP_SYS_MODULE))) return -EPERM; /* * convert from the global kernel_cap_t to the ulong array to print to * userspace if this is a read. * * Legacy format: capabilities are exposed as two 32-bit values */ cap = table->data; spin_lock(&umh_sysctl_lock); cap_array[0] = (u32) cap->val; cap_array[1] = cap->val >> 32; spin_unlock(&umh_sysctl_lock); t = *table; t.data = &cap_array; /* * actually read or write and array of ulongs from userspace. Remember * these are least significant 32 bits first */ err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); if (err < 0) return err; new_cap.val = (u32)cap_array[0]; new_cap.val += (u64)cap_array[1] << 32; /* * Drop everything not in the new_cap (but don't add things) */ if (write) { spin_lock(&umh_sysctl_lock); *cap = cap_intersect(*cap, new_cap); spin_unlock(&umh_sysctl_lock); } return 0; } static const struct ctl_table usermodehelper_table[] = { { .procname = "bset", .data = &usermodehelper_bset, .maxlen = 2 * sizeof(unsigned long), .mode = 0600, .proc_handler = proc_cap_handler, }, { .procname = "inheritable", .data = &usermodehelper_inheritable, .maxlen = 2 * sizeof(unsigned long), .mode = 0600, .proc_handler = proc_cap_handler, }, }; static int __init init_umh_sysctls(void) { register_sysctl_init("kernel/usermodehelper", usermodehelper_table); return 0; } early_initcall(init_umh_sysctls); #endif /* CONFIG_SYSCTL */
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2016 Mellanox Technologies. All rights reserved. * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ #include "devl_internal.h" struct devlink_region { struct devlink *devlink; struct devlink_port *port; struct list_head list; union { const struct devlink_region_ops *ops; const struct devlink_port_region_ops *port_ops; }; struct mutex snapshot_lock; /* protects snapshot_list, * max_snapshots and cur_snapshots * consistency. */ struct list_head snapshot_list; u32 max_snapshots; u32 cur_snapshots; u64 size; }; struct devlink_snapshot { struct list_head list; struct devlink_region *region; u8 *data; u32 id; }; static struct devlink_region * devlink_region_get_by_name(struct devlink *devlink, const char *region_name) { struct devlink_region *region; list_for_each_entry(region, &devlink->region_list, list) if (!strcmp(region->ops->name, region_name)) return region; return NULL; } static struct devlink_region * devlink_port_region_get_by_name(struct devlink_port *port, const char *region_name) { struct devlink_region *region; list_for_each_entry(region, &port->region_list, list) if (!strcmp(region->port_ops->name, region_name)) return region; return NULL; } static struct devlink_snapshot * devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id) { struct devlink_snapshot *snapshot; list_for_each_entry(snapshot, &region->snapshot_list, list) if (snapshot->id == id) return snapshot; return NULL; } static int devlink_nl_region_snapshot_id_put(struct sk_buff *msg, struct devlink *devlink, struct devlink_snapshot *snapshot) { struct nlattr *snap_attr; int err; snap_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_REGION_SNAPSHOT); if (!snap_attr) return -EMSGSIZE; err = nla_put_u32(msg, DEVLINK_ATTR_REGION_SNAPSHOT_ID, snapshot->id); if (err) goto nla_put_failure; nla_nest_end(msg, snap_attr); return 0; nla_put_failure: nla_nest_cancel(msg, snap_attr); return err; } static int devlink_nl_region_snapshots_id_put(struct sk_buff *msg, struct devlink *devlink, struct devlink_region *region) { struct devlink_snapshot *snapshot; struct nlattr *snapshots_attr; int err; snapshots_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_REGION_SNAPSHOTS); if (!snapshots_attr) return -EMSGSIZE; list_for_each_entry(snapshot, &region->snapshot_list, list) { err = devlink_nl_region_snapshot_id_put(msg, devlink, snapshot); if (err) goto nla_put_failure; } nla_nest_end(msg, snapshots_attr); return 0; nla_put_failure: nla_nest_cancel(msg, snapshots_attr); return err; } static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink, enum devlink_command cmd, u32 portid, u32 seq, int flags, struct devlink_region *region) { void *hdr; int err; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; err = devlink_nl_put_handle(msg, devlink); if (err) goto nla_put_failure; if (region->port) { err = nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, region->port->index); if (err) goto nla_put_failure; } err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->ops->name); if (err) goto nla_put_failure; err = devlink_nl_put_u64(msg, DEVLINK_ATTR_REGION_SIZE, region->size); if (err) goto nla_put_failure; err = nla_put_u32(msg, DEVLINK_ATTR_REGION_MAX_SNAPSHOTS, region->max_snapshots); if (err) goto nla_put_failure; err = devlink_nl_region_snapshots_id_put(msg, devlink, region); if (err) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return err; } static struct sk_buff * devlink_nl_region_notify_build(struct devlink_region *region, struct devlink_snapshot *snapshot, enum devlink_command cmd, u32 portid, u32 seq) { struct devlink *devlink = region->devlink; struct sk_buff *msg; void *hdr; int err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return ERR_PTR(-ENOMEM); hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, 0, cmd); if (!hdr) { err = -EMSGSIZE; goto out_free_msg; } err = devlink_nl_put_handle(msg, devlink); if (err) goto out_cancel_msg; if (region->port) { err = nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, region->port->index); if (err) goto out_cancel_msg; } err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->ops->name); if (err) goto out_cancel_msg; if (snapshot) { err = nla_put_u32(msg, DEVLINK_ATTR_REGION_SNAPSHOT_ID, snapshot->id); if (err) goto out_cancel_msg; } else { err = devlink_nl_put_u64(msg, DEVLINK_ATTR_REGION_SIZE, region->size); if (err) goto out_cancel_msg; } genlmsg_end(msg, hdr); return msg; out_cancel_msg: genlmsg_cancel(msg, hdr); out_free_msg: nlmsg_free(msg); return ERR_PTR(err); } static void devlink_nl_region_notify(struct devlink_region *region, struct devlink_snapshot *snapshot, enum devlink_command cmd) { struct devlink *devlink = region->devlink; struct sk_buff *msg; WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL); if (!__devl_is_registered(devlink) || !devlink_nl_notify_need(devlink)) return; msg = devlink_nl_region_notify_build(region, snapshot, cmd, 0, 0); if (IS_ERR(msg)) return; devlink_nl_notify_send(devlink, msg); } void devlink_regions_notify_register(struct devlink *devlink) { struct devlink_region *region; list_for_each_entry(region, &devlink->region_list, list) devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW); } void devlink_regions_notify_unregister(struct devlink *devlink) { struct devlink_region *region; list_for_each_entry_reverse(region, &devlink->region_list, list) devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_DEL); } /** * __devlink_snapshot_id_increment - Increment number of snapshots using an id * @devlink: devlink instance * @id: the snapshot id * * Track when a new snapshot begins using an id. Load the count for the * given id from the snapshot xarray, increment it, and store it back. * * Called when a new snapshot is created with the given id. * * The id *must* have been previously allocated by * devlink_region_snapshot_id_get(). * * Returns 0 on success, or an error on failure. */ static int __devlink_snapshot_id_increment(struct devlink *devlink, u32 id) { unsigned long count; void *p; int err; xa_lock(&devlink->snapshot_ids); p = xa_load(&devlink->snapshot_ids, id); if (WARN_ON(!p)) { err = -EINVAL; goto unlock; } if (WARN_ON(!xa_is_value(p))) { err = -EINVAL; goto unlock; } count = xa_to_value(p); count++; err = xa_err(__xa_store(&devlink->snapshot_ids, id, xa_mk_value(count), GFP_ATOMIC)); unlock: xa_unlock(&devlink->snapshot_ids); return err; } /** * __devlink_snapshot_id_decrement - Decrease number of snapshots using an id * @devlink: devlink instance * @id: the snapshot id * * Track when a snapshot is deleted and stops using an id. Load the count * for the given id from the snapshot xarray, decrement it, and store it * back. * * If the count reaches zero, erase this id from the xarray, freeing it * up for future re-use by devlink_region_snapshot_id_get(). * * Called when a snapshot using the given id is deleted, and when the * initial allocator of the id is finished using it. */ static void __devlink_snapshot_id_decrement(struct devlink *devlink, u32 id) { unsigned long count; void *p; xa_lock(&devlink->snapshot_ids); p = xa_load(&devlink->snapshot_ids, id); if (WARN_ON(!p)) goto unlock; if (WARN_ON(!xa_is_value(p))) goto unlock; count = xa_to_value(p); if (count > 1) { count--; __xa_store(&devlink->snapshot_ids, id, xa_mk_value(count), GFP_ATOMIC); } else { /* If this was the last user, we can erase this id */ __xa_erase(&devlink->snapshot_ids, id); } unlock: xa_unlock(&devlink->snapshot_ids); } /** * __devlink_snapshot_id_insert - Insert a specific snapshot ID * @devlink: devlink instance * @id: the snapshot id * * Mark the given snapshot id as used by inserting a zero value into the * snapshot xarray. * * This must be called while holding the devlink instance lock. Unlike * devlink_snapshot_id_get, the initial reference count is zero, not one. * It is expected that the id will immediately be used before * releasing the devlink instance lock. * * Returns zero on success, or an error code if the snapshot id could not * be inserted. */ static int __devlink_snapshot_id_insert(struct devlink *devlink, u32 id) { int err; xa_lock(&devlink->snapshot_ids); if (xa_load(&devlink->snapshot_ids, id)) { xa_unlock(&devlink->snapshot_ids); return -EEXIST; } err = xa_err(__xa_store(&devlink->snapshot_ids, id, xa_mk_value(0), GFP_ATOMIC)); xa_unlock(&devlink->snapshot_ids); return err; } /** * __devlink_region_snapshot_id_get - get snapshot ID * @devlink: devlink instance * @id: storage to return snapshot id * * Allocates a new snapshot id. Returns zero on success, or a negative * error on failure. Must be called while holding the devlink instance * lock. * * Snapshot IDs are tracked using an xarray which stores the number of * users of the snapshot id. * * Note that the caller of this function counts as a 'user', in order to * avoid race conditions. The caller must release its hold on the * snapshot by using devlink_region_snapshot_id_put. */ static int __devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id) { return xa_alloc(&devlink->snapshot_ids, id, xa_mk_value(1), xa_limit_32b, GFP_KERNEL); } /** * __devlink_region_snapshot_create - create a new snapshot * This will add a new snapshot of a region. The snapshot * will be stored on the region struct and can be accessed * from devlink. This is useful for future analyses of snapshots. * Multiple snapshots can be created on a region. * The @snapshot_id should be obtained using the getter function. * * Must be called only while holding the region snapshot lock. * * @region: devlink region of the snapshot * @data: snapshot data * @snapshot_id: snapshot id to be created */ static int __devlink_region_snapshot_create(struct devlink_region *region, u8 *data, u32 snapshot_id) { struct devlink *devlink = region->devlink; struct devlink_snapshot *snapshot; int err; lockdep_assert_held(&region->snapshot_lock); /* check if region can hold one more snapshot */ if (region->cur_snapshots == region->max_snapshots) return -ENOSPC; if (devlink_region_snapshot_get_by_id(region, snapshot_id)) return -EEXIST; snapshot = kzalloc_obj(*snapshot); if (!snapshot) return -ENOMEM; err = __devlink_snapshot_id_increment(devlink, snapshot_id); if (err) goto err_snapshot_id_increment; snapshot->id = snapshot_id; snapshot->region = region; snapshot->data = data; list_add_tail(&snapshot->list, &region->snapshot_list); region->cur_snapshots++; devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_NEW); return 0; err_snapshot_id_increment: kfree(snapshot); return err; } static void devlink_region_snapshot_del(struct devlink_region *region, struct devlink_snapshot *snapshot) { struct devlink *devlink = region->devlink; lockdep_assert_held(&region->snapshot_lock); devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_DEL); region->cur_snapshots--; list_del(&snapshot->list); region->ops->destructor(snapshot->data); __devlink_snapshot_id_decrement(devlink, snapshot->id); kfree(snapshot); } int devlink_nl_region_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_port *port = NULL; struct devlink_region *region; const char *region_name; struct sk_buff *msg; unsigned int index; int err; if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME)) return -EINVAL; if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); port = devlink_port_get_by_index(devlink, index); if (!port) return -ENODEV; } region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]); if (port) region = devlink_port_region_get_by_name(port, region_name); else region = devlink_region_get_by_name(devlink, region_name); if (!region) return -EINVAL; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_nl_region_fill(msg, devlink, DEVLINK_CMD_REGION_GET, info->snd_portid, info->snd_seq, 0, region); if (err) { nlmsg_free(msg); return err; } return genlmsg_reply(msg, info); } static int devlink_nl_cmd_region_get_port_dumpit(struct sk_buff *msg, struct netlink_callback *cb, struct devlink_port *port, int *idx, int start, int flags) { struct devlink_region *region; int err = 0; list_for_each_entry(region, &port->region_list, list) { if (*idx < start) { (*idx)++; continue; } err = devlink_nl_region_fill(msg, port->devlink, DEVLINK_CMD_REGION_GET, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags, region); if (err) goto out; (*idx)++; } out: return err; } static int devlink_nl_region_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_region *region; struct devlink_port *port; unsigned long port_index; int idx = 0; int err; list_for_each_entry(region, &devlink->region_list, list) { if (idx < state->idx) { idx++; continue; } err = devlink_nl_region_fill(msg, devlink, DEVLINK_CMD_REGION_GET, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags, region); if (err) { state->idx = idx; return err; } idx++; } xa_for_each(&devlink->ports, port_index, port) { err = devlink_nl_cmd_region_get_port_dumpit(msg, cb, port, &idx, state->idx, flags); if (err) { state->idx = idx; return err; } } return 0; } int devlink_nl_region_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return devlink_nl_dumpit(skb, cb, devlink_nl_region_get_dump_one); } int devlink_nl_region_del_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_snapshot *snapshot; struct devlink_port *port = NULL; struct devlink_region *region; const char *region_name; unsigned int index; u32 snapshot_id; if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME) || GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_SNAPSHOT_ID)) return -EINVAL; region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]); snapshot_id = nla_get_u32(info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]); if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); port = devlink_port_get_by_index(devlink, index); if (!port) return -ENODEV; } if (port) region = devlink_port_region_get_by_name(port, region_name); else region = devlink_region_get_by_name(devlink, region_name); if (!region) return -EINVAL; mutex_lock(&region->snapshot_lock); snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id); if (!snapshot) { mutex_unlock(&region->snapshot_lock); return -EINVAL; } devlink_region_snapshot_del(region, snapshot); mutex_unlock(&region->snapshot_lock); return 0; } int devlink_nl_region_new_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_snapshot *snapshot; struct devlink_port *port = NULL; struct nlattr *snapshot_id_attr; struct devlink_region *region; const char *region_name; unsigned int index; u32 snapshot_id; u8 *data; int err; if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME)) { NL_SET_ERR_MSG(info->extack, "No region name provided"); return -EINVAL; } region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]); if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); port = devlink_port_get_by_index(devlink, index); if (!port) return -ENODEV; } if (port) region = devlink_port_region_get_by_name(port, region_name); else region = devlink_region_get_by_name(devlink, region_name); if (!region) { NL_SET_ERR_MSG(info->extack, "The requested region does not exist"); return -EINVAL; } if (!region->ops->snapshot) { NL_SET_ERR_MSG(info->extack, "The requested region does not support taking an immediate snapshot"); return -EOPNOTSUPP; } mutex_lock(&region->snapshot_lock); if (region->cur_snapshots == region->max_snapshots) { NL_SET_ERR_MSG(info->extack, "The region has reached the maximum number of stored snapshots"); err = -ENOSPC; goto unlock; } snapshot_id_attr = info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]; if (snapshot_id_attr) { snapshot_id = nla_get_u32(snapshot_id_attr); if (devlink_region_snapshot_get_by_id(region, snapshot_id)) { NL_SET_ERR_MSG(info->extack, "The requested snapshot id is already in use"); err = -EEXIST; goto unlock; } err = __devlink_snapshot_id_insert(devlink, snapshot_id); if (err) goto unlock; } else { err = __devlink_region_snapshot_id_get(devlink, &snapshot_id); if (err) { NL_SET_ERR_MSG(info->extack, "Failed to allocate a new snapshot id"); goto unlock; } } if (port) err = region->port_ops->snapshot(port, region->port_ops, info->extack, &data); else err = region->ops->snapshot(devlink, region->ops, info->extack, &data); if (err) goto err_snapshot_capture; err = __devlink_region_snapshot_create(region, data, snapshot_id); if (err) goto err_snapshot_create; if (!snapshot_id_attr) { struct sk_buff *msg; snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id); if (WARN_ON(!snapshot)) { err = -EINVAL; goto unlock; } msg = devlink_nl_region_notify_build(region, snapshot, DEVLINK_CMD_REGION_NEW, info->snd_portid, info->snd_seq); err = PTR_ERR_OR_ZERO(msg); if (err) goto err_notify; err = genlmsg_reply(msg, info); if (err) goto err_notify; } mutex_unlock(&region->snapshot_lock); return 0; err_snapshot_create: region->ops->destructor(data); err_snapshot_capture: __devlink_snapshot_id_decrement(devlink, snapshot_id); mutex_unlock(&region->snapshot_lock); return err; err_notify: devlink_region_snapshot_del(region, snapshot); unlock: mutex_unlock(&region->snapshot_lock); return err; } static int devlink_nl_cmd_region_read_chunk_fill(struct sk_buff *msg, u8 *chunk, u32 chunk_size, u64 addr) { struct nlattr *chunk_attr; int err; chunk_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_REGION_CHUNK); if (!chunk_attr) return -EINVAL; err = nla_put(msg, DEVLINK_ATTR_REGION_CHUNK_DATA, chunk_size, chunk); if (err) goto nla_put_failure; err = devlink_nl_put_u64(msg, DEVLINK_ATTR_REGION_CHUNK_ADDR, addr); if (err) goto nla_put_failure; nla_nest_end(msg, chunk_attr); return 0; nla_put_failure: nla_nest_cancel(msg, chunk_attr); return err; } #define DEVLINK_REGION_READ_CHUNK_SIZE 256 typedef int devlink_chunk_fill_t(void *cb_priv, u8 *chunk, u32 chunk_size, u64 curr_offset, struct netlink_ext_ack *extack); static int devlink_nl_region_read_fill(struct sk_buff *skb, devlink_chunk_fill_t *cb, void *cb_priv, u64 start_offset, u64 end_offset, u64 *new_offset, struct netlink_ext_ack *extack) { u64 curr_offset = start_offset; int err = 0; u8 *data; /* Allocate and re-use a single buffer */ data = kmalloc(DEVLINK_REGION_READ_CHUNK_SIZE, GFP_KERNEL); if (!data) return -ENOMEM; *new_offset = start_offset; while (curr_offset < end_offset) { u32 data_size; data_size = min_t(u32, end_offset - curr_offset, DEVLINK_REGION_READ_CHUNK_SIZE); err = cb(cb_priv, data, data_size, curr_offset, extack); if (err) break; err = devlink_nl_cmd_region_read_chunk_fill(skb, data, data_size, curr_offset); if (err) break; curr_offset += data_size; } *new_offset = curr_offset; kfree(data); return err; } static int devlink_region_snapshot_fill(void *cb_priv, u8 *chunk, u32 chunk_size, u64 curr_offset, struct netlink_ext_ack __always_unused *extack) { struct devlink_snapshot *snapshot = cb_priv; memcpy(chunk, &snapshot->data[curr_offset], chunk_size); return 0; } static int devlink_region_port_direct_fill(void *cb_priv, u8 *chunk, u32 chunk_size, u64 curr_offset, struct netlink_ext_ack *extack) { struct devlink_region *region = cb_priv; return region->port_ops->read(region->port, region->port_ops, extack, curr_offset, chunk_size, chunk); } static int devlink_region_direct_fill(void *cb_priv, u8 *chunk, u32 chunk_size, u64 curr_offset, struct netlink_ext_ack *extack) { struct devlink_region *region = cb_priv; return region->ops->read(region->devlink, region->ops, extack, curr_offset, chunk_size, chunk); } int devlink_nl_region_read_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct nlattr *chunks_attr, *region_attr, *snapshot_attr; u64 ret_offset, start_offset, end_offset = U64_MAX; struct nlattr **attrs = info->info.attrs; struct devlink_port *port = NULL; devlink_chunk_fill_t *region_cb; struct devlink_region *region; const char *region_name; struct devlink *devlink; unsigned int index; void *region_cb_priv; void *hdr; int err; start_offset = state->start_offset; devlink = devlink_get_from_attrs_lock(sock_net(cb->skb->sk), attrs, false); if (IS_ERR(devlink)) return PTR_ERR(devlink); if (!attrs[DEVLINK_ATTR_REGION_NAME]) { NL_SET_ERR_MSG(cb->extack, "No region name provided"); err = -EINVAL; goto out_unlock; } if (attrs[DEVLINK_ATTR_PORT_INDEX]) { index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]); port = devlink_port_get_by_index(devlink, index); if (!port) { err = -ENODEV; goto out_unlock; } } region_attr = attrs[DEVLINK_ATTR_REGION_NAME]; region_name = nla_data(region_attr); if (port) region = devlink_port_region_get_by_name(port, region_name); else region = devlink_region_get_by_name(devlink, region_name); if (!region) { NL_SET_ERR_MSG_ATTR(cb->extack, region_attr, "Requested region does not exist"); err = -EINVAL; goto out_unlock; } snapshot_attr = attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]; if (!snapshot_attr) { if (!nla_get_flag(attrs[DEVLINK_ATTR_REGION_DIRECT])) { NL_SET_ERR_MSG(cb->extack, "No snapshot id provided"); err = -EINVAL; goto out_unlock; } if (!region->ops->read) { NL_SET_ERR_MSG(cb->extack, "Requested region does not support direct read"); err = -EOPNOTSUPP; goto out_unlock; } if (port) region_cb = &devlink_region_port_direct_fill; else region_cb = &devlink_region_direct_fill; region_cb_priv = region; } else { struct devlink_snapshot *snapshot; u32 snapshot_id; if (nla_get_flag(attrs[DEVLINK_ATTR_REGION_DIRECT])) { NL_SET_ERR_MSG_ATTR(cb->extack, snapshot_attr, "Direct region read does not use snapshot"); err = -EINVAL; goto out_unlock; } snapshot_id = nla_get_u32(snapshot_attr); snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id); if (!snapshot) { NL_SET_ERR_MSG_ATTR(cb->extack, snapshot_attr, "Requested snapshot does not exist"); err = -EINVAL; goto out_unlock; } region_cb = &devlink_region_snapshot_fill; region_cb_priv = snapshot; } if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] && attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) { if (!start_offset) start_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]); } if (end_offset > region->size) end_offset = region->size; /* return 0 if there is no further data to read */ if (start_offset == end_offset) { err = 0; goto out_unlock; } hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI, DEVLINK_CMD_REGION_READ); if (!hdr) { err = -EMSGSIZE; goto out_unlock; } err = devlink_nl_put_handle(skb, devlink); if (err) goto nla_put_failure; if (region->port) { err = nla_put_u32(skb, DEVLINK_ATTR_PORT_INDEX, region->port->index); if (err) goto nla_put_failure; } err = nla_put_string(skb, DEVLINK_ATTR_REGION_NAME, region_name); if (err) goto nla_put_failure; chunks_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_REGION_CHUNKS); if (!chunks_attr) { err = -EMSGSIZE; goto nla_put_failure; } err = devlink_nl_region_read_fill(skb, region_cb, region_cb_priv, start_offset, end_offset, &ret_offset, cb->extack); if (err && err != -EMSGSIZE) goto nla_put_failure; /* Check if there was any progress done to prevent infinite loop */ if (ret_offset == start_offset) { err = -EINVAL; goto nla_put_failure; } state->start_offset = ret_offset; nla_nest_end(skb, chunks_attr); genlmsg_end(skb, hdr); devl_unlock(devlink); devlink_put(devlink); return skb->len; nla_put_failure: genlmsg_cancel(skb, hdr); out_unlock: devl_unlock(devlink); devlink_put(devlink); return err; } /** * devl_region_create - create a new address region * * @devlink: devlink * @ops: region operations and name * @region_max_snapshots: Maximum supported number of snapshots for region * @region_size: size of region */ struct devlink_region *devl_region_create(struct devlink *devlink, const struct devlink_region_ops *ops, u32 region_max_snapshots, u64 region_size) { struct devlink_region *region; devl_assert_locked(devlink); if (WARN_ON(!ops) || WARN_ON(!ops->destructor)) return ERR_PTR(-EINVAL); if (devlink_region_get_by_name(devlink, ops->name)) return ERR_PTR(-EEXIST); region = kzalloc_obj(*region); if (!region) return ERR_PTR(-ENOMEM); region->devlink = devlink; region->max_snapshots = region_max_snapshots; region->ops = ops; region->size = region_size; INIT_LIST_HEAD(&region->snapshot_list); mutex_init(&region->snapshot_lock); list_add_tail(&region->list, &devlink->region_list); devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW); return region; } EXPORT_SYMBOL_GPL(devl_region_create); /** * devlink_region_create - create a new address region * * @devlink: devlink * @ops: region operations and name * @region_max_snapshots: Maximum supported number of snapshots for region * @region_size: size of region * * Context: Takes and release devlink->lock <mutex>. */ struct devlink_region * devlink_region_create(struct devlink *devlink, const struct devlink_region_ops *ops, u32 region_max_snapshots, u64 region_size) { struct devlink_region *region; devl_lock(devlink); region = devl_region_create(devlink, ops, region_max_snapshots, region_size); devl_unlock(devlink); return region; } EXPORT_SYMBOL_GPL(devlink_region_create); /** * devlink_port_region_create - create a new address region for a port * * @port: devlink port * @ops: region operations and name * @region_max_snapshots: Maximum supported number of snapshots for region * @region_size: size of region * * Context: Takes and release devlink->lock <mutex>. */ struct devlink_region * devlink_port_region_create(struct devlink_port *port, const struct devlink_port_region_ops *ops, u32 region_max_snapshots, u64 region_size) { struct devlink *devlink = port->devlink; struct devlink_region *region; int err = 0; ASSERT_DEVLINK_PORT_INITIALIZED(port); if (WARN_ON(!ops) || WARN_ON(!ops->destructor)) return ERR_PTR(-EINVAL); devl_lock(devlink); if (devlink_port_region_get_by_name(port, ops->name)) { err = -EEXIST; goto unlock; } region = kzalloc_obj(*region); if (!region) { err = -ENOMEM; goto unlock; } region->devlink = devlink; region->port = port; region->max_snapshots = region_max_snapshots; region->port_ops = ops; region->size = region_size; INIT_LIST_HEAD(&region->snapshot_list); mutex_init(&region->snapshot_lock); list_add_tail(&region->list, &port->region_list); devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW); devl_unlock(devlink); return region; unlock: devl_unlock(devlink); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(devlink_port_region_create); /** * devl_region_destroy - destroy address region * * @region: devlink region to destroy */ void devl_region_destroy(struct devlink_region *region) { struct devlink *devlink = region->devlink; struct devlink_snapshot *snapshot, *ts; devl_assert_locked(devlink); /* Free all snapshots of region */ mutex_lock(&region->snapshot_lock); list_for_each_entry_safe(snapshot, ts, &region->snapshot_list, list) devlink_region_snapshot_del(region, snapshot); mutex_unlock(&region->snapshot_lock); list_del(&region->list); mutex_destroy(&region->snapshot_lock); devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_DEL); kfree(region); } EXPORT_SYMBOL_GPL(devl_region_destroy); /** * devlink_region_destroy - destroy address region * * @region: devlink region to destroy * * Context: Takes and release devlink->lock <mutex>. */ void devlink_region_destroy(struct devlink_region *region) { struct devlink *devlink = region->devlink; devl_lock(devlink); devl_region_destroy(region); devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_region_destroy); /** * devlink_region_snapshot_id_get - get snapshot ID * * This callback should be called when adding a new snapshot, * Driver should use the same id for multiple snapshots taken * on multiple regions at the same time/by the same trigger. * * The caller of this function must use devlink_region_snapshot_id_put * when finished creating regions using this id. * * Returns zero on success, or a negative error code on failure. * * @devlink: devlink * @id: storage to return id */ int devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id) { return __devlink_region_snapshot_id_get(devlink, id); } EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_get); /** * devlink_region_snapshot_id_put - put snapshot ID reference * * This should be called by a driver after finishing creating snapshots * with an id. Doing so ensures that the ID can later be released in the * event that all snapshots using it have been destroyed. * * @devlink: devlink * @id: id to release reference on */ void devlink_region_snapshot_id_put(struct devlink *devlink, u32 id) { __devlink_snapshot_id_decrement(devlink, id); } EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_put); /** * devlink_region_snapshot_create - create a new snapshot * This will add a new snapshot of a region. The snapshot * will be stored on the region struct and can be accessed * from devlink. This is useful for future analyses of snapshots. * Multiple snapshots can be created on a region. * The @snapshot_id should be obtained using the getter function. * * @region: devlink region of the snapshot * @data: snapshot data * @snapshot_id: snapshot id to be created */ int devlink_region_snapshot_create(struct devlink_region *region, u8 *data, u32 snapshot_id) { int err; mutex_lock(&region->snapshot_lock); err = __devlink_region_snapshot_create(region, data, snapshot_id); mutex_unlock(&region->snapshot_lock); return err; } EXPORT_SYMBOL_GPL(devlink_region_snapshot_create);
7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 // SPDX-License-Identifier: GPL-2.0 /* * Implement the default iomap interfaces * * (C) Copyright 2004 Linus Torvalds */ #include <linux/pci.h> #include <linux/io.h> #include <linux/kmsan-checks.h> #include <linux/export.h> /* * Read/write from/to an (offsettable) iomem cookie. It might be a PIO * access or a MMIO access, these functions don't care. The info is * encoded in the hardware mapping set up by the mapping functions * (or the cookie itself, depending on implementation and hw). * * The generic routines don't assume any hardware mappings, and just * encode the PIO/MMIO as part of the cookie. They coldly assume that * the MMIO IO mappings are not in the low address range. * * Architectures for which this is not true can't use this generic * implementation and should do their own copy. */ #ifndef HAVE_ARCH_PIO_SIZE /* * We encode the physical PIO addresses (0-0xffff) into the * pointer by offsetting them with a constant (0x10000) and * assuming that all the low addresses are always PIO. That means * we can do some sanity checks on the low bits, and don't * need to just take things for granted. */ #define PIO_OFFSET 0x10000UL #define PIO_MASK 0x0ffffUL #define PIO_RESERVED 0x40000UL #endif static void bad_io_access(unsigned long port, const char *access) { static int count = 10; if (count) { count--; WARN(1, KERN_ERR "Bad IO access at port %#lx (%s)\n", port, access); } } /* * Ugly macros are a way of life. */ #define IO_COND(addr, is_pio, is_mmio) do { \ unsigned long port = (unsigned long __force)addr; \ if (port >= PIO_RESERVED) { \ is_mmio; \ } else if (port > PIO_OFFSET) { \ port &= PIO_MASK; \ is_pio; \ } else \ bad_io_access(port, #is_pio ); \ } while (0) #ifndef pio_read16be #define pio_read16be(port) swab16(inw(port)) #define pio_read32be(port) swab32(inl(port)) #endif #ifndef mmio_read16be #define mmio_read16be(addr) swab16(readw(addr)) #define mmio_read32be(addr) swab32(readl(addr)) #define mmio_read64be(addr) swab64(readq(addr)) #endif /* * Here and below, we apply __no_kmsan_checks to functions reading data from * hardware, to ensure that KMSAN marks their return values as initialized. */ __no_kmsan_checks unsigned int ioread8(const void __iomem *addr) { IO_COND(addr, return inb(port), return readb(addr)); return 0xff; } __no_kmsan_checks unsigned int ioread16(const void __iomem *addr) { IO_COND(addr, return inw(port), return readw(addr)); return 0xffff; } __no_kmsan_checks unsigned int ioread16be(const void __iomem *addr) { IO_COND(addr, return pio_read16be(port), return mmio_read16be(addr)); return 0xffff; } __no_kmsan_checks unsigned int ioread32(const void __iomem *addr) { IO_COND(addr, return inl(port), return readl(addr)); return 0xffffffff; } __no_kmsan_checks unsigned int ioread32be(const void __iomem *addr) { IO_COND(addr, return pio_read32be(port), return mmio_read32be(addr)); return 0xffffffff; } EXPORT_SYMBOL(ioread8); EXPORT_SYMBOL(ioread16); EXPORT_SYMBOL(ioread16be); EXPORT_SYMBOL(ioread32); EXPORT_SYMBOL(ioread32be); #ifdef CONFIG_64BIT static u64 pio_read64_lo_hi(unsigned long port) { u64 lo, hi; lo = inl(port); hi = inl(port + sizeof(u32)); return lo | (hi << 32); } static u64 pio_read64_hi_lo(unsigned long port) { u64 lo, hi; hi = inl(port + sizeof(u32)); lo = inl(port); return lo | (hi << 32); } static u64 pio_read64be_lo_hi(unsigned long port) { u64 lo, hi; lo = pio_read32be(port + sizeof(u32)); hi = pio_read32be(port); return lo | (hi << 32); } static u64 pio_read64be_hi_lo(unsigned long port) { u64 lo, hi; hi = pio_read32be(port); lo = pio_read32be(port + sizeof(u32)); return lo | (hi << 32); } __no_kmsan_checks u64 __ioread64_lo_hi(const void __iomem *addr) { IO_COND(addr, return pio_read64_lo_hi(port), return readq(addr)); return 0xffffffffffffffffULL; } __no_kmsan_checks u64 __ioread64_hi_lo(const void __iomem *addr) { IO_COND(addr, return pio_read64_hi_lo(port), return readq(addr)); return 0xffffffffffffffffULL; } __no_kmsan_checks u64 __ioread64be_lo_hi(const void __iomem *addr) { IO_COND(addr, return pio_read64be_lo_hi(port), return mmio_read64be(addr)); return 0xffffffffffffffffULL; } __no_kmsan_checks u64 __ioread64be_hi_lo(const void __iomem *addr) { IO_COND(addr, return pio_read64be_hi_lo(port), return mmio_read64be(addr)); return 0xffffffffffffffffULL; } EXPORT_SYMBOL(__ioread64_lo_hi); EXPORT_SYMBOL(__ioread64_hi_lo); EXPORT_SYMBOL(__ioread64be_lo_hi); EXPORT_SYMBOL(__ioread64be_hi_lo); #endif /* CONFIG_64BIT */ #ifndef pio_write16be #define pio_write16be(val,port) outw(swab16(val),port) #define pio_write32be(val,port) outl(swab32(val),port) #endif #ifndef mmio_write16be #define mmio_write16be(val,port) writew(swab16(val),port) #define mmio_write32be(val,port) writel(swab32(val),port) #define mmio_write64be(val,port) writeq(swab64(val),port) #endif void iowrite8(u8 val, void __iomem *addr) { /* Make sure uninitialized memory isn't copied to devices. */ kmsan_check_memory(&val, sizeof(val)); IO_COND(addr, outb(val,port), writeb(val, addr)); } void iowrite16(u16 val, void __iomem *addr) { /* Make sure uninitialized memory isn't copied to devices. */ kmsan_check_memory(&val, sizeof(val)); IO_COND(addr, outw(val,port), writew(val, addr)); } void iowrite16be(u16 val, void __iomem *addr) { /* Make sure uninitialized memory isn't copied to devices. */ kmsan_check_memory(&val, sizeof(val)); IO_COND(addr, pio_write16be(val,port), mmio_write16be(val, addr)); } void iowrite32(u32 val, void __iomem *addr) { /* Make sure uninitialized memory isn't copied to devices. */ kmsan_check_memory(&val, sizeof(val)); IO_COND(addr, outl(val,port), writel(val, addr)); } void iowrite32be(u32 val, void __iomem *addr) { /* Make sure uninitialized memory isn't copied to devices. */ kmsan_check_memory(&val, sizeof(val)); IO_COND(addr, pio_write32be(val,port), mmio_write32be(val, addr)); } EXPORT_SYMBOL(iowrite8); EXPORT_SYMBOL(iowrite16); EXPORT_SYMBOL(iowrite16be); EXPORT_SYMBOL(iowrite32); EXPORT_SYMBOL(iowrite32be); #ifdef CONFIG_64BIT static void pio_write64_lo_hi(u64 val, unsigned long port) { outl(val, port); outl(val >> 32, port + sizeof(u32)); } static void pio_write64_hi_lo(u64 val, unsigned long port) { outl(val >> 32, port + sizeof(u32)); outl(val, port); } static void pio_write64be_lo_hi(u64 val, unsigned long port) { pio_write32be(val, port + sizeof(u32)); pio_write32be(val >> 32, port); } static void pio_write64be_hi_lo(u64 val, unsigned long port) { pio_write32be(val >> 32, port); pio_write32be(val, port + sizeof(u32)); } void __iowrite64_lo_hi(u64 val, void __iomem *addr) { /* Make sure uninitialized memory isn't copied to devices. */ kmsan_check_memory(&val, sizeof(val)); IO_COND(addr, pio_write64_lo_hi(val, port), writeq(val, addr)); } void __iowrite64_hi_lo(u64 val, void __iomem *addr) { /* Make sure uninitialized memory isn't copied to devices. */ kmsan_check_memory(&val, sizeof(val)); IO_COND(addr, pio_write64_hi_lo(val, port), writeq(val, addr)); } void __iowrite64be_lo_hi(u64 val, void __iomem *addr) { /* Make sure uninitialized memory isn't copied to devices. */ kmsan_check_memory(&val, sizeof(val)); IO_COND(addr, pio_write64be_lo_hi(val, port), mmio_write64be(val, addr)); } void __iowrite64be_hi_lo(u64 val, void __iomem *addr) { /* Make sure uninitialized memory isn't copied to devices. */ kmsan_check_memory(&val, sizeof(val)); IO_COND(addr, pio_write64be_hi_lo(val, port), mmio_write64be(val, addr)); } EXPORT_SYMBOL(__iowrite64_lo_hi); EXPORT_SYMBOL(__iowrite64_hi_lo); EXPORT_SYMBOL(__iowrite64be_lo_hi); EXPORT_SYMBOL(__iowrite64be_hi_lo); #endif /* CONFIG_64BIT */ /* * These are the "repeat MMIO read/write" functions. * Note the "__raw" accesses, since we don't want to * convert to CPU byte order. We write in "IO byte * order" (we also don't have IO barriers). */ #ifndef mmio_insb static inline void mmio_insb(const void __iomem *addr, u8 *dst, int count) { while (--count >= 0) { u8 data = __raw_readb(addr); *dst = data; dst++; } } static inline void mmio_insw(const void __iomem *addr, u16 *dst, int count) { while (--count >= 0) { u16 data = __raw_readw(addr); *dst = data; dst++; } } static inline void mmio_insl(const void __iomem *addr, u32 *dst, int count) { while (--count >= 0) { u32 data = __raw_readl(addr); *dst = data; dst++; } } #endif #ifndef mmio_outsb static inline void mmio_outsb(void __iomem *addr, const u8 *src, int count) { while (--count >= 0) { __raw_writeb(*src, addr); src++; } } static inline void mmio_outsw(void __iomem *addr, const u16 *src, int count) { while (--count >= 0) { __raw_writew(*src, addr); src++; } } static inline void mmio_outsl(void __iomem *addr, const u32 *src, int count) { while (--count >= 0) { __raw_writel(*src, addr); src++; } } #endif void ioread8_rep(const void __iomem *addr, void *dst, unsigned long count) { IO_COND(addr, insb(port,dst,count), mmio_insb(addr, dst, count)); /* KMSAN must treat values read from devices as initialized. */ kmsan_unpoison_memory(dst, count); } void ioread16_rep(const void __iomem *addr, void *dst, unsigned long count) { IO_COND(addr, insw(port,dst,count), mmio_insw(addr, dst, count)); /* KMSAN must treat values read from devices as initialized. */ kmsan_unpoison_memory(dst, count * 2); } void ioread32_rep(const void __iomem *addr, void *dst, unsigned long count) { IO_COND(addr, insl(port,dst,count), mmio_insl(addr, dst, count)); /* KMSAN must treat values read from devices as initialized. */ kmsan_unpoison_memory(dst, count * 4); } EXPORT_SYMBOL(ioread8_rep); EXPORT_SYMBOL(ioread16_rep); EXPORT_SYMBOL(ioread32_rep); void iowrite8_rep(void __iomem *addr, const void *src, unsigned long count) { /* Make sure uninitialized memory isn't copied to devices. */ kmsan_check_memory(src, count); IO_COND(addr, outsb(port, src, count), mmio_outsb(addr, src, count)); } void iowrite16_rep(void __iomem *addr, const void *src, unsigned long count) { /* Make sure uninitialized memory isn't copied to devices. */ kmsan_check_memory(src, count * 2); IO_COND(addr, outsw(port, src, count), mmio_outsw(addr, src, count)); } void iowrite32_rep(void __iomem *addr, const void *src, unsigned long count) { /* Make sure uninitialized memory isn't copied to devices. */ kmsan_check_memory(src, count * 4); IO_COND(addr, outsl(port, src,count), mmio_outsl(addr, src, count)); } EXPORT_SYMBOL(iowrite8_rep); EXPORT_SYMBOL(iowrite16_rep); EXPORT_SYMBOL(iowrite32_rep); #ifdef CONFIG_HAS_IOPORT_MAP /* Create a virtual mapping cookie for an IO port range */ void __iomem *ioport_map(unsigned long port, unsigned int nr) { if (port > PIO_MASK) return NULL; return (void __iomem *) (unsigned long) (port + PIO_OFFSET); } void ioport_unmap(void __iomem *addr) { /* Nothing to do */ } EXPORT_SYMBOL(ioport_map); EXPORT_SYMBOL(ioport_unmap); #endif /* CONFIG_HAS_IOPORT_MAP */ #ifdef CONFIG_PCI /* Hide the details if this is a MMIO or PIO address space and just do what * you expect in the correct way. */ void pci_iounmap(struct pci_dev *dev, void __iomem * addr) { IO_COND(addr, /* nothing */, iounmap(addr)); } EXPORT_SYMBOL(pci_iounmap); #endif /* CONFIG_PCI */
1302 1298 1116 1181 149 18 41 222 15 463 544 546 40 515 334 122 245 7 9 370 626 624 626 561 626 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NET_NEIGHBOUR_H #define _NET_NEIGHBOUR_H #include <linux/neighbour.h> /* * Generic neighbour manipulation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * * Changes: * * Harald Welte: <laforge@gnumonks.org> * - Add neighbour cache statistics like rtstat */ #include <linux/atomic.h> #include <linux/refcount.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/rcupdate.h> #include <linux/seq_file.h> #include <linux/bitmap.h> #include <linux/err.h> #include <linux/sysctl.h> #include <linux/workqueue.h> #include <net/rtnetlink.h> #include <net/neighbour_tables.h> /* * NUD stands for "neighbor unreachability detection" */ #define NUD_IN_TIMER (NUD_INCOMPLETE|NUD_REACHABLE|NUD_DELAY|NUD_PROBE) #define NUD_VALID (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE|NUD_PROBE|NUD_STALE|NUD_DELAY) #define NUD_CONNECTED (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE) struct neighbour; enum { NEIGH_VAR_MCAST_PROBES, NEIGH_VAR_UCAST_PROBES, NEIGH_VAR_APP_PROBES, NEIGH_VAR_MCAST_REPROBES, NEIGH_VAR_RETRANS_TIME, NEIGH_VAR_BASE_REACHABLE_TIME, NEIGH_VAR_DELAY_PROBE_TIME, NEIGH_VAR_INTERVAL_PROBE_TIME_MS, NEIGH_VAR_GC_STALETIME, NEIGH_VAR_QUEUE_LEN_BYTES, NEIGH_VAR_PROXY_QLEN, NEIGH_VAR_ANYCAST_DELAY, NEIGH_VAR_PROXY_DELAY, NEIGH_VAR_LOCKTIME, #define NEIGH_VAR_DATA_MAX (NEIGH_VAR_LOCKTIME + 1) /* Following are used as a second way to access one of the above */ NEIGH_VAR_QUEUE_LEN, /* same data as NEIGH_VAR_QUEUE_LEN_BYTES */ NEIGH_VAR_RETRANS_TIME_MS, /* same data as NEIGH_VAR_RETRANS_TIME */ NEIGH_VAR_BASE_REACHABLE_TIME_MS, /* same data as NEIGH_VAR_BASE_REACHABLE_TIME */ /* Following are used by "default" only */ NEIGH_VAR_GC_INTERVAL, NEIGH_VAR_GC_THRESH1, NEIGH_VAR_GC_THRESH2, NEIGH_VAR_GC_THRESH3, NEIGH_VAR_MAX }; struct neigh_parms { possible_net_t net; struct net_device *dev; netdevice_tracker dev_tracker; struct list_head list; int (*neigh_setup)(struct neighbour *); struct neigh_table *tbl; void *sysctl_table; int dead; refcount_t refcnt; struct rcu_head rcu_head; int reachable_time; u32 qlen; int data[NEIGH_VAR_DATA_MAX]; DECLARE_BITMAP(data_state, NEIGH_VAR_DATA_MAX); }; static inline void neigh_var_set(struct neigh_parms *p, int index, int val) { set_bit(index, p->data_state); WRITE_ONCE(p->data[index], val); } #define __NEIGH_VAR(p, attr) ((p)->data[NEIGH_VAR_ ## attr]) #define NEIGH_VAR(p, attr) READ_ONCE(__NEIGH_VAR(p, attr)) #define NEIGH_VAR_PTR(p, attr) (&(__NEIGH_VAR(p, attr))) /* In ndo_neigh_setup, NEIGH_VAR_INIT should be used. * In other cases, NEIGH_VAR_SET should be used. */ #define NEIGH_VAR_INIT(p, attr, val) (__NEIGH_VAR(p, attr) = val) #define NEIGH_VAR_SET(p, attr, val) neigh_var_set(p, NEIGH_VAR_ ## attr, val) static inline void neigh_parms_data_state_setall(struct neigh_parms *p) { bitmap_fill(p->data_state, NEIGH_VAR_DATA_MAX); } static inline void neigh_parms_data_state_cleanall(struct neigh_parms *p) { bitmap_zero(p->data_state, NEIGH_VAR_DATA_MAX); } struct neigh_statistics { unsigned long allocs; /* number of allocated neighs */ unsigned long destroys; /* number of destroyed neighs */ unsigned long hash_grows; /* number of hash resizes */ unsigned long res_failed; /* number of failed resolutions */ unsigned long lookups; /* number of lookups */ unsigned long hits; /* number of hits (among lookups) */ unsigned long rcv_probes_mcast; /* number of received mcast ipv6 */ unsigned long rcv_probes_ucast; /* number of received ucast ipv6 */ unsigned long periodic_gc_runs; /* number of periodic GC runs */ unsigned long forced_gc_runs; /* number of forced GC runs */ unsigned long unres_discards; /* number of unresolved drops */ unsigned long table_fulls; /* times even gc couldn't help */ }; #define NEIGH_CACHE_STAT_INC(tbl, field) this_cpu_inc((tbl)->stats->field) struct neighbour { struct hlist_node hash; struct hlist_node dev_list; struct neigh_table *tbl; struct neigh_parms *parms; unsigned long confirmed; unsigned long updated; rwlock_t lock; refcount_t refcnt; unsigned int arp_queue_len_bytes; struct sk_buff_head arp_queue; struct timer_list timer; unsigned long used; atomic_t probes; u8 nud_state; u8 type; u8 dead; u8 protocol; u32 flags; seqlock_t ha_lock; unsigned char ha[ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))] __aligned(8); struct hh_cache hh; int (*output)(struct neighbour *, struct sk_buff *); const struct neigh_ops *ops; struct list_head gc_list; struct list_head managed_list; struct rcu_head rcu; struct net_device *dev; netdevice_tracker dev_tracker; u8 primary_key[]; } __randomize_layout; struct neigh_ops { int family; void (*solicit)(struct neighbour *, struct sk_buff *); void (*error_report)(struct neighbour *, struct sk_buff *); int (*output)(struct neighbour *, struct sk_buff *); int (*connected_output)(struct neighbour *, struct sk_buff *); }; struct pneigh_entry { struct pneigh_entry __rcu *next; possible_net_t net; struct net_device *dev; netdevice_tracker dev_tracker; union { struct list_head free_node; struct rcu_head rcu; }; u32 flags; u8 protocol; bool permanent; u32 key[]; }; /* * neighbour table manipulation */ #define NEIGH_NUM_HASH_RND 4 struct neigh_hash_table { struct hlist_head *hash_heads; unsigned int hash_shift; __u32 hash_rnd[NEIGH_NUM_HASH_RND]; struct rcu_head rcu; }; struct neigh_table { int family; unsigned int entry_size; unsigned int key_len; __be16 protocol; __u32 (*hash)(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); bool (*key_eq)(const struct neighbour *, const void *pkey); int (*constructor)(struct neighbour *); int (*pconstructor)(struct pneigh_entry *); void (*pdestructor)(struct pneigh_entry *); void (*proxy_redo)(struct sk_buff *skb); int (*is_multicast)(const void *pkey); bool (*allow_add)(const struct net_device *dev, struct netlink_ext_ack *extack); char *id; struct neigh_parms parms; struct list_head parms_list; int gc_interval; int gc_thresh1; int gc_thresh2; int gc_thresh3; unsigned long last_flush; struct delayed_work gc_work; struct delayed_work managed_work; struct timer_list proxy_timer; struct sk_buff_head proxy_queue; atomic_t entries; atomic_t gc_entries; struct list_head gc_list; struct list_head managed_list; spinlock_t lock; unsigned long last_rand; struct neigh_statistics __percpu *stats; struct neigh_hash_table __rcu *nht; struct mutex phash_lock; struct pneigh_entry __rcu **phash_buckets; }; static inline int neigh_parms_family(struct neigh_parms *p) { return p->tbl->family; } #define NEIGH_PRIV_ALIGN sizeof(long long) #define NEIGH_ENTRY_SIZE(size) ALIGN((size), NEIGH_PRIV_ALIGN) static inline void *neighbour_priv(const struct neighbour *n) { return (char *)n + n->tbl->entry_size; } /* flags for neigh_update() */ #define NEIGH_UPDATE_F_OVERRIDE BIT(0) #define NEIGH_UPDATE_F_WEAK_OVERRIDE BIT(1) #define NEIGH_UPDATE_F_OVERRIDE_ISROUTER BIT(2) #define NEIGH_UPDATE_F_USE BIT(3) #define NEIGH_UPDATE_F_MANAGED BIT(4) #define NEIGH_UPDATE_F_EXT_LEARNED BIT(5) #define NEIGH_UPDATE_F_ISROUTER BIT(6) #define NEIGH_UPDATE_F_ADMIN BIT(7) #define NEIGH_UPDATE_F_EXT_VALIDATED BIT(8) /* In-kernel representation for NDA_FLAGS_EXT flags: */ #define NTF_OLD_MASK 0xff #define NTF_EXT_SHIFT 8 #define NTF_EXT_MASK (NTF_EXT_MANAGED | NTF_EXT_EXT_VALIDATED) #define NTF_MANAGED (NTF_EXT_MANAGED << NTF_EXT_SHIFT) #define NTF_EXT_VALIDATED (NTF_EXT_EXT_VALIDATED << NTF_EXT_SHIFT) extern const struct nla_policy nda_policy[]; #define neigh_for_each_in_bucket(pos, head) hlist_for_each_entry(pos, head, hash) #define neigh_for_each_in_bucket_rcu(pos, head) \ hlist_for_each_entry_rcu(pos, head, hash) #define neigh_for_each_in_bucket_safe(pos, tmp, head) \ hlist_for_each_entry_safe(pos, tmp, head, hash) static inline bool neigh_key_eq32(const struct neighbour *n, const void *pkey) { return *(const u32 *)n->primary_key == *(const u32 *)pkey; } static inline bool neigh_key_eq128(const struct neighbour *n, const void *pkey) { const u32 *n32 = (const u32 *)n->primary_key; const u32 *p32 = pkey; return ((n32[0] ^ p32[0]) | (n32[1] ^ p32[1]) | (n32[2] ^ p32[2]) | (n32[3] ^ p32[3])) == 0; } static inline struct neighbour *___neigh_lookup_noref( struct neigh_table *tbl, bool (*key_eq)(const struct neighbour *n, const void *pkey), __u32 (*hash)(const void *pkey, const struct net_device *dev, __u32 *hash_rnd), const void *pkey, struct net_device *dev) { struct neigh_hash_table *nht = rcu_dereference(tbl->nht); struct neighbour *n; u32 hash_val; hash_val = hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift); neigh_for_each_in_bucket_rcu(n, &nht->hash_heads[hash_val]) if (n->dev == dev && key_eq(n, pkey)) return n; return NULL; } static inline struct neighbour *__neigh_lookup_noref(struct neigh_table *tbl, const void *pkey, struct net_device *dev) { return ___neigh_lookup_noref(tbl, tbl->key_eq, tbl->hash, pkey, dev); } static inline void neigh_confirm(struct neighbour *n) { if (n) { unsigned long now = jiffies; /* avoid dirtying neighbour */ if (READ_ONCE(n->confirmed) != now) WRITE_ONCE(n->confirmed, now); } } void neigh_table_init(int index, struct neigh_table *tbl); int neigh_table_clear(int index, struct neigh_table *tbl); struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev); struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, bool want_ref); static inline struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev) { return __neigh_create(tbl, pkey, dev, true); } void neigh_destroy(struct neighbour *neigh); int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, const bool immediate_ok); int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid); void __neigh_set_probe_once(struct neighbour *neigh); bool neigh_remove_one(struct neighbour *ndel); void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev); int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev); int neigh_carrier_down(struct neigh_table *tbl, struct net_device *dev); int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb); int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb); int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb); struct neighbour *neigh_event_ns(struct neigh_table *tbl, u8 *lladdr, void *saddr, struct net_device *dev); struct neigh_parms *neigh_parms_alloc(struct net_device *dev, struct neigh_table *tbl); void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms); static inline struct net *neigh_parms_net(const struct neigh_parms *parms) { return read_pnet(&parms->net); } unsigned long neigh_rand_reach_time(unsigned long base); static inline void neigh_set_reach_time(struct neigh_parms *p) { unsigned long base = NEIGH_VAR(p, BASE_REACHABLE_TIME); WRITE_ONCE(p->reachable_time, neigh_rand_reach_time(base)); } void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb); struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); int pneigh_create(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev, u32 flags, u8 protocol, bool permanent); int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); static inline struct net *pneigh_net(const struct pneigh_entry *pneigh) { return read_pnet(&pneigh->net); } void neigh_app_ns(struct neighbour *n); void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie); void __neigh_for_each_release(struct neigh_table *tbl, int (*cb)(struct neighbour *)); int neigh_xmit(int fam, struct net_device *, const void *, struct sk_buff *); struct neigh_seq_state { struct seq_net_private p; struct neigh_table *tbl; struct neigh_hash_table *nht; void *(*neigh_sub_iter)(struct neigh_seq_state *state, struct neighbour *n, loff_t *pos); unsigned int bucket; unsigned int flags; #define NEIGH_SEQ_NEIGH_ONLY 0x00000001 #define NEIGH_SEQ_IS_PNEIGH 0x00000002 #define NEIGH_SEQ_SKIP_NOARP 0x00000004 }; void *neigh_seq_start(struct seq_file *, loff_t *, struct neigh_table *, unsigned int); void *neigh_seq_next(struct seq_file *, void *, loff_t *); void neigh_seq_stop(struct seq_file *, void *); int neigh_proc_dointvec(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); int neigh_proc_dointvec_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); int neigh_proc_dointvec_ms_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, proc_handler *proc_handler); void neigh_sysctl_unregister(struct neigh_parms *p); static inline void __neigh_parms_put(struct neigh_parms *parms) { refcount_dec(&parms->refcnt); } static inline struct neigh_parms *neigh_parms_clone(struct neigh_parms *parms) { refcount_inc(&parms->refcnt); return parms; } /* * Neighbour references */ static inline void neigh_release(struct neighbour *neigh) { if (refcount_dec_and_test(&neigh->refcnt)) neigh_destroy(neigh); } static inline struct neighbour * neigh_clone(struct neighbour *neigh) { if (neigh) refcount_inc(&neigh->refcnt); return neigh; } #define neigh_hold(n) refcount_inc(&(n)->refcnt) static __always_inline int neigh_event_send_probe(struct neighbour *neigh, struct sk_buff *skb, const bool immediate_ok) { unsigned long now = jiffies; if (READ_ONCE(neigh->used) != now) WRITE_ONCE(neigh->used, now); if (!(READ_ONCE(neigh->nud_state) & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))) return __neigh_event_send(neigh, skb, immediate_ok); return 0; } static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) { return neigh_event_send_probe(neigh, skb, true); } #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb) { unsigned int seq, hh_alen; do { seq = read_seqbegin(&hh->hh_lock); hh_alen = HH_DATA_ALIGN(ETH_HLEN); memcpy(skb->data - hh_alen, hh->hh_data, ETH_ALEN + hh_alen - ETH_HLEN); } while (read_seqretry(&hh->hh_lock, seq)); return 0; } #endif static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb) { unsigned int hh_alen = 0; unsigned int seq; unsigned int hh_len; do { seq = read_seqbegin(&hh->hh_lock); hh_len = READ_ONCE(hh->hh_len); if (likely(hh_len <= HH_DATA_MOD)) { hh_alen = HH_DATA_MOD; /* skb_push() would proceed silently if we have room for * the unaligned size but not for the aligned size: * check headroom explicitly. */ if (likely(skb_headroom(skb) >= HH_DATA_MOD)) { /* this is inlined by gcc */ memcpy(skb->data - HH_DATA_MOD, hh->hh_data, HH_DATA_MOD); } } else { hh_alen = HH_DATA_ALIGN(hh_len); if (likely(skb_headroom(skb) >= hh_alen)) { memcpy(skb->data - hh_alen, hh->hh_data, hh_alen); } } } while (read_seqretry(&hh->hh_lock, seq)); if (WARN_ON_ONCE(skb_headroom(skb) < hh_alen)) { kfree_skb(skb); return NET_XMIT_DROP; } __skb_push(skb, hh_len); return dev_queue_xmit(skb); } static inline int neigh_output(struct neighbour *n, struct sk_buff *skb, bool skip_cache) { const struct hh_cache *hh = &n->hh; /* n->nud_state and hh->hh_len could be changed under us. * neigh_hh_output() is taking care of the race later. */ if (!skip_cache && (READ_ONCE(n->nud_state) & NUD_CONNECTED) && READ_ONCE(hh->hh_len)) return neigh_hh_output(hh, skb); return READ_ONCE(n->output)(n, skb); } static inline struct neighbour * __neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev, int creat) { struct neighbour *n = neigh_lookup(tbl, pkey, dev); if (n || !creat) return n; n = neigh_create(tbl, pkey, dev); return IS_ERR(n) ? NULL : n; } static inline struct neighbour * __neigh_lookup_errno(struct neigh_table *tbl, const void *pkey, struct net_device *dev) { struct neighbour *n = neigh_lookup(tbl, pkey, dev); if (n) return n; return neigh_create(tbl, pkey, dev); } struct neighbour_cb { unsigned long sched_next; unsigned int flags; }; #define LOCALLY_ENQUEUED 0x1 #define NEIGH_CB(skb) ((struct neighbour_cb *)(skb)->cb) static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n, const struct net_device *dev) { unsigned int seq; do { seq = read_seqbegin(&n->ha_lock); memcpy(dst, n->ha, dev->addr_len); } while (read_seqretry(&n->ha_lock, seq)); } static inline void neigh_update_is_router(struct neighbour *neigh, u32 flags, int *notify) { u8 ndm_flags = 0; ndm_flags |= (flags & NEIGH_UPDATE_F_ISROUTER) ? NTF_ROUTER : 0; if ((neigh->flags ^ ndm_flags) & NTF_ROUTER) { if (ndm_flags & NTF_ROUTER) neigh->flags |= NTF_ROUTER; else neigh->flags &= ~NTF_ROUTER; *notify = 1; } } #endif
7 9 2 7 1 1 27595 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2012-2014 Andy Lutomirski <luto@amacapital.net> * * Based on the original implementation which is: * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE * Copyright 2003 Andi Kleen, SuSE Labs. * * Parts of the original code have been moved to arch/x86/vdso/vma.c * * This file implements vsyscall emulation. vsyscalls are a legacy ABI: * Userspace can request certain kernel services by calling fixed * addresses. This concept is problematic: * * - It interferes with ASLR. * - It's awkward to write code that lives in kernel addresses but is * callable by userspace at fixed addresses. * - The whole concept is impossible for 32-bit compat userspace. * - UML cannot easily virtualize a vsyscall. * * As of mid-2014, I believe that there is no new userspace code that * will use a vsyscall if the vDSO is present. I hope that there will * soon be no new userspace code that will ever use a vsyscall. * * The code in this file emulates vsyscalls when notified of a page * fault to a vsyscall address. */ #include <linux/kernel.h> #include <linux/timer.h> #include <linux/sched/signal.h> #include <linux/mm_types.h> #include <linux/syscalls.h> #include <linux/ratelimit.h> #include <asm/vsyscall.h> #include <asm/unistd.h> #include <asm/fixmap.h> #include <asm/traps.h> #define CREATE_TRACE_POINTS #include "vsyscall_trace.h" static enum { EMULATE, XONLY, NONE } vsyscall_mode __ro_after_init = #ifdef CONFIG_LEGACY_VSYSCALL_NONE NONE; #elif defined(CONFIG_LEGACY_VSYSCALL_XONLY) XONLY; #else #error VSYSCALL config is broken #endif static int __init vsyscall_setup(char *str) { if (str) { if (!strcmp("emulate", str)) vsyscall_mode = EMULATE; else if (!strcmp("xonly", str)) vsyscall_mode = XONLY; else if (!strcmp("none", str)) vsyscall_mode = NONE; else return -EINVAL; return 0; } return -EINVAL; } early_param("vsyscall", vsyscall_setup); static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, const char *message) { if (!show_unhandled_signals) return; printk_ratelimited("%s%s[%d] %s ip:%lx cs:%x sp:%lx ax:%lx si:%lx di:%lx\n", level, current->comm, task_pid_nr(current), message, regs->ip, regs->cs, regs->sp, regs->ax, regs->si, regs->di); } static int addr_to_vsyscall_nr(unsigned long addr) { int nr; if ((addr & ~0xC00UL) != VSYSCALL_ADDR) return -EINVAL; nr = (addr & 0xC00UL) >> 10; if (nr >= 3) return -EINVAL; return nr; } static bool write_ok_or_segv(unsigned long ptr, size_t size) { if (!access_ok((void __user *)ptr, size)) { struct thread_struct *thread = &current->thread; thread->error_code = X86_PF_USER | X86_PF_WRITE; thread->cr2 = ptr; thread->trap_nr = X86_TRAP_PF; force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)ptr); return false; } else { return true; } } bool emulate_vsyscall(unsigned long error_code, struct pt_regs *regs, unsigned long address) { unsigned long caller; int vsyscall_nr, syscall_nr, tmp; long ret; unsigned long orig_dx; /* Write faults or kernel-privilege faults never get fixed up. */ if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER) return false; /* * Assume that faults at regs->ip are because of an * instruction fetch. Return early and avoid * emulation for faults during data accesses: */ if (address != regs->ip) { /* Failed vsyscall read */ if (vsyscall_mode == EMULATE) return false; /* * User code tried and failed to read the vsyscall page. */ warn_bad_vsyscall(KERN_INFO, regs, "vsyscall read attempt denied -- look up the vsyscall kernel parameter if you need a workaround"); return false; } /* * X86_PF_INSTR is only set when NX is supported. When * available, use it to double-check that the emulation code * is only being used for instruction fetches: */ if (cpu_feature_enabled(X86_FEATURE_NX)) WARN_ON_ONCE(!(error_code & X86_PF_INSTR)); /* * No point in checking CS -- the only way to get here is a user mode * trap to a high address, which means that we're in 64-bit user code. */ if (vsyscall_mode == NONE) { warn_bad_vsyscall(KERN_INFO, regs, "vsyscall attempted with vsyscall=none"); return false; } vsyscall_nr = addr_to_vsyscall_nr(address); trace_emulate_vsyscall(vsyscall_nr); if (vsyscall_nr < 0) { warn_bad_vsyscall(KERN_WARNING, regs, "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround"); goto sigsegv; } if (get_user(caller, (unsigned long __user *)regs->sp) != 0) { warn_bad_vsyscall(KERN_WARNING, regs, "vsyscall with bad stack (exploit attempt?)"); goto sigsegv; } /* * Check for access_ok violations and find the syscall nr. * * NULL is a valid user pointer (in the access_ok sense) on 32-bit and * 64-bit, so we don't need to special-case it here. For all the * vsyscalls, NULL means "don't write anything" not "write it at * address 0". */ switch (vsyscall_nr) { case 0: if (!write_ok_or_segv(regs->di, sizeof(struct __kernel_old_timeval)) || !write_ok_or_segv(regs->si, sizeof(struct timezone))) { ret = -EFAULT; goto check_fault; } syscall_nr = __NR_gettimeofday; break; case 1: if (!write_ok_or_segv(regs->di, sizeof(__kernel_old_time_t))) { ret = -EFAULT; goto check_fault; } syscall_nr = __NR_time; break; case 2: if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || !write_ok_or_segv(regs->si, sizeof(unsigned))) { ret = -EFAULT; goto check_fault; } syscall_nr = __NR_getcpu; break; } /* * Handle seccomp. regs->ip must be the original value. * See seccomp_send_sigsys and Documentation/userspace-api/seccomp_filter.rst. * * We could optimize the seccomp disabled case, but performance * here doesn't matter. */ regs->orig_ax = syscall_nr; regs->ax = -ENOSYS; tmp = secure_computing(); if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { warn_bad_vsyscall(KERN_DEBUG, regs, "seccomp tried to change syscall nr or ip"); force_exit_sig(SIGSYS); return true; } regs->orig_ax = -1; if (tmp) goto do_ret; /* skip requested */ /* * With a real vsyscall, page faults cause SIGSEGV. */ ret = -EFAULT; switch (vsyscall_nr) { case 0: /* this decodes regs->di and regs->si on its own */ ret = __x64_sys_gettimeofday(regs); break; case 1: /* this decodes regs->di on its own */ ret = __x64_sys_time(regs); break; case 2: /* while we could clobber regs->dx, we didn't in the past... */ orig_dx = regs->dx; regs->dx = 0; /* this decodes regs->di, regs->si and regs->dx on its own */ ret = __x64_sys_getcpu(regs); regs->dx = orig_dx; break; } check_fault: if (ret == -EFAULT) { /* Bad news -- userspace fed a bad pointer to a vsyscall. */ warn_bad_vsyscall(KERN_INFO, regs, "vsyscall fault (exploit attempt?)"); goto sigsegv; } regs->ax = ret; do_ret: /* Emulate a ret instruction. */ regs->ip = caller; regs->sp += 8; return true; sigsegv: force_sig(SIGSEGV); return true; } /* * A pseudo VMA to allow ptrace access for the vsyscall page. This only * covers the 64bit vsyscall page now. 32bit has a real VMA now and does * not need special handling anymore: */ static const char *gate_vma_name(struct vm_area_struct *vma) { return "[vsyscall]"; } static const struct vm_operations_struct gate_vma_ops = { .name = gate_vma_name, }; static struct vm_area_struct gate_vma __ro_after_init = { .vm_start = VSYSCALL_ADDR, .vm_end = VSYSCALL_ADDR + PAGE_SIZE, .vm_page_prot = PAGE_READONLY_EXEC, .vm_flags = VM_READ | VM_EXEC, .vm_ops = &gate_vma_ops, }; struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { #ifdef CONFIG_COMPAT if (!mm || !test_bit(MM_CONTEXT_HAS_VSYSCALL, &mm->context.flags)) return NULL; #endif if (vsyscall_mode == NONE) return NULL; return &gate_vma; } int in_gate_area(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma = get_gate_vma(mm); if (!vma) return 0; return (addr >= vma->vm_start) && (addr < vma->vm_end); } /* * Use this when you have no reliable mm, typically from interrupt * context. It is less reliable than using a task's mm and may give * false positives. */ int in_gate_area_no_mm(unsigned long addr) { return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR; } /* * The VSYSCALL page is the only user-accessible page in the kernel address * range. Normally, the kernel page tables can have _PAGE_USER clear, but * the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls * are enabled. * * Some day we may create a "minimal" vsyscall mode in which we emulate * vsyscalls but leave the page not present. If so, we skip calling * this. */ void __init set_vsyscall_pgtable_user_bits(pgd_t *root) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgd = pgd_offset_pgd(root, VSYSCALL_ADDR); set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER)); p4d = p4d_offset(pgd, VSYSCALL_ADDR); set_p4d(p4d, __p4d(p4d_val(*p4d) | _PAGE_USER)); pud = pud_offset(p4d, VSYSCALL_ADDR); set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER)); pmd = pmd_offset(pud, VSYSCALL_ADDR); set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER)); } void __init map_vsyscall(void) { extern char __vsyscall_page; unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); /* * For full emulation, the page needs to exist for real. In * execute-only mode, there is no PTE at all backing the vsyscall * page. */ if (vsyscall_mode == EMULATE) { __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, PAGE_KERNEL_VVAR); set_vsyscall_pgtable_user_bits(swapper_pg_dir); } if (vsyscall_mode == XONLY) vm_flags_init(&gate_vma, VM_EXEC); BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != (unsigned long)VSYSCALL_ADDR); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/ipv6.h> #include <net/dsfield.h> #include <net/xfrm.h> #ifndef XFRM_INOUT_H #define XFRM_INOUT_H 1 static inline void xfrm4_extract_header(struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph); XFRM_MODE_SKB_CB(skb)->id = iph->id; XFRM_MODE_SKB_CB(skb)->frag_off = iph->frag_off; XFRM_MODE_SKB_CB(skb)->tos = iph->tos; XFRM_MODE_SKB_CB(skb)->ttl = iph->ttl; XFRM_MODE_SKB_CB(skb)->optlen = iph->ihl * 4 - sizeof(*iph); memset(XFRM_MODE_SKB_CB(skb)->flow_lbl, 0, sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl)); } static inline void xfrm6_extract_header(struct sk_buff *skb) { #if IS_ENABLED(CONFIG_IPV6) struct ipv6hdr *iph = ipv6_hdr(skb); XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph); XFRM_MODE_SKB_CB(skb)->id = 0; XFRM_MODE_SKB_CB(skb)->frag_off = htons(IP_DF); XFRM_MODE_SKB_CB(skb)->tos = ipv6_get_dsfield(iph); XFRM_MODE_SKB_CB(skb)->ttl = iph->hop_limit; XFRM_MODE_SKB_CB(skb)->optlen = 0; memcpy(XFRM_MODE_SKB_CB(skb)->flow_lbl, iph->flow_lbl, sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl)); #else WARN_ON_ONCE(1); #endif } static inline void xfrm6_beet_make_header(struct sk_buff *skb) { struct ipv6hdr *iph = ipv6_hdr(skb); iph->version = 6; memcpy(iph->flow_lbl, XFRM_MODE_SKB_CB(skb)->flow_lbl, sizeof(iph->flow_lbl)); iph->nexthdr = XFRM_MODE_SKB_CB(skb)->protocol; ipv6_change_dsfield(iph, 0, XFRM_MODE_SKB_CB(skb)->tos); iph->hop_limit = XFRM_MODE_SKB_CB(skb)->ttl; } static inline void xfrm4_beet_make_header(struct sk_buff *skb) { struct iphdr *iph = ip_hdr(skb); iph->ihl = 5; iph->version = 4; iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol; iph->tos = XFRM_MODE_SKB_CB(skb)->tos; iph->id = XFRM_MODE_SKB_CB(skb)->id; iph->frag_off = XFRM_MODE_SKB_CB(skb)->frag_off; iph->ttl = XFRM_MODE_SKB_CB(skb)->ttl; } #endif
73 73 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NF_SYNPROXY_SHARED_H #define _NF_SYNPROXY_SHARED_H #include <linux/module.h> #include <linux/skbuff.h> #include <net/ip6_checksum.h> #include <net/ip6_route.h> #include <net/tcp.h> #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_synproxy.h> struct synproxy_stats { unsigned int syn_received; unsigned int cookie_invalid; unsigned int cookie_valid; unsigned int cookie_retrans; unsigned int conn_reopened; }; struct synproxy_net { struct nf_conn *tmpl; struct synproxy_stats __percpu *stats; unsigned int hook_ref4; unsigned int hook_ref6; }; extern unsigned int synproxy_net_id; static inline struct synproxy_net *synproxy_pernet(struct net *net) { return net_generic(net, synproxy_net_id); } struct synproxy_options { u8 options; u8 wscale; u16 mss_option; u16 mss_encode; u32 tsval; u32 tsecr; }; struct nf_synproxy_info; bool synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, const struct tcphdr *th, struct synproxy_options *opts); void synproxy_init_timestamp_cookie(const struct nf_synproxy_info *info, struct synproxy_options *opts); void synproxy_send_client_synack(struct net *net, const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts); bool synproxy_recv_client_ack(struct net *net, const struct sk_buff *skb, const struct tcphdr *th, struct synproxy_options *opts, u32 recv_seq); struct nf_hook_state; unsigned int ipv4_synproxy_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *nhs); int nf_synproxy_ipv4_init(struct synproxy_net *snet, struct net *net); void nf_synproxy_ipv4_fini(struct synproxy_net *snet, struct net *net); #if IS_ENABLED(CONFIG_IPV6) void synproxy_send_client_synack_ipv6(struct net *net, const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts); bool synproxy_recv_client_ack_ipv6(struct net *net, const struct sk_buff *skb, const struct tcphdr *th, struct synproxy_options *opts, u32 recv_seq); unsigned int ipv6_synproxy_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *nhs); int nf_synproxy_ipv6_init(struct synproxy_net *snet, struct net *net); void nf_synproxy_ipv6_fini(struct synproxy_net *snet, struct net *net); #else static inline int nf_synproxy_ipv6_init(struct synproxy_net *snet, struct net *net) { return 0; } static inline void nf_synproxy_ipv6_fini(struct synproxy_net *snet, struct net *net) {}; #endif /* CONFIG_IPV6 */ #endif /* _NF_SYNPROXY_SHARED_H */
25 25 24 25 25 25 9 9 9 9 9 1305 1305 1303 1308 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 // SPDX-License-Identifier: GPL-2.0-only /* * mm/truncate.c - code for taking down pages from address_spaces * * Copyright (C) 2002, Linus Torvalds * * 10Sep2002 Andrew Morton * Initial version. */ #include <linux/kernel.h> #include <linux/backing-dev.h> #include <linux/dax.h> #include <linux/gfp.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/export.h> #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/pagevec.h> #include <linux/task_io_accounting_ops.h> #include <linux/shmem_fs.h> #include <linux/rmap.h> #include "internal.h" static void clear_shadow_entries(struct address_space *mapping, unsigned long start, unsigned long max) { XA_STATE(xas, &mapping->i_pages, start); struct folio *folio; /* Handled by shmem itself, or for DAX we do nothing. */ if (shmem_mapping(mapping) || dax_mapping(mapping)) return; xas_set_update(&xas, workingset_update_node); spin_lock(&mapping->host->i_lock); xas_lock_irq(&xas); /* Clear all shadow entries from start to max */ xas_for_each(&xas, folio, max) { if (xa_is_value(folio)) xas_store(&xas, NULL); } xas_unlock_irq(&xas); if (mapping_shrinkable(mapping)) inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); } /* * Unconditionally remove exceptional entries. Usually called from truncate * path. Note that the folio_batch may be altered by this function by removing * exceptional entries similar to what folio_batch_remove_exceptionals() does. * Please note that indices[] has entries in ascending order as guaranteed by * either find_get_entries() or find_lock_entries(). */ static void truncate_folio_batch_exceptionals(struct address_space *mapping, struct folio_batch *fbatch, pgoff_t *indices) { XA_STATE(xas, &mapping->i_pages, indices[0]); int nr = folio_batch_count(fbatch); struct folio *folio; int i, j; /* Handled by shmem itself */ if (shmem_mapping(mapping)) return; for (j = 0; j < nr; j++) if (xa_is_value(fbatch->folios[j])) break; if (j == nr) return; if (dax_mapping(mapping)) { for (i = j; i < nr; i++) { if (xa_is_value(fbatch->folios[i])) { /* * File systems should already have called * dax_break_layout_entry() to remove all DAX * entries while holding a lock to prevent * establishing new entries. Therefore we * shouldn't find any here. */ WARN_ON_ONCE(1); /* * Delete the mapping so truncate_pagecache() * doesn't loop forever. */ dax_delete_mapping_entry(mapping, indices[i]); } } goto out; } xas_set(&xas, indices[j]); xas_set_update(&xas, workingset_update_node); spin_lock(&mapping->host->i_lock); xas_lock_irq(&xas); xas_for_each(&xas, folio, indices[nr-1]) { if (xa_is_value(folio)) xas_store(&xas, NULL); } xas_unlock_irq(&xas); if (mapping_shrinkable(mapping)) inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); out: folio_batch_remove_exceptionals(fbatch); } /** * folio_invalidate - Invalidate part or all of a folio. * @folio: The folio which is affected. * @offset: start of the range to invalidate * @length: length of the range to invalidate * * folio_invalidate() is called when all or part of the folio has become * invalidated by a truncate operation. * * folio_invalidate() does not have to release all buffers, but it must * ensure that no dirty buffer is left outside @offset and that no I/O * is underway against any of the blocks which are outside the truncation * point. Because the caller is about to free (and possibly reuse) those * blocks on-disk. */ void folio_invalidate(struct folio *folio, size_t offset, size_t length) { const struct address_space_operations *aops = folio->mapping->a_ops; if (aops->invalidate_folio) aops->invalidate_folio(folio, offset, length); } EXPORT_SYMBOL_GPL(folio_invalidate); /* * If truncate cannot remove the fs-private metadata from the page, the page * becomes orphaned. It will be left on the LRU and may even be mapped into * user pagetables if we're racing with filemap_fault(). * * We need to bail out if page->mapping is no longer equal to the original * mapping. This happens a) when the VM reclaimed the page while we waited on * its lock, b) when a concurrent invalidate_mapping_pages got there first and * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. */ static void truncate_cleanup_folio(struct folio *folio) { if (folio_mapped(folio)) unmap_mapping_folio(folio); if (folio_needs_release(folio)) folio_invalidate(folio, 0, folio_size(folio)); /* * Some filesystems seem to re-dirty the page even after * the VM has canceled the dirty bit (eg ext3 journaling). * Hence dirty accounting check is placed after invalidation. */ folio_cancel_dirty(folio); } int truncate_inode_folio(struct address_space *mapping, struct folio *folio) { if (folio->mapping != mapping) return -EIO; truncate_cleanup_folio(folio); filemap_remove_folio(folio); return 0; } static int try_folio_split_or_unmap(struct folio *folio, struct page *split_at, unsigned long min_order) { enum ttu_flags ttu_flags = TTU_SYNC | TTU_SPLIT_HUGE_PMD | TTU_IGNORE_MLOCK; int ret; ret = try_folio_split_to_order(folio, split_at, min_order); /* * If the split fails, unmap the folio, so it will be refaulted * with PTEs to respect SIGBUS semantics. * * Make an exception for shmem/tmpfs that for long time * intentionally mapped with PMDs across i_size. */ if (ret && !shmem_mapping(folio->mapping)) { try_to_unmap(folio, ttu_flags); WARN_ON(folio_mapped(folio)); } return ret; } /* * Handle partial folios. The folio may be entirely within the * range if a split has raced with us. If not, we zero the part of the * folio that's within the [start, end] range, and then split the folio if * it's large. split_page_range() will discard pages which now lie beyond * i_size, and we rely on the caller to discard pages which lie within a * newly created hole. * * Returns false if splitting failed so the caller can avoid * discarding the entire folio which is stubbornly unsplit. */ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) { loff_t pos = folio_pos(folio); size_t size = folio_size(folio); unsigned int offset, length; struct page *split_at, *split_at2; unsigned int min_order; if (pos < start) offset = start - pos; else offset = 0; if (pos + size <= (u64)end) length = size - offset; else length = end + 1 - pos - offset; folio_wait_writeback(folio); if (length == size) { truncate_inode_folio(folio->mapping, folio); return true; } /* * We may be zeroing pages we're about to discard, but it avoids * doing a complex calculation here, and then doing the zeroing * anyway if the page split fails. */ if (!mapping_inaccessible(folio->mapping)) folio_zero_range(folio, offset, length); if (folio_needs_release(folio)) folio_invalidate(folio, offset, length); if (!folio_test_large(folio)) return true; min_order = mapping_min_folio_order(folio->mapping); split_at = folio_page(folio, PAGE_ALIGN_DOWN(offset) / PAGE_SIZE); if (!try_folio_split_or_unmap(folio, split_at, min_order)) { /* * try to split at offset + length to make sure folios within * the range can be dropped, especially to avoid memory waste * for shmem truncate */ struct folio *folio2; if (offset + length == size) goto no_split; split_at2 = folio_page(folio, PAGE_ALIGN_DOWN(offset + length) / PAGE_SIZE); folio2 = page_folio(split_at2); if (!folio_try_get(folio2)) goto no_split; if (!folio_test_large(folio2)) goto out; if (!folio_trylock(folio2)) goto out; /* make sure folio2 is large and does not change its mapping */ if (folio_test_large(folio2) && folio2->mapping == folio->mapping) try_folio_split_or_unmap(folio2, split_at2, min_order); folio_unlock(folio2); out: folio_put(folio2); no_split: return true; } if (folio_test_dirty(folio)) return false; truncate_inode_folio(folio->mapping, folio); return true; } /* * Used to get rid of pages on hardware memory corruption. */ int generic_error_remove_folio(struct address_space *mapping, struct folio *folio) { if (!mapping) return -EINVAL; /* * Only punch for normal data pages for now. * Handling other types like directories would need more auditing. */ if (!S_ISREG(mapping->host->i_mode)) return -EIO; return truncate_inode_folio(mapping, folio); } EXPORT_SYMBOL(generic_error_remove_folio); /** * mapping_evict_folio() - Remove an unused folio from the page-cache. * @mapping: The mapping this folio belongs to. * @folio: The folio to remove. * * Safely remove one folio from the page cache. * It only drops clean, unused folios. * * Context: Folio must be locked. * Return: The number of pages successfully removed. */ long mapping_evict_folio(struct address_space *mapping, struct folio *folio) { /* The page may have been truncated before it was locked */ if (!mapping) return 0; if (folio_test_dirty(folio) || folio_test_writeback(folio)) return 0; /* The refcount will be elevated if any page in the folio is mapped */ if (folio_ref_count(folio) > folio_nr_pages(folio) + folio_has_private(folio) + 1) return 0; if (!filemap_release_folio(folio, 0)) return 0; return remove_mapping(mapping, folio); } /** * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets * @mapping: mapping to truncate * @lstart: offset from which to truncate * @lend: offset to which to truncate (inclusive) * * Truncate the page cache, removing the pages that are between * specified offsets (and zeroing out partial pages * if lstart or lend + 1 is not page aligned). * * Truncate takes two passes - the first pass is nonblocking. It will not * block on page locks and it will not block on writeback. The second pass * will wait. This is to prevent as much IO as possible in the affected region. * The first pass will remove most pages, so the search cost of the second pass * is low. * * We pass down the cache-hot hint to the page freeing code. Even if the * mapping is large, it is probably the case that the final pages are the most * recently touched, and freeing happens in ascending file offset order. * * Note that since ->invalidate_folio() accepts range to invalidate * truncate_inode_pages_range is able to handle cases where lend + 1 is not * page aligned properly. */ void truncate_inode_pages_range(struct address_space *mapping, loff_t lstart, uoff_t lend) { pgoff_t start; /* inclusive */ pgoff_t end; /* exclusive */ struct folio_batch fbatch; pgoff_t indices[PAGEVEC_SIZE]; pgoff_t index; int i; struct folio *folio; bool same_folio; if (mapping_empty(mapping)) return; /* * 'start' and 'end' always covers the range of pages to be fully * truncated. Partial pages are covered with 'partial_start' at the * start of the range and 'partial_end' at the end of the range. * Note that 'end' is exclusive while 'lend' is inclusive. */ start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT; if (lend == -1) /* * lend == -1 indicates end-of-file so we have to set 'end' * to the highest possible pgoff_t and since the type is * unsigned we're using -1. */ end = -1; else end = (lend + 1) >> PAGE_SHIFT; folio_batch_init(&fbatch); index = start; while (index < end && find_lock_entries(mapping, &index, end - 1, &fbatch, indices)) { truncate_folio_batch_exceptionals(mapping, &fbatch, indices); for (i = 0; i < folio_batch_count(&fbatch); i++) truncate_cleanup_folio(fbatch.folios[i]); delete_from_page_cache_batch(mapping, &fbatch); for (i = 0; i < folio_batch_count(&fbatch); i++) folio_unlock(fbatch.folios[i]); folio_batch_release(&fbatch); cond_resched(); } same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); folio = __filemap_get_folio(mapping, lstart >> PAGE_SHIFT, FGP_LOCK, 0); if (!IS_ERR(folio)) { same_folio = lend < folio_next_pos(folio); if (!truncate_inode_partial_folio(folio, lstart, lend)) { start = folio_next_index(folio); if (same_folio) end = folio->index; } folio_unlock(folio); folio_put(folio); folio = NULL; } if (!same_folio) { folio = __filemap_get_folio(mapping, lend >> PAGE_SHIFT, FGP_LOCK, 0); if (!IS_ERR(folio)) { if (!truncate_inode_partial_folio(folio, lstart, lend)) end = folio->index; folio_unlock(folio); folio_put(folio); } } index = start; while (index < end) { cond_resched(); if (!find_get_entries(mapping, &index, end - 1, &fbatch, indices)) { /* If all gone from start onwards, we're done */ if (index == start) break; /* Otherwise restart to make sure all gone */ index = start; continue; } for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; /* We rely upon deletion not changing folio->index */ if (xa_is_value(folio)) continue; folio_lock(folio); VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio); folio_wait_writeback(folio); truncate_inode_folio(mapping, folio); folio_unlock(folio); } truncate_folio_batch_exceptionals(mapping, &fbatch, indices); folio_batch_release(&fbatch); } } EXPORT_SYMBOL(truncate_inode_pages_range); /** * truncate_inode_pages - truncate *all* the pages from an offset * @mapping: mapping to truncate * @lstart: offset from which to truncate * * Called under (and serialised by) inode->i_rwsem and * mapping->invalidate_lock. * * Note: When this function returns, there can be a page in the process of * deletion (inside __filemap_remove_folio()) in the specified range. Thus * mapping->nrpages can be non-zero when this function returns even after * truncation of the whole mapping. */ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) { truncate_inode_pages_range(mapping, lstart, (loff_t)-1); } EXPORT_SYMBOL(truncate_inode_pages); /** * truncate_inode_pages_final - truncate *all* pages before inode dies * @mapping: mapping to truncate * * Called under (and serialized by) inode->i_rwsem. * * Filesystems have to use this in the .evict_inode path to inform the * VM that this is the final truncate and the inode is going away. */ void truncate_inode_pages_final(struct address_space *mapping) { /* * Page reclaim can not participate in regular inode lifetime * management (can't call iput()) and thus can race with the * inode teardown. Tell it when the address space is exiting, * so that it does not install eviction information after the * final truncate has begun. */ mapping_set_exiting(mapping); if (!mapping_empty(mapping)) { /* * As truncation uses a lockless tree lookup, cycle * the tree lock to make sure any ongoing tree * modification that does not see AS_EXITING is * completed before starting the final truncate. */ xa_lock_irq(&mapping->i_pages); xa_unlock_irq(&mapping->i_pages); } truncate_inode_pages(mapping, 0); } EXPORT_SYMBOL(truncate_inode_pages_final); /** * mapping_try_invalidate - Invalidate all the evictable folios of one inode * @mapping: the address_space which holds the folios to invalidate * @start: the offset 'from' which to invalidate * @end: the offset 'to' which to invalidate (inclusive) * @nr_failed: How many folio invalidations failed * * This function is similar to invalidate_mapping_pages(), except that it * returns the number of folios which could not be evicted in @nr_failed. */ unsigned long mapping_try_invalidate(struct address_space *mapping, pgoff_t start, pgoff_t end, unsigned long *nr_failed) { pgoff_t indices[PAGEVEC_SIZE]; struct folio_batch fbatch; pgoff_t index = start; unsigned long ret; unsigned long count = 0; int i; folio_batch_init(&fbatch); while (find_lock_entries(mapping, &index, end, &fbatch, indices)) { bool xa_has_values = false; int nr = folio_batch_count(&fbatch); for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; /* We rely upon deletion not changing folio->index */ if (xa_is_value(folio)) { xa_has_values = true; count++; continue; } ret = mapping_evict_folio(mapping, folio); folio_unlock(folio); /* * Invalidation is a hint that the folio is no longer * of interest and try to speed up its reclaim. */ if (!ret) { deactivate_file_folio(folio); /* Likely in the lru cache of a remote CPU */ if (nr_failed) (*nr_failed)++; } count += ret; } if (xa_has_values) clear_shadow_entries(mapping, indices[0], indices[nr-1]); folio_batch_remove_exceptionals(&fbatch); folio_batch_release(&fbatch); cond_resched(); } return count; } /** * invalidate_mapping_pages - Invalidate all clean, unlocked cache of one inode * @mapping: the address_space which holds the cache to invalidate * @start: the offset 'from' which to invalidate * @end: the offset 'to' which to invalidate (inclusive) * * This function removes pages that are clean, unmapped and unlocked, * as well as shadow entries. It will not block on IO activity. * * If you want to remove all the pages of one inode, regardless of * their use and writeback state, use truncate_inode_pages(). * * Return: The number of indices that had their contents invalidated */ unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end) { return mapping_try_invalidate(mapping, start, end, NULL); } EXPORT_SYMBOL(invalidate_mapping_pages); static int folio_launder(struct address_space *mapping, struct folio *folio) { if (!folio_test_dirty(folio)) return 0; if (folio->mapping != mapping || mapping->a_ops->launder_folio == NULL) return 0; return mapping->a_ops->launder_folio(folio); } /* * This is like mapping_evict_folio(), except it ignores the folio's * refcount. We do this because invalidate_inode_pages2() needs stronger * invalidation guarantees, and cannot afford to leave folios behind because * shrink_folio_list() has a temp ref on them, or because they're transiently * sitting in the folio_add_lru() caches. */ int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio, gfp_t gfp) { int ret; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (folio_mapped(folio)) unmap_mapping_folio(folio); BUG_ON(folio_mapped(folio)); ret = folio_launder(mapping, folio); if (ret) return ret; if (folio->mapping != mapping) return -EBUSY; if (!filemap_release_folio(folio, gfp)) return -EBUSY; spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); if (folio_test_dirty(folio)) goto failed; BUG_ON(folio_has_private(folio)); __filemap_remove_folio(folio, NULL); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); filemap_free_folio(mapping, folio); return 1; failed: xa_unlock_irq(&mapping->i_pages); spin_unlock(&mapping->host->i_lock); return -EBUSY; } /** * invalidate_inode_pages2_range - remove range of pages from an address_space * @mapping: the address_space * @start: the page offset 'from' which to invalidate * @end: the page offset 'to' which to invalidate (inclusive) * * Any pages which are found to be mapped into pagetables are unmapped prior to * invalidation. * * Return: -EBUSY if any pages could not be invalidated. */ int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end) { pgoff_t indices[PAGEVEC_SIZE]; struct folio_batch fbatch; pgoff_t index; int i; int ret = 0; int ret2 = 0; int did_range_unmap = 0; if (mapping_empty(mapping)) return 0; folio_batch_init(&fbatch); index = start; while (find_get_entries(mapping, &index, end, &fbatch, indices)) { bool xa_has_values = false; int nr = folio_batch_count(&fbatch); for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; /* We rely upon deletion not changing folio->index */ if (xa_is_value(folio)) { xa_has_values = true; if (dax_mapping(mapping) && !dax_invalidate_mapping_entry_sync(mapping, indices[i])) ret = -EBUSY; continue; } if (!did_range_unmap && folio_mapped(folio)) { /* * If folio is mapped, before taking its lock, * zap the rest of the file in one hit. */ unmap_mapping_pages(mapping, indices[i], (1 + end - indices[i]), false); did_range_unmap = 1; } folio_lock(folio); if (unlikely(folio->mapping != mapping)) { folio_unlock(folio); continue; } VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio); folio_wait_writeback(folio); ret2 = folio_unmap_invalidate(mapping, folio, GFP_KERNEL); if (ret2 < 0) ret = ret2; folio_unlock(folio); } if (xa_has_values) clear_shadow_entries(mapping, indices[0], indices[nr-1]); folio_batch_remove_exceptionals(&fbatch); folio_batch_release(&fbatch); cond_resched(); } /* * For DAX we invalidate page tables after invalidating page cache. We * could invalidate page tables while invalidating each entry however * that would be expensive. And doing range unmapping before doesn't * work as we have no cheap way to find whether page cache entry didn't * get remapped later. */ if (dax_mapping(mapping)) { unmap_mapping_pages(mapping, start, end - start + 1, false); } return ret; } EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); /** * invalidate_inode_pages2 - remove all pages from an address_space * @mapping: the address_space * * Any pages which are found to be mapped into pagetables are unmapped prior to * invalidation. * * Return: -EBUSY if any pages could not be invalidated. */ int invalidate_inode_pages2(struct address_space *mapping) { return invalidate_inode_pages2_range(mapping, 0, -1); } EXPORT_SYMBOL_GPL(invalidate_inode_pages2); /** * truncate_pagecache - unmap and remove pagecache that has been truncated * @inode: inode * @newsize: new file size * * inode's new i_size must already be written before truncate_pagecache * is called. * * This function should typically be called before the filesystem * releases resources associated with the freed range (eg. deallocates * blocks). This way, pagecache will always stay logically coherent * with on-disk format, and the filesystem would not have to deal with * situations such as writepage being called for a page that has already * had its underlying blocks deallocated. */ void truncate_pagecache(struct inode *inode, loff_t newsize) { struct address_space *mapping = inode->i_mapping; loff_t holebegin = round_up(newsize, PAGE_SIZE); /* * unmap_mapping_range is called twice, first simply for * efficiency so that truncate_inode_pages does fewer * single-page unmaps. However after this first call, and * before truncate_inode_pages finishes, it is possible for * private pages to be COWed, which remain after * truncate_inode_pages finishes, hence the second * unmap_mapping_range call must be made for correctness. */ unmap_mapping_range(mapping, holebegin, 0, 1); truncate_inode_pages(mapping, newsize); unmap_mapping_range(mapping, holebegin, 0, 1); } EXPORT_SYMBOL(truncate_pagecache); /** * truncate_setsize - update inode and pagecache for a new file size * @inode: inode * @newsize: new file size * * truncate_setsize updates i_size and performs pagecache truncation (if * necessary) to @newsize. It will be typically be called from the filesystem's * setattr function when ATTR_SIZE is passed in. * * Must be called with a lock serializing truncates and writes (generally * i_rwsem but e.g. xfs uses a different lock) and before all filesystem * specific block truncation has been performed. */ void truncate_setsize(struct inode *inode, loff_t newsize) { loff_t oldsize = inode->i_size; i_size_write(inode, newsize); if (newsize > oldsize) pagecache_isize_extended(inode, oldsize, newsize); truncate_pagecache(inode, newsize); } EXPORT_SYMBOL(truncate_setsize); /** * pagecache_isize_extended - update pagecache after extension of i_size * @inode: inode for which i_size was extended * @from: original inode size * @to: new inode size * * Handle extension of inode size either caused by extending truncate or * by write starting after current i_size. We mark the page straddling * current i_size RO so that page_mkwrite() is called on the first * write access to the page. The filesystem will update its per-block * information before user writes to the page via mmap after the i_size * has been changed. * * The function must be called after i_size is updated so that page fault * coming after we unlock the folio will already see the new i_size. * The function must be called while we still hold i_rwsem - this not only * makes sure i_size is stable but also that userspace cannot observe new * i_size value before we are prepared to store mmap writes at new inode size. */ void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to) { int bsize = i_blocksize(inode); loff_t rounded_from; struct folio *folio; WARN_ON(to > inode->i_size); if (from >= to || bsize >= PAGE_SIZE) return; /* Page straddling @from will not have any hole block created? */ rounded_from = round_up(from, bsize); if (to <= rounded_from || !(rounded_from & (PAGE_SIZE - 1))) return; folio = filemap_lock_folio(inode->i_mapping, from / PAGE_SIZE); /* Folio not cached? Nothing to do */ if (IS_ERR(folio)) return; /* * See folio_clear_dirty_for_io() for details why folio_mark_dirty() * is needed. */ if (folio_mkclean(folio)) folio_mark_dirty(folio); /* * The post-eof range of the folio must be zeroed before it is exposed * to the file. Writeback normally does this, but since i_size has been * increased we handle it here. */ if (folio_test_dirty(folio)) { unsigned int offset, end; offset = from - folio_pos(folio); end = min_t(unsigned int, to - folio_pos(folio), folio_size(folio)); folio_zero_segment(folio, offset, end); } folio_unlock(folio); folio_put(folio); } EXPORT_SYMBOL(pagecache_isize_extended); /** * truncate_pagecache_range - unmap and remove pagecache that is hole-punched * @inode: inode * @lstart: offset of beginning of hole * @lend: offset of last byte of hole * * This function should typically be called before the filesystem * releases resources associated with the freed range (eg. deallocates * blocks). This way, pagecache will always stay logically coherent * with on-disk format, and the filesystem would not have to deal with * situations such as writepage being called for a page that has already * had its underlying blocks deallocated. */ void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend) { struct address_space *mapping = inode->i_mapping; loff_t unmap_start = round_up(lstart, PAGE_SIZE); loff_t unmap_end = round_down(1 + lend, PAGE_SIZE) - 1; /* * This rounding is currently just for example: unmap_mapping_range * expands its hole outwards, whereas we want it to contract the hole * inwards. However, existing callers of truncate_pagecache_range are * doing their own page rounding first. Note that unmap_mapping_range * allows holelen 0 for all, and we allow lend -1 for end of file. */ /* * Unlike in truncate_pagecache, unmap_mapping_range is called only * once (before truncating pagecache), and without "even_cows" flag: * hole-punching should not remove private COWed pages from the hole. */ if ((u64)unmap_end > (u64)unmap_start) unmap_mapping_range(mapping, unmap_start, 1 + unmap_end - unmap_start, 0); truncate_inode_pages_range(mapping, lstart, lend); } EXPORT_SYMBOL(truncate_pagecache_range);
1125 1126 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2013-2014 Intel Corp. */ #include <linux/if_arp.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/module.h> #include <linux/debugfs.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/netdev_lock.h> #include <net/pkt_sched.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/l2cap.h> #include <net/6lowpan.h> /* for the compression support */ #define VERSION "0.1" static struct dentry *lowpan_enable_debugfs; static struct dentry *lowpan_control_debugfs; #define IFACE_NAME_TEMPLATE "bt%d" struct skb_cb { struct in6_addr addr; struct in6_addr gw; struct l2cap_chan *chan; }; #define lowpan_cb(skb) ((struct skb_cb *)((skb)->cb)) /* The devices list contains those devices that we are acting * as a proxy. The BT 6LoWPAN device is a virtual device that * connects to the Bluetooth LE device. The real connection to * BT device is done via l2cap layer. There exists one * virtual device / one BT 6LoWPAN network (=hciX device). * The list contains struct lowpan_dev elements. */ static LIST_HEAD(bt_6lowpan_devices); static DEFINE_SPINLOCK(devices_lock); static bool enable_6lowpan; /* We are listening incoming connections via this channel */ static struct l2cap_chan *listen_chan; static DEFINE_MUTEX(set_lock); enum { LOWPAN_PEER_CLOSING, LOWPAN_PEER_MAXBITS }; struct lowpan_peer { struct list_head list; struct rcu_head rcu; struct l2cap_chan *chan; /* peer addresses in various formats */ unsigned char lladdr[ETH_ALEN]; struct in6_addr peer_addr; DECLARE_BITMAP(flags, LOWPAN_PEER_MAXBITS); }; struct lowpan_btle_dev { struct list_head list; struct hci_dev *hdev; struct net_device *netdev; struct list_head peers; atomic_t peer_count; /* number of items in peers list */ struct work_struct delete_netdev; struct delayed_work notify_peers; }; static inline struct lowpan_btle_dev * lowpan_btle_dev(const struct net_device *netdev) { return (struct lowpan_btle_dev *)lowpan_dev(netdev)->priv; } static inline void peer_add(struct lowpan_btle_dev *dev, struct lowpan_peer *peer) { list_add_rcu(&peer->list, &dev->peers); atomic_inc(&dev->peer_count); } static inline bool peer_del(struct lowpan_btle_dev *dev, struct lowpan_peer *peer) { list_del_rcu(&peer->list); kfree_rcu(peer, rcu); module_put(THIS_MODULE); if (atomic_dec_and_test(&dev->peer_count)) { BT_DBG("last peer"); return true; } return false; } static inline struct lowpan_peer * __peer_lookup_chan(struct lowpan_btle_dev *dev, struct l2cap_chan *chan) { struct lowpan_peer *peer; list_for_each_entry_rcu(peer, &dev->peers, list) { if (peer->chan == chan) return peer; } return NULL; } static inline struct lowpan_peer * __peer_lookup_conn(struct lowpan_btle_dev *dev, struct l2cap_conn *conn) { struct lowpan_peer *peer; list_for_each_entry_rcu(peer, &dev->peers, list) { if (peer->chan->conn == conn) return peer; } return NULL; } static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_btle_dev *dev, struct in6_addr *daddr, struct sk_buff *skb) { struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); int count = atomic_read(&dev->peer_count); const struct in6_addr *nexthop; struct lowpan_peer *peer; struct neighbour *neigh; BT_DBG("peers %d addr %pI6c rt %p", count, daddr, rt); if (!rt) { if (ipv6_addr_any(&lowpan_cb(skb)->gw)) { /* There is neither route nor gateway, * probably the destination is a direct peer. */ nexthop = daddr; } else { /* There is a known gateway */ nexthop = &lowpan_cb(skb)->gw; } } else { nexthop = rt6_nexthop(rt, daddr); /* We need to remember the address because it is needed * by bt_xmit() when sending the packet. In bt_xmit(), the * destination routing info is not set. */ memcpy(&lowpan_cb(skb)->gw, nexthop, sizeof(struct in6_addr)); } BT_DBG("gw %pI6c", nexthop); rcu_read_lock(); list_for_each_entry_rcu(peer, &dev->peers, list) { BT_DBG("dst addr %pMR dst type %u ip %pI6c", &peer->chan->dst, peer->chan->dst_type, &peer->peer_addr); if (!ipv6_addr_cmp(&peer->peer_addr, nexthop)) { rcu_read_unlock(); return peer; } } /* use the neighbour cache for matching addresses assigned by SLAAC */ neigh = __ipv6_neigh_lookup(dev->netdev, nexthop); if (neigh) { list_for_each_entry_rcu(peer, &dev->peers, list) { if (!memcmp(neigh->ha, peer->lladdr, ETH_ALEN)) { neigh_release(neigh); rcu_read_unlock(); return peer; } } neigh_release(neigh); } rcu_read_unlock(); return NULL; } static struct lowpan_peer *lookup_peer(struct l2cap_conn *conn) { struct lowpan_btle_dev *entry; struct lowpan_peer *peer = NULL; rcu_read_lock(); list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) { peer = __peer_lookup_conn(entry, conn); if (peer) break; } rcu_read_unlock(); return peer; } static struct lowpan_btle_dev *lookup_dev(struct l2cap_conn *conn) { struct lowpan_btle_dev *entry; struct lowpan_btle_dev *dev = NULL; rcu_read_lock(); list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) { if (conn->hcon->hdev == entry->hdev) { dev = entry; break; } } rcu_read_unlock(); return dev; } static int give_skb_to_upper(struct sk_buff *skb, struct net_device *dev) { struct sk_buff *skb_cp; skb_cp = skb_copy(skb, GFP_ATOMIC); if (!skb_cp) return NET_RX_DROP; return netif_rx(skb_cp); } static int iphc_decompress(struct sk_buff *skb, struct net_device *netdev, struct lowpan_peer *peer) { const u8 *saddr; saddr = peer->lladdr; return lowpan_header_decompress(skb, netdev, netdev->dev_addr, saddr); } static int recv_pkt(struct sk_buff *skb, struct net_device *dev, struct lowpan_peer *peer) { struct sk_buff *local_skb; int ret; if (!netif_running(dev)) goto drop; if (dev->type != ARPHRD_6LOWPAN || !skb->len) goto drop; skb_reset_network_header(skb); skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) goto drop; /* check that it's our buffer */ if (lowpan_is_ipv6(*skb_network_header(skb))) { /* Pull off the 1-byte of 6lowpan header. */ skb_pull(skb, 1); /* Copy the packet so that the IPv6 header is * properly aligned. */ local_skb = skb_copy_expand(skb, NET_SKB_PAD - 1, skb_tailroom(skb), GFP_ATOMIC); if (!local_skb) goto drop; local_skb->protocol = htons(ETH_P_IPV6); local_skb->pkt_type = PACKET_HOST; local_skb->dev = dev; skb_reset_mac_header(local_skb); skb_set_transport_header(local_skb, sizeof(struct ipv6hdr)); if (give_skb_to_upper(local_skb, dev) != NET_RX_SUCCESS) { kfree_skb(local_skb); goto drop; } dev->stats.rx_bytes += skb->len; dev->stats.rx_packets++; consume_skb(local_skb); consume_skb(skb); } else if (lowpan_is_iphc(*skb_network_header(skb))) { local_skb = skb_clone(skb, GFP_ATOMIC); if (!local_skb) goto drop; local_skb->dev = dev; ret = iphc_decompress(local_skb, dev, peer); if (ret < 0) { BT_DBG("iphc_decompress failed: %d", ret); kfree_skb(local_skb); goto drop; } local_skb->protocol = htons(ETH_P_IPV6); local_skb->pkt_type = PACKET_HOST; if (give_skb_to_upper(local_skb, dev) != NET_RX_SUCCESS) { kfree_skb(local_skb); goto drop; } dev->stats.rx_bytes += skb->len; dev->stats.rx_packets++; consume_skb(local_skb); consume_skb(skb); } else { BT_DBG("unknown packet type"); goto drop; } return NET_RX_SUCCESS; drop: dev->stats.rx_dropped++; return NET_RX_DROP; } /* Packet from BT LE device */ static int chan_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb) { struct lowpan_btle_dev *dev; struct lowpan_peer *peer; int err; peer = lookup_peer(chan->conn); if (!peer) return -ENOENT; dev = lookup_dev(chan->conn); if (!dev || !dev->netdev) return -ENOENT; err = recv_pkt(skb, dev->netdev, peer); if (err) { BT_DBG("recv pkt %d", err); err = -EAGAIN; } return err; } static int setup_header(struct sk_buff *skb, struct net_device *netdev, bdaddr_t *peer_addr, u8 *peer_addr_type) { struct in6_addr ipv6_daddr; struct ipv6hdr *hdr; struct lowpan_btle_dev *dev; struct lowpan_peer *peer; u8 *daddr; int err, status = 0; hdr = ipv6_hdr(skb); dev = lowpan_btle_dev(netdev); memcpy(&ipv6_daddr, &hdr->daddr, sizeof(ipv6_daddr)); if (ipv6_addr_is_multicast(&ipv6_daddr)) { lowpan_cb(skb)->chan = NULL; daddr = NULL; } else { BT_DBG("dest IP %pI6c", &ipv6_daddr); /* The packet might be sent to 6lowpan interface * because of routing (either via default route * or user set route) so get peer according to * the destination address. */ peer = peer_lookup_dst(dev, &ipv6_daddr, skb); if (!peer) { BT_DBG("no such peer"); return -ENOENT; } daddr = peer->lladdr; *peer_addr = peer->chan->dst; *peer_addr_type = peer->chan->dst_type; lowpan_cb(skb)->chan = peer->chan; status = 1; } lowpan_header_compress(skb, netdev, daddr, dev->netdev->dev_addr); err = dev_hard_header(skb, netdev, ETH_P_IPV6, NULL, NULL, 0); if (err < 0) return err; return status; } static int header_create(struct sk_buff *skb, struct net_device *netdev, unsigned short type, const void *_daddr, const void *_saddr, unsigned int len) { if (type != ETH_P_IPV6) return -EINVAL; return 0; } /* Packet to BT LE device */ static int send_pkt(struct l2cap_chan *chan, struct sk_buff *skb, struct net_device *netdev) { struct msghdr msg; struct kvec iv; int err; /* Remember the skb so that we can send EAGAIN to the caller if * we run out of credits. */ chan->data = skb; iv.iov_base = skb->data; iv.iov_len = skb->len; memset(&msg, 0, sizeof(msg)); iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iv, 1, skb->len); err = l2cap_chan_send(chan, &msg, skb->len, NULL); if (err > 0) { netdev->stats.tx_bytes += err; netdev->stats.tx_packets++; return 0; } if (err < 0) netdev->stats.tx_errors++; return err; } static int send_mcast_pkt(struct sk_buff *skb, struct net_device *netdev) { struct sk_buff *local_skb; struct lowpan_btle_dev *entry; int err = 0; rcu_read_lock(); list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) { struct lowpan_peer *pentry; struct lowpan_btle_dev *dev; if (entry->netdev != netdev) continue; dev = lowpan_btle_dev(entry->netdev); list_for_each_entry_rcu(pentry, &dev->peers, list) { int ret; local_skb = skb_clone(skb, GFP_ATOMIC); BT_DBG("xmit %s to %pMR type %u IP %pI6c chan %p", netdev->name, &pentry->chan->dst, pentry->chan->dst_type, &pentry->peer_addr, pentry->chan); ret = send_pkt(pentry->chan, local_skb, netdev); if (ret < 0) err = ret; kfree_skb(local_skb); } } rcu_read_unlock(); return err; } static netdev_tx_t bt_xmit(struct sk_buff *skb, struct net_device *netdev) { int err = 0; bdaddr_t addr; u8 addr_type; /* We must take a copy of the skb before we modify/replace the ipv6 * header as the header could be used elsewhere */ skb = skb_unshare(skb, GFP_ATOMIC); if (!skb) return NET_XMIT_DROP; /* Return values from setup_header() * <0 - error, packet is dropped * 0 - this is a multicast packet * 1 - this is unicast packet */ err = setup_header(skb, netdev, &addr, &addr_type); if (err < 0) { kfree_skb(skb); return NET_XMIT_DROP; } if (err) { if (lowpan_cb(skb)->chan) { BT_DBG("xmit %s to %pMR type %u IP %pI6c chan %p", netdev->name, &addr, addr_type, &lowpan_cb(skb)->addr, lowpan_cb(skb)->chan); err = send_pkt(lowpan_cb(skb)->chan, skb, netdev); } else { err = -ENOENT; } } else { /* We need to send the packet to every device behind this * interface. */ err = send_mcast_pkt(skb, netdev); } dev_kfree_skb(skb); if (err) BT_DBG("ERROR: xmit failed (%d)", err); return err < 0 ? NET_XMIT_DROP : err; } static int bt_dev_init(struct net_device *dev) { netdev_lockdep_set_classes(dev); return 0; } static const struct net_device_ops netdev_ops = { .ndo_init = bt_dev_init, .ndo_start_xmit = bt_xmit, }; static const struct header_ops header_ops = { .create = header_create, }; static void netdev_setup(struct net_device *dev) { dev->hard_header_len = 0; dev->needed_tailroom = 0; dev->flags = IFF_RUNNING | IFF_MULTICAST; dev->watchdog_timeo = 0; dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; dev->netdev_ops = &netdev_ops; dev->header_ops = &header_ops; dev->needs_free_netdev = true; } static const struct device_type bt_type = { .name = "bluetooth", }; static void ifup(struct net_device *netdev) { int err; rtnl_lock(); err = dev_open(netdev, NULL); if (err < 0) BT_INFO("iface %s cannot be opened (%d)", netdev->name, err); rtnl_unlock(); } static void ifdown(struct net_device *netdev) { rtnl_lock(); dev_close(netdev); rtnl_unlock(); } static void do_notify_peers(struct work_struct *work) { struct lowpan_btle_dev *dev = container_of(work, struct lowpan_btle_dev, notify_peers.work); netdev_notify_peers(dev->netdev); /* send neighbour adv at startup */ } static bool is_bt_6lowpan(struct hci_conn *hcon) { if (hcon->type != LE_LINK) return false; if (!enable_6lowpan) return false; return true; } static struct l2cap_chan *chan_create(void) { struct l2cap_chan *chan; chan = l2cap_chan_create(); if (!chan) return NULL; l2cap_chan_set_defaults(chan); chan->chan_type = L2CAP_CHAN_CONN_ORIENTED; chan->mode = L2CAP_MODE_LE_FLOWCTL; chan->imtu = 1280; return chan; } static struct l2cap_chan *add_peer_chan(struct l2cap_chan *chan, struct lowpan_btle_dev *dev, bool new_netdev) { struct lowpan_peer *peer; peer = kzalloc_obj(*peer, GFP_ATOMIC); if (!peer) return NULL; peer->chan = chan; baswap((void *)peer->lladdr, &chan->dst); lowpan_iphc_uncompress_eui48_lladdr(&peer->peer_addr, peer->lladdr); spin_lock(&devices_lock); INIT_LIST_HEAD(&peer->list); peer_add(dev, peer); spin_unlock(&devices_lock); /* Notifying peers about us needs to be done without locks held */ if (new_netdev) INIT_DELAYED_WORK(&dev->notify_peers, do_notify_peers); schedule_delayed_work(&dev->notify_peers, msecs_to_jiffies(100)); return peer->chan; } static int setup_netdev(struct l2cap_chan *chan, struct lowpan_btle_dev **dev) { struct net_device *netdev; bdaddr_t addr; int err; netdev = alloc_netdev(LOWPAN_PRIV_SIZE(sizeof(struct lowpan_btle_dev)), IFACE_NAME_TEMPLATE, NET_NAME_UNKNOWN, netdev_setup); if (!netdev) return -ENOMEM; netdev->addr_assign_type = NET_ADDR_PERM; baswap(&addr, &chan->src); __dev_addr_set(netdev, &addr, sizeof(addr)); netdev->netdev_ops = &netdev_ops; SET_NETDEV_DEV(netdev, &chan->conn->hcon->hdev->dev); SET_NETDEV_DEVTYPE(netdev, &bt_type); *dev = lowpan_btle_dev(netdev); (*dev)->netdev = netdev; (*dev)->hdev = chan->conn->hcon->hdev; INIT_LIST_HEAD(&(*dev)->peers); spin_lock(&devices_lock); INIT_LIST_HEAD(&(*dev)->list); list_add_rcu(&(*dev)->list, &bt_6lowpan_devices); spin_unlock(&devices_lock); err = lowpan_register_netdev(netdev, LOWPAN_LLTYPE_BTLE); if (err < 0) { BT_INFO("register_netdev failed %d", err); spin_lock(&devices_lock); list_del_rcu(&(*dev)->list); spin_unlock(&devices_lock); free_netdev(netdev); goto out; } BT_DBG("ifindex %d peer bdaddr %pMR type %d my addr %pMR type %d", netdev->ifindex, &chan->dst, chan->dst_type, &chan->src, chan->src_type); set_bit(__LINK_STATE_PRESENT, &netdev->state); return 0; out: return err; } static inline void chan_ready_cb(struct l2cap_chan *chan) { struct lowpan_btle_dev *dev; bool new_netdev = false; dev = lookup_dev(chan->conn); BT_DBG("chan %p conn %p dev %p", chan, chan->conn, dev); if (!dev) { if (setup_netdev(chan, &dev) < 0) { l2cap_chan_del(chan, -ENOENT); return; } new_netdev = true; } if (!try_module_get(THIS_MODULE)) return; add_peer_chan(chan, dev, new_netdev); ifup(dev->netdev); } static inline struct l2cap_chan *chan_new_conn_cb(struct l2cap_chan *pchan) { struct l2cap_chan *chan; chan = chan_create(); if (!chan) return NULL; chan->ops = pchan->ops; BT_DBG("chan %p pchan %p", chan, pchan); return chan; } static void delete_netdev(struct work_struct *work) { struct lowpan_btle_dev *entry = container_of(work, struct lowpan_btle_dev, delete_netdev); lowpan_unregister_netdev(entry->netdev); /* The entry pointer is deleted by the netdev destructor. */ } static void chan_close_cb(struct l2cap_chan *chan) { struct lowpan_btle_dev *entry; struct lowpan_btle_dev *dev = NULL; struct lowpan_peer *peer; int err = -ENOENT; bool last = false, remove = true; BT_DBG("chan %p conn %p", chan, chan->conn); if (chan->conn && chan->conn->hcon) { if (!is_bt_6lowpan(chan->conn->hcon)) return; /* If conn is set, then the netdev is also there and we should * not remove it. */ remove = false; } spin_lock(&devices_lock); list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) { dev = lowpan_btle_dev(entry->netdev); peer = __peer_lookup_chan(dev, chan); if (peer) { last = peer_del(dev, peer); err = 0; BT_DBG("dev %p removing %speer %p", dev, last ? "last " : "1 ", peer); BT_DBG("chan %p orig refcnt %u", chan, kref_read(&chan->kref)); l2cap_chan_put(chan); break; } } if (!err && last && dev && !atomic_read(&dev->peer_count)) { spin_unlock(&devices_lock); cancel_delayed_work_sync(&dev->notify_peers); ifdown(dev->netdev); if (remove) { INIT_WORK(&entry->delete_netdev, delete_netdev); schedule_work(&entry->delete_netdev); } } else { spin_unlock(&devices_lock); } } static void chan_state_change_cb(struct l2cap_chan *chan, int state, int err) { BT_DBG("chan %p conn %p state %s err %d", chan, chan->conn, state_to_string(state), err); } static struct sk_buff *chan_alloc_skb_cb(struct l2cap_chan *chan, unsigned long hdr_len, unsigned long len, int nb) { struct sk_buff *skb; /* Note that we must allocate using GFP_ATOMIC here as * this function is called originally from netdev hard xmit * function in atomic context. */ skb = bt_skb_alloc(hdr_len + len, GFP_ATOMIC); if (!skb) return ERR_PTR(-ENOMEM); return skb; } static void chan_suspend_cb(struct l2cap_chan *chan) { struct lowpan_btle_dev *dev; BT_DBG("chan %p suspend", chan); dev = lookup_dev(chan->conn); if (!dev || !dev->netdev) return; netif_stop_queue(dev->netdev); } static void chan_resume_cb(struct l2cap_chan *chan) { struct lowpan_btle_dev *dev; BT_DBG("chan %p resume", chan); dev = lookup_dev(chan->conn); if (!dev || !dev->netdev) return; netif_wake_queue(dev->netdev); } static long chan_get_sndtimeo_cb(struct l2cap_chan *chan) { return L2CAP_CONN_TIMEOUT; } static const struct l2cap_ops bt_6lowpan_chan_ops = { .name = "L2CAP 6LoWPAN channel", .new_connection = chan_new_conn_cb, .recv = chan_recv_cb, .close = chan_close_cb, .state_change = chan_state_change_cb, .ready = chan_ready_cb, .resume = chan_resume_cb, .suspend = chan_suspend_cb, .get_sndtimeo = chan_get_sndtimeo_cb, .alloc_skb = chan_alloc_skb_cb, .teardown = l2cap_chan_no_teardown, .defer = l2cap_chan_no_defer, .set_shutdown = l2cap_chan_no_set_shutdown, }; static int bt_6lowpan_connect(bdaddr_t *addr, u8 dst_type) { struct l2cap_chan *chan; int err; chan = chan_create(); if (!chan) return -EINVAL; chan->ops = &bt_6lowpan_chan_ops; err = l2cap_chan_connect(chan, cpu_to_le16(L2CAP_PSM_IPSP), 0, addr, dst_type, L2CAP_CONN_TIMEOUT); BT_DBG("chan %p err %d", chan, err); if (err < 0) l2cap_chan_put(chan); return err; } static int bt_6lowpan_disconnect(struct l2cap_conn *conn, u8 dst_type) { struct lowpan_peer *peer; BT_DBG("conn %p dst type %u", conn, dst_type); peer = lookup_peer(conn); if (!peer) return -ENOENT; BT_DBG("peer %p chan %p", peer, peer->chan); l2cap_chan_lock(peer->chan); l2cap_chan_close(peer->chan, ENOENT); l2cap_chan_unlock(peer->chan); return 0; } static struct l2cap_chan *bt_6lowpan_listen(void) { bdaddr_t *addr = BDADDR_ANY; struct l2cap_chan *chan; int err; if (!enable_6lowpan) return NULL; chan = chan_create(); if (!chan) return NULL; chan->ops = &bt_6lowpan_chan_ops; chan->state = BT_LISTEN; chan->src_type = BDADDR_LE_PUBLIC; atomic_set(&chan->nesting, L2CAP_NESTING_PARENT); BT_DBG("chan %p src type %u", chan, chan->src_type); err = l2cap_add_psm(chan, addr, cpu_to_le16(L2CAP_PSM_IPSP)); if (err) { l2cap_chan_put(chan); BT_ERR("psm cannot be added err %d", err); return NULL; } return chan; } static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type, struct l2cap_conn **conn, bool disconnect) { struct hci_conn *hcon; struct hci_dev *hdev; int le_addr_type; int n; n = sscanf(buf, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx %hhu", &addr->b[5], &addr->b[4], &addr->b[3], &addr->b[2], &addr->b[1], &addr->b[0], addr_type); if (n < 7) return -EINVAL; if (disconnect) { /* The "disconnect" debugfs command has used different address * type constants than "connect" since 2015. Let's retain that * for now even though it's obviously buggy... */ *addr_type += 1; } switch (*addr_type) { case BDADDR_LE_PUBLIC: le_addr_type = ADDR_LE_DEV_PUBLIC; break; case BDADDR_LE_RANDOM: le_addr_type = ADDR_LE_DEV_RANDOM; break; default: return -EINVAL; } /* The LE_PUBLIC address type is ignored because of BDADDR_ANY */ hdev = hci_get_route(addr, BDADDR_ANY, BDADDR_LE_PUBLIC); if (!hdev) return -ENOENT; hci_dev_lock(hdev); hcon = hci_conn_hash_lookup_le(hdev, addr, le_addr_type); hci_dev_unlock(hdev); hci_dev_put(hdev); if (!hcon) return -ENOENT; *conn = (struct l2cap_conn *)hcon->l2cap_data; BT_DBG("conn %p dst %pMR type %u", *conn, &hcon->dst, hcon->dst_type); return 0; } static void disconnect_all_peers(void) { struct lowpan_btle_dev *entry; struct lowpan_peer *peer; int nchans; /* l2cap_chan_close() cannot be called from RCU, and lock ordering * chan->lock > devices_lock prevents taking write side lock, so copy * then close. */ rcu_read_lock(); list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) list_for_each_entry_rcu(peer, &entry->peers, list) clear_bit(LOWPAN_PEER_CLOSING, peer->flags); rcu_read_unlock(); do { struct l2cap_chan *chans[32]; int i; nchans = 0; spin_lock(&devices_lock); list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) { list_for_each_entry_rcu(peer, &entry->peers, list) { if (test_and_set_bit(LOWPAN_PEER_CLOSING, peer->flags)) continue; l2cap_chan_hold(peer->chan); chans[nchans++] = peer->chan; if (nchans >= ARRAY_SIZE(chans)) goto done; } } done: spin_unlock(&devices_lock); for (i = 0; i < nchans; ++i) { l2cap_chan_lock(chans[i]); l2cap_chan_close(chans[i], ENOENT); l2cap_chan_unlock(chans[i]); l2cap_chan_put(chans[i]); } } while (nchans); } struct set_enable { struct work_struct work; bool flag; }; static void do_enable_set(struct work_struct *work) { struct set_enable *set_enable = container_of(work, struct set_enable, work); if (!set_enable->flag || enable_6lowpan != set_enable->flag) /* Disconnect existing connections if 6lowpan is * disabled */ disconnect_all_peers(); enable_6lowpan = set_enable->flag; mutex_lock(&set_lock); if (listen_chan) { l2cap_chan_lock(listen_chan); l2cap_chan_close(listen_chan, 0); l2cap_chan_unlock(listen_chan); l2cap_chan_put(listen_chan); } listen_chan = bt_6lowpan_listen(); mutex_unlock(&set_lock); kfree(set_enable); } static int lowpan_enable_set(void *data, u64 val) { struct set_enable *set_enable; set_enable = kzalloc_obj(*set_enable); if (!set_enable) return -ENOMEM; set_enable->flag = !!val; INIT_WORK(&set_enable->work, do_enable_set); schedule_work(&set_enable->work); return 0; } static int lowpan_enable_get(void *data, u64 *val) { *val = enable_6lowpan; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(lowpan_enable_fops, lowpan_enable_get, lowpan_enable_set, "%llu\n"); static ssize_t lowpan_control_write(struct file *fp, const char __user *user_buffer, size_t count, loff_t *position) { char buf[32]; size_t buf_size = min(count, sizeof(buf) - 1); int ret; bdaddr_t addr; u8 addr_type; struct l2cap_conn *conn = NULL; if (copy_from_user(buf, user_buffer, buf_size)) return -EFAULT; buf[buf_size] = '\0'; if (memcmp(buf, "connect ", 8) == 0) { ret = get_l2cap_conn(&buf[8], &addr, &addr_type, &conn, false); if (ret == -EINVAL) return ret; mutex_lock(&set_lock); if (listen_chan) { l2cap_chan_lock(listen_chan); l2cap_chan_close(listen_chan, 0); l2cap_chan_unlock(listen_chan); l2cap_chan_put(listen_chan); listen_chan = NULL; } mutex_unlock(&set_lock); if (conn) { struct lowpan_peer *peer; if (!is_bt_6lowpan(conn->hcon)) return -EINVAL; peer = lookup_peer(conn); if (peer) { BT_DBG("6LoWPAN connection already exists"); return -EALREADY; } BT_DBG("conn %p dst %pMR type %d user %u", conn, &conn->hcon->dst, conn->hcon->dst_type, addr_type); } ret = bt_6lowpan_connect(&addr, addr_type); if (ret < 0) return ret; return count; } if (memcmp(buf, "disconnect ", 11) == 0) { ret = get_l2cap_conn(&buf[11], &addr, &addr_type, &conn, true); if (ret < 0) return ret; ret = bt_6lowpan_disconnect(conn, addr_type); if (ret < 0) return ret; return count; } return count; } static int lowpan_control_show(struct seq_file *f, void *ptr) { struct lowpan_btle_dev *entry; struct lowpan_peer *peer; spin_lock(&devices_lock); list_for_each_entry(entry, &bt_6lowpan_devices, list) { list_for_each_entry(peer, &entry->peers, list) seq_printf(f, "%pMR (type %u)\n", &peer->chan->dst, peer->chan->dst_type); } spin_unlock(&devices_lock); return 0; } static int lowpan_control_open(struct inode *inode, struct file *file) { return single_open(file, lowpan_control_show, inode->i_private); } static const struct file_operations lowpan_control_fops = { .open = lowpan_control_open, .read = seq_read, .write = lowpan_control_write, .llseek = seq_lseek, .release = single_release, }; static void disconnect_devices(void) { struct lowpan_btle_dev *entry, *tmp, *new_dev; struct list_head devices; INIT_LIST_HEAD(&devices); /* We make a separate list of devices because the unregister_netdev() * will call device_event() which will also want to modify the same * devices list. */ rcu_read_lock(); list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) { new_dev = kmalloc_obj(*new_dev, GFP_ATOMIC); if (!new_dev) break; new_dev->netdev = entry->netdev; INIT_LIST_HEAD(&new_dev->list); list_add_rcu(&new_dev->list, &devices); } rcu_read_unlock(); list_for_each_entry_safe(entry, tmp, &devices, list) { ifdown(entry->netdev); BT_DBG("Unregistering netdev %s %p", entry->netdev->name, entry->netdev); lowpan_unregister_netdev(entry->netdev); kfree(entry); } } static int device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); struct lowpan_btle_dev *entry; if (netdev->type != ARPHRD_6LOWPAN) return NOTIFY_DONE; switch (event) { case NETDEV_UNREGISTER: spin_lock(&devices_lock); list_for_each_entry(entry, &bt_6lowpan_devices, list) { if (entry->netdev == netdev) { BT_DBG("Unregistered netdev %s %p", netdev->name, netdev); list_del(&entry->list); break; } } spin_unlock(&devices_lock); break; } return NOTIFY_DONE; } static struct notifier_block bt_6lowpan_dev_notifier = { .notifier_call = device_event, }; static int __init bt_6lowpan_init(void) { lowpan_enable_debugfs = debugfs_create_file_unsafe("6lowpan_enable", 0644, bt_debugfs, NULL, &lowpan_enable_fops); lowpan_control_debugfs = debugfs_create_file("6lowpan_control", 0644, bt_debugfs, NULL, &lowpan_control_fops); return register_netdevice_notifier(&bt_6lowpan_dev_notifier); } static void __exit bt_6lowpan_exit(void) { debugfs_remove(lowpan_enable_debugfs); debugfs_remove(lowpan_control_debugfs); if (listen_chan) { l2cap_chan_lock(listen_chan); l2cap_chan_close(listen_chan, 0); l2cap_chan_unlock(listen_chan); l2cap_chan_put(listen_chan); } disconnect_devices(); unregister_netdevice_notifier(&bt_6lowpan_dev_notifier); } module_init(bt_6lowpan_init); module_exit(bt_6lowpan_exit); MODULE_AUTHOR("Jukka Rissanen <jukka.rissanen@linux.intel.com>"); MODULE_DESCRIPTION("Bluetooth 6LoWPAN"); MODULE_VERSION(VERSION); MODULE_LICENSE("GPL");
2074 8 2069 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 // SPDX-License-Identifier: GPL-2.0-only /* * The "user cache". * * (C) Copyright 1991-2000 Linus Torvalds * * We have a per-user structure to keep track of how many * processes, files etc the user has claimed, in order to be * able to have per-user limits for system resources. */ #include <linux/init.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/bitops.h> #include <linux/key.h> #include <linux/sched/user.h> #include <linux/interrupt.h> #include <linux/export.h> #include <linux/user_namespace.h> #include <linux/binfmts.h> #include <linux/proc_ns.h> #if IS_ENABLED(CONFIG_BINFMT_MISC) struct binfmt_misc init_binfmt_misc = { .entries = LIST_HEAD_INIT(init_binfmt_misc.entries), .enabled = true, .entries_lock = __RW_LOCK_UNLOCKED(init_binfmt_misc.entries_lock), }; EXPORT_SYMBOL_GPL(init_binfmt_misc); #endif /* * userns count is 1 for root user, 1 for init_uts_ns, * and 1 for... ? */ struct user_namespace init_user_ns = { .ns = NS_COMMON_INIT(init_user_ns), .uid_map = { { .extent[0] = { .first = 0, .lower_first = 0, .count = 4294967295U, }, .nr_extents = 1, }, }, .gid_map = { { .extent[0] = { .first = 0, .lower_first = 0, .count = 4294967295U, }, .nr_extents = 1, }, }, .projid_map = { { .extent[0] = { .first = 0, .lower_first = 0, .count = 4294967295U, }, .nr_extents = 1, }, }, .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, .flags = USERNS_INIT_FLAGS, #ifdef CONFIG_KEYS .keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list), .keyring_sem = __RWSEM_INITIALIZER(init_user_ns.keyring_sem), #endif #if IS_ENABLED(CONFIG_BINFMT_MISC) .binfmt_misc = &init_binfmt_misc, #endif }; EXPORT_SYMBOL_GPL(init_user_ns); /* * UID task count cache, to get fast user lookup in "alloc_uid" * when changing user ID's (ie setuid() and friends). */ #define UIDHASH_BITS (IS_ENABLED(CONFIG_BASE_SMALL) ? 3 : 7) #define UIDHASH_SZ (1 << UIDHASH_BITS) #define UIDHASH_MASK (UIDHASH_SZ - 1) #define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) #define uidhashentry(uid) (uidhash_table + __uidhashfn((__kuid_val(uid)))) static struct kmem_cache *uid_cachep; static struct hlist_head uidhash_table[UIDHASH_SZ]; /* * The uidhash_lock is mostly taken from process context, but it is * occasionally also taken from softirq/tasklet context, when * task-structs get RCU-freed. Hence all locking must be softirq-safe. * But free_uid() is also called with local interrupts disabled, and running * local_bh_enable() with local interrupts disabled is an error - we'll run * softirq callbacks, and they can unconditionally enable interrupts, and * the caller of free_uid() didn't expect that.. */ static DEFINE_SPINLOCK(uidhash_lock); /* root_user.__count is 1, for init task cred */ struct user_struct root_user = { .__count = REFCOUNT_INIT(1), .uid = GLOBAL_ROOT_UID, .ratelimit = RATELIMIT_STATE_INIT(root_user.ratelimit, 0, 0), }; /* * These routines must be called with the uidhash spinlock held! */ static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent) { hlist_add_head(&up->uidhash_node, hashent); } static void uid_hash_remove(struct user_struct *up) { hlist_del_init(&up->uidhash_node); } static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) { struct user_struct *user; hlist_for_each_entry(user, hashent, uidhash_node) { if (uid_eq(user->uid, uid)) { refcount_inc(&user->__count); return user; } } return NULL; } static int user_epoll_alloc(struct user_struct *up) { #ifdef CONFIG_EPOLL return percpu_counter_init(&up->epoll_watches, 0, GFP_KERNEL); #else return 0; #endif } static void user_epoll_free(struct user_struct *up) { #ifdef CONFIG_EPOLL percpu_counter_destroy(&up->epoll_watches); #endif } /* IRQs are disabled and uidhash_lock is held upon function entry. * IRQ state (as stored in flags) is restored and uidhash_lock released * upon function exit. */ static void free_user(struct user_struct *up, unsigned long flags) __releases(&uidhash_lock) { uid_hash_remove(up); spin_unlock_irqrestore(&uidhash_lock, flags); user_epoll_free(up); kmem_cache_free(uid_cachep, up); } /* * Locate the user_struct for the passed UID. If found, take a ref on it. The * caller must undo that ref with free_uid(). * * If the user_struct could not be found, return NULL. */ struct user_struct *find_user(kuid_t uid) { struct user_struct *ret; unsigned long flags; spin_lock_irqsave(&uidhash_lock, flags); ret = uid_hash_find(uid, uidhashentry(uid)); spin_unlock_irqrestore(&uidhash_lock, flags); return ret; } void free_uid(struct user_struct *up) { unsigned long flags; if (!up) return; if (refcount_dec_and_lock_irqsave(&up->__count, &uidhash_lock, &flags)) free_user(up, flags); } EXPORT_SYMBOL_GPL(free_uid); struct user_struct *alloc_uid(kuid_t uid) { struct hlist_head *hashent = uidhashentry(uid); struct user_struct *up, *new; spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); spin_unlock_irq(&uidhash_lock); if (!up) { new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL); if (!new) return NULL; new->uid = uid; refcount_set(&new->__count, 1); if (user_epoll_alloc(new)) { kmem_cache_free(uid_cachep, new); return NULL; } ratelimit_state_init(&new->ratelimit, HZ, 100); ratelimit_set_flags(&new->ratelimit, RATELIMIT_MSG_ON_RELEASE); /* * Before adding this, check whether we raced * on adding the same user already.. */ spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { user_epoll_free(new); kmem_cache_free(uid_cachep, new); } else { uid_hash_insert(new, hashent); up = new; } spin_unlock_irq(&uidhash_lock); } return up; } static int __init uid_cache_init(void) { int n; uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); for(n = 0; n < UIDHASH_SZ; ++n) INIT_HLIST_HEAD(uidhash_table + n); if (user_epoll_alloc(&root_user)) panic("root_user epoll percpu counter alloc failed"); /* Insert the root user immediately (init already runs as root) */ spin_lock_irq(&uidhash_lock); uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID)); spin_unlock_irq(&uidhash_lock); return 0; } subsys_initcall(uid_cache_init);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2007-2017 Nicira, Inc. */ #ifndef FLOW_H #define FLOW_H 1 #include <linux/cache.h> #include <linux/kernel.h> #include <linux/netlink.h> #include <linux/openvswitch.h> #include <linux/spinlock.h> #include <linux/types.h> #include <linux/rcupdate.h> #include <linux/if_ether.h> #include <linux/in6.h> #include <linux/jiffies.h> #include <linux/time.h> #include <linux/cpumask.h> #include <net/inet_ecn.h> #include <net/ip_tunnels.h> #include <net/dst_metadata.h> #include <net/nsh.h> struct sk_buff; enum sw_flow_mac_proto { MAC_PROTO_NONE = 0, MAC_PROTO_ETHERNET, }; #define SW_FLOW_KEY_INVALID 0x80 #define MPLS_LABEL_DEPTH 3 /* Bit definitions for IPv6 Extension Header pseudo-field. */ enum ofp12_ipv6exthdr_flags { OFPIEH12_NONEXT = 1 << 0, /* "No next header" encountered. */ OFPIEH12_ESP = 1 << 1, /* Encrypted Sec Payload header present. */ OFPIEH12_AUTH = 1 << 2, /* Authentication header present. */ OFPIEH12_DEST = 1 << 3, /* 1 or 2 dest headers present. */ OFPIEH12_FRAG = 1 << 4, /* Fragment header present. */ OFPIEH12_ROUTER = 1 << 5, /* Router header present. */ OFPIEH12_HOP = 1 << 6, /* Hop-by-hop header present. */ OFPIEH12_UNREP = 1 << 7, /* Unexpected repeats encountered. */ OFPIEH12_UNSEQ = 1 << 8 /* Unexpected sequencing encountered. */ }; /* Store options at the end of the array if they are less than the * maximum size. This allows us to get the benefits of variable length * matching for small options. */ #define TUN_METADATA_OFFSET(opt_len) \ (sizeof_field(struct sw_flow_key, tun_opts) - opt_len) #define TUN_METADATA_OPTS(flow_key, opt_len) \ ((void *)((flow_key)->tun_opts + TUN_METADATA_OFFSET(opt_len))) struct ovs_tunnel_info { struct metadata_dst *tun_dst; }; struct vlan_head { __be16 tpid; /* Vlan type. Generally 802.1q or 802.1ad.*/ __be16 tci; /* 0 if no VLAN, VLAN_CFI_MASK set otherwise. */ }; #define OVS_SW_FLOW_KEY_METADATA_SIZE \ (offsetof(struct sw_flow_key, recirc_id) + \ sizeof_field(struct sw_flow_key, recirc_id)) struct ovs_key_nsh { struct ovs_nsh_key_base base; __be32 context[NSH_MD1_CONTEXT_SIZE]; }; struct sw_flow_key { u8 tun_opts[IP_TUNNEL_OPTS_MAX]; u8 tun_opts_len; struct ip_tunnel_key tun_key; /* Encapsulating tunnel key. */ struct { u32 priority; /* Packet QoS priority. */ u32 skb_mark; /* SKB mark. */ u16 in_port; /* Input switch port (or DP_MAX_PORTS). */ } __packed phy; /* Safe when right after 'tun_key'. */ u8 mac_proto; /* MAC layer protocol (e.g. Ethernet). */ u8 tun_proto; /* Protocol of encapsulating tunnel. */ u32 ovs_flow_hash; /* Datapath computed hash value. */ u32 recirc_id; /* Recirculation ID. */ struct { u8 src[ETH_ALEN]; /* Ethernet source address. */ u8 dst[ETH_ALEN]; /* Ethernet destination address. */ struct vlan_head vlan; struct vlan_head cvlan; __be16 type; /* Ethernet frame type. */ } eth; /* Filling a hole of two bytes. */ u8 ct_state; u8 ct_orig_proto; /* CT original direction tuple IP * protocol. */ union { struct { u8 proto; /* IP protocol or lower 8 bits of ARP opcode. */ u8 tos; /* IP ToS. */ u8 ttl; /* IP TTL/hop limit. */ u8 frag; /* One of OVS_FRAG_TYPE_*. */ } ip; }; u16 ct_zone; /* Conntrack zone. */ struct { __be16 src; /* TCP/UDP/SCTP source port. */ __be16 dst; /* TCP/UDP/SCTP destination port. */ __be16 flags; /* TCP flags. */ } tp; union { struct { struct { __be32 src; /* IP source address. */ __be32 dst; /* IP destination address. */ } addr; union { struct { __be32 src; __be32 dst; } ct_orig; /* Conntrack original direction fields. */ struct { u8 sha[ETH_ALEN]; /* ARP source hardware address. */ u8 tha[ETH_ALEN]; /* ARP target hardware address. */ } arp; }; } ipv4; struct { struct { struct in6_addr src; /* IPv6 source address. */ struct in6_addr dst; /* IPv6 destination address. */ } addr; __be32 label; /* IPv6 flow label. */ u16 exthdrs; /* IPv6 extension header flags */ union { struct { struct in6_addr src; struct in6_addr dst; } ct_orig; /* Conntrack original direction fields. */ struct { struct in6_addr target; /* ND target address. */ u8 sll[ETH_ALEN]; /* ND source link layer address. */ u8 tll[ETH_ALEN]; /* ND target link layer address. */ } nd; }; } ipv6; struct { u32 num_labels_mask; /* labels present bitmap of effective length MPLS_LABEL_DEPTH */ __be32 lse[MPLS_LABEL_DEPTH]; /* label stack entry */ } mpls; struct ovs_key_nsh nsh; /* network service header */ }; struct { /* Connection tracking fields not packed above. */ struct { __be16 src; /* CT orig tuple tp src port. */ __be16 dst; /* CT orig tuple tp dst port. */ } orig_tp; u32 mark; struct ovs_key_ct_labels labels; } ct; } __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */ static inline bool sw_flow_key_is_nd(const struct sw_flow_key *key) { return key->eth.type == htons(ETH_P_IPV6) && key->ip.proto == NEXTHDR_ICMP && key->tp.dst == 0 && (key->tp.src == htons(NDISC_NEIGHBOUR_SOLICITATION) || key->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)); } struct sw_flow_key_range { unsigned short int start; unsigned short int end; }; struct sw_flow_mask { int ref_count; struct rcu_head rcu; struct sw_flow_key_range range; struct sw_flow_key key; }; struct sw_flow_match { struct sw_flow_key *key; struct sw_flow_key_range range; struct sw_flow_mask *mask; }; #define MAX_UFID_LENGTH 16 /* 128 bits */ struct sw_flow_id { u32 ufid_len; union { u32 ufid[MAX_UFID_LENGTH / 4]; struct sw_flow_key *unmasked_key; }; }; struct sw_flow_actions { struct rcu_head rcu; size_t orig_len; /* From flow_cmd_new netlink actions size */ u32 actions_len; struct nlattr actions[]; }; struct sw_flow_stats { u64 packet_count; /* Number of packets matched. */ u64 byte_count; /* Number of bytes matched. */ unsigned long used; /* Last used time (in jiffies). */ spinlock_t lock; /* Lock for atomic stats update. */ __be16 tcp_flags; /* Union of seen TCP flags. */ }; struct sw_flow { struct rcu_head rcu; struct { struct hlist_node node[2]; u32 hash; } flow_table, ufid_table; int stats_last_writer; /* CPU id of the last writer on * 'stats[0]'. */ struct sw_flow_key key; struct sw_flow_id id; struct cpumask *cpu_used_mask; struct sw_flow_mask *mask; struct sw_flow_actions __rcu *sf_acts; struct sw_flow_stats __rcu *stats[]; /* One for each CPU. First one * is allocated at flow creation time, * the rest are allocated on demand * while holding the 'stats[0].lock'. */ }; struct arp_eth_header { __be16 ar_hrd; /* format of hardware address */ __be16 ar_pro; /* format of protocol address */ unsigned char ar_hln; /* length of hardware address */ unsigned char ar_pln; /* length of protocol address */ __be16 ar_op; /* ARP opcode (command) */ /* Ethernet+IPv4 specific members. */ unsigned char ar_sha[ETH_ALEN]; /* sender hardware address */ unsigned char ar_sip[4]; /* sender IP address */ unsigned char ar_tha[ETH_ALEN]; /* target hardware address */ unsigned char ar_tip[4]; /* target IP address */ } __packed; static inline u8 ovs_key_mac_proto(const struct sw_flow_key *key) { return key->mac_proto & ~SW_FLOW_KEY_INVALID; } static inline u16 __ovs_mac_header_len(u8 mac_proto) { return mac_proto == MAC_PROTO_ETHERNET ? ETH_HLEN : 0; } static inline u16 ovs_mac_header_len(const struct sw_flow_key *key) { return __ovs_mac_header_len(ovs_key_mac_proto(key)); } static inline bool ovs_identifier_is_ufid(const struct sw_flow_id *sfid) { return sfid->ufid_len; } static inline bool ovs_identifier_is_key(const struct sw_flow_id *sfid) { return !ovs_identifier_is_ufid(sfid); } void ovs_flow_stats_update(struct sw_flow *, __be16 tcp_flags, const struct sk_buff *); void ovs_flow_stats_get(const struct sw_flow *, struct ovs_flow_stats *, unsigned long *used, __be16 *tcp_flags); void ovs_flow_stats_clear(struct sw_flow *); u64 ovs_flow_used_time(unsigned long flow_jiffies); int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key); int ovs_flow_key_update_l3l4(struct sk_buff *skb, struct sw_flow_key *key); int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key); /* Extract key from packet coming from userspace. */ int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr, struct sk_buff *skb, struct sw_flow_key *key, bool log); #endif /* flow.h */
202 201 448 262 202 202 165 166 445 54 1127 1091 449 166 445 54 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2018, Intel Corporation. */ /* A common module to handle registrations and notifications for paravirtual * drivers to enable accelerated datapath and support VF live migration. * * The notifier and event handling code is based on netvsc driver. */ #include <linux/module.h> #include <linux/etherdevice.h> #include <uapi/linux/if_arp.h> #include <linux/rtnetlink.h> #include <linux/if_vlan.h> #include <net/failover.h> static LIST_HEAD(failover_list); static DEFINE_SPINLOCK(failover_lock); static struct net_device *failover_get_bymac(u8 *mac, struct failover_ops **ops) { struct net_device *failover_dev; struct failover *failover; spin_lock(&failover_lock); list_for_each_entry(failover, &failover_list, list) { failover_dev = rtnl_dereference(failover->failover_dev); if (ether_addr_equal(failover_dev->perm_addr, mac)) { *ops = rtnl_dereference(failover->ops); spin_unlock(&failover_lock); return failover_dev; } } spin_unlock(&failover_lock); return NULL; } /** * failover_slave_register - Register a slave netdev * * @slave_dev: slave netdev that is being registered * * Registers a slave device to a failover instance. Only ethernet devices * are supported. */ static int failover_slave_register(struct net_device *slave_dev) { struct netdev_lag_upper_info lag_upper_info; struct net_device *failover_dev; struct failover_ops *fops; int err; if (slave_dev->type != ARPHRD_ETHER) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (fops && fops->slave_pre_register && fops->slave_pre_register(slave_dev, failover_dev)) goto done; err = netdev_rx_handler_register(slave_dev, fops->slave_handle_frame, failover_dev); if (err) { netdev_err(slave_dev, "can not register failover rx handler (err = %d)\n", err); goto done; } lag_upper_info.tx_type = NETDEV_LAG_TX_TYPE_ACTIVEBACKUP; err = netdev_master_upper_dev_link(slave_dev, failover_dev, NULL, &lag_upper_info, NULL); if (err) { netdev_err(slave_dev, "can not set failover device %s (err = %d)\n", failover_dev->name, err); goto err_upper_link; } slave_dev->priv_flags |= (IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF); if (fops && fops->slave_register && !fops->slave_register(slave_dev, failover_dev)) return NOTIFY_OK; netdev_upper_dev_unlink(slave_dev, failover_dev); slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF); err_upper_link: netdev_rx_handler_unregister(slave_dev); done: return NOTIFY_DONE; } /** * failover_slave_unregister - Unregister a slave netdev * * @slave_dev: slave netdev that is being unregistered * * Unregisters a slave device from a failover instance. */ int failover_slave_unregister(struct net_device *slave_dev) { struct net_device *failover_dev; struct failover_ops *fops; if (!netif_is_failover_slave(slave_dev)) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (fops && fops->slave_pre_unregister && fops->slave_pre_unregister(slave_dev, failover_dev)) goto done; netdev_rx_handler_unregister(slave_dev); netdev_upper_dev_unlink(slave_dev, failover_dev); slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF); if (fops && fops->slave_unregister && !fops->slave_unregister(slave_dev, failover_dev)) return NOTIFY_OK; done: return NOTIFY_DONE; } EXPORT_SYMBOL_GPL(failover_slave_unregister); static int failover_slave_link_change(struct net_device *slave_dev) { struct net_device *failover_dev; struct failover_ops *fops; if (!netif_is_failover_slave(slave_dev)) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (!netif_running(failover_dev)) goto done; if (fops && fops->slave_link_change && !fops->slave_link_change(slave_dev, failover_dev)) return NOTIFY_OK; done: return NOTIFY_DONE; } static int failover_slave_name_change(struct net_device *slave_dev) { struct net_device *failover_dev; struct failover_ops *fops; if (!netif_is_failover_slave(slave_dev)) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (!netif_running(failover_dev)) goto done; if (fops && fops->slave_name_change && !fops->slave_name_change(slave_dev, failover_dev)) return NOTIFY_OK; done: return NOTIFY_DONE; } static int failover_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); /* Skip parent events */ if (netif_is_failover(event_dev)) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: return failover_slave_register(event_dev); case NETDEV_UNREGISTER: return failover_slave_unregister(event_dev); case NETDEV_UP: case NETDEV_DOWN: case NETDEV_CHANGE: return failover_slave_link_change(event_dev); case NETDEV_CHANGENAME: return failover_slave_name_change(event_dev); default: return NOTIFY_DONE; } } static struct notifier_block failover_notifier = { .notifier_call = failover_event, }; static void failover_existing_slave_register(struct net_device *failover_dev) { struct net *net = dev_net(failover_dev); struct net_device *dev; rtnl_lock(); for_each_netdev(net, dev) { if (netif_is_failover(dev)) continue; if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr)) failover_slave_register(dev); } rtnl_unlock(); } /** * failover_register - Register a failover instance * * @dev: failover netdev * @ops: failover ops * * Allocate and register a failover instance for a failover netdev. ops * provides handlers for slave device register/unregister/link change/ * name change events. * * Return: pointer to failover instance */ struct failover *failover_register(struct net_device *dev, struct failover_ops *ops) { struct failover *failover; if (dev->type != ARPHRD_ETHER) return ERR_PTR(-EINVAL); failover = kzalloc_obj(*failover); if (!failover) return ERR_PTR(-ENOMEM); rcu_assign_pointer(failover->ops, ops); netdev_hold(dev, &failover->dev_tracker, GFP_KERNEL); dev->priv_flags |= IFF_FAILOVER; rcu_assign_pointer(failover->failover_dev, dev); spin_lock(&failover_lock); list_add_tail(&failover->list, &failover_list); spin_unlock(&failover_lock); netdev_info(dev, "failover master:%s registered\n", dev->name); failover_existing_slave_register(dev); return failover; } EXPORT_SYMBOL_GPL(failover_register); /** * failover_unregister - Unregister a failover instance * * @failover: pointer to failover instance * * Unregisters and frees a failover instance. */ void failover_unregister(struct failover *failover) { struct net_device *failover_dev; failover_dev = rcu_dereference(failover->failover_dev); netdev_info(failover_dev, "failover master:%s unregistered\n", failover_dev->name); failover_dev->priv_flags &= ~IFF_FAILOVER; netdev_put(failover_dev, &failover->dev_tracker); spin_lock(&failover_lock); list_del(&failover->list); spin_unlock(&failover_lock); kfree(failover); } EXPORT_SYMBOL_GPL(failover_unregister); static __init int failover_init(void) { register_netdevice_notifier(&failover_notifier); return 0; } module_init(failover_init); static __exit void failover_exit(void) { unregister_netdevice_notifier(&failover_notifier); } module_exit(failover_exit); MODULE_DESCRIPTION("Generic failover infrastructure/interface"); MODULE_LICENSE("GPL v2");
7172 7183 2969 7163 7199 7207 5706 7153 2077 2077 260 208 3 3 3 3 5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 // SPDX-License-Identifier: GPL-2.0-only /* * lib/bitmap.c * Helper functions for bitmap.h. */ #include <linux/bitmap.h> #include <linux/bitops.h> #include <linux/ctype.h> #include <linux/device.h> #include <linux/export.h> #include <linux/slab.h> /** * DOC: bitmap introduction * * bitmaps provide an array of bits, implemented using an * array of unsigned longs. The number of valid bits in a * given bitmap does _not_ need to be an exact multiple of * BITS_PER_LONG. * * The possible unused bits in the last, partially used word * of a bitmap are 'don't care'. The implementation makes * no particular effort to keep them zero. It ensures that * their value will not affect the results of any operation. * The bitmap operations that return Boolean (bitmap_empty, * for example) or scalar (bitmap_weight, for example) results * carefully filter out these unused bits from impacting their * results. * * The byte ordering of bitmaps is more natural on little * endian architectures. See the big-endian headers * include/asm-ppc64/bitops.h and include/asm-s390/bitops.h * for the best explanations of this ordering. */ bool __bitmap_equal(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { unsigned int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (bitmap1[k] != bitmap2[k]) return false; if (bits % BITS_PER_LONG) if ((bitmap1[k] ^ bitmap2[k]) & BITMAP_LAST_WORD_MASK(bits)) return false; return true; } EXPORT_SYMBOL(__bitmap_equal); bool __bitmap_or_equal(const unsigned long *bitmap1, const unsigned long *bitmap2, const unsigned long *bitmap3, unsigned int bits) { unsigned int k, lim = bits / BITS_PER_LONG; unsigned long tmp; for (k = 0; k < lim; ++k) { if ((bitmap1[k] | bitmap2[k]) != bitmap3[k]) return false; } if (!(bits % BITS_PER_LONG)) return true; tmp = (bitmap1[k] | bitmap2[k]) ^ bitmap3[k]; return (tmp & BITMAP_LAST_WORD_MASK(bits)) == 0; } void __bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int bits) { unsigned int k, lim = BITS_TO_LONGS(bits); for (k = 0; k < lim; ++k) dst[k] = ~src[k]; } EXPORT_SYMBOL(__bitmap_complement); /** * __bitmap_shift_right - logical right shift of the bits in a bitmap * @dst : destination bitmap * @src : source bitmap * @shift : shift by this many bits * @nbits : bitmap size, in bits * * Shifting right (dividing) means moving bits in the MS -> LS bit * direction. Zeros are fed into the vacated MS positions and the * LS bits shifted off the bottom are lost. */ void __bitmap_shift_right(unsigned long *dst, const unsigned long *src, unsigned shift, unsigned nbits) { unsigned k, lim = BITS_TO_LONGS(nbits); unsigned off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG; unsigned long mask = BITMAP_LAST_WORD_MASK(nbits); for (k = 0; off + k < lim; ++k) { unsigned long upper, lower; /* * If shift is not word aligned, take lower rem bits of * word above and make them the top rem bits of result. */ if (!rem || off + k + 1 >= lim) upper = 0; else { upper = src[off + k + 1]; if (off + k + 1 == lim - 1) upper &= mask; upper <<= (BITS_PER_LONG - rem); } lower = src[off + k]; if (off + k == lim - 1) lower &= mask; lower >>= rem; dst[k] = lower | upper; } if (off) memset(&dst[lim - off], 0, off*sizeof(unsigned long)); } EXPORT_SYMBOL(__bitmap_shift_right); /** * __bitmap_shift_left - logical left shift of the bits in a bitmap * @dst : destination bitmap * @src : source bitmap * @shift : shift by this many bits * @nbits : bitmap size, in bits * * Shifting left (multiplying) means moving bits in the LS -> MS * direction. Zeros are fed into the vacated LS bit positions * and those MS bits shifted off the top are lost. */ void __bitmap_shift_left(unsigned long *dst, const unsigned long *src, unsigned int shift, unsigned int nbits) { int k; unsigned int lim = BITS_TO_LONGS(nbits); unsigned int off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG; for (k = lim - off - 1; k >= 0; --k) { unsigned long upper, lower; /* * If shift is not word aligned, take upper rem bits of * word below and make them the bottom rem bits of result. */ if (rem && k > 0) lower = src[k - 1] >> (BITS_PER_LONG - rem); else lower = 0; upper = src[k] << rem; dst[k + off] = lower | upper; } if (off) memset(dst, 0, off*sizeof(unsigned long)); } EXPORT_SYMBOL(__bitmap_shift_left); /** * bitmap_cut() - remove bit region from bitmap and right shift remaining bits * @dst: destination bitmap, might overlap with src * @src: source bitmap * @first: start bit of region to be removed * @cut: number of bits to remove * @nbits: bitmap size, in bits * * Set the n-th bit of @dst iff the n-th bit of @src is set and * n is less than @first, or the m-th bit of @src is set for any * m such that @first <= n < nbits, and m = n + @cut. * * In pictures, example for a big-endian 32-bit architecture: * * The @src bitmap is:: * * 31 63 * | | * 10000000 11000001 11110010 00010101 10000000 11000001 01110010 00010101 * | | | | * 16 14 0 32 * * if @cut is 3, and @first is 14, bits 14-16 in @src are cut and @dst is:: * * 31 63 * | | * 10110000 00011000 00110010 00010101 00010000 00011000 00101110 01000010 * | | | * 14 (bit 17 0 32 * from @src) * * Note that @dst and @src might overlap partially or entirely. * * This is implemented in the obvious way, with a shift and carry * step for each moved bit. Optimisation is left as an exercise * for the compiler. */ void bitmap_cut(unsigned long *dst, const unsigned long *src, unsigned int first, unsigned int cut, unsigned int nbits) { unsigned int len = BITS_TO_LONGS(nbits); unsigned long keep = 0, carry; int i; if (first % BITS_PER_LONG) { keep = src[first / BITS_PER_LONG] & (~0UL >> (BITS_PER_LONG - first % BITS_PER_LONG)); } memmove(dst, src, len * sizeof(*dst)); while (cut--) { for (i = first / BITS_PER_LONG; i < len; i++) { if (i < len - 1) carry = dst[i + 1] & 1UL; else carry = 0; dst[i] = (dst[i] >> 1) | (carry << (BITS_PER_LONG - 1)); } } dst[first / BITS_PER_LONG] &= ~0UL << (first % BITS_PER_LONG); dst[first / BITS_PER_LONG] |= keep; } EXPORT_SYMBOL(bitmap_cut); bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { unsigned int k; unsigned int lim = bits/BITS_PER_LONG; unsigned long result = 0; for (k = 0; k < lim; k++) result |= (dst[k] = bitmap1[k] & bitmap2[k]); if (bits % BITS_PER_LONG) result |= (dst[k] = bitmap1[k] & bitmap2[k] & BITMAP_LAST_WORD_MASK(bits)); return result != 0; } EXPORT_SYMBOL(__bitmap_and); void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { unsigned int k; unsigned int nr = BITS_TO_LONGS(bits); for (k = 0; k < nr; k++) dst[k] = bitmap1[k] | bitmap2[k]; } EXPORT_SYMBOL(__bitmap_or); void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { unsigned int k; unsigned int nr = BITS_TO_LONGS(bits); for (k = 0; k < nr; k++) dst[k] = bitmap1[k] ^ bitmap2[k]; } EXPORT_SYMBOL(__bitmap_xor); bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { unsigned int k; unsigned int lim = bits/BITS_PER_LONG; unsigned long result = 0; for (k = 0; k < lim; k++) result |= (dst[k] = bitmap1[k] & ~bitmap2[k]); if (bits % BITS_PER_LONG) result |= (dst[k] = bitmap1[k] & ~bitmap2[k] & BITMAP_LAST_WORD_MASK(bits)); return result != 0; } EXPORT_SYMBOL(__bitmap_andnot); void __bitmap_replace(unsigned long *dst, const unsigned long *old, const unsigned long *new, const unsigned long *mask, unsigned int nbits) { unsigned int k; unsigned int nr = BITS_TO_LONGS(nbits); for (k = 0; k < nr; k++) dst[k] = (old[k] & ~mask[k]) | (new[k] & mask[k]); } EXPORT_SYMBOL(__bitmap_replace); bool __bitmap_intersects(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { unsigned int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (bitmap1[k] & bitmap2[k]) return true; if (bits % BITS_PER_LONG) if ((bitmap1[k] & bitmap2[k]) & BITMAP_LAST_WORD_MASK(bits)) return true; return false; } EXPORT_SYMBOL(__bitmap_intersects); bool __bitmap_subset(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { unsigned int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (bitmap1[k] & ~bitmap2[k]) return false; if (bits % BITS_PER_LONG) if ((bitmap1[k] & ~bitmap2[k]) & BITMAP_LAST_WORD_MASK(bits)) return false; return true; } EXPORT_SYMBOL(__bitmap_subset); #define BITMAP_WEIGHT(FETCH, bits) \ ({ \ unsigned int __bits = (bits), idx, w = 0; \ \ for (idx = 0; idx < __bits / BITS_PER_LONG; idx++) \ w += hweight_long(FETCH); \ \ if (__bits % BITS_PER_LONG) \ w += hweight_long((FETCH) & BITMAP_LAST_WORD_MASK(__bits)); \ \ w; \ }) unsigned int __bitmap_weight(const unsigned long *bitmap, unsigned int bits) { return BITMAP_WEIGHT(bitmap[idx], bits); } EXPORT_SYMBOL(__bitmap_weight); unsigned int __bitmap_weight_and(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { return BITMAP_WEIGHT(bitmap1[idx] & bitmap2[idx], bits); } EXPORT_SYMBOL(__bitmap_weight_and); unsigned int __bitmap_weight_andnot(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { return BITMAP_WEIGHT(bitmap1[idx] & ~bitmap2[idx], bits); } EXPORT_SYMBOL(__bitmap_weight_andnot); unsigned int __bitmap_weighted_or(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { return BITMAP_WEIGHT(({dst[idx] = bitmap1[idx] | bitmap2[idx]; dst[idx]; }), bits); } void __bitmap_set(unsigned long *map, unsigned int start, int len) { unsigned long *p = map + BIT_WORD(start); const unsigned int size = start + len; int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG); unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start); while (len - bits_to_set >= 0) { *p |= mask_to_set; len -= bits_to_set; bits_to_set = BITS_PER_LONG; mask_to_set = ~0UL; p++; } if (len) { mask_to_set &= BITMAP_LAST_WORD_MASK(size); *p |= mask_to_set; } } EXPORT_SYMBOL(__bitmap_set); void __bitmap_clear(unsigned long *map, unsigned int start, int len) { unsigned long *p = map + BIT_WORD(start); const unsigned int size = start + len; int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start); while (len - bits_to_clear >= 0) { *p &= ~mask_to_clear; len -= bits_to_clear; bits_to_clear = BITS_PER_LONG; mask_to_clear = ~0UL; p++; } if (len) { mask_to_clear &= BITMAP_LAST_WORD_MASK(size); *p &= ~mask_to_clear; } } EXPORT_SYMBOL(__bitmap_clear); /** * bitmap_find_next_zero_area_off - find a contiguous aligned zero area * @map: The address to base the search on * @size: The bitmap size in bits * @start: The bitnumber to start searching at * @nr: The number of zeroed bits we're looking for * @align_mask: Alignment mask for zero area * @align_offset: Alignment offset for zero area. * * The @align_mask should be one less than a power of 2; the effect is that * the bit offset of all zero areas this function finds plus @align_offset * is multiple of that power of 2. */ unsigned long bitmap_find_next_zero_area_off(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, unsigned long align_mask, unsigned long align_offset) { unsigned long index, end, i; again: index = find_next_zero_bit(map, size, start); /* Align allocation */ index = __ALIGN_MASK(index + align_offset, align_mask) - align_offset; end = index + nr; if (end > size) return end; i = find_next_bit(map, end, index); if (i < end) { start = i + 1; goto again; } return index; } EXPORT_SYMBOL(bitmap_find_next_zero_area_off); /** * bitmap_pos_to_ord - find ordinal of set bit at given position in bitmap * @buf: pointer to a bitmap * @pos: a bit position in @buf (0 <= @pos < @nbits) * @nbits: number of valid bit positions in @buf * * Map the bit at position @pos in @buf (of length @nbits) to the * ordinal of which set bit it is. If it is not set or if @pos * is not a valid bit position, map to -1. * * If for example, just bits 4 through 7 are set in @buf, then @pos * values 4 through 7 will get mapped to 0 through 3, respectively, * and other @pos values will get mapped to -1. When @pos value 7 * gets mapped to (returns) @ord value 3 in this example, that means * that bit 7 is the 3rd (starting with 0th) set bit in @buf. * * The bit positions 0 through @bits are valid positions in @buf. */ static int bitmap_pos_to_ord(const unsigned long *buf, unsigned int pos, unsigned int nbits) { if (pos >= nbits || !test_bit(pos, buf)) return -1; return bitmap_weight(buf, pos); } /** * bitmap_remap - Apply map defined by a pair of bitmaps to another bitmap * @dst: remapped result * @src: subset to be remapped * @old: defines domain of map * @new: defines range of map * @nbits: number of bits in each of these bitmaps * * Let @old and @new define a mapping of bit positions, such that * whatever position is held by the n-th set bit in @old is mapped * to the n-th set bit in @new. In the more general case, allowing * for the possibility that the weight 'w' of @new is less than the * weight of @old, map the position of the n-th set bit in @old to * the position of the m-th set bit in @new, where m == n % w. * * If either of the @old and @new bitmaps are empty, or if @src and * @dst point to the same location, then this routine copies @src * to @dst. * * The positions of unset bits in @old are mapped to themselves * (the identity map). * * Apply the above specified mapping to @src, placing the result in * @dst, clearing any bits previously set in @dst. * * For example, lets say that @old has bits 4 through 7 set, and * @new has bits 12 through 15 set. This defines the mapping of bit * position 4 to 12, 5 to 13, 6 to 14 and 7 to 15, and of all other * bit positions unchanged. So if say @src comes into this routine * with bits 1, 5 and 7 set, then @dst should leave with bits 1, * 13 and 15 set. */ void bitmap_remap(unsigned long *dst, const unsigned long *src, const unsigned long *old, const unsigned long *new, unsigned int nbits) { unsigned int oldbit, w; if (dst == src) /* following doesn't handle inplace remaps */ return; bitmap_zero(dst, nbits); w = bitmap_weight(new, nbits); for_each_set_bit(oldbit, src, nbits) { int n = bitmap_pos_to_ord(old, oldbit, nbits); if (n < 0 || w == 0) set_bit(oldbit, dst); /* identity map */ else set_bit(find_nth_bit(new, nbits, n % w), dst); } } EXPORT_SYMBOL(bitmap_remap); /** * bitmap_bitremap - Apply map defined by a pair of bitmaps to a single bit * @oldbit: bit position to be mapped * @old: defines domain of map * @new: defines range of map * @bits: number of bits in each of these bitmaps * * Let @old and @new define a mapping of bit positions, such that * whatever position is held by the n-th set bit in @old is mapped * to the n-th set bit in @new. In the more general case, allowing * for the possibility that the weight 'w' of @new is less than the * weight of @old, map the position of the n-th set bit in @old to * the position of the m-th set bit in @new, where m == n % w. * * The positions of unset bits in @old are mapped to themselves * (the identity map). * * Apply the above specified mapping to bit position @oldbit, returning * the new bit position. * * For example, lets say that @old has bits 4 through 7 set, and * @new has bits 12 through 15 set. This defines the mapping of bit * position 4 to 12, 5 to 13, 6 to 14 and 7 to 15, and of all other * bit positions unchanged. So if say @oldbit is 5, then this routine * returns 13. */ int bitmap_bitremap(int oldbit, const unsigned long *old, const unsigned long *new, int bits) { int w = bitmap_weight(new, bits); int n = bitmap_pos_to_ord(old, oldbit, bits); if (n < 0 || w == 0) return oldbit; else return find_nth_bit(new, bits, n % w); } EXPORT_SYMBOL(bitmap_bitremap); #ifdef CONFIG_NUMA /** * bitmap_onto - translate one bitmap relative to another * @dst: resulting translated bitmap * @orig: original untranslated bitmap * @relmap: bitmap relative to which translated * @bits: number of bits in each of these bitmaps * * Set the n-th bit of @dst iff there exists some m such that the * n-th bit of @relmap is set, the m-th bit of @orig is set, and * the n-th bit of @relmap is also the m-th _set_ bit of @relmap. * (If you understood the previous sentence the first time your * read it, you're overqualified for your current job.) * * In other words, @orig is mapped onto (surjectively) @dst, * using the map { <n, m> | the n-th bit of @relmap is the * m-th set bit of @relmap }. * * Any set bits in @orig above bit number W, where W is the * weight of (number of set bits in) @relmap are mapped nowhere. * In particular, if for all bits m set in @orig, m >= W, then * @dst will end up empty. In situations where the possibility * of such an empty result is not desired, one way to avoid it is * to use the bitmap_fold() operator, below, to first fold the * @orig bitmap over itself so that all its set bits x are in the * range 0 <= x < W. The bitmap_fold() operator does this by * setting the bit (m % W) in @dst, for each bit (m) set in @orig. * * Example [1] for bitmap_onto(): * Let's say @relmap has bits 30-39 set, and @orig has bits * 1, 3, 5, 7, 9 and 11 set. Then on return from this routine, * @dst will have bits 31, 33, 35, 37 and 39 set. * * When bit 0 is set in @orig, it means turn on the bit in * @dst corresponding to whatever is the first bit (if any) * that is turned on in @relmap. Since bit 0 was off in the * above example, we leave off that bit (bit 30) in @dst. * * When bit 1 is set in @orig (as in the above example), it * means turn on the bit in @dst corresponding to whatever * is the second bit that is turned on in @relmap. The second * bit in @relmap that was turned on in the above example was * bit 31, so we turned on bit 31 in @dst. * * Similarly, we turned on bits 33, 35, 37 and 39 in @dst, * because they were the 4th, 6th, 8th and 10th set bits * set in @relmap, and the 4th, 6th, 8th and 10th bits of * @orig (i.e. bits 3, 5, 7 and 9) were also set. * * When bit 11 is set in @orig, it means turn on the bit in * @dst corresponding to whatever is the twelfth bit that is * turned on in @relmap. In the above example, there were * only ten bits turned on in @relmap (30..39), so that bit * 11 was set in @orig had no affect on @dst. * * Example [2] for bitmap_fold() + bitmap_onto(): * Let's say @relmap has these ten bits set:: * * 40 41 42 43 45 48 53 61 74 95 * * (for the curious, that's 40 plus the first ten terms of the * Fibonacci sequence.) * * Further lets say we use the following code, invoking * bitmap_fold() then bitmap_onto, as suggested above to * avoid the possibility of an empty @dst result:: * * unsigned long *tmp; // a temporary bitmap's bits * * bitmap_fold(tmp, orig, bitmap_weight(relmap, bits), bits); * bitmap_onto(dst, tmp, relmap, bits); * * Then this table shows what various values of @dst would be, for * various @orig's. I list the zero-based positions of each set bit. * The tmp column shows the intermediate result, as computed by * using bitmap_fold() to fold the @orig bitmap modulo ten * (the weight of @relmap): * * =============== ============== ================= * @orig tmp @dst * 0 0 40 * 1 1 41 * 9 9 95 * 10 0 40 [#f1]_ * 1 3 5 7 1 3 5 7 41 43 48 61 * 0 1 2 3 4 0 1 2 3 4 40 41 42 43 45 * 0 9 18 27 0 9 8 7 40 61 74 95 * 0 10 20 30 0 40 * 0 11 22 33 0 1 2 3 40 41 42 43 * 0 12 24 36 0 2 4 6 40 42 45 53 * 78 102 211 1 2 8 41 42 74 [#f1]_ * =============== ============== ================= * * .. [#f1] * * For these marked lines, if we hadn't first done bitmap_fold() * into tmp, then the @dst result would have been empty. * * If either of @orig or @relmap is empty (no set bits), then @dst * will be returned empty. * * If (as explained above) the only set bits in @orig are in positions * m where m >= W, (where W is the weight of @relmap) then @dst will * once again be returned empty. * * All bits in @dst not set by the above rule are cleared. */ void bitmap_onto(unsigned long *dst, const unsigned long *orig, const unsigned long *relmap, unsigned int bits) { unsigned int n, m; /* same meaning as in above comment */ if (dst == orig) /* following doesn't handle inplace mappings */ return; bitmap_zero(dst, bits); /* * The following code is a more efficient, but less * obvious, equivalent to the loop: * for (m = 0; m < bitmap_weight(relmap, bits); m++) { * n = find_nth_bit(orig, bits, m); * if (test_bit(m, orig)) * set_bit(n, dst); * } */ m = 0; for_each_set_bit(n, relmap, bits) { /* m == bitmap_pos_to_ord(relmap, n, bits) */ if (test_bit(m, orig)) set_bit(n, dst); m++; } } /** * bitmap_fold - fold larger bitmap into smaller, modulo specified size * @dst: resulting smaller bitmap * @orig: original larger bitmap * @sz: specified size * @nbits: number of bits in each of these bitmaps * * For each bit oldbit in @orig, set bit oldbit mod @sz in @dst. * Clear all other bits in @dst. See further the comment and * Example [2] for bitmap_onto() for why and how to use this. */ void bitmap_fold(unsigned long *dst, const unsigned long *orig, unsigned int sz, unsigned int nbits) { unsigned int oldbit; if (dst == orig) /* following doesn't handle inplace mappings */ return; bitmap_zero(dst, nbits); for_each_set_bit(oldbit, orig, nbits) set_bit(oldbit % sz, dst); } #endif /* CONFIG_NUMA */ unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags) { return kmalloc_array(BITS_TO_LONGS(nbits), sizeof(unsigned long), flags); } EXPORT_SYMBOL(bitmap_alloc); unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags) { return bitmap_alloc(nbits, flags | __GFP_ZERO); } EXPORT_SYMBOL(bitmap_zalloc); unsigned long *bitmap_alloc_node(unsigned int nbits, gfp_t flags, int node) { return kmalloc_array_node(BITS_TO_LONGS(nbits), sizeof(unsigned long), flags, node); } EXPORT_SYMBOL(bitmap_alloc_node); unsigned long *bitmap_zalloc_node(unsigned int nbits, gfp_t flags, int node) { return bitmap_alloc_node(nbits, flags | __GFP_ZERO, node); } EXPORT_SYMBOL(bitmap_zalloc_node); void bitmap_free(const unsigned long *bitmap) { kfree(bitmap); } EXPORT_SYMBOL(bitmap_free); static void devm_bitmap_free(void *data) { unsigned long *bitmap = data; bitmap_free(bitmap); } unsigned long *devm_bitmap_alloc(struct device *dev, unsigned int nbits, gfp_t flags) { unsigned long *bitmap; int ret; bitmap = bitmap_alloc(nbits, flags); if (!bitmap) return NULL; ret = devm_add_action_or_reset(dev, devm_bitmap_free, bitmap); if (ret) return NULL; return bitmap; } EXPORT_SYMBOL_GPL(devm_bitmap_alloc); unsigned long *devm_bitmap_zalloc(struct device *dev, unsigned int nbits, gfp_t flags) { return devm_bitmap_alloc(dev, nbits, flags | __GFP_ZERO); } EXPORT_SYMBOL_GPL(devm_bitmap_zalloc); #if BITS_PER_LONG == 64 /** * bitmap_from_arr32 - copy the contents of u32 array of bits to bitmap * @bitmap: array of unsigned longs, the destination bitmap * @buf: array of u32 (in host byte order), the source bitmap * @nbits: number of bits in @bitmap */ void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf, unsigned int nbits) { unsigned int i, halfwords; halfwords = DIV_ROUND_UP(nbits, 32); for (i = 0; i < halfwords; i++) { bitmap[i/2] = (unsigned long) buf[i]; if (++i < halfwords) bitmap[i/2] |= ((unsigned long) buf[i]) << 32; } /* Clear tail bits in last word beyond nbits. */ if (nbits % BITS_PER_LONG) bitmap[(halfwords - 1) / 2] &= BITMAP_LAST_WORD_MASK(nbits); } EXPORT_SYMBOL(bitmap_from_arr32); /** * bitmap_to_arr32 - copy the contents of bitmap to a u32 array of bits * @buf: array of u32 (in host byte order), the dest bitmap * @bitmap: array of unsigned longs, the source bitmap * @nbits: number of bits in @bitmap */ void bitmap_to_arr32(u32 *buf, const unsigned long *bitmap, unsigned int nbits) { unsigned int i, halfwords; halfwords = DIV_ROUND_UP(nbits, 32); for (i = 0; i < halfwords; i++) { buf[i] = (u32) (bitmap[i/2] & UINT_MAX); if (++i < halfwords) buf[i] = (u32) (bitmap[i/2] >> 32); } /* Clear tail bits in last element of array beyond nbits. */ if (nbits % BITS_PER_LONG) buf[halfwords - 1] &= (u32) (UINT_MAX >> ((-nbits) & 31)); } EXPORT_SYMBOL(bitmap_to_arr32); #endif #if BITS_PER_LONG == 32 /** * bitmap_from_arr64 - copy the contents of u64 array of bits to bitmap * @bitmap: array of unsigned longs, the destination bitmap * @buf: array of u64 (in host byte order), the source bitmap * @nbits: number of bits in @bitmap */ void bitmap_from_arr64(unsigned long *bitmap, const u64 *buf, unsigned int nbits) { int n; for (n = nbits; n > 0; n -= 64) { u64 val = *buf++; *bitmap++ = val; if (n > 32) *bitmap++ = val >> 32; } /* * Clear tail bits in the last word beyond nbits. * * Negative index is OK because here we point to the word next * to the last word of the bitmap, except for nbits == 0, which * is tested implicitly. */ if (nbits % BITS_PER_LONG) bitmap[-1] &= BITMAP_LAST_WORD_MASK(nbits); } EXPORT_SYMBOL(bitmap_from_arr64); /** * bitmap_to_arr64 - copy the contents of bitmap to a u64 array of bits * @buf: array of u64 (in host byte order), the dest bitmap * @bitmap: array of unsigned longs, the source bitmap * @nbits: number of bits in @bitmap */ void bitmap_to_arr64(u64 *buf, const unsigned long *bitmap, unsigned int nbits) { const unsigned long *end = bitmap + BITS_TO_LONGS(nbits); while (bitmap < end) { *buf = *bitmap++; if (bitmap < end) *buf |= (u64)(*bitmap++) << 32; buf++; } /* Clear tail bits in the last element of array beyond nbits. */ if (nbits % 64) buf[-1] &= GENMASK_ULL((nbits - 1) % 64, 0); } EXPORT_SYMBOL(bitmap_to_arr64); #endif
18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 /* mpicoder.c - Coder for the external representation of MPIs * Copyright (C) 1998, 1999 Free Software Foundation, Inc. * * This file is part of GnuPG. * * GnuPG is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * GnuPG is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */ #include <linux/bitops.h> #include <linux/byteorder/generic.h> #include <linux/count_zeros.h> #include <linux/export.h> #include <linux/scatterlist.h> #include <linux/string.h> #include "mpi-internal.h" #define MAX_EXTERN_MPI_BITS 16384 /** * mpi_read_raw_data - Read a raw byte stream as a positive integer * @xbuffer: The data to read * @nbytes: The amount of data to read */ MPI mpi_read_raw_data(const void *xbuffer, size_t nbytes) { const uint8_t *buffer = xbuffer; int i, j; unsigned nbits, nlimbs; mpi_limb_t a; MPI val = NULL; while (nbytes > 0 && buffer[0] == 0) { buffer++; nbytes--; } nbits = nbytes * 8; if (nbits > MAX_EXTERN_MPI_BITS) { pr_info("MPI: mpi too large (%u bits)\n", nbits); return NULL; } if (nbytes > 0) nbits -= count_leading_zeros(buffer[0]) - (BITS_PER_LONG - 8); nlimbs = DIV_ROUND_UP(nbytes, BYTES_PER_MPI_LIMB); val = mpi_alloc(nlimbs); if (!val) return NULL; val->nbits = nbits; val->sign = 0; val->nlimbs = nlimbs; if (nbytes > 0) { i = BYTES_PER_MPI_LIMB - nbytes % BYTES_PER_MPI_LIMB; i %= BYTES_PER_MPI_LIMB; for (j = nlimbs; j > 0; j--) { a = 0; for (; i < BYTES_PER_MPI_LIMB; i++) { a <<= 8; a |= *buffer++; } i = 0; val->d[j - 1] = a; } } return val; } EXPORT_SYMBOL_GPL(mpi_read_raw_data); MPI mpi_read_from_buffer(const void *xbuffer, unsigned *ret_nread) { const uint8_t *buffer = xbuffer; unsigned int nbits, nbytes; MPI val; if (*ret_nread < 2) return ERR_PTR(-EINVAL); nbits = buffer[0] << 8 | buffer[1]; if (nbits > MAX_EXTERN_MPI_BITS) { pr_info("MPI: mpi too large (%u bits)\n", nbits); return ERR_PTR(-EINVAL); } nbytes = DIV_ROUND_UP(nbits, 8); if (nbytes + 2 > *ret_nread) { pr_info("MPI: mpi larger than buffer nbytes=%u ret_nread=%u\n", nbytes, *ret_nread); return ERR_PTR(-EINVAL); } val = mpi_read_raw_data(buffer + 2, nbytes); if (!val) return ERR_PTR(-ENOMEM); *ret_nread = nbytes + 2; return val; } EXPORT_SYMBOL_GPL(mpi_read_from_buffer); static int count_lzeros(MPI a) { mpi_limb_t alimb; int i, lzeros = 0; for (i = a->nlimbs - 1; i >= 0; i--) { alimb = a->d[i]; if (alimb == 0) { lzeros += sizeof(mpi_limb_t); } else { lzeros += count_leading_zeros(alimb) / 8; break; } } return lzeros; } /** * mpi_read_buffer() - read MPI to a buffer provided by user (msb first) * * @a: a multi precision integer * @buf: buffer to which the output will be written to. Needs to be at * least mpi_get_size(a) long. * @buf_len: size of the buf. * @nbytes: receives the actual length of the data written on success and * the data to-be-written on -EOVERFLOW in case buf_len was too * small. * @sign: if not NULL, it will be set to the sign of a. * * Return: 0 on success or error code in case of error */ int mpi_read_buffer(MPI a, uint8_t *buf, unsigned buf_len, unsigned *nbytes, int *sign) { uint8_t *p; #if BYTES_PER_MPI_LIMB == 4 __be32 alimb; #elif BYTES_PER_MPI_LIMB == 8 __be64 alimb; #else #error please implement for this limb size. #endif unsigned int n = mpi_get_size(a); int i, lzeros; if (!buf || !nbytes) return -EINVAL; if (sign) *sign = a->sign; lzeros = count_lzeros(a); if (buf_len < n - lzeros) { *nbytes = n - lzeros; return -EOVERFLOW; } p = buf; *nbytes = n - lzeros; for (i = a->nlimbs - 1 - lzeros / BYTES_PER_MPI_LIMB, lzeros %= BYTES_PER_MPI_LIMB; i >= 0; i--) { #if BYTES_PER_MPI_LIMB == 4 alimb = cpu_to_be32(a->d[i]); #elif BYTES_PER_MPI_LIMB == 8 alimb = cpu_to_be64(a->d[i]); #else #error please implement for this limb size. #endif memcpy(p, (u8 *)&alimb + lzeros, BYTES_PER_MPI_LIMB - lzeros); p += BYTES_PER_MPI_LIMB - lzeros; lzeros = 0; } return 0; } EXPORT_SYMBOL_GPL(mpi_read_buffer); /* * mpi_get_buffer() - Returns an allocated buffer with the MPI (msb first). * Caller must free the return string. * This function does return a 0 byte buffer with nbytes set to zero if the * value of A is zero. * * @a: a multi precision integer. * @nbytes: receives the length of this buffer. * @sign: if not NULL, it will be set to the sign of the a. * * Return: Pointer to MPI buffer or NULL on error */ void *mpi_get_buffer(MPI a, unsigned *nbytes, int *sign) { uint8_t *buf; unsigned int n; int ret; if (!nbytes) return NULL; n = mpi_get_size(a); if (!n) n++; buf = kmalloc(n, GFP_KERNEL); if (!buf) return NULL; ret = mpi_read_buffer(a, buf, n, nbytes, sign); if (ret) { kfree(buf); return NULL; } return buf; } EXPORT_SYMBOL_GPL(mpi_get_buffer); /** * mpi_write_to_sgl() - Funnction exports MPI to an sgl (msb first) * * This function works in the same way as the mpi_read_buffer, but it * takes an sgl instead of u8 * buf. * * @a: a multi precision integer * @sgl: scatterlist to write to. Needs to be at least * mpi_get_size(a) long. * @nbytes: the number of bytes to write. Leading bytes will be * filled with zero. * @sign: if not NULL, it will be set to the sign of a. * * Return: 0 on success or error code in case of error */ int mpi_write_to_sgl(MPI a, struct scatterlist *sgl, unsigned nbytes, int *sign) { u8 *p, *p2; #if BYTES_PER_MPI_LIMB == 4 __be32 alimb; #elif BYTES_PER_MPI_LIMB == 8 __be64 alimb; #else #error please implement for this limb size. #endif unsigned int n = mpi_get_size(a); struct sg_mapping_iter miter; int i, x, buf_len; int nents; if (sign) *sign = a->sign; if (nbytes < n) return -EOVERFLOW; nents = sg_nents_for_len(sgl, nbytes); if (nents < 0) return -EINVAL; sg_miter_start(&miter, sgl, nents, SG_MITER_ATOMIC | SG_MITER_TO_SG); sg_miter_next(&miter); buf_len = miter.length; p2 = miter.addr; while (nbytes > n) { i = min_t(unsigned, nbytes - n, buf_len); memset(p2, 0, i); p2 += i; nbytes -= i; buf_len -= i; if (!buf_len) { sg_miter_next(&miter); buf_len = miter.length; p2 = miter.addr; } } for (i = a->nlimbs - 1; i >= 0; i--) { #if BYTES_PER_MPI_LIMB == 4 alimb = a->d[i] ? cpu_to_be32(a->d[i]) : 0; #elif BYTES_PER_MPI_LIMB == 8 alimb = a->d[i] ? cpu_to_be64(a->d[i]) : 0; #else #error please implement for this limb size. #endif p = (u8 *)&alimb; for (x = 0; x < sizeof(alimb); x++) { *p2++ = *p++; if (!--buf_len) { sg_miter_next(&miter); buf_len = miter.length; p2 = miter.addr; } } } sg_miter_stop(&miter); return 0; } EXPORT_SYMBOL_GPL(mpi_write_to_sgl); /* * mpi_read_raw_from_sgl() - Function allocates an MPI and populates it with * data from the sgl * * This function works in the same way as the mpi_read_raw_data, but it * takes an sgl instead of void * buffer. i.e. it allocates * a new MPI and reads the content of the sgl to the MPI. * * @sgl: scatterlist to read from * @nbytes: number of bytes to read * * Return: Pointer to a new MPI or NULL on error */ MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int nbytes) { struct sg_mapping_iter miter; unsigned int nbits, nlimbs; int x, j, z, lzeros, ents; unsigned int len; const u8 *buff; mpi_limb_t a; MPI val = NULL; ents = sg_nents_for_len(sgl, nbytes); if (ents < 0) return NULL; sg_miter_start(&miter, sgl, ents, SG_MITER_ATOMIC | SG_MITER_FROM_SG); lzeros = 0; len = 0; while (nbytes > 0) { while (len && !*buff) { lzeros++; len--; buff++; } if (len && *buff) break; sg_miter_next(&miter); buff = miter.addr; len = miter.length; nbytes -= lzeros; lzeros = 0; } miter.consumed = lzeros; nbytes -= lzeros; nbits = nbytes * 8; if (nbits > MAX_EXTERN_MPI_BITS) { sg_miter_stop(&miter); pr_info("MPI: mpi too large (%u bits)\n", nbits); return NULL; } if (nbytes > 0) nbits -= count_leading_zeros(*buff) - (BITS_PER_LONG - 8); sg_miter_stop(&miter); nlimbs = DIV_ROUND_UP(nbytes, BYTES_PER_MPI_LIMB); val = mpi_alloc(nlimbs); if (!val) return NULL; val->nbits = nbits; val->sign = 0; val->nlimbs = nlimbs; if (nbytes == 0) return val; j = nlimbs - 1; a = 0; z = BYTES_PER_MPI_LIMB - nbytes % BYTES_PER_MPI_LIMB; z %= BYTES_PER_MPI_LIMB; while (sg_miter_next(&miter)) { buff = miter.addr; len = min(miter.length, nbytes); nbytes -= len; for (x = 0; x < len; x++) { a <<= 8; a |= *buff++; if (((z + x + 1) % BYTES_PER_MPI_LIMB) == 0) { val->d[j--] = a; a = 0; } } z += x; } return val; } EXPORT_SYMBOL_GPL(mpi_read_raw_from_sgl);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef LINUX_MLD_H #define LINUX_MLD_H #include <linux/in6.h> #include <linux/icmpv6.h> /* MLDv1 Query/Report/Done */ struct mld_msg { struct icmp6hdr mld_hdr; struct in6_addr mld_mca; }; #define mld_type mld_hdr.icmp6_type #define mld_code mld_hdr.icmp6_code #define mld_cksum mld_hdr.icmp6_cksum #define mld_maxdelay mld_hdr.icmp6_maxdelay #define mld_reserved mld_hdr.icmp6_dataun.un_data16[1] /* Multicast Listener Discovery version 2 headers */ /* MLDv2 Report */ struct mld2_grec { __u8 grec_type; __u8 grec_auxwords; __be16 grec_nsrcs; struct in6_addr grec_mca; struct in6_addr grec_src[]; }; struct mld2_report { struct icmp6hdr mld2r_hdr; struct mld2_grec mld2r_grec[]; }; #define mld2r_type mld2r_hdr.icmp6_type #define mld2r_resv1 mld2r_hdr.icmp6_code #define mld2r_cksum mld2r_hdr.icmp6_cksum #define mld2r_resv2 mld2r_hdr.icmp6_dataun.un_data16[0] #define mld2r_ngrec mld2r_hdr.icmp6_dataun.un_data16[1] /* MLDv2 Query */ struct mld2_query { struct icmp6hdr mld2q_hdr; struct in6_addr mld2q_mca; #if defined(__LITTLE_ENDIAN_BITFIELD) __u8 mld2q_qrv:3, mld2q_suppress:1, mld2q_resv2:4; #elif defined(__BIG_ENDIAN_BITFIELD) __u8 mld2q_resv2:4, mld2q_suppress:1, mld2q_qrv:3; #else #error "Please fix <asm/byteorder.h>" #endif __u8 mld2q_qqic; __be16 mld2q_nsrcs; struct in6_addr mld2q_srcs[]; }; #define mld2q_type mld2q_hdr.icmp6_type #define mld2q_code mld2q_hdr.icmp6_code #define mld2q_cksum mld2q_hdr.icmp6_cksum #define mld2q_mrc mld2q_hdr.icmp6_maxdelay #define mld2q_resv1 mld2q_hdr.icmp6_dataun.un_data16[1] /* RFC3810, 5.1.3. Maximum Response Code: * * If Maximum Response Code >= 32768, Maximum Response Code represents a * floating-point value as follows: * * 0 1 2 3 4 5 6 7 8 9 A B C D E F * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |1| exp | mant | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ #define MLDV2_MRC_EXP(value) (((value) >> 12) & 0x0007) #define MLDV2_MRC_MAN(value) ((value) & 0x0fff) /* RFC3810, 5.1.9. QQIC (Querier's Query Interval Code): * * If QQIC >= 128, QQIC represents a floating-point value as follows: * * 0 1 2 3 4 5 6 7 * +-+-+-+-+-+-+-+-+ * |1| exp | mant | * +-+-+-+-+-+-+-+-+ */ #define MLDV2_QQIC_EXP(value) (((value) >> 4) & 0x07) #define MLDV2_QQIC_MAN(value) ((value) & 0x0f) #define MLD_EXP_MIN_LIMIT 32768UL #define MLDV1_MRD_MAX_COMPAT (MLD_EXP_MIN_LIMIT - 1) #define MLD_MAX_QUEUE 8 #define MLD_MAX_SKBS 32 static inline unsigned long mldv2_mrc(const struct mld2_query *mlh2) { /* RFC3810, 5.1.3. Maximum Response Code */ unsigned long ret, mc_mrc = ntohs(mlh2->mld2q_mrc); if (mc_mrc < MLD_EXP_MIN_LIMIT) { ret = mc_mrc; } else { unsigned long mc_man, mc_exp; mc_exp = MLDV2_MRC_EXP(mc_mrc); mc_man = MLDV2_MRC_MAN(mc_mrc); ret = (mc_man | 0x1000) << (mc_exp + 3); } return ret; } #endif
1 1 1 1 1 1 36 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 // SPDX-License-Identifier: GPL-2.0-only /* * Linux VM pressure * * Copyright 2012 Linaro Ltd. * Anton Vorontsov <anton.vorontsov@linaro.org> * * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro, * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg. */ #include <linux/cgroup.h> #include <linux/fs.h> #include <linux/log2.h> #include <linux/sched.h> #include <linux/mm.h> #include <linux/vmstat.h> #include <linux/eventfd.h> #include <linux/slab.h> #include <linux/swap.h> #include <linux/printk.h> #include <linux/vmpressure.h> /* * The window size (vmpressure_win) is the number of scanned pages before * we try to analyze scanned/reclaimed ratio. So the window is used as a * rate-limit tunable for the "low" level notification, and also for * averaging the ratio for medium/critical levels. Using small window * sizes can cause lot of false positives, but too big window size will * delay the notifications. * * As the vmscan reclaimer logic works with chunks which are multiple of * SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well. * * TODO: Make the window size depend on machine size, as we do for vmstat * thresholds. Currently we set it to 512 pages (2MB for 4KB pages). */ static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16; /* * These thresholds are used when we account memory pressure through * scanned/reclaimed ratio. The current values were chosen empirically. In * essence, they are percents: the higher the value, the more number * unsuccessful reclaims there were. */ static const unsigned int vmpressure_level_med = 60; static const unsigned int vmpressure_level_critical = 95; /* * When there are too little pages left to scan, vmpressure() may miss the * critical pressure as number of pages will be less than "window size". * However, in that case the vmscan priority will raise fast as the * reclaimer will try to scan LRUs more deeply. * * The vmscan logic considers these special priorities: * * prio == DEF_PRIORITY (12): reclaimer starts with that value * prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed * prio == 0 : close to OOM, kernel scans every page in an lru * * Any value in this range is acceptable for this tunable (i.e. from 12 to * 0). Current value for the vmpressure_level_critical_prio is chosen * empirically, but the number, in essence, means that we consider * critical level when scanning depth is ~10% of the lru size (vmscan * scans 'lru_size >> prio' pages, so it is actually 12.5%, or one * eights). */ static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10); static struct vmpressure *work_to_vmpressure(struct work_struct *work) { return container_of(work, struct vmpressure, work); } static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr) { struct mem_cgroup *memcg = vmpressure_to_memcg(vmpr); memcg = parent_mem_cgroup(memcg); if (!memcg) return NULL; return memcg_to_vmpressure(memcg); } enum vmpressure_levels { VMPRESSURE_LOW = 0, VMPRESSURE_MEDIUM, VMPRESSURE_CRITICAL, VMPRESSURE_NUM_LEVELS, }; enum vmpressure_modes { VMPRESSURE_NO_PASSTHROUGH = 0, VMPRESSURE_HIERARCHY, VMPRESSURE_LOCAL, VMPRESSURE_NUM_MODES, }; static const char * const vmpressure_str_levels[] = { [VMPRESSURE_LOW] = "low", [VMPRESSURE_MEDIUM] = "medium", [VMPRESSURE_CRITICAL] = "critical", }; static const char * const vmpressure_str_modes[] = { [VMPRESSURE_NO_PASSTHROUGH] = "default", [VMPRESSURE_HIERARCHY] = "hierarchy", [VMPRESSURE_LOCAL] = "local", }; static enum vmpressure_levels vmpressure_level(unsigned long pressure) { if (pressure >= vmpressure_level_critical) return VMPRESSURE_CRITICAL; else if (pressure >= vmpressure_level_med) return VMPRESSURE_MEDIUM; return VMPRESSURE_LOW; } static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned, unsigned long reclaimed) { unsigned long scale = scanned + reclaimed; unsigned long pressure = 0; /* * reclaimed can be greater than scanned for things such as reclaimed * slab pages. shrink_node() just adds reclaimed pages without a * related increment to scanned pages. */ if (reclaimed >= scanned) goto out; /* * We calculate the ratio (in percents) of how many pages were * scanned vs. reclaimed in a given time frame (window). Note that * time is in VM reclaimer's "ticks", i.e. number of pages * scanned. This makes it possible to set desired reaction time * and serves as a ratelimit. */ pressure = scale - (reclaimed * scale / scanned); pressure = pressure * 100 / scale; out: pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure, scanned, reclaimed); return vmpressure_level(pressure); } struct vmpressure_event { struct eventfd_ctx *efd; enum vmpressure_levels level; enum vmpressure_modes mode; struct list_head node; }; static bool vmpressure_event(struct vmpressure *vmpr, const enum vmpressure_levels level, bool ancestor, bool signalled) { struct vmpressure_event *ev; bool ret = false; mutex_lock(&vmpr->events_lock); list_for_each_entry(ev, &vmpr->events, node) { if (ancestor && ev->mode == VMPRESSURE_LOCAL) continue; if (signalled && ev->mode == VMPRESSURE_NO_PASSTHROUGH) continue; if (level < ev->level) continue; eventfd_signal(ev->efd); ret = true; } mutex_unlock(&vmpr->events_lock); return ret; } static void vmpressure_work_fn(struct work_struct *work) { struct vmpressure *vmpr = work_to_vmpressure(work); unsigned long scanned; unsigned long reclaimed; enum vmpressure_levels level; bool ancestor = false; bool signalled = false; spin_lock(&vmpr->sr_lock); /* * Several contexts might be calling vmpressure(), so it is * possible that the work was rescheduled again before the old * work context cleared the counters. In that case we will run * just after the old work returns, but then scanned might be zero * here. No need for any locks here since we don't care if * vmpr->reclaimed is in sync. */ scanned = vmpr->tree_scanned; if (!scanned) { spin_unlock(&vmpr->sr_lock); return; } reclaimed = vmpr->tree_reclaimed; vmpr->tree_scanned = 0; vmpr->tree_reclaimed = 0; spin_unlock(&vmpr->sr_lock); level = vmpressure_calc_level(scanned, reclaimed); do { if (vmpressure_event(vmpr, level, ancestor, signalled)) signalled = true; ancestor = true; } while ((vmpr = vmpressure_parent(vmpr))); } /** * vmpressure() - Account memory pressure through scanned/reclaimed ratio * @gfp: reclaimer's gfp mask * @memcg: cgroup memory controller handle * @tree: legacy subtree mode * @scanned: number of pages scanned * @reclaimed: number of pages reclaimed * * This function should be called from the vmscan reclaim path to account * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw * pressure index is then further refined and averaged over time. * * If @tree is set, vmpressure is in traditional userspace reporting * mode: @memcg is considered the pressure root and userspace is * notified of the entire subtree's reclaim efficiency. * * If @tree is not set, reclaim efficiency is recorded for @memcg, and * only in-kernel users are notified. * * This function does not return any value. */ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, unsigned long scanned, unsigned long reclaimed) { struct vmpressure *vmpr; if (mem_cgroup_disabled()) return; /* * The in-kernel users only care about the reclaim efficiency * for this @memcg rather than the whole subtree, and there * isn't and won't be any in-kernel user in a legacy cgroup. */ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !tree) return; vmpr = memcg_to_vmpressure(memcg); /* * Here we only want to account pressure that userland is able to * help us with. For example, suppose that DMA zone is under * pressure; if we notify userland about that kind of pressure, * then it will be mostly a waste as it will trigger unnecessary * freeing of memory by userland (since userland is more likely to * have HIGHMEM/MOVABLE pages instead of the DMA fallback). That * is why we include only movable, highmem and FS/IO pages. * Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so * we account it too. */ if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS))) return; /* * If we got here with no pages scanned, then that is an indicator * that reclaimer was unable to find any shrinkable LRUs at the * current scanning depth. But it does not mean that we should * report the critical pressure, yet. If the scanning priority * (scanning depth) goes too high (deep), we will be notified * through vmpressure_prio(). But so far, keep calm. */ if (!scanned) return; if (tree) { spin_lock(&vmpr->sr_lock); scanned = vmpr->tree_scanned += scanned; vmpr->tree_reclaimed += reclaimed; spin_unlock(&vmpr->sr_lock); if (scanned < vmpressure_win) return; schedule_work(&vmpr->work); } else { enum vmpressure_levels level; /* For now, no users for root-level efficiency */ if (!memcg || mem_cgroup_is_root(memcg)) return; spin_lock(&vmpr->sr_lock); scanned = vmpr->scanned += scanned; reclaimed = vmpr->reclaimed += reclaimed; if (scanned < vmpressure_win) { spin_unlock(&vmpr->sr_lock); return; } vmpr->scanned = vmpr->reclaimed = 0; spin_unlock(&vmpr->sr_lock); level = vmpressure_calc_level(scanned, reclaimed); if (level > VMPRESSURE_LOW) { /* * Let the socket buffer allocator know that * we are having trouble reclaiming LRU pages. * * For hysteresis keep the pressure state * asserted for a second in which subsequent * pressure events can occur. */ mem_cgroup_set_socket_pressure(memcg); } } } /** * vmpressure_prio() - Account memory pressure through reclaimer priority level * @gfp: reclaimer's gfp mask * @memcg: cgroup memory controller handle * @prio: reclaimer's priority * * This function should be called from the reclaim path every time when * the vmscan's reclaiming priority (scanning depth) changes. * * This function does not return any value. */ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) { /* * We only use prio for accounting critical level. For more info * see comment for vmpressure_level_critical_prio variable above. */ if (prio > vmpressure_level_critical_prio) return; /* * OK, the prio is below the threshold, updating vmpressure * information before shrinker dives into long shrinking of long * range vmscan. Passing scanned = vmpressure_win, reclaimed = 0 * to the vmpressure() basically means that we signal 'critical' * level. */ vmpressure(gfp, memcg, true, vmpressure_win, 0); } #define MAX_VMPRESSURE_ARGS_LEN (strlen("critical") + strlen("hierarchy") + 2) /** * vmpressure_register_event() - Bind vmpressure notifications to an eventfd * @memcg: memcg that is interested in vmpressure notifications * @eventfd: eventfd context to link notifications with * @args: event arguments (pressure level threshold, optional mode) * * This function associates eventfd context with the vmpressure * infrastructure, so that the notifications will be delivered to the * @eventfd. The @args parameter is a comma-delimited string that denotes a * pressure level threshold (one of vmpressure_str_levels, i.e. "low", "medium", * or "critical") and an optional mode (one of vmpressure_str_modes, i.e. * "hierarchy" or "local"). * * To be used as memcg event method. * * Return: 0 on success, -ENOMEM on memory failure or -EINVAL if @args could * not be parsed. */ int vmpressure_register_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd, const char *args) { struct vmpressure *vmpr = memcg_to_vmpressure(memcg); struct vmpressure_event *ev; enum vmpressure_modes mode = VMPRESSURE_NO_PASSTHROUGH; enum vmpressure_levels level; char *spec, *spec_orig; char *token; int ret = 0; spec_orig = spec = kstrndup(args, MAX_VMPRESSURE_ARGS_LEN, GFP_KERNEL); if (!spec) return -ENOMEM; /* Find required level */ token = strsep(&spec, ","); ret = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token); if (ret < 0) goto out; level = ret; /* Find optional mode */ token = strsep(&spec, ","); if (token) { ret = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token); if (ret < 0) goto out; mode = ret; } ev = kzalloc_obj(*ev); if (!ev) { ret = -ENOMEM; goto out; } ev->efd = eventfd; ev->level = level; ev->mode = mode; mutex_lock(&vmpr->events_lock); list_add(&ev->node, &vmpr->events); mutex_unlock(&vmpr->events_lock); ret = 0; out: kfree(spec_orig); return ret; } /** * vmpressure_unregister_event() - Unbind eventfd from vmpressure * @memcg: memcg handle * @eventfd: eventfd context that was used to link vmpressure with the @cg * * This function does internal manipulations to detach the @eventfd from * the vmpressure notifications, and then frees internal resources * associated with the @eventfd (but the @eventfd itself is not freed). * * To be used as memcg event method. */ void vmpressure_unregister_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd) { struct vmpressure *vmpr = memcg_to_vmpressure(memcg); struct vmpressure_event *ev; mutex_lock(&vmpr->events_lock); list_for_each_entry(ev, &vmpr->events, node) { if (ev->efd != eventfd) continue; list_del(&ev->node); kfree(ev); break; } mutex_unlock(&vmpr->events_lock); } /** * vmpressure_init() - Initialize vmpressure control structure * @vmpr: Structure to be initialized * * This function should be called on every allocated vmpressure structure * before any usage. */ void vmpressure_init(struct vmpressure *vmpr) { spin_lock_init(&vmpr->sr_lock); mutex_init(&vmpr->events_lock); INIT_LIST_HEAD(&vmpr->events); INIT_WORK(&vmpr->work, vmpressure_work_fn); } /** * vmpressure_cleanup() - shuts down vmpressure control structure * @vmpr: Structure to be cleaned up * * This function should be called before the structure in which it is * embedded is cleaned up. */ void vmpressure_cleanup(struct vmpressure *vmpr) { /* * Make sure there is no pending work before eventfd infrastructure * goes away. */ flush_work(&vmpr->work); }
5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NF_TABLES_IPV4_H_ #define _NF_TABLES_IPV4_H_ #include <net/netfilter/nf_tables.h> #include <net/ip.h> static inline void nft_set_pktinfo_ipv4(struct nft_pktinfo *pkt) { struct iphdr *ip; ip = ip_hdr(pkt->skb); pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = ip->protocol; pkt->thoff = ip_hdrlen(pkt->skb); pkt->fragoff = ntohs(ip->frag_off) & IP_OFFSET; } static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) { struct iphdr *iph, _iph; u32 len, thoff, skb_len; iph = skb_header_pointer(pkt->skb, skb_network_offset(pkt->skb), sizeof(*iph), &_iph); if (!iph) return -1; if (iph->ihl < 5 || iph->version != 4) return -1; len = iph_totlen(pkt->skb, iph); thoff = iph->ihl * 4; skb_len = pkt->skb->len - skb_network_offset(pkt->skb); if (skb_len < len) return -1; else if (len < thoff) return -1; else if (thoff < sizeof(*iph)) return -1; pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = iph->protocol; pkt->thoff = skb_network_offset(pkt->skb) + thoff; pkt->fragoff = ntohs(iph->frag_off) & IP_OFFSET; return 0; } static inline void nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) { if (__nft_set_pktinfo_ipv4_validate(pkt) < 0) nft_set_pktinfo_unspec(pkt); } static inline int nft_set_pktinfo_ipv4_ingress(struct nft_pktinfo *pkt) { struct iphdr *iph; u32 len, thoff; if (!pskb_may_pull(pkt->skb, sizeof(*iph))) return -1; iph = ip_hdr(pkt->skb); if (iph->ihl < 5 || iph->version != 4) goto inhdr_error; len = iph_totlen(pkt->skb, iph); thoff = iph->ihl * 4; if (pkt->skb->len < len) { __IP_INC_STATS(nft_net(pkt), IPSTATS_MIB_INTRUNCATEDPKTS); return -1; } else if (len < thoff) { goto inhdr_error; } else if (thoff < sizeof(*iph)) { return -1; } pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = iph->protocol; pkt->thoff = thoff; pkt->fragoff = ntohs(iph->frag_off) & IP_OFFSET; return 0; inhdr_error: __IP_INC_STATS(nft_net(pkt), IPSTATS_MIB_INHDRERRORS); return -1; } #endif
8 8 8 398 399 398 398 399 22 12 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 1995 Linus Torvalds * * Pentium III FXSR, SSE support * Gareth Hughes <gareth@valinux.com>, May 2000 * * X86-64 port * Andi Kleen. * * CPU hotplug support - ashok.raj@intel.com */ /* * This file handles the architecture-dependent parts of process handling.. */ #include <linux/cpu.h> #include <linux/errno.h> #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/elfcore.h> #include <linux/smp.h> #include <linux/slab.h> #include <linux/user.h> #include <linux/interrupt.h> #include <linux/delay.h> #include <linux/export.h> #include <linux/kvm_types.h> #include <linux/ptrace.h> #include <linux/notifier.h> #include <linux/kprobes.h> #include <linux/kdebug.h> #include <linux/prctl.h> #include <linux/uaccess.h> #include <linux/io.h> #include <linux/ftrace.h> #include <linux/syscalls.h> #include <linux/iommu.h> #include <asm/processor.h> #include <asm/pkru.h> #include <asm/fpu/sched.h> #include <asm/mmu_context.h> #include <asm/prctl.h> #include <asm/desc.h> #include <asm/proto.h> #include <asm/ia32.h> #include <asm/debugreg.h> #include <asm/switch_to.h> #include <asm/xen/hypervisor.h> #include <asm/vdso.h> #include <asm/resctrl.h> #include <asm/unistd.h> #include <asm/fsgsbase.h> #include <asm/fred.h> #include <asm/msr.h> #ifdef CONFIG_IA32_EMULATION /* Not included via unistd.h */ #include <asm/unistd_32_ia32.h> #endif #include "process.h" /* Prints also some state that isn't saved in the pt_regs */ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, const char *log_lvl) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; unsigned long d0, d1, d2, d3, d6, d7; unsigned int fsindex, gsindex; unsigned int ds, es; show_iret_regs(regs, log_lvl); if (regs->orig_ax != -1) pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax); else pr_cont("\n"); printk("%sRAX: %016lx RBX: %016lx RCX: %016lx\n", log_lvl, regs->ax, regs->bx, regs->cx); printk("%sRDX: %016lx RSI: %016lx RDI: %016lx\n", log_lvl, regs->dx, regs->si, regs->di); printk("%sRBP: %016lx R08: %016lx R09: %016lx\n", log_lvl, regs->bp, regs->r8, regs->r9); printk("%sR10: %016lx R11: %016lx R12: %016lx\n", log_lvl, regs->r10, regs->r11, regs->r12); printk("%sR13: %016lx R14: %016lx R15: %016lx\n", log_lvl, regs->r13, regs->r14, regs->r15); if (mode == SHOW_REGS_SHORT) return; if (mode == SHOW_REGS_USER) { rdmsrq(MSR_FS_BASE, fs); rdmsrq(MSR_KERNEL_GS_BASE, shadowgs); printk("%sFS: %016lx GS: %016lx\n", log_lvl, fs, shadowgs); return; } asm("movl %%ds,%0" : "=r" (ds)); asm("movl %%es,%0" : "=r" (es)); asm("movl %%fs,%0" : "=r" (fsindex)); asm("movl %%gs,%0" : "=r" (gsindex)); rdmsrq(MSR_FS_BASE, fs); rdmsrq(MSR_GS_BASE, gs); rdmsrq(MSR_KERNEL_GS_BASE, shadowgs); cr0 = read_cr0(); cr2 = read_cr2(); cr3 = __read_cr3(); cr4 = __read_cr4(); printk("%sFS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", log_lvl, fs, fsindex, gs, gsindex, shadowgs); printk("%sCS: %04x DS: %04x ES: %04x CR0: %016lx\n", log_lvl, regs->cs, ds, es, cr0); printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n", log_lvl, cr2, cr3, cr4); get_debugreg(d0, 0); get_debugreg(d1, 1); get_debugreg(d2, 2); get_debugreg(d3, 3); get_debugreg(d6, 6); get_debugreg(d7, 7); /* Only print out debug registers if they are in their non-default state. */ if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) && (d6 == DR6_RESERVED) && (d7 == DR7_FIXED_1))) { printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n", log_lvl, d0, d1, d2); printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n", log_lvl, d3, d6, d7); } if (cr4 & X86_CR4_PKE) printk("%sPKRU: %08x\n", log_lvl, read_pkru()); } void release_thread(struct task_struct *dead_task) { WARN_ON(dead_task->mm); } enum which_selector { FS, GS }; /* * Out of line to be protected from kprobes and tracing. If this would be * traced or probed than any access to a per CPU variable happens with * the wrong GS. * * It is not used on Xen paravirt. When paravirt support is needed, it * needs to be renamed with native_ prefix. */ static noinstr unsigned long __rdgsbase_inactive(void) { unsigned long gsbase; lockdep_assert_irqs_disabled(); /* * SWAPGS is no longer needed thus NOT allowed with FRED because * FRED transitions ensure that an operating system can _always_ * operate with its own GS base address: * - For events that occur in ring 3, FRED event delivery swaps * the GS base address with the IA32_KERNEL_GS_BASE MSR. * - ERETU (the FRED transition that returns to ring 3) also swaps * the GS base address with the IA32_KERNEL_GS_BASE MSR. * * And the operating system can still setup the GS segment for a * user thread without the need of loading a user thread GS with: * - Using LKGS, available with FRED, to modify other attributes * of the GS segment without compromising its ability always to * operate with its own GS base address. * - Accessing the GS segment base address for a user thread as * before using RDMSR or WRMSR on the IA32_KERNEL_GS_BASE MSR. * * Note, LKGS loads the GS base address into the IA32_KERNEL_GS_BASE * MSR instead of the GS segment’s descriptor cache. As such, the * operating system never changes its runtime GS base address. */ if (!cpu_feature_enabled(X86_FEATURE_FRED) && !cpu_feature_enabled(X86_FEATURE_XENPV)) { native_swapgs(); gsbase = rdgsbase(); native_swapgs(); } else { instrumentation_begin(); rdmsrq(MSR_KERNEL_GS_BASE, gsbase); instrumentation_end(); } return gsbase; } /* * Out of line to be protected from kprobes and tracing. If this would be * traced or probed than any access to a per CPU variable happens with * the wrong GS. * * It is not used on Xen paravirt. When paravirt support is needed, it * needs to be renamed with native_ prefix. */ static noinstr void __wrgsbase_inactive(unsigned long gsbase) { lockdep_assert_irqs_disabled(); if (!cpu_feature_enabled(X86_FEATURE_FRED) && !cpu_feature_enabled(X86_FEATURE_XENPV)) { native_swapgs(); wrgsbase(gsbase); native_swapgs(); } else { instrumentation_begin(); wrmsrq(MSR_KERNEL_GS_BASE, gsbase); instrumentation_end(); } } /* * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are * not available. The goal is to be reasonably fast on non-FSGSBASE systems. * It's forcibly inlined because it'll generate better code and this function * is hot. */ static __always_inline void save_base_legacy(struct task_struct *prev_p, unsigned short selector, enum which_selector which) { if (likely(selector == 0)) { /* * On Intel (without X86_BUG_NULL_SEG), the segment base could * be the pre-existing saved base or it could be zero. On AMD * (with X86_BUG_NULL_SEG), the segment base could be almost * anything. * * This branch is very hot (it's hit twice on almost every * context switch between 64-bit programs), and avoiding * the RDMSR helps a lot, so we just assume that whatever * value is already saved is correct. This matches historical * Linux behavior, so it won't break existing applications. * * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we * report that the base is zero, it needs to actually be zero: * see the corresponding logic in load_seg_legacy. */ } else { /* * If the selector is 1, 2, or 3, then the base is zero on * !X86_BUG_NULL_SEG CPUs and could be anything on * X86_BUG_NULL_SEG CPUs. In the latter case, Linux * has never attempted to preserve the base across context * switches. * * If selector > 3, then it refers to a real segment, and * saving the base isn't necessary. */ if (which == FS) prev_p->thread.fsbase = 0; else prev_p->thread.gsbase = 0; } } static __always_inline void save_fsgs(struct task_struct *task) { savesegment(fs, task->thread.fsindex); savesegment(gs, task->thread.gsindex); if (static_cpu_has(X86_FEATURE_FSGSBASE)) { /* * If FSGSBASE is enabled, we can't make any useful guesses * about the base, and user code expects us to save the current * value. Fortunately, reading the base directly is efficient. */ task->thread.fsbase = rdfsbase(); task->thread.gsbase = __rdgsbase_inactive(); } else { save_base_legacy(task, task->thread.fsindex, FS); save_base_legacy(task, task->thread.gsindex, GS); } } /* * While a process is running,current->thread.fsbase and current->thread.gsbase * may not match the corresponding CPU registers (see save_base_legacy()). */ void current_save_fsgs(void) { unsigned long flags; /* Interrupts need to be off for FSGSBASE */ local_irq_save(flags); save_fsgs(current); local_irq_restore(flags); } EXPORT_SYMBOL_FOR_KVM(current_save_fsgs); static __always_inline void loadseg(enum which_selector which, unsigned short sel) { if (which == FS) loadsegment(fs, sel); else load_gs_index(sel); } static __always_inline void load_seg_legacy(unsigned short prev_index, unsigned long prev_base, unsigned short next_index, unsigned long next_base, enum which_selector which) { if (likely(next_index <= 3)) { /* * The next task is using 64-bit TLS, is not using this * segment at all, or is having fun with arcane CPU features. */ if (next_base == 0) { /* * Nasty case: on AMD CPUs, we need to forcibly zero * the base. */ if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { loadseg(which, __USER_DS); loadseg(which, next_index); } else { /* * We could try to exhaustively detect cases * under which we can skip the segment load, * but there's really only one case that matters * for performance: if both the previous and * next states are fully zeroed, we can skip * the load. * * (This assumes that prev_base == 0 has no * false positives. This is the case on * Intel-style CPUs.) */ if (likely(prev_index | next_index | prev_base)) loadseg(which, next_index); } } else { if (prev_index != next_index) loadseg(which, next_index); wrmsrq(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE, next_base); } } else { /* * The next task is using a real segment. Loading the selector * is sufficient. */ loadseg(which, next_index); } } /* * Store prev's PKRU value and load next's PKRU value if they differ. PKRU * is not XSTATE managed on context switch because that would require a * lookup in the task's FPU xsave buffer and require to keep that updated * in various places. */ static __always_inline void x86_pkru_load(struct thread_struct *prev, struct thread_struct *next) { if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return; /* Stash the prev task's value: */ prev->pkru = rdpkru(); /* * PKRU writes are slightly expensive. Avoid them when not * strictly necessary: */ if (prev->pkru != next->pkru) wrpkru(next->pkru); } static __always_inline void x86_fsgsbase_load(struct thread_struct *prev, struct thread_struct *next) { if (static_cpu_has(X86_FEATURE_FSGSBASE)) { /* Update the FS and GS selectors if they could have changed. */ if (unlikely(prev->fsindex || next->fsindex)) loadseg(FS, next->fsindex); if (unlikely(prev->gsindex || next->gsindex)) loadseg(GS, next->gsindex); /* Update the bases. */ wrfsbase(next->fsbase); __wrgsbase_inactive(next->gsbase); } else { load_seg_legacy(prev->fsindex, prev->fsbase, next->fsindex, next->fsbase, FS); load_seg_legacy(prev->gsindex, prev->gsbase, next->gsindex, next->gsbase, GS); } } unsigned long x86_fsgsbase_read_task(struct task_struct *task, unsigned short selector) { unsigned short idx = selector >> 3; unsigned long base; if (likely((selector & SEGMENT_TI_MASK) == 0)) { if (unlikely(idx >= GDT_ENTRIES)) return 0; /* * There are no user segments in the GDT with nonzero bases * other than the TLS segments. */ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) return 0; idx -= GDT_ENTRY_TLS_MIN; base = get_desc_base(&task->thread.tls_array[idx]); } else { #ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; /* * If performance here mattered, we could protect the LDT * with RCU. This is a slow path, though, so we can just * take the mutex. */ mutex_lock(&task->mm->context.lock); ldt = task->mm->context.ldt; if (unlikely(!ldt || idx >= ldt->nr_entries)) base = 0; else base = get_desc_base(ldt->entries + idx); mutex_unlock(&task->mm->context.lock); #else base = 0; #endif } return base; } unsigned long x86_gsbase_read_cpu_inactive(void) { unsigned long gsbase; if (boot_cpu_has(X86_FEATURE_FSGSBASE)) { unsigned long flags; local_irq_save(flags); gsbase = __rdgsbase_inactive(); local_irq_restore(flags); } else { rdmsrq(MSR_KERNEL_GS_BASE, gsbase); } return gsbase; } void x86_gsbase_write_cpu_inactive(unsigned long gsbase) { if (boot_cpu_has(X86_FEATURE_FSGSBASE)) { unsigned long flags; local_irq_save(flags); __wrgsbase_inactive(gsbase); local_irq_restore(flags); } else { wrmsrq(MSR_KERNEL_GS_BASE, gsbase); } } unsigned long x86_fsbase_read_task(struct task_struct *task) { unsigned long fsbase; if (task == current) fsbase = x86_fsbase_read_cpu(); else if (boot_cpu_has(X86_FEATURE_FSGSBASE) || (task->thread.fsindex == 0)) fsbase = task->thread.fsbase; else fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex); return fsbase; } unsigned long x86_gsbase_read_task(struct task_struct *task) { unsigned long gsbase; if (task == current) gsbase = x86_gsbase_read_cpu_inactive(); else if (boot_cpu_has(X86_FEATURE_FSGSBASE) || (task->thread.gsindex == 0)) gsbase = task->thread.gsbase; else gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex); return gsbase; } void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase) { WARN_ON_ONCE(task == current); task->thread.fsbase = fsbase; } void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase) { WARN_ON_ONCE(task == current); task->thread.gsbase = gsbase; } static void start_thread_common(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp, u16 _cs, u16 _ss, u16 _ds) { WARN_ON_ONCE(regs != current_pt_regs()); if (static_cpu_has(X86_BUG_NULL_SEG)) { /* Loading zero below won't clear the base. */ loadsegment(fs, __USER_DS); load_gs_index(__USER_DS); } reset_thread_features(); loadsegment(fs, 0); loadsegment(es, _ds); loadsegment(ds, _ds); load_gs_index(0); regs->ip = new_ip; regs->sp = new_sp; regs->csx = _cs; regs->ssx = _ss; /* * Allow single-step trap and NMI when starting a new task, thus * once the new task enters user space, single-step trap and NMI * are both enabled immediately. * * Entering a new task is logically speaking a return from a * system call (exec, fork, clone, etc.). As such, if ptrace * enables single stepping a single step exception should be * allowed to trigger immediately upon entering user space. * This is not optional. * * NMI should *never* be disabled in user space. As such, this * is an optional, opportunistic way to catch errors. * * Paranoia: High-order 48 bits above the lowest 16 bit SS are * discarded by the legacy IRET instruction on all Intel, AMD, * and Cyrix/Centaur/VIA CPUs, thus can be set unconditionally, * even when FRED is not enabled. But we choose the safer side * to use these bits only when FRED is enabled. */ if (cpu_feature_enabled(X86_FEATURE_FRED)) { regs->fred_ss.swevent = true; regs->fred_ss.nmi = true; } regs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED; } void start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) { start_thread_common(regs, new_ip, new_sp, __USER_CS, __USER_DS, 0); } EXPORT_SYMBOL_GPL(start_thread); #ifdef CONFIG_COMPAT void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32) { start_thread_common(regs, new_ip, new_sp, x32 ? __USER_CS : __USER32_CS, __USER_DS, __USER_DS); } #endif /* * switch_to(x,y) should switch tasks from x to y. * * This could still be optimized: * - fold all the options into a flag word and test it with a single test. * - could test fs/gs bitsliced * * Kprobes not supported here. Set the probe on schedule instead. * Function graph tracer not supported too. */ __no_kmsan_checks __visible __notrace_funcgraph struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread; struct thread_struct *next = &next_p->thread; int cpu = smp_processor_id(); WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && this_cpu_read(hardirq_stack_inuse)); switch_fpu(prev_p, cpu); /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). * * (e.g. xen_load_tls()) */ save_fsgs(prev_p); /* * Load TLS before restoring any segments so that segment loads * reference the correct GDT entries. */ load_TLS(next, cpu); /* * Leave lazy mode, flushing any hypercalls made here. This * must be done after loading TLS entries in the GDT but before * loading segments that might reference them. */ arch_end_context_switch(next_p); /* Switch DS and ES. * * Reading them only returns the selectors, but writing them (if * nonzero) loads the full descriptor from the GDT or LDT. The * LDT for next is loaded in switch_mm, and the GDT is loaded * above. * * We therefore need to write new values to the segment * registers on every context switch unless both the new and old * values are zero. * * Note that we don't need to do anything for CS and SS, as * those are saved and restored as part of pt_regs. */ savesegment(es, prev->es); if (unlikely(next->es | prev->es)) loadsegment(es, next->es); savesegment(ds, prev->ds); if (unlikely(next->ds | prev->ds)) loadsegment(ds, next->ds); x86_fsgsbase_load(prev, next); x86_pkru_load(prev, next); /* * Switch the PDA and FPU contexts. */ raw_cpu_write(current_task, next_p); raw_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); /* Reload sp0. */ update_task_stack(next_p); switch_to_extra(prev_p, next_p); if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) { /* * AMD CPUs have a misfeature: SYSRET sets the SS selector but * does not update the cached descriptor. As a result, if we * do SYSRET while SS is NULL, we'll end up in user mode with * SS apparently equal to __USER_DS but actually unusable. * * The straightforward workaround would be to fix it up just * before SYSRET, but that would slow down the system call * fast paths. Instead, we ensure that SS is never NULL in * system call context. We do this by replacing NULL SS * selectors at every context switch. SYSCALL sets up a valid * SS, so the only way to get NULL is to re-enter the kernel * from CPL 3 through an interrupt. Since that can't happen * in the same task as a running syscall, we are guaranteed to * context switch between every interrupt vector entry and a * subsequent SYSRET. * * We read SS first because SS reads are much faster than * writes. Out of caution, we force SS to __KERNEL_DS even if * it previously had a different non-NULL value. */ unsigned short ss_sel; savesegment(ss, ss_sel); if (ss_sel != __KERNEL_DS) loadsegment(ss, __KERNEL_DS); } /* Load the Intel cache allocation PQR MSR. */ resctrl_arch_sched_in(next_p); /* Reset hw history on AMD CPUs */ if (cpu_feature_enabled(X86_FEATURE_AMD_WORKLOAD_CLASS)) wrmsrl(MSR_AMD_WORKLOAD_HRST, 0x1); return prev_p; } void set_personality_64bit(void) { /* inherit personality from parent */ /* Make sure to be in 64bit mode */ clear_thread_flag(TIF_ADDR32); /* Pretend that this comes from a 64bit execve */ task_pt_regs(current)->orig_ax = __NR_execve; current_thread_info()->status &= ~TS_COMPAT; if (current->mm) __set_bit(MM_CONTEXT_HAS_VSYSCALL, &current->mm->context.flags); /* TBD: overwrites user setup. Should have two bits. But 64bit processes have always behaved this way, so it's not too bad. The main problem is just that 32bit children are affected again. */ current->personality &= ~READ_IMPLIES_EXEC; } static void __set_personality_x32(void) { #ifdef CONFIG_X86_X32_ABI if (current->mm) current->mm->context.flags = 0; current->personality &= ~READ_IMPLIES_EXEC; /* * in_32bit_syscall() uses the presence of the x32 syscall bit * flag to determine compat status. The x86 mmap() code relies on * the syscall bitness so set x32 syscall bit right here to make * in_32bit_syscall() work during exec(). * * Pretend to come from a x32 execve. */ task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT; current_thread_info()->status &= ~TS_COMPAT; #endif } static void __set_personality_ia32(void) { #ifdef CONFIG_IA32_EMULATION if (current->mm) { /* * uprobes applied to this MM need to know this and * cannot use user_64bit_mode() at that time. */ __set_bit(MM_CONTEXT_UPROBE_IA32, &current->mm->context.flags); } current->personality |= force_personality32; /* Prepare the first "return" to user space */ task_pt_regs(current)->orig_ax = __NR_ia32_execve; current_thread_info()->status |= TS_COMPAT; #endif } void set_personality_ia32(bool x32) { /* Make sure to be in 32bit mode */ set_thread_flag(TIF_ADDR32); if (x32) __set_personality_x32(); else __set_personality_ia32(); } EXPORT_SYMBOL_GPL(set_personality_ia32); #ifdef CONFIG_CHECKPOINT_RESTORE static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr) { int ret; ret = map_vdso_once(image, addr); if (ret) return ret; return (long)image->size; } #endif #ifdef CONFIG_ADDRESS_MASKING #define LAM_U57_BITS 6 static void enable_lam_func(void *__mm) { struct mm_struct *mm = __mm; unsigned long lam; if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm) { lam = mm_lam_cr3_mask(mm); write_cr3(__read_cr3() | lam); cpu_tlbstate_update_lam(lam, mm_untag_mask(mm)); } } static void mm_enable_lam(struct mm_struct *mm) { mm->context.lam_cr3_mask = X86_CR3_LAM_U57; mm->context.untag_mask = ~GENMASK(62, 57); /* * Even though the process must still be single-threaded at this * point, kernel threads may be using the mm. IPI those kernel * threads if they exist. */ on_each_cpu_mask(mm_cpumask(mm), enable_lam_func, mm, true); set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags); } static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits) { if (!cpu_feature_enabled(X86_FEATURE_LAM)) return -ENODEV; /* PTRACE_ARCH_PRCTL */ if (current->mm != mm) return -EINVAL; if (mm_valid_pasid(mm) && !test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags)) return -EINVAL; if (mmap_write_lock_killable(mm)) return -EINTR; /* * MM_CONTEXT_LOCK_LAM is set on clone. Prevent LAM from * being enabled unless the process is single threaded: */ if (test_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags)) { mmap_write_unlock(mm); return -EBUSY; } if (!nr_bits || nr_bits > LAM_U57_BITS) { mmap_write_unlock(mm); return -EINVAL; } mm_enable_lam(mm); mmap_write_unlock(mm); return 0; } #endif long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) { int ret = 0; switch (option) { case ARCH_SET_GS: { if (unlikely(arg2 >= TASK_SIZE_MAX)) return -EPERM; preempt_disable(); /* * ARCH_SET_GS has always overwritten the index * and the base. Zero is the most sensible value * to put in the index, and is the only value that * makes any sense if FSGSBASE is unavailable. */ if (task == current) { loadseg(GS, 0); x86_gsbase_write_cpu_inactive(arg2); /* * On non-FSGSBASE systems, save_base_legacy() expects * that we also fill in thread.gsbase. */ task->thread.gsbase = arg2; } else { task->thread.gsindex = 0; x86_gsbase_write_task(task, arg2); } preempt_enable(); break; } case ARCH_SET_FS: { /* * Not strictly needed for %fs, but do it for symmetry * with %gs */ if (unlikely(arg2 >= TASK_SIZE_MAX)) return -EPERM; preempt_disable(); /* * Set the selector to 0 for the same reason * as %gs above. */ if (task == current) { loadseg(FS, 0); x86_fsbase_write_cpu(arg2); /* * On non-FSGSBASE systems, save_base_legacy() expects * that we also fill in thread.fsbase. */ task->thread.fsbase = arg2; } else { task->thread.fsindex = 0; x86_fsbase_write_task(task, arg2); } preempt_enable(); break; } case ARCH_GET_FS: { unsigned long base = x86_fsbase_read_task(task); ret = put_user(base, (unsigned long __user *)arg2); break; } case ARCH_GET_GS: { unsigned long base = x86_gsbase_read_task(task); ret = put_user(base, (unsigned long __user *)arg2); break; } #ifdef CONFIG_CHECKPOINT_RESTORE # ifdef CONFIG_X86_X32_ABI case ARCH_MAP_VDSO_X32: return prctl_map_vdso(&vdsox32_image, arg2); # endif # ifdef CONFIG_IA32_EMULATION case ARCH_MAP_VDSO_32: return prctl_map_vdso(&vdso32_image, arg2); # endif case ARCH_MAP_VDSO_64: return prctl_map_vdso(&vdso64_image, arg2); #endif #ifdef CONFIG_ADDRESS_MASKING case ARCH_GET_UNTAG_MASK: return put_user(task->mm->context.untag_mask, (unsigned long __user *)arg2); case ARCH_ENABLE_TAGGED_ADDR: return prctl_enable_tagged_addr(task->mm, arg2); case ARCH_FORCE_TAGGED_SVA: if (current != task) return -EINVAL; set_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &task->mm->context.flags); return 0; case ARCH_GET_MAX_TAG_BITS: if (!cpu_feature_enabled(X86_FEATURE_LAM)) return put_user(0, (unsigned long __user *)arg2); else return put_user(LAM_U57_BITS, (unsigned long __user *)arg2); #endif case ARCH_SHSTK_ENABLE: case ARCH_SHSTK_DISABLE: case ARCH_SHSTK_LOCK: case ARCH_SHSTK_UNLOCK: case ARCH_SHSTK_STATUS: return shstk_prctl(task, option, arg2); default: ret = -EINVAL; break; } return ret; }
68 6 63 4 8 7 1 5 2 2 5 9 9 1 8 5 5 5 5 1 41 58 1 61 42 10 13 23 13 27 28 37 5 23 17 1 19 8 5 57 2 62 18 51 6 4 1 11 4 7 36 13 38 5 10 5 72 72 74 15 72 2 14 69 71 71 15 72 62 62 62 62 5 26 39 2 12 11 11 8 3 1 8 3 4 7 3 4 6 6 4 2 37 6 20 37 4 14 18 18 19 23 17 14 35 3 3 3 79 79 9 56 56 2 56 47 7 53 5 335 335 43 7 36 23 35 7 29 8 30 9 5 2 9 36 23 19 36 25 1 24 3 6 3 3 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> * * Development of this code funded by Astaro AG (http://www.astaro.com/) */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/list.h> #include <linux/rbtree.h> #include <linux/bsearch.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> struct nft_array_interval { struct nft_set_ext *from; struct nft_set_ext *to; }; struct nft_array { u32 max_intervals; u32 num_intervals; struct nft_array_interval *intervals; struct rcu_head rcu_head; }; struct nft_rbtree { struct rb_root root; rwlock_t lock; struct nft_array __rcu *array; struct nft_array *array_next; unsigned long start_rbe_cookie; unsigned long last_gc; struct list_head expired; u64 last_tstamp; }; struct nft_rbtree_elem { struct nft_elem_priv priv; union { struct rb_node node; struct list_head list; }; struct nft_set_ext ext; }; static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe) { return nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) && (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END); } static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe) { return !nft_rbtree_interval_end(rbe); } static bool nft_rbtree_interval_null(const struct nft_set *set, const struct nft_rbtree_elem *rbe) { return (!memchr_inv(nft_set_ext_key(&rbe->ext), 0, set->klen) && nft_rbtree_interval_end(rbe)); } static int nft_rbtree_cmp(const struct nft_set *set, const struct nft_rbtree_elem *e1, const struct nft_rbtree_elem *e2) { return memcmp(nft_set_ext_key(&e1->ext), nft_set_ext_key(&e2->ext), set->klen); } struct nft_array_lookup_ctx { const u32 *key; u32 klen; }; static int nft_array_lookup_cmp(const void *pkey, const void *entry) { const struct nft_array_interval *interval = entry; const struct nft_array_lookup_ctx *ctx = pkey; int a, b; if (!interval->from) return 1; a = memcmp(ctx->key, nft_set_ext_key(interval->from), ctx->klen); if (!interval->to) b = -1; else b = memcmp(ctx->key, nft_set_ext_key(interval->to), ctx->klen); if (a >= 0 && b < 0) return 0; if (a < 0) return -1; return 1; } INDIRECT_CALLABLE_SCOPE const struct nft_set_ext * nft_rbtree_lookup(const struct net *net, const struct nft_set *set, const u32 *key) { struct nft_rbtree *priv = nft_set_priv(set); struct nft_array *array = rcu_dereference(priv->array); const struct nft_array_interval *interval; struct nft_array_lookup_ctx ctx = { .key = key, .klen = set->klen, }; if (!array) return NULL; interval = bsearch(&ctx, array->intervals, array->num_intervals, sizeof(struct nft_array_interval), nft_array_lookup_cmp); if (!interval || nft_set_elem_expired(interval->from)) return NULL; return interval->from; } struct nft_array_get_ctx { const u32 *key; unsigned int flags; u32 klen; }; static int nft_array_get_cmp(const void *pkey, const void *entry) { const struct nft_array_interval *interval = entry; const struct nft_array_get_ctx *ctx = pkey; int a, b; if (!interval->from) return 1; a = memcmp(ctx->key, nft_set_ext_key(interval->from), ctx->klen); if (!interval->to) b = -1; else b = memcmp(ctx->key, nft_set_ext_key(interval->to), ctx->klen); if (a >= 0) { if (ctx->flags & NFT_SET_ELEM_INTERVAL_END && b <= 0) return 0; else if (b < 0) return 0; } if (a < 0) return -1; return 1; } static struct nft_elem_priv * nft_rbtree_get(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags) { struct nft_rbtree *priv = nft_set_priv(set); struct nft_array *array = rcu_dereference(priv->array); const struct nft_array_interval *interval; struct nft_array_get_ctx ctx = { .key = (const u32 *)&elem->key.val, .flags = flags, .klen = set->klen, }; struct nft_rbtree_elem *rbe; if (!array) return ERR_PTR(-ENOENT); interval = bsearch(&ctx, array->intervals, array->num_intervals, sizeof(struct nft_array_interval), nft_array_get_cmp); if (!interval || nft_set_elem_expired(interval->from)) return ERR_PTR(-ENOENT); if (flags & NFT_SET_ELEM_INTERVAL_END) rbe = container_of(interval->to, struct nft_rbtree_elem, ext); else rbe = container_of(interval->from, struct nft_rbtree_elem, ext); return &rbe->priv; } static void nft_rbtree_gc_elem_move(struct net *net, struct nft_set *set, struct nft_rbtree *priv, struct nft_rbtree_elem *rbe) { lockdep_assert_held_write(&priv->lock); nft_setelem_data_deactivate(net, set, &rbe->priv); rb_erase(&rbe->node, &priv->root); /* collected later on in commit callback */ list_add(&rbe->list, &priv->expired); } static const struct nft_rbtree_elem * nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv, struct nft_rbtree_elem *rbe) { struct nft_set *set = (struct nft_set *)__set; struct rb_node *prev = rb_prev(&rbe->node); struct net *net = read_pnet(&set->net); struct nft_rbtree_elem *rbe_prev; /* search for end interval coming before this element. * end intervals don't carry a timeout extension, they * are coupled with the interval start element. */ while (prev) { rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); if (nft_rbtree_interval_end(rbe_prev) && nft_set_elem_active(&rbe_prev->ext, NFT_GENMASK_ANY)) break; prev = rb_prev(prev); } rbe_prev = NULL; if (prev) { rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); nft_rbtree_gc_elem_move(net, set, priv, rbe_prev); } nft_rbtree_gc_elem_move(net, set, priv, rbe); return rbe_prev; } static bool nft_rbtree_update_first(const struct nft_set *set, struct nft_rbtree_elem *rbe, struct rb_node *first) { struct nft_rbtree_elem *first_elem; first_elem = rb_entry(first, struct nft_rbtree_elem, node); /* this element is closest to where the new element is to be inserted: * update the first element for the node list path. */ if (nft_rbtree_cmp(set, rbe, first_elem) < 0) return true; return false; } /* Only for anonymous sets which do not allow updates, all element are active. */ static struct nft_rbtree_elem *nft_rbtree_prev_active(struct nft_rbtree_elem *rbe) { struct rb_node *node; node = rb_prev(&rbe->node); if (!node) return NULL; return rb_entry(node, struct nft_rbtree_elem, node); } static struct nft_rbtree_elem * __nft_rbtree_next_active(struct rb_node *node, u8 genmask) { struct nft_rbtree_elem *next_rbe; while (node) { next_rbe = rb_entry(node, struct nft_rbtree_elem, node); if (!nft_set_elem_active(&next_rbe->ext, genmask)) { node = rb_next(node); continue; } return next_rbe; } return NULL; } static struct nft_rbtree_elem * nft_rbtree_next_active(struct nft_rbtree_elem *rbe, u8 genmask) { return __nft_rbtree_next_active(rb_next(&rbe->node), genmask); } static void nft_rbtree_maybe_reset_start_cookie(struct nft_rbtree *priv, u64 tstamp) { if (priv->last_tstamp != tstamp) { priv->start_rbe_cookie = 0; priv->last_tstamp = tstamp; } } static void nft_rbtree_set_start_cookie(struct nft_rbtree *priv, const struct nft_rbtree_elem *rbe) { priv->start_rbe_cookie = (unsigned long)rbe; } static bool nft_rbtree_cmp_start_cookie(struct nft_rbtree *priv, const struct nft_rbtree_elem *rbe) { return priv->start_rbe_cookie == (unsigned long)rbe; } static bool nft_rbtree_insert_same_interval(const struct net *net, struct nft_rbtree *priv, struct nft_rbtree_elem *rbe) { u8 genmask = nft_genmask_next(net); struct nft_rbtree_elem *next_rbe; if (!priv->start_rbe_cookie) return true; next_rbe = nft_rbtree_next_active(rbe, genmask); if (next_rbe) { /* Closest start element differs from last element added. */ if (nft_rbtree_interval_start(next_rbe) && nft_rbtree_cmp_start_cookie(priv, next_rbe)) { priv->start_rbe_cookie = 0; return true; } } priv->start_rbe_cookie = 0; return false; } static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree_elem *new, struct nft_elem_priv **elem_priv, u64 tstamp) { struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL, *rbe_prev; struct rb_node *node, *next, *parent, **p, *first = NULL; struct nft_rbtree *priv = nft_set_priv(set); u8 cur_genmask = nft_genmask_cur(net); u8 genmask = nft_genmask_next(net); int d; /* Descend the tree to search for an existing element greater than the * key value to insert that is greater than the new element. This is the * first element to walk the ordered elements to find possible overlap. */ parent = NULL; p = &priv->root.rb_node; while (*p != NULL) { parent = *p; rbe = rb_entry(parent, struct nft_rbtree_elem, node); d = nft_rbtree_cmp(set, rbe, new); if (d < 0) { p = &parent->rb_left; } else if (d > 0) { if (!first || nft_rbtree_update_first(set, rbe, first)) first = &rbe->node; p = &parent->rb_right; } else { if (nft_rbtree_interval_end(rbe)) p = &parent->rb_left; else p = &parent->rb_right; } } if (!first) first = rb_first(&priv->root); /* Detect overlap by going through the list of valid tree nodes. * Values stored in the tree are in reversed order, starting from * highest to lowest value. */ for (node = first; node != NULL; node = next) { next = rb_next(node); rbe = rb_entry(node, struct nft_rbtree_elem, node); if (!nft_set_elem_active(&rbe->ext, genmask)) continue; /* perform garbage collection to avoid bogus overlap reports * but skip new elements in this transaction. */ if (__nft_set_elem_expired(&rbe->ext, tstamp) && nft_set_elem_active(&rbe->ext, cur_genmask)) { const struct nft_rbtree_elem *removed_end; removed_end = nft_rbtree_gc_elem(set, priv, rbe); if (IS_ERR(removed_end)) return PTR_ERR(removed_end); if (removed_end == rbe_le || removed_end == rbe_ge) return -EAGAIN; continue; } d = nft_rbtree_cmp(set, rbe, new); if (d == 0) { /* Matching end element: no need to look for an * overlapping greater or equal element. */ if (nft_rbtree_interval_end(rbe)) { rbe_le = rbe; break; } /* first element that is greater or equal to key value. */ if (!rbe_ge) { rbe_ge = rbe; continue; } /* this is a closer more or equal element, update it. */ if (nft_rbtree_cmp(set, rbe_ge, new) != 0) { rbe_ge = rbe; continue; } /* element is equal to key value, make sure flags are * the same, an existing more or equal start element * must not be replaced by more or equal end element. */ if ((nft_rbtree_interval_start(new) && nft_rbtree_interval_start(rbe_ge)) || (nft_rbtree_interval_end(new) && nft_rbtree_interval_end(rbe_ge))) { rbe_ge = rbe; continue; } } else if (d > 0) { /* annotate element greater than the new element. */ rbe_ge = rbe; continue; } else if (d < 0) { /* annotate element less than the new element. */ rbe_le = rbe; break; } } if (nft_rbtree_interval_null(set, new)) priv->start_rbe_cookie = 0; else if (nft_rbtree_interval_start(new) && priv->start_rbe_cookie) priv->start_rbe_cookie = 0; /* - new start element matching existing start element: full overlap * reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given. */ if (rbe_ge && !nft_rbtree_cmp(set, new, rbe_ge) && nft_rbtree_interval_start(rbe_ge) == nft_rbtree_interval_start(new)) { *elem_priv = &rbe_ge->priv; nft_rbtree_set_start_cookie(priv, rbe_ge); return -EEXIST; } /* - new end element matching existing end element: full overlap * reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given. */ if (rbe_le && !nft_rbtree_cmp(set, new, rbe_le) && nft_rbtree_interval_end(rbe_le) == nft_rbtree_interval_end(new)) { /* - ignore null interval, otherwise NLM_F_CREATE bogusly * reports EEXIST. */ if (nft_rbtree_interval_null(set, new)) return -ECANCELED; *elem_priv = &rbe_le->priv; /* - start and end element belong to the same interval. */ if (!nft_rbtree_insert_same_interval(net, priv, rbe_le)) return -ENOTEMPTY; return -EEXIST; } /* - new start element with existing closest, less or equal key value * being a start element: partial overlap, reported as -ENOTEMPTY. * Anonymous sets allow for two consecutive start element since they * are constant, but validate that this new start element does not * sit in between an existing start and end elements: partial overlap, * reported as -ENOTEMPTY. */ if (rbe_le && nft_rbtree_interval_start(rbe_le) && nft_rbtree_interval_start(new)) { if (!nft_set_is_anonymous(set)) return -ENOTEMPTY; rbe_prev = nft_rbtree_prev_active(rbe_le); if (rbe_prev && nft_rbtree_interval_end(rbe_prev)) return -ENOTEMPTY; } /* - new end element with existing closest, less or equal key value * being a end element: partial overlap, reported as -ENOTEMPTY. */ if (rbe_le && nft_rbtree_interval_end(rbe_le) && nft_rbtree_interval_end(new)) return -ENOTEMPTY; /* - new end element with existing closest, greater or equal key value * being an end element: partial overlap, reported as -ENOTEMPTY */ if (rbe_ge && nft_rbtree_interval_end(rbe_ge) && nft_rbtree_interval_end(new)) return -ENOTEMPTY; /* Accepted element: pick insertion point depending on key value */ parent = NULL; p = &priv->root.rb_node; while (*p != NULL) { parent = *p; rbe = rb_entry(parent, struct nft_rbtree_elem, node); d = nft_rbtree_cmp(set, rbe, new); if (d < 0) p = &parent->rb_left; else if (d > 0) p = &parent->rb_right; else if (nft_rbtree_interval_end(rbe)) p = &parent->rb_left; else p = &parent->rb_right; } rb_link_node_rcu(&new->node, parent, p); rb_insert_color(&new->node, &priv->root); return 0; } static int nft_array_intervals_alloc(struct nft_array *array, u32 max_intervals) { struct nft_array_interval *intervals; intervals = kvzalloc_objs(struct nft_array_interval, max_intervals, GFP_KERNEL_ACCOUNT); if (!intervals) return -ENOMEM; if (array->intervals) kvfree(array->intervals); array->intervals = intervals; array->max_intervals = max_intervals; return 0; } static struct nft_array *nft_array_alloc(u32 max_intervals) { struct nft_array *array; array = kzalloc_obj(*array, GFP_KERNEL_ACCOUNT); if (!array) return NULL; if (nft_array_intervals_alloc(array, max_intervals) < 0) { kfree(array); return NULL; } return array; } /* Similar to nft_rbtree_{u,k}size to hide details to userspace, but consider * packed representation coming from userspace for anonymous sets too. */ static u32 nft_array_elems(const struct nft_set *set) { u32 nelems = atomic_read(&set->nelems) - set->ndeact; /* Adjacent intervals are represented with a single start element in * anonymous sets, use the current element counter as is. */ if (nft_set_is_anonymous(set)) return nelems; /* Add extra room for never matching interval at the beginning and open * interval at the end which only use a single element to represent it. * The conversion to array will compact intervals, this allows reduce * memory consumption. */ return (nelems / 2) + 2; } #define NFT_ARRAY_INITIAL_SIZE 1024 #define NFT_ARRAY_INITIAL_ANON_SIZE 16 #define NFT_ARRAY_INITIAL_ANON_THRESH (8192U / sizeof(struct nft_array_interval)) static int nft_array_may_resize(const struct nft_set *set, bool flush) { u32 initial_intervals, max_intervals, new_max_intervals, delta; u32 shrinked_max_intervals, nelems = nft_array_elems(set); struct nft_rbtree *priv = nft_set_priv(set); struct nft_array *array; if (nft_set_is_anonymous(set)) initial_intervals = NFT_ARRAY_INITIAL_ANON_SIZE; else initial_intervals = NFT_ARRAY_INITIAL_SIZE; if (priv->array_next) { max_intervals = priv->array_next->max_intervals; new_max_intervals = priv->array_next->max_intervals; } else { if (priv->array) { max_intervals = priv->array->max_intervals; new_max_intervals = priv->array->max_intervals; } else { max_intervals = 0; new_max_intervals = initial_intervals; } } if (nft_set_is_anonymous(set)) goto maybe_grow; if (flush) { /* Set flush just started, nelems still report elements.*/ nelems = 0; new_max_intervals = NFT_ARRAY_INITIAL_SIZE; goto realloc_array; } if (check_add_overflow(new_max_intervals, new_max_intervals, &shrinked_max_intervals)) return -EOVERFLOW; shrinked_max_intervals = DIV_ROUND_UP(shrinked_max_intervals, 3); if (shrinked_max_intervals > NFT_ARRAY_INITIAL_SIZE && nelems < shrinked_max_intervals) { new_max_intervals = shrinked_max_intervals; goto realloc_array; } maybe_grow: if (nelems > new_max_intervals) { if (nft_set_is_anonymous(set) && new_max_intervals < NFT_ARRAY_INITIAL_ANON_THRESH) { new_max_intervals <<= 1; } else { delta = new_max_intervals >> 1; if (check_add_overflow(new_max_intervals, delta, &new_max_intervals)) return -EOVERFLOW; } } realloc_array: if (WARN_ON_ONCE(nelems > new_max_intervals)) return -ENOMEM; if (priv->array_next) { if (max_intervals == new_max_intervals) return 0; if (nft_array_intervals_alloc(priv->array_next, new_max_intervals) < 0) return -ENOMEM; } else { array = nft_array_alloc(new_max_intervals); if (!array) return -ENOMEM; priv->array_next = array; } return 0; } static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, struct nft_elem_priv **elem_priv) { struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem->priv); struct nft_rbtree *priv = nft_set_priv(set); u64 tstamp = nft_net_tstamp(net); int err; nft_rbtree_maybe_reset_start_cookie(priv, tstamp); if (nft_array_may_resize(set, false) < 0) return -ENOMEM; do { if (fatal_signal_pending(current)) return -EINTR; cond_resched(); write_lock_bh(&priv->lock); err = __nft_rbtree_insert(net, set, rbe, elem_priv, tstamp); write_unlock_bh(&priv->lock); } while (err == -EAGAIN); return err; } static void nft_rbtree_erase(struct nft_rbtree *priv, struct nft_rbtree_elem *rbe) { write_lock_bh(&priv->lock); rb_erase(&rbe->node, &priv->root); write_unlock_bh(&priv->lock); } static void nft_rbtree_remove(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv) { struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv); struct nft_rbtree *priv = nft_set_priv(set); nft_rbtree_erase(priv, rbe); } static void nft_rbtree_activate(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv) { struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv); nft_clear(net, &rbe->ext); } static struct nft_rbtree_elem * nft_rbtree_next_inactive(struct nft_rbtree_elem *rbe, u8 genmask) { struct nft_rbtree_elem *next_rbe; struct rb_node *node; node = rb_next(&rbe->node); if (node) { next_rbe = rb_entry(node, struct nft_rbtree_elem, node); if (nft_rbtree_interval_start(next_rbe) && !nft_set_elem_active(&next_rbe->ext, genmask)) return next_rbe; } return NULL; } static bool nft_rbtree_deactivate_same_interval(const struct net *net, struct nft_rbtree *priv, struct nft_rbtree_elem *rbe) { u8 genmask = nft_genmask_next(net); struct nft_rbtree_elem *next_rbe; if (!priv->start_rbe_cookie) return true; next_rbe = nft_rbtree_next_inactive(rbe, genmask); if (next_rbe) { /* Closest start element differs from last element added. */ if (nft_rbtree_interval_start(next_rbe) && nft_rbtree_cmp_start_cookie(priv, next_rbe)) { priv->start_rbe_cookie = 0; return true; } } priv->start_rbe_cookie = 0; return false; } static void nft_rbtree_flush(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv) { struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv); nft_set_elem_change_active(net, set, &rbe->ext); } static struct nft_elem_priv * nft_rbtree_deactivate(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) { struct nft_rbtree_elem *rbe, *this = nft_elem_priv_cast(elem->priv); struct nft_rbtree *priv = nft_set_priv(set); const struct rb_node *parent = priv->root.rb_node; u8 genmask = nft_genmask_next(net); u64 tstamp = nft_net_tstamp(net); int d; nft_rbtree_maybe_reset_start_cookie(priv, tstamp); if (nft_rbtree_interval_start(this) || nft_rbtree_interval_null(set, this)) priv->start_rbe_cookie = 0; if (nft_array_may_resize(set, false) < 0) return NULL; while (parent != NULL) { rbe = rb_entry(parent, struct nft_rbtree_elem, node); d = memcmp(nft_set_ext_key(&rbe->ext), &elem->key.val, set->klen); if (d < 0) parent = parent->rb_left; else if (d > 0) parent = parent->rb_right; else { if (nft_rbtree_interval_end(rbe) && nft_rbtree_interval_start(this)) { parent = parent->rb_left; continue; } else if (nft_rbtree_interval_start(rbe) && nft_rbtree_interval_end(this)) { parent = parent->rb_right; continue; } else if (__nft_set_elem_expired(&rbe->ext, tstamp)) { break; } else if (!nft_set_elem_active(&rbe->ext, genmask)) { parent = parent->rb_left; continue; } if (nft_rbtree_interval_start(rbe)) nft_rbtree_set_start_cookie(priv, rbe); else if (!nft_rbtree_deactivate_same_interval(net, priv, rbe)) return NULL; nft_rbtree_flush(net, set, &rbe->priv); return &rbe->priv; } } return NULL; } static void nft_rbtree_do_walk(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_iter *iter) { struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe; struct rb_node *node; for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { rbe = rb_entry(node, struct nft_rbtree_elem, node); if (iter->count < iter->skip) goto cont; iter->err = iter->fn(ctx, set, iter, &rbe->priv); if (iter->err < 0) return; cont: iter->count++; } } static void nft_rbtree_walk(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_iter *iter) { struct nft_rbtree *priv = nft_set_priv(set); switch (iter->type) { case NFT_ITER_UPDATE_CLONE: if (nft_array_may_resize(set, true) < 0) { iter->err = -ENOMEM; break; } fallthrough; case NFT_ITER_UPDATE: lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex); nft_rbtree_do_walk(ctx, set, iter); break; case NFT_ITER_READ: read_lock_bh(&priv->lock); nft_rbtree_do_walk(ctx, set, iter); read_unlock_bh(&priv->lock); break; default: iter->err = -EINVAL; WARN_ON_ONCE(1); break; } } static void nft_rbtree_gc_scan(struct nft_set *set) { struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe, *rbe_end = NULL; struct net *net = read_pnet(&set->net); u64 tstamp = nft_net_tstamp(net); struct rb_node *node, *next; for (node = rb_first(&priv->root); node ; node = next) { next = rb_next(node); rbe = rb_entry(node, struct nft_rbtree_elem, node); /* elements are reversed in the rbtree for historical reasons, * from highest to lowest value, that is why end element is * always visited before the start element. */ if (nft_rbtree_interval_end(rbe)) { rbe_end = rbe; continue; } if (!__nft_set_elem_expired(&rbe->ext, tstamp)) continue; /* end element needs to be removed first, it has * no timeout extension. */ write_lock_bh(&priv->lock); if (rbe_end) { nft_rbtree_gc_elem_move(net, set, priv, rbe_end); rbe_end = NULL; } nft_rbtree_gc_elem_move(net, set, priv, rbe); write_unlock_bh(&priv->lock); } priv->last_gc = jiffies; } static void nft_rbtree_gc_queue(struct nft_set *set) { struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe, *rbe_end; struct nft_trans_gc *gc; if (list_empty(&priv->expired)) return; gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL); if (!gc) return; list_for_each_entry_safe(rbe, rbe_end, &priv->expired, list) { list_del(&rbe->list); nft_trans_gc_elem_add(gc, rbe); gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); if (!gc) return; } gc = nft_trans_gc_catchall_sync(gc); nft_trans_gc_queue_sync_done(gc); } static u64 nft_rbtree_privsize(const struct nlattr * const nla[], const struct nft_set_desc *desc) { return sizeof(struct nft_rbtree); } static int nft_rbtree_init(const struct nft_set *set, const struct nft_set_desc *desc, const struct nlattr * const nla[]) { struct nft_rbtree *priv = nft_set_priv(set); BUILD_BUG_ON(offsetof(struct nft_rbtree_elem, priv) != 0); rwlock_init(&priv->lock); priv->root = RB_ROOT; INIT_LIST_HEAD(&priv->expired); priv->array = NULL; priv->array_next = NULL; return 0; } static void __nft_array_free(struct nft_array *array) { kvfree(array->intervals); kfree(array); } static void nft_rbtree_destroy(const struct nft_ctx *ctx, const struct nft_set *set) { struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe, *next; struct nft_array *array; struct rb_node *node; list_for_each_entry_safe(rbe, next, &priv->expired, list) { list_del(&rbe->list); nf_tables_set_elem_destroy(ctx, set, &rbe->priv); } while ((node = priv->root.rb_node) != NULL) { rb_erase(node, &priv->root); rbe = rb_entry(node, struct nft_rbtree_elem, node); nf_tables_set_elem_destroy(ctx, set, &rbe->priv); } array = rcu_dereference_protected(priv->array, true); if (array) __nft_array_free(array); if (priv->array_next) __nft_array_free(priv->array_next); } static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, struct nft_set_estimate *est) { if (desc->field_count > 1) return false; if (desc->size) est->size = sizeof(struct nft_rbtree) + desc->size * sizeof(struct nft_rbtree_elem); else est->size = ~0; est->lookup = NFT_SET_CLASS_O_LOG_N; est->space = NFT_SET_CLASS_O_N; return true; } static void nft_array_free_rcu(struct rcu_head *rcu_head) { struct nft_array *array = container_of(rcu_head, struct nft_array, rcu_head); __nft_array_free(array); } static void nft_rbtree_commit(struct nft_set *set) { struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe, *prev_rbe; struct nft_array *old; u32 num_intervals = 0; struct rb_node *node; /* No changes, skip, eg. elements updates only. */ if (!priv->array_next) return; /* GC can be performed if the binary search blob is going * to be rebuilt. It has to be done in two phases: first * scan tree and move all expired elements to the expired * list. * * Then, after blob has been re-built and published to other * CPUs, queue collected entries for freeing. */ if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) nft_rbtree_gc_scan(set); /* Reverse walk to create an array from smaller to largest interval. */ node = rb_last(&priv->root); if (node) prev_rbe = rb_entry(node, struct nft_rbtree_elem, node); else prev_rbe = NULL; while (prev_rbe) { rbe = prev_rbe; if (nft_rbtree_interval_start(rbe)) priv->array_next->intervals[num_intervals].from = &rbe->ext; else if (nft_rbtree_interval_end(rbe)) priv->array_next->intervals[num_intervals++].to = &rbe->ext; if (num_intervals >= priv->array_next->max_intervals) { pr_warn_once("malformed interval set from userspace?"); goto err_out; } node = rb_prev(node); if (!node) break; prev_rbe = rb_entry(node, struct nft_rbtree_elem, node); /* For anonymous sets, when adjacent ranges are found, * the end element is not added to the set to pack the set * representation. Use next start element to complete this * interval. */ if (nft_rbtree_interval_start(rbe) && nft_rbtree_interval_start(prev_rbe) && priv->array_next->intervals[num_intervals].from) priv->array_next->intervals[num_intervals++].to = &prev_rbe->ext; if (num_intervals >= priv->array_next->max_intervals) { pr_warn_once("malformed interval set from userspace?"); goto err_out; } } if (priv->array_next->intervals[num_intervals].from) num_intervals++; err_out: priv->array_next->num_intervals = num_intervals; old = rcu_replace_pointer(priv->array, priv->array_next, lockdep_is_held(&nft_pernet(read_pnet(&set->net))->commit_mutex)); priv->array_next = NULL; if (old) call_rcu(&old->rcu_head, nft_array_free_rcu); /* New blob is public, queue collected entries for freeing. * call_rcu ensures elements stay around until readers are done. */ nft_rbtree_gc_queue(set); } static void nft_rbtree_abort(const struct nft_set *set) { struct nft_rbtree *priv = nft_set_priv(set); struct nft_array *array_next; if (!priv->array_next) return; array_next = priv->array_next; priv->array_next = NULL; __nft_array_free(array_next); } static void nft_rbtree_gc_init(const struct nft_set *set) { struct nft_rbtree *priv = nft_set_priv(set); priv->last_gc = jiffies; } /* rbtree stores ranges as singleton elements, each range is composed of two * elements ... */ static u32 nft_rbtree_ksize(u32 size) { return size * 2; } /* ... hide this detail to userspace. */ static u32 nft_rbtree_usize(u32 size) { if (!size) return 0; return size / 2; } static u32 nft_rbtree_adjust_maxsize(const struct nft_set *set) { struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe; struct rb_node *node; const void *key; node = rb_last(&priv->root); if (!node) return 0; rbe = rb_entry(node, struct nft_rbtree_elem, node); if (!nft_rbtree_interval_end(rbe)) return 0; key = nft_set_ext_key(&rbe->ext); if (memchr(key, 1, set->klen)) return 0; /* this is the all-zero no-match element. */ return 1; } const struct nft_set_type nft_set_rbtree_type = { .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT, .ops = { .privsize = nft_rbtree_privsize, .elemsize = offsetof(struct nft_rbtree_elem, ext), .estimate = nft_rbtree_estimate, .init = nft_rbtree_init, .destroy = nft_rbtree_destroy, .insert = nft_rbtree_insert, .remove = nft_rbtree_remove, .deactivate = nft_rbtree_deactivate, .flush = nft_rbtree_flush, .activate = nft_rbtree_activate, .commit = nft_rbtree_commit, .abort = nft_rbtree_abort, .gc_init = nft_rbtree_gc_init, .lookup = nft_rbtree_lookup, .walk = nft_rbtree_walk, .get = nft_rbtree_get, .ksize = nft_rbtree_ksize, .usize = nft_rbtree_usize, .adjust_maxsize = nft_rbtree_adjust_maxsize, }, };
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __SHMEM_FS_H #define __SHMEM_FS_H #include <linux/file.h> #include <linux/swap.h> #include <linux/mempolicy.h> #include <linux/pagemap.h> #include <linux/percpu_counter.h> #include <linux/xattr.h> #include <linux/fs_parser.h> #include <linux/userfaultfd_k.h> #include <linux/bits.h> struct swap_iocb; /* inode in-kernel data */ #ifdef CONFIG_TMPFS_QUOTA #define SHMEM_MAXQUOTAS 2 #endif /* Suppress pre-accounting of the entire object size. */ #define SHMEM_F_NORESERVE BIT(0) /* Disallow swapping. */ #define SHMEM_F_LOCKED BIT(1) /* * Disallow growing, shrinking, or hole punching in the inode. Combined with * folio pinning, makes sure the inode's mapping stays fixed. * * In some ways similar to F_SEAL_GROW | F_SEAL_SHRINK, but can be removed and * isn't directly visible to userspace. */ #define SHMEM_F_MAPPING_FROZEN BIT(2) struct shmem_inode_info { spinlock_t lock; unsigned int seals; /* shmem seals */ unsigned long flags; unsigned long alloced; /* data pages alloced to file */ unsigned long swapped; /* subtotal assigned to swap */ union { struct offset_ctx dir_offsets; /* stable directory offsets */ struct { struct list_head shrinklist; /* shrinkable hpage inodes */ struct list_head swaplist; /* chain of maybes on swap */ }; }; struct timespec64 i_crtime; /* file creation time */ struct shared_policy policy; /* NUMA memory alloc policy */ struct simple_xattrs xattrs; /* list of xattrs */ pgoff_t fallocend; /* highest fallocate endindex */ unsigned int fsflags; /* for FS_IOC_[SG]ETFLAGS */ atomic_t stop_eviction; /* hold when working on inode */ #ifdef CONFIG_TMPFS_QUOTA struct dquot __rcu *i_dquot[MAXQUOTAS]; #endif struct inode vfs_inode; }; #define SHMEM_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | FS_CASEFOLD_FL) #define SHMEM_FL_USER_MODIFIABLE \ (FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | FS_NOATIME_FL | FS_CASEFOLD_FL) #define SHMEM_FL_INHERITED (FS_NODUMP_FL | FS_NOATIME_FL | FS_CASEFOLD_FL) struct shmem_quota_limits { qsize_t usrquota_bhardlimit; /* Default user quota block hard limit */ qsize_t usrquota_ihardlimit; /* Default user quota inode hard limit */ qsize_t grpquota_bhardlimit; /* Default group quota block hard limit */ qsize_t grpquota_ihardlimit; /* Default group quota inode hard limit */ }; struct shmem_sb_info { unsigned long max_blocks; /* How many blocks are allowed */ struct percpu_counter used_blocks; /* How many are allocated */ unsigned long max_inodes; /* How many inodes are allowed */ unsigned long free_ispace; /* How much ispace left for allocation */ raw_spinlock_t stat_lock; /* Serialize shmem_sb_info changes */ umode_t mode; /* Mount mode for root directory */ unsigned char huge; /* Whether to try for hugepages */ kuid_t uid; /* Mount uid for root directory */ kgid_t gid; /* Mount gid for root directory */ bool full_inums; /* If i_ino should be uint or ino_t */ bool noswap; /* ignores VM reclaim / swap requests */ ino_t next_ino; /* The next per-sb inode number to use */ ino_t __percpu *ino_batch; /* The next per-cpu inode number to use */ struct mempolicy *mpol; /* default memory policy for mappings */ spinlock_t shrinklist_lock; /* Protects shrinklist */ struct list_head shrinklist; /* List of shinkable inodes */ unsigned long shrinklist_len; /* Length of shrinklist */ struct shmem_quota_limits qlimits; /* Default quota limits */ }; static inline struct shmem_inode_info *SHMEM_I(struct inode *inode) { return container_of(inode, struct shmem_inode_info, vfs_inode); } /* * Functions in mm/shmem.c called directly from elsewhere: */ extern const struct fs_parameter_spec shmem_fs_parameters[]; extern void shmem_init(void); extern int shmem_init_fs_context(struct fs_context *fc); struct file *shmem_file_setup(const char *name, loff_t size, vma_flags_t flags); struct file *shmem_kernel_file_setup(const char *name, loff_t size, vma_flags_t vma_flags); extern struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name, loff_t size, vma_flags_t flags); int shmem_zero_setup(struct vm_area_struct *vma); int shmem_zero_setup_desc(struct vm_area_desc *desc); extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); extern int shmem_lock(struct file *file, int lock, struct ucounts *ucounts); #ifdef CONFIG_SHMEM bool shmem_mapping(const struct address_space *mapping); #else static inline bool shmem_mapping(const struct address_space *mapping) { return false; } #endif /* CONFIG_SHMEM */ void shmem_unlock_mapping(struct address_space *mapping); struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); int shmem_writeout(struct folio *folio, struct swap_iocb **plug, struct list_head *folio_list); void shmem_truncate_range(struct inode *inode, loff_t start, uoff_t end); int shmem_unuse(unsigned int type); #ifdef CONFIG_TRANSPARENT_HUGEPAGE unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, loff_t write_end, bool shmem_huge_force); bool shmem_hpage_pmd_enabled(void); #else static inline unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, loff_t write_end, bool shmem_huge_force) { return 0; } static inline bool shmem_hpage_pmd_enabled(void) { return false; } #endif #ifdef CONFIG_SHMEM extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); extern void shmem_uncharge(struct inode *inode, long pages); #else static inline unsigned long shmem_swap_usage(struct vm_area_struct *vma) { return 0; } static inline void shmem_uncharge(struct inode *inode, long pages) { } #endif extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end); /* Flag allocation requirements to shmem_get_folio */ enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ SGP_NOALLOC, /* similar, but fail on hole or use fallocated page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ }; int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end, struct folio **foliop, enum sgp_type sgp); struct folio *shmem_read_folio_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp); static inline struct folio *shmem_read_folio(struct address_space *mapping, pgoff_t index) { return shmem_read_folio_gfp(mapping, index, mapping_gfp_mask(mapping)); } static inline struct page *shmem_read_mapping_page( struct address_space *mapping, pgoff_t index) { return shmem_read_mapping_page_gfp(mapping, index, mapping_gfp_mask(mapping)); } static inline bool shmem_file(struct file *file) { if (!IS_ENABLED(CONFIG_SHMEM)) return false; if (!file || !file->f_mapping) return false; return shmem_mapping(file->f_mapping); } /* Must be called with inode lock taken exclusive. */ static inline void shmem_freeze(struct inode *inode, bool freeze) { if (freeze) SHMEM_I(inode)->flags |= SHMEM_F_MAPPING_FROZEN; else SHMEM_I(inode)->flags &= ~SHMEM_F_MAPPING_FROZEN; } /* * If fallocate(FALLOC_FL_KEEP_SIZE) has been used, there may be pages * beyond i_size's notion of EOF, which fallocate has committed to reserving: * which split_huge_page() must therefore not delete. This use of a single * "fallocend" per inode errs on the side of not deleting a reservation when * in doubt: there are plenty of cases when it preserves unreserved pages. */ static inline pgoff_t shmem_fallocend(struct inode *inode, pgoff_t eof) { return max(eof, SHMEM_I(inode)->fallocend); } extern bool shmem_charge(struct inode *inode, long pages); #ifdef CONFIG_USERFAULTFD #ifdef CONFIG_SHMEM extern int shmem_mfill_atomic_pte(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, uffd_flags_t flags, struct folio **foliop); #else /* !CONFIG_SHMEM */ #define shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, \ src_addr, flags, foliop) ({ BUG(); 0; }) #endif /* CONFIG_SHMEM */ #endif /* CONFIG_USERFAULTFD */ /* * Used space is stored as unsigned 64-bit value in bytes but * quota core supports only signed 64-bit values so use that * as a limit */ #define SHMEM_QUOTA_MAX_SPC_LIMIT 0x7fffffffffffffffLL /* 2^63-1 */ #define SHMEM_QUOTA_MAX_INO_LIMIT 0x7fffffffffffffffLL #ifdef CONFIG_TMPFS_QUOTA extern const struct dquot_operations shmem_quota_operations; extern struct quota_format_type shmem_quota_format; #endif /* CONFIG_TMPFS_QUOTA */ #endif
1256 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM bpf_test_run #if !defined(_TRACE_BPF_TEST_RUN_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_BPF_TEST_RUN_H #include <linux/tracepoint.h> TRACE_EVENT(bpf_trigger_tp, TP_PROTO(int nonce), TP_ARGS(nonce), TP_STRUCT__entry( __field(int, nonce) ), TP_fast_assign( __entry->nonce = nonce; ), TP_printk("nonce %d", __entry->nonce) ); DECLARE_EVENT_CLASS(bpf_test_finish, TP_PROTO(int *err), TP_ARGS(err), TP_STRUCT__entry( __field(int, err) ), TP_fast_assign( __entry->err = *err; ), TP_printk("bpf_test_finish with err=%d", __entry->err) ); #ifdef DEFINE_EVENT_WRITABLE #undef BPF_TEST_RUN_DEFINE_EVENT #define BPF_TEST_RUN_DEFINE_EVENT(template, call, proto, args, size) \ DEFINE_EVENT_WRITABLE(template, call, PARAMS(proto), \ PARAMS(args), size) #else #undef BPF_TEST_RUN_DEFINE_EVENT #define BPF_TEST_RUN_DEFINE_EVENT(template, call, proto, args, size) \ DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args)) #endif BPF_TEST_RUN_DEFINE_EVENT(bpf_test_finish, bpf_test_finish, TP_PROTO(int *err), TP_ARGS(err), sizeof(int) ); #endif /* This part must be outside protection */ #include <trace/define_trace.h>
28 61 61 61 520 61 327 17 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _PROTO_MEMORY_H #define _PROTO_MEMORY_H #include <net/sock.h> #include <net/hotdata.h> /* 1 MB per cpu, in page units */ #define SK_MEMORY_PCPU_RESERVE (1 << (20 - PAGE_SHIFT)) static inline bool sk_has_memory_pressure(const struct sock *sk) { return sk->sk_prot->memory_pressure != NULL; } static inline bool proto_memory_pressure(const struct proto *prot) { if (!prot->memory_pressure) return false; return !!READ_ONCE(*prot->memory_pressure); } static inline bool sk_under_global_memory_pressure(const struct sock *sk) { return proto_memory_pressure(sk->sk_prot); } static inline bool sk_under_memory_pressure(const struct sock *sk) { if (!sk->sk_prot->memory_pressure) return false; if (mem_cgroup_sk_enabled(sk) && mem_cgroup_sk_under_memory_pressure(sk)) return true; if (sk->sk_bypass_prot_mem) return false; return !!READ_ONCE(*sk->sk_prot->memory_pressure); } static inline long proto_memory_allocated(const struct proto *prot) { return max(0L, atomic_long_read(prot->memory_allocated)); } static inline long sk_memory_allocated(const struct sock *sk) { return proto_memory_allocated(sk->sk_prot); } static inline void proto_memory_pcpu_drain(struct proto *proto) { int val = this_cpu_xchg(*proto->per_cpu_fw_alloc, 0); if (val) atomic_long_add(val, proto->memory_allocated); } static inline void sk_memory_allocated_add(const struct sock *sk, int val) { struct proto *proto = sk->sk_prot; val = this_cpu_add_return(*proto->per_cpu_fw_alloc, val); if (unlikely(val >= READ_ONCE(net_hotdata.sysctl_mem_pcpu_rsv))) proto_memory_pcpu_drain(proto); } static inline void sk_memory_allocated_sub(const struct sock *sk, int val) { struct proto *proto = sk->sk_prot; val = this_cpu_sub_return(*proto->per_cpu_fw_alloc, val); if (unlikely(val <= -READ_ONCE(net_hotdata.sysctl_mem_pcpu_rsv))) proto_memory_pcpu_drain(proto); } #endif /* _PROTO_MEMORY_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 // SPDX-License-Identifier: GPL-2.0-only #ifndef _NFT_SET_PIPAPO_H #include <linux/log2.h> #include <net/ipv6.h> /* For the maximum length of a field */ /* Count of concatenated fields depends on count of 32-bit nftables registers */ #define NFT_PIPAPO_MAX_FIELDS NFT_REG32_COUNT /* Restrict usage to multiple fields, make sure rbtree is used otherwise */ #define NFT_PIPAPO_MIN_FIELDS 2 /* Largest supported field size */ #define NFT_PIPAPO_MAX_BYTES (sizeof(struct in6_addr)) #define NFT_PIPAPO_MAX_BITS (NFT_PIPAPO_MAX_BYTES * BITS_PER_BYTE) /* Bits to be grouped together in table buckets depending on set size */ #define NFT_PIPAPO_GROUP_BITS_INIT NFT_PIPAPO_GROUP_BITS_SMALL_SET #define NFT_PIPAPO_GROUP_BITS_SMALL_SET 8 #define NFT_PIPAPO_GROUP_BITS_LARGE_SET 4 #define NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4 \ BUILD_BUG_ON((NFT_PIPAPO_GROUP_BITS_SMALL_SET != 8) || \ (NFT_PIPAPO_GROUP_BITS_LARGE_SET != 4)) #define NFT_PIPAPO_GROUPS_PER_BYTE(f) (BITS_PER_BYTE / (f)->bb) /* If a lookup table gets bigger than NFT_PIPAPO_LT_SIZE_HIGH, switch to the * small group width, and switch to the big group width if the table gets * smaller than NFT_PIPAPO_LT_SIZE_LOW. * * Picking 2MiB as threshold (for a single table) avoids as much as possible * crossing page boundaries on most architectures (x86-64 and MIPS huge pages, * ARMv7 supersections, POWER "large" pages, SPARC Level 1 regions, etc.), which * keeps performance nice in case kvmalloc() gives us non-contiguous areas. */ #define NFT_PIPAPO_LT_SIZE_THRESHOLD (1 << 21) #define NFT_PIPAPO_LT_SIZE_HYSTERESIS (1 << 16) #define NFT_PIPAPO_LT_SIZE_HIGH NFT_PIPAPO_LT_SIZE_THRESHOLD #define NFT_PIPAPO_LT_SIZE_LOW NFT_PIPAPO_LT_SIZE_THRESHOLD - \ NFT_PIPAPO_LT_SIZE_HYSTERESIS /* Fields are padded to 32 bits in input registers */ #define NFT_PIPAPO_GROUPS_PADDED_SIZE(f) \ (round_up((f)->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f), sizeof(u32))) #define NFT_PIPAPO_GROUPS_PADDING(f) \ (NFT_PIPAPO_GROUPS_PADDED_SIZE(f) - (f)->groups / \ NFT_PIPAPO_GROUPS_PER_BYTE(f)) /* Number of buckets given by 2 ^ n, with n bucket bits */ #define NFT_PIPAPO_BUCKETS(bb) (1 << (bb)) /* Each n-bit range maps to up to n * 2 rules */ #define NFT_PIPAPO_MAP_NBITS (const_ilog2(NFT_PIPAPO_MAX_BITS * 2)) /* Use the rest of mapping table buckets for rule indices, but it makes no sense * to exceed 32 bits */ #if BITS_PER_LONG == 64 #define NFT_PIPAPO_MAP_TOBITS 32 #else #define NFT_PIPAPO_MAP_TOBITS (BITS_PER_LONG - NFT_PIPAPO_MAP_NBITS) #endif /* ...which gives us the highest allowed index for a rule */ #define NFT_PIPAPO_RULE0_MAX ((1UL << (NFT_PIPAPO_MAP_TOBITS - 1)) \ - (1UL << NFT_PIPAPO_MAP_NBITS)) /* Definitions for vectorised implementations */ #ifdef NFT_PIPAPO_ALIGN #define NFT_PIPAPO_ALIGN_HEADROOM \ (NFT_PIPAPO_ALIGN - ARCH_KMALLOC_MINALIGN) #define NFT_PIPAPO_LT_ALIGN(lt) (PTR_ALIGN((lt), NFT_PIPAPO_ALIGN)) #else #define NFT_PIPAPO_ALIGN_HEADROOM 0 #define NFT_PIPAPO_LT_ALIGN(lt) (lt) #endif /* NFT_PIPAPO_ALIGN */ #define nft_pipapo_for_each_field(field, index, match) \ for ((field) = (match)->f, (index) = 0; \ (index) < (match)->field_count; \ (index)++, (field)++) /** * union nft_pipapo_map_bucket - Bucket of mapping table * @to: First rule number (in next field) this rule maps to * @n: Number of rules (in next field) this rule maps to * @e: If there's no next field, pointer to element this rule maps to */ union nft_pipapo_map_bucket { struct { #if BITS_PER_LONG == 64 static_assert(NFT_PIPAPO_MAP_TOBITS <= 32); u32 to; static_assert(NFT_PIPAPO_MAP_NBITS <= 32); u32 n; #else unsigned long to:NFT_PIPAPO_MAP_TOBITS; unsigned long n:NFT_PIPAPO_MAP_NBITS; #endif }; struct nft_pipapo_elem *e; }; /** * struct nft_pipapo_field - Lookup, mapping tables and related data for a field * @rules: Number of inserted rules * @bsize: Size of each bucket in lookup table, in longs * @rules_alloc: Number of allocated rules, always >= rules * @groups: Amount of bit groups * @bb: Number of bits grouped together in lookup table buckets * @lt: Lookup table: 'groups' rows of buckets * @mt: Mapping table: one bucket per rule */ struct nft_pipapo_field { unsigned int rules; unsigned int bsize; unsigned int rules_alloc; u8 groups; u8 bb; unsigned long *lt; union nft_pipapo_map_bucket *mt; }; /** * struct nft_pipapo_scratch - percpu data used for lookup and matching * @bh_lock: PREEMPT_RT local spinlock * @map_index: Current working bitmap index, toggled between field matches * @__map: store partial matching results during lookup */ struct nft_pipapo_scratch { local_lock_t bh_lock; u8 map_index; unsigned long __map[]; }; /** * struct nft_pipapo_match - Data used for lookup and matching * @field_count: Amount of fields in set * @bsize_max: Maximum lookup table bucket size of all fields, in longs * @scratch: Preallocated per-CPU maps for partial matching results * @rcu: Matching data is swapped on commits * @f: Fields, with lookup and mapping tables */ struct nft_pipapo_match { u8 field_count; unsigned int bsize_max; struct nft_pipapo_scratch * __percpu *scratch; struct rcu_head rcu; struct nft_pipapo_field f[] __counted_by(field_count); }; /** * struct nft_pipapo - Representation of a set * @match: Currently in-use matching data * @clone: Copy where pending insertions and deletions are kept * @width: Total bytes to be matched for one packet, including padding * @last_gc: Timestamp of last garbage collection run, jiffies * @gc_head: list of nft_trans_gc to queue up for mem reclaim */ struct nft_pipapo { struct nft_pipapo_match __rcu *match; struct nft_pipapo_match *clone; int width; unsigned long last_gc; struct list_head gc_head; }; struct nft_pipapo_elem; /** * struct nft_pipapo_elem - API-facing representation of single set element * @priv: element placeholder * @ext: nftables API extensions */ struct nft_pipapo_elem { struct nft_elem_priv priv; struct nft_set_ext ext; }; int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules, unsigned long *dst, const union nft_pipapo_map_bucket *mt, bool match_only); /** * pipapo_and_field_buckets_4bit() - Intersect 4-bit buckets * @f: Field including lookup table * @dst: Area to store result * @data: Input data selecting table buckets */ static inline void pipapo_and_field_buckets_4bit(const struct nft_pipapo_field *f, unsigned long *dst, const u8 *data) { unsigned long *lt = NFT_PIPAPO_LT_ALIGN(f->lt); int group; for (group = 0; group < f->groups; group += BITS_PER_BYTE / 4, data++) { u8 v; v = *data >> 4; __bitmap_and(dst, dst, lt + v * f->bsize, f->bsize * BITS_PER_LONG); lt += f->bsize * NFT_PIPAPO_BUCKETS(4); v = *data & 0x0f; __bitmap_and(dst, dst, lt + v * f->bsize, f->bsize * BITS_PER_LONG); lt += f->bsize * NFT_PIPAPO_BUCKETS(4); } } /** * pipapo_and_field_buckets_8bit() - Intersect 8-bit buckets * @f: Field including lookup table * @dst: Area to store result * @data: Input data selecting table buckets */ static inline void pipapo_and_field_buckets_8bit(const struct nft_pipapo_field *f, unsigned long *dst, const u8 *data) { unsigned long *lt = NFT_PIPAPO_LT_ALIGN(f->lt); int group; for (group = 0; group < f->groups; group++, data++) { __bitmap_and(dst, dst, lt + *data * f->bsize, f->bsize * BITS_PER_LONG); lt += f->bsize * NFT_PIPAPO_BUCKETS(8); } } /** * pipapo_estimate_size() - Estimate worst-case for set size * @desc: Set description, element count and field description used here * * The size for this set type can vary dramatically, as it depends on the number * of rules (composing netmasks) the entries expand to. We compute the worst * case here. * * In general, for a non-ranged entry or a single composing netmask, we need * one bit in each of the sixteen NFT_PIPAPO_BUCKETS, for each 4-bit group (that * is, each input bit needs four bits of matching data), plus a bucket in the * mapping table for each field. * * Return: worst-case set size in bytes, 0 on any overflow */ static u64 pipapo_estimate_size(const struct nft_set_desc *desc) { unsigned long entry_size; u64 size; int i; for (i = 0, entry_size = 0; i < desc->field_count; i++) { unsigned long rules; if (desc->field_len[i] > NFT_PIPAPO_MAX_BYTES) return 0; /* Worst-case ranges for each concatenated field: each n-bit * field can expand to up to n * 2 rules in each bucket, and * each rule also needs a mapping bucket. */ rules = ilog2(desc->field_len[i] * BITS_PER_BYTE) * 2; entry_size += rules * NFT_PIPAPO_BUCKETS(NFT_PIPAPO_GROUP_BITS_INIT) / BITS_PER_BYTE; entry_size += rules * sizeof(union nft_pipapo_map_bucket); } /* Rules in lookup and mapping tables are needed for each entry */ size = desc->size * entry_size; if (size && div_u64(size, desc->size) != entry_size) return 0; size += sizeof(struct nft_pipapo) + sizeof(struct nft_pipapo_match) * 2; size += sizeof(struct nft_pipapo_field) * desc->field_count; return size; } /** * pipapo_resmap_init() - Initialise result map before first use * @m: Matching data, including mapping table * @res_map: Result map * * Initialize all bits covered by the first field to one, so that after * the first step, only the matching bits of the first bit group remain. * * If other fields have a large bitmap, set remainder of res_map to 0. */ static inline void pipapo_resmap_init(const struct nft_pipapo_match *m, unsigned long *res_map) { const struct nft_pipapo_field *f = m->f; int i; for (i = 0; i < f->bsize; i++) res_map[i] = ULONG_MAX; for (i = f->bsize; i < m->bsize_max; i++) res_map[i] = 0ul; } #endif /* _NFT_SET_PIPAPO_H */
11 11 9 2 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 // SPDX-License-Identifier: GPL-2.0-only /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Implementation of the Transmission Control Protocol(TCP). * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * Corey Minyard <wf-rch!minyard@relay.EU.net> * Florian La Roche, <flla@stud.uni-sb.de> * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> * Linus Torvalds, <torvalds@cs.helsinki.fi> * Alan Cox, <gw4pts@gw4pts.ampr.org> * Matthew Dillon, <dillon@apollo.west.oic.com> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Jorge Cwik, <jorge@laser.satlink.net> */ #include <net/tcp.h> #include <net/tcp_ecn.h> #include <net/xfrm.h> #include <net/busy_poll.h> #include <net/rstreason.h> #include <net/psp.h> static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) { if (seq == s_win) return true; if (after(end_seq, s_win) && before(seq, e_win)) return true; return seq == e_win && seq == end_seq; } static enum tcp_tw_status tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw, const struct sk_buff *skb, int mib_idx) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); if (!tcp_oow_rate_limited(twsk_net(tw), skb, mib_idx, &tcptw->tw_last_oow_ack_time)) { /* Send ACK. Note, we do not put the bucket, * it will be released by caller. */ return TCP_TW_ACK_OOW; } /* We are rate-limiting, so just release the tw sock and drop skb. */ inet_twsk_put(tw); return TCP_TW_SUCCESS; } static void twsk_rcv_nxt_update(struct tcp_timewait_sock *tcptw, u32 seq, u32 rcv_nxt) { #ifdef CONFIG_TCP_AO struct tcp_ao_info *ao; ao = rcu_dereference(tcptw->ao_info); if (unlikely(ao && seq < rcv_nxt)) WRITE_ONCE(ao->rcv_sne, ao->rcv_sne + 1); #endif WRITE_ONCE(tcptw->tw_rcv_nxt, seq); } /* * * Main purpose of TIME-WAIT state is to close connection gracefully, * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN * (and, probably, tail of data) and one or more our ACKs are lost. * * What is TIME-WAIT timeout? It is associated with maximal packet * lifetime in the internet, which results in wrong conclusion, that * it is set to catch "old duplicate segments" wandering out of their path. * It is not quite correct. This timeout is calculated so that it exceeds * maximal retransmission timeout enough to allow to lose one (or more) * segments sent by peer and our ACKs. This time may be calculated from RTO. * * When TIME-WAIT socket receives RST, it means that another end * finally closed and we are allowed to kill TIME-WAIT too. * * Second purpose of TIME-WAIT is catching old duplicate segments. * Well, certainly it is pure paranoia, but if we load TIME-WAIT * with this semantics, we MUST NOT kill TIME-WAIT state with RSTs. * * If we invented some more clever way to catch duplicates * (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs. * * The algorithm below is based on FORMAL INTERPRETATION of RFCs. * When you compare it to RFCs, please, read section SEGMENT ARRIVES * from the very beginning. * * NOTE. With recycling (and later with fin-wait-2) TW bucket * is _not_ stateless. It means, that strictly speaking we must * spinlock it. I do not want! Well, probability of misbehaviour * is ridiculously low and, seems, we could use some mb() tricks * to avoid misread sequence numbers, states etc. --ANK * * We don't need to initialize tmp_out.sack_ok as we don't use the results */ enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, const struct tcphdr *th, u32 *tw_isn, enum skb_drop_reason *drop_reason) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); u32 rcv_nxt = READ_ONCE(tcptw->tw_rcv_nxt); struct tcp_options_received tmp_opt; enum skb_drop_reason psp_drop; bool paws_reject = false; int ts_recent_stamp; /* Instead of dropping immediately, wait to see what value is * returned. We will accept a non psp-encapsulated syn in the * case where TCP_TW_SYN is returned. */ psp_drop = psp_twsk_rx_policy_check(tw, skb); tmp_opt.saw_tstamp = 0; ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp); if (th->doff > (sizeof(*th) >> 2) && ts_recent_stamp) { tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL); if (tmp_opt.saw_tstamp) { if (tmp_opt.rcv_tsecr) tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset; tmp_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent); tmp_opt.ts_recent_stamp = ts_recent_stamp; paws_reject = tcp_paws_reject(&tmp_opt, th->rst); } } if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) { /* Just repeat all the checks of tcp_rcv_state_process() */ if (psp_drop) goto out_put; /* Out of window, send ACK */ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, rcv_nxt, rcv_nxt + tcptw->tw_rcv_wnd)) return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDFINWAIT2); if (th->rst) goto kill; if (th->syn && !before(TCP_SKB_CB(skb)->seq, rcv_nxt)) return TCP_TW_RST; /* Dup ACK? */ if (!th->ack || !after(TCP_SKB_CB(skb)->end_seq, rcv_nxt) || TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { inet_twsk_put(tw); return TCP_TW_SUCCESS; } /* New data or FIN. If new data arrive after half-duplex close, * reset. */ if (!th->fin || TCP_SKB_CB(skb)->end_seq != rcv_nxt + 1) return TCP_TW_RST; /* FIN arrived, enter true time-wait state. */ WRITE_ONCE(tw->tw_substate, TCP_TIME_WAIT); twsk_rcv_nxt_update(tcptw, TCP_SKB_CB(skb)->end_seq, rcv_nxt); if (tmp_opt.saw_tstamp) { u64 ts = tcp_clock_ms(); WRITE_ONCE(tw->tw_entry_stamp, ts); WRITE_ONCE(tcptw->tw_ts_recent_stamp, div_u64(ts, MSEC_PER_SEC)); WRITE_ONCE(tcptw->tw_ts_recent, tmp_opt.rcv_tsval); } inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); return TCP_TW_ACK; } /* * Now real TIME-WAIT state. * * RFC 1122: * "When a connection is [...] on TIME-WAIT state [...] * [a TCP] MAY accept a new SYN from the remote TCP to * reopen the connection directly, if it: * * (1) assigns its initial sequence number for the new * connection to be larger than the largest sequence * number it used on the previous connection incarnation, * and * * (2) returns to TIME-WAIT state if the SYN turns out * to be an old duplicate". */ if (!paws_reject && (TCP_SKB_CB(skb)->seq == rcv_nxt && (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) { /* In window segment, it may be only reset or bare ack. */ if (psp_drop) goto out_put; if (th->rst) { /* This is TIME_WAIT assassination, in two flavors. * Oh well... nobody has a sufficient solution to this * protocol bug yet. */ if (!READ_ONCE(twsk_net(tw)->ipv4.sysctl_tcp_rfc1337)) { kill: inet_twsk_deschedule_put(tw); return TCP_TW_SUCCESS; } } else { inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); } if (tmp_opt.saw_tstamp) { WRITE_ONCE(tcptw->tw_ts_recent, tmp_opt.rcv_tsval); WRITE_ONCE(tcptw->tw_ts_recent_stamp, ktime_get_seconds()); } inet_twsk_put(tw); return TCP_TW_SUCCESS; } /* Out of window segment. All the segments are ACKed immediately. The only exception is new SYN. We accept it, if it is not old duplicate and we are not in danger to be killed by delayed old duplicates. RFC check is that it has newer sequence number works at rates <40Mbit/sec. However, if paws works, it is reliable AND even more, we even may relax silly seq space cutoff. RED-PEN: we violate main RFC requirement, if this SYN will appear old duplicate (i.e. we receive RST in reply to SYN-ACK), we must return socket to time-wait state. It is not good, but not fatal yet. */ if (th->syn && !th->rst && !th->ack && !paws_reject && (after(TCP_SKB_CB(skb)->seq, rcv_nxt) || (tmp_opt.saw_tstamp && (s32)(READ_ONCE(tcptw->tw_ts_recent) - tmp_opt.rcv_tsval) < 0))) { u32 isn = tcptw->tw_snd_nxt + 65535 + 2; if (isn == 0) isn++; *tw_isn = isn; return TCP_TW_SYN; } if (psp_drop) goto out_put; if (paws_reject) { *drop_reason = SKB_DROP_REASON_TCP_RFC7323_TW_PAWS; __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWS_TW_REJECTED); } if (!th->rst) { /* In this case we must reset the TIMEWAIT timer. * * If it is ACKless SYN it may be both old duplicate * and new good SYN with random sequence number <rcv_nxt. * Do not reschedule in the last case. */ if (paws_reject || th->ack) inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); } out_put: inet_twsk_put(tw); return TCP_TW_SUCCESS; } EXPORT_IPV6_MOD(tcp_timewait_state_process); static void tcp_time_wait_init(struct sock *sk, struct tcp_timewait_sock *tcptw) { #ifdef CONFIG_TCP_MD5SIG const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; /* * The timewait bucket does not have the key DB from the * sock structure. We just make a quick copy of the * md5 key being used (if indeed we are using one) * so the timewait ack generating code has the key. */ tcptw->tw_md5_key = NULL; if (!static_branch_unlikely(&tcp_md5_needed.key)) return; key = tp->af_specific->md5_lookup(sk, sk); if (key) { tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC); if (!tcptw->tw_md5_key) return; if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) goto out_free; } return; out_free: WARN_ON_ONCE(1); kfree(tcptw->tw_md5_key); tcptw->tw_md5_key = NULL; #endif } /* * Move a socket to time-wait or dead fin-wait-2 state. */ void tcp_time_wait(struct sock *sk, int state, int timeo) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct inet_timewait_sock *tw; tw = inet_twsk_alloc(sk, &net->ipv4.tcp_death_row, state); if (tw) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); tw->tw_mark = sk->sk_mark; tw->tw_priority = READ_ONCE(sk->sk_priority); tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; /* refreshed when we enter true TIME-WAIT state */ tw->tw_entry_stamp = tcp_time_stamp_ms(tp); tcptw->tw_rcv_nxt = tp->rcv_nxt; tcptw->tw_snd_nxt = tp->snd_nxt; tcptw->tw_rcv_wnd = tcp_receive_window(tp); tcptw->tw_ts_recent = tp->rx_opt.ts_recent; tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; tcptw->tw_ts_offset = tp->tsoffset; tw->tw_usec_ts = tp->tcp_usec_ts; tcptw->tw_last_oow_ack_time = 0; tcptw->tw_tx_delay = tp->tcp_tx_delay; tw->tw_txhash = sk->sk_txhash; tw->tw_tx_queue_mapping = sk->sk_tx_queue_mapping; #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING tw->tw_rx_queue_mapping = sk->sk_rx_queue_mapping; #endif #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); tw->tw_v6_daddr = sk->sk_v6_daddr; tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; tw->tw_tclass = np->tclass; tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK); tw->tw_ipv6only = sk->sk_ipv6only; } #endif tcp_time_wait_init(sk, tcptw); tcp_ao_time_wait(tcptw, tp); /* Get the TIME_WAIT timeout firing. */ if (timeo < rto) timeo = rto; if (state == TCP_TIME_WAIT) timeo = TCP_TIMEWAIT_LEN; /* Linkage updates. * Note that access to tw after this point is illegal. */ inet_twsk_hashdance_schedule(tw, sk, net->ipv4.tcp_death_row.hashinfo, timeo); } else { /* Sorry, if we're out of memory, just CLOSE this * socket up. We've got bigger problems than * non-graceful socket closings. */ NET_INC_STATS(net, LINUX_MIB_TCPTIMEWAITOVERFLOW); } tcp_update_metrics(sk); tcp_done(sk); } EXPORT_SYMBOL(tcp_time_wait); void tcp_twsk_destructor(struct sock *sk) { #ifdef CONFIG_TCP_MD5SIG if (static_branch_unlikely(&tcp_md5_needed.key)) { struct tcp_timewait_sock *twsk = tcp_twsk(sk); if (twsk->tw_md5_key) { kfree(twsk->tw_md5_key); static_branch_slow_dec_deferred(&tcp_md5_needed); } } #endif tcp_ao_destroy_sock(sk, true); psp_twsk_assoc_free(inet_twsk(sk)); } void tcp_twsk_purge(struct list_head *net_exit_list) { bool purged_once = false; struct net *net; list_for_each_entry(net, net_exit_list, exit_list) { if (net->ipv4.tcp_death_row.hashinfo->pernet) { /* Even if tw_refcount == 1, we must clean up kernel reqsk */ inet_twsk_purge(net->ipv4.tcp_death_row.hashinfo); } else if (!purged_once) { inet_twsk_purge(&tcp_hashinfo); purged_once = true; } } } /* Warning : This function is called without sk_listener being locked. * Be sure to read socket fields once, as their value could change under us. */ void tcp_openreq_init_rwin(struct request_sock *req, const struct sock *sk_listener, const struct dst_entry *dst) { struct inet_request_sock *ireq = inet_rsk(req); const struct tcp_sock *tp = tcp_sk(sk_listener); int full_space = tcp_full_space(sk_listener); u32 window_clamp; __u8 rcv_wscale; u32 rcv_wnd; int mss; mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); window_clamp = READ_ONCE(tp->window_clamp); /* Set this up on the first call only */ req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW); /* limit the window selection if the user enforce a smaller rx buffer */ if (sk_listener->sk_userlocks & SOCK_RCVBUF_LOCK && (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) req->rsk_window_clamp = full_space; rcv_wnd = tcp_rwnd_init_bpf((struct sock *)req); if (rcv_wnd == 0) rcv_wnd = dst_metric(dst, RTAX_INITRWND); else if (full_space < rcv_wnd * mss) full_space = rcv_wnd * mss; /* tcp_full_space because it is guaranteed to be the first packet */ tcp_select_initial_window(sk_listener, full_space, mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, rcv_wnd); ireq->rcv_wscale = rcv_wscale; } static void tcp_ecn_openreq_child(struct sock *sk, const struct request_sock *req, const struct sk_buff *skb) { const struct tcp_request_sock *treq = tcp_rsk(req); struct tcp_sock *tp = tcp_sk(sk); if (treq->accecn_ok) { tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_snt = treq->syn_ect_snt; tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); tp->saw_accecn_opt = treq->saw_accecn_opt; if (treq->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_SEND) tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_SEND); if (treq->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_RECV) tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV); tp->prev_ecnfield = treq->syn_ect_rcv; tp->accecn_opt_demand = 1; tcp_ecn_received_counters_payload(sk, skb); } else { if (inet_rsk(req)->ecn_ok && !tcp_ca_no_fallback_rfc3168(sk)) tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); else tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); } } void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) { struct inet_connection_sock *icsk = inet_csk(sk); u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); bool ca_got_dst = false; if (ca_key != TCP_CA_UNSPEC) { const struct tcp_congestion_ops *ca; rcu_read_lock(); ca = tcp_ca_find_key(ca_key); if (likely(ca && bpf_try_module_get(ca, ca->owner))) { icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst); icsk->icsk_ca_ops = ca; ca_got_dst = true; } rcu_read_unlock(); } /* If no valid choice made yet, assign current system default ca. */ if (!ca_got_dst && (!icsk->icsk_ca_setsockopt || !bpf_try_module_get(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner))) tcp_assign_congestion_control(sk); tcp_set_ca_state(sk, TCP_CA_Open); } EXPORT_IPV6_MOD_GPL(tcp_ca_openreq_child); static void smc_check_reset_syn_req(const struct tcp_sock *oldtp, struct request_sock *req, struct tcp_sock *newtp) { #if IS_ENABLED(CONFIG_SMC) struct inet_request_sock *ireq; if (static_branch_unlikely(&tcp_have_smc)) { ireq = inet_rsk(req); if (oldtp->syn_smc && !ireq->smc_ok) newtp->syn_smc = 0; } #endif } /* This is not only more efficient than what we used to do, it eliminates * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM * * Actually, we could lots of memory writes here. tp of listening * socket contains all necessary default parameters. */ struct sock *tcp_create_openreq_child(const struct sock *sk, struct request_sock *req, struct sk_buff *skb) { struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC); const struct inet_request_sock *ireq = inet_rsk(req); struct tcp_request_sock *treq = tcp_rsk(req); struct inet_connection_sock *newicsk; const struct tcp_sock *oldtp; struct tcp_sock *newtp; u32 seq; if (!newsk) return NULL; newicsk = inet_csk(newsk); newtp = tcp_sk(newsk); oldtp = tcp_sk(sk); smc_check_reset_syn_req(oldtp, req, newtp); /* Now setup tcp_sock */ newtp->pred_flags = 0; seq = treq->rcv_isn + 1; newtp->rcv_wup = seq; WRITE_ONCE(newtp->copied_seq, seq); WRITE_ONCE(newtp->rcv_nxt, seq); newtp->segs_in = 1; seq = treq->snt_isn + 1; newtp->snd_sml = newtp->snd_una = seq; WRITE_ONCE(newtp->snd_nxt, seq); newtp->snd_up = seq; INIT_LIST_HEAD(&newtp->tsq_node); INIT_LIST_HEAD(&newtp->tsorted_sent_queue); tcp_init_wl(newtp, treq->rcv_isn); minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U); newicsk->icsk_ack.lrcvtime = tcp_jiffies32; newtp->lsndtime = tcp_jiffies32; newsk->sk_txhash = READ_ONCE(treq->txhash); newtp->total_retrans = req->num_retrans; tcp_init_xmit_timers(newsk); WRITE_ONCE(newtp->write_seq, newtp->pushed_seq = treq->snt_isn + 1); if (sock_flag(newsk, SOCK_KEEPOPEN)) tcp_reset_keepalive_timer(newsk, keepalive_time_when(newtp)); newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; newtp->rx_opt.sack_ok = ireq->sack_ok; newtp->window_clamp = req->rsk_window_clamp; newtp->rcv_ssthresh = req->rsk_rcv_wnd; newtp->rcv_wnd = req->rsk_rcv_wnd; newtp->rx_opt.wscale_ok = ireq->wscale_ok; if (newtp->rx_opt.wscale_ok) { newtp->rx_opt.snd_wscale = ireq->snd_wscale; newtp->rx_opt.rcv_wscale = ireq->rcv_wscale; } else { newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0; newtp->window_clamp = min(newtp->window_clamp, 65535U); } newtp->snd_wnd = ntohs(tcp_hdr(skb)->window) << newtp->rx_opt.snd_wscale; newtp->max_window = newtp->snd_wnd; if (newtp->rx_opt.tstamp_ok) { newtp->tcp_usec_ts = treq->req_usec_ts; newtp->rx_opt.ts_recent = req->ts_recent; newtp->rx_opt.ts_recent_stamp = ktime_get_seconds(); newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; } else { newtp->tcp_usec_ts = 0; newtp->rx_opt.ts_recent_stamp = 0; newtp->tcp_header_len = sizeof(struct tcphdr); } if (req->num_timeout) { newtp->total_rto = req->num_timeout; newtp->undo_marker = treq->snt_isn; if (newtp->tcp_usec_ts) { newtp->retrans_stamp = treq->snt_synack; newtp->total_rto_time = (u32)(tcp_clock_us() - newtp->retrans_stamp) / USEC_PER_MSEC; } else { newtp->retrans_stamp = div_u64(treq->snt_synack, USEC_PER_SEC / TCP_TS_HZ); newtp->total_rto_time = tcp_clock_ms() - newtp->retrans_stamp; } newtp->total_rto_recoveries = 1; } newtp->tsoffset = treq->ts_off; #ifdef CONFIG_TCP_MD5SIG newtp->md5sig_info = NULL; /*XXX*/ #endif #ifdef CONFIG_TCP_AO newtp->ao_info = NULL; if (tcp_rsk_used_ao(req)) { struct tcp_ao_key *ao_key; ao_key = treq->af_specific->ao_lookup(sk, req, tcp_rsk(req)->ao_keyid, -1); if (ao_key) newtp->tcp_header_len += tcp_ao_len_aligned(ao_key); } #endif if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; newtp->rx_opt.mss_clamp = req->mss; tcp_ecn_openreq_child(newsk, req, skb); newtp->fastopen_req = NULL; RCU_INIT_POINTER(newtp->fastopen_rsk, NULL); newtp->bpf_chg_cc_inprogress = 0; tcp_bpf_clone(sk, newsk); __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); xa_init_flags(&newsk->sk_user_frags, XA_FLAGS_ALLOC1); return newsk; } EXPORT_SYMBOL(tcp_create_openreq_child); /* * Process an incoming packet for SYN_RECV sockets represented as a * request_sock. Normally sk is the listener socket but for TFO it * points to the child socket. * * XXX (TFO) - The current impl contains a special check for ack * validation and inside tcp_v4_reqsk_send_ack(). Can we do better? * * We don't need to initialize tmp_opt.sack_ok as we don't use the results * * Note: If @fastopen is true, this can be called from process context. * Otherwise, this is from BH context. */ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, bool fastopen, bool *req_stolen, enum skb_drop_reason *drop_reason) { struct tcp_options_received tmp_opt; struct sock *child; const struct tcphdr *th = tcp_hdr(skb); __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); bool tsecr_reject = false; bool paws_reject = false; bool own_req; tmp_opt.saw_tstamp = 0; tmp_opt.accecn = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL); if (tmp_opt.saw_tstamp) { tmp_opt.ts_recent = req->ts_recent; if (tmp_opt.rcv_tsecr) { if (inet_rsk(req)->tstamp_ok && !fastopen) tsecr_reject = !between(tmp_opt.rcv_tsecr, tcp_rsk(req)->snt_tsval_first, READ_ONCE(tcp_rsk(req)->snt_tsval_last)); tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off; } /* We do not store true stamp, but it is not required, * it can be estimated (approximately) * from another data. */ tmp_opt.ts_recent_stamp = ktime_get_seconds() - tcp_reqsk_timeout(req) / HZ; paws_reject = tcp_paws_reject(&tmp_opt, th->rst); } } /* Check for pure retransmitted SYN. */ if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn && flg == TCP_FLAG_SYN && !paws_reject) { /* * RFC793 draws (Incorrectly! It was fixed in RFC1122) * this case on figure 6 and figure 8, but formal * protocol description says NOTHING. * To be more exact, it says that we should send ACK, * because this segment (at least, if it has no data) * is out of window. * * CONCLUSION: RFC793 (even with RFC1122) DOES NOT * describe SYN-RECV state. All the description * is wrong, we cannot believe to it and should * rely only on common sense and implementation * experience. * * Enforce "SYN-ACK" according to figure 8, figure 6 * of RFC793, fixed by RFC1122. * * Note that even if there is new data in the SYN packet * they will be thrown away too. * * Reset timer after retransmitting SYNACK, similar to * the idea of fast retransmit in recovery. */ if (!tcp_oow_rate_limited(sock_net(sk), skb, LINUX_MIB_TCPACKSKIPPEDSYNRECV, &tcp_rsk(req)->last_oow_ack_time)) { if (tcp_rsk(req)->accecn_ok) { u8 ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; tcp_rsk(req)->syn_ect_rcv = ect_rcv; if (tcp_accecn_ace(tcp_hdr(skb)) == 0x0) tcp_rsk(req)->accecn_fail_mode |= TCP_ACCECN_ACE_FAIL_RECV; } if (!tcp_rtx_synack(sk, req)) { unsigned long expires = jiffies; if (req->num_retrans > 1 && tcp_rsk(req)->accecn_ok) tcp_rsk(req)->accecn_fail_mode |= TCP_ACCECN_ACE_FAIL_SEND; expires += tcp_reqsk_timeout(req); if (!fastopen) mod_timer_pending(&req->rsk_timer, expires); else req->rsk_timer.expires = expires; } } return NULL; } /* Further reproduces section "SEGMENT ARRIVES" for state SYN-RECEIVED of RFC793. It is broken, however, it does not work only when SYNs are crossed. You would think that SYN crossing is impossible here, since we should have a SYN_SENT socket (from connect()) on our end, but this is not true if the crossed SYNs were sent to both ends by a malicious third party. We must defend against this, and to do that we first verify the ACK (as per RFC793, page 36) and reset if it is invalid. Is this a true full defense? To convince ourselves, let us consider a way in which the ACK test can still pass in this 'malicious crossed SYNs' case. Malicious sender sends identical SYNs (and thus identical sequence numbers) to both A and B: A: gets SYN, seq=7 B: gets SYN, seq=7 By our good fortune, both A and B select the same initial send sequence number of seven :-) A: sends SYN|ACK, seq=7, ack_seq=8 B: sends SYN|ACK, seq=7, ack_seq=8 So we are now A eating this SYN|ACK, ACK test passes. So does sequence test, SYN is truncated, and thus we consider it a bare ACK. If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this bare ACK. Otherwise, we create an established connection. Both ends (listening sockets) accept the new incoming connection and try to talk to each other. 8-) Note: This case is both harmless, and rare. Possibility is about the same as us discovering intelligent life on another plant tomorrow. But generally, we should (RFC lies!) to accept ACK from SYNACK both here and in tcp_rcv_state_process(). tcp_rcv_state_process() does not, hence, we do not too. Note that the case is absolutely generic: we cannot optimize anything here without violating protocol. All the checks must be made before attempt to create socket. */ /* RFC793 page 36: "If the connection is in any non-synchronized state ... * and the incoming segment acknowledges something not yet * sent (the segment carries an unacceptable ACK) ... * a reset is sent." * * Invalid ACK: reset will be sent by listening socket. * Note that the ACK validity check for a Fast Open socket is done * elsewhere and is checked directly against the child socket rather * than req because user data may have been sent out. */ if ((flg & TCP_FLAG_ACK) && !fastopen && (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1)) return sk; /* RFC793: "first check sequence number". */ if (paws_reject || tsecr_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + tcp_synack_window(req))) { /* Out of window: send ACK and drop. */ if (!(flg & TCP_FLAG_RST) && !tcp_oow_rate_limited(sock_net(sk), skb, LINUX_MIB_TCPACKSKIPPEDSYNRECV, &tcp_rsk(req)->last_oow_ack_time)) req->rsk_ops->send_ack(sk, skb, req); if (paws_reject) { SKB_DR_SET(*drop_reason, TCP_RFC7323_PAWS); NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); } else if (tsecr_reject) { SKB_DR_SET(*drop_reason, TCP_RFC7323_TSECR); NET_INC_STATS(sock_net(sk), LINUX_MIB_TSECRREJECTED); } else { SKB_DR_SET(*drop_reason, TCP_OVERWINDOW); } return NULL; } /* In sequence, PAWS is OK. */ if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { /* Truncate SYN, it is out of window starting at tcp_rsk(req)->rcv_isn + 1. */ flg &= ~TCP_FLAG_SYN; } /* RFC793: "second check the RST bit" and * "fourth, check the SYN bit" */ if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) { TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS); goto embryonic_reset; } /* ACK sequence verified above, just make sure ACK is * set. If ACK not set, just silently drop the packet. * * XXX (TFO) - if we ever allow "data after SYN", the * following check needs to be removed. */ if (!(flg & TCP_FLAG_ACK)) return NULL; if (tcp_rsk(req)->accecn_ok && tmp_opt.accecn && tcp_rsk(req)->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { u8 saw_opt = tcp_accecn_option_init(skb, tmp_opt.accecn); tcp_rsk(req)->saw_accecn_opt = saw_opt; if (tcp_rsk(req)->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) { u8 fail_mode = TCP_ACCECN_OPT_FAIL_RECV; tcp_rsk(req)->accecn_fail_mode |= fail_mode; } } /* For Fast Open no more processing is needed (sk is the * child socket). */ if (fastopen) return sk; /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ if (req->num_timeout < READ_ONCE(inet_csk(sk)->icsk_accept_queue.rskq_defer_accept) && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { inet_rsk(req)->acked = 1; __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP); return NULL; } /* OK, ACK is valid, create big socket and * feed this segment to it. It will repeat all * the tests. THIS SEGMENT MUST MOVE SOCKET TO * ESTABLISHED STATE. If it will be dropped after * socket is created, wait for troubles. */ child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, req, &own_req, NULL); if (!child) goto listen_overflow; if (own_req && tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt)) tcp_sk(child)->rx_opt.ts_recent = tmp_opt.rcv_tsval; if (own_req && rsk_drop_req(req)) { reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req); inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req); return child; } sock_rps_save_rxhash(child, skb); tcp_synack_rtt_meas(child, req); *req_stolen = !own_req; return inet_csk_complete_hashdance(sk, child, req, own_req); listen_overflow: SKB_DR_SET(*drop_reason, TCP_LISTEN_OVERFLOW); if (sk != req->rsk_listener) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow)) { inet_rsk(req)->acked = 1; return NULL; } embryonic_reset: if (!(flg & TCP_FLAG_RST)) { /* Received a bad SYN pkt - for TFO We try not to reset * the local connection unless it's really necessary to * avoid becoming vulnerable to outside attack aiming at * resetting legit local connections. */ req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_INVALID_SYN); } else if (fastopen) { /* received a valid RST pkt */ reqsk_fastopen_remove(sk, req, true); tcp_reset(sk, skb); } if (!fastopen) { bool unlinked = inet_csk_reqsk_queue_drop(sk, req); if (unlinked) __NET_INC_STATS(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); *req_stolen = !unlinked; } return NULL; } EXPORT_IPV6_MOD(tcp_check_req); /* * Queue segment on the new socket if the new socket is active, * otherwise we just shortcircuit this and continue with * the new socket. * * For the vast majority of cases child->sk_state will be TCP_SYN_RECV * when entering. But other states are possible due to a race condition * where after __inet_lookup_established() fails but before the listener * locked is obtained, other packets cause the same connection to * be created. */ enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child, struct sk_buff *skb) __releases(&((child)->sk_lock.slock)) { enum skb_drop_reason reason = SKB_NOT_DROPPED_YET; int state = child->sk_state; /* record sk_napi_id and sk_rx_queue_mapping of child. */ sk_mark_napi_id_set(child, skb); tcp_segs_in(tcp_sk(child), skb); if (!sock_owned_by_user(child)) { reason = tcp_rcv_state_process(child, skb); /* Wakeup parent, send SIGIO */ if (state == TCP_SYN_RECV && child->sk_state != state) READ_ONCE(parent->sk_data_ready)(parent); } else { /* Alas, it is possible again, because we do lookup * in main socket hash table and lock on listening * socket does not protect us more. */ __sk_add_backlog(child, skb); } bh_unlock_sock(child); sock_put(child); return reason; } EXPORT_IPV6_MOD(tcp_child_process);
667 668 666 519 667 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 // SPDX-License-Identifier: GPL-2.0-only /* * fs/kernfs/symlink.c - kernfs symlink implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> */ #include <linux/fs.h> #include <linux/gfp.h> #include <linux/namei.h> #include "kernfs-internal.h" /** * kernfs_create_link - create a symlink * @parent: directory to create the symlink in * @name: name of the symlink * @target: target node for the symlink to point to * * Return: the created node on success, ERR_PTR() value on error. * Ownership of the link matches ownership of the target. */ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, const char *name, struct kernfs_node *target) { struct kernfs_node *kn; int error; kuid_t uid = GLOBAL_ROOT_UID; kgid_t gid = GLOBAL_ROOT_GID; if (target->iattr) { uid = target->iattr->ia_uid; gid = target->iattr->ia_gid; } kn = kernfs_new_node(parent, name, S_IFLNK|0777, uid, gid, KERNFS_LINK); if (!kn) return ERR_PTR(-ENOMEM); if (kernfs_ns_enabled(parent)) kn->ns = target->ns; kn->symlink.target_kn = target; kernfs_get(target); /* ref owned by symlink */ error = kernfs_add_one(kn); if (!error) return kn; kernfs_put(kn); return ERR_PTR(error); } static int kernfs_get_target_path(struct kernfs_node *parent, struct kernfs_node *target, char *path) { struct kernfs_node *base, *kn; char *s = path; int len = 0; /* go up to the root, stop at the base */ base = parent; while (kernfs_parent(base)) { kn = kernfs_parent(target); while (kernfs_parent(kn) && base != kn) kn = kernfs_parent(kn); if (base == kn) break; if ((s - path) + 3 >= PATH_MAX) return -ENAMETOOLONG; strcpy(s, "../"); s += 3; base = kernfs_parent(base); } /* determine end of target string for reverse fillup */ kn = target; while (kernfs_parent(kn) && kn != base) { len += strlen(kernfs_rcu_name(kn)) + 1; kn = kernfs_parent(kn); } /* check limits */ if (len < 2) return -EINVAL; len--; if ((s - path) + len >= PATH_MAX) return -ENAMETOOLONG; /* reverse fillup of target string from target to base */ kn = target; while (kernfs_parent(kn) && kn != base) { const char *name = kernfs_rcu_name(kn); int slen = strlen(name); len -= slen; memcpy(s + len, name, slen); if (len) s[--len] = '/'; kn = kernfs_parent(kn); } return 0; } static int kernfs_getlink(struct inode *inode, char *path) { struct kernfs_node *kn = inode->i_private; struct kernfs_node *parent; struct kernfs_node *target = kn->symlink.target_kn; struct kernfs_root *root = kernfs_root(kn); int error; down_read(&root->kernfs_rwsem); parent = kernfs_parent(kn); error = kernfs_get_target_path(parent, target, path); up_read(&root->kernfs_rwsem); return error; } static const char *kernfs_iop_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { char *body; int error; if (!dentry) return ERR_PTR(-ECHILD); body = kzalloc(PAGE_SIZE, GFP_KERNEL); if (!body) return ERR_PTR(-ENOMEM); error = kernfs_getlink(inode, body); if (unlikely(error < 0)) { kfree(body); return ERR_PTR(error); } set_delayed_call(done, kfree_link, body); return body; } const struct inode_operations kernfs_symlink_iops = { .listxattr = kernfs_iop_listxattr, .get_link = kernfs_iop_get_link, .setattr = kernfs_iop_setattr, .getattr = kernfs_iop_getattr, .permission = kernfs_iop_permission, };
4040 1110 1018 896 196 2322 476 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* audit.h -- Auditing support * * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. * All Rights Reserved. * * Written by Rickard E. (Rik) Faith <faith@redhat.com> */ #ifndef _LINUX_AUDIT_H_ #define _LINUX_AUDIT_H_ #include <linux/sched.h> #include <linux/ptrace.h> #include <linux/audit_arch.h> #include <uapi/linux/audit.h> #include <uapi/linux/fanotify.h> #define AUDIT_INO_UNSET ((unsigned long)-1) #define AUDIT_DEV_UNSET ((dev_t)-1) struct audit_sig_info { uid_t uid; pid_t pid; char ctx[]; }; struct audit_buffer; struct audit_context; struct inode; struct netlink_skb_parms; struct path; struct linux_binprm; struct mq_attr; struct mqstat; struct audit_watch; struct audit_tree; struct sk_buff; struct kern_ipc_perm; struct lsm_id; struct lsm_prop; struct audit_krule { u32 pflags; u32 flags; u32 listnr; u32 action; u32 mask[AUDIT_BITMASK_SIZE]; u32 buflen; /* for data alloc on list rules */ u32 field_count; char *filterkey; /* ties events to rules */ struct audit_field *fields; struct audit_field *arch_f; /* quick access to arch field */ struct audit_field *inode_f; /* quick access to an inode field */ struct audit_watch *watch; /* associated watch */ struct audit_tree *tree; /* associated watched tree */ struct audit_fsnotify_mark *exe; struct list_head rlist; /* entry in audit_{watch,tree}.rules list */ struct list_head list; /* for AUDIT_LIST* purposes only */ u64 prio; }; /* Flag to indicate legacy AUDIT_LOGINUID unset usage */ #define AUDIT_LOGINUID_LEGACY 0x1 struct audit_field { u32 type; union { u32 val; kuid_t uid; kgid_t gid; struct { char *lsm_str; void *lsm_rule; }; }; u32 op; }; enum audit_ntp_type { AUDIT_NTP_OFFSET, AUDIT_NTP_FREQ, AUDIT_NTP_STATUS, AUDIT_NTP_TAI, AUDIT_NTP_TICK, AUDIT_NTP_ADJUST, AUDIT_NTP_NVALS /* count */ }; #ifdef CONFIG_AUDITSYSCALL struct audit_ntp_val { long long oldval, newval; }; struct audit_ntp_data { struct audit_ntp_val vals[AUDIT_NTP_NVALS]; }; #else struct audit_ntp_data {}; #endif enum audit_nfcfgop { AUDIT_XT_OP_REGISTER, AUDIT_XT_OP_REPLACE, AUDIT_XT_OP_UNREGISTER, AUDIT_NFT_OP_TABLE_REGISTER, AUDIT_NFT_OP_TABLE_UNREGISTER, AUDIT_NFT_OP_CHAIN_REGISTER, AUDIT_NFT_OP_CHAIN_UNREGISTER, AUDIT_NFT_OP_RULE_REGISTER, AUDIT_NFT_OP_RULE_UNREGISTER, AUDIT_NFT_OP_SET_REGISTER, AUDIT_NFT_OP_SET_UNREGISTER, AUDIT_NFT_OP_SETELEM_REGISTER, AUDIT_NFT_OP_SETELEM_UNREGISTER, AUDIT_NFT_OP_GEN_REGISTER, AUDIT_NFT_OP_OBJ_REGISTER, AUDIT_NFT_OP_OBJ_UNREGISTER, AUDIT_NFT_OP_OBJ_RESET, AUDIT_NFT_OP_FLOWTABLE_REGISTER, AUDIT_NFT_OP_FLOWTABLE_UNREGISTER, AUDIT_NFT_OP_SETELEM_RESET, AUDIT_NFT_OP_RULE_RESET, AUDIT_NFT_OP_INVALID, }; extern int __init audit_register_class(int class, unsigned *list); extern int audit_classify_syscall(int abi, unsigned syscall); extern int audit_classify_arch(int arch); /* audit_names->type values */ #define AUDIT_TYPE_UNKNOWN 0 /* we don't know yet */ #define AUDIT_TYPE_NORMAL 1 /* a "normal" audit record */ #define AUDIT_TYPE_PARENT 2 /* a parent audit record */ #define AUDIT_TYPE_CHILD_DELETE 3 /* a child being deleted */ #define AUDIT_TYPE_CHILD_CREATE 4 /* a child being created */ /* maximized args number that audit_socketcall can process */ #define AUDITSC_ARGS 6 /* bit values for ->signal->audit_tty */ #define AUDIT_TTY_ENABLE BIT(0) #define AUDIT_TTY_LOG_PASSWD BIT(1) /* bit values for audit_cfg_lsm */ #define AUDIT_CFG_LSM_SECCTX_SUBJECT BIT(0) #define AUDIT_CFG_LSM_SECCTX_OBJECT BIT(1) struct filename; #define AUDIT_OFF 0 #define AUDIT_ON 1 #define AUDIT_LOCKED 2 #ifdef CONFIG_AUDIT /* These are defined in audit.c */ /* Public API */ extern __printf(4, 5) void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, const char *fmt, ...); extern struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type); extern __printf(2, 3) void audit_log_format(struct audit_buffer *ab, const char *fmt, ...); extern void audit_log_end(struct audit_buffer *ab); extern bool audit_string_contains_control(const char *string, size_t len); extern void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len); extern void audit_log_n_string(struct audit_buffer *ab, const char *buf, size_t n); extern void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string, size_t n); extern void audit_log_untrustedstring(struct audit_buffer *ab, const char *string); extern void audit_log_d_path(struct audit_buffer *ab, const char *prefix, const struct path *path); extern void audit_log_key(struct audit_buffer *ab, char *key); extern void audit_log_path_denied(int type, const char *operation); extern void audit_log_lost(const char *message); extern int audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop); extern int audit_log_obj_ctx(struct audit_buffer *ab, struct lsm_prop *prop); extern int audit_log_task_context(struct audit_buffer *ab); extern void audit_log_task_info(struct audit_buffer *ab); extern int audit_log_nf_skb(struct audit_buffer *ab, const struct sk_buff *skb, u8 nfproto); extern int audit_update_lsm_rules(void); /* Private API (for audit.c only) */ extern int audit_rule_change(int type, int seq, void *data, size_t datasz); extern int audit_list_rules_send(struct sk_buff *request_skb, int seq); extern int audit_set_loginuid(kuid_t loginuid); static inline kuid_t audit_get_loginuid(struct task_struct *tsk) { return tsk->loginuid; } static inline unsigned int audit_get_sessionid(struct task_struct *tsk) { return tsk->sessionid; } extern u32 audit_enabled; extern int audit_signal_info(int sig, struct task_struct *t); extern void audit_cfg_lsm(const struct lsm_id *lsmid, int flags); #else /* CONFIG_AUDIT */ static inline __printf(4, 5) void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, const char *fmt, ...) { } static inline struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { return NULL; } static inline __printf(2, 3) void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) { } static inline void audit_log_end(struct audit_buffer *ab) { } static inline void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len) { } static inline void audit_log_n_string(struct audit_buffer *ab, const char *buf, size_t n) { } static inline void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string, size_t n) { } static inline void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) { } static inline void audit_log_d_path(struct audit_buffer *ab, const char *prefix, const struct path *path) { } static inline void audit_log_key(struct audit_buffer *ab, char *key) { } static inline void audit_log_path_denied(int type, const char *operation) { } static inline int audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop) { return 0; } static inline int audit_log_obj_ctx(struct audit_buffer *ab, struct lsm_prop *prop) { return 0; } static inline int audit_log_task_context(struct audit_buffer *ab) { return 0; } static inline void audit_log_task_info(struct audit_buffer *ab) { } static inline int audit_log_nf_skb(struct audit_buffer *ab, const struct sk_buff *skb, u8 nfproto) { return 0; } static inline kuid_t audit_get_loginuid(struct task_struct *tsk) { return INVALID_UID; } static inline unsigned int audit_get_sessionid(struct task_struct *tsk) { return AUDIT_SID_UNSET; } #define audit_enabled AUDIT_OFF static inline int audit_signal_info(int sig, struct task_struct *t) { return 0; } static inline void audit_cfg_lsm(const struct lsm_id *lsmid, int flags) { } #endif /* CONFIG_AUDIT */ #ifdef CONFIG_AUDIT_COMPAT_GENERIC #define audit_is_compat(arch) (!((arch) & __AUDIT_ARCH_64BIT)) #else #define audit_is_compat(arch) false #endif #define AUDIT_INODE_PARENT 1 /* dentry represents the parent */ #define AUDIT_INODE_HIDDEN 2 /* audit record should be hidden */ #define AUDIT_INODE_NOEVAL 4 /* audit record incomplete */ #ifdef CONFIG_AUDITSYSCALL #include <asm/syscall.h> /* for syscall_get_arch() */ /* These are defined in auditsc.c */ /* Public API */ extern int audit_alloc(struct task_struct *task); extern void __audit_free(struct task_struct *task); extern void __audit_uring_entry(u8 op); extern void __audit_uring_exit(int success, long code); extern void __audit_syscall_entry(int major, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3); extern void __audit_syscall_exit(int ret_success, long ret_value); extern void __audit_getname(struct filename *name); extern void __audit_inode(struct filename *name, const struct dentry *dentry, unsigned int flags); extern void __audit_file(const struct file *); extern void __audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type); extern void audit_seccomp(unsigned long syscall, long signr, int code); extern void audit_seccomp_actions_logged(const char *names, const char *old_names, int res); extern void __audit_ptrace(struct task_struct *t); static inline void audit_set_context(struct task_struct *task, struct audit_context *ctx) { task->audit_context = ctx; } static inline struct audit_context *audit_context(void) { return current->audit_context; } static inline bool audit_dummy_context(void) { void *p = audit_context(); return !p || *(int *)p; } static inline void audit_free(struct task_struct *task) { if (unlikely(task->audit_context)) __audit_free(task); } static inline void audit_uring_entry(u8 op) { /* * We intentionally check audit_context() before audit_enabled as most * Linux systems (as of ~2021) rely on systemd which forces audit to * be enabled regardless of the user's audit configuration. */ if (unlikely(audit_context() && audit_enabled)) __audit_uring_entry(op); } static inline void audit_uring_exit(int success, long code) { if (unlikely(audit_context())) __audit_uring_exit(success, code); } static inline void audit_syscall_entry(int major, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3) { if (unlikely(audit_context())) __audit_syscall_entry(major, a0, a1, a2, a3); } static inline void audit_syscall_exit(void *pt_regs) { if (unlikely(audit_context())) { int success = is_syscall_success(pt_regs); long return_code = regs_return_value(pt_regs); __audit_syscall_exit(success, return_code); } } static inline void audit_getname(struct filename *name) { if (unlikely(!audit_dummy_context())) __audit_getname(name); } static inline void audit_inode(struct filename *name, const struct dentry *dentry, unsigned int aflags) { if (unlikely(!audit_dummy_context())) __audit_inode(name, dentry, aflags); } static inline void audit_file(struct file *file) { if (unlikely(!audit_dummy_context())) __audit_file(file); } static inline void audit_inode_parent_hidden(struct filename *name, const struct dentry *dentry) { if (unlikely(!audit_dummy_context())) __audit_inode(name, dentry, AUDIT_INODE_PARENT | AUDIT_INODE_HIDDEN); } static inline void audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type) { if (unlikely(!audit_dummy_context())) __audit_inode_child(parent, dentry, type); } void audit_core_dumps(long signr); static inline void audit_ptrace(struct task_struct *t) { if (unlikely(!audit_dummy_context())) __audit_ptrace(t); } /* Private API (for audit.c only) */ extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp); extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode); extern void __audit_bprm(struct linux_binprm *bprm); extern int __audit_socketcall(int nargs, unsigned long *args); extern int __audit_sockaddr(int len, void *addr); extern void __audit_fd_pair(int fd1, int fd2); extern void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr); extern void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout); extern void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification); extern void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat); extern int __audit_log_bprm_fcaps(struct linux_binprm *bprm, const struct cred *new, const struct cred *old); extern void __audit_log_capset(const struct cred *new, const struct cred *old); extern void __audit_mmap_fd(int fd, int flags); extern void __audit_openat2_how(struct open_how *how); extern void __audit_log_kern_module(const char *name); extern void __audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar); extern void __audit_tk_injoffset(struct timespec64 offset); extern void __audit_ntp_log(const struct audit_ntp_data *ad); extern void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, enum audit_nfcfgop op, gfp_t gfp); static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp) { if (unlikely(!audit_dummy_context())) __audit_ipc_obj(ipcp); } static inline void audit_fd_pair(int fd1, int fd2) { if (unlikely(!audit_dummy_context())) __audit_fd_pair(fd1, fd2); } static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode) { if (unlikely(!audit_dummy_context())) __audit_ipc_set_perm(qbytes, uid, gid, mode); } static inline void audit_bprm(struct linux_binprm *bprm) { if (unlikely(!audit_dummy_context())) __audit_bprm(bprm); } static inline int audit_socketcall(int nargs, unsigned long *args) { if (unlikely(!audit_dummy_context())) return __audit_socketcall(nargs, args); return 0; } static inline int audit_socketcall_compat(int nargs, u32 *args) { unsigned long a[AUDITSC_ARGS]; int i; if (audit_dummy_context()) return 0; for (i = 0; i < nargs; i++) a[i] = (unsigned long)args[i]; return __audit_socketcall(nargs, a); } static inline int audit_sockaddr(int len, void *addr) { if (unlikely(!audit_dummy_context())) return __audit_sockaddr(len, addr); return 0; } static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) { if (unlikely(!audit_dummy_context())) __audit_mq_open(oflag, mode, attr); } static inline void audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout) { if (unlikely(!audit_dummy_context())) __audit_mq_sendrecv(mqdes, msg_len, msg_prio, abs_timeout); } static inline void audit_mq_notify(mqd_t mqdes, const struct sigevent *notification) { if (unlikely(!audit_dummy_context())) __audit_mq_notify(mqdes, notification); } static inline void audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) { if (unlikely(!audit_dummy_context())) __audit_mq_getsetattr(mqdes, mqstat); } static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm, const struct cred *new, const struct cred *old) { if (unlikely(!audit_dummy_context())) return __audit_log_bprm_fcaps(bprm, new, old); return 0; } static inline void audit_log_capset(const struct cred *new, const struct cred *old) { if (unlikely(!audit_dummy_context())) __audit_log_capset(new, old); } static inline void audit_mmap_fd(int fd, int flags) { if (unlikely(!audit_dummy_context())) __audit_mmap_fd(fd, flags); } static inline void audit_openat2_how(struct open_how *how) { if (unlikely(!audit_dummy_context())) __audit_openat2_how(how); } static inline void audit_log_kern_module(const char *name) { if (!audit_dummy_context()) __audit_log_kern_module(name); } static inline void audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar) { if (audit_enabled) __audit_fanotify(response, friar); } static inline void audit_tk_injoffset(struct timespec64 offset) { /* ignore no-op events */ if (offset.tv_sec == 0 && offset.tv_nsec == 0) return; if (!audit_dummy_context()) __audit_tk_injoffset(offset); } static inline void audit_ntp_init(struct audit_ntp_data *ad) { memset(ad, 0, sizeof(*ad)); } static inline void audit_ntp_set_old(struct audit_ntp_data *ad, enum audit_ntp_type type, long long val) { ad->vals[type].oldval = val; } static inline void audit_ntp_set_new(struct audit_ntp_data *ad, enum audit_ntp_type type, long long val) { ad->vals[type].newval = val; } static inline void audit_ntp_log(const struct audit_ntp_data *ad) { if (!audit_dummy_context()) __audit_ntp_log(ad); } static inline void audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, enum audit_nfcfgop op, gfp_t gfp) { if (audit_enabled) __audit_log_nfcfg(name, af, nentries, op, gfp); } extern int audit_n_rules; extern int audit_signals; #else /* CONFIG_AUDITSYSCALL */ static inline int audit_alloc(struct task_struct *task) { return 0; } static inline void audit_free(struct task_struct *task) { } static inline void audit_uring_entry(u8 op) { } static inline void audit_uring_exit(int success, long code) { } static inline void audit_syscall_entry(int major, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3) { } static inline void audit_syscall_exit(void *pt_regs) { } static inline bool audit_dummy_context(void) { return true; } static inline void audit_set_context(struct task_struct *task, struct audit_context *ctx) { } static inline struct audit_context *audit_context(void) { return NULL; } static inline void audit_getname(struct filename *name) { } static inline void audit_inode(struct filename *name, const struct dentry *dentry, unsigned int aflags) { } static inline void audit_file(struct file *file) { } static inline void audit_inode_parent_hidden(struct filename *name, const struct dentry *dentry) { } static inline void audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type) { } static inline void audit_core_dumps(long signr) { } static inline void audit_seccomp(unsigned long syscall, long signr, int code) { } static inline void audit_seccomp_actions_logged(const char *names, const char *old_names, int res) { } static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp) { } static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode) { } static inline void audit_bprm(struct linux_binprm *bprm) { } static inline int audit_socketcall(int nargs, unsigned long *args) { return 0; } static inline int audit_socketcall_compat(int nargs, u32 *args) { return 0; } static inline void audit_fd_pair(int fd1, int fd2) { } static inline int audit_sockaddr(int len, void *addr) { return 0; } static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) { } static inline void audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout) { } static inline void audit_mq_notify(mqd_t mqdes, const struct sigevent *notification) { } static inline void audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) { } static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm, const struct cred *new, const struct cred *old) { return 0; } static inline void audit_log_capset(const struct cred *new, const struct cred *old) { } static inline void audit_mmap_fd(int fd, int flags) { } static inline void audit_openat2_how(struct open_how *how) { } static inline void audit_log_kern_module(const char *name) { } static inline void audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar) { } static inline void audit_tk_injoffset(struct timespec64 offset) { } static inline void audit_ntp_init(struct audit_ntp_data *ad) { } static inline void audit_ntp_set_old(struct audit_ntp_data *ad, enum audit_ntp_type type, long long val) { } static inline void audit_ntp_set_new(struct audit_ntp_data *ad, enum audit_ntp_type type, long long val) { } static inline void audit_ntp_log(const struct audit_ntp_data *ad) { } static inline void audit_ptrace(struct task_struct *t) { } static inline void audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, enum audit_nfcfgop op, gfp_t gfp) { } #define audit_n_rules 0 #define audit_signals 0 #endif /* CONFIG_AUDITSYSCALL */ static inline bool audit_loginuid_set(struct task_struct *tsk) { return uid_valid(audit_get_loginuid(tsk)); } #endif
19 19 19 19 19 19 19 18 18 18 18 18 1 1 18 18 2 2 2 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 // SPDX-License-Identifier: GPL-2.0-or-later /* * lib/plist.c * * Descending-priority-sorted double-linked list * * (C) 2002-2003 Intel Corp * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>. * * 2001-2005 (c) MontaVista Software, Inc. * Daniel Walker <dwalker@mvista.com> * * (C) 2005 Linutronix GmbH, Thomas Gleixner <tglx@kernel.org> * * Simplifications of the original code by * Oleg Nesterov <oleg@tv-sign.ru> * * Based on simple lists (include/linux/list.h). * * This file contains the add / del functions which are considered to * be too large to inline. See include/linux/plist.h for further * information. */ #include <linux/bug.h> #include <linux/plist.h> #ifdef CONFIG_DEBUG_PLIST static struct plist_head test_head; static void plist_check_prev_next(struct list_head *t, struct list_head *p, struct list_head *n) { WARN(n->prev != p || p->next != n, "top: %p, n: %p, p: %p\n" "prev: %p, n: %p, p: %p\n" "next: %p, n: %p, p: %p\n", t, t->next, t->prev, p, p->next, p->prev, n, n->next, n->prev); } static void plist_check_list(struct list_head *top) { struct list_head *prev = top, *next = top->next; plist_check_prev_next(top, prev, next); while (next != top) { prev = next; next = prev->next; plist_check_prev_next(top, prev, next); } } static void plist_check_head(struct plist_head *head) { if (!plist_head_empty(head)) plist_check_list(&plist_first(head)->prio_list); plist_check_list(&head->node_list); } #else # define plist_check_head(h) do { } while (0) #endif /** * plist_add - add @node to @head * * @node: &struct plist_node pointer * @head: &struct plist_head pointer */ void plist_add(struct plist_node *node, struct plist_head *head) { struct plist_node *first, *iter, *prev = NULL, *last, *reverse_iter; struct list_head *node_next = &head->node_list; plist_check_head(head); WARN_ON(!plist_node_empty(node)); WARN_ON(!list_empty(&node->prio_list)); if (plist_head_empty(head)) goto ins_node; first = iter = plist_first(head); last = reverse_iter = list_entry(first->prio_list.prev, struct plist_node, prio_list); do { if (node->prio < iter->prio) { node_next = &iter->node_list; break; } else if (node->prio >= reverse_iter->prio) { prev = reverse_iter; iter = list_entry(reverse_iter->prio_list.next, struct plist_node, prio_list); if (likely(reverse_iter != last)) node_next = &iter->node_list; break; } prev = iter; iter = list_entry(iter->prio_list.next, struct plist_node, prio_list); reverse_iter = list_entry(reverse_iter->prio_list.prev, struct plist_node, prio_list); } while (iter != first); if (!prev || prev->prio != node->prio) list_add_tail(&node->prio_list, &iter->prio_list); ins_node: list_add_tail(&node->node_list, node_next); plist_check_head(head); } /** * plist_del - Remove a @node from plist. * * @node: &struct plist_node pointer - entry to be removed * @head: &struct plist_head pointer - list head */ void plist_del(struct plist_node *node, struct plist_head *head) { plist_check_head(head); if (!list_empty(&node->prio_list)) { if (node->node_list.next != &head->node_list) { struct plist_node *next; next = list_entry(node->node_list.next, struct plist_node, node_list); /* add the next plist_node into prio_list */ if (list_empty(&next->prio_list)) list_add(&next->prio_list, &node->prio_list); } list_del_init(&node->prio_list); } list_del_init(&node->node_list); plist_check_head(head); } /** * plist_requeue - Requeue @node at end of same-prio entries. * * This is essentially an optimized plist_del() followed by * plist_add(). It moves an entry already in the plist to * after any other same-priority entries. * * @node: &struct plist_node pointer - entry to be moved * @head: &struct plist_head pointer - list head */ void plist_requeue(struct plist_node *node, struct plist_head *head) { struct plist_node *iter; struct list_head *node_next = &head->node_list; plist_check_head(head); BUG_ON(plist_head_empty(head)); BUG_ON(plist_node_empty(node)); if (node == plist_last(head)) return; iter = plist_next(node); if (node->prio != iter->prio) return; plist_del(node, head); /* * After plist_del(), iter is the replacement of the node. If the node * was on prio_list, take shortcut to find node_next instead of looping. */ if (!list_empty(&iter->prio_list)) { iter = list_entry(iter->prio_list.next, struct plist_node, prio_list); node_next = &iter->node_list; goto queue; } plist_for_each_continue(iter, head) { if (node->prio != iter->prio) { node_next = &iter->node_list; break; } } queue: list_add_tail(&node->node_list, node_next); plist_check_head(head); } #ifdef CONFIG_DEBUG_PLIST #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/module.h> #include <linux/init.h> static struct plist_node __initdata test_node[241]; static void __init plist_test_check(int nr_expect) { struct plist_node *first, *prio_pos, *node_pos; if (plist_head_empty(&test_head)) { BUG_ON(nr_expect != 0); return; } prio_pos = first = plist_first(&test_head); plist_for_each(node_pos, &test_head) { if (nr_expect-- < 0) break; if (node_pos == first) continue; if (node_pos->prio == prio_pos->prio) { BUG_ON(!list_empty(&node_pos->prio_list)); continue; } BUG_ON(prio_pos->prio > node_pos->prio); BUG_ON(prio_pos->prio_list.next != &node_pos->prio_list); prio_pos = node_pos; } BUG_ON(nr_expect != 0); BUG_ON(prio_pos->prio_list.next != &first->prio_list); } static void __init plist_test_requeue(struct plist_node *node) { plist_requeue(node, &test_head); if (node != plist_last(&test_head)) BUG_ON(node->prio == plist_next(node)->prio); } static int __init plist_test(void) { int nr_expect = 0, i, loop; unsigned int r = local_clock(); printk(KERN_DEBUG "start plist test\n"); plist_head_init(&test_head); for (i = 0; i < ARRAY_SIZE(test_node); i++) plist_node_init(test_node + i, 0); for (loop = 0; loop < 1000; loop++) { r = r * 193939 % 47629; i = r % ARRAY_SIZE(test_node); if (plist_node_empty(test_node + i)) { r = r * 193939 % 47629; test_node[i].prio = r % 99; plist_add(test_node + i, &test_head); nr_expect++; } else { plist_del(test_node + i, &test_head); nr_expect--; } plist_test_check(nr_expect); if (!plist_node_empty(test_node + i)) { plist_test_requeue(test_node + i); plist_test_check(nr_expect); } } for (i = 0; i < ARRAY_SIZE(test_node); i++) { if (plist_node_empty(test_node + i)) continue; plist_del(test_node + i, &test_head); nr_expect--; plist_test_check(nr_expect); } printk(KERN_DEBUG "end plist test\n"); /* Worst case test for plist_add() */ unsigned int test_data[241]; for (i = 0; i < ARRAY_SIZE(test_data); i++) test_data[i] = i; ktime_t start, end, time_elapsed = 0; plist_head_init(&test_head); for (i = 0; i < ARRAY_SIZE(test_node); i++) { plist_node_init(test_node + i, 0); test_node[i].prio = test_data[i]; } for (i = 0; i < ARRAY_SIZE(test_node); i++) { if (plist_node_empty(test_node + i)) { start = ktime_get(); plist_add(test_node + i, &test_head); end = ktime_get(); time_elapsed += (end - start); } } pr_debug("plist_add worst case test time elapsed %lld\n", time_elapsed); return 0; } module_init(plist_test); #endif
235 234 51 7 127 122 8 44 9 118 169 234 204 44 247 248 77 99 1 18 1 8 1 19 7 82 247 245 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 // SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/mount.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include <linux/slab.h> #include <uapi/linux/mount.h> #include "common.h" /* String table for special mount operations. */ static const char * const tomoyo_mounts[TOMOYO_MAX_SPECIAL_MOUNT] = { [TOMOYO_MOUNT_BIND] = "--bind", [TOMOYO_MOUNT_MOVE] = "--move", [TOMOYO_MOUNT_REMOUNT] = "--remount", [TOMOYO_MOUNT_MAKE_UNBINDABLE] = "--make-unbindable", [TOMOYO_MOUNT_MAKE_PRIVATE] = "--make-private", [TOMOYO_MOUNT_MAKE_SLAVE] = "--make-slave", [TOMOYO_MOUNT_MAKE_SHARED] = "--make-shared", }; /** * tomoyo_audit_mount_log - Audit mount log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_audit_mount_log(struct tomoyo_request_info *r) __must_hold_shared(&tomoyo_ss) { return tomoyo_supervisor(r, "file mount %s %s %s 0x%lX\n", r->param.mount.dev->name, r->param.mount.dir->name, r->param.mount.type->name, r->param.mount.flags); } /** * tomoyo_check_mount_acl - Check permission for path path path number operation. * * @r: Pointer to "struct tomoyo_request_info". * @ptr: Pointer to "struct tomoyo_acl_info". * * Returns true if granted, false otherwise. */ static bool tomoyo_check_mount_acl(struct tomoyo_request_info *r, const struct tomoyo_acl_info *ptr) { const struct tomoyo_mount_acl *acl = container_of(ptr, typeof(*acl), head); return tomoyo_compare_number_union(r->param.mount.flags, &acl->flags) && tomoyo_compare_name_union(r->param.mount.type, &acl->fs_type) && tomoyo_compare_name_union(r->param.mount.dir, &acl->dir_name) && (!r->param.mount.need_dev || tomoyo_compare_name_union(r->param.mount.dev, &acl->dev_name)); } /** * tomoyo_mount_acl - Check permission for mount() operation. * * @r: Pointer to "struct tomoyo_request_info". * @dev_name: Name of device file. Maybe NULL. * @dir: Pointer to "struct path". * @type: Name of filesystem type. * @flags: Mount options. * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_mount_acl(struct tomoyo_request_info *r, const char *dev_name, const struct path *dir, const char *type, unsigned long flags) __must_hold_shared(&tomoyo_ss) { struct tomoyo_obj_info obj = { }; struct path path; struct file_system_type *fstype = NULL; const char *requested_type = NULL; const char *requested_dir_name = NULL; const char *requested_dev_name = NULL; struct tomoyo_path_info rtype; struct tomoyo_path_info rdev; struct tomoyo_path_info rdir; int need_dev = 0; int error = -ENOMEM; r->obj = &obj; /* Get fstype. */ requested_type = tomoyo_encode(type); if (!requested_type) goto out; rtype.name = requested_type; tomoyo_fill_path_info(&rtype); /* Get mount point. */ obj.path2 = *dir; requested_dir_name = tomoyo_realpath_from_path(dir); if (!requested_dir_name) { error = -ENOMEM; goto out; } rdir.name = requested_dir_name; tomoyo_fill_path_info(&rdir); /* Compare fs name. */ if (type == tomoyo_mounts[TOMOYO_MOUNT_REMOUNT]) { /* dev_name is ignored. */ } else if (type == tomoyo_mounts[TOMOYO_MOUNT_MAKE_UNBINDABLE] || type == tomoyo_mounts[TOMOYO_MOUNT_MAKE_PRIVATE] || type == tomoyo_mounts[TOMOYO_MOUNT_MAKE_SLAVE] || type == tomoyo_mounts[TOMOYO_MOUNT_MAKE_SHARED]) { /* dev_name is ignored. */ } else if (type == tomoyo_mounts[TOMOYO_MOUNT_BIND] || type == tomoyo_mounts[TOMOYO_MOUNT_MOVE]) { need_dev = -1; /* dev_name is a directory */ } else { fstype = get_fs_type(type); if (!fstype) { error = -ENODEV; goto out; } if (fstype->fs_flags & FS_REQUIRES_DEV) /* dev_name is a block device file. */ need_dev = 1; } if (need_dev) { /* Get mount point or device file. */ if (!dev_name || kern_path(dev_name, LOOKUP_FOLLOW, &path)) { error = -ENOENT; goto out; } obj.path1 = path; requested_dev_name = tomoyo_realpath_from_path(&path); if (!requested_dev_name) { error = -ENOENT; goto out; } } else { /* Map dev_name to "<NULL>" if no dev_name given. */ if (!dev_name) dev_name = "<NULL>"; requested_dev_name = tomoyo_encode(dev_name); if (!requested_dev_name) { error = -ENOMEM; goto out; } } rdev.name = requested_dev_name; tomoyo_fill_path_info(&rdev); r->param_type = TOMOYO_TYPE_MOUNT_ACL; r->param.mount.need_dev = need_dev; r->param.mount.dev = &rdev; r->param.mount.dir = &rdir; r->param.mount.type = &rtype; r->param.mount.flags = flags; do { tomoyo_check_acl(r, tomoyo_check_mount_acl); error = tomoyo_audit_mount_log(r); } while (error == TOMOYO_RETRY_REQUEST); out: kfree(requested_dev_name); kfree(requested_dir_name); if (fstype) put_filesystem(fstype); kfree(requested_type); /* Drop refcount obtained by kern_path(). */ if (obj.path1.dentry) path_put(&obj.path1); return error; } /** * tomoyo_mount_permission - Check permission for mount() operation. * * @dev_name: Name of device file. Maybe NULL. * @path: Pointer to "struct path". * @type: Name of filesystem type. Maybe NULL. * @flags: Mount options. * @data_page: Optional data. Maybe NULL. * * Returns 0 on success, negative value otherwise. */ int tomoyo_mount_permission(const char *dev_name, const struct path *path, const char *type, unsigned long flags, void *data_page) { struct tomoyo_request_info r; int error; int idx; if (tomoyo_init_request_info(&r, NULL, TOMOYO_MAC_FILE_MOUNT) == TOMOYO_CONFIG_DISABLED) return 0; if ((flags & MS_MGC_MSK) == MS_MGC_VAL) flags &= ~MS_MGC_MSK; if (flags & MS_REMOUNT) { type = tomoyo_mounts[TOMOYO_MOUNT_REMOUNT]; flags &= ~MS_REMOUNT; } else if (flags & MS_BIND) { type = tomoyo_mounts[TOMOYO_MOUNT_BIND]; flags &= ~MS_BIND; } else if (flags & MS_SHARED) { if (flags & (MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) return -EINVAL; type = tomoyo_mounts[TOMOYO_MOUNT_MAKE_SHARED]; flags &= ~MS_SHARED; } else if (flags & MS_PRIVATE) { if (flags & (MS_SHARED | MS_SLAVE | MS_UNBINDABLE)) return -EINVAL; type = tomoyo_mounts[TOMOYO_MOUNT_MAKE_PRIVATE]; flags &= ~MS_PRIVATE; } else if (flags & MS_SLAVE) { if (flags & (MS_SHARED | MS_PRIVATE | MS_UNBINDABLE)) return -EINVAL; type = tomoyo_mounts[TOMOYO_MOUNT_MAKE_SLAVE]; flags &= ~MS_SLAVE; } else if (flags & MS_UNBINDABLE) { if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE)) return -EINVAL; type = tomoyo_mounts[TOMOYO_MOUNT_MAKE_UNBINDABLE]; flags &= ~MS_UNBINDABLE; } else if (flags & MS_MOVE) { type = tomoyo_mounts[TOMOYO_MOUNT_MOVE]; flags &= ~MS_MOVE; } if (!type) type = "<NULL>"; idx = tomoyo_read_lock(); error = tomoyo_mount_acl(&r, dev_name, path, type, flags); tomoyo_read_unlock(idx); return error; }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 #include <linux/gfp.h> #include <linux/initrd.h> #include <linux/ioport.h> #include <linux/swap.h> #include <linux/memblock.h> #include <linux/swapfile.h> #include <linux/swapops.h> #include <linux/kmemleak.h> #include <linux/sched/task.h> #include <linux/execmem.h> #include <asm/set_memory.h> #include <asm/cpu_device_id.h> #include <asm/e820/api.h> #include <asm/init.h> #include <asm/page.h> #include <asm/page_types.h> #include <asm/sections.h> #include <asm/setup.h> #include <asm/tlbflush.h> #include <asm/tlb.h> #include <asm/proto.h> #include <asm/dma.h> /* for MAX_DMA_PFN */ #include <asm/kaslr.h> #include <asm/hypervisor.h> #include <asm/cpufeature.h> #include <asm/pti.h> #include <asm/text-patching.h> #include <asm/memtype.h> #include <asm/mmu_context.h> /* * We need to define the tracepoints somewhere, and tlb.c * is only compiled when SMP=y. */ #define CREATE_TRACE_POINTS #include <trace/events/tlb.h> #include "mm_internal.h" /* * Tables translating between page_cache_type_t and pte encoding. * * The default values are defined statically as minimal supported mode; * WC and WT fall back to UC-. pat_init() updates these values to support * more cache modes, WC and WT, when it is safe to do so. See pat_init() * for the details. Note, __early_ioremap() used during early boot-time * takes pgprot_t (pte encoding) and does not use these tables. * * Index into __cachemode2pte_tbl[] is the cachemode. * * Index into __pte2cachemode_tbl[] are the caching attribute bits of the pte * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2. */ static uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { [_PAGE_CACHE_MODE_WB ] = 0 | 0 , [_PAGE_CACHE_MODE_WC ] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_UC_MINUS] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_UC ] = _PAGE_PWT | _PAGE_PCD, [_PAGE_CACHE_MODE_WT ] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_WP ] = 0 | _PAGE_PCD, }; unsigned long cachemode2protval(enum page_cache_mode pcm) { if (likely(pcm == 0)) return 0; return __cachemode2pte_tbl[pcm]; } EXPORT_SYMBOL(cachemode2protval); static uint8_t __pte2cachemode_tbl[8] = { [__pte2cm_idx( 0 | 0 | 0 )] = _PAGE_CACHE_MODE_WB, [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx( 0 | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC, [__pte2cm_idx( 0 | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WB, [__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(0 | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC, }; /* * Check that the write-protect PAT entry is set for write-protect. * To do this without making assumptions how PAT has been set up (Xen has * another layout than the kernel), translate the _PAGE_CACHE_MODE_WP cache * mode via the __cachemode2pte_tbl[] into protection bits (those protection * bits will select a cache mode of WP or better), and then translate the * protection bits back into the cache mode using __pte2cm_idx() and the * __pte2cachemode_tbl[] array. This will return the really used cache mode. */ bool x86_has_pat_wp(void) { uint16_t prot = __cachemode2pte_tbl[_PAGE_CACHE_MODE_WP]; return __pte2cachemode_tbl[__pte2cm_idx(prot)] == _PAGE_CACHE_MODE_WP; } enum page_cache_mode pgprot2cachemode(pgprot_t pgprot) { unsigned long masked; masked = pgprot_val(pgprot) & _PAGE_CACHE_MASK; if (likely(masked == 0)) return 0; return __pte2cachemode_tbl[__pte2cm_idx(masked)]; } static unsigned long __initdata pgt_buf_start; static unsigned long __initdata pgt_buf_end; static unsigned long __initdata pgt_buf_top; static unsigned long min_pfn_mapped; static bool __initdata can_use_brk_pgt = true; /* * Pages returned are already directly mapped. * * Changing that is likely to break Xen, see commit: * * 279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve * * for detailed information. */ __ref void *alloc_low_pages(unsigned int num) { unsigned long pfn; int i; if (after_bootmem) { unsigned int order; order = get_order((unsigned long)num << PAGE_SHIFT); return (void *)__get_free_pages(GFP_ATOMIC | __GFP_ZERO, order); } if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) { unsigned long ret = 0; if (min_pfn_mapped < max_pfn_mapped) { ret = memblock_phys_alloc_range( PAGE_SIZE * num, PAGE_SIZE, min_pfn_mapped << PAGE_SHIFT, max_pfn_mapped << PAGE_SHIFT); } if (!ret && can_use_brk_pgt) ret = __pa(extend_brk(PAGE_SIZE * num, PAGE_SIZE)); if (!ret) panic("alloc_low_pages: can not alloc memory"); pfn = ret >> PAGE_SHIFT; } else { pfn = pgt_buf_end; pgt_buf_end += num; } for (i = 0; i < num; i++) { void *adr; adr = __va((pfn + i) << PAGE_SHIFT); clear_page(adr); } return __va(pfn << PAGE_SHIFT); } /* * By default need to be able to allocate page tables below PGD firstly for * the 0-ISA_END_ADDRESS range and secondly for the initial PMD_SIZE mapping. * With KASLR memory randomization, depending on the machine e820 memory and the * PUD alignment, twice that many pages may be needed when KASLR memory * randomization is enabled. */ #define INIT_PGD_PAGE_TABLES 4 #ifndef CONFIG_RANDOMIZE_MEMORY #define INIT_PGD_PAGE_COUNT (2 * INIT_PGD_PAGE_TABLES) #else #define INIT_PGD_PAGE_COUNT (4 * INIT_PGD_PAGE_TABLES) #endif #define INIT_PGT_BUF_SIZE (INIT_PGD_PAGE_COUNT * PAGE_SIZE) RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE); void __init early_alloc_pgt_buf(void) { unsigned long tables = INIT_PGT_BUF_SIZE; phys_addr_t base; base = __pa(extend_brk(tables, PAGE_SIZE)); pgt_buf_start = base >> PAGE_SHIFT; pgt_buf_end = pgt_buf_start; pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); } int after_bootmem; early_param_on_off("gbpages", "nogbpages", direct_gbpages, CONFIG_X86_DIRECT_GBPAGES); struct map_range { unsigned long start; unsigned long end; unsigned page_size_mask; }; static int page_size_mask; /* * Save some of cr4 feature set we're using (e.g. Pentium 4MB * enable and PPro Global page enable), so that any CPU's that boot * up after us can get the correct flags. Invoked on the boot CPU. */ static inline void cr4_set_bits_and_update_boot(unsigned long mask) { mmu_cr4_features |= mask; if (trampoline_cr4_features) *trampoline_cr4_features = mmu_cr4_features; cr4_set_bits(mask); } static void __init probe_page_size_mask(void) { /* * For pagealloc debugging, identity mapping will use small pages. * This will simplify cpa(), which otherwise needs to support splitting * large pages into small in interrupt context, etc. */ if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled()) page_size_mask |= 1 << PG_LEVEL_2M; else direct_gbpages = 0; /* Enable PSE if available */ if (boot_cpu_has(X86_FEATURE_PSE)) cr4_set_bits_and_update_boot(X86_CR4_PSE); /* Enable PGE if available */ __supported_pte_mask &= ~_PAGE_GLOBAL; if (boot_cpu_has(X86_FEATURE_PGE)) { cr4_set_bits_and_update_boot(X86_CR4_PGE); __supported_pte_mask |= _PAGE_GLOBAL; } /* By the default is everything supported: */ __default_kernel_pte_mask = __supported_pte_mask; /* Except when with PTI where the kernel is mostly non-Global: */ if (cpu_feature_enabled(X86_FEATURE_PTI)) __default_kernel_pte_mask &= ~_PAGE_GLOBAL; /* Enable 1 GB linear kernel mappings if available: */ if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) { printk(KERN_INFO "Using GB pages for direct mapping\n"); page_size_mask |= 1 << PG_LEVEL_1G; } else { direct_gbpages = 0; } } /* * INVLPG may not properly flush Global entries on * these CPUs. New microcode fixes the issue. */ static const struct x86_cpu_id invlpg_miss_ids[] = { X86_MATCH_VFM(INTEL_ALDERLAKE, 0x2e), X86_MATCH_VFM(INTEL_ALDERLAKE_L, 0x42c), X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, 0x11), X86_MATCH_VFM(INTEL_RAPTORLAKE, 0x118), X86_MATCH_VFM(INTEL_RAPTORLAKE_P, 0x4117), X86_MATCH_VFM(INTEL_RAPTORLAKE_S, 0x2e), {} }; static void setup_pcid(void) { const struct x86_cpu_id *invlpg_miss_match; if (!IS_ENABLED(CONFIG_X86_64)) return; if (!boot_cpu_has(X86_FEATURE_PCID)) return; invlpg_miss_match = x86_match_cpu(invlpg_miss_ids); if (invlpg_miss_match && boot_cpu_data.microcode < invlpg_miss_match->driver_data) { pr_info("Incomplete global flushes, disabling PCID"); setup_clear_cpu_cap(X86_FEATURE_PCID); return; } if (boot_cpu_has(X86_FEATURE_PGE)) { /* * This can't be cr4_set_bits_and_update_boot() -- the * trampoline code can't handle CR4.PCIDE and it wouldn't * do any good anyway. Despite the name, * cr4_set_bits_and_update_boot() doesn't actually cause * the bits in question to remain set all the way through * the secondary boot asm. * * Instead, we brute-force it and set CR4.PCIDE manually in * start_secondary(). */ cr4_set_bits(X86_CR4_PCIDE); } else { /* * flush_tlb_all(), as currently implemented, won't work if * PCID is on but PGE is not. Since that combination * doesn't exist on real hardware, there's no reason to try * to fully support it, but it's polite to avoid corrupting * data if we're on an improperly configured VM. */ setup_clear_cpu_cap(X86_FEATURE_PCID); } } #ifdef CONFIG_X86_32 #define NR_RANGE_MR 3 #else /* CONFIG_X86_64 */ #define NR_RANGE_MR 5 #endif static int __meminit save_mr(struct map_range *mr, int nr_range, unsigned long start_pfn, unsigned long end_pfn, unsigned long page_size_mask) { if (start_pfn < end_pfn) { if (nr_range >= NR_RANGE_MR) panic("run out of range for init_memory_mapping\n"); mr[nr_range].start = start_pfn<<PAGE_SHIFT; mr[nr_range].end = end_pfn<<PAGE_SHIFT; mr[nr_range].page_size_mask = page_size_mask; nr_range++; } return nr_range; } /* * adjust the page_size_mask for small range to go with * big page size instead small one if nearby are ram too. */ static void __ref adjust_range_page_size_mask(struct map_range *mr, int nr_range) { int i; for (i = 0; i < nr_range; i++) { if ((page_size_mask & (1<<PG_LEVEL_2M)) && !(mr[i].page_size_mask & (1<<PG_LEVEL_2M))) { unsigned long start = round_down(mr[i].start, PMD_SIZE); unsigned long end = round_up(mr[i].end, PMD_SIZE); #ifdef CONFIG_X86_32 if ((end >> PAGE_SHIFT) > max_low_pfn) continue; #endif if (memblock_is_region_memory(start, end - start)) mr[i].page_size_mask |= 1<<PG_LEVEL_2M; } if ((page_size_mask & (1<<PG_LEVEL_1G)) && !(mr[i].page_size_mask & (1<<PG_LEVEL_1G))) { unsigned long start = round_down(mr[i].start, PUD_SIZE); unsigned long end = round_up(mr[i].end, PUD_SIZE); if (memblock_is_region_memory(start, end - start)) mr[i].page_size_mask |= 1<<PG_LEVEL_1G; } } } static const char *page_size_string(struct map_range *mr) { static const char str_1g[] = "1G"; static const char str_2m[] = "2M"; static const char str_4m[] = "4M"; static const char str_4k[] = "4k"; if (mr->page_size_mask & (1<<PG_LEVEL_1G)) return str_1g; /* * 32-bit without PAE has a 4M large page size. * PG_LEVEL_2M is misnamed, but we can at least * print out the right size in the string. */ if (IS_ENABLED(CONFIG_X86_32) && !IS_ENABLED(CONFIG_X86_PAE) && mr->page_size_mask & (1<<PG_LEVEL_2M)) return str_4m; if (mr->page_size_mask & (1<<PG_LEVEL_2M)) return str_2m; return str_4k; } static int __meminit split_mem_range(struct map_range *mr, int nr_range, unsigned long start, unsigned long end) { unsigned long start_pfn, end_pfn, limit_pfn; unsigned long pfn; int i; limit_pfn = PFN_DOWN(end); /* head if not big page alignment ? */ pfn = start_pfn = PFN_DOWN(start); #ifdef CONFIG_X86_32 /* * Don't use a large page for the first 2/4MB of memory * because there are often fixed size MTRRs in there * and overlapping MTRRs into large pages can cause * slowdowns. */ if (pfn == 0) end_pfn = PFN_DOWN(PMD_SIZE); else end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); #else /* CONFIG_X86_64 */ end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); #endif if (end_pfn > limit_pfn) end_pfn = limit_pfn; if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); pfn = end_pfn; } /* big page (2M) range */ start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); #ifdef CONFIG_X86_32 end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); #else /* CONFIG_X86_64 */ end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE)); if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE))) end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); #endif if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & (1<<PG_LEVEL_2M)); pfn = end_pfn; } #ifdef CONFIG_X86_64 /* big page (1G) range */ start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE)); end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE)); if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G))); pfn = end_pfn; } /* tail is not big page (1G) alignment */ start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & (1<<PG_LEVEL_2M)); pfn = end_pfn; } #endif /* tail is not big page (2M) alignment */ start_pfn = pfn; end_pfn = limit_pfn; nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); if (!after_bootmem) adjust_range_page_size_mask(mr, nr_range); /* try to merge same page size and continuous */ for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { unsigned long old_start; if (mr[i].end != mr[i+1].start || mr[i].page_size_mask != mr[i+1].page_size_mask) continue; /* move it */ old_start = mr[i].start; memmove(&mr[i], &mr[i+1], (nr_range - 1 - i) * sizeof(struct map_range)); mr[i--].start = old_start; nr_range--; } for (i = 0; i < nr_range; i++) pr_debug(" [mem %#010lx-%#010lx] page %s\n", mr[i].start, mr[i].end - 1, page_size_string(&mr[i])); return nr_range; } struct range pfn_mapped[E820_MAX_ENTRIES]; int nr_pfn_mapped; static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn) { nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_MAX_ENTRIES, nr_pfn_mapped, start_pfn, end_pfn); nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_MAX_ENTRIES); max_pfn_mapped = max(max_pfn_mapped, end_pfn); if (start_pfn < (1UL<<(32-PAGE_SHIFT))) max_low_pfn_mapped = max(max_low_pfn_mapped, min(end_pfn, 1UL<<(32-PAGE_SHIFT))); } bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn) { int i; for (i = 0; i < nr_pfn_mapped; i++) if ((start_pfn >= pfn_mapped[i].start) && (end_pfn <= pfn_mapped[i].end)) return true; return false; } /* * Setup the direct mapping of the physical memory at PAGE_OFFSET. * This runs before bootmem is initialized and gets pages directly from * the physical memory. To access them they are temporarily mapped. */ unsigned long __ref init_memory_mapping(unsigned long start, unsigned long end, pgprot_t prot) { struct map_range mr[NR_RANGE_MR]; unsigned long ret = 0; int nr_range, i; pr_debug("init_memory_mapping: [mem %#010lx-%#010lx]\n", start, end - 1); memset(mr, 0, sizeof(mr)); nr_range = split_mem_range(mr, 0, start, end); for (i = 0; i < nr_range; i++) ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, mr[i].page_size_mask, prot); add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT); return ret >> PAGE_SHIFT; } /* * We need to iterate through the E820 memory map and create direct mappings * for only E820_TYPE_RAM and E820_KERN_RESERVED regions. We cannot simply * create direct mappings for all pfns from [0 to max_low_pfn) and * [4GB to max_pfn) because of possible memory holes in high addresses * that cannot be marked as UC by fixed/variable range MTRRs. * Depending on the alignment of E820 ranges, this may possibly result * in using smaller size (i.e. 4K instead of 2M or 1G) page tables. * * init_mem_mapping() calls init_range_memory_mapping() with big range. * That range would have hole in the middle or ends, and only ram parts * will be mapped in init_range_memory_mapping(). */ static unsigned long __init init_range_memory_mapping( unsigned long r_start, unsigned long r_end) { unsigned long start_pfn, end_pfn; unsigned long mapped_ram_size = 0; int i; for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end); u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end); if (start >= end) continue; /* * if it is overlapping with brk pgt, we need to * alloc pgt buf from memblock instead. */ can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >= min(end, (u64)pgt_buf_top<<PAGE_SHIFT); init_memory_mapping(start, end, PAGE_KERNEL); mapped_ram_size += end - start; can_use_brk_pgt = true; } return mapped_ram_size; } static unsigned long __init get_new_step_size(unsigned long step_size) { /* * Initial mapped size is PMD_SIZE (2M). * We can not set step_size to be PUD_SIZE (1G) yet. * In worse case, when we cross the 1G boundary, and * PG_LEVEL_2M is not set, we will need 1+1+512 pages (2M + 8k) * to map 1G range with PTE. Hence we use one less than the * difference of page table level shifts. * * Don't need to worry about overflow in the top-down case, on 32bit, * when step_size is 0, round_down() returns 0 for start, and that * turns it into 0x100000000ULL. * In the bottom-up case, round_up(x, 0) returns 0 though too, which * needs to be taken into consideration by the code below. */ return step_size << (PMD_SHIFT - PAGE_SHIFT - 1); } /** * memory_map_top_down - Map [map_start, map_end) top down * @map_start: start address of the target memory range * @map_end: end address of the target memory range * * This function will setup direct mapping for memory range * [map_start, map_end) in top-down. That said, the page tables * will be allocated at the end of the memory, and we map the * memory in top-down. */ static void __init memory_map_top_down(unsigned long map_start, unsigned long map_end) { unsigned long real_end, last_start; unsigned long step_size; unsigned long addr; unsigned long mapped_ram_size = 0; /* * Systems that have many reserved areas near top of the memory, * e.g. QEMU with less than 1G RAM and EFI enabled, or Xen, will * require lots of 4K mappings which may exhaust pgt_buf. * Start with top-most PMD_SIZE range aligned at PMD_SIZE to ensure * there is enough mapped memory that can be allocated from * memblock. */ addr = memblock_phys_alloc_range(PMD_SIZE, PMD_SIZE, map_start, map_end); if (!addr) { pr_warn("Failed to release memory for alloc_low_pages()"); real_end = max(map_start, ALIGN_DOWN(map_end, PMD_SIZE)); } else { memblock_phys_free(addr, PMD_SIZE); real_end = addr + PMD_SIZE; } /* step_size need to be small so pgt_buf from BRK could cover it */ step_size = PMD_SIZE; max_pfn_mapped = 0; /* will get exact value next */ min_pfn_mapped = real_end >> PAGE_SHIFT; last_start = real_end; /* * We start from the top (end of memory) and go to the bottom. * The memblock_find_in_range() gets us a block of RAM from the * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages * for page table. */ while (last_start > map_start) { unsigned long start; if (last_start > step_size) { start = round_down(last_start - 1, step_size); if (start < map_start) start = map_start; } else start = map_start; mapped_ram_size += init_range_memory_mapping(start, last_start); last_start = start; min_pfn_mapped = last_start >> PAGE_SHIFT; if (mapped_ram_size >= step_size) step_size = get_new_step_size(step_size); } if (real_end < map_end) init_range_memory_mapping(real_end, map_end); } /** * memory_map_bottom_up - Map [map_start, map_end) bottom up * @map_start: start address of the target memory range * @map_end: end address of the target memory range * * This function will setup direct mapping for memory range * [map_start, map_end) in bottom-up. Since we have limited the * bottom-up allocation above the kernel, the page tables will * be allocated just above the kernel and we map the memory * in [map_start, map_end) in bottom-up. */ static void __init memory_map_bottom_up(unsigned long map_start, unsigned long map_end) { unsigned long next, start; unsigned long mapped_ram_size = 0; /* step_size need to be small so pgt_buf from BRK could cover it */ unsigned long step_size = PMD_SIZE; start = map_start; min_pfn_mapped = start >> PAGE_SHIFT; /* * We start from the bottom (@map_start) and go to the top (@map_end). * The memblock_find_in_range() gets us a block of RAM from the * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages * for page table. */ while (start < map_end) { if (step_size && map_end - start > step_size) { next = round_up(start + 1, step_size); if (next > map_end) next = map_end; } else { next = map_end; } mapped_ram_size += init_range_memory_mapping(start, next); start = next; if (mapped_ram_size >= step_size) step_size = get_new_step_size(step_size); } } /* * The real mode trampoline, which is required for bootstrapping CPUs * occupies only a small area under the low 1MB. See reserve_real_mode() * for details. * * If KASLR is disabled the first PGD entry of the direct mapping is copied * to map the real mode trampoline. * * If KASLR is enabled, copy only the PUD which covers the low 1MB * area. This limits the randomization granularity to 1GB for both 4-level * and 5-level paging. */ static void __init init_trampoline(void) { #ifdef CONFIG_X86_64 /* * The code below will alias kernel page-tables in the user-range of the * address space, including the Global bit. So global TLB entries will * be created when using the trampoline page-table. */ if (!kaslr_memory_enabled()) trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)]; else init_trampoline_kaslr(); #endif } void __init init_mem_mapping(void) { unsigned long end; pti_check_boottime_disable(); probe_page_size_mask(); setup_pcid(); #ifdef CONFIG_X86_64 end = max_pfn << PAGE_SHIFT; #else end = max_low_pfn << PAGE_SHIFT; #endif /* the ISA range is always mapped regardless of memory holes */ init_memory_mapping(0, ISA_END_ADDRESS, PAGE_KERNEL); /* Init the trampoline, possibly with KASLR memory offset */ init_trampoline(); /* * If the allocation is in bottom-up direction, we setup direct mapping * in bottom-up, otherwise we setup direct mapping in top-down. */ if (memblock_bottom_up()) { unsigned long kernel_end = __pa_symbol(_end); /* * we need two separate calls here. This is because we want to * allocate page tables above the kernel. So we first map * [kernel_end, end) to make memory above the kernel be mapped * as soon as possible. And then use page tables allocated above * the kernel to map [ISA_END_ADDRESS, kernel_end). */ memory_map_bottom_up(kernel_end, end); memory_map_bottom_up(ISA_END_ADDRESS, kernel_end); } else { memory_map_top_down(ISA_END_ADDRESS, end); } #ifdef CONFIG_X86_64 if (max_pfn > max_low_pfn) { /* can we preserve max_low_pfn ?*/ max_low_pfn = max_pfn; } #else early_ioremap_page_table_range_init(); #endif load_cr3(swapper_pg_dir); __flush_tlb_all(); x86_init.hyper.init_mem_mapping(); early_memtest(0, max_pfn_mapped << PAGE_SHIFT); } /* * Initialize an mm_struct to be used during poking and a pointer to be used * during patching. */ void __init poking_init(void) { spinlock_t *ptl; pte_t *ptep; text_poke_mm = mm_alloc(); BUG_ON(!text_poke_mm); /* Xen PV guests need the PGD to be pinned. */ paravirt_enter_mmap(text_poke_mm); set_notrack_mm(text_poke_mm); /* * Randomize the poking address, but make sure that the following page * will be mapped at the same PMD. We need 2 pages, so find space for 3, * and adjust the address if the PMD ends after the first one. */ text_poke_mm_addr = TASK_UNMAPPED_BASE; if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) text_poke_mm_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) % (TASK_SIZE - TASK_UNMAPPED_BASE - 3 * PAGE_SIZE); if (((text_poke_mm_addr + PAGE_SIZE) & ~PMD_MASK) == 0) text_poke_mm_addr += PAGE_SIZE; /* * We need to trigger the allocation of the page-tables that will be * needed for poking now. Later, poking may be performed in an atomic * section, which might cause allocation to fail. */ ptep = get_locked_pte(text_poke_mm, text_poke_mm_addr, &ptl); BUG_ON(!ptep); pte_unmap_unlock(ptep, ptl); } /* * devmem_is_allowed() checks to see if /dev/mem access to a certain address * is valid. The argument is a physical page number. * * On x86, access has to be given to the first megabyte of RAM because that * area traditionally contains BIOS code and data regions used by X, dosemu, * and similar apps. Since they map the entire memory range, the whole range * must be allowed (for mapping), but any areas that would otherwise be * disallowed are flagged as being "zero filled" instead of rejected. * Access has to be given to non-kernel-ram areas as well, these contain the * PCI mmio resources as well as potential bios/acpi data regions. */ int devmem_is_allowed(unsigned long pagenr) { if (region_intersects(PFN_PHYS(pagenr), PAGE_SIZE, IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE) != REGION_DISJOINT) { /* * For disallowed memory regions in the low 1MB range, * request that the page be shown as all zeros. */ if (pagenr < 256) return 2; return 0; } /* * This must follow RAM test, since System RAM is considered a * restricted resource under CONFIG_STRICT_DEVMEM. */ if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) { /* Low 1MB bypasses iomem restrictions. */ if (pagenr < 256) return 1; return 0; } return 1; } void free_init_pages(const char *what, unsigned long begin, unsigned long end) { unsigned long begin_aligned, end_aligned; /* Make sure boundaries are page aligned */ begin_aligned = PAGE_ALIGN(begin); end_aligned = end & PAGE_MASK; if (WARN_ON(begin_aligned != begin || end_aligned != end)) { begin = begin_aligned; end = end_aligned; } if (begin >= end) return; /* * If debugging page accesses then do not free this memory but * mark them not present - any buggy init-section access will * create a kernel page fault: */ if (debug_pagealloc_enabled()) { pr_info("debug: unmapping init [mem %#010lx-%#010lx]\n", begin, end - 1); /* * Inform kmemleak about the hole in the memory since the * corresponding pages will be unmapped. */ kmemleak_free_part((void *)begin, end - begin); set_memory_np(begin, (end - begin) >> PAGE_SHIFT); } else { /* * We just marked the kernel text read only above, now that * we are going to free part of that, we need to make that * writeable and non-executable first. */ set_memory_nx(begin, (end - begin) >> PAGE_SHIFT); set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); free_reserved_area((void *)begin, (void *)end, POISON_FREE_INITMEM, what); } } /* * begin/end can be in the direct map or the "high kernel mapping" * used for the kernel image only. free_init_pages() will do the * right thing for either kind of address. */ void free_kernel_image_pages(const char *what, void *begin, void *end) { unsigned long begin_ul = (unsigned long)begin; unsigned long end_ul = (unsigned long)end; unsigned long len_pages = (end_ul - begin_ul) >> PAGE_SHIFT; free_init_pages(what, begin_ul, end_ul); /* * PTI maps some of the kernel into userspace. For performance, * this includes some kernel areas that do not contain secrets. * Those areas might be adjacent to the parts of the kernel image * being freed, which may contain secrets. Remove the "high kernel * image mapping" for these freed areas, ensuring they are not even * potentially vulnerable to Meltdown regardless of the specific * optimizations PTI is currently using. * * The "noalias" prevents unmapping the direct map alias which is * needed to access the freed pages. * * This is only valid for 64bit kernels. 32bit has only one mapping * which can't be treated in this way for obvious reasons. */ if (IS_ENABLED(CONFIG_X86_64) && cpu_feature_enabled(X86_FEATURE_PTI)) set_memory_np_noalias(begin_ul, len_pages); } void __ref free_initmem(void) { e820__reallocate_tables(); mem_encrypt_free_decrypted_mem(); free_kernel_image_pages("unused kernel image (initmem)", &__init_begin, &__init_end); } #ifdef CONFIG_BLK_DEV_INITRD void __init free_initrd_mem(unsigned long start, unsigned long end) { /* * end could be not aligned, and We can not align that, * decompressor could be confused by aligned initrd_end * We already reserve the end partial page before in * - i386_start_kernel() * - x86_64_start_kernel() * - relocate_initrd() * So here We can do PAGE_ALIGN() safely to get partial page to be freed */ free_init_pages("initrd", start, PAGE_ALIGN(end)); } #endif void __init arch_zone_limits_init(unsigned long *max_zone_pfns) { #ifdef CONFIG_ZONE_DMA max_zone_pfns[ZONE_DMA] = min(MAX_DMA_PFN, max_low_pfn); #endif #ifdef CONFIG_ZONE_DMA32 max_zone_pfns[ZONE_DMA32] = min(MAX_DMA32_PFN, max_low_pfn); #endif max_zone_pfns[ZONE_NORMAL] = max_low_pfn; #ifdef CONFIG_HIGHMEM max_zone_pfns[ZONE_HIGHMEM] = max_pfn; #endif } __visible DEFINE_PER_CPU_ALIGNED(struct tlb_state, cpu_tlbstate) = { .loaded_mm = &init_mm, .next_asid = 1, .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ }; #ifdef CONFIG_ADDRESS_MASKING DEFINE_PER_CPU(u64, tlbstate_untag_mask); EXPORT_PER_CPU_SYMBOL(tlbstate_untag_mask); #endif void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache) { /* entry 0 MUST be WB (hardwired to speed up translations) */ BUG_ON(!entry && cache != _PAGE_CACHE_MODE_WB); __cachemode2pte_tbl[cache] = __cm_idx2pte(entry); __pte2cachemode_tbl[entry] = cache; } #ifdef CONFIG_SWAP unsigned long arch_max_swapfile_size(void) { unsigned long pages; pages = generic_max_swapfile_size(); if (boot_cpu_has_bug(X86_BUG_L1TF) && l1tf_mitigation != L1TF_MITIGATION_OFF) { /* Limit the swap file size to MAX_PA/2 for L1TF workaround */ unsigned long long l1tf_limit = l1tf_pfn_limit(); /* * We encode swap offsets also with 3 bits below those for pfn * which makes the usable limit higher. */ #if CONFIG_PGTABLE_LEVELS > 2 l1tf_limit <<= PAGE_SHIFT - SWP_OFFSET_FIRST_BIT; #endif pages = min_t(unsigned long long, l1tf_limit, pages); } return pages; } #endif #ifdef CONFIG_EXECMEM static struct execmem_info execmem_info __ro_after_init; #ifdef CONFIG_ARCH_HAS_EXECMEM_ROX void execmem_fill_trapping_insns(void *ptr, size_t size) { memset(ptr, INT3_INSN_OPCODE, size); } #endif struct execmem_info __init *execmem_arch_setup(void) { unsigned long start, offset = 0; enum execmem_range_flags flags; pgprot_t pgprot; if (kaslr_enabled()) offset = get_random_u32_inclusive(1, 1024) * PAGE_SIZE; start = MODULES_VADDR + offset; if (IS_ENABLED(CONFIG_ARCH_HAS_EXECMEM_ROX) && cpu_feature_enabled(X86_FEATURE_PSE)) { pgprot = PAGE_KERNEL_ROX; flags = EXECMEM_KASAN_SHADOW | EXECMEM_ROX_CACHE; } else { pgprot = PAGE_KERNEL; flags = EXECMEM_KASAN_SHADOW; } execmem_info = (struct execmem_info){ .ranges = { [EXECMEM_MODULE_TEXT] = { .flags = flags, .start = start, .end = MODULES_END, .pgprot = pgprot, .alignment = MODULE_ALIGN, }, [EXECMEM_KPROBES] = { .flags = flags, .start = start, .end = MODULES_END, .pgprot = PAGE_KERNEL_ROX, .alignment = MODULE_ALIGN, }, [EXECMEM_FTRACE] = { .flags = flags, .start = start, .end = MODULES_END, .pgprot = pgprot, .alignment = MODULE_ALIGN, }, [EXECMEM_BPF] = { .flags = EXECMEM_KASAN_SHADOW, .start = start, .end = MODULES_END, .pgprot = PAGE_KERNEL, .alignment = MODULE_ALIGN, }, [EXECMEM_MODULE_DATA] = { .flags = EXECMEM_KASAN_SHADOW, .start = start, .end = MODULES_END, .pgprot = PAGE_KERNEL, .alignment = MODULE_ALIGN, }, }, }; return &execmem_info; } #endif /* CONFIG_EXECMEM */
70 5 149 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_UNALIGNED_H #define __LINUX_UNALIGNED_H /* * This is the most generic implementation of unaligned accesses * and should work almost anywhere. */ #include <linux/unaligned/packed_struct.h> #include <asm/byteorder.h> #include <vdso/unaligned.h> #define get_unaligned(ptr) __get_unaligned_t(typeof(*(ptr)), (ptr)) #define put_unaligned(val, ptr) __put_unaligned_t(typeof(*(ptr)), (val), (ptr)) static inline u16 get_unaligned_le16(const void *p) { return le16_to_cpu(__get_unaligned_t(__le16, p)); } static inline u32 get_unaligned_le32(const void *p) { return le32_to_cpu(__get_unaligned_t(__le32, p)); } static inline u64 get_unaligned_le64(const void *p) { return le64_to_cpu(__get_unaligned_t(__le64, p)); } static inline void put_unaligned_le16(u16 val, void *p) { __put_unaligned_t(__le16, cpu_to_le16(val), p); } static inline void put_unaligned_le32(u32 val, void *p) { __put_unaligned_t(__le32, cpu_to_le32(val), p); } static inline void put_unaligned_le64(u64 val, void *p) { __put_unaligned_t(__le64, cpu_to_le64(val), p); } static inline u16 get_unaligned_be16(const void *p) { return be16_to_cpu(__get_unaligned_t(__be16, p)); } static inline u32 get_unaligned_be32(const void *p) { return be32_to_cpu(__get_unaligned_t(__be32, p)); } static inline u64 get_unaligned_be64(const void *p) { return be64_to_cpu(__get_unaligned_t(__be64, p)); } static inline void put_unaligned_be16(u16 val, void *p) { __put_unaligned_t(__be16, cpu_to_be16(val), p); } static inline void put_unaligned_be32(u32 val, void *p) { __put_unaligned_t(__be32, cpu_to_be32(val), p); } static inline void put_unaligned_be64(u64 val, void *p) { __put_unaligned_t(__be64, cpu_to_be64(val), p); } static inline u32 __get_unaligned_be24(const u8 *p) { return p[0] << 16 | p[1] << 8 | p[2]; } static inline u32 get_unaligned_be24(const void *p) { return __get_unaligned_be24(p); } static inline u32 __get_unaligned_le24(const u8 *p) { return p[0] | p[1] << 8 | p[2] << 16; } static inline u32 get_unaligned_le24(const void *p) { return __get_unaligned_le24(p); } static inline void __put_unaligned_be24(const u32 val, u8 *p) { *p++ = (val >> 16) & 0xff; *p++ = (val >> 8) & 0xff; *p++ = val & 0xff; } static inline void put_unaligned_be24(const u32 val, void *p) { __put_unaligned_be24(val, p); } static inline void __put_unaligned_le24(const u32 val, u8 *p) { *p++ = val & 0xff; *p++ = (val >> 8) & 0xff; *p++ = (val >> 16) & 0xff; } static inline void put_unaligned_le24(const u32 val, void *p) { __put_unaligned_le24(val, p); } static inline void __put_unaligned_be48(const u64 val, u8 *p) { *p++ = (val >> 40) & 0xff; *p++ = (val >> 32) & 0xff; *p++ = (val >> 24) & 0xff; *p++ = (val >> 16) & 0xff; *p++ = (val >> 8) & 0xff; *p++ = val & 0xff; } static inline void put_unaligned_be48(const u64 val, void *p) { __put_unaligned_be48(val, p); } static inline u64 __get_unaligned_be48(const u8 *p) { return (u64)p[0] << 40 | (u64)p[1] << 32 | (u64)p[2] << 24 | p[3] << 16 | p[4] << 8 | p[5]; } static inline u64 get_unaligned_be48(const void *p) { return __get_unaligned_be48(p); } #endif /* __LINUX_UNALIGNED_H */
20 21 21 21 21 1 21 13 8 5 2 12 6 15 11 4 2 2 13 12 8 5 15 11 4 15 5 15 5 5 5 21 21 21 21 21 21 21 10 1 2 2 6 11 11 5 16 16 16 16 21 21 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 // SPDX-License-Identifier: GPL-2.0 /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * The IP fragmentation functionality. * * Authors: Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG> * Alan Cox <alan@lxorguk.ukuu.org.uk> * * Fixes: * Alan Cox : Split from ip.c , see ip_input.c for history. * David S. Miller : Begin massive cleanup... * Andi Kleen : Add sysctls. * xxxx : Overlapfrag bug. * Ultima : ip_expire() kernel panic. * Bill Hawes : Frag accounting and evictor fixes. * John McDonald : 0 length frag bug. * Alexey Kuznetsov: SMP races, threading, cleanup. * Patrick McHardy : LRU queue of frag heads for evictor. */ #define pr_fmt(fmt) "IPv4: " fmt #include <linux/compiler.h> #include <linux/module.h> #include <linux/types.h> #include <linux/mm.h> #include <linux/jiffies.h> #include <linux/skbuff.h> #include <linux/list.h> #include <linux/ip.h> #include <linux/icmp.h> #include <linux/netdevice.h> #include <linux/jhash.h> #include <linux/random.h> #include <linux/slab.h> #include <net/route.h> #include <net/dst.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/checksum.h> #include <net/inetpeer.h> #include <net/inet_frag.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/inet.h> #include <linux/netfilter_ipv4.h> #include <net/inet_ecn.h> #include <net/l3mdev.h> /* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6 * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c * as well. Or notify me, at least. --ANK */ static const char ip_frag_cache_name[] = "ip4-frags"; /* Describe an entry in the "incomplete datagrams" queue. */ struct ipq { struct inet_frag_queue q; u8 ecn; /* RFC3168 support */ u16 max_df_size; /* largest frag with DF set seen */ int iif; unsigned int rid; struct inet_peer *peer; }; static u8 ip4_frag_ecn(u8 tos) { return 1 << (tos & INET_ECN_MASK); } static struct inet_frags ip4_frags; static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *dev, int *refs); static void ip4_frag_init(struct inet_frag_queue *q, const void *a) { struct ipq *qp = container_of(q, struct ipq, q); const struct frag_v4_compare_key *key = a; struct net *net = q->fqdir->net; struct inet_peer *p = NULL; q->key.v4 = *key; qp->ecn = 0; if (q->fqdir->max_dist) { rcu_read_lock(); p = inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif); if (p && !refcount_inc_not_zero(&p->refcnt)) p = NULL; rcu_read_unlock(); } qp->peer = p; } static void ip4_frag_free(struct inet_frag_queue *q) { struct ipq *qp; qp = container_of(q, struct ipq, q); if (qp->peer) inet_putpeer(qp->peer); } static bool frag_expire_skip_icmp(u32 user) { return user == IP_DEFRAG_AF_PACKET || ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_IN, __IP_DEFRAG_CONNTRACK_IN_END) || ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_BRIDGE_IN, __IP_DEFRAG_CONNTRACK_BRIDGE_IN); } /* * Oops, a fragment queue timed out. Kill it and send an ICMP reply. */ static void ip_expire(struct timer_list *t) { enum skb_drop_reason reason = SKB_DROP_REASON_FRAG_REASM_TIMEOUT; struct inet_frag_queue *frag = timer_container_of(frag, t, timer); const struct iphdr *iph; struct sk_buff *head = NULL; struct net *net; struct ipq *qp; int refs = 1; qp = container_of(frag, struct ipq, q); net = qp->q.fqdir->net; rcu_read_lock(); spin_lock(&qp->q.lock); if (qp->q.flags & INET_FRAG_COMPLETE) goto out; qp->q.flags |= INET_FRAG_DROP; inet_frag_kill(&qp->q, &refs); /* Paired with WRITE_ONCE() in fqdir_pre_exit(). */ if (READ_ONCE(qp->q.fqdir->dead)) { inet_frag_queue_flush(&qp->q, 0); goto out; } __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT); if (!(qp->q.flags & INET_FRAG_FIRST_IN)) goto out; /* sk_buff::dev and sk_buff::rbnode are unionized. So we * pull the head out of the tree in order to be able to * deal with head->dev. */ head = inet_frag_pull_head(&qp->q); if (!head) goto out; head->dev = dev_get_by_index_rcu(net, qp->iif); if (!head->dev) goto out; /* skb has no dst, perform route lookup again */ iph = ip_hdr(head); reason = ip_route_input_noref(head, iph->daddr, iph->saddr, ip4h_dscp(iph), head->dev); if (reason) goto out; /* Only an end host needs to send an ICMP * "Fragment Reassembly Timeout" message, per RFC792. */ reason = SKB_DROP_REASON_FRAG_REASM_TIMEOUT; if (frag_expire_skip_icmp(qp->q.key.v4.user) && (skb_rtable(head)->rt_type != RTN_LOCAL)) goto out; spin_unlock(&qp->q.lock); icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); goto out_rcu_unlock; out: spin_unlock(&qp->q.lock); out_rcu_unlock: rcu_read_unlock(); kfree_skb_reason(head, reason); inet_frag_putn(&qp->q, refs); } /* Find the correct entry in the "incomplete datagrams" queue for * this IP datagram, and create new one, if nothing is found. */ static struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user, int vif) { struct frag_v4_compare_key key = { .saddr = iph->saddr, .daddr = iph->daddr, .user = user, .vif = vif, .id = iph->id, .protocol = iph->protocol, }; struct inet_frag_queue *q; q = inet_frag_find(net->ipv4.fqdir, &key); if (!q) return NULL; return container_of(q, struct ipq, q); } /* Is the fragment too far ahead to be part of ipq? */ static int ip_frag_too_far(struct ipq *qp) { struct inet_peer *peer = qp->peer; unsigned int max = qp->q.fqdir->max_dist; unsigned int start, end; int rc; if (!peer || !max) return 0; start = qp->rid; end = atomic_inc_return(&peer->rid); qp->rid = end; rc = qp->q.fragments_tail && (end - start) > max; if (rc) __IP_INC_STATS(qp->q.fqdir->net, IPSTATS_MIB_REASMFAILS); return rc; } static int ip_frag_reinit(struct ipq *qp) { if (!mod_timer_pending(&qp->q.timer, jiffies + qp->q.fqdir->timeout)) return -ETIMEDOUT; inet_frag_queue_flush(&qp->q, SKB_DROP_REASON_FRAG_TOO_FAR); qp->q.flags = 0; qp->q.len = 0; qp->q.meat = 0; qp->q.rb_fragments = RB_ROOT; qp->q.fragments_tail = NULL; qp->q.last_run_head = NULL; qp->iif = 0; qp->ecn = 0; return 0; } /* Add new segment to existing queue. */ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb, int *refs) { struct net *net = qp->q.fqdir->net; int ihl, end, flags, offset; struct sk_buff *prev_tail; struct net_device *dev; unsigned int fragsize; int err = -ENOENT; SKB_DR(reason); u8 ecn; /* If reassembly is already done, @skb must be a duplicate frag. */ if (qp->q.flags & INET_FRAG_COMPLETE) { SKB_DR_SET(reason, DUP_FRAG); goto err; } if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) && unlikely(ip_frag_too_far(qp)) && unlikely(err = ip_frag_reinit(qp))) { inet_frag_kill(&qp->q, refs); goto err; } ecn = ip4_frag_ecn(ip_hdr(skb)->tos); offset = ntohs(ip_hdr(skb)->frag_off); flags = offset & ~IP_OFFSET; offset &= IP_OFFSET; offset <<= 3; /* offset is in 8-byte chunks */ ihl = ip_hdrlen(skb); /* Determine the position of this fragment. */ end = offset + skb->len - skb_network_offset(skb) - ihl; err = -EINVAL; /* Is this the final fragment? */ if ((flags & IP_MF) == 0) { /* If we already have some bits beyond end * or have different end, the segment is corrupted. */ if (end < qp->q.len || ((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len)) goto discard_qp; qp->q.flags |= INET_FRAG_LAST_IN; qp->q.len = end; } else { if (end&7) { end &= ~7; if (skb->ip_summed != CHECKSUM_UNNECESSARY) skb->ip_summed = CHECKSUM_NONE; } if (end > qp->q.len) { /* Some bits beyond end -> corruption. */ if (qp->q.flags & INET_FRAG_LAST_IN) goto discard_qp; qp->q.len = end; } } if (end == offset) goto discard_qp; err = -ENOMEM; if (!pskb_pull(skb, skb_network_offset(skb) + ihl)) goto discard_qp; err = pskb_trim_rcsum(skb, end - offset); if (err) goto discard_qp; /* Note : skb->rbnode and skb->dev share the same location. */ dev = skb->dev; /* Makes sure compiler wont do silly aliasing games */ barrier(); prev_tail = qp->q.fragments_tail; err = inet_frag_queue_insert(&qp->q, skb, offset, end); if (err) goto insert_error; if (dev) qp->iif = dev->ifindex; qp->q.stamp = skb->tstamp; qp->q.tstamp_type = skb->tstamp_type; qp->q.meat += skb->len; qp->ecn |= ecn; add_frag_mem_limit(qp->q.fqdir, skb->truesize); if (offset == 0) qp->q.flags |= INET_FRAG_FIRST_IN; fragsize = skb->len + ihl; if (fragsize > qp->q.max_size) qp->q.max_size = fragsize; if (ip_hdr(skb)->frag_off & htons(IP_DF) && fragsize > qp->max_df_size) qp->max_df_size = fragsize; if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && qp->q.meat == qp->q.len) { unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; err = ip_frag_reasm(qp, skb, prev_tail, dev, refs); skb->_skb_refdst = orefdst; if (err) inet_frag_kill(&qp->q, refs); return err; } skb_dst_drop(skb); skb_orphan(skb); return -EINPROGRESS; insert_error: if (err == IPFRAG_DUP) { SKB_DR_SET(reason, DUP_FRAG); err = -EINVAL; goto err; } err = -EINVAL; __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS); discard_qp: inet_frag_kill(&qp->q, refs); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); err: kfree_skb_reason(skb, reason); return err; } static bool ip_frag_coalesce_ok(const struct ipq *qp) { return qp->q.key.v4.user == IP_DEFRAG_LOCAL_DELIVER; } /* Build a new IP datagram from all its fragments. */ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *dev, int *refs) { struct net *net = qp->q.fqdir->net; struct iphdr *iph; void *reasm_data; int len, err; u8 ecn; inet_frag_kill(&qp->q, refs); ecn = ip_frag_ecn_table[qp->ecn]; if (unlikely(ecn == 0xff)) { err = -EINVAL; goto out_fail; } /* Make the one we just received the head. */ reasm_data = inet_frag_reasm_prepare(&qp->q, skb, prev_tail); if (!reasm_data) goto out_nomem; len = ip_hdrlen(skb) + qp->q.len; err = -E2BIG; if (len > 65535) goto out_oversize; inet_frag_reasm_finish(&qp->q, skb, reasm_data, ip_frag_coalesce_ok(qp)); skb->dev = dev; IPCB(skb)->frag_max_size = max(qp->max_df_size, qp->q.max_size); iph = ip_hdr(skb); iph->tot_len = htons(len); iph->tos |= ecn; /* When we set IP_DF on a refragmented skb we must also force a * call to ip_fragment to avoid forwarding a DF-skb of size s while * original sender only sent fragments of size f (where f < s). * * We only set DF/IPSKB_FRAG_PMTU if such DF fragment was the largest * frag seen to avoid sending tiny DF-fragments in case skb was built * from one very small df-fragment and one large non-df frag. */ if (qp->max_df_size == qp->q.max_size) { IPCB(skb)->flags |= IPSKB_FRAG_PMTU; iph->frag_off = htons(IP_DF); } else { iph->frag_off = 0; } ip_send_check(iph); __IP_INC_STATS(net, IPSTATS_MIB_REASMOKS); qp->q.rb_fragments = RB_ROOT; qp->q.fragments_tail = NULL; qp->q.last_run_head = NULL; return 0; out_nomem: net_dbg_ratelimited("queue_glue: no memory for gluing queue %p\n", qp); err = -ENOMEM; goto out_fail; out_oversize: net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->q.key.v4.saddr); out_fail: __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); return err; } /* Process an incoming IP datagram fragment. */ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user) { struct net_device *dev; struct ipq *qp; int vif; __IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS); /* Lookup (or create) queue header */ rcu_read_lock(); dev = skb->dev ? : skb_dst_dev_rcu(skb); vif = l3mdev_master_ifindex_rcu(dev); qp = ip_find(net, ip_hdr(skb), user, vif); if (qp) { int ret, refs = 0; spin_lock(&qp->q.lock); ret = ip_frag_queue(qp, skb, &refs); spin_unlock(&qp->q.lock); rcu_read_unlock(); inet_frag_putn(&qp->q, refs); return ret; } rcu_read_unlock(); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); kfree_skb(skb); return -ENOMEM; } EXPORT_SYMBOL(ip_defrag); struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user) { struct iphdr iph; int netoff; u32 len; if (skb->protocol != htons(ETH_P_IP)) return skb; netoff = skb_network_offset(skb); if (skb_copy_bits(skb, netoff, &iph, sizeof(iph)) < 0) return skb; if (iph.ihl < 5 || iph.version != 4) return skb; len = ntohs(iph.tot_len); if (skb->len < netoff + len || len < (iph.ihl * 4)) return skb; if (ip_is_fragment(&iph)) { skb = skb_share_check(skb, GFP_ATOMIC); if (skb) { if (!pskb_may_pull(skb, netoff + iph.ihl * 4)) { kfree_skb(skb); return NULL; } if (pskb_trim_rcsum(skb, netoff + len)) { kfree_skb(skb); return NULL; } memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); if (ip_defrag(net, skb, user)) return NULL; skb_clear_hash(skb); } } return skb; } EXPORT_SYMBOL(ip_check_defrag); #ifdef CONFIG_SYSCTL static int dist_min; static struct ctl_table ip4_frags_ns_ctl_table[] = { { .procname = "ipfrag_high_thresh", .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "ipfrag_low_thresh", .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "ipfrag_time", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "ipfrag_max_dist", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &dist_min, }, }; /* secret interval has been deprecated */ static int ip4_frags_secret_interval_unused; static struct ctl_table ip4_frags_ctl_table[] = { { .procname = "ipfrag_secret_interval", .data = &ip4_frags_secret_interval_unused, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, }; static int __net_init ip4_frags_ns_ctl_register(struct net *net) { struct ctl_table *table; struct ctl_table_header *hdr; table = ip4_frags_ns_ctl_table; if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL); if (!table) goto err_alloc; } table[0].data = &net->ipv4.fqdir->high_thresh; table[0].extra1 = &net->ipv4.fqdir->low_thresh; table[1].data = &net->ipv4.fqdir->low_thresh; table[1].extra2 = &net->ipv4.fqdir->high_thresh; table[2].data = &net->ipv4.fqdir->timeout; table[3].data = &net->ipv4.fqdir->max_dist; hdr = register_net_sysctl_sz(net, "net/ipv4", table, ARRAY_SIZE(ip4_frags_ns_ctl_table)); if (!hdr) goto err_reg; net->ipv4.frags_hdr = hdr; return 0; err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net) { const struct ctl_table *table; table = net->ipv4.frags_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.frags_hdr); kfree(table); } static void __init ip4_frags_ctl_register(void) { register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table); } #else static int ip4_frags_ns_ctl_register(struct net *net) { return 0; } static void ip4_frags_ns_ctl_unregister(struct net *net) { } static void __init ip4_frags_ctl_register(void) { } #endif static int __net_init ipv4_frags_init_net(struct net *net) { int res; res = fqdir_init(&net->ipv4.fqdir, &ip4_frags, net); if (res < 0) return res; /* Fragment cache limits. * * The fragment memory accounting code, (tries to) account for * the real memory usage, by measuring both the size of frag * queue struct (inet_frag_queue (ipv4:ipq/ipv6:frag_queue)) * and the SKB's truesize. * * A 64K fragment consumes 129736 bytes (44*2944)+200 * (1500 truesize == 2944, sizeof(struct ipq) == 200) * * We will commit 4MB at one time. Should we cross that limit * we will prune down to 3MB, making room for approx 8 big 64K * fragments 8x128k. */ net->ipv4.fqdir->high_thresh = 4 * 1024 * 1024; net->ipv4.fqdir->low_thresh = 3 * 1024 * 1024; /* * Important NOTE! Fragment queue must be destroyed before MSL expires. * RFC791 is wrong proposing to prolongate timer each fragment arrival * by TTL. */ net->ipv4.fqdir->timeout = IP_FRAG_TIME; net->ipv4.fqdir->max_dist = 64; res = ip4_frags_ns_ctl_register(net); if (res < 0) fqdir_exit(net->ipv4.fqdir); return res; } static void __net_exit ipv4_frags_pre_exit_net(struct net *net) { fqdir_pre_exit(net->ipv4.fqdir); } static void __net_exit ipv4_frags_exit_net(struct net *net) { ip4_frags_ns_ctl_unregister(net); fqdir_exit(net->ipv4.fqdir); } static struct pernet_operations ip4_frags_ops = { .init = ipv4_frags_init_net, .pre_exit = ipv4_frags_pre_exit_net, .exit = ipv4_frags_exit_net, }; static u32 ip4_key_hashfn(const void *data, u32 len, u32 seed) { return jhash2(data, sizeof(struct frag_v4_compare_key) / sizeof(u32), seed); } static u32 ip4_obj_hashfn(const void *data, u32 len, u32 seed) { const struct inet_frag_queue *fq = data; return jhash2((const u32 *)&fq->key.v4, sizeof(struct frag_v4_compare_key) / sizeof(u32), seed); } static int ip4_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) { const struct frag_v4_compare_key *key = arg->key; const struct inet_frag_queue *fq = ptr; return !!memcmp(&fq->key, key, sizeof(*key)); } static const struct rhashtable_params ip4_rhash_params = { .head_offset = offsetof(struct inet_frag_queue, node), .key_offset = offsetof(struct inet_frag_queue, key), .key_len = sizeof(struct frag_v4_compare_key), .hashfn = ip4_key_hashfn, .obj_hashfn = ip4_obj_hashfn, .obj_cmpfn = ip4_obj_cmpfn, .automatic_shrinking = true, }; void __init ipfrag_init(void) { ip4_frags.constructor = ip4_frag_init; ip4_frags.destructor = ip4_frag_free; ip4_frags.qsize = sizeof(struct ipq); ip4_frags.frag_expire = ip_expire; ip4_frags.frags_cache_name = ip_frag_cache_name; ip4_frags.rhash_params = ip4_rhash_params; if (inet_frags_init(&ip4_frags)) panic("IP: failed to allocate ip4_frags cache\n"); ip4_frags_ctl_register(); register_pernet_subsys(&ip4_frags_ops); }
504 505 503 503 502 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 // SPDX-License-Identifier: GPL-2.0-or-later /* * common UDP/RAW code * Linux INET implementation * * Authors: * Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org> */ #include <linux/types.h> #include <linux/module.h> #include <linux/in.h> #include <net/ip.h> #include <net/sock.h> #include <net/route.h> #include <net/tcp_states.h> #include <net/sock_reuseport.h> int __ip4_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct sockaddr_in *usin = (struct sockaddr_in *) uaddr; struct flowi4 *fl4; struct rtable *rt; __be32 saddr; int oif; int err; if (addr_len < sizeof(*usin)) return -EINVAL; if (usin->sin_family != AF_INET) return -EAFNOSUPPORT; sk_dst_reset(sk); oif = sk->sk_bound_dev_if; saddr = inet->inet_saddr; if (ipv4_is_multicast(usin->sin_addr.s_addr)) { if (!oif || netif_index_is_l3_master(sock_net(sk), oif)) oif = READ_ONCE(inet->mc_index); if (!saddr) saddr = READ_ONCE(inet->mc_addr); } else if (!oif) { oif = READ_ONCE(inet->uc_index); } fl4 = &inet->cork.fl.u.ip4; rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr, oif, sk->sk_protocol, inet->inet_sport, usin->sin_port, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); if (err == -ENETUNREACH) IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); goto out; } if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) { ip_rt_put(rt); err = -EACCES; goto out; } /* Update addresses before rehashing */ inet->inet_daddr = fl4->daddr; inet->inet_dport = usin->sin_port; if (!inet->inet_saddr) inet->inet_saddr = fl4->saddr; if (!inet->inet_rcv_saddr) { inet->inet_rcv_saddr = fl4->saddr; if (sk->sk_prot->rehash) sk->sk_prot->rehash(sk); } reuseport_has_conns_set(sk); sk->sk_state = TCP_ESTABLISHED; sk_set_txhash(sk); atomic_set(&inet->inet_id, get_random_u16()); sk_dst_set(sk, &rt->dst); err = 0; out: return err; } EXPORT_SYMBOL(__ip4_datagram_connect); int ip4_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { int res; lock_sock(sk); res = __ip4_datagram_connect(sk, uaddr, addr_len); release_sock(sk); return res; } EXPORT_SYMBOL(ip4_datagram_connect); /* Because UDP xmit path can manipulate sk_dst_cache without holding * socket lock, we need to use sk_dst_set() here, * even if we own the socket lock. */ void ip4_datagram_release_cb(struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); struct dst_entry *dst; struct flowi4 fl4; struct rtable *rt; rcu_read_lock(); dst = __sk_dst_get(sk); if (!dst || !READ_ONCE(dst->obsolete) || dst->ops->check(dst, 0)) { rcu_read_unlock(); return; } inet_sk_init_flowi4(inet, &fl4); rt = ip_route_output_flow(sock_net(sk), &fl4, sk); dst = !IS_ERR(rt) ? &rt->dst : NULL; sk_dst_set(sk, dst); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ip4_datagram_release_cb);
18 1 17 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _FUTEX_H #define _FUTEX_H #include <linux/futex.h> #include <linux/rtmutex.h> #include <linux/sched/wake_q.h> #include <linux/compat.h> #include <linux/uaccess.h> #include <linux/cleanup.h> #ifdef CONFIG_PREEMPT_RT #include <linux/rcuwait.h> #endif #include <asm/futex.h> /* * Futex flags used to encode options to functions and preserve them across * restarts. */ #define FLAGS_SIZE_8 0x0000 #define FLAGS_SIZE_16 0x0001 #define FLAGS_SIZE_32 0x0002 #define FLAGS_SIZE_64 0x0003 #define FLAGS_SIZE_MASK 0x0003 #ifdef CONFIG_MMU # define FLAGS_SHARED 0x0010 #else /* * NOMMU does not have per process address space. Let the compiler optimize * code away. */ # define FLAGS_SHARED 0x0000 #endif #define FLAGS_CLOCKRT 0x0020 #define FLAGS_HAS_TIMEOUT 0x0040 #define FLAGS_NUMA 0x0080 #define FLAGS_STRICT 0x0100 #define FLAGS_MPOL 0x0200 /* FUTEX_ to FLAGS_ */ static inline unsigned int futex_to_flags(unsigned int op) { unsigned int flags = FLAGS_SIZE_32; if (!(op & FUTEX_PRIVATE_FLAG)) flags |= FLAGS_SHARED; if (op & FUTEX_CLOCK_REALTIME) flags |= FLAGS_CLOCKRT; return flags; } #define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_NUMA | FUTEX2_MPOL | FUTEX2_PRIVATE) /* FUTEX2_ to FLAGS_ */ static inline unsigned int futex2_to_flags(unsigned int flags2) { unsigned int flags = flags2 & FUTEX2_SIZE_MASK; if (!(flags2 & FUTEX2_PRIVATE)) flags |= FLAGS_SHARED; if (flags2 & FUTEX2_NUMA) flags |= FLAGS_NUMA; if (flags2 & FUTEX2_MPOL) flags |= FLAGS_MPOL; return flags; } static inline unsigned int futex_size(unsigned int flags) { return 1 << (flags & FLAGS_SIZE_MASK); } static inline bool futex_flags_valid(unsigned int flags) { /* Only 64bit futexes for 64bit code */ if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall()) { if ((flags & FLAGS_SIZE_MASK) == FLAGS_SIZE_64) return false; } /* Only 32bit futexes are implemented -- for now */ if ((flags & FLAGS_SIZE_MASK) != FLAGS_SIZE_32) return false; /* * Must be able to represent both FUTEX_NO_NODE and every valid nodeid * in a futex word. */ if (flags & FLAGS_NUMA) { int bits = 8 * futex_size(flags); u64 max = ~0ULL; max >>= 64 - bits; if (nr_node_ids >= max) return false; } return true; } static inline bool futex_validate_input(unsigned int flags, u64 val) { int bits = 8 * futex_size(flags); if (bits < 64 && (val >> bits)) return false; return true; } #ifdef CONFIG_FAIL_FUTEX extern bool should_fail_futex(bool fshared); #else static inline bool should_fail_futex(bool fshared) { return false; } #endif /* * Hash buckets are shared by all the futex_keys that hash to the same * location. Each key may have multiple futex_q structures, one for each task * waiting on a futex. */ struct futex_hash_bucket { atomic_t waiters; spinlock_t lock; struct plist_head chain; struct futex_private_hash *priv; } ____cacheline_aligned_in_smp; /* * Priority Inheritance state: */ struct futex_pi_state { /* * list of 'owned' pi_state instances - these have to be * cleaned up in do_exit() if the task exits prematurely: */ struct list_head list; /* * The PI object: */ struct rt_mutex_base pi_mutex; struct task_struct *owner; refcount_t refcount; union futex_key key; } __randomize_layout; struct futex_q; typedef void (futex_wake_fn)(struct wake_q_head *wake_q, struct futex_q *q); /** * struct futex_q - The hashed futex queue entry, one per waiting task * @list: priority-sorted list of tasks waiting on this futex * @task: the task waiting on the futex * @lock_ptr: the hash bucket lock * @wake: the wake handler for this queue * @wake_data: data associated with the wake handler * @key: the key the futex is hashed on * @pi_state: optional priority inheritance state * @rt_waiter: rt_waiter storage for use with requeue_pi * @requeue_pi_key: the requeue_pi target futex key * @bitset: bitset for the optional bitmasked wakeup * @requeue_state: State field for futex_requeue_pi() * @drop_hb_ref: Waiter should drop the extra hash bucket reference if true * @requeue_wait: RCU wait for futex_requeue_pi() (RT only) * * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so * we can wake only the relevant ones (hashed queues may be shared). * * A futex_q has a woken state, just like tasks have TASK_RUNNING. * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. * The order of wakeup is always to make the first condition true, then * the second. * * PI futexes are typically woken before they are removed from the hash list via * the rt_mutex code. See futex_unqueue_pi(). */ struct futex_q { struct plist_node list; struct task_struct *task; spinlock_t *lock_ptr; futex_wake_fn *wake; void *wake_data; union futex_key key; struct futex_pi_state *pi_state; struct rt_mutex_waiter *rt_waiter; union futex_key *requeue_pi_key; u32 bitset; atomic_t requeue_state; bool drop_hb_ref; #ifdef CONFIG_PREEMPT_RT struct rcuwait requeue_wait; #endif } __randomize_layout; extern const struct futex_q futex_q_init; enum futex_access { FUTEX_READ, FUTEX_WRITE }; extern int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, enum futex_access rw); extern void futex_q_lockptr_lock(struct futex_q *q); extern struct hrtimer_sleeper * futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, int flags, u64 range_ns); extern struct futex_hash_bucket *futex_hash(union futex_key *key); #ifdef CONFIG_FUTEX_PRIVATE_HASH extern void futex_hash_get(struct futex_hash_bucket *hb); extern void futex_hash_put(struct futex_hash_bucket *hb); extern struct futex_private_hash *futex_private_hash(void); extern void futex_private_hash_put(struct futex_private_hash *fph); #else /* !CONFIG_FUTEX_PRIVATE_HASH */ static inline void futex_hash_get(struct futex_hash_bucket *hb) { } static inline void futex_hash_put(struct futex_hash_bucket *hb) { } static inline struct futex_private_hash *futex_private_hash(void) { return NULL; } static inline void futex_private_hash_put(struct futex_private_hash *fph) { } #endif DEFINE_CLASS(hb, struct futex_hash_bucket *, if (_T) futex_hash_put(_T), futex_hash(key), union futex_key *key); DEFINE_CLASS(private_hash, struct futex_private_hash *, if (_T) futex_private_hash_put(_T), futex_private_hash(), void); /** * futex_match - Check whether two futex keys are equal * @key1: Pointer to key1 * @key2: Pointer to key2 * * Return 1 if two futex_keys are equal, 0 otherwise. */ static inline int futex_match(union futex_key *key1, union futex_key *key2) { return (key1 && key2 && key1->both.word == key2->both.word && key1->both.ptr == key2->both.ptr && key1->both.offset == key2->both.offset); } extern int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, struct futex_q *q, union futex_key *key2, struct task_struct *task); extern void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout); extern bool __futex_wake_mark(struct futex_q *q); extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q); extern int fault_in_user_writeable(u32 __user *uaddr); extern struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key); static inline int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval) { int ret; pagefault_disable(); ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); pagefault_enable(); return ret; } /* Read from user memory with pagefaults disabled */ static inline int futex_get_value_locked(u32 *dest, u32 __user *from) { guard(pagefault)(); return get_user_inline(*dest, from); } extern void __futex_unqueue(struct futex_q *q); extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb, struct task_struct *task); extern int futex_unqueue(struct futex_q *q); /** * futex_queue() - Enqueue the futex_q on the futex_hash_bucket * @q: The futex_q to enqueue * @hb: The destination hash bucket * @task: Task queueing this futex * * The hb->lock must be held by the caller, and is released here. A call to * futex_queue() is typically paired with exactly one call to futex_unqueue(). The * exceptions involve the PI related operations, which may use futex_unqueue_pi() * or nothing if the unqueue is done as part of the wake process and the unqueue * state is implicit in the state of woken task (see futex_wait_requeue_pi() for * an example). * * Note that @task may be NULL, for async usage of futexes. */ static inline void futex_queue(struct futex_q *q, struct futex_hash_bucket *hb, struct task_struct *task) __releases(&hb->lock) { __futex_queue(q, hb, task); spin_unlock(&hb->lock); } extern void futex_unqueue_pi(struct futex_q *q); extern void wait_for_owner_exiting(int ret, struct task_struct *exiting); /* * Reflects a new waiter being added to the waitqueue. */ static inline void futex_hb_waiters_inc(struct futex_hash_bucket *hb) { #ifdef CONFIG_SMP atomic_inc(&hb->waiters); /* * Full barrier (A), see the ordering comment above. */ smp_mb__after_atomic(); #endif } /* * Reflects a waiter being removed from the waitqueue by wakeup * paths. */ static inline void futex_hb_waiters_dec(struct futex_hash_bucket *hb) { #ifdef CONFIG_SMP atomic_dec(&hb->waiters); #endif } static inline int futex_hb_waiters_pending(struct futex_hash_bucket *hb) { #ifdef CONFIG_SMP /* * Full barrier (B), see the ordering comment above. */ smp_mb(); return atomic_read(&hb->waiters); #else return 1; #endif } extern void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb); extern void futex_q_unlock(struct futex_hash_bucket *hb); extern int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, union futex_key *key, struct futex_pi_state **ps, struct task_struct *task, struct task_struct **exiting, int set_waiters); extern int refill_pi_state_cache(void); extern void get_pi_state(struct futex_pi_state *pi_state); extern void put_pi_state(struct futex_pi_state *pi_state); extern int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked); /* * Express the locking dependencies for lockdep: */ static inline void double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) { if (hb1 > hb2) swap(hb1, hb2); spin_lock(&hb1->lock); if (hb1 != hb2) spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); } static inline void double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) { spin_unlock(&hb1->lock); if (hb1 != hb2) spin_unlock(&hb2->lock); } /* syscalls */ extern int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset, u32 __user *uaddr2); extern int futex_requeue(u32 __user *uaddr1, unsigned int flags1, u32 __user *uaddr2, unsigned int flags2, int nr_wake, int nr_requeue, u32 *cmpval, int requeue_pi); extern int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, struct hrtimer_sleeper *to, u32 bitset); extern int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset); /** * struct futex_vector - Auxiliary struct for futex_waitv() * @w: Userspace provided data * @q: Kernel side data * * Struct used to build an array with all data need for futex_waitv() */ struct futex_vector { struct futex_waitv w; struct futex_q q; }; extern int futex_parse_waitv(struct futex_vector *futexv, struct futex_waitv __user *uwaitv, unsigned int nr_futexes, futex_wake_fn *wake, void *wake_data); extern int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken); extern int futex_unqueue_multiple(struct futex_vector *v, int count); extern int futex_wait_multiple(struct futex_vector *vs, unsigned int count, struct hrtimer_sleeper *to); extern int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset); extern int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, int nr_wake, int nr_wake2, int op); extern int futex_unlock_pi(u32 __user *uaddr, unsigned int flags); extern int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock); #endif /* _FUTEX_H */
2020 2 1858 1880 274 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _NET_RPS_H #define _NET_RPS_H #include <linux/types.h> #include <linux/static_key.h> #include <net/sock.h> #include <net/hotdata.h> #ifdef CONFIG_RPS extern struct static_key_false rps_needed; extern struct static_key_false rfs_needed; /* * This structure holds an RPS map which can be of variable length. The * map is an array of CPUs. */ struct rps_map { unsigned int len; struct rcu_head rcu; u16 cpus[]; }; #define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + ((_num) * sizeof(u16))) /* * The rps_dev_flow structure contains the mapping of a flow to a CPU, the * tail pointer for that CPU's input queue at the time of last enqueue, a * hardware filter index, and the hash of the flow if aRFS is enabled. */ struct rps_dev_flow { u16 cpu; u16 filter; unsigned int last_qtail; #ifdef CONFIG_RFS_ACCEL u32 hash; #endif }; #define RPS_NO_FILTER 0xffff /* * The rps_dev_flow_table structure contains a table of flow mappings. */ struct rps_dev_flow_table { u8 log; struct rcu_head rcu; struct rps_dev_flow flows[]; }; #define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \ ((_num) * sizeof(struct rps_dev_flow))) /* * The rps_sock_flow_table contains mappings of flows to the last CPU * on which they were processed by the application (set in recvmsg). * Each entry is a 32bit value. Upper part is the high-order bits * of flow hash, lower part is CPU number. * rps_cpu_mask is used to partition the space, depending on number of * possible CPUs : rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1 * For example, if 64 CPUs are possible, rps_cpu_mask = 0x3f, * meaning we use 32-6=26 bits for the hash. */ struct rps_sock_flow_table { struct rcu_head rcu; u32 mask; u32 ents[] ____cacheline_aligned_in_smp; }; #define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num])) #define RPS_NO_CPU 0xffff static inline void rps_record_sock_flow(struct rps_sock_flow_table *table, u32 hash) { unsigned int index = hash & table->mask; u32 val = hash & ~net_hotdata.rps_cpu_mask; /* We only give a hint, preemption can change CPU under us */ val |= raw_smp_processor_id(); /* The following WRITE_ONCE() is paired with the READ_ONCE() * here, and another one in get_rps_cpu(). */ if (READ_ONCE(table->ents[index]) != val) WRITE_ONCE(table->ents[index], val); } static inline void _sock_rps_record_flow_hash(__u32 hash) { struct rps_sock_flow_table *sock_flow_table; if (!hash) return; rcu_read_lock(); sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table); if (sock_flow_table) rps_record_sock_flow(sock_flow_table, hash); rcu_read_unlock(); } static inline void _sock_rps_record_flow(const struct sock *sk) { /* Reading sk->sk_rxhash might incur an expensive cache line * miss. * * TCP_ESTABLISHED does cover almost all states where RFS * might be useful, and is cheaper [1] than testing : * IPv4: inet_sk(sk)->inet_daddr * IPv6: ipv6_addr_any(&sk->sk_v6_daddr) * OR an additional socket flag * [1] : sk_state and sk_prot are in the same cache line. */ if (sk->sk_state == TCP_ESTABLISHED) { /* This READ_ONCE() is paired with the WRITE_ONCE() * from sock_rps_save_rxhash() and sock_rps_reset_rxhash(). */ _sock_rps_record_flow_hash(READ_ONCE(sk->sk_rxhash)); } } static inline void _sock_rps_delete_flow(const struct sock *sk) { struct rps_sock_flow_table *table; u32 hash, index; hash = READ_ONCE(sk->sk_rxhash); if (!hash) return; rcu_read_lock(); table = rcu_dereference(net_hotdata.rps_sock_flow_table); if (table) { index = hash & table->mask; if (READ_ONCE(table->ents[index]) != RPS_NO_CPU) WRITE_ONCE(table->ents[index], RPS_NO_CPU); } rcu_read_unlock(); } #endif /* CONFIG_RPS */ static inline bool rfs_is_needed(void) { #ifdef CONFIG_RPS return static_branch_unlikely(&rfs_needed); #else return false; #endif } static inline void sock_rps_record_flow_hash(__u32 hash) { #ifdef CONFIG_RPS if (!rfs_is_needed()) return; _sock_rps_record_flow_hash(hash); #endif } static inline void sock_rps_record_flow(const struct sock *sk) { #ifdef CONFIG_RPS if (!rfs_is_needed()) return; _sock_rps_record_flow(sk); #endif } static inline void sock_rps_delete_flow(const struct sock *sk) { #ifdef CONFIG_RPS if (!rfs_is_needed()) return; _sock_rps_delete_flow(sk); #endif } static inline u32 rps_input_queue_tail_incr(struct softnet_data *sd) { #ifdef CONFIG_RPS return ++sd->input_queue_tail; #else return 0; #endif } static inline void rps_input_queue_tail_save(u32 *dest, u32 tail) { #ifdef CONFIG_RPS WRITE_ONCE(*dest, tail); #endif } static inline void rps_input_queue_head_add(struct softnet_data *sd, int val) { #ifdef CONFIG_RPS WRITE_ONCE(sd->input_queue_head, sd->input_queue_head + val); #endif } static inline void rps_input_queue_head_incr(struct softnet_data *sd) { rps_input_queue_head_add(sd, 1); } #endif /* _NET_RPS_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM mmap #if !defined(_TRACE_MMAP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_MMAP_H #include <linux/tracepoint.h> TRACE_EVENT(vm_unmapped_area, TP_PROTO(unsigned long addr, struct vm_unmapped_area_info *info), TP_ARGS(addr, info), TP_STRUCT__entry( __field(unsigned long, addr) __field(unsigned long, total_vm) __field(unsigned long, flags) __field(unsigned long, length) __field(unsigned long, low_limit) __field(unsigned long, high_limit) __field(unsigned long, align_mask) __field(unsigned long, align_offset) ), TP_fast_assign( __entry->addr = addr; __entry->total_vm = current->mm->total_vm; __entry->flags = info->flags; __entry->length = info->length; __entry->low_limit = info->low_limit; __entry->high_limit = info->high_limit; __entry->align_mask = info->align_mask; __entry->align_offset = info->align_offset; ), TP_printk("addr=0x%lx err=%ld total_vm=0x%lx flags=0x%lx len=0x%lx lo=0x%lx hi=0x%lx mask=0x%lx ofs=0x%lx", IS_ERR_VALUE(__entry->addr) ? 0 : __entry->addr, IS_ERR_VALUE(__entry->addr) ? __entry->addr : 0, __entry->total_vm, __entry->flags, __entry->length, __entry->low_limit, __entry->high_limit, __entry->align_mask, __entry->align_offset) ); TRACE_EVENT(exit_mmap, TP_PROTO(struct mm_struct *mm), TP_ARGS(mm), TP_STRUCT__entry( __field(struct mm_struct *, mm) __field(struct maple_tree *, mt) ), TP_fast_assign( __entry->mm = mm; __entry->mt = &mm->mm_mt; ), TP_printk("mt_mod %p, DESTROY", __entry->mt ) ); #endif /* This part must be outside protection */ #include <trace/define_trace.h>
16 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 // SPDX-License-Identifier: GPL-2.0-only /* * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x. * * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org> */ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/slab.h> #include <net/ip.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("iptables filter table"); #define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \ (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT)) static const struct xt_table packet_filter = { .name = "filter", .valid_hooks = FILTER_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_FILTER, }; static struct nf_hook_ops *filter_ops __read_mostly; /* Default to forward because I got too much mail already. */ static bool forward __read_mostly = true; module_param(forward, bool, 0000); static int iptable_filter_table_init(struct net *net) { struct ipt_replace *repl; int err; repl = ipt_alloc_initial_table(&packet_filter); if (repl == NULL) return -ENOMEM; /* Entry 1 is the FORWARD hook */ ((struct ipt_standard *)repl->entries)[1].target.verdict = forward ? -NF_ACCEPT - 1 : NF_DROP - 1; err = ipt_register_table(net, &packet_filter, repl, filter_ops); kfree(repl); return err; } static int __net_init iptable_filter_net_init(struct net *net) { if (!forward) return iptable_filter_table_init(net); return 0; } static void __net_exit iptable_filter_net_pre_exit(struct net *net) { ipt_unregister_table_pre_exit(net, "filter"); } static void __net_exit iptable_filter_net_exit(struct net *net) { ipt_unregister_table_exit(net, "filter"); } static struct pernet_operations iptable_filter_net_ops = { .init = iptable_filter_net_init, .pre_exit = iptable_filter_net_pre_exit, .exit = iptable_filter_net_exit, }; static int __init iptable_filter_init(void) { int ret = xt_register_template(&packet_filter, iptable_filter_table_init); if (ret < 0) return ret; filter_ops = xt_hook_ops_alloc(&packet_filter, ipt_do_table); if (IS_ERR(filter_ops)) { xt_unregister_template(&packet_filter); return PTR_ERR(filter_ops); } ret = register_pernet_subsys(&iptable_filter_net_ops); if (ret < 0) { xt_unregister_template(&packet_filter); kfree(filter_ops); return ret; } return 0; } static void __exit iptable_filter_fini(void) { unregister_pernet_subsys(&iptable_filter_net_ops); xt_unregister_template(&packet_filter); kfree(filter_ops); } module_init(iptable_filter_init); module_exit(iptable_filter_fini);
194 44 66 66 13 308 308 214 159 159 871 276 99 262 1186 61 703 337 998 49 16055 1297 12678 560 13080 968 1189 1362 265 268 268 35 14 11 2 19 28 41 221 12 211 24 114 1008 281 179 95 95 59 5 1313 222 554 916 1061 57 36 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 /* SPDX-License-Identifier: GPL-2.0 */ /* * net/dst.h Protocol independent destination cache definitions. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * */ #ifndef _NET_DST_H #define _NET_DST_H #include <net/dst_ops.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/rcupdate.h> #include <linux/bug.h> #include <linux/jiffies.h> #include <linux/refcount.h> #include <linux/rcuref.h> #include <net/neighbour.h> #include <asm/processor.h> #include <linux/indirect_call_wrapper.h> struct sk_buff; struct dst_entry { union { struct net_device *dev; struct net_device __rcu *dev_rcu; }; struct dst_ops *ops; unsigned long _metrics; unsigned long expires; #ifdef CONFIG_XFRM struct xfrm_state *xfrm; #else void *__pad1; #endif int (*input)(struct sk_buff *); int (*output)(struct net *net, struct sock *sk, struct sk_buff *skb); unsigned short flags; #define DST_NOXFRM 0x0002 #define DST_NOPOLICY 0x0004 #define DST_NOCOUNT 0x0008 #define DST_FAKE_RTABLE 0x0010 #define DST_XFRM_TUNNEL 0x0020 #define DST_XFRM_QUEUE 0x0040 #define DST_METADATA 0x0080 /* A non-zero value of dst->obsolete forces by-hand validation * of the route entry. Positive values are set by the generic * dst layer to indicate that the entry has been forcefully * destroyed. * * Negative values are used by the implementation layer code to * force invocation of the dst_ops->check() method. */ short obsolete; #define DST_OBSOLETE_NONE 0 #define DST_OBSOLETE_DEAD 2 #define DST_OBSOLETE_FORCE_CHK -1 #define DST_OBSOLETE_KILL -2 unsigned short header_len; /* more space at head required */ unsigned short trailer_len; /* space to reserve at tail */ /* * __rcuref wants to be on a different cache line from * input/output/ops or performance tanks badly */ #ifdef CONFIG_64BIT rcuref_t __rcuref; /* 64-bit offset 64 */ #endif int __use; unsigned long lastuse; struct rcu_head rcu_head; short error; short __pad; __u32 tclassid; #ifndef CONFIG_64BIT struct lwtunnel_state *lwtstate; rcuref_t __rcuref; /* 32-bit offset 64 */ #endif netdevice_tracker dev_tracker; /* * Used by rtable and rt6_info. Moves lwtstate into the next cache * line on 64bit so that lwtstate does not cause false sharing with * __rcuref under contention of __rcuref. This also puts the * frequently accessed members of rtable and rt6_info out of the * __rcuref cache line. */ struct list_head rt_uncached; struct uncached_list *rt_uncached_list; #ifdef CONFIG_64BIT struct lwtunnel_state *lwtstate; #endif }; struct dst_metrics { u32 metrics[RTAX_MAX]; refcount_t refcnt; } __aligned(4); /* Low pointer bits contain DST_METRICS_FLAGS */ extern const struct dst_metrics dst_default_metrics; u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old); #define DST_METRICS_READ_ONLY 0x1UL #define DST_METRICS_REFCOUNTED 0x2UL #define DST_METRICS_FLAGS 0x3UL #define __DST_METRICS_PTR(Y) \ ((u32 *)((Y) & ~DST_METRICS_FLAGS)) #define DST_METRICS_PTR(X) __DST_METRICS_PTR((X)->_metrics) static inline bool dst_metrics_read_only(const struct dst_entry *dst) { return dst->_metrics & DST_METRICS_READ_ONLY; } void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old); static inline void dst_destroy_metrics_generic(struct dst_entry *dst) { unsigned long val = dst->_metrics; if (!(val & DST_METRICS_READ_ONLY)) __dst_destroy_metrics_generic(dst, val); } static inline u32 *dst_metrics_write_ptr(struct dst_entry *dst) { unsigned long p = dst->_metrics; BUG_ON(!p); if (p & DST_METRICS_READ_ONLY) return dst->ops->cow_metrics(dst, p); return __DST_METRICS_PTR(p); } /* This may only be invoked before the entry has reached global * visibility. */ static inline void dst_init_metrics(struct dst_entry *dst, const u32 *src_metrics, bool read_only) { dst->_metrics = ((unsigned long) src_metrics) | (read_only ? DST_METRICS_READ_ONLY : 0); } static inline void dst_copy_metrics(struct dst_entry *dest, const struct dst_entry *src) { u32 *dst_metrics = dst_metrics_write_ptr(dest); if (dst_metrics) { u32 *src_metrics = DST_METRICS_PTR(src); memcpy(dst_metrics, src_metrics, RTAX_MAX * sizeof(u32)); } } static inline u32 *dst_metrics_ptr(struct dst_entry *dst) { return DST_METRICS_PTR(dst); } static inline u32 dst_metric_raw(const struct dst_entry *dst, const int metric) { u32 *p = DST_METRICS_PTR(dst); return p[metric-1]; } static inline u32 dst_metric(const struct dst_entry *dst, const int metric) { WARN_ON_ONCE(metric == RTAX_HOPLIMIT || metric == RTAX_ADVMSS || metric == RTAX_MTU); return dst_metric_raw(dst, metric); } static inline u32 dst_metric_advmss(const struct dst_entry *dst) { u32 advmss = dst_metric_raw(dst, RTAX_ADVMSS); if (!advmss) advmss = dst->ops->default_advmss(dst); return advmss; } static inline void dst_metric_set(struct dst_entry *dst, int metric, u32 val) { u32 *p = dst_metrics_write_ptr(dst); if (p) p[metric-1] = val; } /* Kernel-internal feature bits that are unallocated in user space. */ #define DST_FEATURE_ECN_CA (1U << 31) #define DST_FEATURE_MASK (DST_FEATURE_ECN_CA) #define DST_FEATURE_ECN_MASK (DST_FEATURE_ECN_CA | RTAX_FEATURE_ECN) static inline u32 dst_feature(const struct dst_entry *dst, u32 feature) { return dst_metric(dst, RTAX_FEATURES) & feature; } INDIRECT_CALLABLE_DECLARE(unsigned int ip6_mtu(const struct dst_entry *)); INDIRECT_CALLABLE_DECLARE(unsigned int ipv4_mtu(const struct dst_entry *)); static inline u32 dst_mtu(const struct dst_entry *dst) { return INDIRECT_CALL_INET(dst->ops->mtu, ip6_mtu, ipv4_mtu, dst); } /* Variant of dst_mtu() for IPv4 users. */ static inline u32 dst4_mtu(const struct dst_entry *dst) { return INDIRECT_CALL_1(dst->ops->mtu, ipv4_mtu, dst); } /* RTT metrics are stored in milliseconds for user ABI, but used as jiffies */ static inline unsigned long dst_metric_rtt(const struct dst_entry *dst, int metric) { return msecs_to_jiffies(dst_metric(dst, metric)); } static inline int dst_metric_locked(const struct dst_entry *dst, int metric) { return dst_metric(dst, RTAX_LOCK) & (1 << metric); } static inline void dst_hold(struct dst_entry *dst) { /* * If your kernel compilation stops here, please check * the placement of __rcuref in struct dst_entry */ BUILD_BUG_ON(offsetof(struct dst_entry, __rcuref) & 63); WARN_ON(!rcuref_get(&dst->__rcuref)); } static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) { if (unlikely(time != READ_ONCE(dst->lastuse))) { dst->__use++; WRITE_ONCE(dst->lastuse, time); } } static inline struct dst_entry *dst_clone(struct dst_entry *dst) { if (dst) dst_hold(dst); return dst; } void dst_release(struct dst_entry *dst); void dst_release_immediate(struct dst_entry *dst); static inline void refdst_drop(unsigned long refdst) { if (!(refdst & SKB_DST_NOREF)) dst_release((struct dst_entry *)(refdst & SKB_DST_PTRMASK)); } /** * skb_dst_drop - drops skb dst * @skb: buffer * * Drops dst reference count if a reference was taken. */ static inline void skb_dst_drop(struct sk_buff *skb) { if (skb->_skb_refdst) { refdst_drop(skb->_skb_refdst); skb->_skb_refdst = 0UL; } } static inline void __skb_dst_copy(struct sk_buff *nskb, unsigned long refdst) { nskb->slow_gro |= !!refdst; nskb->_skb_refdst = refdst; if (!(nskb->_skb_refdst & SKB_DST_NOREF)) dst_clone(skb_dst(nskb)); } static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb) { __skb_dst_copy(nskb, oskb->_skb_refdst); } /** * dst_hold_safe - Take a reference on a dst if possible * @dst: pointer to dst entry * * This helper returns false if it could not safely * take a reference on a dst. */ static inline bool dst_hold_safe(struct dst_entry *dst) { return rcuref_get(&dst->__rcuref); } /** * skb_dst_force - makes sure skb dst is refcounted * @skb: buffer * * If dst is not yet refcounted and not destroyed, grab a ref on it. * Returns: true if dst is refcounted. */ static inline bool skb_dst_force(struct sk_buff *skb) { if (skb_dst_is_noref(skb)) { struct dst_entry *dst = skb_dst(skb); WARN_ON(!rcu_read_lock_held()); if (!dst_hold_safe(dst)) dst = NULL; skb->_skb_refdst = (unsigned long)dst; skb->slow_gro |= !!dst; } return skb->_skb_refdst != 0UL; } /** * __skb_tunnel_rx - prepare skb for rx reinsert * @skb: buffer * @dev: tunnel device * @net: netns for packet i/o * * After decapsulation, packet is going to re-enter (netif_rx()) our stack, * so make some cleanups. (no accounting done) */ static inline void __skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev, struct net *net) { skb->dev = dev; /* * Clear hash so that we can recalculate the hash for the * encapsulated packet, unless we have already determine the hash * over the L4 4-tuple. */ skb_clear_hash_if_not_l4(skb); skb_set_queue_mapping(skb, 0); skb_scrub_packet(skb, !net_eq(net, dev_net(dev))); } /** * skb_tunnel_rx - prepare skb for rx reinsert * @skb: buffer * @dev: tunnel device * @net: netns for packet i/o * * After decapsulation, packet is going to re-enter (netif_rx()) our stack, * so make some cleanups, and perform accounting. * Note: this accounting is not SMP safe. */ static inline void skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev, struct net *net) { DEV_STATS_INC(dev, rx_packets); DEV_STATS_ADD(dev, rx_bytes, skb->len); __skb_tunnel_rx(skb, dev, net); } static inline u32 dst_tclassid(const struct sk_buff *skb) { #ifdef CONFIG_IP_ROUTE_CLASSID const struct dst_entry *dst; dst = skb_dst(skb); if (dst) return dst->tclassid; #endif return 0; } int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); static inline int dst_discard(struct sk_buff *skb) { return dst_discard_out(&init_net, skb->sk, skb); } void *dst_alloc(struct dst_ops *ops, struct net_device *dev, int initial_obsolete, unsigned short flags); void dst_init(struct dst_entry *dst, struct dst_ops *ops, struct net_device *dev, int initial_obsolete, unsigned short flags); void dst_dev_put(struct dst_entry *dst); static inline void dst_confirm(struct dst_entry *dst) { } static inline struct neighbour *dst_neigh_lookup(const struct dst_entry *dst, const void *daddr) { struct neighbour *n = dst->ops->neigh_lookup(dst, NULL, daddr); return IS_ERR(n) ? NULL : n; } static inline struct neighbour *dst_neigh_lookup_skb(const struct dst_entry *dst, struct sk_buff *skb) { struct neighbour *n; if (WARN_ON_ONCE(!dst->ops->neigh_lookup)) return NULL; n = dst->ops->neigh_lookup(dst, skb, NULL); return IS_ERR(n) ? NULL : n; } static inline void dst_confirm_neigh(const struct dst_entry *dst, const void *daddr) { if (dst->ops->confirm_neigh) dst->ops->confirm_neigh(dst, daddr); } static inline void dst_link_failure(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); if (dst && dst->ops && dst->ops->link_failure) dst->ops->link_failure(skb); } static inline void dst_set_expires(struct dst_entry *dst, int timeout) { unsigned long old, expires = jiffies + timeout; if (expires == 0) expires = 1; old = READ_ONCE(dst->expires); if (!old || time_before(expires, old)) WRITE_ONCE(dst->expires, expires); } static inline unsigned int dst_dev_overhead(struct dst_entry *dst, struct sk_buff *skb) { if (likely(dst)) return LL_RESERVED_SPACE(dst->dev); return skb->mac_len; } INDIRECT_CALLABLE_DECLARE(int ip6_output(struct net *, struct sock *, struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int ip_output(struct net *, struct sock *, struct sk_buff *)); /* Output packet to network from transport. */ static inline int dst_output(struct net *net, struct sock *sk, struct sk_buff *skb) { return INDIRECT_CALL_INET(READ_ONCE(skb_dst(skb)->output), ip6_output, ip_output, net, sk, skb); } INDIRECT_CALLABLE_DECLARE(int ip6_input(struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int ip_local_deliver(struct sk_buff *)); /* Input packet from network to transport. */ static inline int dst_input(struct sk_buff *skb) { return INDIRECT_CALL_INET(READ_ONCE(skb_dst(skb)->input), ip6_input, ip_local_deliver, skb); } INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *, u32)); INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, u32)); static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie) { if (READ_ONCE(dst->obsolete)) dst = INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, dst, cookie); return dst; } /* Flags for xfrm_lookup flags argument. */ enum { XFRM_LOOKUP_ICMP = 1 << 0, XFRM_LOOKUP_QUEUE = 1 << 1, XFRM_LOOKUP_KEEP_DST_REF = 1 << 2, }; struct flowi; #ifndef CONFIG_XFRM static inline struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags) { return dst_orig; } static inline struct dst_entry * xfrm_lookup_with_ifid(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags, u32 if_id) { return dst_orig; } static inline struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags) { return dst_orig; } static inline struct xfrm_state *dst_xfrm(const struct dst_entry *dst) { return NULL; } #else struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags); struct dst_entry *xfrm_lookup_with_ifid(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags, u32 if_id); struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags); /* skb attached with this dst needs transformation if dst->xfrm is valid */ static inline struct xfrm_state *dst_xfrm(const struct dst_entry *dst) { return dst->xfrm; } #endif static inline void skb_dst_update_pmtu(struct sk_buff *skb, u32 mtu) { struct dst_entry *dst = skb_dst(skb); if (dst && dst->ops->update_pmtu) dst->ops->update_pmtu(dst, NULL, skb, mtu, true); } /* update dst pmtu but not do neighbor confirm */ static inline void skb_dst_update_pmtu_no_confirm(struct sk_buff *skb, u32 mtu) { struct dst_entry *dst = skb_dst(skb); if (dst && dst->ops->update_pmtu) dst->ops->update_pmtu(dst, NULL, skb, mtu, false); } static inline struct net_device *dst_dev(const struct dst_entry *dst) { return READ_ONCE(dst->dev); } static inline struct net_device *dst_dev_rcu(const struct dst_entry *dst) { return rcu_dereference(dst->dev_rcu); } static inline struct net *dst_dev_net_rcu(const struct dst_entry *dst) { return dev_net_rcu(dst_dev_rcu(dst)); } static inline struct net_device *skb_dst_dev(const struct sk_buff *skb) { return dst_dev(skb_dst(skb)); } static inline struct net_device *skb_dst_dev_rcu(const struct sk_buff *skb) { return dst_dev_rcu(skb_dst(skb)); } static inline struct net *skb_dst_dev_net(const struct sk_buff *skb) { return dev_net(skb_dst_dev(skb)); } static inline struct net *skb_dst_dev_net_rcu(const struct sk_buff *skb) { return dev_net_rcu(skb_dst_dev_rcu(skb)); } struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie); void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh); void dst_blackhole_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); u32 *dst_blackhole_cow_metrics(struct dst_entry *dst, unsigned long old); struct neighbour *dst_blackhole_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr); unsigned int dst_blackhole_mtu(const struct dst_entry *dst); #endif /* _NET_DST_H */
249 15 133 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_FS_STRUCT_H #define _LINUX_FS_STRUCT_H #include <linux/sched.h> #include <linux/path.h> #include <linux/spinlock.h> #include <linux/seqlock.h> struct fs_struct { int users; seqlock_t seq; int umask; int in_exec; struct path root, pwd; } __randomize_layout; extern struct kmem_cache *fs_cachep; extern void exit_fs(struct task_struct *); extern void set_fs_root(struct fs_struct *, const struct path *); extern void set_fs_pwd(struct fs_struct *, const struct path *); extern struct fs_struct *copy_fs_struct(struct fs_struct *); extern void free_fs_struct(struct fs_struct *); extern int unshare_fs_struct(void); static inline void get_fs_root(struct fs_struct *fs, struct path *root) { read_seqlock_excl(&fs->seq); *root = fs->root; path_get(root); read_sequnlock_excl(&fs->seq); } static inline void get_fs_pwd(struct fs_struct *fs, struct path *pwd) { read_seqlock_excl(&fs->seq); *pwd = fs->pwd; path_get(pwd); read_sequnlock_excl(&fs->seq); } extern bool current_chrooted(void); static inline int current_umask(void) { return current->fs->umask; } #endif /* _LINUX_FS_STRUCT_H */
3 16 16 16 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 // SPDX-License-Identifier: GPL-2.0 #include <linux/types.h> #include <linux/netfilter.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/percpu.h> #include <linux/netdevice.h> #include <linux/security.h> #include <net/net_namespace.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <net/netfilter/nf_log.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_acct.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_conntrack_timestamp.h> #include <linux/rculist_nulls.h> static bool enable_hooks __read_mostly; MODULE_PARM_DESC(enable_hooks, "Always enable conntrack hooks"); module_param(enable_hooks, bool, 0000); unsigned int nf_conntrack_net_id __read_mostly; #ifdef CONFIG_NF_CONNTRACK_PROCFS void print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_l4proto *l4proto) { switch (tuple->src.l3num) { case NFPROTO_IPV4: seq_printf(s, "src=%pI4 dst=%pI4 ", &tuple->src.u3.ip, &tuple->dst.u3.ip); break; case NFPROTO_IPV6: seq_printf(s, "src=%pI6 dst=%pI6 ", tuple->src.u3.ip6, tuple->dst.u3.ip6); break; default: break; } switch (l4proto->l4proto) { case IPPROTO_ICMP: seq_printf(s, "type=%u code=%u id=%u ", tuple->dst.u.icmp.type, tuple->dst.u.icmp.code, ntohs(tuple->src.u.icmp.id)); break; case IPPROTO_TCP: seq_printf(s, "sport=%hu dport=%hu ", ntohs(tuple->src.u.tcp.port), ntohs(tuple->dst.u.tcp.port)); break; case IPPROTO_UDPLITE: case IPPROTO_UDP: seq_printf(s, "sport=%hu dport=%hu ", ntohs(tuple->src.u.udp.port), ntohs(tuple->dst.u.udp.port)); break; case IPPROTO_SCTP: seq_printf(s, "sport=%hu dport=%hu ", ntohs(tuple->src.u.sctp.port), ntohs(tuple->dst.u.sctp.port)); break; case IPPROTO_ICMPV6: seq_printf(s, "type=%u code=%u id=%u ", tuple->dst.u.icmp.type, tuple->dst.u.icmp.code, ntohs(tuple->src.u.icmp.id)); break; case IPPROTO_GRE: seq_printf(s, "srckey=0x%x dstkey=0x%x ", ntohs(tuple->src.u.gre.key), ntohs(tuple->dst.u.gre.key)); break; default: break; } } EXPORT_SYMBOL_GPL(print_tuple); struct ct_iter_state { struct seq_net_private p; struct hlist_nulls_head *hash; unsigned int htable_size; unsigned int skip_elems; unsigned int bucket; u_int64_t time_now; }; static struct nf_conntrack_tuple_hash *ct_get_next(const struct net *net, struct ct_iter_state *st) { struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; unsigned int i; for (i = st->bucket; i < st->htable_size; i++) { unsigned int skip = 0; restart: hlist_nulls_for_each_entry_rcu(h, n, &st->hash[i], hnnode) { struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); struct hlist_nulls_node *tmp = n; if (!net_eq(net, nf_ct_net(ct))) continue; if (++skip <= st->skip_elems) continue; /* h should be returned, skip to nulls marker. */ while (!is_a_nulls(tmp)) tmp = rcu_dereference(hlist_nulls_next_rcu(tmp)); /* check if h is still linked to hash[i] */ if (get_nulls_value(tmp) != i) { skip = 0; goto restart; } st->skip_elems = skip; st->bucket = i; return h; } skip = 0; if (get_nulls_value(n) != i) goto restart; st->skip_elems = 0; } st->bucket = i; return NULL; } static void *ct_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct ct_iter_state *st = seq->private; struct net *net = seq_file_net(seq); st->time_now = ktime_get_real_ns(); rcu_read_lock(); nf_conntrack_get_ht(&st->hash, &st->htable_size); if (*pos == 0) { st->skip_elems = 0; st->bucket = 0; } else if (st->skip_elems) { /* resume from last dumped entry */ st->skip_elems--; } return ct_get_next(net, st); } static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos) { struct ct_iter_state *st = s->private; struct net *net = seq_file_net(s); (*pos)++; return ct_get_next(net, st); } static void ct_seq_stop(struct seq_file *s, void *v) __releases(RCU) { rcu_read_unlock(); } #ifdef CONFIG_NF_CONNTRACK_SECMARK static void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) { struct lsm_context ctx; int ret; ret = security_secid_to_secctx(ct->secmark, &ctx); if (ret < 0) return; seq_printf(s, "secctx=%s ", ctx.context); security_release_secctx(&ctx); } #else static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) { } #endif #ifdef CONFIG_NF_CONNTRACK_ZONES static void ct_show_zone(struct seq_file *s, const struct nf_conn *ct, int dir) { const struct nf_conntrack_zone *zone = nf_ct_zone(ct); if (zone->dir != dir) return; switch (zone->dir) { case NF_CT_DEFAULT_ZONE_DIR: seq_printf(s, "zone=%u ", zone->id); break; case NF_CT_ZONE_DIR_ORIG: seq_printf(s, "zone-orig=%u ", zone->id); break; case NF_CT_ZONE_DIR_REPL: seq_printf(s, "zone-reply=%u ", zone->id); break; default: break; } } #else static inline void ct_show_zone(struct seq_file *s, const struct nf_conn *ct, int dir) { } #endif #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP static void ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct) { struct ct_iter_state *st = s->private; struct nf_conn_tstamp *tstamp; s64 delta_time; tstamp = nf_conn_tstamp_find(ct); if (tstamp) { delta_time = st->time_now - tstamp->start; if (delta_time > 0) delta_time = div_s64(delta_time, NSEC_PER_SEC); else delta_time = 0; seq_printf(s, "delta-time=%llu ", (unsigned long long)delta_time); } return; } #else static inline void ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct) { } #endif static const char* l3proto_name(u16 proto) { switch (proto) { case AF_INET: return "ipv4"; case AF_INET6: return "ipv6"; } return "unknown"; } static const char* l4proto_name(u16 proto) { switch (proto) { case IPPROTO_ICMP: return "icmp"; case IPPROTO_TCP: return "tcp"; case IPPROTO_UDP: return "udp"; case IPPROTO_GRE: return "gre"; case IPPROTO_SCTP: return "sctp"; case IPPROTO_UDPLITE: return "udplite"; case IPPROTO_ICMPV6: return "icmpv6"; } return "unknown"; } static void seq_print_acct(struct seq_file *s, const struct nf_conn *ct, int dir) { struct nf_conn_acct *acct; struct nf_conn_counter *counter; acct = nf_conn_acct_find(ct); if (!acct) return; counter = acct->counter; seq_printf(s, "packets=%llu bytes=%llu ", (unsigned long long)atomic64_read(&counter[dir].packets), (unsigned long long)atomic64_read(&counter[dir].bytes)); } /* return 0 on success, 1 in case of error */ static int ct_seq_show(struct seq_file *s, void *v) { struct nf_conntrack_tuple_hash *hash = v; struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash); const struct nf_conntrack_l4proto *l4proto; struct net *net = seq_file_net(s); int ret = 0; WARN_ON(!ct); if (unlikely(!refcount_inc_not_zero(&ct->ct_general.use))) return 0; /* load ->status after refcount increase */ smp_acquire__after_ctrl_dep(); if (nf_ct_should_gc(ct)) { struct ct_iter_state *st = s->private; st->skip_elems--; nf_ct_kill(ct); goto release; } /* we only want to print DIR_ORIGINAL */ if (NF_CT_DIRECTION(hash)) goto release; if (!net_eq(nf_ct_net(ct), net)) goto release; l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); ret = -ENOSPC; seq_printf(s, "%-8s %u %-8s %u ", l3proto_name(nf_ct_l3num(ct)), nf_ct_l3num(ct), l4proto_name(l4proto->l4proto), nf_ct_protonum(ct)); if (!test_bit(IPS_OFFLOAD_BIT, &ct->status)) seq_printf(s, "%ld ", nf_ct_expires(ct) / HZ); if (l4proto->print_conntrack) l4proto->print_conntrack(s, ct); print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, l4proto); ct_show_zone(s, ct, NF_CT_ZONE_DIR_ORIG); if (seq_has_overflowed(s)) goto release; seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL); if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status))) seq_puts(s, "[UNREPLIED] "); print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, l4proto); ct_show_zone(s, ct, NF_CT_ZONE_DIR_REPL); seq_print_acct(s, ct, IP_CT_DIR_REPLY); if (test_bit(IPS_HW_OFFLOAD_BIT, &ct->status)) seq_puts(s, "[HW_OFFLOAD] "); else if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) seq_puts(s, "[OFFLOAD] "); else if (test_bit(IPS_ASSURED_BIT, &ct->status)) seq_puts(s, "[ASSURED] "); if (seq_has_overflowed(s)) goto release; #if defined(CONFIG_NF_CONNTRACK_MARK) seq_printf(s, "mark=%u ", READ_ONCE(ct->mark)); #endif ct_show_secctx(s, ct); ct_show_zone(s, ct, NF_CT_DEFAULT_ZONE_DIR); ct_show_delta_time(s, ct); seq_printf(s, "use=%u\n", refcount_read(&ct->ct_general.use)); if (seq_has_overflowed(s)) goto release; ret = 0; release: nf_ct_put(ct); return ret; } static const struct seq_operations ct_seq_ops = { .start = ct_seq_start, .next = ct_seq_next, .stop = ct_seq_stop, .show = ct_seq_show }; static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) { struct net *net = seq_file_net(seq); int cpu; if (*pos == 0) return SEQ_START_TOKEN; for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu + 1; return per_cpu_ptr(net->ct.stat, cpu); } return NULL; } static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct net *net = seq_file_net(seq); int cpu; for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu + 1; return per_cpu_ptr(net->ct.stat, cpu); } (*pos)++; return NULL; } static void ct_cpu_seq_stop(struct seq_file *seq, void *v) { } static int ct_cpu_seq_show(struct seq_file *seq, void *v) { struct net *net = seq_file_net(seq); const struct ip_conntrack_stat *st = v; unsigned int nr_conntracks; if (v == SEQ_START_TOKEN) { seq_puts(seq, "entries clashres found new invalid ignore delete chainlength insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n"); return 0; } nr_conntracks = nf_conntrack_count(net); seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x " "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", nr_conntracks, st->clash_resolve, st->found, 0, st->invalid, 0, 0, st->chaintoolong, st->insert, st->insert_failed, st->drop, st->early_drop, st->error, st->expect_new, st->expect_create, st->expect_delete, st->search_restart ); return 0; } static const struct seq_operations ct_cpu_seq_ops = { .start = ct_cpu_seq_start, .next = ct_cpu_seq_next, .stop = ct_cpu_seq_stop, .show = ct_cpu_seq_show, }; static int nf_conntrack_standalone_init_proc(struct net *net) { struct proc_dir_entry *pde; kuid_t root_uid; kgid_t root_gid; pde = proc_create_net("nf_conntrack", 0440, net->proc_net, &ct_seq_ops, sizeof(struct ct_iter_state)); if (!pde) goto out_nf_conntrack; root_uid = make_kuid(net->user_ns, 0); root_gid = make_kgid(net->user_ns, 0); if (uid_valid(root_uid) && gid_valid(root_gid)) proc_set_user(pde, root_uid, root_gid); pde = proc_create_net("nf_conntrack", 0444, net->proc_net_stat, &ct_cpu_seq_ops, sizeof(struct seq_net_private)); if (!pde) goto out_stat_nf_conntrack; return 0; out_stat_nf_conntrack: remove_proc_entry("nf_conntrack", net->proc_net); out_nf_conntrack: return -ENOMEM; } static void nf_conntrack_standalone_fini_proc(struct net *net) { remove_proc_entry("nf_conntrack", net->proc_net_stat); remove_proc_entry("nf_conntrack", net->proc_net); } #else static int nf_conntrack_standalone_init_proc(struct net *net) { return 0; } static void nf_conntrack_standalone_fini_proc(struct net *net) { } #endif /* CONFIG_NF_CONNTRACK_PROCFS */ u32 nf_conntrack_count(const struct net *net) { const struct nf_conntrack_net *cnet = nf_ct_pernet(net); return atomic_read(&cnet->count); } EXPORT_SYMBOL_GPL(nf_conntrack_count); /* Sysctl support */ #ifdef CONFIG_SYSCTL /* size the user *wants to set */ static unsigned int nf_conntrack_htable_size_user __read_mostly; static int nf_conntrack_hash_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; /* module_param hashsize could have changed value */ nf_conntrack_htable_size_user = nf_conntrack_htable_size; ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret < 0 || !write) return ret; /* update ret, we might not be able to satisfy request */ ret = nf_conntrack_hash_resize(nf_conntrack_htable_size_user); /* update it to the actual value used by conntrack */ nf_conntrack_htable_size_user = nf_conntrack_htable_size; return ret; } static int nf_conntrack_log_invalid_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret, i; ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); if (ret < 0 || !write) return ret; if (*(u8 *)table->data == 0) return 0; /* Load nf_log_syslog only if no logger is currently registered */ for (i = 0; i < NFPROTO_NUMPROTO; i++) { if (nf_log_is_registered(i)) return 0; } request_module("%s", "nf_log_syslog"); return 0; } static struct ctl_table_header *nf_ct_netfilter_header; enum nf_ct_sysctl_index { NF_SYSCTL_CT_MAX, NF_SYSCTL_CT_COUNT, NF_SYSCTL_CT_BUCKETS, NF_SYSCTL_CT_CHECKSUM, NF_SYSCTL_CT_LOG_INVALID, NF_SYSCTL_CT_EXPECT_MAX, NF_SYSCTL_CT_ACCT, #ifdef CONFIG_NF_CONNTRACK_EVENTS NF_SYSCTL_CT_EVENTS, #endif #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP NF_SYSCTL_CT_TIMESTAMP, #endif NF_SYSCTL_CT_PROTO_TIMEOUT_GENERIC, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_SYN_SENT, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_SYN_RECV, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_ESTABLISHED, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_FIN_WAIT, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_CLOSE_WAIT, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_LAST_ACK, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_TIME_WAIT, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_CLOSE, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_RETRANS, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_UNACK, #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD, #endif NF_SYSCTL_CT_PROTO_TCP_LOOSE, NF_SYSCTL_CT_PROTO_TCP_LIBERAL, NF_SYSCTL_CT_PROTO_TCP_IGNORE_INVALID_RST, NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS, NF_SYSCTL_CT_PROTO_TIMEOUT_UDP, NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM, #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD, #endif NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP, NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6, #ifdef CONFIG_NF_CT_PROTO_SCTP NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_CLOSED, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_COOKIE_WAIT, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_COOKIE_ECHOED, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_ESTABLISHED, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_SENT, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_RECD, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_SENT, #endif #ifdef CONFIG_NF_CT_PROTO_GRE NF_SYSCTL_CT_PROTO_TIMEOUT_GRE, NF_SYSCTL_CT_PROTO_TIMEOUT_GRE_STREAM, #endif NF_SYSCTL_CT_LAST_SYSCTL, }; static struct ctl_table nf_ct_sysctl_table[] = { [NF_SYSCTL_CT_MAX] = { .procname = "nf_conntrack_max", .data = &nf_conntrack_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_INT_MAX, }, [NF_SYSCTL_CT_COUNT] = { .procname = "nf_conntrack_count", .maxlen = sizeof(int), .mode = 0444, .proc_handler = proc_dointvec, }, [NF_SYSCTL_CT_BUCKETS] = { .procname = "nf_conntrack_buckets", .data = &nf_conntrack_htable_size_user, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = nf_conntrack_hash_sysctl, }, [NF_SYSCTL_CT_CHECKSUM] = { .procname = "nf_conntrack_checksum", .data = &init_net.ct.sysctl_checksum, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, [NF_SYSCTL_CT_LOG_INVALID] = { .procname = "nf_conntrack_log_invalid", .data = &init_net.ct.sysctl_log_invalid, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = nf_conntrack_log_invalid_sysctl, }, [NF_SYSCTL_CT_EXPECT_MAX] = { .procname = "nf_conntrack_expect_max", .data = &nf_ct_expect_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_INT_MAX, }, [NF_SYSCTL_CT_ACCT] = { .procname = "nf_conntrack_acct", .data = &init_net.ct.sysctl_acct, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #ifdef CONFIG_NF_CONNTRACK_EVENTS [NF_SYSCTL_CT_EVENTS] = { .procname = "nf_conntrack_events", .data = &init_net.ct.sysctl_events, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, #endif #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP [NF_SYSCTL_CT_TIMESTAMP] = { .procname = "nf_conntrack_timestamp", .data = &init_net.ct.sysctl_tstamp, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #endif [NF_SYSCTL_CT_PROTO_TIMEOUT_GENERIC] = { .procname = "nf_conntrack_generic_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_SYN_SENT] = { .procname = "nf_conntrack_tcp_timeout_syn_sent", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_SYN_RECV] = { .procname = "nf_conntrack_tcp_timeout_syn_recv", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_ESTABLISHED] = { .procname = "nf_conntrack_tcp_timeout_established", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_FIN_WAIT] = { .procname = "nf_conntrack_tcp_timeout_fin_wait", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_CLOSE_WAIT] = { .procname = "nf_conntrack_tcp_timeout_close_wait", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_LAST_ACK] = { .procname = "nf_conntrack_tcp_timeout_last_ack", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_TIME_WAIT] = { .procname = "nf_conntrack_tcp_timeout_time_wait", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_CLOSE] = { .procname = "nf_conntrack_tcp_timeout_close", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_RETRANS] = { .procname = "nf_conntrack_tcp_timeout_max_retrans", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_UNACK] = { .procname = "nf_conntrack_tcp_timeout_unacknowledged", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD] = { .procname = "nf_flowtable_tcp_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, #endif [NF_SYSCTL_CT_PROTO_TCP_LOOSE] = { .procname = "nf_conntrack_tcp_loose", .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, [NF_SYSCTL_CT_PROTO_TCP_LIBERAL] = { .procname = "nf_conntrack_tcp_be_liberal", .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, [NF_SYSCTL_CT_PROTO_TCP_IGNORE_INVALID_RST] = { .procname = "nf_conntrack_tcp_ignore_invalid_rst", .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, [NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS] = { .procname = "nf_conntrack_tcp_max_retrans", .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_UDP] = { .procname = "nf_conntrack_udp_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM] = { .procname = "nf_conntrack_udp_timeout_stream", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) [NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD] = { .procname = "nf_flowtable_udp_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, #endif [NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP] = { .procname = "nf_conntrack_icmp_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6] = { .procname = "nf_conntrack_icmpv6_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, #ifdef CONFIG_NF_CT_PROTO_SCTP [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_CLOSED] = { .procname = "nf_conntrack_sctp_timeout_closed", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_COOKIE_WAIT] = { .procname = "nf_conntrack_sctp_timeout_cookie_wait", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_COOKIE_ECHOED] = { .procname = "nf_conntrack_sctp_timeout_cookie_echoed", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_ESTABLISHED] = { .procname = "nf_conntrack_sctp_timeout_established", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_SENT] = { .procname = "nf_conntrack_sctp_timeout_shutdown_sent", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_RECD] = { .procname = "nf_conntrack_sctp_timeout_shutdown_recd", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT] = { .procname = "nf_conntrack_sctp_timeout_shutdown_ack_sent", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_SENT] = { .procname = "nf_conntrack_sctp_timeout_heartbeat_sent", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, #endif #ifdef CONFIG_NF_CT_PROTO_GRE [NF_SYSCTL_CT_PROTO_TIMEOUT_GRE] = { .procname = "nf_conntrack_gre_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_GRE_STREAM] = { .procname = "nf_conntrack_gre_timeout_stream", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, #endif }; static struct ctl_table nf_ct_netfilter_table[] = { { .procname = "nf_conntrack_max", .data = &nf_conntrack_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_INT_MAX, }, }; static void nf_conntrack_standalone_init_tcp_sysctl(struct net *net, struct ctl_table *table) { struct nf_tcp_net *tn = nf_tcp_pernet(net); #define XASSIGN(XNAME, tn) \ table[NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_ ## XNAME].data = \ &(tn)->timeouts[TCP_CONNTRACK_ ## XNAME] XASSIGN(SYN_SENT, tn); XASSIGN(SYN_RECV, tn); XASSIGN(ESTABLISHED, tn); XASSIGN(FIN_WAIT, tn); XASSIGN(CLOSE_WAIT, tn); XASSIGN(LAST_ACK, tn); XASSIGN(TIME_WAIT, tn); XASSIGN(CLOSE, tn); XASSIGN(RETRANS, tn); XASSIGN(UNACK, tn); #undef XASSIGN #define XASSIGN(XNAME, rval) \ table[NF_SYSCTL_CT_PROTO_TCP_ ## XNAME].data = (rval) XASSIGN(LOOSE, &tn->tcp_loose); XASSIGN(LIBERAL, &tn->tcp_be_liberal); XASSIGN(MAX_RETRANS, &tn->tcp_max_retrans); XASSIGN(IGNORE_INVALID_RST, &tn->tcp_ignore_invalid_rst); #undef XASSIGN #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) table[NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD].data = &tn->offload_timeout; #endif } static void nf_conntrack_standalone_init_sctp_sysctl(struct net *net, struct ctl_table *table) { #ifdef CONFIG_NF_CT_PROTO_SCTP struct nf_sctp_net *sn = nf_sctp_pernet(net); #define XASSIGN(XNAME, sn) \ table[NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_ ## XNAME].data = \ &(sn)->timeouts[SCTP_CONNTRACK_ ## XNAME] XASSIGN(CLOSED, sn); XASSIGN(COOKIE_WAIT, sn); XASSIGN(COOKIE_ECHOED, sn); XASSIGN(ESTABLISHED, sn); XASSIGN(SHUTDOWN_SENT, sn); XASSIGN(SHUTDOWN_RECD, sn); XASSIGN(SHUTDOWN_ACK_SENT, sn); XASSIGN(HEARTBEAT_SENT, sn); #undef XASSIGN #endif } static void nf_conntrack_standalone_init_gre_sysctl(struct net *net, struct ctl_table *table) { #ifdef CONFIG_NF_CT_PROTO_GRE struct nf_gre_net *gn = nf_gre_pernet(net); table[NF_SYSCTL_CT_PROTO_TIMEOUT_GRE].data = &gn->timeouts[GRE_CT_UNREPLIED]; table[NF_SYSCTL_CT_PROTO_TIMEOUT_GRE_STREAM].data = &gn->timeouts[GRE_CT_REPLIED]; #endif } static int nf_conntrack_standalone_init_sysctl(struct net *net) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct nf_udp_net *un = nf_udp_pernet(net); struct ctl_table *table; BUILD_BUG_ON(ARRAY_SIZE(nf_ct_sysctl_table) != NF_SYSCTL_CT_LAST_SYSCTL); table = kmemdup(nf_ct_sysctl_table, sizeof(nf_ct_sysctl_table), GFP_KERNEL); if (!table) return -ENOMEM; table[NF_SYSCTL_CT_COUNT].data = &cnet->count; table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum; table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid; table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct; #ifdef CONFIG_NF_CONNTRACK_EVENTS table[NF_SYSCTL_CT_EVENTS].data = &net->ct.sysctl_events; #endif #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP table[NF_SYSCTL_CT_TIMESTAMP].data = &net->ct.sysctl_tstamp; #endif table[NF_SYSCTL_CT_PROTO_TIMEOUT_GENERIC].data = &nf_generic_pernet(net)->timeout; table[NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP].data = &nf_icmp_pernet(net)->timeout; table[NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6].data = &nf_icmpv6_pernet(net)->timeout; table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP].data = &un->timeouts[UDP_CT_UNREPLIED]; table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM].data = &un->timeouts[UDP_CT_REPLIED]; #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD].data = &un->offload_timeout; #endif nf_conntrack_standalone_init_tcp_sysctl(net, table); nf_conntrack_standalone_init_sctp_sysctl(net, table); nf_conntrack_standalone_init_gre_sysctl(net, table); /* Don't allow non-init_net ns to alter global sysctls */ if (!net_eq(&init_net, net)) { table[NF_SYSCTL_CT_MAX].mode = 0444; table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444; table[NF_SYSCTL_CT_BUCKETS].mode = 0444; } cnet->sysctl_header = register_net_sysctl_sz(net, "net/netfilter", table, ARRAY_SIZE(nf_ct_sysctl_table)); if (!cnet->sysctl_header) goto out_unregister_netfilter; return 0; out_unregister_netfilter: kfree(table); return -ENOMEM; } static void nf_conntrack_standalone_fini_sysctl(struct net *net) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); const struct ctl_table *table; table = cnet->sysctl_header->ctl_table_arg; unregister_net_sysctl_table(cnet->sysctl_header); kfree(table); } #else static int nf_conntrack_standalone_init_sysctl(struct net *net) { return 0; } static void nf_conntrack_standalone_fini_sysctl(struct net *net) { } #endif /* CONFIG_SYSCTL */ static void nf_conntrack_fini_net(struct net *net) { if (enable_hooks) nf_ct_netns_put(net, NFPROTO_INET); nf_conntrack_standalone_fini_proc(net); nf_conntrack_standalone_fini_sysctl(net); } static int nf_conntrack_pernet_init(struct net *net) { int ret; net->ct.sysctl_checksum = 1; ret = nf_conntrack_standalone_init_sysctl(net); if (ret < 0) return ret; ret = nf_conntrack_standalone_init_proc(net); if (ret < 0) goto out_proc; ret = nf_conntrack_init_net(net); if (ret < 0) goto out_init_net; if (enable_hooks) { ret = nf_ct_netns_get(net, NFPROTO_INET); if (ret < 0) goto out_hooks; } return 0; out_hooks: nf_conntrack_cleanup_net(net); out_init_net: nf_conntrack_standalone_fini_proc(net); out_proc: nf_conntrack_standalone_fini_sysctl(net); return ret; } static void nf_conntrack_pernet_exit(struct list_head *net_exit_list) { struct net *net; list_for_each_entry(net, net_exit_list, exit_list) nf_conntrack_fini_net(net); nf_conntrack_cleanup_net_list(net_exit_list); } static struct pernet_operations nf_conntrack_net_ops = { .init = nf_conntrack_pernet_init, .exit_batch = nf_conntrack_pernet_exit, .id = &nf_conntrack_net_id, .size = sizeof(struct nf_conntrack_net), }; static int __init nf_conntrack_standalone_init(void) { int ret = nf_conntrack_init_start(); if (ret < 0) goto out_start; BUILD_BUG_ON(NFCT_INFOMASK <= IP_CT_NUMBER); #ifdef CONFIG_SYSCTL nf_ct_netfilter_header = register_net_sysctl(&init_net, "net", nf_ct_netfilter_table); if (!nf_ct_netfilter_header) { pr_err("nf_conntrack: can't register to sysctl.\n"); ret = -ENOMEM; goto out_sysctl; } nf_conntrack_htable_size_user = nf_conntrack_htable_size; #endif nf_conntrack_init_end(); ret = register_pernet_subsys(&nf_conntrack_net_ops); if (ret < 0) goto out_pernet; return 0; out_pernet: #ifdef CONFIG_SYSCTL unregister_net_sysctl_table(nf_ct_netfilter_header); out_sysctl: #endif nf_conntrack_cleanup_end(); out_start: return ret; } static void __exit nf_conntrack_standalone_fini(void) { nf_conntrack_cleanup_start(); unregister_pernet_subsys(&nf_conntrack_net_ops); #ifdef CONFIG_SYSCTL unregister_net_sysctl_table(nf_ct_netfilter_header); #endif nf_conntrack_cleanup_end(); } module_init(nf_conntrack_standalone_init); module_exit(nf_conntrack_standalone_fini);
31 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NF_CONNTRACK_SYNPROXY_H #define _NF_CONNTRACK_SYNPROXY_H #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netns/generic.h> struct nf_conn_synproxy { u32 isn; u32 its; u32 tsoff; }; static inline struct nf_conn_synproxy *nfct_synproxy(const struct nf_conn *ct) { #if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY) return nf_ct_ext_find(ct, NF_CT_EXT_SYNPROXY); #else return NULL; #endif } static inline struct nf_conn_synproxy *nfct_synproxy_ext_add(struct nf_conn *ct) { #if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY) return nf_ct_ext_add(ct, NF_CT_EXT_SYNPROXY, GFP_ATOMIC); #else return NULL; #endif } static inline bool nf_ct_add_synproxy(struct nf_conn *ct, const struct nf_conn *tmpl) { #if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY) if (tmpl && nfct_synproxy(tmpl)) { if (!nfct_seqadj_ext_add(ct)) return false; if (!nfct_synproxy_ext_add(ct)) return false; } #endif return true; } #endif /* _NF_CONNTRACK_SYNPROXY_H */
104 100 4 5 100 7 132 132 131 132 4 4 8 8 8 8 8 8 18 18 18 18 18 18 15 15 15 15 50 15 1 14 3 1 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 // SPDX-License-Identifier: GPL-2.0-only #include <linux/module.h> #include <linux/errno.h> #include <linux/socket.h> #include <linux/kernel.h> #include <net/dst_metadata.h> #include <net/flow.h> #include <net/udp.h> #include <net/udp_tunnel.h> #include <net/inet_dscp.h> int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, struct socket **sockp) { int err; struct socket *sock = NULL; struct sockaddr_in udp_addr; err = sock_create_kern(net, AF_INET, SOCK_DGRAM, 0, &sock); if (err < 0) goto error; if (cfg->bind_ifindex) { err = sock_bindtoindex(sock->sk, cfg->bind_ifindex, true); if (err < 0) goto error; } udp_addr.sin_family = AF_INET; udp_addr.sin_addr = cfg->local_ip; udp_addr.sin_port = cfg->local_udp_port; err = kernel_bind(sock, (struct sockaddr_unsized *)&udp_addr, sizeof(udp_addr)); if (err < 0) goto error; if (cfg->peer_udp_port) { udp_addr.sin_family = AF_INET; udp_addr.sin_addr = cfg->peer_ip; udp_addr.sin_port = cfg->peer_udp_port; err = kernel_connect(sock, (struct sockaddr_unsized *)&udp_addr, sizeof(udp_addr), 0); if (err < 0) goto error; } sock->sk->sk_no_check_tx = !cfg->use_udp_checksums; *sockp = sock; return 0; error: if (sock) { kernel_sock_shutdown(sock, SHUT_RDWR); sock_release(sock); } *sockp = NULL; return err; } EXPORT_SYMBOL(udp_sock_create4); static bool sk_saddr_any(struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) return ipv6_addr_any(&sk->sk_v6_rcv_saddr); #else return !sk->sk_rcv_saddr; #endif } void setup_udp_tunnel_sock(struct net *net, struct socket *sock, struct udp_tunnel_sock_cfg *cfg) { struct sock *sk = sock->sk; /* Disable multicast loopback */ inet_clear_bit(MC_LOOP, sk); /* Enable CHECKSUM_UNNECESSARY to CHECKSUM_COMPLETE conversion */ inet_inc_convert_csum(sk); rcu_assign_sk_user_data(sk, cfg->sk_user_data); udp_sk(sk)->encap_type = cfg->encap_type; udp_sk(sk)->encap_rcv = cfg->encap_rcv; udp_sk(sk)->encap_err_rcv = cfg->encap_err_rcv; udp_sk(sk)->encap_err_lookup = cfg->encap_err_lookup; udp_sk(sk)->encap_destroy = cfg->encap_destroy; udp_sk(sk)->gro_receive = cfg->gro_receive; udp_sk(sk)->gro_complete = cfg->gro_complete; udp_tunnel_encap_enable(sk); udp_tunnel_update_gro_rcv(sk, true); if (!sk->sk_dport && !sk->sk_bound_dev_if && sk_saddr_any(sk) && sk->sk_kern_sock) udp_tunnel_update_gro_lookup(net, sk, true); } EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock, unsigned short type) { struct sock *sk = sock->sk; struct udp_tunnel_info ti; ti.type = type; ti.sa_family = sk->sk_family; ti.port = inet_sk(sk)->inet_sport; udp_tunnel_nic_add_port(dev, &ti); } EXPORT_SYMBOL_GPL(udp_tunnel_push_rx_port); void udp_tunnel_drop_rx_port(struct net_device *dev, struct socket *sock, unsigned short type) { struct sock *sk = sock->sk; struct udp_tunnel_info ti; ti.type = type; ti.sa_family = sk->sk_family; ti.port = inet_sk(sk)->inet_sport; udp_tunnel_nic_del_port(dev, &ti); } EXPORT_SYMBOL_GPL(udp_tunnel_drop_rx_port); /* Notify netdevs that UDP port started listening */ void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type) { struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct udp_tunnel_info ti; struct net_device *dev; ASSERT_RTNL(); ti.type = type; ti.sa_family = sk->sk_family; ti.port = inet_sk(sk)->inet_sport; for_each_netdev(net, dev) { udp_tunnel_nic_lock(dev); udp_tunnel_nic_add_port(dev, &ti); udp_tunnel_nic_unlock(dev); } } EXPORT_SYMBOL_GPL(udp_tunnel_notify_add_rx_port); /* Notify netdevs that UDP port is no more listening */ void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type) { struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct udp_tunnel_info ti; struct net_device *dev; ASSERT_RTNL(); ti.type = type; ti.sa_family = sk->sk_family; ti.port = inet_sk(sk)->inet_sport; for_each_netdev(net, dev) { udp_tunnel_nic_lock(dev); udp_tunnel_nic_del_port(dev, &ti); udp_tunnel_nic_unlock(dev); } } EXPORT_SYMBOL_GPL(udp_tunnel_notify_del_rx_port); void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, bool xnet, bool nocheck, u16 ipcb_flags) { struct udphdr *uh; __skb_push(skb, sizeof(*uh)); skb_reset_transport_header(skb); uh = udp_hdr(skb); uh->dest = dst_port; uh->source = src_port; uh->len = htons(skb->len); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); udp_set_csum(nocheck, skb, src, dst, skb->len); iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet, ipcb_flags); } EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb); void udp_tunnel_sock_release(struct socket *sock) { rcu_assign_sk_user_data(sock->sk, NULL); synchronize_rcu(); kernel_sock_shutdown(sock, SHUT_RDWR); sock_release(sock); } EXPORT_SYMBOL_GPL(udp_tunnel_sock_release); struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family, const unsigned long *flags, __be64 tunnel_id, int md_size) { struct metadata_dst *tun_dst; struct ip_tunnel_info *info; if (family == AF_INET) tun_dst = ip_tun_rx_dst(skb, flags, tunnel_id, md_size); else tun_dst = ipv6_tun_rx_dst(skb, flags, tunnel_id, md_size); if (!tun_dst) return NULL; info = &tun_dst->u.tun_info; info->key.tp_src = udp_hdr(skb)->source; info->key.tp_dst = udp_hdr(skb)->dest; if (udp_hdr(skb)->check) __set_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags); return tun_dst; } EXPORT_SYMBOL_GPL(udp_tun_rx_dst); struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb, struct net_device *dev, struct net *net, int oif, __be32 *saddr, const struct ip_tunnel_key *key, __be16 sport, __be16 dport, u8 tos, struct dst_cache *dst_cache) { struct rtable *rt = NULL; struct flowi4 fl4; #ifdef CONFIG_DST_CACHE if (dst_cache) { rt = dst_cache_get_ip4(dst_cache, saddr); if (rt) return rt; } #endif memset(&fl4, 0, sizeof(fl4)); fl4.flowi4_mark = skb->mark; fl4.flowi4_proto = IPPROTO_UDP; fl4.flowi4_oif = oif; fl4.daddr = key->u.ipv4.dst; fl4.saddr = key->u.ipv4.src; fl4.fl4_dport = dport; fl4.fl4_sport = sport; fl4.flowi4_dscp = inet_dsfield_to_dscp(tos); fl4.flowi4_flags = key->flow_flags; rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) { netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr); return ERR_PTR(-ENETUNREACH); } if (rt->dst.dev == dev) { /* is this necessary? */ netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr); ip_rt_put(rt); return ERR_PTR(-ELOOP); } #ifdef CONFIG_DST_CACHE if (dst_cache) dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr); #endif *saddr = fl4.saddr; return rt; } EXPORT_SYMBOL_GPL(udp_tunnel_dst_lookup); MODULE_DESCRIPTION("IPv4 Foo over UDP tunnel driver"); MODULE_LICENSE("GPL");
781 779 781 780 776 776 598 596 547 597 596 253 597 597 597 597 598 21 18 21 1 21 253 252 253 21 21 12 18 13 8 12 13 12 12 12 10 12 16 16 16 21 18 18 18 10 12 7 12 18 18 18 18 17 13 18 4 4 3 1 4 9 9 19 9 8 1 9 598 713 774 597 596 505 597 596 598 596 505 597 772 773 670 776 12 12 12 12 590 501 590 500 16 501 14 499 591 598 596 596 591 593 596 597 108 96 107 107 107 547 108 598 549 596 598 597 597 598 598 596 252 253 763 762 253 253 252 253 251 107 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 // SPDX-License-Identifier: GPL-2.0 /* * /proc/sys support */ #include <linux/init.h> #include <linux/sysctl.h> #include <linux/poll.h> #include <linux/proc_fs.h> #include <linux/printk.h> #include <linux/security.h> #include <linux/sched.h> #include <linux/cred.h> #include <linux/namei.h> #include <linux/mm.h> #include <linux/uio.h> #include <linux/module.h> #include <linux/bpf-cgroup.h> #include <linux/mount.h> #include <linux/kmemleak.h> #include <linux/lockdep.h> #include "internal.h" #define list_for_each_table_entry(entry, header) \ entry = header->ctl_table; \ for (size_t i = 0 ; i < header->ctl_table_size; ++i, entry++) static const struct dentry_operations proc_sys_dentry_operations; static const struct file_operations proc_sys_file_operations; static const struct inode_operations proc_sys_inode_operations; static const struct file_operations proc_sys_dir_file_operations; static const struct inode_operations proc_sys_dir_operations; /* * Support for permanently empty directories. * Must be non-empty to avoid sharing an address with other tables. */ static const struct ctl_table sysctl_mount_point[] = { { } }; /** * register_sysctl_mount_point() - registers a sysctl mount point * @path: path for the mount point * * Used to create a permanently empty directory to serve as mount point. * There are some subtle but important permission checks this allows in the * case of unprivileged mounts. */ struct ctl_table_header *register_sysctl_mount_point(const char *path) { return register_sysctl_sz(path, sysctl_mount_point, 0); } EXPORT_SYMBOL(register_sysctl_mount_point); #define sysctl_is_perm_empty_ctl_header(hptr) \ (hptr->type == SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY) #define sysctl_set_perm_empty_ctl_header(hptr) \ (hptr->type = SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY) #define sysctl_clear_perm_empty_ctl_header(hptr) \ (hptr->type = SYSCTL_TABLE_TYPE_DEFAULT) void proc_sys_poll_notify(struct ctl_table_poll *poll) { if (!poll) return; atomic_inc(&poll->event); wake_up_interruptible(&poll->wait); } static const struct ctl_table root_table[] = { { .procname = "", .mode = S_IFDIR|S_IRUGO|S_IXUGO, }, }; static struct ctl_table_root sysctl_table_root = { .default_set.dir.header = { {{.count = 1, .nreg = 1, .ctl_table = root_table }}, .ctl_table_arg = root_table, .root = &sysctl_table_root, .set = &sysctl_table_root.default_set, }, }; static DEFINE_SPINLOCK(sysctl_lock); static void drop_sysctl_table(struct ctl_table_header *header); static int sysctl_follow_link(struct ctl_table_header **phead, const struct ctl_table **pentry); static int insert_links(struct ctl_table_header *head); static void put_links(struct ctl_table_header *header); static void sysctl_print_dir(struct ctl_dir *dir) { if (dir->header.parent) sysctl_print_dir(dir->header.parent); pr_cont("%s/", dir->header.ctl_table[0].procname); } static int namecmp(const char *name1, int len1, const char *name2, int len2) { int cmp; cmp = memcmp(name1, name2, min(len1, len2)); if (cmp == 0) cmp = len1 - len2; return cmp; } static const struct ctl_table *find_entry(struct ctl_table_header **phead, struct ctl_dir *dir, const char *name, int namelen) { struct ctl_table_header *head; const struct ctl_table *entry; struct rb_node *node = dir->root.rb_node; lockdep_assert_held(&sysctl_lock); while (node) { struct ctl_node *ctl_node; const char *procname; int cmp; ctl_node = rb_entry(node, struct ctl_node, node); head = ctl_node->header; entry = &head->ctl_table[ctl_node - head->node]; procname = entry->procname; cmp = namecmp(name, namelen, procname, strlen(procname)); if (cmp < 0) node = node->rb_left; else if (cmp > 0) node = node->rb_right; else { *phead = head; return entry; } } return NULL; } static int insert_entry(struct ctl_table_header *head, const struct ctl_table *entry) { struct rb_node *node = &head->node[entry - head->ctl_table].node; struct rb_node **p = &head->parent->root.rb_node; struct rb_node *parent = NULL; const char *name = entry->procname; int namelen = strlen(name); while (*p) { struct ctl_table_header *parent_head; const struct ctl_table *parent_entry; struct ctl_node *parent_node; const char *parent_name; int cmp; parent = *p; parent_node = rb_entry(parent, struct ctl_node, node); parent_head = parent_node->header; parent_entry = &parent_head->ctl_table[parent_node - parent_head->node]; parent_name = parent_entry->procname; cmp = namecmp(name, namelen, parent_name, strlen(parent_name)); if (cmp < 0) p = &(*p)->rb_left; else if (cmp > 0) p = &(*p)->rb_right; else { pr_err("sysctl duplicate entry: "); sysctl_print_dir(head->parent); pr_cont("%s\n", entry->procname); return -EEXIST; } } rb_link_node(node, parent, p); rb_insert_color(node, &head->parent->root); return 0; } static void erase_entry(struct ctl_table_header *head, const struct ctl_table *entry) { struct rb_node *node = &head->node[entry - head->ctl_table].node; rb_erase(node, &head->parent->root); } static void init_header(struct ctl_table_header *head, struct ctl_table_root *root, struct ctl_table_set *set, struct ctl_node *node, const struct ctl_table *table, size_t table_size) { head->ctl_table = table; head->ctl_table_size = table_size; head->ctl_table_arg = table; head->used = 0; head->count = 1; head->nreg = 1; head->unregistering = NULL; head->root = root; head->set = set; head->parent = NULL; head->node = node; INIT_HLIST_HEAD(&head->inodes); if (node) { const struct ctl_table *entry; list_for_each_table_entry(entry, head) { node->header = head; node++; } } if (table == sysctl_mount_point) sysctl_set_perm_empty_ctl_header(head); } static void erase_header(struct ctl_table_header *head) { const struct ctl_table *entry; list_for_each_table_entry(entry, head) erase_entry(head, entry); } static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header) { const struct ctl_table *entry; struct ctl_table_header *dir_h = &dir->header; int err; /* Is this a permanently empty directory? */ if (sysctl_is_perm_empty_ctl_header(dir_h)) return -EROFS; /* Am I creating a permanently empty directory? */ if (sysctl_is_perm_empty_ctl_header(header)) { if (!RB_EMPTY_ROOT(&dir->root)) return -EINVAL; sysctl_set_perm_empty_ctl_header(dir_h); } dir_h->nreg++; header->parent = dir; err = insert_links(header); if (err) goto fail_links; list_for_each_table_entry(entry, header) { err = insert_entry(header, entry); if (err) goto fail; } return 0; fail: erase_header(header); put_links(header); fail_links: if (header->ctl_table == sysctl_mount_point) sysctl_clear_perm_empty_ctl_header(dir_h); header->parent = NULL; drop_sysctl_table(dir_h); return err; } static int use_table(struct ctl_table_header *p) { lockdep_assert_held(&sysctl_lock); if (unlikely(p->unregistering)) return 0; p->used++; return 1; } static void unuse_table(struct ctl_table_header *p) { lockdep_assert_held(&sysctl_lock); if (!--p->used) if (unlikely(p->unregistering)) complete(p->unregistering); } static void proc_sys_invalidate_dcache(struct ctl_table_header *head) { proc_invalidate_siblings_dcache(&head->inodes, &sysctl_lock); } static void start_unregistering(struct ctl_table_header *p) { /* will reacquire if has to wait */ lockdep_assert_held(&sysctl_lock); /* * if p->used is 0, nobody will ever touch that entry again; * we'll eliminate all paths to it before dropping sysctl_lock */ if (unlikely(p->used)) { struct completion wait; init_completion(&wait); p->unregistering = &wait; spin_unlock(&sysctl_lock); wait_for_completion(&wait); } else { /* anything non-NULL; we'll never dereference it */ p->unregistering = ERR_PTR(-EINVAL); spin_unlock(&sysctl_lock); } /* * Invalidate dentries for unregistered sysctls: namespaced sysctls * can have duplicate names and contaminate dcache very badly. */ proc_sys_invalidate_dcache(p); /* * do not remove from the list until nobody holds it; walking the * list in do_sysctl() relies on that. */ spin_lock(&sysctl_lock); erase_header(p); } static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) { BUG_ON(!head); spin_lock(&sysctl_lock); if (!use_table(head)) head = ERR_PTR(-ENOENT); spin_unlock(&sysctl_lock); return head; } static void sysctl_head_finish(struct ctl_table_header *head) { if (!head) return; spin_lock(&sysctl_lock); unuse_table(head); spin_unlock(&sysctl_lock); } static struct ctl_table_set * lookup_header_set(struct ctl_table_root *root) { struct ctl_table_set *set = &root->default_set; if (root->lookup) set = root->lookup(root); return set; } static const struct ctl_table *lookup_entry(struct ctl_table_header **phead, struct ctl_dir *dir, const char *name, int namelen) { struct ctl_table_header *head; const struct ctl_table *entry; spin_lock(&sysctl_lock); entry = find_entry(&head, dir, name, namelen); if (entry && use_table(head)) *phead = head; else entry = NULL; spin_unlock(&sysctl_lock); return entry; } static struct ctl_node *first_usable_entry(struct rb_node *node) { struct ctl_node *ctl_node; for (;node; node = rb_next(node)) { ctl_node = rb_entry(node, struct ctl_node, node); if (use_table(ctl_node->header)) return ctl_node; } return NULL; } static void first_entry(struct ctl_dir *dir, struct ctl_table_header **phead, const struct ctl_table **pentry) { struct ctl_table_header *head = NULL; const struct ctl_table *entry = NULL; struct ctl_node *ctl_node; spin_lock(&sysctl_lock); ctl_node = first_usable_entry(rb_first(&dir->root)); spin_unlock(&sysctl_lock); if (ctl_node) { head = ctl_node->header; entry = &head->ctl_table[ctl_node - head->node]; } *phead = head; *pentry = entry; } static void next_entry(struct ctl_table_header **phead, const struct ctl_table **pentry) { struct ctl_table_header *head = *phead; const struct ctl_table *entry = *pentry; struct ctl_node *ctl_node = &head->node[entry - head->ctl_table]; spin_lock(&sysctl_lock); unuse_table(head); ctl_node = first_usable_entry(rb_next(&ctl_node->node)); spin_unlock(&sysctl_lock); head = NULL; if (ctl_node) { head = ctl_node->header; entry = &head->ctl_table[ctl_node - head->node]; } *phead = head; *pentry = entry; } /* * sysctl_perm does NOT grant the superuser all rights automatically, because * some sysctl variables are readonly even to root. */ static int test_perm(int mode, int op) { if (uid_eq(current_euid(), GLOBAL_ROOT_UID)) mode >>= 6; else if (in_egroup_p(GLOBAL_ROOT_GID)) mode >>= 3; if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0) return 0; return -EACCES; } static int sysctl_perm(struct ctl_table_header *head, const struct ctl_table *table, int op) { struct ctl_table_root *root = head->root; int mode; if (root->permissions) mode = root->permissions(head, table); else mode = table->mode; return test_perm(mode, op); } static struct inode *proc_sys_make_inode(struct super_block *sb, struct ctl_table_header *head, const struct ctl_table *table) { struct ctl_table_root *root = head->root; struct inode *inode; struct proc_inode *ei; inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); inode->i_ino = get_next_ino(); ei = PROC_I(inode); spin_lock(&sysctl_lock); if (unlikely(head->unregistering)) { spin_unlock(&sysctl_lock); iput(inode); return ERR_PTR(-ENOENT); } ei->sysctl = head; ei->sysctl_entry = table; hlist_add_head_rcu(&ei->sibling_inodes, &head->inodes); head->count++; spin_unlock(&sysctl_lock); simple_inode_init_ts(inode); inode->i_mode = table->mode; if (!S_ISDIR(table->mode)) { inode->i_mode |= S_IFREG; inode->i_op = &proc_sys_inode_operations; inode->i_fop = &proc_sys_file_operations; } else { inode->i_mode |= S_IFDIR; inode->i_op = &proc_sys_dir_operations; inode->i_fop = &proc_sys_dir_file_operations; if (sysctl_is_perm_empty_ctl_header(head)) make_empty_dir_inode(inode); } inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; if (root->set_ownership) root->set_ownership(head, &inode->i_uid, &inode->i_gid); return inode; } void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head) { spin_lock(&sysctl_lock); hlist_del_init_rcu(&PROC_I(inode)->sibling_inodes); if (!--head->count) kfree_rcu(head, rcu); spin_unlock(&sysctl_lock); } static struct ctl_table_header *grab_header(struct inode *inode) { struct ctl_table_header *head = PROC_I(inode)->sysctl; if (!head) head = &sysctl_table_root.default_set.dir.header; return sysctl_head_grab(head); } static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct ctl_table_header *head = grab_header(dir); struct ctl_table_header *h = NULL; const struct qstr *name = &dentry->d_name; const struct ctl_table *p; struct inode *inode; struct dentry *err = ERR_PTR(-ENOENT); struct ctl_dir *ctl_dir; int ret; if (IS_ERR(head)) return ERR_CAST(head); ctl_dir = container_of(head, struct ctl_dir, header); p = lookup_entry(&h, ctl_dir, name->name, name->len); if (!p) goto out; if (S_ISLNK(p->mode)) { ret = sysctl_follow_link(&h, &p); err = ERR_PTR(ret); if (ret) goto out; } inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p); err = d_splice_alias_ops(inode, dentry, &proc_sys_dentry_operations); out: if (h) sysctl_head_finish(h); sysctl_head_finish(head); return err; } static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter, int write) { struct inode *inode = file_inode(iocb->ki_filp); struct ctl_table_header *head = grab_header(inode); const struct ctl_table *table = PROC_I(inode)->sysctl_entry; size_t count = iov_iter_count(iter); char *kbuf; ssize_t error; if (IS_ERR(head)) return PTR_ERR(head); /* * At this point we know that the sysctl was not unregistered * and won't be until we finish. */ error = -EPERM; if (sysctl_perm(head, table, write ? MAY_WRITE : MAY_READ)) goto out; /* if that can happen at all, it should be -EINVAL, not -EISDIR */ error = -EINVAL; if (!table->proc_handler) goto out; /* don't even try if the size is too large */ error = -ENOMEM; if (count >= KMALLOC_MAX_SIZE) goto out; kbuf = kvzalloc(count + 1, GFP_KERNEL); if (!kbuf) goto out; if (write) { error = -EFAULT; if (!copy_from_iter_full(kbuf, count, iter)) goto out_free_buf; kbuf[count] = '\0'; } error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, &kbuf, &count, &iocb->ki_pos); if (error) goto out_free_buf; /* careful: calling conventions are nasty here */ error = table->proc_handler(table, write, kbuf, &count, &iocb->ki_pos); if (error) goto out_free_buf; if (!write) { error = -EFAULT; if (copy_to_iter(kbuf, count, iter) < count) goto out_free_buf; } error = count; out_free_buf: kvfree(kbuf); out: sysctl_head_finish(head); return error; } static ssize_t proc_sys_read(struct kiocb *iocb, struct iov_iter *iter) { return proc_sys_call_handler(iocb, iter, 0); } static ssize_t proc_sys_write(struct kiocb *iocb, struct iov_iter *iter) { return proc_sys_call_handler(iocb, iter, 1); } static int proc_sys_open(struct inode *inode, struct file *filp) { struct ctl_table_header *head = grab_header(inode); const struct ctl_table *table = PROC_I(inode)->sysctl_entry; /* sysctl was unregistered */ if (IS_ERR(head)) return PTR_ERR(head); if (table->poll) filp->private_data = proc_sys_poll_event(table->poll); sysctl_head_finish(head); return 0; } static __poll_t proc_sys_poll(struct file *filp, poll_table *wait) { struct inode *inode = file_inode(filp); struct ctl_table_header *head = grab_header(inode); const struct ctl_table *table = PROC_I(inode)->sysctl_entry; __poll_t ret = DEFAULT_POLLMASK; unsigned long event; /* sysctl was unregistered */ if (IS_ERR(head)) return EPOLLERR | EPOLLHUP; if (!table->proc_handler) goto out; if (!table->poll) goto out; event = (unsigned long)filp->private_data; poll_wait(filp, &table->poll->wait, wait); if (event != atomic_read(&table->poll->event)) { filp->private_data = proc_sys_poll_event(table->poll); ret = EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI; } out: sysctl_head_finish(head); return ret; } static bool proc_sys_fill_cache(struct file *file, struct dir_context *ctx, struct ctl_table_header *head, const struct ctl_table *table) { struct dentry *child, *dir = file->f_path.dentry; struct inode *inode; struct qstr qname; ino_t ino = 0; unsigned type = DT_UNKNOWN; qname.name = table->procname; qname.len = strlen(table->procname); qname.hash = full_name_hash(dir, qname.name, qname.len); child = d_lookup(dir, &qname); if (!child) { DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); child = d_alloc_parallel(dir, &qname, &wq); if (IS_ERR(child)) return false; if (d_in_lookup(child)) { struct dentry *res; inode = proc_sys_make_inode(dir->d_sb, head, table); res = d_splice_alias_ops(inode, child, &proc_sys_dentry_operations); d_lookup_done(child); if (unlikely(res)) { dput(child); if (IS_ERR(res)) return false; child = res; } } } inode = d_inode(child); ino = inode->i_ino; type = inode->i_mode >> 12; dput(child); return dir_emit(ctx, qname.name, qname.len, ino, type); } static bool proc_sys_link_fill_cache(struct file *file, struct dir_context *ctx, struct ctl_table_header *head, const struct ctl_table *table) { bool ret = true; head = sysctl_head_grab(head); if (IS_ERR(head)) return false; /* It is not an error if we can not follow the link ignore it */ if (sysctl_follow_link(&head, &table)) goto out; ret = proc_sys_fill_cache(file, ctx, head, table); out: sysctl_head_finish(head); return ret; } static int scan(struct ctl_table_header *head, const struct ctl_table *table, unsigned long *pos, struct file *file, struct dir_context *ctx) { bool res; if ((*pos)++ < ctx->pos) return true; if (unlikely(S_ISLNK(table->mode))) res = proc_sys_link_fill_cache(file, ctx, head, table); else res = proc_sys_fill_cache(file, ctx, head, table); if (res) ctx->pos = *pos; return res; } static int proc_sys_readdir(struct file *file, struct dir_context *ctx) { struct ctl_table_header *head = grab_header(file_inode(file)); struct ctl_table_header *h = NULL; const struct ctl_table *entry; struct ctl_dir *ctl_dir; unsigned long pos; if (IS_ERR(head)) return PTR_ERR(head); ctl_dir = container_of(head, struct ctl_dir, header); if (!dir_emit_dots(file, ctx)) goto out; pos = 2; for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) { if (!scan(h, entry, &pos, file, ctx)) { sysctl_head_finish(h); break; } } out: sysctl_head_finish(head); return 0; } static int proc_sys_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { /* * sysctl entries that are not writeable, * are _NOT_ writeable, capabilities or not. */ struct ctl_table_header *head; const struct ctl_table *table; int error; /* Executable files are not allowed under /proc/sys/ */ if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) return -EACCES; head = grab_header(inode); if (IS_ERR(head)) return PTR_ERR(head); table = PROC_I(inode)->sysctl_entry; if (!table) /* global root - r-xr-xr-x */ error = mask & MAY_WRITE ? -EACCES : 0; else /* Use the permissions on the sysctl table entry */ error = sysctl_perm(head, table, mask & ~MAY_NOT_BLOCK); sysctl_head_finish(head); return error; } static int proc_sys_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error; if (attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) return -EPERM; error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; setattr_copy(&nop_mnt_idmap, inode, attr); return 0; } static int proc_sys_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct ctl_table_header *head = grab_header(inode); const struct ctl_table *table = PROC_I(inode)->sysctl_entry; if (IS_ERR(head)) return PTR_ERR(head); generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (table) stat->mode = (stat->mode & S_IFMT) | table->mode; sysctl_head_finish(head); return 0; } static const struct file_operations proc_sys_file_operations = { .open = proc_sys_open, .poll = proc_sys_poll, .read_iter = proc_sys_read, .write_iter = proc_sys_write, .splice_read = copy_splice_read, .splice_write = iter_file_splice_write, .llseek = default_llseek, }; static const struct file_operations proc_sys_dir_file_operations = { .read = generic_read_dir, .iterate_shared = proc_sys_readdir, .llseek = generic_file_llseek, }; static const struct inode_operations proc_sys_inode_operations = { .permission = proc_sys_permission, .setattr = proc_sys_setattr, .getattr = proc_sys_getattr, }; static const struct inode_operations proc_sys_dir_operations = { .lookup = proc_sys_lookup, .permission = proc_sys_permission, .setattr = proc_sys_setattr, .getattr = proc_sys_getattr, }; static int proc_sys_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { if (flags & LOOKUP_RCU) return -ECHILD; return !PROC_I(d_inode(dentry))->sysctl->unregistering; } static int proc_sys_delete(const struct dentry *dentry) { return !!PROC_I(d_inode(dentry))->sysctl->unregistering; } static int sysctl_is_seen(struct ctl_table_header *p) { struct ctl_table_set *set = p->set; int res; spin_lock(&sysctl_lock); if (p->unregistering) res = 0; else if (!set->is_seen) res = 1; else res = set->is_seen(set); spin_unlock(&sysctl_lock); return res; } static int proc_sys_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { struct ctl_table_header *head; struct inode *inode; if (name->len != len) return 1; if (memcmp(name->name, str, len)) return 1; // false positive is fine here - we'll recheck anyway if (d_in_lookup(dentry)) return 0; inode = d_inode_rcu(dentry); // we just might have run into dentry in the middle of __dentry_kill() if (!inode) return 1; head = READ_ONCE(PROC_I(inode)->sysctl); return !head || !sysctl_is_seen(head); } static const struct dentry_operations proc_sys_dentry_operations = { .d_revalidate = proc_sys_revalidate, .d_delete = proc_sys_delete, .d_compare = proc_sys_compare, }; static struct ctl_dir *find_subdir(struct ctl_dir *dir, const char *name, int namelen) { struct ctl_table_header *head; const struct ctl_table *entry; entry = find_entry(&head, dir, name, namelen); if (!entry) return ERR_PTR(-ENOENT); if (!S_ISDIR(entry->mode)) return ERR_PTR(-ENOTDIR); return container_of(head, struct ctl_dir, header); } static struct ctl_dir *new_dir(struct ctl_table_set *set, const char *name, int namelen) { struct ctl_table *table; struct ctl_dir *new; struct ctl_node *node; char *new_name; new = kzalloc(sizeof(*new) + sizeof(struct ctl_node) + sizeof(struct ctl_table) + namelen + 1, GFP_KERNEL); if (!new) return NULL; node = (struct ctl_node *)(new + 1); table = (struct ctl_table *)(node + 1); new_name = (char *)(table + 1); memcpy(new_name, name, namelen); table[0].procname = new_name; table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO; init_header(&new->header, set->dir.header.root, set, node, table, 1); return new; } /** * get_subdir - find or create a subdir with the specified name. * @dir: Directory to create the subdirectory in * @name: The name of the subdirectory to find or create * @namelen: The length of name * * Takes a directory with an elevated reference count so we know that * if we drop the lock the directory will not go away. Upon success * the reference is moved from @dir to the returned subdirectory. * Upon error an error code is returned and the reference on @dir is * simply dropped. */ static struct ctl_dir *get_subdir(struct ctl_dir *dir, const char *name, int namelen) { struct ctl_table_set *set = dir->header.set; struct ctl_dir *subdir, *new = NULL; int err; spin_lock(&sysctl_lock); subdir = find_subdir(dir, name, namelen); if (!IS_ERR(subdir)) goto found; if (PTR_ERR(subdir) != -ENOENT) goto failed; spin_unlock(&sysctl_lock); new = new_dir(set, name, namelen); spin_lock(&sysctl_lock); subdir = ERR_PTR(-ENOMEM); if (!new) goto failed; /* Was the subdir added while we dropped the lock? */ subdir = find_subdir(dir, name, namelen); if (!IS_ERR(subdir)) goto found; if (PTR_ERR(subdir) != -ENOENT) goto failed; /* Nope. Use the our freshly made directory entry. */ err = insert_header(dir, &new->header); subdir = ERR_PTR(err); if (err) goto failed; subdir = new; found: subdir->header.nreg++; failed: if (IS_ERR(subdir)) { pr_err("sysctl could not get directory: "); sysctl_print_dir(dir); pr_cont("%*.*s %ld\n", namelen, namelen, name, PTR_ERR(subdir)); } drop_sysctl_table(&dir->header); if (new) drop_sysctl_table(&new->header); spin_unlock(&sysctl_lock); return subdir; } static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir) { struct ctl_dir *parent; const char *procname; if (!dir->header.parent) return &set->dir; parent = xlate_dir(set, dir->header.parent); if (IS_ERR(parent)) return parent; procname = dir->header.ctl_table[0].procname; return find_subdir(parent, procname, strlen(procname)); } static int sysctl_follow_link(struct ctl_table_header **phead, const struct ctl_table **pentry) { struct ctl_table_header *head; const struct ctl_table *entry; struct ctl_table_root *root; struct ctl_table_set *set; struct ctl_dir *dir; int ret; spin_lock(&sysctl_lock); root = (*pentry)->data; set = lookup_header_set(root); dir = xlate_dir(set, (*phead)->parent); if (IS_ERR(dir)) ret = PTR_ERR(dir); else { const char *procname = (*pentry)->procname; head = NULL; entry = find_entry(&head, dir, procname, strlen(procname)); ret = -ENOENT; if (entry && use_table(head)) { unuse_table(*phead); *phead = head; *pentry = entry; ret = 0; } } spin_unlock(&sysctl_lock); return ret; } static int sysctl_err(const char *path, const struct ctl_table *table, char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; pr_err("sysctl table check failed: %s/%s %pV\n", path, table->procname, &vaf); va_end(args); return -EINVAL; } static int sysctl_check_table_array(const char *path, const struct ctl_table *table) { unsigned int extra; int err = 0; if ((table->proc_handler == proc_douintvec) || (table->proc_handler == proc_douintvec_minmax)) { if (table->maxlen != sizeof(unsigned int)) err |= sysctl_err(path, table, "array not allowed"); } if (table->proc_handler == proc_dou8vec_minmax) { if (table->maxlen != sizeof(u8)) err |= sysctl_err(path, table, "array not allowed"); if (table->extra1) { extra = *(unsigned int *) table->extra1; if (extra > 255U) err |= sysctl_err(path, table, "range value too large for proc_dou8vec_minmax"); } if (table->extra2) { extra = *(unsigned int *) table->extra2; if (extra > 255U) err |= sysctl_err(path, table, "range value too large for proc_dou8vec_minmax"); } } if (table->proc_handler == proc_dobool) { if (table->maxlen != sizeof(bool)) err |= sysctl_err(path, table, "array not allowed"); } return err; } static int sysctl_check_table(const char *path, struct ctl_table_header *header) { const struct ctl_table *entry; int err = 0; list_for_each_table_entry(entry, header) { if (!entry->procname) err |= sysctl_err(path, entry, "procname is null"); if ((entry->proc_handler == proc_dostring) || (entry->proc_handler == proc_dobool) || (entry->proc_handler == proc_dointvec) || (entry->proc_handler == proc_douintvec) || (entry->proc_handler == proc_douintvec_minmax) || (entry->proc_handler == proc_dointvec_minmax) || (entry->proc_handler == proc_dou8vec_minmax) || (entry->proc_handler == proc_dointvec_jiffies) || (entry->proc_handler == proc_dointvec_userhz_jiffies) || (entry->proc_handler == proc_dointvec_ms_jiffies) || (entry->proc_handler == proc_doulongvec_minmax) || (entry->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { if (!entry->data) err |= sysctl_err(path, entry, "No data"); if (!entry->maxlen) err |= sysctl_err(path, entry, "No maxlen"); else err |= sysctl_check_table_array(path, entry); } if (!entry->proc_handler) err |= sysctl_err(path, entry, "No proc_handler"); if ((entry->mode & (S_IRUGO|S_IWUGO)) != entry->mode) err |= sysctl_err(path, entry, "bogus .mode 0%o", entry->mode); } return err; } static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table_header *head) { struct ctl_table *link_table, *link; struct ctl_table_header *links; const struct ctl_table *entry; struct ctl_node *node; char *link_name; int name_bytes; name_bytes = 0; list_for_each_table_entry(entry, head) { name_bytes += strlen(entry->procname) + 1; } links = kzalloc(sizeof(struct ctl_table_header) + sizeof(struct ctl_node)*head->ctl_table_size + sizeof(struct ctl_table)*head->ctl_table_size + name_bytes, GFP_KERNEL); if (!links) return NULL; node = (struct ctl_node *)(links + 1); link_table = (struct ctl_table *)(node + head->ctl_table_size); link_name = (char *)(link_table + head->ctl_table_size); link = link_table; list_for_each_table_entry(entry, head) { int len = strlen(entry->procname) + 1; memcpy(link_name, entry->procname, len); link->procname = link_name; link->mode = S_IFLNK|S_IRWXUGO; link->data = head->root; link_name += len; link++; } init_header(links, dir->header.root, dir->header.set, node, link_table, head->ctl_table_size); links->nreg = head->ctl_table_size; return links; } static bool get_links(struct ctl_dir *dir, struct ctl_table_header *header, struct ctl_table_root *link_root) { struct ctl_table_header *tmp_head; const struct ctl_table *entry, *link; if (header->ctl_table_size == 0 || sysctl_is_perm_empty_ctl_header(header)) return true; /* Are there links available for every entry in table? */ list_for_each_table_entry(entry, header) { const char *procname = entry->procname; link = find_entry(&tmp_head, dir, procname, strlen(procname)); if (!link) return false; if (S_ISDIR(link->mode) && S_ISDIR(entry->mode)) continue; if (S_ISLNK(link->mode) && (link->data == link_root)) continue; return false; } /* The checks passed. Increase the registration count on the links */ list_for_each_table_entry(entry, header) { const char *procname = entry->procname; link = find_entry(&tmp_head, dir, procname, strlen(procname)); tmp_head->nreg++; } return true; } static int insert_links(struct ctl_table_header *head) { struct ctl_table_set *root_set = &sysctl_table_root.default_set; struct ctl_dir *core_parent; struct ctl_table_header *links; int err; if (head->set == root_set) return 0; core_parent = xlate_dir(root_set, head->parent); if (IS_ERR(core_parent)) return 0; if (get_links(core_parent, head, head->root)) return 0; core_parent->header.nreg++; spin_unlock(&sysctl_lock); links = new_links(core_parent, head); spin_lock(&sysctl_lock); err = -ENOMEM; if (!links) goto out; err = 0; if (get_links(core_parent, head, head->root)) { kfree(links); goto out; } err = insert_header(core_parent, links); if (err) kfree(links); out: drop_sysctl_table(&core_parent->header); return err; } /* Find the directory for the ctl_table. If one is not found create it. */ static struct ctl_dir *sysctl_mkdir_p(struct ctl_dir *dir, const char *path) { const char *name, *nextname; for (name = path; name; name = nextname) { int namelen; nextname = strchr(name, '/'); if (nextname) { namelen = nextname - name; nextname++; } else { namelen = strlen(name); } if (namelen == 0) continue; /* * namelen ensures if name is "foo/bar/yay" only foo is * registered first. We traverse as if using mkdir -p and * return a ctl_dir for the last directory entry. */ dir = get_subdir(dir, name, namelen); if (IS_ERR(dir)) break; } return dir; } /** * __register_sysctl_table - register a leaf sysctl table * @set: Sysctl tree to register on * @path: The path to the directory the sysctl table is in. * * @table: the top-level table structure. This table should not be free'd * after registration. So it should not be used on stack. It can either * be a global or dynamically allocated by the caller and free'd later * after sysctl unregistration. * @table_size : The number of elements in table * * Register a sysctl table hierarchy. @table should be a filled in ctl_table * array. * * The members of the &struct ctl_table structure are used as follows: * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not * enter a sysctl file * data - a pointer to data for use by proc_handler * maxlen - the maximum size in bytes of the data * mode - the file permissions for the /proc/sys file * type - Defines the target type (described in struct definition) * proc_handler - the text handler routine (described below) * * extra1, extra2 - extra pointers usable by the proc handler routines * XXX: we should eventually modify these to use long min / max [0] * [0] https://lkml.kernel.org/87zgpte9o4.fsf@email.froward.int.ebiederm.org * * Leaf nodes in the sysctl tree will be represented by a single file * under /proc; non-leaf nodes are not allowed. * * There must be a proc_handler routine for any terminal nodes. * Several default handlers are available to cover common cases - * * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(), * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(), * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax() * * It is the handler's job to read the input buffer from user memory * and process it. The handler should return 0 on success. * * This routine returns %NULL on a failure to register, and a pointer * to the table header on success. */ struct ctl_table_header *__register_sysctl_table( struct ctl_table_set *set, const char *path, const struct ctl_table *table, size_t table_size) { struct ctl_table_root *root = set->dir.header.root; struct ctl_table_header *header; struct ctl_dir *dir; struct ctl_node *node; header = kzalloc(sizeof(struct ctl_table_header) + sizeof(struct ctl_node)*table_size, GFP_KERNEL_ACCOUNT); if (!header) return NULL; node = (struct ctl_node *)(header + 1); init_header(header, root, set, node, table, table_size); if (sysctl_check_table(path, header)) goto fail; spin_lock(&sysctl_lock); dir = &set->dir; /* Reference moved down the directory tree get_subdir */ dir->header.nreg++; spin_unlock(&sysctl_lock); dir = sysctl_mkdir_p(dir, path); if (IS_ERR(dir)) goto fail; spin_lock(&sysctl_lock); if (insert_header(dir, header)) goto fail_put_dir_locked; drop_sysctl_table(&dir->header); spin_unlock(&sysctl_lock); return header; fail_put_dir_locked: drop_sysctl_table(&dir->header); spin_unlock(&sysctl_lock); fail: kfree(header); return NULL; } /** * register_sysctl_sz - register a sysctl table * @path: The path to the directory the sysctl table is in. If the path * doesn't exist we will create it for you. * @table: the table structure. The calller must ensure the life of the @table * will be kept during the lifetime use of the syctl. It must not be freed * until unregister_sysctl_table() is called with the given returned table * with this registration. If your code is non modular then you don't need * to call unregister_sysctl_table() and can instead use something like * register_sysctl_init() which does not care for the result of the syctl * registration. * @table_size: The number of elements in table. * * Register a sysctl table. @table should be a filled in ctl_table * array. A completely 0 filled entry terminates the table. * * See __register_sysctl_table for more details. */ struct ctl_table_header *register_sysctl_sz(const char *path, const struct ctl_table *table, size_t table_size) { return __register_sysctl_table(&sysctl_table_root.default_set, path, table, table_size); } EXPORT_SYMBOL(register_sysctl_sz); /** * __register_sysctl_init() - register sysctl table to path * @path: path name for sysctl base. If that path doesn't exist we will create * it for you. * @table: This is the sysctl table that needs to be registered to the path. * The caller must ensure the life of the @table will be kept during the * lifetime use of the sysctl. * @table_name: The name of sysctl table, only used for log printing when * registration fails * @table_size: The number of elements in table * * The sysctl interface is used by userspace to query or modify at runtime * a predefined value set on a variable. These variables however have default * values pre-set. Code which depends on these variables will always work even * if register_sysctl() fails. If register_sysctl() fails you'd just loose the * ability to query or modify the sysctls dynamically at run time. Chances of * register_sysctl() failing on init are extremely low, and so for both reasons * this function does not return any error as it is used by initialization code. * * Context: if your base directory does not exist it will be created for you. */ void __init __register_sysctl_init(const char *path, const struct ctl_table *table, const char *table_name, size_t table_size) { struct ctl_table_header *hdr = register_sysctl_sz(path, table, table_size); if (unlikely(!hdr)) { pr_err("failed when register_sysctl_sz %s to %s\n", table_name, path); return; } kmemleak_not_leak(hdr); } static void put_links(struct ctl_table_header *header) { struct ctl_table_set *root_set = &sysctl_table_root.default_set; struct ctl_table_root *root = header->root; struct ctl_dir *parent = header->parent; struct ctl_dir *core_parent; const struct ctl_table *entry; if (header->set == root_set) return; core_parent = xlate_dir(root_set, parent); if (IS_ERR(core_parent)) return; list_for_each_table_entry(entry, header) { struct ctl_table_header *link_head; const struct ctl_table *link; const char *name = entry->procname; link = find_entry(&link_head, core_parent, name, strlen(name)); if (link && ((S_ISDIR(link->mode) && S_ISDIR(entry->mode)) || (S_ISLNK(link->mode) && (link->data == root)))) { drop_sysctl_table(link_head); } else { pr_err("sysctl link missing during unregister: "); sysctl_print_dir(parent); pr_cont("%s\n", name); } } } static void drop_sysctl_table(struct ctl_table_header *header) { struct ctl_dir *parent = header->parent; if (--header->nreg) return; if (parent) { put_links(header); start_unregistering(header); } if (!--header->count) kfree_rcu(header, rcu); if (parent) drop_sysctl_table(&parent->header); } /** * unregister_sysctl_table - unregister a sysctl table hierarchy * @header: the header returned from register_sysctl or __register_sysctl_table * * Unregisters the sysctl table and all children. proc entries may not * actually be removed until they are no longer used by anyone. */ void unregister_sysctl_table(struct ctl_table_header * header) { might_sleep(); if (header == NULL) return; spin_lock(&sysctl_lock); drop_sysctl_table(header); spin_unlock(&sysctl_lock); } EXPORT_SYMBOL(unregister_sysctl_table); void setup_sysctl_set(struct ctl_table_set *set, struct ctl_table_root *root, int (*is_seen)(struct ctl_table_set *)) { memset(set, 0, sizeof(*set)); set->is_seen = is_seen; init_header(&set->dir.header, root, set, NULL, root_table, 1); } void retire_sysctl_set(struct ctl_table_set *set) { WARN_ON(!RB_EMPTY_ROOT(&set->dir.root)); } int __init proc_sys_init(void) { struct proc_dir_entry *proc_sys_root; proc_sys_root = proc_mkdir("sys", NULL); proc_sys_root->proc_iops = &proc_sys_dir_operations; proc_sys_root->proc_dir_ops = &proc_sys_dir_file_operations; proc_sys_root->nlink = 0; return sysctl_init_bases(); } struct sysctl_alias { const char *kernel_param; const char *sysctl_param; }; /* * Historically some settings had both sysctl and a command line parameter. * With the generic sysctl. parameter support, we can handle them at a single * place and only keep the historical name for compatibility. This is not meant * to add brand new aliases. When adding existing aliases, consider whether * the possibly different moment of changing the value (e.g. from early_param * to the moment do_sysctl_args() is called) is an issue for the specific * parameter. */ static const struct sysctl_alias sysctl_aliases[] = { {"hardlockup_all_cpu_backtrace", "kernel.hardlockup_all_cpu_backtrace" }, {"hung_task_panic", "kernel.hung_task_panic" }, {"numa_zonelist_order", "vm.numa_zonelist_order" }, {"softlockup_all_cpu_backtrace", "kernel.softlockup_all_cpu_backtrace" }, { } }; static const char *sysctl_find_alias(char *param) { const struct sysctl_alias *alias; for (alias = &sysctl_aliases[0]; alias->kernel_param != NULL; alias++) { if (strcmp(alias->kernel_param, param) == 0) return alias->sysctl_param; } return NULL; } bool sysctl_is_alias(char *param) { const char *alias = sysctl_find_alias(param); return alias != NULL; } /* Set sysctl value passed on kernel command line. */ static int process_sysctl_arg(char *param, char *val, const char *unused, void *arg) { char *path; struct vfsmount **proc_mnt = arg; struct file_system_type *proc_fs_type; struct file *file; int len; int err; loff_t pos = 0; ssize_t wret; if (strncmp(param, "sysctl", sizeof("sysctl") - 1) == 0) { param += sizeof("sysctl") - 1; if (param[0] != '/' && param[0] != '.') return 0; param++; } else { param = (char *) sysctl_find_alias(param); if (!param) return 0; } if (!val) return -EINVAL; len = strlen(val); if (len == 0) return -EINVAL; /* * To set sysctl options, we use a temporary mount of proc, look up the * respective sys/ file and write to it. To avoid mounting it when no * options were given, we mount it only when the first sysctl option is * found. Why not a persistent mount? There are problems with a * persistent mount of proc in that it forces userspace not to use any * proc mount options. */ if (!*proc_mnt) { proc_fs_type = get_fs_type("proc"); if (!proc_fs_type) { pr_err("Failed to find procfs to set sysctl from command line\n"); return 0; } *proc_mnt = kern_mount(proc_fs_type); put_filesystem(proc_fs_type); if (IS_ERR(*proc_mnt)) { pr_err("Failed to mount procfs to set sysctl from command line\n"); return 0; } } path = kasprintf(GFP_KERNEL, "sys/%s", param); if (!path) panic("%s: Failed to allocate path for %s\n", __func__, param); strreplace(path, '.', '/'); file = file_open_root_mnt(*proc_mnt, path, O_WRONLY, 0); if (IS_ERR(file)) { err = PTR_ERR(file); if (err == -ENOENT) pr_err("Failed to set sysctl parameter '%s=%s': parameter not found\n", param, val); else if (err == -EACCES) pr_err("Failed to set sysctl parameter '%s=%s': permission denied (read-only?)\n", param, val); else pr_err("Error %pe opening proc file to set sysctl parameter '%s=%s'\n", file, param, val); goto out; } wret = kernel_write(file, val, len, &pos); if (wret < 0) { err = wret; if (err == -EINVAL) pr_err("Failed to set sysctl parameter '%s=%s': invalid value\n", param, val); else pr_err("Error %pe writing to proc file to set sysctl parameter '%s=%s'\n", ERR_PTR(err), param, val); } else if (wret != len) { pr_err("Wrote only %zd bytes of %d writing to proc file %s to set sysctl parameter '%s=%s\n", wret, len, path, param, val); } err = filp_close(file, NULL); if (err) pr_err("Error %pe closing proc file to set sysctl parameter '%s=%s\n", ERR_PTR(err), param, val); out: kfree(path); return 0; } void do_sysctl_args(void) { char *command_line; struct vfsmount *proc_mnt = NULL; command_line = kstrdup(saved_command_line, GFP_KERNEL); if (!command_line) panic("%s: Failed to allocate copy of command line\n", __func__); parse_args("Setting sysctl args", command_line, NULL, 0, -1, -1, &proc_mnt, process_sysctl_arg); if (proc_mnt) kern_unmount(proc_mnt); kfree(command_line); }
4452 13711 817 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM skb #if !defined(_TRACE_SKB_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SKB_H #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/tracepoint.h> #undef FN #define FN(reason) TRACE_DEFINE_ENUM(SKB_DROP_REASON_##reason); DEFINE_DROP_REASON(FN, FN) #undef FN #undef FNe #define FN(reason) { SKB_DROP_REASON_##reason, #reason }, #define FNe(reason) { SKB_DROP_REASON_##reason, #reason } /* * Tracepoint for free an sk_buff: */ TRACE_EVENT(kfree_skb, TP_PROTO(struct sk_buff *skb, void *location, enum skb_drop_reason reason, struct sock *rx_sk), TP_ARGS(skb, location, reason, rx_sk), TP_STRUCT__entry( __field(void *, skbaddr) __field(void *, location) __field(void *, rx_sk) __field(unsigned short, protocol) __field(enum skb_drop_reason, reason) ), TP_fast_assign( __entry->skbaddr = skb; __entry->location = location; __entry->rx_sk = rx_sk; __entry->protocol = ntohs(skb->protocol); __entry->reason = reason; ), TP_printk("skbaddr=%p rx_sk=%p protocol=%u location=%pS reason: %s", __entry->skbaddr, __entry->rx_sk, __entry->protocol, __entry->location, __print_symbolic(__entry->reason, DEFINE_DROP_REASON(FN, FNe))) ); #undef FN #undef FNe TRACE_EVENT(consume_skb, TP_PROTO(struct sk_buff *skb, void *location), TP_ARGS(skb, location), TP_STRUCT__entry( __field( void *, skbaddr) __field( void *, location) ), TP_fast_assign( __entry->skbaddr = skb; __entry->location = location; ), TP_printk("skbaddr=%p location=%pS", __entry->skbaddr, __entry->location) ); TRACE_EVENT(skb_copy_datagram_iovec, TP_PROTO(const struct sk_buff *skb, int len), TP_ARGS(skb, len), TP_STRUCT__entry( __field( const void *, skbaddr ) __field( int, len ) ), TP_fast_assign( __entry->skbaddr = skb; __entry->len = len; ), TP_printk("skbaddr=%p len=%d", __entry->skbaddr, __entry->len) ); #endif /* _TRACE_SKB_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
18 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 // SPDX-License-Identifier: GPL-2.0+ /* * IMA support for appraising module-style appended signatures. * * Copyright (C) 2019 IBM Corporation * * Author: * Thiago Jung Bauermann <bauerman@linux.ibm.com> */ #include <linux/types.h> #include <linux/module_signature.h> #include <keys/asymmetric-type.h> #include <crypto/pkcs7.h> #include "ima.h" struct modsig { struct pkcs7_message *pkcs7_msg; enum hash_algo hash_algo; /* This digest will go in the 'd-modsig' field of the IMA template. */ const u8 *digest; u32 digest_size; /* * This is what will go to the measurement list if the template requires * storing the signature. */ int raw_pkcs7_len; u8 raw_pkcs7[] __counted_by(raw_pkcs7_len); }; /* * ima_read_modsig - Read modsig from buf. * * Return: 0 on success, error code otherwise. */ int ima_read_modsig(enum ima_hooks func, const void *buf, loff_t buf_len, struct modsig **modsig) { const size_t marker_len = strlen(MODULE_SIG_STRING); const struct module_signature *sig; struct modsig *hdr; size_t sig_len; const void *p; int rc; if (buf_len <= marker_len + sizeof(*sig)) return -ENOENT; p = buf + buf_len - marker_len; if (memcmp(p, MODULE_SIG_STRING, marker_len)) return -ENOENT; buf_len -= marker_len; sig = (const struct module_signature *)(p - sizeof(*sig)); rc = mod_check_sig(sig, buf_len, func_tokens[func]); if (rc) return rc; sig_len = be32_to_cpu(sig->sig_len); buf_len -= sig_len + sizeof(*sig); /* Allocate sig_len additional bytes to hold the raw PKCS#7 data. */ hdr = kzalloc_flex(*hdr, raw_pkcs7, sig_len); if (!hdr) return -ENOMEM; hdr->raw_pkcs7_len = sig_len; hdr->pkcs7_msg = pkcs7_parse_message(buf + buf_len, sig_len); if (IS_ERR(hdr->pkcs7_msg)) { rc = PTR_ERR(hdr->pkcs7_msg); kfree(hdr); return rc; } memcpy(hdr->raw_pkcs7, buf + buf_len, sig_len); /* We don't know the hash algorithm yet. */ hdr->hash_algo = HASH_ALGO__LAST; *modsig = hdr; return 0; } /** * ima_collect_modsig - Calculate the file hash without the appended signature. * @modsig: parsed module signature * @buf: data to verify the signature on * @size: data size * * Since the modsig is part of the file contents, the hash used in its signature * isn't the same one ordinarily calculated by IMA. Therefore PKCS7 code * calculates a separate one for signature verification. */ void ima_collect_modsig(struct modsig *modsig, const void *buf, loff_t size) { int rc; /* * Provide the file contents (minus the appended sig) so that the PKCS7 * code can calculate the file hash. */ size -= modsig->raw_pkcs7_len + strlen(MODULE_SIG_STRING) + sizeof(struct module_signature); rc = pkcs7_supply_detached_data(modsig->pkcs7_msg, buf, size); if (rc) return; /* Ask the PKCS7 code to calculate the file hash. */ rc = pkcs7_get_digest(modsig->pkcs7_msg, &modsig->digest, &modsig->digest_size, &modsig->hash_algo); } int ima_modsig_verify(struct key *keyring, const struct modsig *modsig) { return verify_pkcs7_message_sig(NULL, 0, modsig->pkcs7_msg, keyring, VERIFYING_MODULE_SIGNATURE, NULL, NULL); } int ima_get_modsig_digest(const struct modsig *modsig, enum hash_algo *algo, const u8 **digest, u32 *digest_size) { *algo = modsig->hash_algo; *digest = modsig->digest; *digest_size = modsig->digest_size; return 0; } int ima_get_raw_modsig(const struct modsig *modsig, const void **data, u32 *data_len) { *data = &modsig->raw_pkcs7; *data_len = modsig->raw_pkcs7_len; return 0; } void ima_free_modsig(struct modsig *modsig) { if (!modsig) return; pkcs7_free_message(modsig->pkcs7_msg); kfree(modsig); }
4022 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * SHA-256 optimized for x86_64 * * Copyright 2025 Google LLC */ #include <asm/fpu/api.h> #include <linux/static_call.h> static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha_ni); DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_blocks_generic); #define DEFINE_X86_SHA256_FN(c_fn, asm_fn) \ asmlinkage void asm_fn(struct sha256_block_state *state, \ const u8 *data, size_t nblocks); \ static void c_fn(struct sha256_block_state *state, const u8 *data, \ size_t nblocks) \ { \ if (likely(irq_fpu_usable())) { \ kernel_fpu_begin(); \ asm_fn(state, data, nblocks); \ kernel_fpu_end(); \ } else { \ sha256_blocks_generic(state, data, nblocks); \ } \ } DEFINE_X86_SHA256_FN(sha256_blocks_ssse3, sha256_transform_ssse3); DEFINE_X86_SHA256_FN(sha256_blocks_avx, sha256_transform_avx); DEFINE_X86_SHA256_FN(sha256_blocks_avx2, sha256_transform_rorx); DEFINE_X86_SHA256_FN(sha256_blocks_ni, sha256_ni_transform); static void sha256_blocks(struct sha256_block_state *state, const u8 *data, size_t nblocks) { static_call(sha256_blocks_x86)(state, data, nblocks); } static_assert(offsetof(struct __sha256_ctx, state) == 0); static_assert(offsetof(struct __sha256_ctx, bytecount) == 32); static_assert(offsetof(struct __sha256_ctx, buf) == 40); asmlinkage void sha256_ni_finup2x(const struct __sha256_ctx *ctx, const u8 *data1, const u8 *data2, int len, u8 out1[SHA256_DIGEST_SIZE], u8 out2[SHA256_DIGEST_SIZE]); #define sha256_finup_2x_arch sha256_finup_2x_arch static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx, const u8 *data1, const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE], u8 out2[SHA256_DIGEST_SIZE]) { /* * The assembly requires len >= SHA256_BLOCK_SIZE && len <= INT_MAX. * Further limit len to 65536 to avoid spending too long with preemption * disabled. (Of course, in practice len is nearly always 4096 anyway.) */ if (static_branch_likely(&have_sha_ni) && len >= SHA256_BLOCK_SIZE && len <= 65536 && likely(irq_fpu_usable())) { kernel_fpu_begin(); sha256_ni_finup2x(ctx, data1, data2, len, out1, out2); kernel_fpu_end(); kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE); kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE); return true; } return false; } static bool sha256_finup_2x_is_optimized_arch(void) { return static_key_enabled(&have_sha_ni); } #define sha256_mod_init_arch sha256_mod_init_arch static void sha256_mod_init_arch(void) { if (boot_cpu_has(X86_FEATURE_SHA_NI)) { static_call_update(sha256_blocks_x86, sha256_blocks_ni); static_branch_enable(&have_sha_ni); } else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL) && boot_cpu_has(X86_FEATURE_AVX)) { if (boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_BMI2)) static_call_update(sha256_blocks_x86, sha256_blocks_avx2); else static_call_update(sha256_blocks_x86, sha256_blocks_avx); } else if (boot_cpu_has(X86_FEATURE_SSSE3)) { static_call_update(sha256_blocks_x86, sha256_blocks_ssse3); } }
1480 273 35 33 974 122 4 3169 2872 92 3133 122 3169 585 349 349 163 5 127 89 721 752 725 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Red Black Trees (C) 1999 Andrea Arcangeli <andrea@suse.de> linux/include/linux/rbtree.h To use rbtrees you'll have to implement your own insert and search cores. This will avoid us to use callbacks and to drop drammatically performances. I know it's not the cleaner way, but in C (not in C++) to get performances and genericity... See Documentation/core-api/rbtree.rst for documentation and samples. */ #ifndef _LINUX_RBTREE_H #define _LINUX_RBTREE_H #include <linux/container_of.h> #include <linux/rbtree_types.h> #include <linux/stddef.h> #include <linux/rcupdate.h> #define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3)) #define rb_entry(ptr, type, member) container_of(ptr, type, member) #define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL) /* 'empty' nodes are nodes that are known not to be inserted in an rbtree */ #define RB_EMPTY_NODE(node) \ ((node)->__rb_parent_color == (unsigned long)(node)) #define RB_CLEAR_NODE(node) \ ((node)->__rb_parent_color = (unsigned long)(node)) extern void rb_insert_color(struct rb_node *, struct rb_root *); extern void rb_erase(struct rb_node *, struct rb_root *); /* Find logical next and previous nodes in a tree */ extern struct rb_node *rb_next(const struct rb_node *); extern struct rb_node *rb_prev(const struct rb_node *); /* * This function returns the first node (in sort order) of the tree. */ static inline struct rb_node *rb_first(const struct rb_root *root) { struct rb_node *n; n = root->rb_node; if (!n) return NULL; while (n->rb_left) n = n->rb_left; return n; } /* * This function returns the last node (in sort order) of the tree. */ static inline struct rb_node *rb_last(const struct rb_root *root) { struct rb_node *n; n = root->rb_node; if (!n) return NULL; while (n->rb_right) n = n->rb_right; return n; } /* Postorder iteration - always visit the parent after its children */ extern struct rb_node *rb_first_postorder(const struct rb_root *); extern struct rb_node *rb_next_postorder(const struct rb_node *); /* Fast replacement of a single node without remove/rebalance/add/rebalance */ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new, struct rb_root *root); static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) { node->__rb_parent_color = (unsigned long)parent; node->rb_left = node->rb_right = NULL; *rb_link = node; } static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) { node->__rb_parent_color = (unsigned long)parent; node->rb_left = node->rb_right = NULL; rcu_assign_pointer(*rb_link, node); } #define rb_entry_safe(ptr, type, member) \ ({ typeof(ptr) ____ptr = (ptr); \ ____ptr ? rb_entry(____ptr, type, member) : NULL; \ }) /** * rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of * given type allowing the backing memory of @pos to be invalidated * * @pos: the 'type *' to use as a loop cursor. * @n: another 'type *' to use as temporary storage * @root: 'rb_root *' of the rbtree. * @field: the name of the rb_node field within 'type'. * * rbtree_postorder_for_each_entry_safe() provides a similar guarantee as * list_for_each_entry_safe() and allows the iteration to continue independent * of changes to @pos by the body of the loop. * * Note, however, that it cannot handle other modifications that re-order the * rbtree it is iterating over. This includes calling rb_erase() on @pos, as * rb_erase() may rebalance the tree, causing us to miss some nodes. */ #define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \ for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \ pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \ typeof(*pos), field); 1; }); \ pos = n) /* Same as rb_first(), but O(1) */ #define rb_first_cached(root) (root)->rb_leftmost static inline void rb_insert_color_cached(struct rb_node *node, struct rb_root_cached *root, bool leftmost) { if (leftmost) root->rb_leftmost = node; rb_insert_color(node, &root->rb_root); } static inline struct rb_node * rb_erase_cached(struct rb_node *node, struct rb_root_cached *root) { struct rb_node *leftmost = NULL; if (root->rb_leftmost == node) leftmost = root->rb_leftmost = rb_next(node); rb_erase(node, &root->rb_root); return leftmost; } static inline void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new, struct rb_root_cached *root) { if (root->rb_leftmost == victim) root->rb_leftmost = new; rb_replace_node(victim, new, &root->rb_root); } /* * The below helper functions use 2 operators with 3 different * calling conventions. The operators are related like: * * comp(a->key,b) < 0 := less(a,b) * comp(a->key,b) > 0 := less(b,a) * comp(a->key,b) == 0 := !less(a,b) && !less(b,a) * * If these operators define a partial order on the elements we make no * guarantee on which of the elements matching the key is found. See * rb_find(). * * The reason for this is to allow the find() interface without requiring an * on-stack dummy object, which might not be feasible due to object size. */ /** * rb_add_cached() - insert @node into the leftmost cached tree @tree * @node: node to insert * @tree: leftmost cached tree to insert @node into * @less: operator defining the (partial) node order * * Returns @node when it is the new leftmost, or NULL. */ static __always_inline struct rb_node * rb_add_cached(struct rb_node *node, struct rb_root_cached *tree, bool (*less)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_root.rb_node; struct rb_node *parent = NULL; bool leftmost = true; while (*link) { parent = *link; if (less(node, parent)) { link = &parent->rb_left; } else { link = &parent->rb_right; leftmost = false; } } rb_link_node(node, parent, link); rb_insert_color_cached(node, tree, leftmost); return leftmost ? node : NULL; } /** * rb_add() - insert @node into @tree * @node: node to insert * @tree: tree to insert @node into * @less: operator defining the (partial) node order */ static __always_inline void rb_add(struct rb_node *node, struct rb_root *tree, bool (*less)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_node; struct rb_node *parent = NULL; while (*link) { parent = *link; if (less(node, parent)) link = &parent->rb_left; else link = &parent->rb_right; } rb_link_node(node, parent, link); rb_insert_color(node, tree); } /** * rb_find_add_cached() - find equivalent @node in @tree, or add @node * @node: node to look-for / insert * @tree: tree to search / modify * @cmp: operator defining the node order * * Returns the rb_node matching @node, or NULL when no match is found and @node * is inserted. */ static __always_inline struct rb_node * rb_find_add_cached(struct rb_node *node, struct rb_root_cached *tree, int (*cmp)(const struct rb_node *new, const struct rb_node *exist)) { bool leftmost = true; struct rb_node **link = &tree->rb_root.rb_node; struct rb_node *parent = NULL; int c; while (*link) { parent = *link; c = cmp(node, parent); if (c < 0) { link = &parent->rb_left; } else if (c > 0) { link = &parent->rb_right; leftmost = false; } else { return parent; } } rb_link_node(node, parent, link); rb_insert_color_cached(node, tree, leftmost); return NULL; } /** * rb_find_add() - find equivalent @node in @tree, or add @node * @node: node to look-for / insert * @tree: tree to search / modify * @cmp: operator defining the node order * * Returns the rb_node matching @node, or NULL when no match is found and @node * is inserted. */ static __always_inline struct rb_node * rb_find_add(struct rb_node *node, struct rb_root *tree, int (*cmp)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_node; struct rb_node *parent = NULL; int c; while (*link) { parent = *link; c = cmp(node, parent); if (c < 0) link = &parent->rb_left; else if (c > 0) link = &parent->rb_right; else return parent; } rb_link_node(node, parent, link); rb_insert_color(node, tree); return NULL; } /** * rb_find_add_rcu() - find equivalent @node in @tree, or add @node * @node: node to look-for / insert * @tree: tree to search / modify * @cmp: operator defining the node order * * Adds a Store-Release for link_node. * * Returns the rb_node matching @node, or NULL when no match is found and @node * is inserted. */ static __always_inline struct rb_node * rb_find_add_rcu(struct rb_node *node, struct rb_root *tree, int (*cmp)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_node; struct rb_node *parent = NULL; int c; while (*link) { parent = *link; c = cmp(node, parent); if (c < 0) link = &parent->rb_left; else if (c > 0) link = &parent->rb_right; else return parent; } rb_link_node_rcu(node, parent, link); rb_insert_color(node, tree); return NULL; } /** * rb_find() - find @key in tree @tree * @key: key to match * @tree: tree to search * @cmp: operator defining the node order * * Returns the rb_node matching @key or NULL. */ static __always_inline struct rb_node * rb_find(const void *key, const struct rb_root *tree, int (*cmp)(const void *key, const struct rb_node *)) { struct rb_node *node = tree->rb_node; while (node) { int c = cmp(key, node); if (c < 0) node = node->rb_left; else if (c > 0) node = node->rb_right; else return node; } return NULL; } /** * rb_find_rcu() - find @key in tree @tree * @key: key to match * @tree: tree to search * @cmp: operator defining the node order * * Notably, tree descent vs concurrent tree rotations is unsound and can result * in false-negatives. * * Returns the rb_node matching @key or NULL. */ static __always_inline struct rb_node * rb_find_rcu(const void *key, const struct rb_root *tree, int (*cmp)(const void *key, const struct rb_node *)) { struct rb_node *node = tree->rb_node; while (node) { int c = cmp(key, node); if (c < 0) node = rcu_dereference_raw(node->rb_left); else if (c > 0) node = rcu_dereference_raw(node->rb_right); else return node; } return NULL; } /** * rb_find_first() - find the first @key in @tree * @key: key to match * @tree: tree to search * @cmp: operator defining node order * * Returns the leftmost node matching @key, or NULL. */ static __always_inline struct rb_node * rb_find_first(const void *key, const struct rb_root *tree, int (*cmp)(const void *key, const struct rb_node *)) { struct rb_node *node = tree->rb_node; struct rb_node *match = NULL; while (node) { int c = cmp(key, node); if (c <= 0) { if (!c) match = node; node = node->rb_left; } else if (c > 0) { node = node->rb_right; } } return match; } /** * rb_next_match() - find the next @key in @tree * @key: key to match * @tree: tree to search * @cmp: operator defining node order * * Returns the next node matching @key, or NULL. */ static __always_inline struct rb_node * rb_next_match(const void *key, struct rb_node *node, int (*cmp)(const void *key, const struct rb_node *)) { node = rb_next(node); if (node && cmp(key, node)) node = NULL; return node; } /** * rb_for_each() - iterates a subtree matching @key * @node: iterator * @key: key to match * @tree: tree to search * @cmp: operator defining node order */ #define rb_for_each(node, key, tree, cmp) \ for ((node) = rb_find_first((key), (tree), (cmp)); \ (node); (node) = rb_next_match((key), (node), (cmp))) #endif /* _LINUX_RBTREE_H */
1127 1088 56 731 1088 54 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 // SPDX-License-Identifier: GPL-2.0-or-later /* Copyright Amazon.com Inc. or its affiliates. */ #include <linux/init.h> #include <linux/netdevice.h> #include <linux/notifier.h> #include <linux/rtnetlink.h> #include <net/net_namespace.h> #include <net/netdev_lock.h> #include <net/netns/generic.h> int netdev_debug_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); enum netdev_cmd cmd = event; /* Keep enum and don't add default to trigger -Werror=switch */ switch (cmd) { case NETDEV_XDP_FEAT_CHANGE: netdev_assert_locked(dev); fallthrough; case NETDEV_CHANGE: case NETDEV_REGISTER: case NETDEV_UP: netdev_ops_assert_locked(dev); fallthrough; case NETDEV_DOWN: case NETDEV_REBOOT: case NETDEV_UNREGISTER: case NETDEV_CHANGEMTU: case NETDEV_CHANGEADDR: case NETDEV_PRE_CHANGEADDR: case NETDEV_GOING_DOWN: case NETDEV_FEAT_CHANGE: case NETDEV_BONDING_FAILOVER: case NETDEV_PRE_UP: case NETDEV_PRE_TYPE_CHANGE: case NETDEV_POST_TYPE_CHANGE: case NETDEV_POST_INIT: case NETDEV_PRE_UNINIT: case NETDEV_RELEASE: case NETDEV_NOTIFY_PEERS: case NETDEV_JOIN: case NETDEV_CHANGEUPPER: case NETDEV_RESEND_IGMP: case NETDEV_PRECHANGEMTU: case NETDEV_CHANGEINFODATA: case NETDEV_BONDING_INFO: case NETDEV_PRECHANGEUPPER: case NETDEV_CHANGELOWERSTATE: case NETDEV_UDP_TUNNEL_PUSH_INFO: case NETDEV_UDP_TUNNEL_DROP_INFO: case NETDEV_CHANGE_TX_QUEUE_LEN: case NETDEV_CVLAN_FILTER_PUSH_INFO: case NETDEV_CVLAN_FILTER_DROP_INFO: case NETDEV_SVLAN_FILTER_PUSH_INFO: case NETDEV_SVLAN_FILTER_DROP_INFO: case NETDEV_OFFLOAD_XSTATS_ENABLE: case NETDEV_OFFLOAD_XSTATS_DISABLE: case NETDEV_OFFLOAD_XSTATS_REPORT_USED: case NETDEV_OFFLOAD_XSTATS_REPORT_DELTA: ASSERT_RTNL(); break; case NETDEV_CHANGENAME: ASSERT_RTNL_NET(net); break; } return NOTIFY_DONE; } EXPORT_SYMBOL_NS_GPL(netdev_debug_event, "NETDEV_INTERNAL"); static int rtnl_net_debug_net_id; static int __net_init rtnl_net_debug_net_init(struct net *net) { struct notifier_block *nb; nb = net_generic(net, rtnl_net_debug_net_id); nb->notifier_call = netdev_debug_event; return register_netdevice_notifier_net(net, nb); } static void __net_exit rtnl_net_debug_net_exit(struct net *net) { struct notifier_block *nb; nb = net_generic(net, rtnl_net_debug_net_id); unregister_netdevice_notifier_net(net, nb); } static struct pernet_operations rtnl_net_debug_net_ops __net_initdata = { .init = rtnl_net_debug_net_init, .exit = rtnl_net_debug_net_exit, .id = &rtnl_net_debug_net_id, .size = sizeof(struct notifier_block), }; static struct notifier_block rtnl_net_debug_block = { .notifier_call = netdev_debug_event, }; static int __init rtnl_net_debug_init(void) { int ret; ret = register_pernet_subsys(&rtnl_net_debug_net_ops); if (ret) return ret; ret = register_netdevice_notifier(&rtnl_net_debug_block); if (ret) unregister_pernet_subsys(&rtnl_net_debug_net_ops); return ret; } subsys_initcall(rtnl_net_debug_init);
33 33 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 /* SPDX-License-Identifier: GPL-2.0 */ /* Rewritten and vastly simplified by Rusty Russell for in-kernel * module loader: * Copyright 2002 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation */ #ifndef _LINUX_KALLSYMS_H #define _LINUX_KALLSYMS_H #include <linux/errno.h> #include <linux/buildid.h> #include <linux/kernel.h> #include <linux/stddef.h> #include <linux/mm.h> #include <linux/module.h> #include <asm/sections.h> #define KSYM_NAME_LEN 512 #define KSYM_SYMBOL_LEN (sizeof("%s+%#lx/%#lx [%s %s]") + \ (KSYM_NAME_LEN - 1) + \ 2*(BITS_PER_LONG*3/10) + (MODULE_NAME_LEN - 1) + \ (BUILD_ID_SIZE_MAX * 2) + 1) struct cred; struct module; static inline int is_kernel_text(unsigned long addr) { if (__is_kernel_text(addr)) return 1; return in_gate_area_no_mm(addr); } static inline int is_kernel(unsigned long addr) { if (__is_kernel(addr)) return 1; return in_gate_area_no_mm(addr); } static inline int is_ksym_addr(unsigned long addr) { if (IS_ENABLED(CONFIG_KALLSYMS_ALL)) return is_kernel(addr); return is_kernel_text(addr) || is_kernel_inittext(addr); } static inline void *dereference_symbol_descriptor(void *ptr) { #ifdef CONFIG_HAVE_FUNCTION_DESCRIPTORS struct module *mod; ptr = dereference_kernel_function_descriptor(ptr); if (is_ksym_addr((unsigned long)ptr)) return ptr; guard(rcu)(); mod = __module_address((unsigned long)ptr); if (mod) ptr = dereference_module_function_descriptor(mod, ptr); #endif return ptr; } /* How and when do we show kallsyms values? */ extern bool kallsyms_show_value(const struct cred *cred); #ifdef CONFIG_KALLSYMS unsigned long kallsyms_sym_address(int idx); int kallsyms_on_each_symbol(int (*fn)(void *, const char *, unsigned long), void *data); int kallsyms_on_each_match_symbol(int (*fn)(void *, unsigned long), const char *name, void *data); /* Lookup the address for a symbol. Returns 0 if not found. */ unsigned long kallsyms_lookup_name(const char *name); extern int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, unsigned long *offset); /* Lookup an address. modname is set to NULL if it's in the kernel. */ const char *kallsyms_lookup(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, char *namebuf); /* Look up a kernel symbol and return it in a text buffer. */ extern int sprint_symbol(char *buffer, unsigned long address); extern int sprint_symbol_build_id(char *buffer, unsigned long address); extern int sprint_symbol_no_offset(char *buffer, unsigned long address); extern int sprint_backtrace(char *buffer, unsigned long address); extern int sprint_backtrace_build_id(char *buffer, unsigned long address); int lookup_symbol_name(unsigned long addr, char *symname); #else /* !CONFIG_KALLSYMS */ static inline unsigned long kallsyms_lookup_name(const char *name) { return 0; } static inline int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, unsigned long *offset) { return 0; } static inline const char *kallsyms_lookup(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, char *namebuf) { return NULL; } static inline int sprint_symbol(char *buffer, unsigned long addr) { *buffer = '\0'; return 0; } static inline int sprint_symbol_build_id(char *buffer, unsigned long address) { *buffer = '\0'; return 0; } static inline int sprint_symbol_no_offset(char *buffer, unsigned long addr) { *buffer = '\0'; return 0; } static inline int sprint_backtrace(char *buffer, unsigned long addr) { *buffer = '\0'; return 0; } static inline int sprint_backtrace_build_id(char *buffer, unsigned long addr) { *buffer = '\0'; return 0; } static inline int lookup_symbol_name(unsigned long addr, char *symname) { return -ERANGE; } static inline int kallsyms_on_each_symbol(int (*fn)(void *, const char *, unsigned long), void *data) { return -EOPNOTSUPP; } static inline int kallsyms_on_each_match_symbol(int (*fn)(void *, unsigned long), const char *name, void *data) { return -EOPNOTSUPP; } #endif /*CONFIG_KALLSYMS*/ static inline void print_ip_sym(const char *loglvl, unsigned long ip) { printk("%s[<%px>] %pS\n", loglvl, (void *) ip, (void *) ip); } #endif /*_LINUX_KALLSYMS_H*/
18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 // SPDX-License-Identifier: GPL-2.0-or-later /* X.509 certificate parser * * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) "X.509: "fmt #include <linux/kernel.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/err.h> #include <linux/oid_registry.h> #include <crypto/public_key.h> #include "x509_parser.h" #include "x509.asn1.h" #include "x509_akid.asn1.h" struct x509_parse_context { struct x509_certificate *cert; /* Certificate being constructed */ unsigned long data; /* Start of data */ const void *key; /* Key data */ size_t key_size; /* Size of key data */ const void *params; /* Key parameters */ size_t params_size; /* Size of key parameters */ enum OID key_algo; /* Algorithm used by the cert's key */ enum OID last_oid; /* Last OID encountered */ enum OID sig_algo; /* Algorithm used to sign the cert */ u8 o_size; /* Size of organizationName (O) */ u8 cn_size; /* Size of commonName (CN) */ u8 email_size; /* Size of emailAddress */ u16 o_offset; /* Offset of organizationName (O) */ u16 cn_offset; /* Offset of commonName (CN) */ u16 email_offset; /* Offset of emailAddress */ unsigned raw_akid_size; const void *raw_akid; /* Raw authorityKeyId in ASN.1 */ const void *akid_raw_issuer; /* Raw directoryName in authorityKeyId */ unsigned akid_raw_issuer_size; }; /* * Free an X.509 certificate */ void x509_free_certificate(struct x509_certificate *cert) { if (cert) { public_key_free(cert->pub); public_key_signature_free(cert->sig); kfree(cert->issuer); kfree(cert->subject); kfree(cert->id); kfree(cert->skid); kfree(cert); } } EXPORT_SYMBOL_GPL(x509_free_certificate); /* * Parse an X.509 certificate */ struct x509_certificate *x509_cert_parse(const void *data, size_t datalen) { struct x509_certificate *cert __free(x509_free_certificate) = NULL; struct x509_parse_context *ctx __free(kfree) = NULL; struct asymmetric_key_id *kid; long ret; cert = kzalloc_obj(struct x509_certificate); if (!cert) return ERR_PTR(-ENOMEM); cert->pub = kzalloc_obj(struct public_key); if (!cert->pub) return ERR_PTR(-ENOMEM); cert->sig = kzalloc_obj(struct public_key_signature); if (!cert->sig) return ERR_PTR(-ENOMEM); ctx = kzalloc_obj(struct x509_parse_context); if (!ctx) return ERR_PTR(-ENOMEM); ctx->cert = cert; ctx->data = (unsigned long)data; /* Attempt to decode the certificate */ ret = asn1_ber_decoder(&x509_decoder, ctx, data, datalen); if (ret < 0) return ERR_PTR(ret); /* Decode the AuthorityKeyIdentifier */ if (ctx->raw_akid) { pr_devel("AKID: %u %*phN\n", ctx->raw_akid_size, ctx->raw_akid_size, ctx->raw_akid); ret = asn1_ber_decoder(&x509_akid_decoder, ctx, ctx->raw_akid, ctx->raw_akid_size); if (ret < 0) { pr_warn("Couldn't decode AuthKeyIdentifier\n"); return ERR_PTR(ret); } } cert->pub->key = kmemdup(ctx->key, ctx->key_size, GFP_KERNEL); if (!cert->pub->key) return ERR_PTR(-ENOMEM); cert->pub->keylen = ctx->key_size; cert->pub->params = kmemdup(ctx->params, ctx->params_size, GFP_KERNEL); if (!cert->pub->params) return ERR_PTR(-ENOMEM); cert->pub->paramlen = ctx->params_size; cert->pub->algo = ctx->key_algo; /* Grab the signature bits */ ret = x509_get_sig_params(cert); if (ret < 0) return ERR_PTR(ret); /* Generate cert issuer + serial number key ID */ kid = asymmetric_key_generate_id(cert->raw_serial, cert->raw_serial_size, cert->raw_issuer, cert->raw_issuer_size); if (IS_ERR(kid)) return ERR_CAST(kid); cert->id = kid; /* Detect self-signed certificates */ ret = x509_check_for_self_signed(cert); if (ret < 0) return ERR_PTR(ret); return_ptr(cert); } EXPORT_SYMBOL_GPL(x509_cert_parse); /* * Note an OID when we find one for later processing when we know how * to interpret it. */ int x509_note_OID(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct x509_parse_context *ctx = context; ctx->last_oid = look_up_OID(value, vlen); if (ctx->last_oid == OID__NR) { char buffer[50]; sprint_oid(value, vlen, buffer, sizeof(buffer)); pr_debug("Unknown OID: [%lu] %s\n", (unsigned long)value - ctx->data, buffer); } return 0; } /* * Save the position of the TBS data so that we can check the signature over it * later. */ int x509_note_tbs_certificate(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct x509_parse_context *ctx = context; pr_debug("x509_note_tbs_certificate(,%zu,%02x,%ld,%zu)!\n", hdrlen, tag, (unsigned long)value - ctx->data, vlen); ctx->cert->tbs = value - hdrlen; ctx->cert->tbs_size = vlen + hdrlen; return 0; } /* * Record the algorithm that was used to sign this certificate. */ int x509_note_sig_algo(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct x509_parse_context *ctx = context; pr_debug("PubKey Algo: %u\n", ctx->last_oid); switch (ctx->last_oid) { default: return -ENOPKG; /* Unsupported combination */ case OID_sha1WithRSAEncryption: ctx->cert->sig->hash_algo = "sha1"; goto rsa_pkcs1; case OID_sha256WithRSAEncryption: ctx->cert->sig->hash_algo = "sha256"; goto rsa_pkcs1; case OID_sha384WithRSAEncryption: ctx->cert->sig->hash_algo = "sha384"; goto rsa_pkcs1; case OID_sha512WithRSAEncryption: ctx->cert->sig->hash_algo = "sha512"; goto rsa_pkcs1; case OID_sha224WithRSAEncryption: ctx->cert->sig->hash_algo = "sha224"; goto rsa_pkcs1; case OID_id_ecdsa_with_sha1: ctx->cert->sig->hash_algo = "sha1"; goto ecdsa; case OID_id_rsassa_pkcs1_v1_5_with_sha3_256: ctx->cert->sig->hash_algo = "sha3-256"; goto rsa_pkcs1; case OID_id_rsassa_pkcs1_v1_5_with_sha3_384: ctx->cert->sig->hash_algo = "sha3-384"; goto rsa_pkcs1; case OID_id_rsassa_pkcs1_v1_5_with_sha3_512: ctx->cert->sig->hash_algo = "sha3-512"; goto rsa_pkcs1; case OID_id_ecdsa_with_sha224: ctx->cert->sig->hash_algo = "sha224"; goto ecdsa; case OID_id_ecdsa_with_sha256: ctx->cert->sig->hash_algo = "sha256"; goto ecdsa; case OID_id_ecdsa_with_sha384: ctx->cert->sig->hash_algo = "sha384"; goto ecdsa; case OID_id_ecdsa_with_sha512: ctx->cert->sig->hash_algo = "sha512"; goto ecdsa; case OID_id_ecdsa_with_sha3_256: ctx->cert->sig->hash_algo = "sha3-256"; goto ecdsa; case OID_id_ecdsa_with_sha3_384: ctx->cert->sig->hash_algo = "sha3-384"; goto ecdsa; case OID_id_ecdsa_with_sha3_512: ctx->cert->sig->hash_algo = "sha3-512"; goto ecdsa; case OID_gost2012Signature256: ctx->cert->sig->hash_algo = "streebog256"; goto ecrdsa; case OID_gost2012Signature512: ctx->cert->sig->hash_algo = "streebog512"; goto ecrdsa; case OID_id_ml_dsa_44: ctx->cert->sig->pkey_algo = "mldsa44"; goto ml_dsa; case OID_id_ml_dsa_65: ctx->cert->sig->pkey_algo = "mldsa65"; goto ml_dsa; case OID_id_ml_dsa_87: ctx->cert->sig->pkey_algo = "mldsa87"; goto ml_dsa; } rsa_pkcs1: ctx->cert->sig->pkey_algo = "rsa"; ctx->cert->sig->encoding = "pkcs1"; ctx->sig_algo = ctx->last_oid; return 0; ecrdsa: ctx->cert->sig->pkey_algo = "ecrdsa"; ctx->cert->sig->encoding = "raw"; ctx->sig_algo = ctx->last_oid; return 0; ecdsa: ctx->cert->sig->pkey_algo = "ecdsa"; ctx->cert->sig->encoding = "x962"; ctx->sig_algo = ctx->last_oid; return 0; ml_dsa: ctx->cert->sig->algo_takes_data = true; ctx->cert->sig->hash_algo = "none"; ctx->cert->sig->encoding = "raw"; ctx->sig_algo = ctx->last_oid; return 0; } /* * Note the whereabouts and type of the signature. */ int x509_note_signature(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct x509_parse_context *ctx = context; pr_debug("Signature: alg=%u, size=%zu\n", ctx->last_oid, vlen); /* * In X.509 certificates, the signature's algorithm is stored in two * places: inside the TBSCertificate (the data that is signed), and * alongside the signature. These *must* match. */ if (ctx->last_oid != ctx->sig_algo) { pr_warn("signatureAlgorithm (%u) differs from tbsCertificate.signature (%u)\n", ctx->last_oid, ctx->sig_algo); return -EINVAL; } if (strcmp(ctx->cert->sig->pkey_algo, "rsa") == 0 || strcmp(ctx->cert->sig->pkey_algo, "ecrdsa") == 0 || strcmp(ctx->cert->sig->pkey_algo, "ecdsa") == 0 || strncmp(ctx->cert->sig->pkey_algo, "mldsa", 5) == 0) { /* Discard the BIT STRING metadata */ if (vlen < 1 || *(const u8 *)value != 0) return -EBADMSG; value++; vlen--; } ctx->cert->raw_sig = value; ctx->cert->raw_sig_size = vlen; return 0; } /* * Note the certificate serial number */ int x509_note_serial(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct x509_parse_context *ctx = context; ctx->cert->raw_serial = value; ctx->cert->raw_serial_size = vlen; return 0; } /* * Note some of the name segments from which we'll fabricate a name. */ int x509_extract_name_segment(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct x509_parse_context *ctx = context; switch (ctx->last_oid) { case OID_commonName: ctx->cn_size = vlen; ctx->cn_offset = (unsigned long)value - ctx->data; break; case OID_organizationName: ctx->o_size = vlen; ctx->o_offset = (unsigned long)value - ctx->data; break; case OID_email_address: ctx->email_size = vlen; ctx->email_offset = (unsigned long)value - ctx->data; break; default: break; } return 0; } /* * Fabricate and save the issuer and subject names */ static int x509_fabricate_name(struct x509_parse_context *ctx, size_t hdrlen, unsigned char tag, char **_name, size_t vlen) { const void *name, *data = (const void *)ctx->data; size_t namesize; char *buffer; if (*_name) return -EINVAL; /* Empty name string if no material */ if (!ctx->cn_size && !ctx->o_size && !ctx->email_size) { buffer = kzalloc(1, GFP_KERNEL); if (!buffer) return -ENOMEM; goto done; } if (ctx->cn_size && ctx->o_size) { /* Consider combining O and CN, but use only the CN if it is * prefixed by the O, or a significant portion thereof. */ namesize = ctx->cn_size; name = data + ctx->cn_offset; if (ctx->cn_size >= ctx->o_size && memcmp(data + ctx->cn_offset, data + ctx->o_offset, ctx->o_size) == 0) goto single_component; if (ctx->cn_size >= 7 && ctx->o_size >= 7 && memcmp(data + ctx->cn_offset, data + ctx->o_offset, 7) == 0) goto single_component; buffer = kmalloc(ctx->o_size + 2 + ctx->cn_size + 1, GFP_KERNEL); if (!buffer) return -ENOMEM; memcpy(buffer, data + ctx->o_offset, ctx->o_size); buffer[ctx->o_size + 0] = ':'; buffer[ctx->o_size + 1] = ' '; memcpy(buffer + ctx->o_size + 2, data + ctx->cn_offset, ctx->cn_size); buffer[ctx->o_size + 2 + ctx->cn_size] = 0; goto done; } else if (ctx->cn_size) { namesize = ctx->cn_size; name = data + ctx->cn_offset; } else if (ctx->o_size) { namesize = ctx->o_size; name = data + ctx->o_offset; } else { namesize = ctx->email_size; name = data + ctx->email_offset; } single_component: buffer = kmalloc(namesize + 1, GFP_KERNEL); if (!buffer) return -ENOMEM; memcpy(buffer, name, namesize); buffer[namesize] = 0; done: *_name = buffer; ctx->cn_size = 0; ctx->o_size = 0; ctx->email_size = 0; return 0; } int x509_note_issuer(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct x509_parse_context *ctx = context; struct asymmetric_key_id *kid; ctx->cert->raw_issuer = value; ctx->cert->raw_issuer_size = vlen; if (!ctx->cert->sig->auth_ids[2]) { kid = asymmetric_key_generate_id(value, vlen, "", 0); if (IS_ERR(kid)) return PTR_ERR(kid); ctx->cert->sig->auth_ids[2] = kid; } return x509_fabricate_name(ctx, hdrlen, tag, &ctx->cert->issuer, vlen); } int x509_note_subject(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct x509_parse_context *ctx = context; ctx->cert->raw_subject = value; ctx->cert->raw_subject_size = vlen; return x509_fabricate_name(ctx, hdrlen, tag, &ctx->cert->subject, vlen); } /* * Extract the parameters for the public key */ int x509_note_params(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct x509_parse_context *ctx = context; /* * AlgorithmIdentifier is used three times in the x509, we should skip * first and ignore third, using second one which is after subject and * before subjectPublicKey. */ if (!ctx->cert->raw_subject || ctx->key) return 0; ctx->params = value - hdrlen; ctx->params_size = vlen + hdrlen; return 0; } /* * Extract the data for the public key algorithm */ int x509_extract_key_data(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct x509_parse_context *ctx = context; enum OID oid; ctx->key_algo = ctx->last_oid; switch (ctx->last_oid) { case OID_rsaEncryption: ctx->cert->pub->pkey_algo = "rsa"; break; case OID_gost2012PKey256: case OID_gost2012PKey512: ctx->cert->pub->pkey_algo = "ecrdsa"; break; case OID_id_ecPublicKey: if (parse_OID(ctx->params, ctx->params_size, &oid) != 0) return -EBADMSG; switch (oid) { case OID_id_prime192v1: ctx->cert->pub->pkey_algo = "ecdsa-nist-p192"; break; case OID_id_prime256v1: ctx->cert->pub->pkey_algo = "ecdsa-nist-p256"; break; case OID_id_ansip384r1: ctx->cert->pub->pkey_algo = "ecdsa-nist-p384"; break; case OID_id_ansip521r1: ctx->cert->pub->pkey_algo = "ecdsa-nist-p521"; break; default: return -ENOPKG; } break; case OID_id_ml_dsa_44: ctx->cert->pub->pkey_algo = "mldsa44"; break; case OID_id_ml_dsa_65: ctx->cert->pub->pkey_algo = "mldsa65"; break; case OID_id_ml_dsa_87: ctx->cert->pub->pkey_algo = "mldsa87"; break; default: return -ENOPKG; } /* Discard the BIT STRING metadata */ if (vlen < 1 || *(const u8 *)value != 0) return -EBADMSG; ctx->key = value + 1; ctx->key_size = vlen - 1; return 0; } /* The keyIdentifier in AuthorityKeyIdentifier SEQUENCE is tag(CONT,PRIM,0) */ #define SEQ_TAG_KEYID (ASN1_CONT << 6) /* * Process certificate extensions that are used to qualify the certificate. */ int x509_process_extension(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct x509_parse_context *ctx = context; struct asymmetric_key_id *kid; const unsigned char *v = value; pr_debug("Extension: %u\n", ctx->last_oid); if (ctx->last_oid == OID_subjectKeyIdentifier) { /* Get hold of the key fingerprint */ if (ctx->cert->skid || vlen < 3) return -EBADMSG; if (v[0] != ASN1_OTS || v[1] != vlen - 2) return -EBADMSG; v += 2; vlen -= 2; ctx->cert->raw_skid_size = vlen; ctx->cert->raw_skid = v; kid = asymmetric_key_generate_id(v, vlen, "", 0); if (IS_ERR(kid)) return PTR_ERR(kid); ctx->cert->skid = kid; pr_debug("subjkeyid %*phN\n", kid->len, kid->data); return 0; } if (ctx->last_oid == OID_keyUsage) { /* * Get hold of the keyUsage bit string * v[1] is the encoding size * (Expect either 0x02 or 0x03, making it 1 or 2 bytes) * v[2] is the number of unused bits in the bit string * (If >= 3 keyCertSign is missing when v[1] = 0x02) * v[3] and possibly v[4] contain the bit string * * From RFC 5280 4.2.1.3: * 0x04 is where keyCertSign lands in this bit string * 0x80 is where digitalSignature lands in this bit string */ if (v[0] != ASN1_BTS) return -EBADMSG; if (vlen < 4) return -EBADMSG; if (v[2] >= 8) return -EBADMSG; if (v[3] & 0x80) ctx->cert->pub->key_eflags |= 1 << KEY_EFLAG_DIGITALSIG; if (v[1] == 0x02 && v[2] <= 2 && (v[3] & 0x04)) ctx->cert->pub->key_eflags |= 1 << KEY_EFLAG_KEYCERTSIGN; else if (vlen > 4 && v[1] == 0x03 && (v[3] & 0x04)) ctx->cert->pub->key_eflags |= 1 << KEY_EFLAG_KEYCERTSIGN; return 0; } if (ctx->last_oid == OID_authorityKeyIdentifier) { /* Get hold of the CA key fingerprint */ ctx->raw_akid = v; ctx->raw_akid_size = vlen; return 0; } if (ctx->last_oid == OID_basicConstraints) { /* * Get hold of the basicConstraints * v[1] is the encoding size * (Expect 0x00 for empty SEQUENCE with CA:FALSE, or * 0x03 or greater for non-empty SEQUENCE) * v[2] is the encoding type * (Expect an ASN1_BOOL for the CA) * v[3] is the length of the ASN1_BOOL * (Expect 1 for a single byte boolean) * v[4] is the contents of the ASN1_BOOL * (Expect 0xFF if the CA is TRUE) * vlen should match the entire extension size */ if (v[0] != (ASN1_CONS_BIT | ASN1_SEQ)) return -EBADMSG; if (vlen < 2) return -EBADMSG; if (v[1] != vlen - 2) return -EBADMSG; /* Empty SEQUENCE means CA:FALSE (default value omitted per DER) */ if (v[1] == 0) return 0; if (vlen >= 5 && v[2] == ASN1_BOOL && v[3] == 1 && v[4] == 0xFF) ctx->cert->pub->key_eflags |= 1 << KEY_EFLAG_CA; else return -EBADMSG; return 0; } return 0; } /** * x509_decode_time - Decode an X.509 time ASN.1 object * @_t: The time to fill in * @hdrlen: The length of the object header * @tag: The object tag * @value: The object value * @vlen: The size of the object value * * Decode an ASN.1 universal time or generalised time field into a struct the * kernel can handle and check it for validity. The time is decoded thus: * * [RFC5280 §4.1.2.5] * CAs conforming to this profile MUST always encode certificate validity * dates through the year 2049 as UTCTime; certificate validity dates in * 2050 or later MUST be encoded as GeneralizedTime. Conforming * applications MUST be able to process validity dates that are encoded in * either UTCTime or GeneralizedTime. */ int x509_decode_time(time64_t *_t, size_t hdrlen, unsigned char tag, const unsigned char *value, size_t vlen) { static const unsigned char month_lengths[] = { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 }; const unsigned char *p = value; unsigned year, mon, day, hour, min, sec, mon_len; #define dec2bin(X) ({ unsigned char x = (X) - '0'; if (x > 9) goto invalid_time; x; }) #define DD2bin(P) ({ unsigned x = dec2bin(P[0]) * 10 + dec2bin(P[1]); P += 2; x; }) if (tag == ASN1_UNITIM) { /* UTCTime: YYMMDDHHMMSSZ */ if (vlen != 13) goto unsupported_time; year = DD2bin(p); if (year >= 50) year += 1900; else year += 2000; } else if (tag == ASN1_GENTIM) { /* GenTime: YYYYMMDDHHMMSSZ */ if (vlen != 15) goto unsupported_time; year = DD2bin(p) * 100 + DD2bin(p); if (year >= 1950 && year <= 2049) goto invalid_time; } else { goto unsupported_time; } mon = DD2bin(p); day = DD2bin(p); hour = DD2bin(p); min = DD2bin(p); sec = DD2bin(p); if (*p != 'Z') goto unsupported_time; if (year < 1970 || mon < 1 || mon > 12) goto invalid_time; mon_len = month_lengths[mon - 1]; if (mon == 2) { if (year % 4 == 0) { mon_len = 29; if (year % 100 == 0) { mon_len = 28; if (year % 400 == 0) mon_len = 29; } } } if (day < 1 || day > mon_len || hour > 24 || /* ISO 8601 permits 24:00:00 as midnight tomorrow */ min > 59 || sec > 60) /* ISO 8601 permits leap seconds [X.680 46.3] */ goto invalid_time; *_t = mktime64(year, mon, day, hour, min, sec); return 0; unsupported_time: pr_debug("Got unsupported time [tag %02x]: '%*phN'\n", tag, (int)vlen, value); return -EBADMSG; invalid_time: pr_debug("Got invalid time [tag %02x]: '%*phN'\n", tag, (int)vlen, value); return -EBADMSG; } EXPORT_SYMBOL_GPL(x509_decode_time); int x509_note_not_before(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct x509_parse_context *ctx = context; return x509_decode_time(&ctx->cert->valid_from, hdrlen, tag, value, vlen); } int x509_note_not_after(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct x509_parse_context *ctx = context; return x509_decode_time(&ctx->cert->valid_to, hdrlen, tag, value, vlen); } /* * Note a key identifier-based AuthorityKeyIdentifier */ int x509_akid_note_kid(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct x509_parse_context *ctx = context; struct asymmetric_key_id *kid; pr_debug("AKID: keyid: %*phN\n", (int)vlen, value); if (ctx->cert->sig->auth_ids[1]) return 0; kid = asymmetric_key_generate_id(value, vlen, "", 0); if (IS_ERR(kid)) return PTR_ERR(kid); pr_debug("authkeyid %*phN\n", kid->len, kid->data); ctx->cert->sig->auth_ids[1] = kid; return 0; } /* * Note a directoryName in an AuthorityKeyIdentifier */ int x509_akid_note_name(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct x509_parse_context *ctx = context; pr_debug("AKID: name: %*phN\n", (int)vlen, value); ctx->akid_raw_issuer = value; ctx->akid_raw_issuer_size = vlen; return 0; } /* * Note a serial number in an AuthorityKeyIdentifier */ int x509_akid_note_serial(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct x509_parse_context *ctx = context; struct asymmetric_key_id *kid; pr_debug("AKID: serial: %*phN\n", (int)vlen, value); if (!ctx->akid_raw_issuer || ctx->cert->sig->auth_ids[0]) return 0; kid = asymmetric_key_generate_id(value, vlen, ctx->akid_raw_issuer, ctx->akid_raw_issuer_size); if (IS_ERR(kid)) return PTR_ERR(kid); pr_debug("authkeyid %*phN\n", kid->len, kid->data); ctx->cert->sig->auth_ids[0] = kid; return 0; }
781 745 380 381 381 463 30 356 380 7 7 10 2 38 5 40 40 69 69 69 69 61 1 65 22 61 51 178 178 177 175 4 178 20 547 6 441 442 441 313 312 313 313 314 314 519 519 261 166 167 440 442 442 441 441 56 392 108 121 458 65 543 15 15 15 15 15 15 15 15 15 15 149 149 149 10 149 545 546 546 544 541 5 540 542 540 544 15 543 538 385 7 103 474 537 494 42 536 535 533 4 5 9 511 17 17 17 4 4 4 16 3 16 15 1 10 5 5 5 440 442 350 350 4 442 4 4 389 388 386 386 389 387 389 385 6 2 2 2 7 157 164 188 164 164 61 2 2 6 180 179 63 164 188 6 6 5 7 7 40 40 40 9 7 1 6 5 2 6 7 11 1 2 10 15 7 20 2 24 6 18 2 7 9 4 10 17 17 9 8 6 20 3 7 40 7 30 12 40 11 29 40 241 361 362 3 241 242 242 244 266 264 268 232 38 6 461 463 463 462 464 213 213 213 70 8 31 31 14 14 1 1 3 9 1 1 3 1 1 4 58 59 1 5 2 51 4 4 46 1 1 42 9 11 1 37 2 35 34 4 1 5 34 4 27 9 19 2 20 20 1 1 1 1 8 1 1 2 1 1 1 1 1 20 19 20 20 20 1 1 1 20 20 20 13 1 1 3 8 8 8 7 8 22 22 22 7 22 2 21 20 20 20 19 20 22 418 3 5 20 3 418 417 7 2 2 2 2 12 4 16 14 14 14 14 14 14 12 3 3 3 2 2 2 2 17 9 7 7 4 4 4 7 16 1 16 17 10 16 3 17 1 17 16 15 3 14 15 17 1 3 3 1 24 1 8 1 16 8 30 30 8 8 6 1 1 1 8 405 402 405 401 406 506 506 500 500 502 499 506 253 250 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 // SPDX-License-Identifier: GPL-2.0-or-later /* * Generic address resolution entity * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * * Fixes: * Vitaly E. Lavrov releasing NULL neighbor in neigh_add. * Harald Welte Add neighbour cache statistics like rtstat */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/socket.h> #include <linux/netdevice.h> #include <linux/proc_fs.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <linux/times.h> #include <net/net_namespace.h> #include <net/neighbour.h> #include <net/arp.h> #include <net/dst.h> #include <net/ip.h> #include <net/sock.h> #include <net/netevent.h> #include <net/netlink.h> #include <linux/rtnetlink.h> #include <linux/random.h> #include <linux/string.h> #include <linux/log2.h> #include <linux/inetdevice.h> #include <net/addrconf.h> #include <trace/events/neigh.h> #define NEIGH_DEBUG 1 #define neigh_dbg(level, fmt, ...) \ do { \ if (level <= NEIGH_DEBUG) \ pr_debug(fmt, ##__VA_ARGS__); \ } while (0) #define PNEIGH_HASHMASK 0xF static void neigh_timer_handler(struct timer_list *t); static void neigh_notify(struct neighbour *n, int type, int flags, u32 pid); static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid); static void pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev, bool skip_perm); #ifdef CONFIG_PROC_FS static const struct seq_operations neigh_stat_seq_ops; #endif static struct hlist_head *neigh_get_dev_table(struct net_device *dev, int family) { int i; switch (family) { default: DEBUG_NET_WARN_ON_ONCE(1); fallthrough; /* to avoid panic by null-ptr-deref */ case AF_INET: i = NEIGH_ARP_TABLE; break; case AF_INET6: i = NEIGH_ND_TABLE; break; } return &dev->neighbours[i]; } /* Neighbour hash table buckets are protected with tbl->lock. - All the scans/updates to hash buckets MUST be made under this lock. - NOTHING clever should be made under this lock: no callbacks to protocol backends, no attempts to send something to network. It will result in deadlocks, if backend/driver wants to use neighbour cache. - If the entry requires some non-trivial actions, increase its reference count and release table lock. Neighbour entries are protected: - with reference count. - with rwlock neigh->lock Reference count prevents destruction. neigh->lock mainly serializes ll address data and its validity state. However, the same lock is used to protect another entry fields: - timer - resolution queue Again, nothing clever shall be made under neigh->lock, the most complicated procedure, which we allow is dev->hard_header. It is supposed, that dev->hard_header is simplistic and does not make callbacks to neighbour tables. */ static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb) { kfree_skb(skb); return -ENETDOWN; } static void neigh_cleanup_and_release(struct neighbour *neigh) { trace_neigh_cleanup_and_release(neigh, 0); neigh_notify(neigh, RTM_DELNEIGH, 0, 0); call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); neigh_release(neigh); } /* * It is random distribution in the interval (1/2)*base...(3/2)*base. * It corresponds to default IPv6 settings and is not overridable, * because it is really reasonable choice. */ unsigned long neigh_rand_reach_time(unsigned long base) { return base ? get_random_u32_below(base) + (base >> 1) : 0; } EXPORT_SYMBOL(neigh_rand_reach_time); static void neigh_mark_dead(struct neighbour *n) { n->dead = 1; if (!list_empty(&n->gc_list)) { list_del_init(&n->gc_list); atomic_dec(&n->tbl->gc_entries); } if (!list_empty(&n->managed_list)) list_del_init(&n->managed_list); } static void neigh_update_gc_list(struct neighbour *n) { bool on_gc_list, exempt_from_gc; spin_lock_bh(&n->tbl->lock); write_lock(&n->lock); if (n->dead) goto out; /* remove from the gc list if new state is permanent or if neighbor is * externally learned / validated; otherwise entry should be on the gc * list */ exempt_from_gc = n->nud_state & NUD_PERMANENT || n->flags & (NTF_EXT_LEARNED | NTF_EXT_VALIDATED); on_gc_list = !list_empty(&n->gc_list); if (exempt_from_gc && on_gc_list) { list_del_init(&n->gc_list); atomic_dec(&n->tbl->gc_entries); } else if (!exempt_from_gc && !on_gc_list) { /* add entries to the tail; cleaning removes from the front */ list_add_tail(&n->gc_list, &n->tbl->gc_list); atomic_inc(&n->tbl->gc_entries); } out: write_unlock(&n->lock); spin_unlock_bh(&n->tbl->lock); } static void neigh_update_managed_list(struct neighbour *n) { bool on_managed_list, add_to_managed; spin_lock_bh(&n->tbl->lock); write_lock(&n->lock); if (n->dead) goto out; add_to_managed = n->flags & NTF_MANAGED; on_managed_list = !list_empty(&n->managed_list); if (!add_to_managed && on_managed_list) list_del_init(&n->managed_list); else if (add_to_managed && !on_managed_list) list_add_tail(&n->managed_list, &n->tbl->managed_list); out: write_unlock(&n->lock); spin_unlock_bh(&n->tbl->lock); } static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify, bool *gc_update, bool *managed_update) { u32 ndm_flags, old_flags = neigh->flags; if (!(flags & NEIGH_UPDATE_F_ADMIN)) return; ndm_flags = (flags & NEIGH_UPDATE_F_EXT_LEARNED) ? NTF_EXT_LEARNED : 0; ndm_flags |= (flags & NEIGH_UPDATE_F_MANAGED) ? NTF_MANAGED : 0; ndm_flags |= (flags & NEIGH_UPDATE_F_EXT_VALIDATED) ? NTF_EXT_VALIDATED : 0; if ((old_flags ^ ndm_flags) & NTF_EXT_LEARNED) { if (ndm_flags & NTF_EXT_LEARNED) neigh->flags |= NTF_EXT_LEARNED; else neigh->flags &= ~NTF_EXT_LEARNED; *notify = 1; *gc_update = true; } if ((old_flags ^ ndm_flags) & NTF_MANAGED) { if (ndm_flags & NTF_MANAGED) neigh->flags |= NTF_MANAGED; else neigh->flags &= ~NTF_MANAGED; *notify = 1; *managed_update = true; } if ((old_flags ^ ndm_flags) & NTF_EXT_VALIDATED) { if (ndm_flags & NTF_EXT_VALIDATED) neigh->flags |= NTF_EXT_VALIDATED; else neigh->flags &= ~NTF_EXT_VALIDATED; *notify = 1; *gc_update = true; } } bool neigh_remove_one(struct neighbour *n) { bool retval = false; write_lock(&n->lock); if (refcount_read(&n->refcnt) == 1) { hlist_del_rcu(&n->hash); hlist_del_rcu(&n->dev_list); neigh_mark_dead(n); retval = true; } write_unlock(&n->lock); if (retval) neigh_cleanup_and_release(n); return retval; } static int neigh_forced_gc(struct neigh_table *tbl) { int max_clean = atomic_read(&tbl->gc_entries) - READ_ONCE(tbl->gc_thresh2); u64 tmax = ktime_get_ns() + NSEC_PER_MSEC; unsigned long tref = jiffies - 5 * HZ; struct neighbour *n, *tmp; int shrunk = 0; int loop = 0; NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); spin_lock_bh(&tbl->lock); list_for_each_entry_safe(n, tmp, &tbl->gc_list, gc_list) { if (refcount_read(&n->refcnt) == 1) { bool remove = false; write_lock(&n->lock); if ((n->nud_state == NUD_FAILED) || (n->nud_state == NUD_NOARP) || (tbl->is_multicast && tbl->is_multicast(n->primary_key)) || !time_in_range(n->updated, tref, jiffies)) remove = true; write_unlock(&n->lock); if (remove && neigh_remove_one(n)) shrunk++; if (shrunk >= max_clean) break; if (++loop == 16) { if (ktime_get_ns() > tmax) goto unlock; loop = 0; } } } WRITE_ONCE(tbl->last_flush, jiffies); unlock: spin_unlock_bh(&tbl->lock); return shrunk; } static void neigh_add_timer(struct neighbour *n, unsigned long when) { /* Use safe distance from the jiffies - LONG_MAX point while timer * is running in DELAY/PROBE state but still show to user space * large times in the past. */ unsigned long mint = jiffies - (LONG_MAX - 86400 * HZ); neigh_hold(n); if (!time_in_range(n->confirmed, mint, jiffies)) n->confirmed = mint; if (time_before(n->used, n->confirmed)) n->used = n->confirmed; if (unlikely(mod_timer(&n->timer, when))) { printk("NEIGH: BUG, double timer add, state is %x\n", n->nud_state); dump_stack(); } } static int neigh_del_timer(struct neighbour *n) { if ((n->nud_state & NUD_IN_TIMER) && timer_delete(&n->timer)) { neigh_release(n); return 1; } return 0; } static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev, int family) { switch (family) { case AF_INET: return __in_dev_arp_parms_get_rcu(dev); case AF_INET6: return __in6_dev_nd_parms_get_rcu(dev); } return NULL; } static void neigh_parms_qlen_dec(struct net_device *dev, int family) { struct neigh_parms *p; rcu_read_lock(); p = neigh_get_dev_parms_rcu(dev, family); if (p) p->qlen--; rcu_read_unlock(); } static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net, int family) { struct sk_buff_head tmp; unsigned long flags; struct sk_buff *skb; skb_queue_head_init(&tmp); spin_lock_irqsave(&list->lock, flags); skb = skb_peek(list); while (skb != NULL) { struct sk_buff *skb_next = skb_peek_next(skb, list); struct net_device *dev = skb->dev; if (net == NULL || net_eq(dev_net(dev), net)) { neigh_parms_qlen_dec(dev, family); __skb_unlink(skb, list); __skb_queue_tail(&tmp, skb); } skb = skb_next; } spin_unlock_irqrestore(&list->lock, flags); while ((skb = __skb_dequeue(&tmp))) { dev_put(skb->dev); kfree_skb(skb); } } static void neigh_flush_one(struct neighbour *n) { hlist_del_rcu(&n->hash); hlist_del_rcu(&n->dev_list); write_lock(&n->lock); neigh_del_timer(n); neigh_mark_dead(n); if (refcount_read(&n->refcnt) != 1) { /* The most unpleasant situation. * We must destroy neighbour entry, * but someone still uses it. * * The destroy will be delayed until * the last user releases us, but * we must kill timers etc. and move * it to safe state. */ __skb_queue_purge(&n->arp_queue); n->arp_queue_len_bytes = 0; WRITE_ONCE(n->output, neigh_blackhole); if (n->nud_state & NUD_VALID) n->nud_state = NUD_NOARP; else n->nud_state = NUD_NONE; neigh_dbg(2, "neigh %p is stray\n", n); } write_unlock(&n->lock); neigh_cleanup_and_release(n); } static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, bool skip_perm) { struct hlist_head *dev_head; struct hlist_node *tmp; struct neighbour *n; dev_head = neigh_get_dev_table(dev, tbl->family); hlist_for_each_entry_safe(n, tmp, dev_head, dev_list) { if (skip_perm && (n->nud_state & NUD_PERMANENT || n->flags & NTF_EXT_VALIDATED)) continue; neigh_flush_one(n); } } static void neigh_flush_table(struct neigh_table *tbl) { struct neigh_hash_table *nht; int i; nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); for (i = 0; i < (1 << nht->hash_shift); i++) { struct hlist_node *tmp; struct neighbour *n; neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) neigh_flush_one(n); } } void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev) { spin_lock_bh(&tbl->lock); neigh_flush_dev(tbl, dev, false); spin_unlock_bh(&tbl->lock); } EXPORT_SYMBOL(neigh_changeaddr); static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev, bool skip_perm) { spin_lock_bh(&tbl->lock); if (likely(dev)) { neigh_flush_dev(tbl, dev, skip_perm); } else { DEBUG_NET_WARN_ON_ONCE(skip_perm); neigh_flush_table(tbl); } spin_unlock_bh(&tbl->lock); pneigh_ifdown(tbl, dev, skip_perm); pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL, tbl->family); if (skb_queue_empty_lockless(&tbl->proxy_queue)) timer_delete_sync(&tbl->proxy_timer); return 0; } int neigh_carrier_down(struct neigh_table *tbl, struct net_device *dev) { __neigh_ifdown(tbl, dev, true); return 0; } EXPORT_SYMBOL(neigh_carrier_down); int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) { __neigh_ifdown(tbl, dev, false); return 0; } EXPORT_SYMBOL(neigh_ifdown); static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev, u32 flags, bool exempt_from_gc) { struct neighbour *n = NULL; unsigned long now = jiffies; int entries, gc_thresh3; if (exempt_from_gc) goto do_alloc; entries = atomic_inc_return(&tbl->gc_entries) - 1; gc_thresh3 = READ_ONCE(tbl->gc_thresh3); if (entries >= gc_thresh3 || (entries >= READ_ONCE(tbl->gc_thresh2) && time_after(now, READ_ONCE(tbl->last_flush) + 5 * HZ))) { if (!neigh_forced_gc(tbl) && entries >= gc_thresh3) { net_info_ratelimited("%s: neighbor table overflow!\n", tbl->id); NEIGH_CACHE_STAT_INC(tbl, table_fulls); goto out_entries; } } do_alloc: n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC); if (!n) goto out_entries; __skb_queue_head_init(&n->arp_queue); rwlock_init(&n->lock); seqlock_init(&n->ha_lock); n->updated = n->used = now; n->nud_state = NUD_NONE; n->output = neigh_blackhole; n->flags = flags; seqlock_init(&n->hh.hh_lock); n->parms = neigh_parms_clone(&tbl->parms); timer_setup(&n->timer, neigh_timer_handler, 0); NEIGH_CACHE_STAT_INC(tbl, allocs); n->tbl = tbl; refcount_set(&n->refcnt, 1); n->dead = 1; INIT_LIST_HEAD(&n->gc_list); INIT_LIST_HEAD(&n->managed_list); atomic_inc(&tbl->entries); out: return n; out_entries: if (!exempt_from_gc) atomic_dec(&tbl->gc_entries); goto out; } static void neigh_get_hash_rnd(u32 *x) { *x = get_random_u32() | 1; } static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift) { size_t size = (1 << shift) * sizeof(struct hlist_head); struct hlist_head *hash_heads; struct neigh_hash_table *ret; int i; ret = kmalloc_obj(*ret, GFP_ATOMIC); if (!ret) return NULL; hash_heads = kzalloc(size, GFP_ATOMIC); if (!hash_heads) { kfree(ret); return NULL; } ret->hash_heads = hash_heads; ret->hash_shift = shift; for (i = 0; i < NEIGH_NUM_HASH_RND; i++) neigh_get_hash_rnd(&ret->hash_rnd[i]); return ret; } static void neigh_hash_free_rcu(struct rcu_head *head) { struct neigh_hash_table *nht = container_of(head, struct neigh_hash_table, rcu); kfree(nht->hash_heads); kfree(nht); } static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl, unsigned long new_shift) { unsigned int i, hash; struct neigh_hash_table *new_nht, *old_nht; NEIGH_CACHE_STAT_INC(tbl, hash_grows); old_nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); new_nht = neigh_hash_alloc(new_shift); if (!new_nht) return old_nht; for (i = 0; i < (1 << old_nht->hash_shift); i++) { struct hlist_node *tmp; struct neighbour *n; neigh_for_each_in_bucket_safe(n, tmp, &old_nht->hash_heads[i]) { hash = tbl->hash(n->primary_key, n->dev, new_nht->hash_rnd); hash >>= (32 - new_nht->hash_shift); hlist_del_rcu(&n->hash); hlist_add_head_rcu(&n->hash, &new_nht->hash_heads[hash]); } } rcu_assign_pointer(tbl->nht, new_nht); call_rcu(&old_nht->rcu, neigh_hash_free_rcu); return new_nht; } struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev) { struct neighbour *n; NEIGH_CACHE_STAT_INC(tbl, lookups); rcu_read_lock(); n = __neigh_lookup_noref(tbl, pkey, dev); if (n) { if (!refcount_inc_not_zero(&n->refcnt)) n = NULL; NEIGH_CACHE_STAT_INC(tbl, hits); } rcu_read_unlock(); return n; } EXPORT_SYMBOL(neigh_lookup); static struct neighbour * ___neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, u32 flags, bool exempt_from_gc, bool want_ref) { u32 hash_val, key_len = tbl->key_len; struct neighbour *n1, *rc, *n; struct neigh_hash_table *nht; int error; n = neigh_alloc(tbl, dev, flags, exempt_from_gc); trace_neigh_create(tbl, dev, pkey, n, exempt_from_gc); if (!n) { rc = ERR_PTR(-ENOBUFS); goto out; } memcpy(n->primary_key, pkey, key_len); n->dev = dev; netdev_hold(dev, &n->dev_tracker, GFP_ATOMIC); /* Protocol specific setup. */ if (tbl->constructor && (error = tbl->constructor(n)) < 0) { rc = ERR_PTR(error); goto out_neigh_release; } if (dev->netdev_ops->ndo_neigh_construct) { error = dev->netdev_ops->ndo_neigh_construct(dev, n); if (error < 0) { rc = ERR_PTR(error); goto out_neigh_release; } } /* Device specific setup. */ if (n->parms->neigh_setup && (error = n->parms->neigh_setup(n)) < 0) { rc = ERR_PTR(error); goto out_neigh_release; } n->confirmed = jiffies - (NEIGH_VAR(n->parms, BASE_REACHABLE_TIME) << 1); spin_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); if (atomic_read(&tbl->entries) > (1 << nht->hash_shift)) nht = neigh_hash_grow(tbl, nht->hash_shift + 1); hash_val = tbl->hash(n->primary_key, dev, nht->hash_rnd) >> (32 - nht->hash_shift); if (n->parms->dead) { rc = ERR_PTR(-EINVAL); goto out_tbl_unlock; } neigh_for_each_in_bucket(n1, &nht->hash_heads[hash_val]) { if (dev == n1->dev && !memcmp(n1->primary_key, n->primary_key, key_len)) { if (want_ref) neigh_hold(n1); rc = n1; goto out_tbl_unlock; } } n->dead = 0; if (!exempt_from_gc) list_add_tail(&n->gc_list, &n->tbl->gc_list); if (n->flags & NTF_MANAGED) list_add_tail(&n->managed_list, &n->tbl->managed_list); if (want_ref) neigh_hold(n); hlist_add_head_rcu(&n->hash, &nht->hash_heads[hash_val]); hlist_add_head_rcu(&n->dev_list, neigh_get_dev_table(dev, tbl->family)); spin_unlock_bh(&tbl->lock); neigh_dbg(2, "neigh %p is created\n", n); rc = n; out: return rc; out_tbl_unlock: spin_unlock_bh(&tbl->lock); out_neigh_release: if (!exempt_from_gc) atomic_dec(&tbl->gc_entries); neigh_release(n); goto out; } struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, bool want_ref) { bool exempt_from_gc = !!(dev->flags & IFF_LOOPBACK); return ___neigh_create(tbl, pkey, dev, 0, exempt_from_gc, want_ref); } EXPORT_SYMBOL(__neigh_create); static u32 pneigh_hash(const void *pkey, unsigned int key_len) { u32 hash_val = *(u32 *)(pkey + key_len - 4); hash_val ^= (hash_val >> 16); hash_val ^= hash_val >> 8; hash_val ^= hash_val >> 4; hash_val &= PNEIGH_HASHMASK; return hash_val; } struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) { struct pneigh_entry *n; unsigned int key_len; u32 hash_val; key_len = tbl->key_len; hash_val = pneigh_hash(pkey, key_len); n = rcu_dereference_check(tbl->phash_buckets[hash_val], lockdep_is_held(&tbl->phash_lock)); while (n) { if (!memcmp(n->key, pkey, key_len) && net_eq(pneigh_net(n), net) && (n->dev == dev || !n->dev)) return n; n = rcu_dereference_check(n->next, lockdep_is_held(&tbl->phash_lock)); } return NULL; } EXPORT_IPV6_MOD(pneigh_lookup); int pneigh_create(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev, u32 flags, u8 protocol, bool permanent) { struct pneigh_entry *n; unsigned int key_len; u32 hash_val; int err = 0; mutex_lock(&tbl->phash_lock); n = pneigh_lookup(tbl, net, pkey, dev); if (n) goto update; key_len = tbl->key_len; n = kzalloc(sizeof(*n) + key_len, GFP_KERNEL); if (!n) { err = -ENOBUFS; goto out; } write_pnet(&n->net, net); memcpy(n->key, pkey, key_len); n->dev = dev; netdev_hold(dev, &n->dev_tracker, GFP_KERNEL); if (tbl->pconstructor && tbl->pconstructor(n)) { netdev_put(dev, &n->dev_tracker); kfree(n); err = -ENOBUFS; goto out; } hash_val = pneigh_hash(pkey, key_len); n->next = tbl->phash_buckets[hash_val]; rcu_assign_pointer(tbl->phash_buckets[hash_val], n); update: WRITE_ONCE(n->flags, flags); n->permanent = permanent; if (protocol) WRITE_ONCE(n->protocol, protocol); out: mutex_unlock(&tbl->phash_lock); return err; } static void pneigh_destroy(struct rcu_head *rcu) { struct pneigh_entry *n = container_of(rcu, struct pneigh_entry, rcu); netdev_put(n->dev, &n->dev_tracker); kfree(n); } int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) { struct pneigh_entry *n, __rcu **np; unsigned int key_len; u32 hash_val; key_len = tbl->key_len; hash_val = pneigh_hash(pkey, key_len); mutex_lock(&tbl->phash_lock); for (np = &tbl->phash_buckets[hash_val]; (n = rcu_dereference_protected(*np, 1)) != NULL; np = &n->next) { if (!memcmp(n->key, pkey, key_len) && n->dev == dev && net_eq(pneigh_net(n), net)) { rcu_assign_pointer(*np, n->next); mutex_unlock(&tbl->phash_lock); if (tbl->pdestructor) tbl->pdestructor(n); call_rcu(&n->rcu, pneigh_destroy); return 0; } } mutex_unlock(&tbl->phash_lock); return -ENOENT; } static void pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev, bool skip_perm) { struct pneigh_entry *n, __rcu **np; LIST_HEAD(head); u32 h; mutex_lock(&tbl->phash_lock); for (h = 0; h <= PNEIGH_HASHMASK; h++) { np = &tbl->phash_buckets[h]; while ((n = rcu_dereference_protected(*np, 1)) != NULL) { if (skip_perm && n->permanent) goto skip; if (!dev || n->dev == dev) { rcu_assign_pointer(*np, n->next); list_add(&n->free_node, &head); continue; } skip: np = &n->next; } } mutex_unlock(&tbl->phash_lock); while (!list_empty(&head)) { n = list_first_entry(&head, typeof(*n), free_node); list_del(&n->free_node); if (tbl->pdestructor) tbl->pdestructor(n); call_rcu(&n->rcu, pneigh_destroy); } } static inline void neigh_parms_put(struct neigh_parms *parms) { if (refcount_dec_and_test(&parms->refcnt)) kfree(parms); } /* * neighbour must already be out of the table; * */ void neigh_destroy(struct neighbour *neigh) { struct net_device *dev = neigh->dev; NEIGH_CACHE_STAT_INC(neigh->tbl, destroys); if (!neigh->dead) { pr_warn("Destroying alive neighbour %p\n", neigh); dump_stack(); return; } if (neigh_del_timer(neigh)) pr_warn("Impossible event\n"); write_lock_bh(&neigh->lock); __skb_queue_purge(&neigh->arp_queue); write_unlock_bh(&neigh->lock); neigh->arp_queue_len_bytes = 0; if (dev->netdev_ops->ndo_neigh_destroy) dev->netdev_ops->ndo_neigh_destroy(dev, neigh); netdev_put(dev, &neigh->dev_tracker); neigh_parms_put(neigh->parms); neigh_dbg(2, "neigh %p is destroyed\n", neigh); atomic_dec(&neigh->tbl->entries); kfree_rcu(neigh, rcu); } EXPORT_SYMBOL(neigh_destroy); /* Neighbour state is suspicious; disable fast path. Called with write_locked neigh. */ static void neigh_suspect(struct neighbour *neigh) { neigh_dbg(2, "neigh %p is suspected\n", neigh); WRITE_ONCE(neigh->output, neigh->ops->output); } /* Neighbour state is OK; enable fast path. Called with write_locked neigh. */ static void neigh_connect(struct neighbour *neigh) { neigh_dbg(2, "neigh %p is connected\n", neigh); WRITE_ONCE(neigh->output, neigh->ops->connected_output); } static void neigh_periodic_work(struct work_struct *work) { struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work); struct neigh_hash_table *nht; struct hlist_node *tmp; struct neighbour *n; unsigned int i; NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs); spin_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); /* * periodically recompute ReachableTime from random function */ if (time_after(jiffies, tbl->last_rand + 300 * HZ)) { struct neigh_parms *p; WRITE_ONCE(tbl->last_rand, jiffies); list_for_each_entry(p, &tbl->parms_list, list) neigh_set_reach_time(p); } if (atomic_read(&tbl->entries) < READ_ONCE(tbl->gc_thresh1)) goto out; for (i = 0 ; i < (1 << nht->hash_shift); i++) { neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) { unsigned int state; write_lock(&n->lock); state = n->nud_state; if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) || (n->flags & (NTF_EXT_LEARNED | NTF_EXT_VALIDATED))) { write_unlock(&n->lock); continue; } if (time_before(n->used, n->confirmed) && time_is_before_eq_jiffies(n->confirmed)) n->used = n->confirmed; if (refcount_read(&n->refcnt) == 1 && (state == NUD_FAILED || !time_in_range_open(jiffies, n->used, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { hlist_del_rcu(&n->hash); hlist_del_rcu(&n->dev_list); neigh_mark_dead(n); write_unlock(&n->lock); neigh_cleanup_and_release(n); continue; } write_unlock(&n->lock); } /* * It's fine to release lock here, even if hash table * grows while we are preempted. */ spin_unlock_bh(&tbl->lock); cond_resched(); spin_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); } out: /* Cycle through all hash buckets every BASE_REACHABLE_TIME/2 ticks. * ARP entry timeouts range from 1/2 BASE_REACHABLE_TIME to 3/2 * BASE_REACHABLE_TIME. */ queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME) >> 1); spin_unlock_bh(&tbl->lock); } static __inline__ int neigh_max_probes(struct neighbour *n) { struct neigh_parms *p = n->parms; return NEIGH_VAR(p, UCAST_PROBES) + NEIGH_VAR(p, APP_PROBES) + (n->nud_state & NUD_PROBE ? NEIGH_VAR(p, MCAST_REPROBES) : NEIGH_VAR(p, MCAST_PROBES)); } static void neigh_invalidate(struct neighbour *neigh) __releases(neigh->lock) __acquires(neigh->lock) { struct sk_buff *skb; NEIGH_CACHE_STAT_INC(neigh->tbl, res_failed); neigh_dbg(2, "neigh %p is failed\n", neigh); neigh->updated = jiffies; /* It is very thin place. report_unreachable is very complicated routine. Particularly, it can hit the same neighbour entry! So that, we try to be accurate and avoid dead loop. --ANK */ while (neigh->nud_state == NUD_FAILED && (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { write_unlock(&neigh->lock); neigh->ops->error_report(neigh, skb); write_lock(&neigh->lock); } __skb_queue_purge(&neigh->arp_queue); neigh->arp_queue_len_bytes = 0; } static void neigh_probe(struct neighbour *neigh) __releases(neigh->lock) { struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue); /* keep skb alive even if arp_queue overflows */ if (skb) skb = skb_clone(skb, GFP_ATOMIC); write_unlock(&neigh->lock); if (neigh->ops->solicit) neigh->ops->solicit(neigh, skb); atomic_inc(&neigh->probes); consume_skb(skb); } /* Called when a timer expires for a neighbour entry. */ static void neigh_timer_handler(struct timer_list *t) { unsigned long now, next; struct neighbour *neigh = timer_container_of(neigh, t, timer); bool skip_probe = false; unsigned int state; int notify = 0; write_lock(&neigh->lock); state = neigh->nud_state; now = jiffies; next = now + HZ; if (!(state & NUD_IN_TIMER)) goto out; if (state & NUD_REACHABLE) { if (time_before_eq(now, neigh->confirmed + neigh->parms->reachable_time)) { neigh_dbg(2, "neigh %p is still alive\n", neigh); next = neigh->confirmed + neigh->parms->reachable_time; } else if (time_before_eq(now, neigh->used + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) { neigh_dbg(2, "neigh %p is delayed\n", neigh); WRITE_ONCE(neigh->nud_state, NUD_DELAY); neigh->updated = jiffies; neigh_suspect(neigh); next = now + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME); } else { neigh_dbg(2, "neigh %p is suspected\n", neigh); WRITE_ONCE(neigh->nud_state, NUD_STALE); neigh->updated = jiffies; neigh_suspect(neigh); notify = 1; } } else if (state & NUD_DELAY) { if (time_before_eq(now, neigh->confirmed + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) { neigh_dbg(2, "neigh %p is now reachable\n", neigh); WRITE_ONCE(neigh->nud_state, NUD_REACHABLE); neigh->updated = jiffies; neigh_connect(neigh); notify = 1; next = neigh->confirmed + neigh->parms->reachable_time; } else { neigh_dbg(2, "neigh %p is probed\n", neigh); WRITE_ONCE(neigh->nud_state, NUD_PROBE); neigh->updated = jiffies; atomic_set(&neigh->probes, 0); notify = 1; next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/100); } } else { /* NUD_PROBE|NUD_INCOMPLETE */ next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/100); } if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) && atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) { if (neigh->nud_state == NUD_PROBE && neigh->flags & NTF_EXT_VALIDATED) { WRITE_ONCE(neigh->nud_state, NUD_STALE); neigh->updated = jiffies; } else { WRITE_ONCE(neigh->nud_state, NUD_FAILED); neigh_invalidate(neigh); } notify = 1; skip_probe = true; } if (notify) __neigh_notify(neigh, RTM_NEWNEIGH, 0, 0); if (skip_probe) goto out; if (neigh->nud_state & NUD_IN_TIMER) { if (time_before(next, jiffies + HZ/100)) next = jiffies + HZ/100; if (!mod_timer(&neigh->timer, next)) neigh_hold(neigh); } if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) { neigh_probe(neigh); } else { out: write_unlock(&neigh->lock); } if (notify) call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); trace_neigh_timer_handler(neigh, 0); neigh_release(neigh); } int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, const bool immediate_ok) { int rc; bool immediate_probe = false; write_lock_bh(&neigh->lock); rc = 0; if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE)) goto out_unlock_bh; if (neigh->dead) goto out_dead; if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) { if (NEIGH_VAR(neigh->parms, MCAST_PROBES) + NEIGH_VAR(neigh->parms, APP_PROBES)) { unsigned long next, now = jiffies; atomic_set(&neigh->probes, NEIGH_VAR(neigh->parms, UCAST_PROBES)); neigh_del_timer(neigh); WRITE_ONCE(neigh->nud_state, NUD_INCOMPLETE); neigh->updated = now; if (!immediate_ok) { next = now + 1; } else { immediate_probe = true; next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ / 100); } neigh_add_timer(neigh, next); } else { WRITE_ONCE(neigh->nud_state, NUD_FAILED); neigh->updated = jiffies; write_unlock_bh(&neigh->lock); kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_FAILED); return 1; } } else if (neigh->nud_state & NUD_STALE) { neigh_dbg(2, "neigh %p is delayed\n", neigh); neigh_del_timer(neigh); WRITE_ONCE(neigh->nud_state, NUD_DELAY); neigh->updated = jiffies; neigh_add_timer(neigh, jiffies + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME)); } if (neigh->nud_state == NUD_INCOMPLETE) { if (skb) { while (neigh->arp_queue_len_bytes + skb->truesize > NEIGH_VAR(neigh->parms, QUEUE_LEN_BYTES)) { struct sk_buff *buff; buff = __skb_dequeue(&neigh->arp_queue); if (!buff) break; neigh->arp_queue_len_bytes -= buff->truesize; kfree_skb_reason(buff, SKB_DROP_REASON_NEIGH_QUEUEFULL); NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards); } skb_dst_force(skb); __skb_queue_tail(&neigh->arp_queue, skb); neigh->arp_queue_len_bytes += skb->truesize; } rc = 1; } out_unlock_bh: if (immediate_probe) neigh_probe(neigh); else write_unlock(&neigh->lock); local_bh_enable(); trace_neigh_event_send_done(neigh, rc); return rc; out_dead: if (neigh->nud_state & NUD_STALE) goto out_unlock_bh; write_unlock_bh(&neigh->lock); kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_DEAD); trace_neigh_event_send_dead(neigh, 1); return 1; } EXPORT_SYMBOL(__neigh_event_send); static void neigh_update_hhs(struct neighbour *neigh) { struct hh_cache *hh; void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *) = NULL; if (neigh->dev->header_ops) update = neigh->dev->header_ops->cache_update; if (update) { hh = &neigh->hh; if (READ_ONCE(hh->hh_len)) { write_seqlock_bh(&hh->hh_lock); update(hh, neigh->dev, neigh->ha); write_sequnlock_bh(&hh->hh_lock); } } } static void neigh_update_process_arp_queue(struct neighbour *neigh) __releases(neigh->lock) __acquires(neigh->lock) { struct sk_buff *skb; /* Again: avoid deadlock if something went wrong. */ while (neigh->nud_state & NUD_VALID && (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { struct dst_entry *dst = skb_dst(skb); struct neighbour *n2, *n1 = neigh; write_unlock_bh(&neigh->lock); rcu_read_lock(); /* Why not just use 'neigh' as-is? The problem is that * things such as shaper, eql, and sch_teql can end up * using alternative, different, neigh objects to output * the packet in the output path. So what we need to do * here is re-lookup the top-level neigh in the path so * we can reinject the packet there. */ n2 = NULL; if (dst && READ_ONCE(dst->obsolete) != DST_OBSOLETE_DEAD) { n2 = dst_neigh_lookup_skb(dst, skb); if (n2) n1 = n2; } READ_ONCE(n1->output)(n1, skb); if (n2) neigh_release(n2); rcu_read_unlock(); write_lock_bh(&neigh->lock); } __skb_queue_purge(&neigh->arp_queue); neigh->arp_queue_len_bytes = 0; } /* Generic update routine. -- lladdr is new lladdr or NULL, if it is not supplied. -- new is new state. -- flags NEIGH_UPDATE_F_OVERRIDE allows to override existing lladdr, if it is different. NEIGH_UPDATE_F_WEAK_OVERRIDE will suspect existing "connected" lladdr instead of overriding it if it is different. NEIGH_UPDATE_F_ADMIN means that the change is administrative. NEIGH_UPDATE_F_USE means that the entry is user triggered. NEIGH_UPDATE_F_MANAGED means that the entry will be auto-refreshed. NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing NTF_ROUTER flag. NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as a router. NEIGH_UPDATE_F_EXT_VALIDATED means that the entry will not be removed or invalidated. Caller MUST hold reference count on the entry. */ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid, struct netlink_ext_ack *extack) { bool gc_update = false, managed_update = false; bool process_arp_queue = false; int update_isrouter = 0; struct net_device *dev; int err, notify = 0; u8 old; trace_neigh_update(neigh, lladdr, new, flags, nlmsg_pid); write_lock_bh(&neigh->lock); dev = neigh->dev; old = neigh->nud_state; err = -EPERM; if (neigh->dead) { NL_SET_ERR_MSG(extack, "Neighbor entry is now dead"); new = old; goto out; } if (!(flags & NEIGH_UPDATE_F_ADMIN) && (old & (NUD_NOARP | NUD_PERMANENT))) goto out; neigh_update_flags(neigh, flags, &notify, &gc_update, &managed_update); if (flags & (NEIGH_UPDATE_F_USE | NEIGH_UPDATE_F_MANAGED)) { new = old & ~NUD_PERMANENT; WRITE_ONCE(neigh->nud_state, new); err = 0; goto out; } if (!(new & NUD_VALID)) { neigh_del_timer(neigh); if (old & NUD_CONNECTED) neigh_suspect(neigh); WRITE_ONCE(neigh->nud_state, new); err = 0; notify = old & NUD_VALID; if ((old & (NUD_INCOMPLETE | NUD_PROBE)) && (new & NUD_FAILED)) { neigh_invalidate(neigh); notify = 1; } goto out; } /* Compare new lladdr with cached one */ if (!dev->addr_len) { /* First case: device needs no address. */ lladdr = neigh->ha; } else if (lladdr) { /* The second case: if something is already cached and a new address is proposed: - compare new & old - if they are different, check override flag */ if ((old & NUD_VALID) && !memcmp(lladdr, neigh->ha, dev->addr_len)) lladdr = neigh->ha; } else { /* No address is supplied; if we know something, use it, otherwise discard the request. */ err = -EINVAL; if (!(old & NUD_VALID)) { NL_SET_ERR_MSG(extack, "No link layer address given"); goto out; } lladdr = neigh->ha; } /* Update confirmed timestamp for neighbour entry after we * received ARP packet even if it doesn't change IP to MAC binding. */ if (new & NUD_CONNECTED) neigh->confirmed = jiffies; /* If entry was valid and address is not changed, do not change entry state, if new one is STALE. */ err = 0; update_isrouter = flags & NEIGH_UPDATE_F_OVERRIDE_ISROUTER; if (old & NUD_VALID) { if (lladdr != neigh->ha && !(flags & NEIGH_UPDATE_F_OVERRIDE)) { update_isrouter = 0; if ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) && (old & NUD_CONNECTED)) { lladdr = neigh->ha; new = NUD_STALE; } else goto out; } else { if (lladdr == neigh->ha && new == NUD_STALE && !(flags & NEIGH_UPDATE_F_ADMIN)) new = old; } } /* Update timestamp only once we know we will make a change to the * neighbour entry. Otherwise we risk to move the locktime window with * noop updates and ignore relevant ARP updates. */ if (new != old || lladdr != neigh->ha) neigh->updated = jiffies; if (new != old) { neigh_del_timer(neigh); if (new & NUD_PROBE) atomic_set(&neigh->probes, 0); if (new & NUD_IN_TIMER) neigh_add_timer(neigh, (jiffies + ((new & NUD_REACHABLE) ? neigh->parms->reachable_time : 0))); WRITE_ONCE(neigh->nud_state, new); notify = 1; } if (lladdr != neigh->ha) { write_seqlock(&neigh->ha_lock); memcpy(&neigh->ha, lladdr, dev->addr_len); write_sequnlock(&neigh->ha_lock); neigh_update_hhs(neigh); if (!(new & NUD_CONNECTED)) neigh->confirmed = jiffies - (NEIGH_VAR(neigh->parms, BASE_REACHABLE_TIME) << 1); notify = 1; } if (new == old) goto out; if (new & NUD_CONNECTED) neigh_connect(neigh); else neigh_suspect(neigh); if (!(old & NUD_VALID)) process_arp_queue = true; out: if (update_isrouter) neigh_update_is_router(neigh, flags, &notify); if (notify) __neigh_notify(neigh, RTM_NEWNEIGH, 0, nlmsg_pid); if (process_arp_queue) neigh_update_process_arp_queue(neigh); write_unlock_bh(&neigh->lock); if (((new ^ old) & NUD_PERMANENT) || gc_update) neigh_update_gc_list(neigh); if (managed_update) neigh_update_managed_list(neigh); if (notify) call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); trace_neigh_update_done(neigh, err); return err; } int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid) { return __neigh_update(neigh, lladdr, new, flags, nlmsg_pid, NULL); } EXPORT_SYMBOL(neigh_update); /* Update the neigh to listen temporarily for probe responses, even if it is * in a NUD_FAILED state. The caller has to hold neigh->lock for writing. */ void __neigh_set_probe_once(struct neighbour *neigh) { if (neigh->dead) return; neigh->updated = jiffies; if (!(neigh->nud_state & NUD_FAILED)) return; WRITE_ONCE(neigh->nud_state, NUD_INCOMPLETE); atomic_set(&neigh->probes, neigh_max_probes(neigh)); neigh_add_timer(neigh, jiffies + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/100)); } EXPORT_SYMBOL(__neigh_set_probe_once); struct neighbour *neigh_event_ns(struct neigh_table *tbl, u8 *lladdr, void *saddr, struct net_device *dev) { struct neighbour *neigh = __neigh_lookup(tbl, saddr, dev, lladdr || !dev->addr_len); if (neigh) neigh_update(neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_OVERRIDE, 0); return neigh; } EXPORT_SYMBOL(neigh_event_ns); /* called with read_lock_bh(&n->lock); */ static void neigh_hh_init(struct neighbour *n) { struct net_device *dev = n->dev; __be16 prot = n->tbl->protocol; struct hh_cache *hh = &n->hh; write_lock_bh(&n->lock); /* Only one thread can come in here and initialize the * hh_cache entry. */ if (!hh->hh_len) dev->header_ops->cache(n, hh, prot); write_unlock_bh(&n->lock); } /* Slow and careful. */ int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb) { int rc = 0; if (!neigh_event_send(neigh, skb)) { int err; struct net_device *dev = neigh->dev; unsigned int seq; if (dev->header_ops->cache && !READ_ONCE(neigh->hh.hh_len)) neigh_hh_init(neigh); do { __skb_pull(skb, skb_network_offset(skb)); seq = read_seqbegin(&neigh->ha_lock); err = dev_hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len); } while (read_seqretry(&neigh->ha_lock, seq)); if (err >= 0) rc = dev_queue_xmit(skb); else goto out_kfree_skb; } out: return rc; out_kfree_skb: rc = -EINVAL; kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_HH_FILLFAIL); goto out; } EXPORT_SYMBOL(neigh_resolve_output); /* As fast as possible without hh cache */ int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb) { struct net_device *dev = neigh->dev; unsigned int seq; int err; do { __skb_pull(skb, skb_network_offset(skb)); seq = read_seqbegin(&neigh->ha_lock); err = dev_hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len); } while (read_seqretry(&neigh->ha_lock, seq)); if (err >= 0) err = dev_queue_xmit(skb); else { err = -EINVAL; kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_HH_FILLFAIL); } return err; } EXPORT_SYMBOL(neigh_connected_output); int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb) { return dev_queue_xmit(skb); } EXPORT_SYMBOL(neigh_direct_output); static void neigh_managed_work(struct work_struct *work) { struct neigh_table *tbl = container_of(work, struct neigh_table, managed_work.work); struct neighbour *neigh; spin_lock_bh(&tbl->lock); list_for_each_entry(neigh, &tbl->managed_list, managed_list) neigh_event_send_probe(neigh, NULL, false); queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, NEIGH_VAR(&tbl->parms, INTERVAL_PROBE_TIME_MS)); spin_unlock_bh(&tbl->lock); } static void neigh_proxy_process(struct timer_list *t) { struct neigh_table *tbl = timer_container_of(tbl, t, proxy_timer); long sched_next = 0; unsigned long now = jiffies; struct sk_buff *skb, *n; spin_lock(&tbl->proxy_queue.lock); skb_queue_walk_safe(&tbl->proxy_queue, skb, n) { long tdif = NEIGH_CB(skb)->sched_next - now; if (tdif <= 0) { struct net_device *dev = skb->dev; neigh_parms_qlen_dec(dev, tbl->family); __skb_unlink(skb, &tbl->proxy_queue); if (tbl->proxy_redo && netif_running(dev)) { rcu_read_lock(); tbl->proxy_redo(skb); rcu_read_unlock(); } else { kfree_skb(skb); } dev_put(dev); } else if (!sched_next || tdif < sched_next) sched_next = tdif; } timer_delete(&tbl->proxy_timer); if (sched_next) mod_timer(&tbl->proxy_timer, jiffies + sched_next); spin_unlock(&tbl->proxy_queue.lock); } static unsigned long neigh_proxy_delay(struct neigh_parms *p) { /* If proxy_delay is zero, do not call get_random_u32_below() * as it is undefined behavior. */ unsigned long proxy_delay = NEIGH_VAR(p, PROXY_DELAY); return proxy_delay ? jiffies + get_random_u32_below(proxy_delay) : jiffies; } void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb) { unsigned long sched_next = neigh_proxy_delay(p); if (p->qlen > NEIGH_VAR(p, PROXY_QLEN)) { kfree_skb(skb); return; } NEIGH_CB(skb)->sched_next = sched_next; NEIGH_CB(skb)->flags |= LOCALLY_ENQUEUED; spin_lock(&tbl->proxy_queue.lock); if (timer_delete(&tbl->proxy_timer)) { if (time_before(tbl->proxy_timer.expires, sched_next)) sched_next = tbl->proxy_timer.expires; } skb_dst_drop(skb); dev_hold(skb->dev); __skb_queue_tail(&tbl->proxy_queue, skb); p->qlen++; mod_timer(&tbl->proxy_timer, sched_next); spin_unlock(&tbl->proxy_queue.lock); } EXPORT_SYMBOL(pneigh_enqueue); static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl, struct net *net, int ifindex) { struct neigh_parms *p; list_for_each_entry(p, &tbl->parms_list, list) { if ((p->dev && p->dev->ifindex == ifindex && net_eq(neigh_parms_net(p), net)) || (!p->dev && !ifindex && net_eq(net, &init_net))) return p; } return NULL; } struct neigh_parms *neigh_parms_alloc(struct net_device *dev, struct neigh_table *tbl) { struct neigh_parms *p; struct net *net = dev_net(dev); const struct net_device_ops *ops = dev->netdev_ops; p = kmemdup(&tbl->parms, sizeof(*p), GFP_KERNEL); if (p) { p->tbl = tbl; refcount_set(&p->refcnt, 1); neigh_set_reach_time(p); p->qlen = 0; netdev_hold(dev, &p->dev_tracker, GFP_KERNEL); p->dev = dev; write_pnet(&p->net, net); p->sysctl_table = NULL; if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) { netdev_put(dev, &p->dev_tracker); kfree(p); return NULL; } spin_lock_bh(&tbl->lock); list_add_rcu(&p->list, &tbl->parms.list); spin_unlock_bh(&tbl->lock); neigh_parms_data_state_cleanall(p); } return p; } EXPORT_SYMBOL(neigh_parms_alloc); static void neigh_rcu_free_parms(struct rcu_head *head) { struct neigh_parms *parms = container_of(head, struct neigh_parms, rcu_head); neigh_parms_put(parms); } void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) { if (!parms || parms == &tbl->parms) return; spin_lock_bh(&tbl->lock); list_del_rcu(&parms->list); parms->dead = 1; spin_unlock_bh(&tbl->lock); netdev_put(parms->dev, &parms->dev_tracker); call_rcu(&parms->rcu_head, neigh_rcu_free_parms); } EXPORT_SYMBOL(neigh_parms_release); static struct lock_class_key neigh_table_proxy_queue_class; static struct neigh_table __rcu *neigh_tables[NEIGH_NR_TABLES] __read_mostly; void neigh_table_init(int index, struct neigh_table *tbl) { unsigned long now = jiffies; unsigned long phsize; INIT_LIST_HEAD(&tbl->parms_list); INIT_LIST_HEAD(&tbl->gc_list); INIT_LIST_HEAD(&tbl->managed_list); list_add(&tbl->parms.list, &tbl->parms_list); write_pnet(&tbl->parms.net, &init_net); refcount_set(&tbl->parms.refcnt, 1); neigh_set_reach_time(&tbl->parms); tbl->parms.qlen = 0; tbl->stats = alloc_percpu(struct neigh_statistics); if (!tbl->stats) panic("cannot create neighbour cache statistics"); #ifdef CONFIG_PROC_FS if (!proc_create_seq_data(tbl->id, 0, init_net.proc_net_stat, &neigh_stat_seq_ops, tbl)) panic("cannot create neighbour proc dir entry"); #endif RCU_INIT_POINTER(tbl->nht, neigh_hash_alloc(3)); phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *); tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL); if (!tbl->nht || !tbl->phash_buckets) panic("cannot allocate neighbour cache hashes"); if (!tbl->entry_size) tbl->entry_size = ALIGN(offsetof(struct neighbour, primary_key) + tbl->key_len, NEIGH_PRIV_ALIGN); else WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN); spin_lock_init(&tbl->lock); mutex_init(&tbl->phash_lock); INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work); queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, tbl->parms.reachable_time); INIT_DEFERRABLE_WORK(&tbl->managed_work, neigh_managed_work); queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, 0); timer_setup(&tbl->proxy_timer, neigh_proxy_process, 0); skb_queue_head_init_class(&tbl->proxy_queue, &neigh_table_proxy_queue_class); tbl->last_flush = now; tbl->last_rand = now + tbl->parms.reachable_time * 20; rcu_assign_pointer(neigh_tables[index], tbl); } EXPORT_SYMBOL(neigh_table_init); /* * Only called from ndisc_cleanup(), which means this is dead code * because we no longer can unload IPv6 module. */ int neigh_table_clear(int index, struct neigh_table *tbl) { RCU_INIT_POINTER(neigh_tables[index], NULL); synchronize_rcu(); /* It is not clean... Fix it to unload IPv6 module safely */ cancel_delayed_work_sync(&tbl->managed_work); cancel_delayed_work_sync(&tbl->gc_work); timer_delete_sync(&tbl->proxy_timer); pneigh_queue_purge(&tbl->proxy_queue, NULL, tbl->family); neigh_ifdown(tbl, NULL); if (atomic_read(&tbl->entries)) pr_crit("neighbour leakage\n"); call_rcu(&rcu_dereference_protected(tbl->nht, 1)->rcu, neigh_hash_free_rcu); tbl->nht = NULL; kfree(tbl->phash_buckets); tbl->phash_buckets = NULL; remove_proc_entry(tbl->id, init_net.proc_net_stat); free_percpu(tbl->stats); tbl->stats = NULL; return 0; } EXPORT_SYMBOL(neigh_table_clear); static struct neigh_table *neigh_find_table(int family) { struct neigh_table *tbl = NULL; switch (family) { case AF_INET: tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ARP_TABLE]); break; case AF_INET6: tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ND_TABLE]); break; } return tbl; } const struct nla_policy nda_policy[NDA_MAX+1] = { [NDA_UNSPEC] = { .strict_start_type = NDA_NH_ID }, [NDA_DST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [NDA_CACHEINFO] = { .len = sizeof(struct nda_cacheinfo) }, [NDA_PROBES] = { .type = NLA_U32 }, [NDA_VLAN] = { .type = NLA_U16 }, [NDA_PORT] = { .type = NLA_U16 }, [NDA_VNI] = { .type = NLA_U32 }, [NDA_IFINDEX] = { .type = NLA_U32 }, [NDA_MASTER] = { .type = NLA_U32 }, [NDA_PROTOCOL] = { .type = NLA_U8 }, [NDA_NH_ID] = { .type = NLA_U32 }, [NDA_FLAGS_EXT] = NLA_POLICY_MASK(NLA_U32, NTF_EXT_MASK), [NDA_FDB_EXT_ATTRS] = { .type = NLA_NESTED }, }; static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *dst_attr; struct neigh_table *tbl; struct neighbour *neigh; struct net_device *dev = NULL; int err = -EINVAL; ASSERT_RTNL(); if (nlmsg_len(nlh) < sizeof(*ndm)) goto out; dst_attr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_DST); if (!dst_attr) { NL_SET_ERR_MSG(extack, "Network address not specified"); goto out; } ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex) { dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { err = -ENODEV; goto out; } } tbl = neigh_find_table(ndm->ndm_family); if (tbl == NULL) return -EAFNOSUPPORT; if (nla_len(dst_attr) < (int)tbl->key_len) { NL_SET_ERR_MSG(extack, "Invalid network address"); goto out; } if (ndm->ndm_flags & NTF_PROXY) { err = pneigh_delete(tbl, net, nla_data(dst_attr), dev); goto out; } if (dev == NULL) goto out; neigh = neigh_lookup(tbl, nla_data(dst_attr), dev); if (neigh == NULL) { err = -ENOENT; goto out; } err = __neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN, NETLINK_CB(skb).portid, extack); spin_lock_bh(&tbl->lock); neigh_release(neigh); neigh_remove_one(neigh); spin_unlock_bh(&tbl->lock); out: return err; } static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_OVERRIDE_ISROUTER; struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct neigh_table *tbl; struct net_device *dev = NULL; struct neighbour *neigh; void *dst, *lladdr; u8 protocol = 0; u32 ndm_flags; int err; ASSERT_RTNL(); err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, nda_policy, extack); if (err < 0) goto out; err = -EINVAL; if (!tb[NDA_DST]) { NL_SET_ERR_MSG(extack, "Network address not specified"); goto out; } ndm = nlmsg_data(nlh); ndm_flags = ndm->ndm_flags; if (tb[NDA_FLAGS_EXT]) { u32 ext = nla_get_u32(tb[NDA_FLAGS_EXT]); BUILD_BUG_ON(sizeof(neigh->flags) * BITS_PER_BYTE < (sizeof(ndm->ndm_flags) * BITS_PER_BYTE + hweight32(NTF_EXT_MASK))); ndm_flags |= (ext << NTF_EXT_SHIFT); } if (ndm->ndm_ifindex) { dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { err = -ENODEV; goto out; } if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len) { NL_SET_ERR_MSG(extack, "Invalid link address"); goto out; } } tbl = neigh_find_table(ndm->ndm_family); if (tbl == NULL) return -EAFNOSUPPORT; if (nla_len(tb[NDA_DST]) < (int)tbl->key_len) { NL_SET_ERR_MSG(extack, "Invalid network address"); goto out; } dst = nla_data(tb[NDA_DST]); lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL; if (tb[NDA_PROTOCOL]) protocol = nla_get_u8(tb[NDA_PROTOCOL]); if (ndm_flags & NTF_PROXY) { if (ndm_flags & (NTF_MANAGED | NTF_EXT_VALIDATED)) { NL_SET_ERR_MSG(extack, "Invalid NTF_* flag combination"); goto out; } err = pneigh_create(tbl, net, dst, dev, ndm_flags, protocol, !!(ndm->ndm_state & NUD_PERMANENT)); goto out; } if (!dev) { NL_SET_ERR_MSG(extack, "Device not specified"); goto out; } if (tbl->allow_add && !tbl->allow_add(dev, extack)) { err = -EINVAL; goto out; } neigh = neigh_lookup(tbl, dst, dev); if (neigh == NULL) { bool ndm_permanent = ndm->ndm_state & NUD_PERMANENT; bool exempt_from_gc = ndm_permanent || ndm_flags & (NTF_EXT_LEARNED | NTF_EXT_VALIDATED); if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { err = -ENOENT; goto out; } if (ndm_permanent && (ndm_flags & NTF_MANAGED)) { NL_SET_ERR_MSG(extack, "Invalid NTF_* flag for permanent entry"); err = -EINVAL; goto out; } if (ndm_flags & NTF_EXT_VALIDATED) { u8 state = ndm->ndm_state; /* NTF_USE and NTF_MANAGED will result in the neighbor * being created with an invalid state (NUD_NONE). */ if (ndm_flags & (NTF_USE | NTF_MANAGED)) state = NUD_NONE; if (!(state & NUD_VALID)) { NL_SET_ERR_MSG(extack, "Cannot create externally validated neighbor with an invalid state"); err = -EINVAL; goto out; } } neigh = ___neigh_create(tbl, dst, dev, ndm_flags & (NTF_EXT_LEARNED | NTF_MANAGED | NTF_EXT_VALIDATED), exempt_from_gc, true); if (IS_ERR(neigh)) { err = PTR_ERR(neigh); goto out; } } else { if (nlh->nlmsg_flags & NLM_F_EXCL) { err = -EEXIST; neigh_release(neigh); goto out; } if (ndm_flags & NTF_EXT_VALIDATED) { u8 state = ndm->ndm_state; /* NTF_USE and NTF_MANAGED do not update the existing * state other than clearing it if it was * NUD_PERMANENT. */ if (ndm_flags & (NTF_USE | NTF_MANAGED)) state = READ_ONCE(neigh->nud_state) & ~NUD_PERMANENT; if (!(state & NUD_VALID)) { NL_SET_ERR_MSG(extack, "Cannot mark neighbor as externally validated with an invalid state"); err = -EINVAL; neigh_release(neigh); goto out; } } if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) flags &= ~(NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_OVERRIDE_ISROUTER); } if (protocol) neigh->protocol = protocol; if (ndm_flags & NTF_EXT_LEARNED) flags |= NEIGH_UPDATE_F_EXT_LEARNED; if (ndm_flags & NTF_ROUTER) flags |= NEIGH_UPDATE_F_ISROUTER; if (ndm_flags & NTF_MANAGED) flags |= NEIGH_UPDATE_F_MANAGED; if (ndm_flags & NTF_USE) flags |= NEIGH_UPDATE_F_USE; if (ndm_flags & NTF_EXT_VALIDATED) flags |= NEIGH_UPDATE_F_EXT_VALIDATED; err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags, NETLINK_CB(skb).portid, extack); if (!err && ndm_flags & (NTF_USE | NTF_MANAGED)) neigh_event_send(neigh, NULL); neigh_release(neigh); out: return err; } static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms) { struct nlattr *nest; nest = nla_nest_start_noflag(skb, NDTA_PARMS); if (nest == NULL) return -ENOBUFS; if ((parms->dev && nla_put_u32(skb, NDTPA_IFINDEX, READ_ONCE(parms->dev->ifindex))) || nla_put_u32(skb, NDTPA_REFCNT, refcount_read(&parms->refcnt)) || nla_put_u32(skb, NDTPA_QUEUE_LENBYTES, NEIGH_VAR(parms, QUEUE_LEN_BYTES)) || /* approximative value for deprecated QUEUE_LEN (in packets) */ nla_put_u32(skb, NDTPA_QUEUE_LEN, NEIGH_VAR(parms, QUEUE_LEN_BYTES) / SKB_TRUESIZE(ETH_FRAME_LEN)) || nla_put_u32(skb, NDTPA_PROXY_QLEN, NEIGH_VAR(parms, PROXY_QLEN)) || nla_put_u32(skb, NDTPA_APP_PROBES, NEIGH_VAR(parms, APP_PROBES)) || nla_put_u32(skb, NDTPA_UCAST_PROBES, NEIGH_VAR(parms, UCAST_PROBES)) || nla_put_u32(skb, NDTPA_MCAST_PROBES, NEIGH_VAR(parms, MCAST_PROBES)) || nla_put_u32(skb, NDTPA_MCAST_REPROBES, NEIGH_VAR(parms, MCAST_REPROBES)) || nla_put_msecs(skb, NDTPA_REACHABLE_TIME, READ_ONCE(parms->reachable_time), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_BASE_REACHABLE_TIME, NEIGH_VAR(parms, BASE_REACHABLE_TIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_GC_STALETIME, NEIGH_VAR(parms, GC_STALETIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_DELAY_PROBE_TIME, NEIGH_VAR(parms, DELAY_PROBE_TIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_RETRANS_TIME, NEIGH_VAR(parms, RETRANS_TIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_ANYCAST_DELAY, NEIGH_VAR(parms, ANYCAST_DELAY), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_PROXY_DELAY, NEIGH_VAR(parms, PROXY_DELAY), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_LOCKTIME, NEIGH_VAR(parms, LOCKTIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_INTERVAL_PROBE_TIME_MS, NEIGH_VAR(parms, INTERVAL_PROBE_TIME_MS), NDTPA_PAD)) goto nla_put_failure; return nla_nest_end(skb, nest); nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, u32 pid, u32 seq, int type, int flags) { struct nlmsghdr *nlh; struct ndtmsg *ndtmsg; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags); if (nlh == NULL) return -EMSGSIZE; ndtmsg = nlmsg_data(nlh); ndtmsg->ndtm_family = tbl->family; ndtmsg->ndtm_pad1 = 0; ndtmsg->ndtm_pad2 = 0; if (nla_put_string(skb, NDTA_NAME, tbl->id) || nla_put_msecs(skb, NDTA_GC_INTERVAL, READ_ONCE(tbl->gc_interval), NDTA_PAD) || nla_put_u32(skb, NDTA_THRESH1, READ_ONCE(tbl->gc_thresh1)) || nla_put_u32(skb, NDTA_THRESH2, READ_ONCE(tbl->gc_thresh2)) || nla_put_u32(skb, NDTA_THRESH3, READ_ONCE(tbl->gc_thresh3))) goto nla_put_failure; { unsigned long now = jiffies; long flush_delta = now - READ_ONCE(tbl->last_flush); long rand_delta = now - READ_ONCE(tbl->last_rand); struct neigh_hash_table *nht; struct ndt_config ndc = { .ndtc_key_len = tbl->key_len, .ndtc_entry_size = tbl->entry_size, .ndtc_entries = atomic_read(&tbl->entries), .ndtc_last_flush = jiffies_to_msecs(flush_delta), .ndtc_last_rand = jiffies_to_msecs(rand_delta), .ndtc_proxy_qlen = READ_ONCE(tbl->proxy_queue.qlen), }; nht = rcu_dereference(tbl->nht); ndc.ndtc_hash_rnd = nht->hash_rnd[0]; ndc.ndtc_hash_mask = ((1 << nht->hash_shift) - 1); if (nla_put(skb, NDTA_CONFIG, sizeof(ndc), &ndc)) goto nla_put_failure; } { int cpu; struct ndt_stats ndst; memset(&ndst, 0, sizeof(ndst)); for_each_possible_cpu(cpu) { struct neigh_statistics *st; st = per_cpu_ptr(tbl->stats, cpu); ndst.ndts_allocs += READ_ONCE(st->allocs); ndst.ndts_destroys += READ_ONCE(st->destroys); ndst.ndts_hash_grows += READ_ONCE(st->hash_grows); ndst.ndts_res_failed += READ_ONCE(st->res_failed); ndst.ndts_lookups += READ_ONCE(st->lookups); ndst.ndts_hits += READ_ONCE(st->hits); ndst.ndts_rcv_probes_mcast += READ_ONCE(st->rcv_probes_mcast); ndst.ndts_rcv_probes_ucast += READ_ONCE(st->rcv_probes_ucast); ndst.ndts_periodic_gc_runs += READ_ONCE(st->periodic_gc_runs); ndst.ndts_forced_gc_runs += READ_ONCE(st->forced_gc_runs); ndst.ndts_table_fulls += READ_ONCE(st->table_fulls); } if (nla_put_64bit(skb, NDTA_STATS, sizeof(ndst), &ndst, NDTA_PAD)) goto nla_put_failure; } BUG_ON(tbl->parms.dev); if (neightbl_fill_parms(skb, &tbl->parms) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int neightbl_fill_param_info(struct sk_buff *skb, struct neigh_table *tbl, struct neigh_parms *parms, u32 pid, u32 seq, int type, unsigned int flags) { struct ndtmsg *ndtmsg; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags); if (nlh == NULL) return -EMSGSIZE; ndtmsg = nlmsg_data(nlh); ndtmsg->ndtm_family = tbl->family; ndtmsg->ndtm_pad1 = 0; ndtmsg->ndtm_pad2 = 0; if (nla_put_string(skb, NDTA_NAME, tbl->id) < 0 || neightbl_fill_parms(skb, parms) < 0) goto errout; nlmsg_end(skb, nlh); return 0; errout: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static const struct nla_policy nl_neightbl_policy[NDTA_MAX+1] = { [NDTA_NAME] = { .type = NLA_STRING }, [NDTA_THRESH1] = { .type = NLA_U32 }, [NDTA_THRESH2] = { .type = NLA_U32 }, [NDTA_THRESH3] = { .type = NLA_U32 }, [NDTA_GC_INTERVAL] = { .type = NLA_U64 }, [NDTA_PARMS] = { .type = NLA_NESTED }, }; static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = { [NDTPA_IFINDEX] = { .type = NLA_U32 }, [NDTPA_QUEUE_LEN] = { .type = NLA_U32 }, [NDTPA_QUEUE_LENBYTES] = { .type = NLA_U32 }, [NDTPA_PROXY_QLEN] = { .type = NLA_U32 }, [NDTPA_APP_PROBES] = { .type = NLA_U32 }, [NDTPA_UCAST_PROBES] = { .type = NLA_U32 }, [NDTPA_MCAST_PROBES] = { .type = NLA_U32 }, [NDTPA_MCAST_REPROBES] = { .type = NLA_U32 }, [NDTPA_BASE_REACHABLE_TIME] = { .type = NLA_U64 }, [NDTPA_GC_STALETIME] = { .type = NLA_U64 }, [NDTPA_DELAY_PROBE_TIME] = { .type = NLA_U64 }, [NDTPA_RETRANS_TIME] = { .type = NLA_U64 }, [NDTPA_ANYCAST_DELAY] = { .type = NLA_U64 }, [NDTPA_PROXY_DELAY] = { .type = NLA_U64 }, [NDTPA_LOCKTIME] = { .type = NLA_U64 }, [NDTPA_INTERVAL_PROBE_TIME_MS] = { .type = NLA_U64, .min = 1 }, }; static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tb[NDTA_MAX + 1]; struct neigh_table *tbl; struct ndtmsg *ndtmsg; bool found = false; int err, tidx; err = nlmsg_parse_deprecated(nlh, sizeof(*ndtmsg), tb, NDTA_MAX, nl_neightbl_policy, extack); if (err < 0) goto errout; if (tb[NDTA_NAME] == NULL) { err = -EINVAL; goto errout; } ndtmsg = nlmsg_data(nlh); rcu_read_lock(); for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { tbl = rcu_dereference(neigh_tables[tidx]); if (!tbl) continue; if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family) continue; if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0) { found = true; break; } } if (!found) { rcu_read_unlock(); err = -ENOENT; goto errout; } /* * We acquire tbl->lock to be nice to the periodic timers and * make sure they always see a consistent set of values. */ spin_lock_bh(&tbl->lock); if (tb[NDTA_PARMS]) { struct nlattr *tbp[NDTPA_MAX+1]; struct neigh_parms *p; int i, ifindex = 0; err = nla_parse_nested_deprecated(tbp, NDTPA_MAX, tb[NDTA_PARMS], nl_ntbl_parm_policy, extack); if (err < 0) goto errout_tbl_lock; if (tbp[NDTPA_IFINDEX]) ifindex = nla_get_u32(tbp[NDTPA_IFINDEX]); p = lookup_neigh_parms(tbl, net, ifindex); if (p == NULL) { err = -ENOENT; goto errout_tbl_lock; } for (i = 1; i <= NDTPA_MAX; i++) { if (tbp[i] == NULL) continue; switch (i) { case NDTPA_QUEUE_LEN: NEIGH_VAR_SET(p, QUEUE_LEN_BYTES, nla_get_u32(tbp[i]) * SKB_TRUESIZE(ETH_FRAME_LEN)); break; case NDTPA_QUEUE_LENBYTES: NEIGH_VAR_SET(p, QUEUE_LEN_BYTES, nla_get_u32(tbp[i])); break; case NDTPA_PROXY_QLEN: NEIGH_VAR_SET(p, PROXY_QLEN, nla_get_u32(tbp[i])); break; case NDTPA_APP_PROBES: NEIGH_VAR_SET(p, APP_PROBES, nla_get_u32(tbp[i])); break; case NDTPA_UCAST_PROBES: NEIGH_VAR_SET(p, UCAST_PROBES, nla_get_u32(tbp[i])); break; case NDTPA_MCAST_PROBES: NEIGH_VAR_SET(p, MCAST_PROBES, nla_get_u32(tbp[i])); break; case NDTPA_MCAST_REPROBES: NEIGH_VAR_SET(p, MCAST_REPROBES, nla_get_u32(tbp[i])); break; case NDTPA_BASE_REACHABLE_TIME: NEIGH_VAR_SET(p, BASE_REACHABLE_TIME, nla_get_msecs(tbp[i])); /* update reachable_time as well, otherwise, the change will * only be effective after the next time neigh_periodic_work * decides to recompute it (can be multiple minutes) */ neigh_set_reach_time(p); break; case NDTPA_GC_STALETIME: NEIGH_VAR_SET(p, GC_STALETIME, nla_get_msecs(tbp[i])); break; case NDTPA_DELAY_PROBE_TIME: NEIGH_VAR_SET(p, DELAY_PROBE_TIME, nla_get_msecs(tbp[i])); call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); break; case NDTPA_INTERVAL_PROBE_TIME_MS: NEIGH_VAR_SET(p, INTERVAL_PROBE_TIME_MS, nla_get_msecs(tbp[i])); break; case NDTPA_RETRANS_TIME: NEIGH_VAR_SET(p, RETRANS_TIME, nla_get_msecs(tbp[i])); break; case NDTPA_ANYCAST_DELAY: NEIGH_VAR_SET(p, ANYCAST_DELAY, nla_get_msecs(tbp[i])); break; case NDTPA_PROXY_DELAY: NEIGH_VAR_SET(p, PROXY_DELAY, nla_get_msecs(tbp[i])); break; case NDTPA_LOCKTIME: NEIGH_VAR_SET(p, LOCKTIME, nla_get_msecs(tbp[i])); break; } } } err = -ENOENT; if ((tb[NDTA_THRESH1] || tb[NDTA_THRESH2] || tb[NDTA_THRESH3] || tb[NDTA_GC_INTERVAL]) && !net_eq(net, &init_net)) goto errout_tbl_lock; if (tb[NDTA_THRESH1]) WRITE_ONCE(tbl->gc_thresh1, nla_get_u32(tb[NDTA_THRESH1])); if (tb[NDTA_THRESH2]) WRITE_ONCE(tbl->gc_thresh2, nla_get_u32(tb[NDTA_THRESH2])); if (tb[NDTA_THRESH3]) WRITE_ONCE(tbl->gc_thresh3, nla_get_u32(tb[NDTA_THRESH3])); if (tb[NDTA_GC_INTERVAL]) WRITE_ONCE(tbl->gc_interval, nla_get_msecs(tb[NDTA_GC_INTERVAL])); err = 0; errout_tbl_lock: spin_unlock_bh(&tbl->lock); rcu_read_unlock(); errout: return err; } static int neightbl_valid_dump_info(const struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct ndtmsg *ndtm; ndtm = nlmsg_payload(nlh, sizeof(*ndtm)); if (!ndtm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor table dump request"); return -EINVAL; } if (ndtm->ndtm_pad1 || ndtm->ndtm_pad2) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor table dump request"); return -EINVAL; } if (nlmsg_attrlen(nlh, sizeof(*ndtm))) { NL_SET_ERR_MSG(extack, "Invalid data after header in neighbor table dump request"); return -EINVAL; } return 0; } static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); int family, tidx, nidx = 0; int tbl_skip = cb->args[0]; int neigh_skip = cb->args[1]; struct neigh_table *tbl; if (cb->strict_check) { int err = neightbl_valid_dump_info(nlh, cb->extack); if (err < 0) return err; } family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; rcu_read_lock(); for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { struct neigh_parms *p; tbl = rcu_dereference(neigh_tables[tidx]); if (!tbl) continue; if (tidx < tbl_skip || (family && tbl->family != family)) continue; if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWNEIGHTBL, NLM_F_MULTI) < 0) break; nidx = 0; p = list_next_entry(&tbl->parms, list); list_for_each_entry_from_rcu(p, &tbl->parms_list, list) { if (!net_eq(neigh_parms_net(p), net)) continue; if (nidx < neigh_skip) goto next; if (neightbl_fill_param_info(skb, tbl, p, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWNEIGHTBL, NLM_F_MULTI) < 0) goto out; next: nidx++; } neigh_skip = 0; } out: rcu_read_unlock(); cb->args[0] = tidx; cb->args[1] = nidx; return skb->len; } static int __neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, u32 pid, u32 seq, int type, unsigned int flags) { u32 neigh_flags, neigh_flags_ext; unsigned long now = jiffies; struct nda_cacheinfo ci; struct nlmsghdr *nlh; struct ndmsg *ndm; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags); if (nlh == NULL) return -EMSGSIZE; neigh_flags_ext = neigh->flags >> NTF_EXT_SHIFT; neigh_flags = neigh->flags & NTF_OLD_MASK; ndm = nlmsg_data(nlh); ndm->ndm_family = neigh->ops->family; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; ndm->ndm_flags = neigh_flags; ndm->ndm_type = neigh->type; ndm->ndm_ifindex = neigh->dev->ifindex; if (nla_put(skb, NDA_DST, neigh->tbl->key_len, neigh->primary_key)) goto nla_put_failure; ndm->ndm_state = neigh->nud_state; if (neigh->nud_state & NUD_VALID) { char haddr[MAX_ADDR_LEN]; neigh_ha_snapshot(haddr, neigh, neigh->dev); if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0) goto nla_put_failure; } ci.ndm_used = jiffies_to_clock_t(now - neigh->used); ci.ndm_confirmed = jiffies_to_clock_t(now - neigh->confirmed); ci.ndm_updated = jiffies_to_clock_t(now - neigh->updated); ci.ndm_refcnt = refcount_read(&neigh->refcnt) - 1; if (nla_put_u32(skb, NDA_PROBES, atomic_read(&neigh->probes)) || nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) goto nla_put_failure; if (neigh->protocol && nla_put_u8(skb, NDA_PROTOCOL, neigh->protocol)) goto nla_put_failure; if (neigh_flags_ext && nla_put_u32(skb, NDA_FLAGS_EXT, neigh_flags_ext)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, u32 pid, u32 seq, int type, unsigned int flags) __releases(neigh->lock) __acquires(neigh->lock) { int err; read_lock_bh(&neigh->lock); err = __neigh_fill_info(skb, neigh, pid, seq, type, flags); read_unlock_bh(&neigh->lock); return err; } static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, u32 pid, u32 seq, int type, unsigned int flags, struct neigh_table *tbl) { u32 neigh_flags, neigh_flags_ext; struct nlmsghdr *nlh; struct ndmsg *ndm; u8 protocol; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags); if (nlh == NULL) return -EMSGSIZE; neigh_flags = READ_ONCE(pn->flags); neigh_flags_ext = neigh_flags >> NTF_EXT_SHIFT; neigh_flags &= NTF_OLD_MASK; ndm = nlmsg_data(nlh); ndm->ndm_family = tbl->family; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; ndm->ndm_flags = neigh_flags | NTF_PROXY; ndm->ndm_type = RTN_UNICAST; ndm->ndm_ifindex = pn->dev ? pn->dev->ifindex : 0; ndm->ndm_state = NUD_NONE; if (nla_put(skb, NDA_DST, tbl->key_len, pn->key)) goto nla_put_failure; protocol = READ_ONCE(pn->protocol); if (protocol && nla_put_u8(skb, NDA_PROTOCOL, protocol)) goto nla_put_failure; if (neigh_flags_ext && nla_put_u32(skb, NDA_FLAGS_EXT, neigh_flags_ext)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static bool neigh_master_filtered(struct net_device *dev, int master_idx) { struct net_device *master; if (!master_idx) return false; master = dev ? netdev_master_upper_dev_get_rcu(dev) : NULL; /* 0 is already used to denote NDA_MASTER wasn't passed, therefore need another * invalid value for ifindex to denote "no master". */ if (master_idx == -1) return !!master; if (!master || master->ifindex != master_idx) return true; return false; } static bool neigh_ifindex_filtered(struct net_device *dev, int filter_idx) { if (filter_idx && (!dev || dev->ifindex != filter_idx)) return true; return false; } struct neigh_dump_filter { int master_idx; int dev_idx; }; static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, struct netlink_callback *cb, struct neigh_dump_filter *filter) { struct net *net = sock_net(skb->sk); struct neighbour *n; int err = 0, h, s_h = cb->args[1]; int idx, s_idx = idx = cb->args[2]; struct neigh_hash_table *nht; unsigned int flags = NLM_F_MULTI; if (filter->dev_idx || filter->master_idx) flags |= NLM_F_DUMP_FILTERED; nht = rcu_dereference(tbl->nht); for (h = s_h; h < (1 << nht->hash_shift); h++) { if (h > s_h) s_idx = 0; idx = 0; neigh_for_each_in_bucket_rcu(n, &nht->hash_heads[h]) { if (idx < s_idx || !net_eq(dev_net(n->dev), net)) goto next; if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || neigh_master_filtered(n->dev, filter->master_idx)) goto next; err = neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, flags); if (err < 0) goto out; next: idx++; } } out: cb->args[1] = h; cb->args[2] = idx; return err; } static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, struct netlink_callback *cb, struct neigh_dump_filter *filter) { struct pneigh_entry *n; struct net *net = sock_net(skb->sk); int err = 0, h, s_h = cb->args[3]; int idx, s_idx = idx = cb->args[4]; unsigned int flags = NLM_F_MULTI; if (filter->dev_idx || filter->master_idx) flags |= NLM_F_DUMP_FILTERED; for (h = s_h; h <= PNEIGH_HASHMASK; h++) { if (h > s_h) s_idx = 0; for (n = rcu_dereference(tbl->phash_buckets[h]), idx = 0; n; n = rcu_dereference(n->next)) { if (idx < s_idx || pneigh_net(n) != net) goto next; if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || neigh_master_filtered(n->dev, filter->master_idx)) goto next; err = pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, flags, tbl); if (err < 0) goto out; next: idx++; } } out: cb->args[3] = h; cb->args[4] = idx; return err; } static int neigh_valid_dump_req(const struct nlmsghdr *nlh, bool strict_check, struct neigh_dump_filter *filter, struct netlink_ext_ack *extack) { struct nlattr *tb[NDA_MAX + 1]; int err, i; if (strict_check) { struct ndmsg *ndm; ndm = nlmsg_payload(nlh, sizeof(*ndm)); if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor dump request"); return -EINVAL; } if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_ifindex || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor dump request"); return -EINVAL; } if (ndm->ndm_flags & ~NTF_PROXY) { NL_SET_ERR_MSG(extack, "Invalid flags in header for neighbor dump request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, nda_policy, extack); } else { err = nlmsg_parse_deprecated(nlh, sizeof(struct ndmsg), tb, NDA_MAX, nda_policy, extack); } if (err < 0) return err; for (i = 0; i <= NDA_MAX; ++i) { if (!tb[i]) continue; /* all new attributes should require strict_check */ switch (i) { case NDA_IFINDEX: filter->dev_idx = nla_get_u32(tb[i]); break; case NDA_MASTER: filter->master_idx = nla_get_u32(tb[i]); break; default: if (strict_check) { NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor dump request"); return -EINVAL; } } } return 0; } static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct neigh_dump_filter filter = {}; struct neigh_table *tbl; int t, family, s_t; int proxy = 0; int err; family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; /* check for full ndmsg structure presence, family member is * the same for both structures */ if (nlmsg_len(nlh) >= sizeof(struct ndmsg) && ((struct ndmsg *)nlmsg_data(nlh))->ndm_flags == NTF_PROXY) proxy = 1; err = neigh_valid_dump_req(nlh, cb->strict_check, &filter, cb->extack); if (err < 0 && cb->strict_check) return err; err = 0; s_t = cb->args[0]; rcu_read_lock(); for (t = 0; t < NEIGH_NR_TABLES; t++) { tbl = rcu_dereference(neigh_tables[t]); if (!tbl) continue; if (t < s_t || (family && tbl->family != family)) continue; if (t > s_t) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); if (proxy) err = pneigh_dump_table(tbl, skb, cb, &filter); else err = neigh_dump_table(tbl, skb, cb, &filter); if (err < 0) break; } rcu_read_unlock(); cb->args[0] = t; return err; } static struct ndmsg *neigh_valid_get_req(const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct ndmsg *ndm; int err, i; ndm = nlmsg_payload(nlh, sizeof(*ndm)); if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor get request"); return ERR_PTR(-EINVAL); } if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor get request"); return ERR_PTR(-EINVAL); } if (ndm->ndm_flags & ~NTF_PROXY) { NL_SET_ERR_MSG(extack, "Invalid flags in header for neighbor get request"); return ERR_PTR(-EINVAL); } if (!(ndm->ndm_flags & NTF_PROXY) && !ndm->ndm_ifindex) { NL_SET_ERR_MSG(extack, "No device specified"); return ERR_PTR(-EINVAL); } err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, nda_policy, extack); if (err < 0) return ERR_PTR(err); for (i = 0; i <= NDA_MAX; ++i) { switch (i) { case NDA_DST: if (!tb[i]) { NL_SET_ERR_ATTR_MISS(extack, NULL, NDA_DST); return ERR_PTR(-EINVAL); } break; default: if (!tb[i]) continue; NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor get request"); return ERR_PTR(-EINVAL); } } return ndm; } static inline size_t neigh_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */ + nla_total_size(sizeof(struct nda_cacheinfo)) + nla_total_size(4) /* NDA_PROBES */ + nla_total_size(4) /* NDA_FLAGS_EXT */ + nla_total_size(1); /* NDA_PROTOCOL */ } static inline size_t pneigh_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ + nla_total_size(4) /* NDA_FLAGS_EXT */ + nla_total_size(1); /* NDA_PROTOCOL */ } static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); u32 pid = NETLINK_CB(in_skb).portid; struct nlattr *tb[NDA_MAX + 1]; struct net_device *dev = NULL; u32 seq = nlh->nlmsg_seq; struct neigh_table *tbl; struct neighbour *neigh; struct sk_buff *skb; struct ndmsg *ndm; void *dst; int err; ndm = neigh_valid_get_req(nlh, tb, extack); if (IS_ERR(ndm)) return PTR_ERR(ndm); if (ndm->ndm_flags & NTF_PROXY) skb = nlmsg_new(neigh_nlmsg_size(), GFP_KERNEL); else skb = nlmsg_new(pneigh_nlmsg_size(), GFP_KERNEL); if (!skb) return -ENOBUFS; rcu_read_lock(); tbl = neigh_find_table(ndm->ndm_family); if (!tbl) { NL_SET_ERR_MSG(extack, "Unsupported family in header for neighbor get request"); err = -EAFNOSUPPORT; goto err_unlock; } if (nla_len(tb[NDA_DST]) != (int)tbl->key_len) { NL_SET_ERR_MSG(extack, "Invalid network address in neighbor get request"); err = -EINVAL; goto err_unlock; } dst = nla_data(tb[NDA_DST]); if (ndm->ndm_ifindex) { dev = dev_get_by_index_rcu(net, ndm->ndm_ifindex); if (!dev) { NL_SET_ERR_MSG(extack, "Unknown device ifindex"); err = -ENODEV; goto err_unlock; } } if (ndm->ndm_flags & NTF_PROXY) { struct pneigh_entry *pn; pn = pneigh_lookup(tbl, net, dst, dev); if (!pn) { NL_SET_ERR_MSG(extack, "Proxy neighbour entry not found"); err = -ENOENT; goto err_unlock; } err = pneigh_fill_info(skb, pn, pid, seq, RTM_NEWNEIGH, 0, tbl); if (err) goto err_unlock; } else { neigh = neigh_lookup(tbl, dst, dev); if (!neigh) { NL_SET_ERR_MSG(extack, "Neighbour entry not found"); err = -ENOENT; goto err_unlock; } err = neigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0); neigh_release(neigh); if (err) goto err_unlock; } rcu_read_unlock(); return rtnl_unicast(skb, net, pid); err_unlock: rcu_read_unlock(); kfree_skb(skb); return err; } void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie) { int chain; struct neigh_hash_table *nht; rcu_read_lock(); nht = rcu_dereference(tbl->nht); spin_lock_bh(&tbl->lock); /* avoid resizes */ for (chain = 0; chain < (1 << nht->hash_shift); chain++) { struct neighbour *n; neigh_for_each_in_bucket(n, &nht->hash_heads[chain]) cb(n, cookie); } spin_unlock_bh(&tbl->lock); rcu_read_unlock(); } EXPORT_SYMBOL(neigh_for_each); /* The tbl->lock must be held as a writer and BH disabled. */ void __neigh_for_each_release(struct neigh_table *tbl, int (*cb)(struct neighbour *)) { struct neigh_hash_table *nht; int chain; nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); for (chain = 0; chain < (1 << nht->hash_shift); chain++) { struct hlist_node *tmp; struct neighbour *n; neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[chain]) { int release; write_lock(&n->lock); release = cb(n); if (release) { hlist_del_rcu(&n->hash); hlist_del_rcu(&n->dev_list); neigh_mark_dead(n); } write_unlock(&n->lock); if (release) neigh_cleanup_and_release(n); } } } EXPORT_SYMBOL(__neigh_for_each_release); int neigh_xmit(int index, struct net_device *dev, const void *addr, struct sk_buff *skb) { int err = -EAFNOSUPPORT; if (likely(index < NEIGH_NR_TABLES)) { struct neigh_table *tbl; struct neighbour *neigh; rcu_read_lock(); tbl = rcu_dereference(neigh_tables[index]); if (!tbl) goto out_unlock; if (index == NEIGH_ARP_TABLE) { u32 key = *((u32 *)addr); neigh = __ipv4_neigh_lookup_noref(dev, key); } else { neigh = __neigh_lookup_noref(tbl, addr, dev); } if (!neigh) neigh = __neigh_create(tbl, addr, dev, false); err = PTR_ERR(neigh); if (IS_ERR(neigh)) { rcu_read_unlock(); goto out_kfree_skb; } err = READ_ONCE(neigh->output)(neigh, skb); out_unlock: rcu_read_unlock(); } else if (index == NEIGH_LINK_TABLE) { err = dev_hard_header(skb, dev, ntohs(skb->protocol), addr, NULL, skb->len); if (err < 0) goto out_kfree_skb; err = dev_queue_xmit(skb); } out: return err; out_kfree_skb: kfree_skb(skb); goto out; } EXPORT_SYMBOL(neigh_xmit); #ifdef CONFIG_PROC_FS static struct neighbour *neigh_get_valid(struct seq_file *seq, struct neighbour *n, loff_t *pos) { struct neigh_seq_state *state = seq->private; struct net *net = seq_file_net(seq); if (!net_eq(dev_net(n->dev), net)) return NULL; if (state->neigh_sub_iter) { loff_t fakep = 0; void *v; v = state->neigh_sub_iter(state, n, pos ? pos : &fakep); if (!v) return NULL; if (pos) return v; } if (!(state->flags & NEIGH_SEQ_SKIP_NOARP)) return n; if (READ_ONCE(n->nud_state) & ~NUD_NOARP) return n; return NULL; } static struct neighbour *neigh_get_first(struct seq_file *seq) { struct neigh_seq_state *state = seq->private; struct neigh_hash_table *nht = state->nht; struct neighbour *n, *tmp; state->flags &= ~NEIGH_SEQ_IS_PNEIGH; while (++state->bucket < (1 << nht->hash_shift)) { neigh_for_each_in_bucket(n, &nht->hash_heads[state->bucket]) { tmp = neigh_get_valid(seq, n, NULL); if (tmp) return tmp; } } return NULL; } static struct neighbour *neigh_get_next(struct seq_file *seq, struct neighbour *n, loff_t *pos) { struct neigh_seq_state *state = seq->private; struct neighbour *tmp; if (state->neigh_sub_iter) { void *v = state->neigh_sub_iter(state, n, pos); if (v) return n; } hlist_for_each_entry_continue(n, hash) { tmp = neigh_get_valid(seq, n, pos); if (tmp) { n = tmp; goto out; } } n = neigh_get_first(seq); out: if (n && pos) --(*pos); return n; } static struct neighbour *neigh_get_idx(struct seq_file *seq, loff_t *pos) { struct neighbour *n = neigh_get_first(seq); if (n) { --(*pos); while (*pos) { n = neigh_get_next(seq, n, pos); if (!n) break; } } return *pos ? NULL : n; } static struct pneigh_entry *pneigh_get_first(struct seq_file *seq) { struct neigh_seq_state *state = seq->private; struct net *net = seq_file_net(seq); struct neigh_table *tbl = state->tbl; struct pneigh_entry *pn = NULL; int bucket; state->flags |= NEIGH_SEQ_IS_PNEIGH; for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) { pn = rcu_dereference(tbl->phash_buckets[bucket]); while (pn && !net_eq(pneigh_net(pn), net)) pn = rcu_dereference(pn->next); if (pn) break; } state->bucket = bucket; return pn; } static struct pneigh_entry *pneigh_get_next(struct seq_file *seq, struct pneigh_entry *pn, loff_t *pos) { struct neigh_seq_state *state = seq->private; struct net *net = seq_file_net(seq); struct neigh_table *tbl = state->tbl; do { pn = rcu_dereference(pn->next); } while (pn && !net_eq(pneigh_net(pn), net)); while (!pn) { if (++state->bucket > PNEIGH_HASHMASK) break; pn = rcu_dereference(tbl->phash_buckets[state->bucket]); while (pn && !net_eq(pneigh_net(pn), net)) pn = rcu_dereference(pn->next); if (pn) break; } if (pn && pos) --(*pos); return pn; } static struct pneigh_entry *pneigh_get_idx(struct seq_file *seq, loff_t *pos) { struct pneigh_entry *pn = pneigh_get_first(seq); if (pn) { --(*pos); while (*pos) { pn = pneigh_get_next(seq, pn, pos); if (!pn) break; } } return *pos ? NULL : pn; } static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos) { struct neigh_seq_state *state = seq->private; void *rc; loff_t idxpos = *pos; rc = neigh_get_idx(seq, &idxpos); if (!rc && !(state->flags & NEIGH_SEQ_NEIGH_ONLY)) rc = pneigh_get_idx(seq, &idxpos); return rc; } void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags) __acquires(tbl->lock) __acquires(rcu) { struct neigh_seq_state *state = seq->private; state->tbl = tbl; state->bucket = -1; state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH); rcu_read_lock(); state->nht = rcu_dereference(tbl->nht); spin_lock_bh(&tbl->lock); return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN; } EXPORT_SYMBOL(neigh_seq_start); void *neigh_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct neigh_seq_state *state; void *rc; if (v == SEQ_START_TOKEN) { rc = neigh_get_first(seq); goto out; } state = seq->private; if (!(state->flags & NEIGH_SEQ_IS_PNEIGH)) { rc = neigh_get_next(seq, v, NULL); if (rc) goto out; if (!(state->flags & NEIGH_SEQ_NEIGH_ONLY)) rc = pneigh_get_first(seq); } else { BUG_ON(state->flags & NEIGH_SEQ_NEIGH_ONLY); rc = pneigh_get_next(seq, v, NULL); } out: ++(*pos); return rc; } EXPORT_SYMBOL(neigh_seq_next); void neigh_seq_stop(struct seq_file *seq, void *v) __releases(tbl->lock) __releases(rcu) { struct neigh_seq_state *state = seq->private; struct neigh_table *tbl = state->tbl; spin_unlock_bh(&tbl->lock); rcu_read_unlock(); } EXPORT_SYMBOL(neigh_seq_stop); /* statistics via seq_file */ static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos) { struct neigh_table *tbl = pde_data(file_inode(seq->file)); int cpu; if (*pos == 0) return SEQ_START_TOKEN; for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; return per_cpu_ptr(tbl->stats, cpu); } return NULL; } static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct neigh_table *tbl = pde_data(file_inode(seq->file)); int cpu; for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; return per_cpu_ptr(tbl->stats, cpu); } (*pos)++; return NULL; } static void neigh_stat_seq_stop(struct seq_file *seq, void *v) { } static int neigh_stat_seq_show(struct seq_file *seq, void *v) { struct neigh_table *tbl = pde_data(file_inode(seq->file)); struct neigh_statistics *st = v; if (v == SEQ_START_TOKEN) { seq_puts(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards table_fulls\n"); return 0; } seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx " "%08lx %08lx %08lx " "%08lx %08lx %08lx\n", atomic_read(&tbl->entries), st->allocs, st->destroys, st->hash_grows, st->lookups, st->hits, st->res_failed, st->rcv_probes_mcast, st->rcv_probes_ucast, st->periodic_gc_runs, st->forced_gc_runs, st->unres_discards, st->table_fulls ); return 0; } static const struct seq_operations neigh_stat_seq_ops = { .start = neigh_stat_seq_start, .next = neigh_stat_seq_next, .stop = neigh_stat_seq_stop, .show = neigh_stat_seq_show, }; #endif /* CONFIG_PROC_FS */ static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid) { struct sk_buff *skb; int err = -ENOBUFS; struct net *net; rcu_read_lock(); net = dev_net_rcu(n->dev); skb = nlmsg_new(neigh_nlmsg_size(), GFP_ATOMIC); if (skb == NULL) goto errout; err = __neigh_fill_info(skb, n, pid, 0, type, flags); if (err < 0) { /* -EMSGSIZE implies BUG in neigh_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); goto out; errout: rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); out: rcu_read_unlock(); } static void neigh_notify(struct neighbour *neigh, int type, int flags, u32 pid) { read_lock_bh(&neigh->lock); __neigh_notify(neigh, type, flags, pid); read_unlock_bh(&neigh->lock); } void neigh_app_ns(struct neighbour *n) { neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST, 0); } EXPORT_SYMBOL(neigh_app_ns); #ifdef CONFIG_SYSCTL static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN); static int proc_unres_qlen(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int size, ret; struct ctl_table tmp = *ctl; tmp.extra1 = SYSCTL_ZERO; tmp.extra2 = &unres_qlen_max; tmp.data = &size; size = *(int *)ctl->data / SKB_TRUESIZE(ETH_FRAME_LEN); ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && !ret) *(int *)ctl->data = size * SKB_TRUESIZE(ETH_FRAME_LEN); return ret; } static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p, int index) { struct net_device *dev; int family = neigh_parms_family(p); rcu_read_lock(); for_each_netdev_rcu(net, dev) { struct neigh_parms *dst_p = neigh_get_dev_parms_rcu(dev, family); if (dst_p && !test_bit(index, dst_p->data_state)) dst_p->data[index] = p->data[index]; } rcu_read_unlock(); } static void neigh_proc_update(const struct ctl_table *ctl, int write) { struct net_device *dev = ctl->extra1; struct neigh_parms *p = ctl->extra2; struct net *net = neigh_parms_net(p); int index = (int *) ctl->data - p->data; if (!write) return; set_bit(index, p->data_state); if (index == NEIGH_VAR_DELAY_PROBE_TIME) call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); if (!dev) /* NULL dev means this is default value */ neigh_copy_dflt_parms(net, p, index); } static int neigh_proc_dointvec_zero_intmax(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tmp = *ctl; int ret; tmp.extra1 = SYSCTL_ZERO; tmp.extra2 = SYSCTL_INT_MAX; ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } static int neigh_proc_dointvec_ms_jiffies_positive(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tmp = *ctl; int ret; int min = msecs_to_jiffies(1); tmp.extra1 = &min; tmp.extra2 = NULL; ret = proc_dointvec_ms_jiffies_minmax(&tmp, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } int neigh_proc_dointvec(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } EXPORT_SYMBOL(neigh_proc_dointvec); int neigh_proc_dointvec_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } EXPORT_SYMBOL(neigh_proc_dointvec_jiffies); static int neigh_proc_dointvec_userhz_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_userhz_jiffies(ctl, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } int neigh_proc_dointvec_ms_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies); static int neigh_proc_dointvec_unres_qlen(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_unres_qlen(ctl, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } static int neigh_proc_base_reachable_time(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct neigh_parms *p = ctl->extra2; int ret; if (strcmp(ctl->procname, "base_reachable_time") == 0) ret = neigh_proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); else if (strcmp(ctl->procname, "base_reachable_time_ms") == 0) ret = neigh_proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); else ret = -1; if (write && ret == 0) { /* update reachable_time as well, otherwise, the change will * only be effective after the next time neigh_periodic_work * decides to recompute it */ neigh_set_reach_time(p); } return ret; } #define NEIGH_PARMS_DATA_OFFSET(index) \ (&((struct neigh_parms *) 0)->data[index]) #define NEIGH_SYSCTL_ENTRY(attr, data_attr, name, mval, proc) \ [NEIGH_VAR_ ## attr] = { \ .procname = name, \ .data = NEIGH_PARMS_DATA_OFFSET(NEIGH_VAR_ ## data_attr), \ .maxlen = sizeof(int), \ .mode = mval, \ .proc_handler = proc, \ } #define NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(attr, name) \ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_zero_intmax) #define NEIGH_SYSCTL_JIFFIES_ENTRY(attr, name) \ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_jiffies) #define NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(attr, name) \ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_userhz_jiffies) #define NEIGH_SYSCTL_MS_JIFFIES_POSITIVE_ENTRY(attr, name) \ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_ms_jiffies_positive) #define NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(attr, data_attr, name) \ NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_ms_jiffies) #define NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(attr, data_attr, name) \ NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_unres_qlen) static struct neigh_sysctl_table { struct ctl_table_header *sysctl_header; struct ctl_table neigh_vars[NEIGH_VAR_MAX]; } neigh_sysctl_template __read_mostly = { .neigh_vars = { NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(UCAST_PROBES, "ucast_solicit"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(APP_PROBES, "app_solicit"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_REPROBES, "mcast_resolicit"), NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(RETRANS_TIME, "retrans_time"), NEIGH_SYSCTL_JIFFIES_ENTRY(BASE_REACHABLE_TIME, "base_reachable_time"), NEIGH_SYSCTL_JIFFIES_ENTRY(DELAY_PROBE_TIME, "delay_first_probe_time"), NEIGH_SYSCTL_MS_JIFFIES_POSITIVE_ENTRY(INTERVAL_PROBE_TIME_MS, "interval_probe_time_ms"), NEIGH_SYSCTL_JIFFIES_ENTRY(GC_STALETIME, "gc_stale_time"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(QUEUE_LEN_BYTES, "unres_qlen_bytes"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(PROXY_QLEN, "proxy_qlen"), NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(ANYCAST_DELAY, "anycast_delay"), NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(PROXY_DELAY, "proxy_delay"), NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(LOCKTIME, "locktime"), NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(QUEUE_LEN, QUEUE_LEN_BYTES, "unres_qlen"), NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(RETRANS_TIME_MS, RETRANS_TIME, "retrans_time_ms"), NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(BASE_REACHABLE_TIME_MS, BASE_REACHABLE_TIME, "base_reachable_time_ms"), [NEIGH_VAR_GC_INTERVAL] = { .procname = "gc_interval", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NEIGH_VAR_GC_THRESH1] = { .procname = "gc_thresh1", .maxlen = sizeof(int), .mode = 0644, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, .proc_handler = proc_dointvec_minmax, }, [NEIGH_VAR_GC_THRESH2] = { .procname = "gc_thresh2", .maxlen = sizeof(int), .mode = 0644, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, .proc_handler = proc_dointvec_minmax, }, [NEIGH_VAR_GC_THRESH3] = { .procname = "gc_thresh3", .maxlen = sizeof(int), .mode = 0644, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, .proc_handler = proc_dointvec_minmax, }, }, }; int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, proc_handler *handler) { int i; struct neigh_sysctl_table *t; const char *dev_name_source; char neigh_path[ sizeof("net//neigh/") + IFNAMSIZ + IFNAMSIZ ]; char *p_name; size_t neigh_vars_size; t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL_ACCOUNT); if (!t) goto err; for (i = 0; i < NEIGH_VAR_GC_INTERVAL; i++) { t->neigh_vars[i].data += (long) p; t->neigh_vars[i].extra1 = dev; t->neigh_vars[i].extra2 = p; } neigh_vars_size = ARRAY_SIZE(t->neigh_vars); if (dev) { dev_name_source = dev->name; /* Terminate the table early */ neigh_vars_size = NEIGH_VAR_BASE_REACHABLE_TIME_MS + 1; } else { struct neigh_table *tbl = p->tbl; dev_name_source = "default"; t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = &tbl->gc_interval; t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = &tbl->gc_thresh1; t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = &tbl->gc_thresh2; t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = &tbl->gc_thresh3; } if (handler) { /* RetransTime */ t->neigh_vars[NEIGH_VAR_RETRANS_TIME].proc_handler = handler; /* ReachableTime */ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = handler; /* RetransTime (in milliseconds)*/ t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].proc_handler = handler; /* ReachableTime (in milliseconds) */ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = handler; } else { /* Those handlers will update p->reachable_time after * base_reachable_time(_ms) is set to ensure the new timer starts being * applied after the next neighbour update instead of waiting for * neigh_periodic_work to update its value (can be multiple minutes) * So any handler that replaces them should do this as well */ /* ReachableTime */ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = neigh_proc_base_reachable_time; /* ReachableTime (in milliseconds) */ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = neigh_proc_base_reachable_time; } switch (neigh_parms_family(p)) { case AF_INET: p_name = "ipv4"; break; case AF_INET6: p_name = "ipv6"; break; default: BUG(); } snprintf(neigh_path, sizeof(neigh_path), "net/%s/neigh/%s", p_name, dev_name_source); t->sysctl_header = register_net_sysctl_sz(neigh_parms_net(p), neigh_path, t->neigh_vars, neigh_vars_size); if (!t->sysctl_header) goto free; p->sysctl_table = t; return 0; free: kfree(t); err: return -ENOBUFS; } EXPORT_SYMBOL(neigh_sysctl_register); void neigh_sysctl_unregister(struct neigh_parms *p) { if (p->sysctl_table) { struct neigh_sysctl_table *t = p->sysctl_table; p->sysctl_table = NULL; unregister_net_sysctl_table(t->sysctl_header); kfree(t); } } EXPORT_SYMBOL(neigh_sysctl_unregister); #endif /* CONFIG_SYSCTL */ static const struct rtnl_msg_handler neigh_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_NEWNEIGH, .doit = neigh_add}, {.msgtype = RTM_DELNEIGH, .doit = neigh_delete}, {.msgtype = RTM_GETNEIGH, .doit = neigh_get, .dumpit = neigh_dump_info, .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, {.msgtype = RTM_GETNEIGHTBL, .dumpit = neightbl_dump_info, .flags = RTNL_FLAG_DUMP_UNLOCKED}, {.msgtype = RTM_SETNEIGHTBL, .doit = neightbl_set, .flags = RTNL_FLAG_DOIT_UNLOCKED}, }; static int __init neigh_init(void) { rtnl_register_many(neigh_rtnl_msg_handlers); return 0; } subsys_initcall(neigh_init);
5120 4 4365 1305 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PERCPU_COUNTER_H #define _LINUX_PERCPU_COUNTER_H /* * A simple "approximate counter" for use in ext2 and ext3 superblocks. * * WARNING: these things are HUGE. 4 kbytes per counter on 32-way P4. */ #include <linux/spinlock.h> #include <linux/smp.h> #include <linux/list.h> #include <linux/threads.h> #include <linux/percpu.h> #include <linux/types.h> /* percpu_counter batch for local add or sub */ #define PERCPU_COUNTER_LOCAL_BATCH INT_MAX #ifdef CONFIG_SMP struct percpu_counter { raw_spinlock_t lock; s64 count; #ifdef CONFIG_HOTPLUG_CPU struct list_head list; /* All percpu_counters are on a list */ #endif s32 __percpu *counters; }; extern int percpu_counter_batch; int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount, gfp_t gfp, u32 nr_counters, struct lock_class_key *key); #define percpu_counter_init_many(fbc, value, gfp, nr_counters) \ ({ \ static struct lock_class_key __key; \ \ __percpu_counter_init_many(fbc, value, gfp, nr_counters,\ &__key); \ }) #define percpu_counter_init(fbc, value, gfp) \ percpu_counter_init_many(fbc, value, gfp, 1) void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 nr_counters); static inline void percpu_counter_destroy(struct percpu_counter *fbc) { percpu_counter_destroy_many(fbc, 1); } void percpu_counter_set(struct percpu_counter *fbc, s64 amount); void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch); s64 __percpu_counter_sum(struct percpu_counter *fbc); int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch); bool __percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount, s32 batch); void percpu_counter_sync(struct percpu_counter *fbc); static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs) { return __percpu_counter_compare(fbc, rhs, percpu_counter_batch); } static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) { percpu_counter_add_batch(fbc, amount, percpu_counter_batch); } static inline bool percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount) { return __percpu_counter_limited_add(fbc, limit, amount, percpu_counter_batch); } /* * With percpu_counter_add_local() and percpu_counter_sub_local(), counts * are accumulated in local per cpu counter and not in fbc->count until * local count overflows PERCPU_COUNTER_LOCAL_BATCH. This makes counter * write efficient. * But percpu_counter_sum(), instead of percpu_counter_read(), needs to be * used to add up the counts from each CPU to account for all the local * counts. So percpu_counter_add_local() and percpu_counter_sub_local() * should be used when a counter is updated frequently and read rarely. */ static inline void percpu_counter_add_local(struct percpu_counter *fbc, s64 amount) { percpu_counter_add_batch(fbc, amount, PERCPU_COUNTER_LOCAL_BATCH); } static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc) { s64 ret = __percpu_counter_sum(fbc); return ret < 0 ? 0 : ret; } static inline s64 percpu_counter_sum(struct percpu_counter *fbc) { return __percpu_counter_sum(fbc); } static inline s64 percpu_counter_read(struct percpu_counter *fbc) { return fbc->count; } /* * It is possible for the percpu_counter_read() to return a small negative * number for some counter which should never be negative. * */ static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc) { /* Prevent reloads of fbc->count */ s64 ret = READ_ONCE(fbc->count); if (ret >= 0) return ret; return 0; } static inline bool percpu_counter_initialized(struct percpu_counter *fbc) { return (fbc->counters != NULL); } #else /* !CONFIG_SMP */ struct percpu_counter { s64 count; }; static inline int percpu_counter_init_many(struct percpu_counter *fbc, s64 amount, gfp_t gfp, u32 nr_counters) { u32 i; for (i = 0; i < nr_counters; i++) fbc[i].count = amount; return 0; } static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp) { return percpu_counter_init_many(fbc, amount, gfp, 1); } static inline void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 nr_counters) { } static inline void percpu_counter_destroy(struct percpu_counter *fbc) { } static inline void percpu_counter_set(struct percpu_counter *fbc, s64 amount) { fbc->count = amount; } static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs) { if (fbc->count > rhs) return 1; else if (fbc->count < rhs) return -1; else return 0; } static inline int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch) { return percpu_counter_compare(fbc, rhs); } static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) { unsigned long flags; local_irq_save(flags); fbc->count += amount; local_irq_restore(flags); } static inline bool percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount) { unsigned long flags; bool good = false; s64 count; if (amount == 0) return true; local_irq_save(flags); count = fbc->count + amount; if ((amount > 0 && count <= limit) || (amount < 0 && count >= limit)) { fbc->count = count; good = true; } local_irq_restore(flags); return good; } /* non-SMP percpu_counter_add_local is the same with percpu_counter_add */ static inline void percpu_counter_add_local(struct percpu_counter *fbc, s64 amount) { percpu_counter_add(fbc, amount); } static inline void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch) { percpu_counter_add(fbc, amount); } static inline s64 percpu_counter_read(struct percpu_counter *fbc) { return fbc->count; } /* * percpu_counter is intended to track positive numbers. In the UP case the * number should never be negative. */ static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc) { return fbc->count; } static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc) { return percpu_counter_read_positive(fbc); } static inline s64 percpu_counter_sum(struct percpu_counter *fbc) { return percpu_counter_read(fbc); } static inline bool percpu_counter_initialized(struct percpu_counter *fbc) { return true; } static inline void percpu_counter_sync(struct percpu_counter *fbc) { } #endif /* CONFIG_SMP */ static inline void percpu_counter_inc(struct percpu_counter *fbc) { percpu_counter_add(fbc, 1); } static inline void percpu_counter_dec(struct percpu_counter *fbc) { percpu_counter_add(fbc, -1); } static inline void percpu_counter_sub(struct percpu_counter *fbc, s64 amount) { percpu_counter_add(fbc, -amount); } static inline void percpu_counter_sub_local(struct percpu_counter *fbc, s64 amount) { percpu_counter_add_local(fbc, -amount); } #endif /* _LINUX_PERCPU_COUNTER_H */
8 8 8 8 8 8 8 8 8 8 8 16 17 17 17 17 8 8 8 1 1 1 1 1 1 1 1 1 17 17 17 17 17 17 16 1 16 17 17 8 8 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 // SPDX-License-Identifier: GPL-2.0-only /* * Scanning implementation * * Copyright 2003, Jouni Malinen <jkmaline@cc.hut.fi> * Copyright 2004, Instant802 Networks, Inc. * Copyright 2005, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2013-2015 Intel Mobile Communications GmbH * Copyright 2016-2017 Intel Deutschland GmbH * Copyright (C) 2018-2025 Intel Corporation */ #include <linux/if_arp.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <net/sch_generic.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/random.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "driver-ops.h" #include "mesh.h" #define IEEE80211_PROBE_DELAY (HZ / 33) #define IEEE80211_CHANNEL_TIME (HZ / 33) #define IEEE80211_PASSIVE_CHANNEL_TIME (HZ / 9) void ieee80211_rx_bss_put(struct ieee80211_local *local, struct ieee80211_bss *bss) { if (!bss) return; cfg80211_put_bss(local->hw.wiphy, container_of((void *)bss, struct cfg80211_bss, priv)); } static bool is_uapsd_supported(struct ieee802_11_elems *elems) { u8 qos_info; if (elems->wmm_info && elems->wmm_info_len == 7 && elems->wmm_info[5] == 1) qos_info = elems->wmm_info[6]; else if (elems->wmm_param && elems->wmm_param_len == 24 && elems->wmm_param[5] == 1) qos_info = elems->wmm_param[6]; else /* no valid wmm information or parameter element found */ return false; return qos_info & IEEE80211_WMM_IE_AP_QOSINFO_UAPSD; } struct inform_bss_update_data { struct ieee80211_rx_status *rx_status; bool beacon; }; void ieee80211_inform_bss(struct wiphy *wiphy, struct cfg80211_bss *cbss, const struct cfg80211_bss_ies *ies, void *data) { struct ieee80211_local *local = wiphy_priv(wiphy); struct inform_bss_update_data *update_data = data; struct ieee80211_bss *bss = (void *)cbss->priv; struct ieee80211_rx_status *rx_status; struct ieee802_11_elems *elems; int clen, srlen; /* This happens while joining an IBSS */ if (!update_data) return; elems = ieee802_11_parse_elems(ies->data, ies->len, update_data->beacon ? IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_BEACON : IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_RESP, NULL); if (!elems) return; rx_status = update_data->rx_status; if (update_data->beacon) bss->device_ts_beacon = rx_status->device_timestamp; else bss->device_ts_presp = rx_status->device_timestamp; if (elems->parse_error) { if (update_data->beacon) bss->corrupt_data |= IEEE80211_BSS_CORRUPT_BEACON; else bss->corrupt_data |= IEEE80211_BSS_CORRUPT_PROBE_RESP; } else { if (update_data->beacon) bss->corrupt_data &= ~IEEE80211_BSS_CORRUPT_BEACON; else bss->corrupt_data &= ~IEEE80211_BSS_CORRUPT_PROBE_RESP; } /* save the ERP value so that it is available at association time */ if (elems->erp_info && (!elems->parse_error || !(bss->valid_data & IEEE80211_BSS_VALID_ERP))) { bss->erp_value = elems->erp_info[0]; bss->has_erp_value = true; if (!elems->parse_error) bss->valid_data |= IEEE80211_BSS_VALID_ERP; } /* replace old supported rates if we get new values */ if (!elems->parse_error || !(bss->valid_data & IEEE80211_BSS_VALID_RATES)) { srlen = 0; if (elems->supp_rates) { clen = IEEE80211_MAX_SUPP_RATES; if (clen > elems->supp_rates_len) clen = elems->supp_rates_len; memcpy(bss->supp_rates, elems->supp_rates, clen); srlen += clen; } if (elems->ext_supp_rates) { clen = IEEE80211_MAX_SUPP_RATES - srlen; if (clen > elems->ext_supp_rates_len) clen = elems->ext_supp_rates_len; memcpy(bss->supp_rates + srlen, elems->ext_supp_rates, clen); srlen += clen; } if (srlen) { bss->supp_rates_len = srlen; if (!elems->parse_error) bss->valid_data |= IEEE80211_BSS_VALID_RATES; } } if (!elems->parse_error || !(bss->valid_data & IEEE80211_BSS_VALID_WMM)) { bss->wmm_used = elems->wmm_param || elems->wmm_info; bss->uapsd_supported = is_uapsd_supported(elems); if (!elems->parse_error) bss->valid_data |= IEEE80211_BSS_VALID_WMM; } if (update_data->beacon) { struct ieee80211_supported_band *sband = local->hw.wiphy->bands[rx_status->band]; if (!(rx_status->encoding == RX_ENC_HT) && !(rx_status->encoding == RX_ENC_VHT)) bss->beacon_rate = &sband->bitrates[rx_status->rate_idx]; } if (elems->vht_cap_elem) bss->vht_cap_info = le32_to_cpu(elems->vht_cap_elem->vht_cap_info); else bss->vht_cap_info = 0; kfree(elems); } struct ieee80211_bss * ieee80211_bss_info_update(struct ieee80211_local *local, struct ieee80211_rx_status *rx_status, struct ieee80211_mgmt *mgmt, size_t len, struct ieee80211_channel *channel) { bool beacon = ieee80211_is_beacon(mgmt->frame_control) || ieee80211_is_s1g_beacon(mgmt->frame_control); struct cfg80211_bss *cbss; struct inform_bss_update_data update_data = { .rx_status = rx_status, .beacon = beacon, }; struct cfg80211_inform_bss bss_meta = { .boottime_ns = rx_status->boottime_ns, .drv_data = (void *)&update_data, }; bool signal_valid; struct ieee80211_sub_if_data *scan_sdata; if (rx_status->flag & RX_FLAG_NO_SIGNAL_VAL) bss_meta.signal = 0; /* invalid signal indication */ else if (ieee80211_hw_check(&local->hw, SIGNAL_DBM)) bss_meta.signal = rx_status->signal * 100; else if (ieee80211_hw_check(&local->hw, SIGNAL_UNSPEC)) bss_meta.signal = (rx_status->signal * 100) / local->hw.max_signal; bss_meta.chan = channel; rcu_read_lock(); scan_sdata = rcu_dereference(local->scan_sdata); if (scan_sdata && scan_sdata->vif.type == NL80211_IFTYPE_STATION && scan_sdata->vif.cfg.assoc && ieee80211_have_rx_timestamp(rx_status)) { struct ieee80211_bss_conf *link_conf = NULL; /* for an MLO connection, set the TSF data only in case we have * an indication on which of the links the frame was received */ if (ieee80211_vif_is_mld(&scan_sdata->vif)) { if (rx_status->link_valid) { s8 link_id = rx_status->link_id; link_conf = rcu_dereference(scan_sdata->vif.link_conf[link_id]); } } else { link_conf = &scan_sdata->vif.bss_conf; } if (link_conf) { bss_meta.parent_tsf = ieee80211_calculate_rx_timestamp(local, rx_status, len + FCS_LEN, 24); ether_addr_copy(bss_meta.parent_bssid, link_conf->bssid); } } rcu_read_unlock(); cbss = cfg80211_inform_bss_frame_data(local->hw.wiphy, &bss_meta, mgmt, len, GFP_ATOMIC); if (!cbss) return NULL; /* In case the signal is invalid update the status */ signal_valid = channel == cbss->channel; if (!signal_valid) rx_status->flag |= RX_FLAG_NO_SIGNAL_VAL; return (void *)cbss->priv; } static bool ieee80211_scan_accept_presp(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel, u32 scan_flags, const u8 *da) { struct ieee80211_link_data *link_sdata; u8 link_id; if (!sdata) return false; /* accept broadcast on 6 GHz and for OCE */ if (is_broadcast_ether_addr(da) && (channel->band == NL80211_BAND_6GHZ || scan_flags & NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP)) return true; if (scan_flags & NL80211_SCAN_FLAG_RANDOM_ADDR) return true; if (ether_addr_equal(da, sdata->vif.addr)) return true; for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { link_sdata = rcu_dereference(sdata->link[link_id]); if (!link_sdata) continue; if (ether_addr_equal(da, link_sdata->conf->addr)) return true; } return false; } void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb) { struct ieee80211_rx_status *rx_status = IEEE80211_SKB_RXCB(skb); struct ieee80211_mgmt *mgmt = (void *)skb->data; struct ieee80211_bss *bss; struct ieee80211_channel *channel; struct ieee80211_ext *ext; size_t min_hdr_len = offsetof(struct ieee80211_mgmt, u.probe_resp.variable); if (!ieee80211_is_probe_resp(mgmt->frame_control) && !ieee80211_is_beacon(mgmt->frame_control) && !ieee80211_is_s1g_beacon(mgmt->frame_control)) return; if (ieee80211_is_s1g_beacon(mgmt->frame_control)) { ext = (struct ieee80211_ext *)mgmt; min_hdr_len = offsetof(struct ieee80211_ext, u.s1g_beacon.variable) + ieee80211_s1g_optional_len(ext->frame_control); } if (skb->len < min_hdr_len) return; if (test_and_clear_bit(SCAN_BEACON_WAIT, &local->scanning)) { /* * we were passive scanning because of radar/no-IR, but * the beacon/proberesp rx gives us an opportunity to upgrade * to active scan */ set_bit(SCAN_BEACON_DONE, &local->scanning); wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, 0); } channel = ieee80211_get_channel_khz(local->hw.wiphy, ieee80211_rx_status_to_khz(rx_status)); if (!channel || channel->flags & IEEE80211_CHAN_DISABLED) return; if (ieee80211_is_probe_resp(mgmt->frame_control)) { struct ieee80211_sub_if_data *sdata1, *sdata2; struct cfg80211_scan_request *scan_req; struct cfg80211_sched_scan_request *sched_scan_req; u32 scan_req_flags = 0, sched_scan_req_flags = 0; sdata1 = rcu_dereference(local->scan_sdata); sdata2 = rcu_dereference(local->sched_scan_sdata); if (likely(!sdata1 && !sdata2)) return; scan_req = rcu_dereference(local->scan_req); sched_scan_req = rcu_dereference(local->sched_scan_req); if (scan_req) scan_req_flags = scan_req->flags; if (sched_scan_req) sched_scan_req_flags = sched_scan_req->flags; /* ignore ProbeResp to foreign address or non-bcast (OCE) * unless scanning with randomised address */ if (!ieee80211_scan_accept_presp(sdata1, channel, scan_req_flags, mgmt->da) && !ieee80211_scan_accept_presp(sdata2, channel, sched_scan_req_flags, mgmt->da)) return; } else { /* * Non-S1G beacons are expected only with broadcast address. * S1G beacons only carry the SA so no DA check is required * nor possible. */ if (!ieee80211_is_s1g_beacon(mgmt->frame_control) && !is_broadcast_ether_addr(mgmt->da)) return; } /* Do not update the BSS table in case of only monitor interfaces */ if (local->open_count == local->monitors) return; bss = ieee80211_bss_info_update(local, rx_status, mgmt, skb->len, channel); if (bss) ieee80211_rx_bss_put(local, bss); } static void ieee80211_prepare_scan_chandef(struct cfg80211_chan_def *chandef) { memset(chandef, 0, sizeof(*chandef)); chandef->width = NL80211_CHAN_WIDTH_20_NOHT; } /* return false if no more work */ static bool ieee80211_prep_hw_scan(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct cfg80211_scan_request *req; struct cfg80211_chan_def chandef; u8 bands_used = 0; int i, ielen; u32 *n_chans; u32 flags = 0; req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); if (test_bit(SCAN_HW_CANCELLED, &local->scanning)) return false; if (ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS)) { local->hw_scan_req->req.n_channels = req->n_channels; for (i = 0; i < req->n_channels; i++) { local->hw_scan_req->req.channels[i] = req->channels[i]; bands_used |= BIT(req->channels[i]->band); } } else { do { if (local->hw_scan_band == NUM_NL80211_BANDS) return false; n_chans = &local->hw_scan_req->req.n_channels; *n_chans = 0; for (i = 0; i < req->n_channels; i++) { if (req->channels[i]->band != local->hw_scan_band) continue; local->hw_scan_req->req.channels[(*n_chans)++] = req->channels[i]; bands_used |= BIT(req->channels[i]->band); } local->hw_scan_band++; } while (!*n_chans); } ieee80211_prepare_scan_chandef(&chandef); if (req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT) flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT; ielen = ieee80211_build_preq_ies(sdata, (u8 *)local->hw_scan_req->req.ie, local->hw_scan_ies_bufsize, &local->hw_scan_req->ies, req->ie, req->ie_len, bands_used, req->rates, &chandef, flags); if (ielen < 0) return false; local->hw_scan_req->req.ie_len = ielen; local->hw_scan_req->req.no_cck = req->no_cck; ether_addr_copy(local->hw_scan_req->req.mac_addr, req->mac_addr); ether_addr_copy(local->hw_scan_req->req.mac_addr_mask, req->mac_addr_mask); ether_addr_copy(local->hw_scan_req->req.bssid, req->bssid); return true; } static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) { struct ieee80211_local *local = hw_to_local(hw); bool hw_scan = test_bit(SCAN_HW_SCANNING, &local->scanning); bool was_scanning = local->scanning; struct cfg80211_scan_request *scan_req; struct ieee80211_sub_if_data *scan_sdata; struct ieee80211_sub_if_data *sdata; lockdep_assert_wiphy(local->hw.wiphy); /* * It's ok to abort a not-yet-running scan (that * we have one at all will be verified by checking * local->scan_req next), but not to complete it * successfully. */ if (WARN_ON(!local->scanning && !aborted)) aborted = true; if (WARN_ON(!local->scan_req)) return; scan_sdata = rcu_dereference_protected(local->scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)); if (hw_scan && !aborted && !ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS) && ieee80211_prep_hw_scan(scan_sdata)) { int rc; rc = drv_hw_scan(local, rcu_dereference_protected(local->scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)), local->hw_scan_req); if (rc == 0) return; /* HW scan failed and is going to be reported as aborted, * so clear old scan info. */ memset(&local->scan_info, 0, sizeof(local->scan_info)); aborted = true; } kfree(local->hw_scan_req); local->hw_scan_req = NULL; scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); RCU_INIT_POINTER(local->scan_req, NULL); RCU_INIT_POINTER(local->scan_sdata, NULL); local->scanning = 0; local->scan_chandef.chan = NULL; synchronize_rcu(); if (scan_req != local->int_scan_req) { local->scan_info.aborted = aborted; cfg80211_scan_done(scan_req, &local->scan_info); } /* Set power back to normal operating levels. */ ieee80211_hw_conf_chan(local); if (!hw_scan && was_scanning) { ieee80211_configure_filter(local); drv_sw_scan_complete(local, scan_sdata); ieee80211_offchannel_return(local); } ieee80211_recalc_idle(local); ieee80211_mlme_notify_scan_completed(local); ieee80211_ibss_notify_scan_completed(local); /* Requeue all the work that might have been ignored while * the scan was in progress; if there was none this will * just be a no-op for the particular interface. */ list_for_each_entry(sdata, &local->interfaces, list) { if (ieee80211_sdata_running(sdata)) wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work); } if (was_scanning) ieee80211_start_next_roc(local); } void ieee80211_scan_completed(struct ieee80211_hw *hw, struct cfg80211_scan_info *info) { struct ieee80211_local *local = hw_to_local(hw); trace_api_scan_completed(local, info->aborted); set_bit(SCAN_COMPLETED, &local->scanning); if (info->aborted) set_bit(SCAN_ABORTED, &local->scanning); memcpy(&local->scan_info, info, sizeof(*info)); wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, 0); } EXPORT_SYMBOL(ieee80211_scan_completed); static int ieee80211_start_sw_scan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { /* Software scan is not supported in multi-channel cases */ if (!local->emulate_chanctx) return -EOPNOTSUPP; /* * Hardware/driver doesn't support hw_scan, so use software * scanning instead. First send a nullfunc frame with power save * bit on so that AP will buffer the frames for us while we are not * listening, then send probe requests to each channel and wait for * the responses. After all channels are scanned, tune back to the * original channel and send a nullfunc frame with power save bit * off to trigger the AP to send us all the buffered frames. * * Note that while local->sw_scanning is true everything else but * nullfunc frames and probe requests will be dropped in * ieee80211_tx_h_check_assoc(). */ drv_sw_scan_start(local, sdata, local->scan_addr); local->leave_oper_channel_time = jiffies; local->next_scan_state = SCAN_DECISION; local->scan_channel_idx = 0; ieee80211_offchannel_stop_vifs(local); /* ensure nullfunc is transmitted before leaving operating channel */ ieee80211_flush_queues(local, NULL, false); ieee80211_configure_filter(local); /* We need to set power level at maximum rate for scanning. */ ieee80211_hw_conf_chan(local); wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, 0); return 0; } static bool __ieee80211_can_leave_ch(struct ieee80211_sub_if_data *sdata, struct cfg80211_scan_request *req) { struct ieee80211_local *local = sdata->local; struct ieee80211_sub_if_data *sdata_iter; unsigned int link_id; lockdep_assert_wiphy(local->hw.wiphy); if (!ieee80211_is_radar_required(local, req)) return true; if (!regulatory_pre_cac_allowed(local->hw.wiphy)) return false; list_for_each_entry(sdata_iter, &local->interfaces, list) { for_each_valid_link(&sdata_iter->wdev, link_id) if (sdata_iter->wdev.links[link_id].cac_started) return false; } return true; } static bool ieee80211_can_scan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_scan_request *req) { if (!__ieee80211_can_leave_ch(sdata, req)) return false; if (!list_empty(&local->roc_list)) return false; if (sdata->vif.type == NL80211_IFTYPE_STATION && sdata->u.mgd.flags & IEEE80211_STA_CONNECTION_POLL) return false; return true; } void ieee80211_run_deferred_scan(struct ieee80211_local *local) { struct cfg80211_scan_request *req; lockdep_assert_wiphy(local->hw.wiphy); if (!local->scan_req || local->scanning) return; req = wiphy_dereference(local->hw.wiphy, local->scan_req); if (!ieee80211_can_scan(local, rcu_dereference_protected( local->scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)), req)) return; wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, round_jiffies_relative(0)); } static void ieee80211_send_scan_probe_req(struct ieee80211_sub_if_data *sdata, const u8 *src, const u8 *dst, const u8 *ssid, size_t ssid_len, const u8 *ie, size_t ie_len, u32 ratemask, u32 flags, u32 tx_flags, struct ieee80211_channel *channel) { struct sk_buff *skb; skb = ieee80211_build_probe_req(sdata, src, dst, ratemask, channel, ssid, ssid_len, ie, ie_len, flags); if (skb) { if (flags & IEEE80211_PROBE_FLAG_RANDOM_SN) { struct ieee80211_hdr *hdr = (void *)skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); u16 sn = get_random_u16(); info->control.flags |= IEEE80211_TX_CTRL_NO_SEQNO; hdr->seq_ctrl = cpu_to_le16(IEEE80211_SN_TO_SEQ(sn)); } IEEE80211_SKB_CB(skb)->flags |= tx_flags; IEEE80211_SKB_CB(skb)->control.flags |= IEEE80211_TX_CTRL_DONT_USE_RATE_MASK; ieee80211_tx_skb_tid_band(sdata, skb, 7, channel->band); } } static void ieee80211_scan_state_send_probe(struct ieee80211_local *local, unsigned long *next_delay) { int i; struct ieee80211_sub_if_data *sdata; struct cfg80211_scan_request *scan_req; enum nl80211_band band = local->hw.conf.chandef.chan->band; u32 flags = 0, tx_flags; scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); tx_flags = IEEE80211_TX_INTFL_OFFCHAN_TX_OK; if (scan_req->no_cck) tx_flags |= IEEE80211_TX_CTL_NO_CCK_RATE; if (scan_req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT) flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT; if (scan_req->flags & NL80211_SCAN_FLAG_RANDOM_SN) flags |= IEEE80211_PROBE_FLAG_RANDOM_SN; sdata = rcu_dereference_protected(local->scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)); for (i = 0; i < scan_req->n_ssids; i++) ieee80211_send_scan_probe_req( sdata, local->scan_addr, scan_req->bssid, scan_req->ssids[i].ssid, scan_req->ssids[i].ssid_len, scan_req->ie, scan_req->ie_len, scan_req->rates[band], flags, tx_flags, local->hw.conf.chandef.chan); /* * After sending probe requests, wait for probe responses * on the channel. */ *next_delay = msecs_to_jiffies(scan_req->duration) > IEEE80211_PROBE_DELAY + IEEE80211_CHANNEL_TIME ? msecs_to_jiffies(scan_req->duration) - IEEE80211_PROBE_DELAY : IEEE80211_CHANNEL_TIME; local->next_scan_state = SCAN_DECISION; } static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, struct cfg80211_scan_request *req) { struct ieee80211_local *local = sdata->local; bool hw_scan = local->ops->hw_scan; int rc; lockdep_assert_wiphy(local->hw.wiphy); if (local->scan_req) return -EBUSY; /* For an MLO connection, if a link ID was specified, validate that it * is indeed active. */ if (ieee80211_vif_is_mld(&sdata->vif) && req->tsf_report_link_id >= 0 && !(sdata->vif.active_links & BIT(req->tsf_report_link_id))) return -EINVAL; if (!__ieee80211_can_leave_ch(sdata, req)) return -EBUSY; if (!ieee80211_can_scan(local, sdata, req)) { /* wait for the work to finish/time out */ rcu_assign_pointer(local->scan_req, req); rcu_assign_pointer(local->scan_sdata, sdata); return 0; } again: if (hw_scan) { u8 *ies; local->hw_scan_ies_bufsize = local->scan_ies_len + req->ie_len; if (ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS)) { int i, n_bands = 0; u8 bands_counted = 0; for (i = 0; i < req->n_channels; i++) { if (bands_counted & BIT(req->channels[i]->band)) continue; bands_counted |= BIT(req->channels[i]->band); n_bands++; } local->hw_scan_ies_bufsize *= n_bands; } local->hw_scan_req = kmalloc(struct_size(local->hw_scan_req, req.channels, req->n_channels) + local->hw_scan_ies_bufsize, GFP_KERNEL); if (!local->hw_scan_req) return -ENOMEM; local->hw_scan_req->req.ssids = req->ssids; local->hw_scan_req->req.n_ssids = req->n_ssids; /* None of the channels are actually set * up but let UBSAN know the boundaries. */ local->hw_scan_req->req.n_channels = req->n_channels; ies = (u8 *)local->hw_scan_req + sizeof(*local->hw_scan_req) + req->n_channels * sizeof(req->channels[0]); local->hw_scan_req->req.ie = ies; local->hw_scan_req->req.flags = req->flags; eth_broadcast_addr(local->hw_scan_req->req.bssid); local->hw_scan_req->req.duration = req->duration; local->hw_scan_req->req.duration_mandatory = req->duration_mandatory; local->hw_scan_req->req.tsf_report_link_id = req->tsf_report_link_id; local->hw_scan_band = 0; local->hw_scan_req->req.n_6ghz_params = req->n_6ghz_params; local->hw_scan_req->req.scan_6ghz_params = req->scan_6ghz_params; local->hw_scan_req->req.scan_6ghz = req->scan_6ghz; local->hw_scan_req->req.first_part = req->first_part; /* * After allocating local->hw_scan_req, we must * go through until ieee80211_prep_hw_scan(), so * anything that might be changed here and leave * this function early must not go after this * allocation. */ } rcu_assign_pointer(local->scan_req, req); rcu_assign_pointer(local->scan_sdata, sdata); if (req->flags & NL80211_SCAN_FLAG_RANDOM_ADDR) get_random_mask_addr(local->scan_addr, req->mac_addr, req->mac_addr_mask); else memcpy(local->scan_addr, sdata->vif.addr, ETH_ALEN); if (hw_scan) { __set_bit(SCAN_HW_SCANNING, &local->scanning); } else if ((req->n_channels == 1) && (req->channels[0] == local->hw.conf.chandef.chan)) { /* * If we are scanning only on the operating channel * then we do not need to stop normal activities */ unsigned long next_delay; __set_bit(SCAN_ONCHANNEL_SCANNING, &local->scanning); ieee80211_recalc_idle(local); /* Notify driver scan is starting, keep order of operations * same as normal software scan, in case that matters. */ drv_sw_scan_start(local, sdata, local->scan_addr); ieee80211_configure_filter(local); /* accept probe-responses */ /* We need to ensure power level is at max for scanning. */ ieee80211_hw_conf_chan(local); if ((req->channels[0]->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_RADAR)) || !req->n_ssids) { next_delay = IEEE80211_PASSIVE_CHANNEL_TIME; if (req->n_ssids) set_bit(SCAN_BEACON_WAIT, &local->scanning); } else { ieee80211_scan_state_send_probe(local, &next_delay); next_delay = IEEE80211_CHANNEL_TIME; } /* Now, just wait a bit and we are all done! */ wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, next_delay); return 0; } else { /* Do normal software scan */ __set_bit(SCAN_SW_SCANNING, &local->scanning); } ieee80211_recalc_idle(local); if (hw_scan) { WARN_ON(!ieee80211_prep_hw_scan(sdata)); rc = drv_hw_scan(local, sdata, local->hw_scan_req); } else { rc = ieee80211_start_sw_scan(local, sdata); } if (rc) { kfree(local->hw_scan_req); local->hw_scan_req = NULL; local->scanning = 0; ieee80211_recalc_idle(local); local->scan_req = NULL; RCU_INIT_POINTER(local->scan_sdata, NULL); } if (hw_scan && rc == 1) { /* * we can't fall back to software for P2P-GO * as it must update NoA etc. */ if (ieee80211_vif_type_p2p(&sdata->vif) == NL80211_IFTYPE_P2P_GO) return -EOPNOTSUPP; hw_scan = false; goto again; } return rc; } static unsigned long ieee80211_scan_get_channel_time(struct ieee80211_channel *chan) { /* * TODO: channel switching also consumes quite some time, * add that delay as well to get a better estimation */ if (chan->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_RADAR)) return IEEE80211_PASSIVE_CHANNEL_TIME; return IEEE80211_PROBE_DELAY + IEEE80211_CHANNEL_TIME; } static void ieee80211_scan_state_decision(struct ieee80211_local *local, unsigned long *next_delay) { bool associated = false; bool tx_empty = true; bool bad_latency; struct ieee80211_sub_if_data *sdata; struct ieee80211_channel *next_chan; enum mac80211_scan_state next_scan_state; struct cfg80211_scan_request *scan_req; lockdep_assert_wiphy(local->hw.wiphy); /* * check if at least one STA interface is associated, * check if at least one STA interface has pending tx frames * and grab the lowest used beacon interval */ list_for_each_entry(sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(sdata)) continue; if (sdata->vif.type == NL80211_IFTYPE_STATION) { if (sdata->u.mgd.associated) { associated = true; if (!qdisc_all_tx_empty(sdata->dev)) { tx_empty = false; break; } } } } scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); next_chan = scan_req->channels[local->scan_channel_idx]; /* * we're currently scanning a different channel, let's * see if we can scan another channel without interfering * with the current traffic situation. * * Keep good latency, do not stay off-channel more than 125 ms. */ bad_latency = time_after(jiffies + ieee80211_scan_get_channel_time(next_chan), local->leave_oper_channel_time + HZ / 8); if (associated && !tx_empty) { if (scan_req->flags & NL80211_SCAN_FLAG_LOW_PRIORITY) next_scan_state = SCAN_ABORT; else next_scan_state = SCAN_SUSPEND; } else if (associated && bad_latency) { next_scan_state = SCAN_SUSPEND; } else { next_scan_state = SCAN_SET_CHANNEL; } local->next_scan_state = next_scan_state; *next_delay = 0; } static void ieee80211_scan_state_set_channel(struct ieee80211_local *local, unsigned long *next_delay) { int skip; struct ieee80211_channel *chan; struct cfg80211_scan_request *scan_req; scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); skip = 0; chan = scan_req->channels[local->scan_channel_idx]; local->scan_chandef.chan = chan; local->scan_chandef.center_freq1 = chan->center_freq; local->scan_chandef.freq1_offset = chan->freq_offset; local->scan_chandef.center_freq2 = 0; /* For S1G, only scan the 1MHz primaries. */ if (chan->band == NL80211_BAND_S1GHZ) { local->scan_chandef.width = NL80211_CHAN_WIDTH_1; local->scan_chandef.s1g_primary_2mhz = false; goto set_channel; } /* * If scanning on oper channel, use whatever channel-type * is currently in use. */ if (chan == local->hw.conf.chandef.chan) local->scan_chandef = local->hw.conf.chandef; else local->scan_chandef.width = NL80211_CHAN_WIDTH_20_NOHT; set_channel: if (ieee80211_hw_conf_chan(local)) skip = 1; /* advance state machine to next channel/band */ local->scan_channel_idx++; if (skip) { /* if we skip this channel return to the decision state */ local->next_scan_state = SCAN_DECISION; return; } /* * Probe delay is used to update the NAV, cf. 11.1.3.2.2 * (which unfortunately doesn't say _why_ step a) is done, * but it waits for the probe delay or until a frame is * received - and the received frame would update the NAV). * For now, we do not support waiting until a frame is * received. * * In any case, it is not necessary for a passive scan. */ if ((chan->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_RADAR)) || !scan_req->n_ssids) { *next_delay = max(msecs_to_jiffies(scan_req->duration), IEEE80211_PASSIVE_CHANNEL_TIME); local->next_scan_state = SCAN_DECISION; if (scan_req->n_ssids) set_bit(SCAN_BEACON_WAIT, &local->scanning); return; } /* active scan, send probes */ *next_delay = IEEE80211_PROBE_DELAY; local->next_scan_state = SCAN_SEND_PROBE; } static void ieee80211_scan_state_suspend(struct ieee80211_local *local, unsigned long *next_delay) { /* switch back to the operating channel */ local->scan_chandef.chan = NULL; ieee80211_hw_conf_chan(local); /* disable PS */ ieee80211_offchannel_return(local); *next_delay = HZ / 5; /* afterwards, resume scan & go to next channel */ local->next_scan_state = SCAN_RESUME; } static void ieee80211_scan_state_resume(struct ieee80211_local *local, unsigned long *next_delay) { ieee80211_offchannel_stop_vifs(local); if (local->ops->flush) { ieee80211_flush_queues(local, NULL, false); *next_delay = 0; } else *next_delay = HZ / 10; /* remember when we left the operating channel */ local->leave_oper_channel_time = jiffies; /* advance to the next channel to be scanned */ local->next_scan_state = SCAN_SET_CHANNEL; } void ieee80211_scan_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, scan_work.work); struct ieee80211_sub_if_data *sdata; struct cfg80211_scan_request *scan_req; unsigned long next_delay = 0; bool aborted; lockdep_assert_wiphy(local->hw.wiphy); if (!ieee80211_can_run_worker(local)) { aborted = true; goto out_complete; } sdata = rcu_dereference_protected(local->scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)); scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); /* When scanning on-channel, the first-callback means completed. */ if (test_bit(SCAN_ONCHANNEL_SCANNING, &local->scanning)) { aborted = test_and_clear_bit(SCAN_ABORTED, &local->scanning); goto out_complete; } if (test_and_clear_bit(SCAN_COMPLETED, &local->scanning)) { aborted = test_and_clear_bit(SCAN_ABORTED, &local->scanning); goto out_complete; } if (!sdata || !scan_req) return; if (!local->scanning) { int rc; RCU_INIT_POINTER(local->scan_req, NULL); RCU_INIT_POINTER(local->scan_sdata, NULL); rc = __ieee80211_start_scan(sdata, scan_req); if (!rc) return; /* need to complete scan in cfg80211 */ rcu_assign_pointer(local->scan_req, scan_req); aborted = true; goto out_complete; } clear_bit(SCAN_BEACON_WAIT, &local->scanning); /* * as long as no delay is required advance immediately * without scheduling a new work */ do { if (!ieee80211_sdata_running(sdata)) { aborted = true; goto out_complete; } if (test_and_clear_bit(SCAN_BEACON_DONE, &local->scanning) && local->next_scan_state == SCAN_DECISION) local->next_scan_state = SCAN_SEND_PROBE; switch (local->next_scan_state) { case SCAN_DECISION: /* if no more bands/channels left, complete scan */ if (local->scan_channel_idx >= scan_req->n_channels) { aborted = false; goto out_complete; } ieee80211_scan_state_decision(local, &next_delay); break; case SCAN_SET_CHANNEL: ieee80211_scan_state_set_channel(local, &next_delay); break; case SCAN_SEND_PROBE: ieee80211_scan_state_send_probe(local, &next_delay); break; case SCAN_SUSPEND: ieee80211_scan_state_suspend(local, &next_delay); break; case SCAN_RESUME: ieee80211_scan_state_resume(local, &next_delay); break; case SCAN_ABORT: aborted = true; goto out_complete; } } while (next_delay == 0); wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, next_delay); return; out_complete: __ieee80211_scan_completed(&local->hw, aborted); } int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata, struct cfg80211_scan_request *req) { lockdep_assert_wiphy(sdata->local->hw.wiphy); return __ieee80211_start_scan(sdata, req); } int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata, const u8 *ssid, u8 ssid_len, struct ieee80211_channel **channels, unsigned int n_channels) { struct ieee80211_local *local = sdata->local; int i, n_ch = 0; enum nl80211_band band; lockdep_assert_wiphy(local->hw.wiphy); /* busy scanning */ if (local->scan_req) return -EBUSY; /* fill internal scan request */ if (!channels) { int max_n; for (band = 0; band < NUM_NL80211_BANDS; band++) { if (!local->hw.wiphy->bands[band] || band == NL80211_BAND_6GHZ || band == NL80211_BAND_S1GHZ) continue; max_n = local->hw.wiphy->bands[band]->n_channels; for (i = 0; i < max_n; i++) { struct ieee80211_channel *tmp_ch = &local->hw.wiphy->bands[band]->channels[i]; if (tmp_ch->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_DISABLED) || !cfg80211_wdev_channel_allowed(&sdata->wdev, tmp_ch)) continue; local->int_scan_req->channels[n_ch] = tmp_ch; n_ch++; } } if (WARN_ON_ONCE(n_ch == 0)) return -EINVAL; local->int_scan_req->n_channels = n_ch; } else { for (i = 0; i < n_channels; i++) { if (channels[i]->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_DISABLED) || !cfg80211_wdev_channel_allowed(&sdata->wdev, channels[i])) continue; local->int_scan_req->channels[n_ch] = channels[i]; n_ch++; } if (n_ch == 0) return -EINVAL; local->int_scan_req->n_channels = n_ch; } local->int_scan_req->ssids = &local->scan_ssid; local->int_scan_req->n_ssids = 1; memcpy(local->int_scan_req->ssids[0].ssid, ssid, IEEE80211_MAX_SSID_LEN); local->int_scan_req->ssids[0].ssid_len = ssid_len; return __ieee80211_start_scan(sdata, sdata->local->int_scan_req); } void ieee80211_scan_cancel(struct ieee80211_local *local) { /* ensure a new scan cannot be queued */ lockdep_assert_wiphy(local->hw.wiphy); /* * We are canceling software scan, or deferred scan that was not * yet really started (see __ieee80211_start_scan ). * * Regarding hardware scan: * - we can not call __ieee80211_scan_completed() as when * SCAN_HW_SCANNING bit is set this function change * local->hw_scan_req to operate on 5G band, what race with * driver which can use local->hw_scan_req * * - we can not cancel scan_work since driver can schedule it * by ieee80211_scan_completed(..., true) to finish scan * * Hence we only call the cancel_hw_scan() callback, but the low-level * driver is still responsible for calling ieee80211_scan_completed() * after the scan was completed/aborted. */ if (!local->scan_req) return; /* * We have a scan running and the driver already reported completion, * but the worker hasn't run yet or is stuck on the mutex - mark it as * cancelled. */ if (test_bit(SCAN_HW_SCANNING, &local->scanning) && test_bit(SCAN_COMPLETED, &local->scanning)) { set_bit(SCAN_HW_CANCELLED, &local->scanning); return; } if (test_bit(SCAN_HW_SCANNING, &local->scanning)) { /* * Make sure that __ieee80211_scan_completed doesn't trigger a * scan on another band. */ set_bit(SCAN_HW_CANCELLED, &local->scanning); if (local->ops->cancel_hw_scan) drv_cancel_hw_scan(local, rcu_dereference_protected(local->scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx))); return; } wiphy_delayed_work_cancel(local->hw.wiphy, &local->scan_work); /* and clean up */ memset(&local->scan_info, 0, sizeof(local->scan_info)); __ieee80211_scan_completed(&local->hw, true); } int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, struct cfg80211_sched_scan_request *req) { struct ieee80211_local *local = sdata->local; struct ieee80211_scan_ies sched_scan_ies = {}; struct cfg80211_chan_def chandef; int ret, i, iebufsz, num_bands = 0; u32 rate_masks[NUM_NL80211_BANDS] = {}; u8 bands_used = 0; u8 *ie; u32 flags = 0; lockdep_assert_wiphy(local->hw.wiphy); iebufsz = local->scan_ies_len + req->ie_len; if (!local->ops->sched_scan_start) return -EOPNOTSUPP; for (i = 0; i < NUM_NL80211_BANDS; i++) { if (local->hw.wiphy->bands[i]) { bands_used |= BIT(i); rate_masks[i] = (u32) -1; num_bands++; } } if (req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT) flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT; ie = kcalloc(iebufsz, num_bands, GFP_KERNEL); if (!ie) { ret = -ENOMEM; goto out; } ieee80211_prepare_scan_chandef(&chandef); ret = ieee80211_build_preq_ies(sdata, ie, num_bands * iebufsz, &sched_scan_ies, req->ie, req->ie_len, bands_used, rate_masks, &chandef, flags); if (ret < 0) goto error; ret = drv_sched_scan_start(local, sdata, req, &sched_scan_ies); if (ret == 0) { rcu_assign_pointer(local->sched_scan_sdata, sdata); rcu_assign_pointer(local->sched_scan_req, req); } error: kfree(ie); out: if (ret) { /* Clean in case of failure after HW restart or upon resume. */ RCU_INIT_POINTER(local->sched_scan_sdata, NULL); RCU_INIT_POINTER(local->sched_scan_req, NULL); } return ret; } int ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, struct cfg80211_sched_scan_request *req) { struct ieee80211_local *local = sdata->local; lockdep_assert_wiphy(local->hw.wiphy); if (rcu_access_pointer(local->sched_scan_sdata)) return -EBUSY; return __ieee80211_request_sched_scan_start(sdata, req); } int ieee80211_request_sched_scan_stop(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sched_scan_sdata; int ret = -ENOENT; lockdep_assert_wiphy(local->hw.wiphy); if (!local->ops->sched_scan_stop) return -EOPNOTSUPP; /* We don't want to restart sched scan anymore. */ RCU_INIT_POINTER(local->sched_scan_req, NULL); sched_scan_sdata = rcu_dereference_protected(local->sched_scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)); if (sched_scan_sdata) { ret = drv_sched_scan_stop(local, sched_scan_sdata); if (!ret) RCU_INIT_POINTER(local->sched_scan_sdata, NULL); } return ret; } void ieee80211_sched_scan_results(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); trace_api_sched_scan_results(local); cfg80211_sched_scan_results(hw->wiphy, 0); } EXPORT_SYMBOL(ieee80211_sched_scan_results); void ieee80211_sched_scan_end(struct ieee80211_local *local) { lockdep_assert_wiphy(local->hw.wiphy); if (!rcu_access_pointer(local->sched_scan_sdata)) return; RCU_INIT_POINTER(local->sched_scan_sdata, NULL); /* If sched scan was aborted by the driver. */ RCU_INIT_POINTER(local->sched_scan_req, NULL); cfg80211_sched_scan_stopped_locked(local->hw.wiphy, 0); } void ieee80211_sched_scan_stopped_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, sched_scan_stopped_work); ieee80211_sched_scan_end(local); } void ieee80211_sched_scan_stopped(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); trace_api_sched_scan_stopped(local); /* * this shouldn't really happen, so for simplicity * simply ignore it, and let mac80211 reconfigure * the sched scan later on. */ if (local->in_reconfig) return; wiphy_work_queue(hw->wiphy, &local->sched_scan_stopped_work); } EXPORT_SYMBOL(ieee80211_sched_scan_stopped);
8 7 1 6 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 // SPDX-License-Identifier: GPL-2.0-or-later /* * authencesn.c - AEAD wrapper for IPsec with extended sequence numbers, * derived from authenc.c * * Copyright (C) 2010 secunet Security Networks AG * Copyright (C) 2010 Steffen Klassert <steffen.klassert@secunet.com> * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/internal/aead.h> #include <crypto/internal/hash.h> #include <crypto/internal/skcipher.h> #include <crypto/authenc.h> #include <crypto/scatterwalk.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <linux/spinlock.h> struct authenc_esn_instance_ctx { struct crypto_ahash_spawn auth; struct crypto_skcipher_spawn enc; }; struct crypto_authenc_esn_ctx { unsigned int reqoff; struct crypto_ahash *auth; struct crypto_skcipher *enc; }; struct authenc_esn_request_ctx { struct scatterlist src[2]; struct scatterlist dst[2]; char tail[]; }; static void authenc_esn_request_complete(struct aead_request *req, int err) { if (err != -EINPROGRESS) aead_request_complete(req, err); } static int crypto_authenc_esn_setauthsize(struct crypto_aead *authenc_esn, unsigned int authsize) { if (authsize > 0 && authsize < 4) return -EINVAL; return 0; } static int crypto_authenc_esn_setkey(struct crypto_aead *authenc_esn, const u8 *key, unsigned int keylen) { struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(authenc_esn); struct crypto_ahash *auth = ctx->auth; struct crypto_skcipher *enc = ctx->enc; struct crypto_authenc_keys keys; int err = -EINVAL; if (crypto_authenc_extractkeys(&keys, key, keylen) != 0) goto out; crypto_ahash_clear_flags(auth, CRYPTO_TFM_REQ_MASK); crypto_ahash_set_flags(auth, crypto_aead_get_flags(authenc_esn) & CRYPTO_TFM_REQ_MASK); err = crypto_ahash_setkey(auth, keys.authkey, keys.authkeylen); if (err) goto out; crypto_skcipher_clear_flags(enc, CRYPTO_TFM_REQ_MASK); crypto_skcipher_set_flags(enc, crypto_aead_get_flags(authenc_esn) & CRYPTO_TFM_REQ_MASK); err = crypto_skcipher_setkey(enc, keys.enckey, keys.enckeylen); out: memzero_explicit(&keys, sizeof(keys)); return err; } static int crypto_authenc_esn_genicv_tail(struct aead_request *req, unsigned int flags) { struct crypto_aead *authenc_esn = crypto_aead_reqtfm(req); struct authenc_esn_request_ctx *areq_ctx = aead_request_ctx(req); u8 *hash = areq_ctx->tail; unsigned int authsize = crypto_aead_authsize(authenc_esn); unsigned int assoclen = req->assoclen; unsigned int cryptlen = req->cryptlen; struct scatterlist *dst = req->dst; u32 tmp[2]; /* Move high-order bits of sequence number back. */ scatterwalk_map_and_copy(tmp, dst, 4, 4, 0); scatterwalk_map_and_copy(tmp + 1, dst, assoclen + cryptlen, 4, 0); scatterwalk_map_and_copy(tmp, dst, 0, 8, 1); scatterwalk_map_and_copy(hash, dst, assoclen + cryptlen, authsize, 1); return 0; } static void authenc_esn_geniv_ahash_done(void *data, int err) { struct aead_request *req = data; err = err ?: crypto_authenc_esn_genicv_tail(req, 0); aead_request_complete(req, err); } static int crypto_authenc_esn_genicv(struct aead_request *req, unsigned int flags) { struct crypto_aead *authenc_esn = crypto_aead_reqtfm(req); struct authenc_esn_request_ctx *areq_ctx = aead_request_ctx(req); struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(authenc_esn); struct crypto_ahash *auth = ctx->auth; u8 *hash = areq_ctx->tail; struct ahash_request *ahreq = (void *)(areq_ctx->tail + ctx->reqoff); unsigned int authsize = crypto_aead_authsize(authenc_esn); unsigned int assoclen = req->assoclen; unsigned int cryptlen = req->cryptlen; struct scatterlist *dst = req->dst; u32 tmp[2]; if (!authsize) return 0; /* Move high-order bits of sequence number to the end. */ scatterwalk_map_and_copy(tmp, dst, 0, 8, 0); scatterwalk_map_and_copy(tmp, dst, 4, 4, 1); scatterwalk_map_and_copy(tmp + 1, dst, assoclen + cryptlen, 4, 1); sg_init_table(areq_ctx->dst, 2); dst = scatterwalk_ffwd(areq_ctx->dst, dst, 4); ahash_request_set_tfm(ahreq, auth); ahash_request_set_crypt(ahreq, dst, hash, assoclen + cryptlen); ahash_request_set_callback(ahreq, flags, authenc_esn_geniv_ahash_done, req); return crypto_ahash_digest(ahreq) ?: crypto_authenc_esn_genicv_tail(req, aead_request_flags(req)); } static void crypto_authenc_esn_encrypt_done(void *data, int err) { struct aead_request *areq = data; if (!err) err = crypto_authenc_esn_genicv(areq, 0); authenc_esn_request_complete(areq, err); } static int crypto_authenc_esn_encrypt(struct aead_request *req) { struct crypto_aead *authenc_esn = crypto_aead_reqtfm(req); struct authenc_esn_request_ctx *areq_ctx = aead_request_ctx(req); struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(authenc_esn); struct skcipher_request *skreq = (void *)(areq_ctx->tail + ctx->reqoff); struct crypto_skcipher *enc = ctx->enc; unsigned int assoclen = req->assoclen; unsigned int cryptlen = req->cryptlen; struct scatterlist *src, *dst; int err; if (assoclen < 8) return -EINVAL; sg_init_table(areq_ctx->src, 2); src = scatterwalk_ffwd(areq_ctx->src, req->src, assoclen); dst = src; if (req->src != req->dst) { memcpy_sglist(req->dst, req->src, assoclen); sg_init_table(areq_ctx->dst, 2); dst = scatterwalk_ffwd(areq_ctx->dst, req->dst, assoclen); } skcipher_request_set_tfm(skreq, enc); skcipher_request_set_callback(skreq, aead_request_flags(req), crypto_authenc_esn_encrypt_done, req); skcipher_request_set_crypt(skreq, src, dst, cryptlen, req->iv); err = crypto_skcipher_encrypt(skreq); if (err) return err; return crypto_authenc_esn_genicv(req, aead_request_flags(req)); } static int crypto_authenc_esn_decrypt_tail(struct aead_request *req, unsigned int flags) { struct crypto_aead *authenc_esn = crypto_aead_reqtfm(req); unsigned int authsize = crypto_aead_authsize(authenc_esn); struct authenc_esn_request_ctx *areq_ctx = aead_request_ctx(req); struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(authenc_esn); struct skcipher_request *skreq = (void *)(areq_ctx->tail + ctx->reqoff); struct crypto_ahash *auth = ctx->auth; u8 *ohash = areq_ctx->tail; unsigned int cryptlen = req->cryptlen - authsize; unsigned int assoclen = req->assoclen; struct scatterlist *src = req->src; struct scatterlist *dst = req->dst; u8 *ihash = ohash + crypto_ahash_digestsize(auth); u32 tmp[2]; if (!authsize) goto decrypt; if (src == dst) { /* Move high-order bits of sequence number back. */ scatterwalk_map_and_copy(tmp, dst, 4, 4, 0); scatterwalk_map_and_copy(tmp + 1, dst, assoclen + cryptlen, 4, 0); scatterwalk_map_and_copy(tmp, dst, 0, 8, 1); } else memcpy_sglist(dst, src, assoclen); if (crypto_memneq(ihash, ohash, authsize)) return -EBADMSG; decrypt: if (src != dst) src = scatterwalk_ffwd(areq_ctx->src, src, assoclen); dst = scatterwalk_ffwd(areq_ctx->dst, dst, assoclen); skcipher_request_set_tfm(skreq, ctx->enc); skcipher_request_set_callback(skreq, flags, req->base.complete, req->base.data); skcipher_request_set_crypt(skreq, src, dst, cryptlen, req->iv); return crypto_skcipher_decrypt(skreq); } static void authenc_esn_verify_ahash_done(void *data, int err) { struct aead_request *req = data; err = err ?: crypto_authenc_esn_decrypt_tail(req, 0); authenc_esn_request_complete(req, err); } static int crypto_authenc_esn_decrypt(struct aead_request *req) { struct crypto_aead *authenc_esn = crypto_aead_reqtfm(req); struct authenc_esn_request_ctx *areq_ctx = aead_request_ctx(req); struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(authenc_esn); struct ahash_request *ahreq = (void *)(areq_ctx->tail + ctx->reqoff); unsigned int authsize = crypto_aead_authsize(authenc_esn); struct crypto_ahash *auth = ctx->auth; u8 *ohash = areq_ctx->tail; unsigned int assoclen = req->assoclen; unsigned int cryptlen = req->cryptlen; u8 *ihash = ohash + crypto_ahash_digestsize(auth); struct scatterlist *src = req->src; struct scatterlist *dst = req->dst; u32 tmp[2]; int err; if (assoclen < 8) return -EINVAL; if (!authsize) goto tail; cryptlen -= authsize; scatterwalk_map_and_copy(ihash, req->src, assoclen + cryptlen, authsize, 0); /* Move high-order bits of sequence number to the end. */ scatterwalk_map_and_copy(tmp, src, 0, 8, 0); if (src == dst) { scatterwalk_map_and_copy(tmp, dst, 4, 4, 1); scatterwalk_map_and_copy(tmp + 1, dst, assoclen + cryptlen, 4, 1); dst = scatterwalk_ffwd(areq_ctx->dst, dst, 4); } else { scatterwalk_map_and_copy(tmp, dst, 0, 4, 1); scatterwalk_map_and_copy(tmp + 1, dst, assoclen + cryptlen - 4, 4, 1); src = scatterwalk_ffwd(areq_ctx->src, src, 8); dst = scatterwalk_ffwd(areq_ctx->dst, dst, 4); memcpy_sglist(dst, src, assoclen + cryptlen - 8); dst = req->dst; } ahash_request_set_tfm(ahreq, auth); ahash_request_set_crypt(ahreq, dst, ohash, assoclen + cryptlen); ahash_request_set_callback(ahreq, aead_request_flags(req), authenc_esn_verify_ahash_done, req); err = crypto_ahash_digest(ahreq); if (err) return err; tail: return crypto_authenc_esn_decrypt_tail(req, aead_request_flags(req)); } static int crypto_authenc_esn_init_tfm(struct crypto_aead *tfm) { struct aead_instance *inst = aead_alg_instance(tfm); struct authenc_esn_instance_ctx *ictx = aead_instance_ctx(inst); struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(tfm); struct crypto_ahash *auth; struct crypto_skcipher *enc; int err; auth = crypto_spawn_ahash(&ictx->auth); if (IS_ERR(auth)) return PTR_ERR(auth); enc = crypto_spawn_skcipher(&ictx->enc); err = PTR_ERR(enc); if (IS_ERR(enc)) goto err_free_ahash; ctx->auth = auth; ctx->enc = enc; ctx->reqoff = 2 * crypto_ahash_digestsize(auth); crypto_aead_set_reqsize( tfm, sizeof(struct authenc_esn_request_ctx) + ctx->reqoff + max_t(unsigned int, crypto_ahash_reqsize(auth) + sizeof(struct ahash_request), sizeof(struct skcipher_request) + crypto_skcipher_reqsize(enc))); return 0; err_free_ahash: crypto_free_ahash(auth); return err; } static void crypto_authenc_esn_exit_tfm(struct crypto_aead *tfm) { struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(tfm); crypto_free_ahash(ctx->auth); crypto_free_skcipher(ctx->enc); } static void crypto_authenc_esn_free(struct aead_instance *inst) { struct authenc_esn_instance_ctx *ctx = aead_instance_ctx(inst); crypto_drop_skcipher(&ctx->enc); crypto_drop_ahash(&ctx->auth); kfree(inst); } static int crypto_authenc_esn_create(struct crypto_template *tmpl, struct rtattr **tb) { u32 mask; struct aead_instance *inst; struct authenc_esn_instance_ctx *ctx; struct skcipher_alg_common *enc; struct hash_alg_common *auth; struct crypto_alg *auth_base; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_AEAD, &mask); if (err) return err; inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL); if (!inst) return -ENOMEM; ctx = aead_instance_ctx(inst); err = crypto_grab_ahash(&ctx->auth, aead_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; auth = crypto_spawn_ahash_alg(&ctx->auth); auth_base = &auth->base; err = crypto_grab_skcipher(&ctx->enc, aead_crypto_instance(inst), crypto_attr_alg_name(tb[2]), 0, mask); if (err) goto err_free_inst; enc = crypto_spawn_skcipher_alg_common(&ctx->enc); err = -ENAMETOOLONG; if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME, "authencesn(%s,%s)", auth_base->cra_name, enc->base.cra_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; if (snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME, "authencesn(%s,%s)", auth_base->cra_driver_name, enc->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; inst->alg.base.cra_priority = enc->base.cra_priority * 10 + auth_base->cra_priority; inst->alg.base.cra_blocksize = enc->base.cra_blocksize; inst->alg.base.cra_alignmask = enc->base.cra_alignmask; inst->alg.base.cra_ctxsize = sizeof(struct crypto_authenc_esn_ctx); inst->alg.ivsize = enc->ivsize; inst->alg.chunksize = enc->chunksize; inst->alg.maxauthsize = auth->digestsize; inst->alg.init = crypto_authenc_esn_init_tfm; inst->alg.exit = crypto_authenc_esn_exit_tfm; inst->alg.setkey = crypto_authenc_esn_setkey; inst->alg.setauthsize = crypto_authenc_esn_setauthsize; inst->alg.encrypt = crypto_authenc_esn_encrypt; inst->alg.decrypt = crypto_authenc_esn_decrypt; inst->free = crypto_authenc_esn_free; err = aead_register_instance(tmpl, inst); if (err) { err_free_inst: crypto_authenc_esn_free(inst); } return err; } static struct crypto_template crypto_authenc_esn_tmpl = { .name = "authencesn", .create = crypto_authenc_esn_create, .module = THIS_MODULE, }; static int __init crypto_authenc_esn_module_init(void) { return crypto_register_template(&crypto_authenc_esn_tmpl); } static void __exit crypto_authenc_esn_module_exit(void) { crypto_unregister_template(&crypto_authenc_esn_tmpl); } module_init(crypto_authenc_esn_module_init); module_exit(crypto_authenc_esn_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>"); MODULE_DESCRIPTION("AEAD wrapper for IPsec with extended sequence numbers"); MODULE_ALIAS_CRYPTO("authencesn");
5 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 /* SPDX-License-Identifier: GPL-2.0 OR MIT */ #ifndef __LINUX_OVERFLOW_H #define __LINUX_OVERFLOW_H #include <linux/compiler.h> #include <linux/limits.h> #include <linux/const.h> /* * We need to compute the minimum and maximum values representable in a given * type. These macros may also be useful elsewhere. It would seem more obvious * to do something like: * * #define type_min(T) (T)(is_signed_type(T) ? (T)1 << (8*sizeof(T)-1) : 0) * #define type_max(T) (T)(is_signed_type(T) ? ((T)1 << (8*sizeof(T)-1)) - 1 : ~(T)0) * * Unfortunately, the middle expressions, strictly speaking, have * undefined behaviour, and at least some versions of gcc warn about * the type_max expression (but not if -fsanitize=undefined is in * effect; in that case, the warning is deferred to runtime...). * * The slightly excessive casting in type_min is to make sure the * macros also produce sensible values for the exotic type _Bool. [The * overflow checkers only almost work for _Bool, but that's * a-feature-not-a-bug, since people shouldn't be doing arithmetic on * _Bools. Besides, the gcc builtins don't allow _Bool* as third * argument.] * * Idea stolen from * https://mail-index.netbsd.org/tech-misc/2007/02/05/0000.html - * credit to Christian Biere. */ #define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - is_signed_type(type))) #define __type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T))) #define type_max(t) __type_max(typeof(t)) #define __type_min(T) ((T)((T)-type_max(T)-(T)1)) #define type_min(t) __type_min(typeof(t)) /* * Allows for effectively applying __must_check to a macro so we can have * both the type-agnostic benefits of the macros while also being able to * enforce that the return value is, in fact, checked. */ static __always_inline bool __must_check __must_check_overflow(bool overflow) { return unlikely(overflow); } /** * check_add_overflow() - Calculate addition with overflow checking * @a: first addend * @b: second addend * @d: pointer to store sum * * Returns true on wrap-around, false otherwise. * * *@d holds the results of the attempted addition, regardless of whether * wrap-around occurred. */ #define check_add_overflow(a, b, d) \ __must_check_overflow(__builtin_add_overflow(a, b, d)) /** * wrapping_add() - Intentionally perform a wrapping addition * @type: type for result of calculation * @a: first addend * @b: second addend * * Return the potentially wrapped-around addition without * tripping any wrap-around sanitizers that may be enabled. */ #define wrapping_add(type, a, b) \ ({ \ type __val; \ __builtin_add_overflow(a, b, &__val); \ __val; \ }) /** * wrapping_assign_add() - Intentionally perform a wrapping increment assignment * @var: variable to be incremented * @offset: amount to add * * Increments @var by @offset with wrap-around. Returns the resulting * value of @var. Will not trip any wrap-around sanitizers. * * Returns the new value of @var. */ #define wrapping_assign_add(var, offset) \ ({ \ typeof(var) *__ptr = &(var); \ *__ptr = wrapping_add(typeof(var), *__ptr, offset); \ }) /** * check_sub_overflow() - Calculate subtraction with overflow checking * @a: minuend; value to subtract from * @b: subtrahend; value to subtract from @a * @d: pointer to store difference * * Returns true on wrap-around, false otherwise. * * *@d holds the results of the attempted subtraction, regardless of whether * wrap-around occurred. */ #define check_sub_overflow(a, b, d) \ __must_check_overflow(__builtin_sub_overflow(a, b, d)) /** * wrapping_sub() - Intentionally perform a wrapping subtraction * @type: type for result of calculation * @a: minuend; value to subtract from * @b: subtrahend; value to subtract from @a * * Return the potentially wrapped-around subtraction without * tripping any wrap-around sanitizers that may be enabled. */ #define wrapping_sub(type, a, b) \ ({ \ type __val; \ __builtin_sub_overflow(a, b, &__val); \ __val; \ }) /** * wrapping_assign_sub() - Intentionally perform a wrapping decrement assign * @var: variable to be decremented * @offset: amount to subtract * * Decrements @var by @offset with wrap-around. Returns the resulting * value of @var. Will not trip any wrap-around sanitizers. * * Returns the new value of @var. */ #define wrapping_assign_sub(var, offset) \ ({ \ typeof(var) *__ptr = &(var); \ *__ptr = wrapping_sub(typeof(var), *__ptr, offset); \ }) /** * check_mul_overflow() - Calculate multiplication with overflow checking * @a: first factor * @b: second factor * @d: pointer to store product * * Returns true on wrap-around, false otherwise. * * *@d holds the results of the attempted multiplication, regardless of whether * wrap-around occurred. */ #define check_mul_overflow(a, b, d) \ __must_check_overflow(__builtin_mul_overflow(a, b, d)) /** * wrapping_mul() - Intentionally perform a wrapping multiplication * @type: type for result of calculation * @a: first factor * @b: second factor * * Return the potentially wrapped-around multiplication without * tripping any wrap-around sanitizers that may be enabled. */ #define wrapping_mul(type, a, b) \ ({ \ type __val; \ __builtin_mul_overflow(a, b, &__val); \ __val; \ }) /** * check_shl_overflow() - Calculate a left-shifted value and check overflow * @a: Value to be shifted * @s: How many bits left to shift * @d: Pointer to where to store the result * * Computes *@d = (@a << @s) * * Returns true if '*@d' cannot hold the result or when '@a << @s' doesn't * make sense. Example conditions: * * - '@a << @s' causes bits to be lost when stored in *@d. * - '@s' is garbage (e.g. negative) or so large that the result of * '@a << @s' is guaranteed to be 0. * - '@a' is negative. * - '@a << @s' sets the sign bit, if any, in '*@d'. * * '*@d' will hold the results of the attempted shift, but is not * considered "safe for use" if true is returned. */ #define check_shl_overflow(a, s, d) __must_check_overflow(({ \ typeof(a) _a = a; \ typeof(s) _s = s; \ typeof(d) _d = d; \ unsigned long long _a_full = _a; \ unsigned int _to_shift = \ _s >= 0 && _s < 8 * sizeof(*d) ? _s : 0; \ *_d = (_a_full << _to_shift); \ (_to_shift != _s || *_d < 0 || _a < 0 || \ (*_d >> _to_shift) != _a); \ })) #define __overflows_type_constexpr(x, T) ( \ is_unsigned_type(typeof(x)) ? \ (x) > type_max(T) : \ is_unsigned_type(typeof(T)) ? \ (x) < 0 || (x) > type_max(T) : \ (x) < type_min(T) || (x) > type_max(T)) #define __overflows_type(x, T) ({ \ typeof(T) v = 0; \ check_add_overflow((x), v, &v); \ }) /** * overflows_type - helper for checking the overflows between value, variables, * or data type * * @n: source constant value or variable to be checked * @T: destination variable or data type proposed to store @x * * Compares the @x expression for whether or not it can safely fit in * the storage of the type in @T. @x and @T can have different types. * If @x is a constant expression, this will also resolve to a constant * expression. * * Returns: true if overflow can occur, false otherwise. */ #define overflows_type(n, T) \ __builtin_choose_expr(__is_constexpr(n), \ __overflows_type_constexpr(n, T), \ __overflows_type(n, T)) /** * range_overflows() - Check if a range is out of bounds * @start: Start of the range. * @size: Size of the range. * @max: Exclusive upper boundary. * * A strict check to determine if the range [@start, @start + @size) is * invalid with respect to the allowable range [0, @max). Any range * starting at or beyond @max is considered an overflow, even if @size is 0. * * Returns: true if the range is out of bounds. */ #define range_overflows(start, size, max) ({ \ typeof(start) start__ = (start); \ typeof(size) size__ = (size); \ typeof(max) max__ = (max); \ (void)(&start__ == &size__); \ (void)(&start__ == &max__); \ start__ >= max__ || size__ > max__ - start__; \ }) /** * range_overflows_t() - Check if a range is out of bounds * @type: Data type to use. * @start: Start of the range. * @size: Size of the range. * @max: Exclusive upper boundary. * * Same as range_overflows() but forcing the parameters to @type. * * Returns: true if the range is out of bounds. */ #define range_overflows_t(type, start, size, max) \ range_overflows((type)(start), (type)(size), (type)(max)) /** * range_end_overflows() - Check if a range's endpoint is out of bounds * @start: Start of the range. * @size: Size of the range. * @max: Exclusive upper boundary. * * Checks only if the endpoint of a range (@start + @size) exceeds @max. * Unlike range_overflows(), a zero-sized range at the boundary (@start == @max) * is not considered an overflow. Useful for iterator-style checks. * * Returns: true if the endpoint exceeds the boundary. */ #define range_end_overflows(start, size, max) ({ \ typeof(start) start__ = (start); \ typeof(size) size__ = (size); \ typeof(max) max__ = (max); \ (void)(&start__ == &size__); \ (void)(&start__ == &max__); \ start__ > max__ || size__ > max__ - start__; \ }) /** * range_end_overflows_t() - Check if a range's endpoint is out of bounds * @type: Data type to use. * @start: Start of the range. * @size: Size of the range. * @max: Exclusive upper boundary. * * Same as range_end_overflows() but forcing the parameters to @type. * * Returns: true if the endpoint exceeds the boundary. */ #define range_end_overflows_t(type, start, size, max) \ range_end_overflows((type)(start), (type)(size), (type)(max)) /** * castable_to_type - like __same_type(), but also allows for casted literals * * @n: variable or constant value * @T: variable or data type * * Unlike the __same_type() macro, this allows a constant value as the * first argument. If this value would not overflow into an assignment * of the second argument's type, it returns true. Otherwise, this falls * back to __same_type(). */ #define castable_to_type(n, T) \ __builtin_choose_expr(__is_constexpr(n), \ !__overflows_type_constexpr(n, T), \ __same_type(n, T)) /** * size_mul() - Calculate size_t multiplication with saturation at SIZE_MAX * @factor1: first factor * @factor2: second factor * * Returns: calculate @factor1 * @factor2, both promoted to size_t, * with any overflow causing the return value to be SIZE_MAX. The * lvalue must be size_t to avoid implicit type conversion. */ static __always_inline size_t __must_check size_mul(size_t factor1, size_t factor2) { size_t bytes; if (check_mul_overflow(factor1, factor2, &bytes)) return SIZE_MAX; return bytes; } /** * size_add() - Calculate size_t addition with saturation at SIZE_MAX * @addend1: first addend * @addend2: second addend * * Returns: calculate @addend1 + @addend2, both promoted to size_t, * with any overflow causing the return value to be SIZE_MAX. The * lvalue must be size_t to avoid implicit type conversion. */ static __always_inline size_t __must_check size_add(size_t addend1, size_t addend2) { size_t bytes; if (check_add_overflow(addend1, addend2, &bytes)) return SIZE_MAX; return bytes; } /** * size_sub() - Calculate size_t subtraction with saturation at SIZE_MAX * @minuend: value to subtract from * @subtrahend: value to subtract from @minuend * * Returns: calculate @minuend - @subtrahend, both promoted to size_t, * with any overflow causing the return value to be SIZE_MAX. For * composition with the size_add() and size_mul() helpers, neither * argument may be SIZE_MAX (or the result with be forced to SIZE_MAX). * The lvalue must be size_t to avoid implicit type conversion. */ static __always_inline size_t __must_check size_sub(size_t minuend, size_t subtrahend) { size_t bytes; if (minuend == SIZE_MAX || subtrahend == SIZE_MAX || check_sub_overflow(minuend, subtrahend, &bytes)) return SIZE_MAX; return bytes; } /** * array_size() - Calculate size of 2-dimensional array. * @a: dimension one * @b: dimension two * * Calculates size of 2-dimensional array: @a * @b. * * Returns: number of bytes needed to represent the array or SIZE_MAX on * overflow. */ #define array_size(a, b) size_mul(a, b) /** * array3_size() - Calculate size of 3-dimensional array. * @a: dimension one * @b: dimension two * @c: dimension three * * Calculates size of 3-dimensional array: @a * @b * @c. * * Returns: number of bytes needed to represent the array or SIZE_MAX on * overflow. */ #define array3_size(a, b, c) size_mul(size_mul(a, b), c) /** * flex_array_size() - Calculate size of a flexible array member * within an enclosing structure. * @p: Pointer to the structure. * @member: Name of the flexible array member. * @count: Number of elements in the array. * * Calculates size of a flexible array of @count number of @member * elements, at the end of structure @p. * * Return: number of bytes needed or SIZE_MAX on overflow. */ #define flex_array_size(p, member, count) \ __builtin_choose_expr(__is_constexpr(count), \ (count) * sizeof(*(p)->member) + __must_be_array((p)->member), \ size_mul(count, sizeof(*(p)->member) + __must_be_array((p)->member))) /** * struct_size() - Calculate size of structure with trailing flexible array. * @p: Pointer to the structure. * @member: Name of the array member. * @count: Number of elements in the array. * * Calculates size of memory needed for structure of @p followed by an * array of @count number of @member elements. * * Return: number of bytes needed or SIZE_MAX on overflow. */ #define struct_size(p, member, count) \ __builtin_choose_expr(__is_constexpr(count), \ sizeof(*(p)) + flex_array_size(p, member, count), \ size_add(sizeof(*(p)), flex_array_size(p, member, count))) /** * struct_size_t() - Calculate size of structure with trailing flexible array * @type: structure type name. * @member: Name of the array member. * @count: Number of elements in the array. * * Calculates size of memory needed for structure @type followed by an * array of @count number of @member elements. Prefer using struct_size() * when possible instead, to keep calculations associated with a specific * instance variable of type @type. * * Return: number of bytes needed or SIZE_MAX on overflow. */ #define struct_size_t(type, member, count) \ struct_size((type *)NULL, member, count) /** * struct_offset() - Calculate the offset of a member within a struct * @p: Pointer to the struct * @member: Name of the member to get the offset of * * Calculates the offset of a particular @member of the structure pointed * to by @p. * * Return: number of bytes to the location of @member. */ #define struct_offset(p, member) (offsetof(typeof(*(p)), member)) /** * __DEFINE_FLEX() - helper macro for DEFINE_FLEX() family. * Enables caller macro to pass arbitrary trailing expressions * * @type: structure type name, including "struct" keyword. * @name: Name for a variable to define. * @member: Name of the array member. * @count: Number of elements in the array; must be compile-time const. * @trailer: Trailing expressions for attributes and/or initializers. */ #define __DEFINE_FLEX(type, name, member, count, trailer...) \ _Static_assert(__builtin_constant_p(count), \ "onstack flex array members require compile-time const count"); \ union { \ u8 bytes[struct_size_t(type, member, count)]; \ type obj; \ } name##_u trailer; \ type *name = (type *)&name##_u /** * _DEFINE_FLEX() - helper macro for DEFINE_FLEX() family. * Enables caller macro to pass (different) initializer. * * @type: structure type name, including "struct" keyword. * @name: Name for a variable to define. * @member: Name of the array member. * @count: Number of elements in the array; must be compile-time const. * @initializer: Initializer expression (e.g., pass `= { }` at minimum). */ #define _DEFINE_FLEX(type, name, member, count, initializer...) \ __DEFINE_FLEX(type, name, member, count, = { .obj initializer }) /** * DEFINE_RAW_FLEX() - Define an on-stack instance of structure with a trailing * flexible array member, when it does not have a __counted_by annotation. * * @type: structure type name, including "struct" keyword. * @name: Name for a variable to define. * @member: Name of the array member. * @count: Number of elements in the array; must be compile-time const. * * Define a zeroed, on-stack, instance of @type structure with a trailing * flexible array member. * Use __struct_size(@name) to get compile-time size of it afterwards. * Use __member_size(@name->member) to get compile-time size of @name members. * Use STACK_FLEX_ARRAY_SIZE(@name, @member) to get compile-time number of * elements in array @member. */ #define DEFINE_RAW_FLEX(type, name, member, count) \ __DEFINE_FLEX(type, name, member, count, = { }) /** * DEFINE_FLEX() - Define an on-stack instance of structure with a trailing * flexible array member. * * @TYPE: structure type name, including "struct" keyword. * @NAME: Name for a variable to define. * @MEMBER: Name of the array member. * @COUNTER: Name of the __counted_by member. * @COUNT: Number of elements in the array; must be compile-time const. * * Define a zeroed, on-stack, instance of @TYPE structure with a trailing * flexible array member. * Use __struct_size(@NAME) to get compile-time size of it afterwards. * Use __member_size(@NAME->member) to get compile-time size of @NAME members. * Use STACK_FLEX_ARRAY_SIZE(@name, @member) to get compile-time number of * elements in array @member. */ #define DEFINE_FLEX(TYPE, NAME, MEMBER, COUNTER, COUNT) \ _DEFINE_FLEX(TYPE, NAME, MEMBER, COUNT, = { .COUNTER = COUNT, }) /** * STACK_FLEX_ARRAY_SIZE() - helper macro for DEFINE_FLEX() family. * Returns the number of elements in @array. * * @name: Name for a variable defined in DEFINE_RAW_FLEX()/DEFINE_FLEX(). * @array: Name of the array member. */ #define STACK_FLEX_ARRAY_SIZE(name, array) \ (__member_size((name)->array) / sizeof(*(name)->array) + \ __must_be_array((name)->array)) /** * typeof_flex_counter() - Return the type of the counter variable of a given * flexible array member annotated by __counted_by(). * @FAM: Instance of flexible array member within a given struct. * * Returns: "size_t" if no annotation exists. */ #define typeof_flex_counter(FAM) \ typeof(_Generic(__flex_counter(FAM), \ void *: (size_t)0, \ default: *__flex_counter(FAM))) /** * overflows_flex_counter_type() - Check if the counter associated with the * given flexible array member can represent * a value. * @TYPE: Type of the struct that contains the @FAM. * @FAM: Member name of the FAM within @TYPE. * @COUNT: Value to check against the __counted_by annotated @FAM's counter. * * Returns: true if @COUNT can be represented in the @FAM's counter. When * @FAM is not annotated with __counted_by(), always returns true. */ #define overflows_flex_counter_type(TYPE, FAM, COUNT) \ (overflows_type(COUNT, typeof_flex_counter(((TYPE *)NULL)->FAM))) /** * __set_flex_counter() - Set the counter associated with the given flexible * array member that has been annoated by __counted_by(). * @FAM: Instance of flexible array member within a given struct. * @COUNT: Value to store to the __counted_by annotated @FAM_PTR's counter. * * This is a no-op if no annotation exists. Count needs to be checked with * overflows_flex_counter_type() before using this function. */ #define __set_flex_counter(FAM, COUNT) \ ({ \ *_Generic(__flex_counter(FAM), \ void *: &(size_t){ 0 }, \ default: __flex_counter(FAM)) = (COUNT); \ }) #endif /* __LINUX_OVERFLOW_H */
8 7 7 4 6 1 14 15 15 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 /* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ /* * linux/can/skb.h * * Definitions for the CAN network socket buffer * * Copyright (C) 2012 Oliver Hartkopp <socketcan@hartkopp.net> * */ #ifndef _CAN_SKB_H #define _CAN_SKB_H #include <linux/types.h> #include <linux/skbuff.h> #include <linux/can.h> #include <net/can.h> #include <net/sock.h> void can_flush_echo_skb(struct net_device *dev); int can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, unsigned int idx, unsigned int frame_len); struct sk_buff *__can_get_echo_skb(struct net_device *dev, unsigned int idx, unsigned int *len_ptr, unsigned int *frame_len_ptr); unsigned int __must_check can_get_echo_skb(struct net_device *dev, unsigned int idx, unsigned int *frame_len_ptr); void can_free_echo_skb(struct net_device *dev, unsigned int idx, unsigned int *frame_len_ptr); struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf); struct sk_buff *alloc_canfd_skb(struct net_device *dev, struct canfd_frame **cfd); struct sk_buff *alloc_canxl_skb(struct net_device *dev, struct canxl_frame **cxl, unsigned int data_len); struct sk_buff *alloc_can_err_skb(struct net_device *dev, struct can_frame **cf); bool can_dropped_invalid_skb(struct net_device *dev, struct sk_buff *skb); static inline struct can_skb_ext *can_skb_ext_add(struct sk_buff *skb) { struct can_skb_ext *csx = skb_ext_add(skb, SKB_EXT_CAN); /* skb_ext_add() returns uninitialized space */ if (csx) csx->can_gw_hops = 0; return csx; } static inline struct can_skb_ext *can_skb_ext_find(struct sk_buff *skb) { return skb_ext_find(skb, SKB_EXT_CAN); } static inline void can_skb_set_owner(struct sk_buff *skb, struct sock *sk) { /* If the socket has already been closed by user space, the * refcount may already be 0 (and the socket will be freed * after the last TX skb has been freed). So only increase * socket refcount if the refcount is > 0. */ if (sk && refcount_inc_not_zero(&sk->sk_refcnt)) { skb->destructor = sock_efree; skb->sk = sk; } } /* * returns an unshared skb owned by the original sock to be echo'ed back */ static inline struct sk_buff *can_create_echo_skb(struct sk_buff *skb) { struct sk_buff *nskb; nskb = skb_clone(skb, GFP_ATOMIC); if (unlikely(!nskb)) { kfree_skb(skb); return NULL; } can_skb_set_owner(nskb, skb->sk); consume_skb(skb); return nskb; } static inline bool can_is_can_skb(const struct sk_buff *skb) { struct can_frame *cf = (struct can_frame *)skb->data; /* the CAN specific type of skb is identified by its data length */ return (skb->len == CAN_MTU && cf->len <= CAN_MAX_DLEN); } static inline bool can_is_canfd_skb(const struct sk_buff *skb) { struct canfd_frame *cfd = (struct canfd_frame *)skb->data; /* the CAN specific type of skb is identified by its data length */ return (skb->len == CANFD_MTU && cfd->len <= CANFD_MAX_DLEN); } static inline bool can_is_canxl_skb(const struct sk_buff *skb) { const struct canxl_frame *cxl = (struct canxl_frame *)skb->data; if (skb->len < CANXL_HDR_SIZE + CANXL_MIN_DLEN || skb->len > CANXL_MTU) return false; /* this also checks valid CAN XL data length boundaries */ if (skb->len != CANXL_HDR_SIZE + cxl->len) return false; return cxl->flags & CANXL_XLF; } /* get length element value from can[|fd|xl]_frame structure */ static inline unsigned int can_skb_get_len_val(struct sk_buff *skb) { const struct canxl_frame *cxl = (struct canxl_frame *)skb->data; const struct canfd_frame *cfd = (struct canfd_frame *)skb->data; if (can_is_canxl_skb(skb)) return cxl->len; return cfd->len; } /* get needed data length inside CAN frame for all frame types (RTR aware) */ static inline unsigned int can_skb_get_data_len(struct sk_buff *skb) { unsigned int len = can_skb_get_len_val(skb); const struct can_frame *cf = (struct can_frame *)skb->data; /* RTR frames have an actual length of zero */ if (can_is_can_skb(skb) && cf->can_id & CAN_RTR_FLAG) return 0; return len; } #endif /* !_CAN_SKB_H */
3 3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 // SPDX-License-Identifier: GPL-2.0-or-later /* * Cryptographic API. * * Common Blowfish algorithm parts shared between the c and assembler * implementations. * * Blowfish Cipher Algorithm, by Bruce Schneier. * http://www.counterpane.com/blowfish.html * * Adapted from Kerneli implementation. * * Copyright (c) Herbert Valerio Riedel <hvr@hvrlab.org> * Copyright (c) Kyle McMartin <kyle@debian.org> * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> */ #include <crypto/algapi.h> #include <linux/init.h> #include <linux/module.h> #include <linux/mm.h> #include <asm/byteorder.h> #include <linux/types.h> #include <crypto/blowfish.h> static const u32 bf_pbox[16 + 2] = { 0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344, 0xa4093822, 0x299f31d0, 0x082efa98, 0xec4e6c89, 0x452821e6, 0x38d01377, 0xbe5466cf, 0x34e90c6c, 0xc0ac29b7, 0xc97c50dd, 0x3f84d5b5, 0xb5470917, 0x9216d5d9, 0x8979fb1b, }; static const u32 bf_sbox[256 * 4] = { 0xd1310ba6, 0x98dfb5ac, 0x2ffd72db, 0xd01adfb7, 0xb8e1afed, 0x6a267e96, 0xba7c9045, 0xf12c7f99, 0x24a19947, 0xb3916cf7, 0x0801f2e2, 0x858efc16, 0x636920d8, 0x71574e69, 0xa458fea3, 0xf4933d7e, 0x0d95748f, 0x728eb658, 0x718bcd58, 0x82154aee, 0x7b54a41d, 0xc25a59b5, 0x9c30d539, 0x2af26013, 0xc5d1b023, 0x286085f0, 0xca417918, 0xb8db38ef, 0x8e79dcb0, 0x603a180e, 0x6c9e0e8b, 0xb01e8a3e, 0xd71577c1, 0xbd314b27, 0x78af2fda, 0x55605c60, 0xe65525f3, 0xaa55ab94, 0x57489862, 0x63e81440, 0x55ca396a, 0x2aab10b6, 0xb4cc5c34, 0x1141e8ce, 0xa15486af, 0x7c72e993, 0xb3ee1411, 0x636fbc2a, 0x2ba9c55d, 0x741831f6, 0xce5c3e16, 0x9b87931e, 0xafd6ba33, 0x6c24cf5c, 0x7a325381, 0x28958677, 0x3b8f4898, 0x6b4bb9af, 0xc4bfe81b, 0x66282193, 0x61d809cc, 0xfb21a991, 0x487cac60, 0x5dec8032, 0xef845d5d, 0xe98575b1, 0xdc262302, 0xeb651b88, 0x23893e81, 0xd396acc5, 0x0f6d6ff3, 0x83f44239, 0x2e0b4482, 0xa4842004, 0x69c8f04a, 0x9e1f9b5e, 0x21c66842, 0xf6e96c9a, 0x670c9c61, 0xabd388f0, 0x6a51a0d2, 0xd8542f68, 0x960fa728, 0xab5133a3, 0x6eef0b6c, 0x137a3be4, 0xba3bf050, 0x7efb2a98, 0xa1f1651d, 0x39af0176, 0x66ca593e, 0x82430e88, 0x8cee8619, 0x456f9fb4, 0x7d84a5c3, 0x3b8b5ebe, 0xe06f75d8, 0x85c12073, 0x401a449f, 0x56c16aa6, 0x4ed3aa62, 0x363f7706, 0x1bfedf72, 0x429b023d, 0x37d0d724, 0xd00a1248, 0xdb0fead3, 0x49f1c09b, 0x075372c9, 0x80991b7b, 0x25d479d8, 0xf6e8def7, 0xe3fe501a, 0xb6794c3b, 0x976ce0bd, 0x04c006ba, 0xc1a94fb6, 0x409f60c4, 0x5e5c9ec2, 0x196a2463, 0x68fb6faf, 0x3e6c53b5, 0x1339b2eb, 0x3b52ec6f, 0x6dfc511f, 0x9b30952c, 0xcc814544, 0xaf5ebd09, 0xbee3d004, 0xde334afd, 0x660f2807, 0x192e4bb3, 0xc0cba857, 0x45c8740f, 0xd20b5f39, 0xb9d3fbdb, 0x5579c0bd, 0x1a60320a, 0xd6a100c6, 0x402c7279, 0x679f25fe, 0xfb1fa3cc, 0x8ea5e9f8, 0xdb3222f8, 0x3c7516df, 0xfd616b15, 0x2f501ec8, 0xad0552ab, 0x323db5fa, 0xfd238760, 0x53317b48, 0x3e00df82, 0x9e5c57bb, 0xca6f8ca0, 0x1a87562e, 0xdf1769db, 0xd542a8f6, 0x287effc3, 0xac6732c6, 0x8c4f5573, 0x695b27b0, 0xbbca58c8, 0xe1ffa35d, 0xb8f011a0, 0x10fa3d98, 0xfd2183b8, 0x4afcb56c, 0x2dd1d35b, 0x9a53e479, 0xb6f84565, 0xd28e49bc, 0x4bfb9790, 0xe1ddf2da, 0xa4cb7e33, 0x62fb1341, 0xcee4c6e8, 0xef20cada, 0x36774c01, 0xd07e9efe, 0x2bf11fb4, 0x95dbda4d, 0xae909198, 0xeaad8e71, 0x6b93d5a0, 0xd08ed1d0, 0xafc725e0, 0x8e3c5b2f, 0x8e7594b7, 0x8ff6e2fb, 0xf2122b64, 0x8888b812, 0x900df01c, 0x4fad5ea0, 0x688fc31c, 0xd1cff191, 0xb3a8c1ad, 0x2f2f2218, 0xbe0e1777, 0xea752dfe, 0x8b021fa1, 0xe5a0cc0f, 0xb56f74e8, 0x18acf3d6, 0xce89e299, 0xb4a84fe0, 0xfd13e0b7, 0x7cc43b81, 0xd2ada8d9, 0x165fa266, 0x80957705, 0x93cc7314, 0x211a1477, 0xe6ad2065, 0x77b5fa86, 0xc75442f5, 0xfb9d35cf, 0xebcdaf0c, 0x7b3e89a0, 0xd6411bd3, 0xae1e7e49, 0x00250e2d, 0x2071b35e, 0x226800bb, 0x57b8e0af, 0x2464369b, 0xf009b91e, 0x5563911d, 0x59dfa6aa, 0x78c14389, 0xd95a537f, 0x207d5ba2, 0x02e5b9c5, 0x83260376, 0x6295cfa9, 0x11c81968, 0x4e734a41, 0xb3472dca, 0x7b14a94a, 0x1b510052, 0x9a532915, 0xd60f573f, 0xbc9bc6e4, 0x2b60a476, 0x81e67400, 0x08ba6fb5, 0x571be91f, 0xf296ec6b, 0x2a0dd915, 0xb6636521, 0xe7b9f9b6, 0xff34052e, 0xc5855664, 0x53b02d5d, 0xa99f8fa1, 0x08ba4799, 0x6e85076a, 0x4b7a70e9, 0xb5b32944, 0xdb75092e, 0xc4192623, 0xad6ea6b0, 0x49a7df7d, 0x9cee60b8, 0x8fedb266, 0xecaa8c71, 0x699a17ff, 0x5664526c, 0xc2b19ee1, 0x193602a5, 0x75094c29, 0xa0591340, 0xe4183a3e, 0x3f54989a, 0x5b429d65, 0x6b8fe4d6, 0x99f73fd6, 0xa1d29c07, 0xefe830f5, 0x4d2d38e6, 0xf0255dc1, 0x4cdd2086, 0x8470eb26, 0x6382e9c6, 0x021ecc5e, 0x09686b3f, 0x3ebaefc9, 0x3c971814, 0x6b6a70a1, 0x687f3584, 0x52a0e286, 0xb79c5305, 0xaa500737, 0x3e07841c, 0x7fdeae5c, 0x8e7d44ec, 0x5716f2b8, 0xb03ada37, 0xf0500c0d, 0xf01c1f04, 0x0200b3ff, 0xae0cf51a, 0x3cb574b2, 0x25837a58, 0xdc0921bd, 0xd19113f9, 0x7ca92ff6, 0x94324773, 0x22f54701, 0x3ae5e581, 0x37c2dadc, 0xc8b57634, 0x9af3dda7, 0xa9446146, 0x0fd0030e, 0xecc8c73e, 0xa4751e41, 0xe238cd99, 0x3bea0e2f, 0x3280bba1, 0x183eb331, 0x4e548b38, 0x4f6db908, 0x6f420d03, 0xf60a04bf, 0x2cb81290, 0x24977c79, 0x5679b072, 0xbcaf89af, 0xde9a771f, 0xd9930810, 0xb38bae12, 0xdccf3f2e, 0x5512721f, 0x2e6b7124, 0x501adde6, 0x9f84cd87, 0x7a584718, 0x7408da17, 0xbc9f9abc, 0xe94b7d8c, 0xec7aec3a, 0xdb851dfa, 0x63094366, 0xc464c3d2, 0xef1c1847, 0x3215d908, 0xdd433b37, 0x24c2ba16, 0x12a14d43, 0x2a65c451, 0x50940002, 0x133ae4dd, 0x71dff89e, 0x10314e55, 0x81ac77d6, 0x5f11199b, 0x043556f1, 0xd7a3c76b, 0x3c11183b, 0x5924a509, 0xf28fe6ed, 0x97f1fbfa, 0x9ebabf2c, 0x1e153c6e, 0x86e34570, 0xeae96fb1, 0x860e5e0a, 0x5a3e2ab3, 0x771fe71c, 0x4e3d06fa, 0x2965dcb9, 0x99e71d0f, 0x803e89d6, 0x5266c825, 0x2e4cc978, 0x9c10b36a, 0xc6150eba, 0x94e2ea78, 0xa5fc3c53, 0x1e0a2df4, 0xf2f74ea7, 0x361d2b3d, 0x1939260f, 0x19c27960, 0x5223a708, 0xf71312b6, 0xebadfe6e, 0xeac31f66, 0xe3bc4595, 0xa67bc883, 0xb17f37d1, 0x018cff28, 0xc332ddef, 0xbe6c5aa5, 0x65582185, 0x68ab9802, 0xeecea50f, 0xdb2f953b, 0x2aef7dad, 0x5b6e2f84, 0x1521b628, 0x29076170, 0xecdd4775, 0x619f1510, 0x13cca830, 0xeb61bd96, 0x0334fe1e, 0xaa0363cf, 0xb5735c90, 0x4c70a239, 0xd59e9e0b, 0xcbaade14, 0xeecc86bc, 0x60622ca7, 0x9cab5cab, 0xb2f3846e, 0x648b1eaf, 0x19bdf0ca, 0xa02369b9, 0x655abb50, 0x40685a32, 0x3c2ab4b3, 0x319ee9d5, 0xc021b8f7, 0x9b540b19, 0x875fa099, 0x95f7997e, 0x623d7da8, 0xf837889a, 0x97e32d77, 0x11ed935f, 0x16681281, 0x0e358829, 0xc7e61fd6, 0x96dedfa1, 0x7858ba99, 0x57f584a5, 0x1b227263, 0x9b83c3ff, 0x1ac24696, 0xcdb30aeb, 0x532e3054, 0x8fd948e4, 0x6dbc3128, 0x58ebf2ef, 0x34c6ffea, 0xfe28ed61, 0xee7c3c73, 0x5d4a14d9, 0xe864b7e3, 0x42105d14, 0x203e13e0, 0x45eee2b6, 0xa3aaabea, 0xdb6c4f15, 0xfacb4fd0, 0xc742f442, 0xef6abbb5, 0x654f3b1d, 0x41cd2105, 0xd81e799e, 0x86854dc7, 0xe44b476a, 0x3d816250, 0xcf62a1f2, 0x5b8d2646, 0xfc8883a0, 0xc1c7b6a3, 0x7f1524c3, 0x69cb7492, 0x47848a0b, 0x5692b285, 0x095bbf00, 0xad19489d, 0x1462b174, 0x23820e00, 0x58428d2a, 0x0c55f5ea, 0x1dadf43e, 0x233f7061, 0x3372f092, 0x8d937e41, 0xd65fecf1, 0x6c223bdb, 0x7cde3759, 0xcbee7460, 0x4085f2a7, 0xce77326e, 0xa6078084, 0x19f8509e, 0xe8efd855, 0x61d99735, 0xa969a7aa, 0xc50c06c2, 0x5a04abfc, 0x800bcadc, 0x9e447a2e, 0xc3453484, 0xfdd56705, 0x0e1e9ec9, 0xdb73dbd3, 0x105588cd, 0x675fda79, 0xe3674340, 0xc5c43465, 0x713e38d8, 0x3d28f89e, 0xf16dff20, 0x153e21e7, 0x8fb03d4a, 0xe6e39f2b, 0xdb83adf7, 0xe93d5a68, 0x948140f7, 0xf64c261c, 0x94692934, 0x411520f7, 0x7602d4f7, 0xbcf46b2e, 0xd4a20068, 0xd4082471, 0x3320f46a, 0x43b7d4b7, 0x500061af, 0x1e39f62e, 0x97244546, 0x14214f74, 0xbf8b8840, 0x4d95fc1d, 0x96b591af, 0x70f4ddd3, 0x66a02f45, 0xbfbc09ec, 0x03bd9785, 0x7fac6dd0, 0x31cb8504, 0x96eb27b3, 0x55fd3941, 0xda2547e6, 0xabca0a9a, 0x28507825, 0x530429f4, 0x0a2c86da, 0xe9b66dfb, 0x68dc1462, 0xd7486900, 0x680ec0a4, 0x27a18dee, 0x4f3ffea2, 0xe887ad8c, 0xb58ce006, 0x7af4d6b6, 0xaace1e7c, 0xd3375fec, 0xce78a399, 0x406b2a42, 0x20fe9e35, 0xd9f385b9, 0xee39d7ab, 0x3b124e8b, 0x1dc9faf7, 0x4b6d1856, 0x26a36631, 0xeae397b2, 0x3a6efa74, 0xdd5b4332, 0x6841e7f7, 0xca7820fb, 0xfb0af54e, 0xd8feb397, 0x454056ac, 0xba489527, 0x55533a3a, 0x20838d87, 0xfe6ba9b7, 0xd096954b, 0x55a867bc, 0xa1159a58, 0xcca92963, 0x99e1db33, 0xa62a4a56, 0x3f3125f9, 0x5ef47e1c, 0x9029317c, 0xfdf8e802, 0x04272f70, 0x80bb155c, 0x05282ce3, 0x95c11548, 0xe4c66d22, 0x48c1133f, 0xc70f86dc, 0x07f9c9ee, 0x41041f0f, 0x404779a4, 0x5d886e17, 0x325f51eb, 0xd59bc0d1, 0xf2bcc18f, 0x41113564, 0x257b7834, 0x602a9c60, 0xdff8e8a3, 0x1f636c1b, 0x0e12b4c2, 0x02e1329e, 0xaf664fd1, 0xcad18115, 0x6b2395e0, 0x333e92e1, 0x3b240b62, 0xeebeb922, 0x85b2a20e, 0xe6ba0d99, 0xde720c8c, 0x2da2f728, 0xd0127845, 0x95b794fd, 0x647d0862, 0xe7ccf5f0, 0x5449a36f, 0x877d48fa, 0xc39dfd27, 0xf33e8d1e, 0x0a476341, 0x992eff74, 0x3a6f6eab, 0xf4f8fd37, 0xa812dc60, 0xa1ebddf8, 0x991be14c, 0xdb6e6b0d, 0xc67b5510, 0x6d672c37, 0x2765d43b, 0xdcd0e804, 0xf1290dc7, 0xcc00ffa3, 0xb5390f92, 0x690fed0b, 0x667b9ffb, 0xcedb7d9c, 0xa091cf0b, 0xd9155ea3, 0xbb132f88, 0x515bad24, 0x7b9479bf, 0x763bd6eb, 0x37392eb3, 0xcc115979, 0x8026e297, 0xf42e312d, 0x6842ada7, 0xc66a2b3b, 0x12754ccc, 0x782ef11c, 0x6a124237, 0xb79251e7, 0x06a1bbe6, 0x4bfb6350, 0x1a6b1018, 0x11caedfa, 0x3d25bdd8, 0xe2e1c3c9, 0x44421659, 0x0a121386, 0xd90cec6e, 0xd5abea2a, 0x64af674e, 0xda86a85f, 0xbebfe988, 0x64e4c3fe, 0x9dbc8057, 0xf0f7c086, 0x60787bf8, 0x6003604d, 0xd1fd8346, 0xf6381fb0, 0x7745ae04, 0xd736fccc, 0x83426b33, 0xf01eab71, 0xb0804187, 0x3c005e5f, 0x77a057be, 0xbde8ae24, 0x55464299, 0xbf582e61, 0x4e58f48f, 0xf2ddfda2, 0xf474ef38, 0x8789bdc2, 0x5366f9c3, 0xc8b38e74, 0xb475f255, 0x46fcd9b9, 0x7aeb2661, 0x8b1ddf84, 0x846a0e79, 0x915f95e2, 0x466e598e, 0x20b45770, 0x8cd55591, 0xc902de4c, 0xb90bace1, 0xbb8205d0, 0x11a86248, 0x7574a99e, 0xb77f19b6, 0xe0a9dc09, 0x662d09a1, 0xc4324633, 0xe85a1f02, 0x09f0be8c, 0x4a99a025, 0x1d6efe10, 0x1ab93d1d, 0x0ba5a4df, 0xa186f20f, 0x2868f169, 0xdcb7da83, 0x573906fe, 0xa1e2ce9b, 0x4fcd7f52, 0x50115e01, 0xa70683fa, 0xa002b5c4, 0x0de6d027, 0x9af88c27, 0x773f8641, 0xc3604c06, 0x61a806b5, 0xf0177a28, 0xc0f586e0, 0x006058aa, 0x30dc7d62, 0x11e69ed7, 0x2338ea63, 0x53c2dd94, 0xc2c21634, 0xbbcbee56, 0x90bcb6de, 0xebfc7da1, 0xce591d76, 0x6f05e409, 0x4b7c0188, 0x39720a3d, 0x7c927c24, 0x86e3725f, 0x724d9db9, 0x1ac15bb4, 0xd39eb8fc, 0xed545578, 0x08fca5b5, 0xd83d7cd3, 0x4dad0fc4, 0x1e50ef5e, 0xb161e6f8, 0xa28514d9, 0x6c51133c, 0x6fd5c7e7, 0x56e14ec4, 0x362abfce, 0xddc6c837, 0xd79a3234, 0x92638212, 0x670efa8e, 0x406000e0, 0x3a39ce37, 0xd3faf5cf, 0xabc27737, 0x5ac52d1b, 0x5cb0679e, 0x4fa33742, 0xd3822740, 0x99bc9bbe, 0xd5118e9d, 0xbf0f7315, 0xd62d1c7e, 0xc700c47b, 0xb78c1b6b, 0x21a19045, 0xb26eb1be, 0x6a366eb4, 0x5748ab2f, 0xbc946e79, 0xc6a376d2, 0x6549c2c8, 0x530ff8ee, 0x468dde7d, 0xd5730a1d, 0x4cd04dc6, 0x2939bbdb, 0xa9ba4650, 0xac9526e8, 0xbe5ee304, 0xa1fad5f0, 0x6a2d519a, 0x63ef8ce2, 0x9a86ee22, 0xc089c2b8, 0x43242ef6, 0xa51e03aa, 0x9cf2d0a4, 0x83c061ba, 0x9be96a4d, 0x8fe51550, 0xba645bd6, 0x2826a2f9, 0xa73a3ae1, 0x4ba99586, 0xef5562e9, 0xc72fefd3, 0xf752f7da, 0x3f046f69, 0x77fa0a59, 0x80e4a915, 0x87b08601, 0x9b09e6ad, 0x3b3ee593, 0xe990fd5a, 0x9e34d797, 0x2cf0b7d9, 0x022b8b51, 0x96d5ac3a, 0x017da67d, 0xd1cf3ed6, 0x7c7d2d28, 0x1f9f25cf, 0xadf2b89b, 0x5ad6b472, 0x5a88f54c, 0xe029ac71, 0xe019a5e6, 0x47b0acfd, 0xed93fa9b, 0xe8d3c48d, 0x283b57cc, 0xf8d56629, 0x79132e28, 0x785f0191, 0xed756055, 0xf7960e44, 0xe3d35e8c, 0x15056dd4, 0x88f46dba, 0x03a16125, 0x0564f0bd, 0xc3eb9e15, 0x3c9057a2, 0x97271aec, 0xa93a072a, 0x1b3f6d9b, 0x1e6321f5, 0xf59c66fb, 0x26dcf319, 0x7533d928, 0xb155fdf5, 0x03563482, 0x8aba3cbb, 0x28517711, 0xc20ad9f8, 0xabcc5167, 0xccad925f, 0x4de81751, 0x3830dc8e, 0x379d5862, 0x9320f991, 0xea7a90c2, 0xfb3e7bce, 0x5121ce64, 0x774fbe32, 0xa8b6e37e, 0xc3293d46, 0x48de5369, 0x6413e680, 0xa2ae0810, 0xdd6db224, 0x69852dfd, 0x09072166, 0xb39a460a, 0x6445c0dd, 0x586cdecf, 0x1c20c8ae, 0x5bbef7dd, 0x1b588d40, 0xccd2017f, 0x6bb4e3bb, 0xdda26a7e, 0x3a59ff45, 0x3e350a44, 0xbcb4cdd5, 0x72eacea8, 0xfa6484bb, 0x8d6612ae, 0xbf3c6f47, 0xd29be463, 0x542f5d9e, 0xaec2771b, 0xf64e6370, 0x740e0d8d, 0xe75b1357, 0xf8721671, 0xaf537d5d, 0x4040cb08, 0x4eb4e2cc, 0x34d2466a, 0x0115af84, 0xe1b00428, 0x95983a1d, 0x06b89fb4, 0xce6ea048, 0x6f3f3b82, 0x3520ab82, 0x011a1d4b, 0x277227f8, 0x611560b1, 0xe7933fdc, 0xbb3a792b, 0x344525bd, 0xa08839e1, 0x51ce794b, 0x2f32c9b7, 0xa01fbac9, 0xe01cc87e, 0xbcc7d1f6, 0xcf0111c3, 0xa1e8aac7, 0x1a908749, 0xd44fbd9a, 0xd0dadecb, 0xd50ada38, 0x0339c32a, 0xc6913667, 0x8df9317c, 0xe0b12b4f, 0xf79e59b7, 0x43f5bb3a, 0xf2d519ff, 0x27d9459c, 0xbf97222c, 0x15e6fc2a, 0x0f91fc71, 0x9b941525, 0xfae59361, 0xceb69ceb, 0xc2a86459, 0x12baa8d1, 0xb6c1075e, 0xe3056a0c, 0x10d25065, 0xcb03a442, 0xe0ec6e0e, 0x1698db3b, 0x4c98a0be, 0x3278e964, 0x9f1f9532, 0xe0d392df, 0xd3a0342b, 0x8971f21e, 0x1b0a7441, 0x4ba3348c, 0xc5be7120, 0xc37632d8, 0xdf359f8d, 0x9b992f2e, 0xe60b6f47, 0x0fe3f11d, 0xe54cda54, 0x1edad891, 0xce6279cf, 0xcd3e7e6f, 0x1618b166, 0xfd2c1d05, 0x848fd2c5, 0xf6fb2299, 0xf523f357, 0xa6327623, 0x93a83531, 0x56cccd02, 0xacf08162, 0x5a75ebb5, 0x6e163697, 0x88d273cc, 0xde966292, 0x81b949d0, 0x4c50901b, 0x71c65614, 0xe6c6c7bd, 0x327a140a, 0x45e1d006, 0xc3f27b9a, 0xc9aa53fd, 0x62a80f00, 0xbb25bfe2, 0x35bdd2f6, 0x71126905, 0xb2040222, 0xb6cbcf7c, 0xcd769c2b, 0x53113ec0, 0x1640e3d3, 0x38abbd60, 0x2547adf0, 0xba38209c, 0xf746ce76, 0x77afa1c5, 0x20756060, 0x85cbfe4e, 0x8ae88dd8, 0x7aaaf9b0, 0x4cf9aa7e, 0x1948c25c, 0x02fb8a8c, 0x01c36ae4, 0xd6ebe1f9, 0x90d4f869, 0xa65cdea0, 0x3f09252d, 0xc208e69f, 0xb74e6132, 0xce77e25b, 0x578fdfe3, 0x3ac372e6, }; /* * Round loop unrolling macros, S is a pointer to a S-Box array * organized in 4 unsigned longs at a row. */ #define GET32_3(x) (((x) & 0xff)) #define GET32_2(x) (((x) >> (8)) & (0xff)) #define GET32_1(x) (((x) >> (16)) & (0xff)) #define GET32_0(x) (((x) >> (24)) & (0xff)) #define bf_F(x) (((S[GET32_0(x)] + S[256 + GET32_1(x)]) ^ \ S[512 + GET32_2(x)]) + S[768 + GET32_3(x)]) #define ROUND(a, b, n) ({ b ^= P[n]; a ^= bf_F(b); }) /* * The blowfish encipher, processes 64-bit blocks. * NOTE: This function MUSTN'T respect endianness */ static void encrypt_block(struct bf_ctx *bctx, u32 *dst, u32 *src) { const u32 *P = bctx->p; const u32 *S = bctx->s; u32 yl = src[0]; u32 yr = src[1]; ROUND(yr, yl, 0); ROUND(yl, yr, 1); ROUND(yr, yl, 2); ROUND(yl, yr, 3); ROUND(yr, yl, 4); ROUND(yl, yr, 5); ROUND(yr, yl, 6); ROUND(yl, yr, 7); ROUND(yr, yl, 8); ROUND(yl, yr, 9); ROUND(yr, yl, 10); ROUND(yl, yr, 11); ROUND(yr, yl, 12); ROUND(yl, yr, 13); ROUND(yr, yl, 14); ROUND(yl, yr, 15); yl ^= P[16]; yr ^= P[17]; dst[0] = yr; dst[1] = yl; } /* * Calculates the blowfish S and P boxes for encryption and decryption. */ int blowfish_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen) { struct bf_ctx *ctx = crypto_tfm_ctx(tfm); u32 *P = ctx->p; u32 *S = ctx->s; short i, j, count; u32 data[2], temp; /* Copy the initialization s-boxes */ for (i = 0, count = 0; i < 256; i++) for (j = 0; j < 4; j++, count++) S[count] = bf_sbox[count]; /* Set the p-boxes */ for (i = 0; i < 16 + 2; i++) P[i] = bf_pbox[i]; /* Actual subkey generation */ for (j = 0, i = 0; i < 16 + 2; i++) { temp = (((u32)key[j] << 24) | ((u32)key[(j + 1) % keylen] << 16) | ((u32)key[(j + 2) % keylen] << 8) | ((u32)key[(j + 3) % keylen])); P[i] = P[i] ^ temp; j = (j + 4) % keylen; } data[0] = 0x00000000; data[1] = 0x00000000; for (i = 0; i < 16 + 2; i += 2) { encrypt_block((struct bf_ctx *)ctx, data, data); P[i] = data[0]; P[i + 1] = data[1]; } for (i = 0; i < 4; i++) { for (j = 0, count = i * 256; j < 256; j += 2, count += 2) { encrypt_block((struct bf_ctx *)ctx, data, data); S[count] = data[0]; S[count + 1] = data[1]; } } /* Bruce says not to bother with the weak key check. */ return 0; } EXPORT_SYMBOL_GPL(blowfish_setkey); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Blowfish Cipher common functions");
71 71 71 15 15 15 1 1 8 9 18 13 13 9 2 8 11 11 11 14 12 4 2 11 14 14 56 56 56 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Generic INET6 transport hashtables * * Authors: Lotsa people, from code originally in tcp, generalised here * by Arnaldo Carvalho de Melo <acme@mandriva.com> */ #include <linux/module.h> #include <linux/random.h> #include <net/addrconf.h> #include <net/hotdata.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> #include <net/inet6_hashtables.h> #include <net/secure_seq.h> #include <net/ip.h> #include <net/sock_reuseport.h> #include <net/tcp.h> u32 inet6_ehashfn(const struct net *net, const struct in6_addr *laddr, const u16 lport, const struct in6_addr *faddr, const __be16 fport) { u32 lhash, fhash; net_get_random_once(&inet6_ehash_secret, sizeof(inet6_ehash_secret)); net_get_random_once(&tcp_ipv6_hash_secret, sizeof(tcp_ipv6_hash_secret)); lhash = (__force u32)laddr->s6_addr32[3]; fhash = __ipv6_addr_jhash(faddr, tcp_ipv6_hash_secret); return lport + __inet6_ehashfn(lhash, 0, fhash, fport, inet6_ehash_secret + net_hash_mix(net)); } EXPORT_SYMBOL_GPL(inet6_ehashfn); /* * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM * * The sockhash lock must be held as a reader here. */ struct sock *__inet6_lookup_established(const struct net *net, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const u16 hnum, const int dif, const int sdif) { const __portpair ports = INET_COMBINED_PORTS(sport, hnum); const struct hlist_nulls_node *node; struct inet_ehash_bucket *head; struct inet_hashinfo *hashinfo; unsigned int hash, slot; struct sock *sk; hashinfo = net->ipv4.tcp_death_row.hashinfo; hash = inet6_ehashfn(net, daddr, hnum, saddr, sport); slot = hash & hashinfo->ehash_mask; head = &hashinfo->ehash[slot]; begin: sk_nulls_for_each_rcu(sk, node, &head->chain) { if (sk->sk_hash != hash) continue; if (!inet6_match(net, sk, saddr, daddr, ports, dif, sdif)) continue; if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) goto out; if (unlikely(!inet6_match(net, sk, saddr, daddr, ports, dif, sdif))) { sock_gen_put(sk); goto begin; } goto found; } if (get_nulls_value(node) != slot) goto begin; out: sk = NULL; found: return sk; } EXPORT_SYMBOL(__inet6_lookup_established); static inline int compute_score(struct sock *sk, const struct net *net, const unsigned short hnum, const struct in6_addr *daddr, const int dif, const int sdif) { int score = -1; if (net_eq(sock_net(sk), net) && READ_ONCE(inet_sk(sk)->inet_num) == hnum && sk->sk_family == PF_INET6) { if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) return -1; if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return -1; score = sk->sk_bound_dev_if ? 2 : 1; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; } return score; } /** * inet6_lookup_reuseport() - execute reuseport logic on AF_INET6 socket if necessary. * @net: network namespace. * @sk: AF_INET6 socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP. * @skb: context for a potential SK_REUSEPORT program. * @doff: header offset. * @saddr: source address. * @sport: source port. * @daddr: destination address. * @hnum: destination port in host byte order. * @ehashfn: hash function used to generate the fallback hash. * * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to * the selected sock or an error. */ struct sock *inet6_lookup_reuseport(const struct net *net, struct sock *sk, struct sk_buff *skb, int doff, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned short hnum, inet6_ehashfn_t *ehashfn) { struct sock *reuse_sk = NULL; u32 phash; if (sk->sk_reuseport) { phash = INDIRECT_CALL_INET(ehashfn, udp6_ehashfn, inet6_ehashfn, net, daddr, hnum, saddr, sport); reuse_sk = reuseport_select_sock(sk, phash, skb, doff); } return reuse_sk; } EXPORT_SYMBOL_GPL(inet6_lookup_reuseport); /* called with rcu_read_lock() */ static struct sock *inet6_lhash2_lookup(const struct net *net, struct inet_listen_hashbucket *ilb2, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const unsigned short hnum, const int dif, const int sdif) { struct sock *sk, *result = NULL; struct hlist_nulls_node *node; int score, hiscore = 0; sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) { score = compute_score(sk, net, hnum, daddr, dif, sdif); if (score > hiscore) { result = inet6_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum, inet6_ehashfn); if (result) return result; result = sk; hiscore = score; } } return result; } struct sock *inet6_lookup_run_sk_lookup(const struct net *net, int protocol, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const u16 hnum, const int dif, inet6_ehashfn_t *ehashfn) { struct sock *sk, *reuse_sk; bool no_reuseport; no_reuseport = bpf_sk_lookup_run_v6(net, protocol, saddr, sport, daddr, hnum, dif, &sk); if (no_reuseport || IS_ERR_OR_NULL(sk)) return sk; reuse_sk = inet6_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum, ehashfn); if (reuse_sk) sk = reuse_sk; return sk; } EXPORT_SYMBOL_GPL(inet6_lookup_run_sk_lookup); struct sock *inet6_lookup_listener(const struct net *net, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const unsigned short hnum, const int dif, const int sdif) { struct inet_listen_hashbucket *ilb2; struct inet_hashinfo *hashinfo; struct sock *result = NULL; unsigned int hash2; /* Lookup redirect from BPF */ if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { result = inet6_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff, saddr, sport, daddr, hnum, dif, inet6_ehashfn); if (result) goto done; } hashinfo = net->ipv4.tcp_death_row.hashinfo; hash2 = ipv6_portaddr_hash(net, daddr, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); result = inet6_lhash2_lookup(net, ilb2, skb, doff, saddr, sport, daddr, hnum, dif, sdif); if (result) goto done; /* Lookup lhash2 with in6addr_any */ hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); result = inet6_lhash2_lookup(net, ilb2, skb, doff, saddr, sport, &in6addr_any, hnum, dif, sdif); done: if (IS_ERR(result)) return NULL; return result; } EXPORT_SYMBOL_GPL(inet6_lookup_listener); struct sock *inet6_lookup(const struct net *net, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const __be16 dport, const int dif) { struct sock *sk; bool refcounted; sk = __inet6_lookup(net, skb, doff, saddr, sport, daddr, ntohs(dport), dif, 0, &refcounted); if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; return sk; } EXPORT_SYMBOL_GPL(inet6_lookup); static int __inet6_check_established(struct inet_timewait_death_row *death_row, struct sock *sk, const __u16 lport, struct inet_timewait_sock **twp, bool rcu_lookup, u32 hash) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_sock *inet = inet_sk(sk); const struct in6_addr *daddr = &sk->sk_v6_rcv_saddr; const struct in6_addr *saddr = &sk->sk_v6_daddr; const int dif = sk->sk_bound_dev_if; struct net *net = sock_net(sk); const int sdif = l3mdev_master_ifindex_by_index(net, dif); const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); struct inet_timewait_sock *tw = NULL; const struct hlist_nulls_node *node; struct sock *sk2; spinlock_t *lock; if (rcu_lookup) { sk_nulls_for_each(sk2, node, &head->chain) { if (sk2->sk_hash != hash || !inet6_match(net, sk2, saddr, daddr, ports, dif, sdif)) continue; if (sk2->sk_state == TCP_TIME_WAIT) break; return -EADDRNOTAVAIL; } return 0; } lock = inet_ehash_lockp(hinfo, hash); spin_lock(lock); sk_nulls_for_each(sk2, node, &head->chain) { if (sk2->sk_hash != hash) continue; if (likely(inet6_match(net, sk2, saddr, daddr, ports, dif, sdif))) { if (sk2->sk_state == TCP_TIME_WAIT) { tw = inet_twsk(sk2); if (tcp_twsk_unique(sk, sk2, twp)) break; } goto not_unique; } } /* Must record num and sport now. Otherwise we will see * in hash table socket with a funny identity. */ inet->inet_num = lport; inet->inet_sport = htons(lport); sk->sk_hash = hash; WARN_ON(!sk_unhashed(sk)); __sk_nulls_add_node_rcu(sk, &head->chain); if (tw) { sk_nulls_del_node_init_rcu((struct sock *)tw); __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED); } spin_unlock(lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); if (twp) { *twp = tw; } else if (tw) { /* Silly. Should hash-dance instead... */ inet_twsk_deschedule_put(tw); } return 0; not_unique: spin_unlock(lock); return -EADDRNOTAVAIL; } static u64 inet6_sk_port_offset(const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); return secure_ipv6_port_ephemeral(sk->sk_v6_rcv_saddr.s6_addr32, sk->sk_v6_daddr.s6_addr32, inet->inet_dport); } int inet6_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk) { const struct in6_addr *daddr = &sk->sk_v6_rcv_saddr; const struct in6_addr *saddr = &sk->sk_v6_daddr; const struct inet_sock *inet = inet_sk(sk); const struct net *net = sock_net(sk); u64 port_offset = 0; u32 hash_port0; if (!inet_sk(sk)->inet_num) port_offset = inet6_sk_port_offset(sk); hash_port0 = inet6_ehashfn(net, daddr, 0, saddr, inet->inet_dport); return __inet_hash_connect(death_row, sk, port_offset, hash_port0, __inet6_check_established); } EXPORT_SYMBOL_GPL(inet6_hash_connect);
5 2 2 2 1 5 1 1 1 1 1 1 1 1 15 15 15 15 15 7 9 21 21 21 21 21 14 4 1 1 1 7 7 2 5 2 5 5 7 5 18 1 1 2 2 2 12 11 3 21 21 21 21 21 21 21 21 23 23 12 20 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/sch_gred.c Generic Random Early Detection queue. * * Authors: J Hadi Salim (hadi@cyberus.ca) 1998-2002 * * 991129: - Bug fix with grio mode * - a better sing. AvgQ mode with Grio(WRED) * - A finer grained VQ dequeue based on suggestion * from Ren Liu * - More error checks * * For all the glorious comments look at include/net/red.h */ #include <linux/slab.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <net/pkt_cls.h> #include <net/pkt_sched.h> #include <net/red.h> #define GRED_DEF_PRIO (MAX_DPs / 2) #define GRED_VQ_MASK (MAX_DPs - 1) #define GRED_VQ_RED_FLAGS (TC_RED_ECN | TC_RED_HARDDROP) struct gred_sched_data; struct gred_sched; struct gred_sched_data { u32 limit; /* HARD maximal queue length */ u32 DP; /* the drop parameters */ u32 red_flags; /* virtualQ version of red_flags */ u64 bytesin; /* bytes seen on virtualQ so far*/ u32 packetsin; /* packets seen on virtualQ so far*/ u32 backlog; /* bytes on the virtualQ */ u8 prio; /* the prio of this vq */ struct red_parms parms; struct red_vars vars; struct red_stats stats; }; enum { GRED_WRED_MODE = 1, GRED_RIO_MODE, }; struct gred_sched { struct gred_sched_data *tab[MAX_DPs]; unsigned long flags; u32 red_flags; u32 DPs; u32 def; struct red_vars wred_set; struct tc_gred_qopt_offload *opt; }; static inline int gred_wred_mode(struct gred_sched *table) { return test_bit(GRED_WRED_MODE, &table->flags); } static inline void gred_enable_wred_mode(struct gred_sched *table) { __set_bit(GRED_WRED_MODE, &table->flags); } static inline void gred_disable_wred_mode(struct gred_sched *table) { __clear_bit(GRED_WRED_MODE, &table->flags); } static inline int gred_rio_mode(struct gred_sched *table) { return test_bit(GRED_RIO_MODE, &table->flags); } static inline void gred_enable_rio_mode(struct gred_sched *table) { __set_bit(GRED_RIO_MODE, &table->flags); } static inline void gred_disable_rio_mode(struct gred_sched *table) { __clear_bit(GRED_RIO_MODE, &table->flags); } static inline int gred_wred_mode_check(struct Qdisc *sch) { struct gred_sched *table = qdisc_priv(sch); int i; /* Really ugly O(n^2) but shouldn't be necessary too frequent. */ for (i = 0; i < table->DPs; i++) { struct gred_sched_data *q = table->tab[i]; int n; if (q == NULL) continue; for (n = i + 1; n < table->DPs; n++) if (table->tab[n] && table->tab[n]->prio == q->prio) return 1; } return 0; } static inline unsigned int gred_backlog(struct gred_sched *table, struct gred_sched_data *q, struct Qdisc *sch) { if (gred_wred_mode(table)) return sch->qstats.backlog; else return q->backlog; } static inline u16 tc_index_to_dp(struct sk_buff *skb) { return skb->tc_index & GRED_VQ_MASK; } static inline void gred_load_wred_set(const struct gred_sched *table, struct gred_sched_data *q) { q->vars.qavg = table->wred_set.qavg; q->vars.qidlestart = table->wred_set.qidlestart; } static inline void gred_store_wred_set(struct gred_sched *table, struct gred_sched_data *q) { table->wred_set.qavg = q->vars.qavg; table->wred_set.qidlestart = q->vars.qidlestart; } static int gred_use_ecn(struct gred_sched_data *q) { return q->red_flags & TC_RED_ECN; } static int gred_use_harddrop(struct gred_sched_data *q) { return q->red_flags & TC_RED_HARDDROP; } static bool gred_per_vq_red_flags_used(struct gred_sched *table) { unsigned int i; /* Local per-vq flags couldn't have been set unless global are 0 */ if (table->red_flags) return false; for (i = 0; i < MAX_DPs; i++) if (table->tab[i] && table->tab[i]->red_flags) return true; return false; } static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct gred_sched_data *q = NULL; struct gred_sched *t = qdisc_priv(sch); unsigned long qavg = 0; u16 dp = tc_index_to_dp(skb); if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { dp = t->def; q = t->tab[dp]; if (!q) { /* Pass through packets not assigned to a DP * if no default DP has been configured. This * allows for DP flows to be left untouched. */ if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= sch->limit)) return qdisc_enqueue_tail(skb, sch); else goto drop; } /* fix tc_index? --could be controversial but needed for requeueing */ skb->tc_index = (skb->tc_index & ~GRED_VQ_MASK) | dp; } /* sum up all the qaves of prios < ours to get the new qave */ if (!gred_wred_mode(t) && gred_rio_mode(t)) { int i; for (i = 0; i < t->DPs; i++) { if (t->tab[i] && t->tab[i]->prio < q->prio && !red_is_idling(&t->tab[i]->vars)) qavg += t->tab[i]->vars.qavg; } } q->packetsin++; q->bytesin += qdisc_pkt_len(skb); if (gred_wred_mode(t)) gred_load_wred_set(t, q); q->vars.qavg = red_calc_qavg(&q->parms, &q->vars, gred_backlog(t, q, sch)); if (red_is_idling(&q->vars)) red_end_of_idle_period(&q->vars); if (gred_wred_mode(t)) gred_store_wred_set(t, q); switch (red_action(&q->parms, &q->vars, q->vars.qavg + qavg)) { case RED_DONT_MARK: break; case RED_PROB_MARK: qdisc_qstats_overlimit(sch); if (!gred_use_ecn(q) || !INET_ECN_set_ce(skb)) { q->stats.prob_drop++; goto congestion_drop; } q->stats.prob_mark++; break; case RED_HARD_MARK: qdisc_qstats_overlimit(sch); if (gred_use_harddrop(q) || !gred_use_ecn(q) || !INET_ECN_set_ce(skb)) { q->stats.forced_drop++; goto congestion_drop; } q->stats.forced_mark++; break; } if (gred_backlog(t, q, sch) + qdisc_pkt_len(skb) <= q->limit) { q->backlog += qdisc_pkt_len(skb); return qdisc_enqueue_tail(skb, sch); } q->stats.pdrop++; drop: return qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_OVERLIMIT); congestion_drop: qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_CONGESTED); return NET_XMIT_CN; } static struct sk_buff *gred_dequeue(struct Qdisc *sch) { struct sk_buff *skb; struct gred_sched *t = qdisc_priv(sch); skb = qdisc_dequeue_head(sch); if (skb) { struct gred_sched_data *q; u16 dp = tc_index_to_dp(skb); if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { net_warn_ratelimited("GRED: Unable to relocate VQ 0x%x after dequeue, screwing up backlog\n", tc_index_to_dp(skb)); } else { q->backlog -= qdisc_pkt_len(skb); if (gred_wred_mode(t)) { if (!sch->qstats.backlog) red_start_of_idle_period(&t->wred_set); } else { if (!q->backlog) red_start_of_idle_period(&q->vars); } } return skb; } return NULL; } static void gred_reset(struct Qdisc *sch) { int i; struct gred_sched *t = qdisc_priv(sch); qdisc_reset_queue(sch); for (i = 0; i < t->DPs; i++) { struct gred_sched_data *q = t->tab[i]; if (!q) continue; red_restart(&q->vars); q->backlog = 0; } } static void gred_offload(struct Qdisc *sch, enum tc_gred_command command) { struct gred_sched *table = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct tc_gred_qopt_offload *opt = table->opt; if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) return; memset(opt, 0, sizeof(*opt)); opt->command = command; opt->handle = sch->handle; opt->parent = sch->parent; if (command == TC_GRED_REPLACE) { unsigned int i; opt->set.grio_on = gred_rio_mode(table); opt->set.wred_on = gred_wred_mode(table); opt->set.dp_cnt = table->DPs; opt->set.dp_def = table->def; for (i = 0; i < table->DPs; i++) { struct gred_sched_data *q = table->tab[i]; if (!q) continue; opt->set.tab[i].present = true; opt->set.tab[i].limit = q->limit; opt->set.tab[i].prio = q->prio; opt->set.tab[i].min = q->parms.qth_min >> q->parms.Wlog; opt->set.tab[i].max = q->parms.qth_max >> q->parms.Wlog; opt->set.tab[i].is_ecn = gred_use_ecn(q); opt->set.tab[i].is_harddrop = gred_use_harddrop(q); opt->set.tab[i].probability = q->parms.max_P; opt->set.tab[i].backlog = &q->backlog; } opt->set.qstats = &sch->qstats; } dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_GRED, opt); } static int gred_offload_dump_stats(struct Qdisc *sch) { struct gred_sched *table = qdisc_priv(sch); struct tc_gred_qopt_offload *hw_stats; u64 bytes = 0, packets = 0; unsigned int i; int ret; hw_stats = kzalloc_obj(*hw_stats); if (!hw_stats) return -ENOMEM; hw_stats->command = TC_GRED_STATS; hw_stats->handle = sch->handle; hw_stats->parent = sch->parent; for (i = 0; i < MAX_DPs; i++) { gnet_stats_basic_sync_init(&hw_stats->stats.bstats[i]); if (table->tab[i]) hw_stats->stats.xstats[i] = &table->tab[i]->stats; } ret = qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_GRED, hw_stats); /* Even if driver returns failure adjust the stats - in case offload * ended but driver still wants to adjust the values. */ sch_tree_lock(sch); for (i = 0; i < MAX_DPs; i++) { if (!table->tab[i]) continue; table->tab[i]->packetsin += u64_stats_read(&hw_stats->stats.bstats[i].packets); table->tab[i]->bytesin += u64_stats_read(&hw_stats->stats.bstats[i].bytes); table->tab[i]->backlog += hw_stats->stats.qstats[i].backlog; bytes += u64_stats_read(&hw_stats->stats.bstats[i].bytes); packets += u64_stats_read(&hw_stats->stats.bstats[i].packets); sch->qstats.qlen += hw_stats->stats.qstats[i].qlen; sch->qstats.backlog += hw_stats->stats.qstats[i].backlog; sch->qstats.drops += hw_stats->stats.qstats[i].drops; sch->qstats.requeues += hw_stats->stats.qstats[i].requeues; sch->qstats.overlimits += hw_stats->stats.qstats[i].overlimits; } _bstats_update(&sch->bstats, bytes, packets); sch_tree_unlock(sch); kfree(hw_stats); return ret; } static inline void gred_destroy_vq(struct gred_sched_data *q) { kfree(q); } static int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps, struct netlink_ext_ack *extack) { struct gred_sched *table = qdisc_priv(sch); struct tc_gred_sopt *sopt; bool red_flags_changed; int i; if (!dps) return -EINVAL; sopt = nla_data(dps); if (sopt->DPs > MAX_DPs) { NL_SET_ERR_MSG_MOD(extack, "number of virtual queues too high"); return -EINVAL; } if (sopt->DPs == 0) { NL_SET_ERR_MSG_MOD(extack, "number of virtual queues can't be 0"); return -EINVAL; } if (sopt->def_DP >= sopt->DPs) { NL_SET_ERR_MSG_MOD(extack, "default virtual queue above virtual queue count"); return -EINVAL; } if (sopt->flags && gred_per_vq_red_flags_used(table)) { NL_SET_ERR_MSG_MOD(extack, "can't set per-Qdisc RED flags when per-virtual queue flags are used"); return -EINVAL; } sch_tree_lock(sch); table->DPs = sopt->DPs; table->def = sopt->def_DP; red_flags_changed = table->red_flags != sopt->flags; table->red_flags = sopt->flags; /* * Every entry point to GRED is synchronized with the above code * and the DP is checked against DPs, i.e. shadowed VQs can no * longer be found so we can unlock right here. */ sch_tree_unlock(sch); if (sopt->grio) { gred_enable_rio_mode(table); gred_disable_wred_mode(table); if (gred_wred_mode_check(sch)) gred_enable_wred_mode(table); } else { gred_disable_rio_mode(table); gred_disable_wred_mode(table); } if (red_flags_changed) for (i = 0; i < table->DPs; i++) if (table->tab[i]) table->tab[i]->red_flags = table->red_flags & GRED_VQ_RED_FLAGS; for (i = table->DPs; i < MAX_DPs; i++) { if (table->tab[i]) { pr_warn("GRED: Warning: Destroying shadowed VQ 0x%x\n", i); gred_destroy_vq(table->tab[i]); table->tab[i] = NULL; } } gred_offload(sch, TC_GRED_REPLACE); return 0; } static inline int gred_change_vq(struct Qdisc *sch, int dp, struct tc_gred_qopt *ctl, int prio, u8 *stab, u32 max_P, struct gred_sched_data **prealloc, struct netlink_ext_ack *extack) { struct gred_sched *table = qdisc_priv(sch); struct gred_sched_data *q = table->tab[dp]; if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log, stab)) { NL_SET_ERR_MSG_MOD(extack, "invalid RED parameters"); return -EINVAL; } if (!q) { table->tab[dp] = q = *prealloc; *prealloc = NULL; if (!q) return -ENOMEM; q->red_flags = table->red_flags & GRED_VQ_RED_FLAGS; } q->DP = dp; q->prio = prio; if (ctl->limit > sch->limit) q->limit = sch->limit; else q->limit = ctl->limit; if (q->backlog == 0) red_end_of_idle_period(&q->vars); red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog, ctl->Scell_log, stab, max_P); red_set_vars(&q->vars); return 0; } static const struct nla_policy gred_vq_policy[TCA_GRED_VQ_MAX + 1] = { [TCA_GRED_VQ_DP] = { .type = NLA_U32 }, [TCA_GRED_VQ_FLAGS] = { .type = NLA_U32 }, }; static const struct nla_policy gred_vqe_policy[TCA_GRED_VQ_ENTRY_MAX + 1] = { [TCA_GRED_VQ_ENTRY] = { .type = NLA_NESTED }, }; static const struct nla_policy gred_policy[TCA_GRED_MAX + 1] = { [TCA_GRED_PARMS] = { .len = sizeof(struct tc_gred_qopt) }, [TCA_GRED_STAB] = { .len = 256 }, [TCA_GRED_DPS] = { .len = sizeof(struct tc_gred_sopt) }, [TCA_GRED_MAX_P] = { .type = NLA_U32 }, [TCA_GRED_LIMIT] = { .type = NLA_U32 }, [TCA_GRED_VQ_LIST] = { .type = NLA_NESTED }, }; static void gred_vq_apply(struct gred_sched *table, const struct nlattr *entry) { struct nlattr *tb[TCA_GRED_VQ_MAX + 1]; u32 dp; nla_parse_nested_deprecated(tb, TCA_GRED_VQ_MAX, entry, gred_vq_policy, NULL); dp = nla_get_u32(tb[TCA_GRED_VQ_DP]); if (tb[TCA_GRED_VQ_FLAGS]) table->tab[dp]->red_flags = nla_get_u32(tb[TCA_GRED_VQ_FLAGS]); } static void gred_vqs_apply(struct gred_sched *table, struct nlattr *vqs) { const struct nlattr *attr; int rem; nla_for_each_nested(attr, vqs, rem) { switch (nla_type(attr)) { case TCA_GRED_VQ_ENTRY: gred_vq_apply(table, attr); break; } } } static int gred_vq_validate(struct gred_sched *table, u32 cdp, const struct nlattr *entry, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_GRED_VQ_MAX + 1]; int err; u32 dp; err = nla_parse_nested_deprecated(tb, TCA_GRED_VQ_MAX, entry, gred_vq_policy, extack); if (err < 0) return err; if (!tb[TCA_GRED_VQ_DP]) { NL_SET_ERR_MSG_MOD(extack, "Virtual queue with no index specified"); return -EINVAL; } dp = nla_get_u32(tb[TCA_GRED_VQ_DP]); if (dp >= table->DPs) { NL_SET_ERR_MSG_MOD(extack, "Virtual queue with index out of bounds"); return -EINVAL; } if (dp != cdp && !table->tab[dp]) { NL_SET_ERR_MSG_MOD(extack, "Virtual queue not yet instantiated"); return -EINVAL; } if (tb[TCA_GRED_VQ_FLAGS]) { u32 red_flags = nla_get_u32(tb[TCA_GRED_VQ_FLAGS]); if (table->red_flags && table->red_flags != red_flags) { NL_SET_ERR_MSG_MOD(extack, "can't change per-virtual queue RED flags when per-Qdisc flags are used"); return -EINVAL; } if (red_flags & ~GRED_VQ_RED_FLAGS) { NL_SET_ERR_MSG_MOD(extack, "invalid RED flags specified"); return -EINVAL; } } return 0; } static int gred_vqs_validate(struct gred_sched *table, u32 cdp, struct nlattr *vqs, struct netlink_ext_ack *extack) { const struct nlattr *attr; int rem, err; err = nla_validate_nested_deprecated(vqs, TCA_GRED_VQ_ENTRY_MAX, gred_vqe_policy, extack); if (err < 0) return err; nla_for_each_nested(attr, vqs, rem) { switch (nla_type(attr)) { case TCA_GRED_VQ_ENTRY: err = gred_vq_validate(table, cdp, attr, extack); if (err) return err; break; default: NL_SET_ERR_MSG_MOD(extack, "GRED_VQ_LIST can contain only entry attributes"); return -EINVAL; } } if (rem > 0) { NL_SET_ERR_MSG_MOD(extack, "Trailing data after parsing virtual queue list"); return -EINVAL; } return 0; } static int gred_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct gred_sched *table = qdisc_priv(sch); struct tc_gred_qopt *ctl; struct nlattr *tb[TCA_GRED_MAX + 1]; int err, prio = GRED_DEF_PRIO; u8 *stab; u32 max_P; struct gred_sched_data *prealloc; err = nla_parse_nested_deprecated(tb, TCA_GRED_MAX, opt, gred_policy, extack); if (err < 0) return err; if (tb[TCA_GRED_PARMS] == NULL && tb[TCA_GRED_STAB] == NULL) { if (tb[TCA_GRED_LIMIT] != NULL) sch->limit = nla_get_u32(tb[TCA_GRED_LIMIT]); return gred_change_table_def(sch, tb[TCA_GRED_DPS], extack); } if (tb[TCA_GRED_PARMS] == NULL || tb[TCA_GRED_STAB] == NULL || tb[TCA_GRED_LIMIT] != NULL) { NL_SET_ERR_MSG_MOD(extack, "can't configure Qdisc and virtual queue at the same time"); return -EINVAL; } max_P = nla_get_u32_default(tb[TCA_GRED_MAX_P], 0); ctl = nla_data(tb[TCA_GRED_PARMS]); stab = nla_data(tb[TCA_GRED_STAB]); if (ctl->DP >= table->DPs) { NL_SET_ERR_MSG_MOD(extack, "virtual queue index above virtual queue count"); return -EINVAL; } if (tb[TCA_GRED_VQ_LIST]) { err = gred_vqs_validate(table, ctl->DP, tb[TCA_GRED_VQ_LIST], extack); if (err) return err; } if (gred_rio_mode(table)) { if (ctl->prio == 0) { int def_prio = GRED_DEF_PRIO; if (table->tab[table->def]) def_prio = table->tab[table->def]->prio; printk(KERN_DEBUG "GRED: DP %u does not have a prio " "setting default to %d\n", ctl->DP, def_prio); prio = def_prio; } else prio = ctl->prio; } prealloc = kzalloc_obj(*prealloc); sch_tree_lock(sch); err = gred_change_vq(sch, ctl->DP, ctl, prio, stab, max_P, &prealloc, extack); if (err < 0) goto err_unlock_free; if (tb[TCA_GRED_VQ_LIST]) gred_vqs_apply(table, tb[TCA_GRED_VQ_LIST]); if (gred_rio_mode(table)) { gred_disable_wred_mode(table); if (gred_wred_mode_check(sch)) gred_enable_wred_mode(table); } sch_tree_unlock(sch); kfree(prealloc); gred_offload(sch, TC_GRED_REPLACE); return 0; err_unlock_free: sch_tree_unlock(sch); kfree(prealloc); return err; } static int gred_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct gred_sched *table = qdisc_priv(sch); struct nlattr *tb[TCA_GRED_MAX + 1]; int err; if (!opt) return -EINVAL; err = nla_parse_nested_deprecated(tb, TCA_GRED_MAX, opt, gred_policy, extack); if (err < 0) return err; if (tb[TCA_GRED_PARMS] || tb[TCA_GRED_STAB]) { NL_SET_ERR_MSG_MOD(extack, "virtual queue configuration can't be specified at initialization time"); return -EINVAL; } if (tb[TCA_GRED_LIMIT]) sch->limit = nla_get_u32(tb[TCA_GRED_LIMIT]); else sch->limit = qdisc_dev(sch)->tx_queue_len * psched_mtu(qdisc_dev(sch)); if (qdisc_dev(sch)->netdev_ops->ndo_setup_tc) { table->opt = kzalloc_obj(*table->opt); if (!table->opt) return -ENOMEM; } return gred_change_table_def(sch, tb[TCA_GRED_DPS], extack); } static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) { struct gred_sched *table = qdisc_priv(sch); struct nlattr *parms, *vqs, *opts = NULL; int i; u32 max_p[MAX_DPs]; struct tc_gred_sopt sopt = { .DPs = table->DPs, .def_DP = table->def, .grio = gred_rio_mode(table), .flags = table->red_flags, }; if (gred_offload_dump_stats(sch)) goto nla_put_failure; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; if (nla_put(skb, TCA_GRED_DPS, sizeof(sopt), &sopt)) goto nla_put_failure; for (i = 0; i < MAX_DPs; i++) { struct gred_sched_data *q = table->tab[i]; max_p[i] = q ? q->parms.max_P : 0; } if (nla_put(skb, TCA_GRED_MAX_P, sizeof(max_p), max_p)) goto nla_put_failure; if (nla_put_u32(skb, TCA_GRED_LIMIT, sch->limit)) goto nla_put_failure; /* Old style all-in-one dump of VQs */ parms = nla_nest_start_noflag(skb, TCA_GRED_PARMS); if (parms == NULL) goto nla_put_failure; for (i = 0; i < MAX_DPs; i++) { struct gred_sched_data *q = table->tab[i]; struct tc_gred_qopt opt; unsigned long qavg; memset(&opt, 0, sizeof(opt)); if (!q) { /* hack -- fix at some point with proper message This is how we indicate to tc that there is no VQ at this DP */ opt.DP = MAX_DPs + i; goto append_opt; } opt.limit = q->limit; opt.DP = q->DP; opt.backlog = gred_backlog(table, q, sch); opt.prio = q->prio; opt.qth_min = q->parms.qth_min >> q->parms.Wlog; opt.qth_max = q->parms.qth_max >> q->parms.Wlog; opt.Wlog = q->parms.Wlog; opt.Plog = q->parms.Plog; opt.Scell_log = q->parms.Scell_log; opt.early = q->stats.prob_drop; opt.forced = q->stats.forced_drop; opt.pdrop = q->stats.pdrop; opt.packets = q->packetsin; opt.bytesin = q->bytesin; if (gred_wred_mode(table)) gred_load_wred_set(table, q); qavg = red_calc_qavg(&q->parms, &q->vars, q->vars.qavg >> q->parms.Wlog); opt.qave = qavg >> q->parms.Wlog; append_opt: if (nla_append(skb, sizeof(opt), &opt) < 0) goto nla_put_failure; } nla_nest_end(skb, parms); /* Dump the VQs again, in more structured way */ vqs = nla_nest_start_noflag(skb, TCA_GRED_VQ_LIST); if (!vqs) goto nla_put_failure; for (i = 0; i < MAX_DPs; i++) { struct gred_sched_data *q = table->tab[i]; struct nlattr *vq; if (!q) continue; vq = nla_nest_start_noflag(skb, TCA_GRED_VQ_ENTRY); if (!vq) goto nla_put_failure; if (nla_put_u32(skb, TCA_GRED_VQ_DP, q->DP)) goto nla_put_failure; if (nla_put_u32(skb, TCA_GRED_VQ_FLAGS, q->red_flags)) goto nla_put_failure; /* Stats */ if (nla_put_u64_64bit(skb, TCA_GRED_VQ_STAT_BYTES, q->bytesin, TCA_GRED_VQ_PAD)) goto nla_put_failure; if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PACKETS, q->packetsin)) goto nla_put_failure; if (nla_put_u32(skb, TCA_GRED_VQ_STAT_BACKLOG, gred_backlog(table, q, sch))) goto nla_put_failure; if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PROB_DROP, q->stats.prob_drop)) goto nla_put_failure; if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PROB_MARK, q->stats.prob_mark)) goto nla_put_failure; if (nla_put_u32(skb, TCA_GRED_VQ_STAT_FORCED_DROP, q->stats.forced_drop)) goto nla_put_failure; if (nla_put_u32(skb, TCA_GRED_VQ_STAT_FORCED_MARK, q->stats.forced_mark)) goto nla_put_failure; if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PDROP, q->stats.pdrop)) goto nla_put_failure; nla_nest_end(skb, vq); } nla_nest_end(skb, vqs); return nla_nest_end(skb, opts); nla_put_failure: nla_nest_cancel(skb, opts); return -EMSGSIZE; } static void gred_destroy(struct Qdisc *sch) { struct gred_sched *table = qdisc_priv(sch); int i; for (i = 0; i < table->DPs; i++) gred_destroy_vq(table->tab[i]); if (table->opt) gred_offload(sch, TC_GRED_DESTROY); kfree(table->opt); } static struct Qdisc_ops gred_qdisc_ops __read_mostly = { .id = "gred", .priv_size = sizeof(struct gred_sched), .enqueue = gred_enqueue, .dequeue = gred_dequeue, .peek = qdisc_peek_head, .init = gred_init, .reset = gred_reset, .destroy = gred_destroy, .change = gred_change, .dump = gred_dump, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("gred"); static int __init gred_module_init(void) { return register_qdisc(&gred_qdisc_ops); } static void __exit gred_module_exit(void) { unregister_qdisc(&gred_qdisc_ops); } module_init(gred_module_init) module_exit(gred_module_exit) MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Generic Random Early Detection qdisc");
18141 18183 18237 18695 18757 136 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __X86_KERNEL_FPU_CONTEXT_H #define __X86_KERNEL_FPU_CONTEXT_H #include <asm/fpu/xstate.h> #include <asm/trace/fpu.h> /* Functions related to FPU context tracking */ /* * The in-register FPU state for an FPU context on a CPU is assumed to be * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx * matches the FPU. * * If the FPU register state is valid, the kernel can skip restoring the * FPU state from memory. * * Any code that clobbers the FPU registers or updates the in-memory * FPU state for a task MUST let the rest of the kernel know that the * FPU registers are no longer valid for this task. * * Invalidate a resource you control: CPU if using the CPU for something else * (with preemption disabled), FPU for the current task, or a task that * is prevented from running by the current task. */ static inline void __cpu_invalidate_fpregs_state(void) { __this_cpu_write(fpu_fpregs_owner_ctx, NULL); } static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu) { fpu->last_cpu = -1; } static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) { return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; } static inline void fpregs_deactivate(struct fpu *fpu) { __this_cpu_write(fpu_fpregs_owner_ctx, NULL); trace_x86_fpu_regs_deactivated(fpu); } static inline void fpregs_activate(struct fpu *fpu) { __this_cpu_write(fpu_fpregs_owner_ctx, fpu); trace_x86_fpu_regs_activated(fpu); } /* Internal helper for switch_fpu_return() and signal frame setup */ static inline void fpregs_restore_userregs(void) { struct fpu *fpu = x86_task_fpu(current); int cpu = smp_processor_id(); if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_USER_WORKER))) return; if (!fpregs_state_valid(fpu, cpu)) { /* * This restores _all_ xstate which has not been * established yet. * * If PKRU is enabled, then the PKRU value is already * correct because it was either set in switch_to() or in * flush_thread(). So it is excluded because it might be * not up to date in current->thread.fpu->xsave state. * * XFD state is handled in restore_fpregs_from_fpstate(). */ restore_fpregs_from_fpstate(fpu->fpstate, XFEATURE_MASK_FPSTATE); fpregs_activate(fpu); fpu->last_cpu = cpu; } clear_thread_flag(TIF_NEED_FPU_LOAD); } #endif
40 1516 1515 10 10 10 10 10 10 10 10 3 7 3 7 10 10 10 10 7 7 7 7 3 3 3 7 7 7 7 7 7 40 40 40 40 40 40 40 40 40 3 3 3 361 360 7 7 7 7 7 7 7 7 7 7 7 7 7 7 1518 1518 1522 1519 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 // SPDX-License-Identifier: GPL-2.0 /* * Common Block IO controller cgroup interface * * Based on ideas and code from CFQ, CFS and BFQ: * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> * * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> * Paolo Valente <paolo.valente@unimore.it> * * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com> * Nauman Rafique <nauman@google.com> * * For policy-specific per-blkcg data: * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> * Arianna Avanzini <avanzini.arianna@gmail.com> */ #include <linux/ioprio.h> #include <linux/kdev_t.h> #include <linux/module.h> #include <linux/sched/signal.h> #include <linux/err.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/slab.h> #include <linux/delay.h> #include <linux/atomic.h> #include <linux/ctype.h> #include <linux/resume_user_mode.h> #include <linux/psi.h> #include <linux/part_stat.h> #include "blk.h" #include "blk-cgroup.h" #include "blk-ioprio.h" #include "blk-throttle.h" static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu); /* * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation. * blkcg_pol_register_mutex nests outside of it and synchronizes entire * policy [un]register operations including cgroup file additions / * removals. Putting cgroup file registration outside blkcg_pol_mutex * allows grabbing it from cgroup callbacks. */ static DEFINE_MUTEX(blkcg_pol_register_mutex); static DEFINE_MUTEX(blkcg_pol_mutex); struct blkcg blkcg_root; EXPORT_SYMBOL_GPL(blkcg_root); struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css; EXPORT_SYMBOL_GPL(blkcg_root_css); static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */ bool blkcg_debug_stats = false; static DEFINE_RAW_SPINLOCK(blkg_stat_lock); #define BLKG_DESTROY_BATCH_SIZE 64 /* * Lockless lists for tracking IO stats update * * New IO stats are stored in the percpu iostat_cpu within blkcg_gq (blkg). * There are multiple blkg's (one for each block device) attached to each * blkcg. The rstat code keeps track of which cpu has IO stats updated, * but it doesn't know which blkg has the updated stats. If there are many * block devices in a system, the cost of iterating all the blkg's to flush * out the IO stats can be high. To reduce such overhead, a set of percpu * lockless lists (lhead) per blkcg are used to track the set of recently * updated iostat_cpu's since the last flush. An iostat_cpu will be put * onto the lockless list on the update side [blk_cgroup_bio_start()] if * not there yet and then removed when being flushed [blkcg_rstat_flush()]. * References to blkg are gotten and then put back in the process to * protect against blkg removal. * * Return: 0 if successful or -ENOMEM if allocation fails. */ static int init_blkcg_llists(struct blkcg *blkcg) { int cpu; blkcg->lhead = alloc_percpu_gfp(struct llist_head, GFP_KERNEL); if (!blkcg->lhead) return -ENOMEM; for_each_possible_cpu(cpu) init_llist_head(per_cpu_ptr(blkcg->lhead, cpu)); return 0; } /** * blkcg_css - find the current css * * Find the css associated with either the kthread or the current task. * This may return a dying css, so it is up to the caller to use tryget logic * to confirm it is alive and well. */ static struct cgroup_subsys_state *blkcg_css(void) { struct cgroup_subsys_state *css; css = kthread_blkcg(); if (css) return css; return task_css(current, io_cgrp_id); } static void blkg_free_workfn(struct work_struct *work) { struct blkcg_gq *blkg = container_of(work, struct blkcg_gq, free_work); struct request_queue *q = blkg->q; int i; /* * pd_free_fn() can also be called from blkcg_deactivate_policy(), * in order to make sure pd_free_fn() is called in order, the deletion * of the list blkg->q_node is delayed to here from blkg_destroy(), and * blkcg_mutex is used to synchronize blkg_free_workfn() and * blkcg_deactivate_policy(). */ mutex_lock(&q->blkcg_mutex); for (i = 0; i < BLKCG_MAX_POLS; i++) if (blkg->pd[i]) blkcg_policy[i]->pd_free_fn(blkg->pd[i]); if (blkg->parent) blkg_put(blkg->parent); spin_lock_irq(&q->queue_lock); list_del_init(&blkg->q_node); spin_unlock_irq(&q->queue_lock); mutex_unlock(&q->blkcg_mutex); blk_put_queue(q); free_percpu(blkg->iostat_cpu); percpu_ref_exit(&blkg->refcnt); kfree(blkg); } /** * blkg_free - free a blkg * @blkg: blkg to free * * Free @blkg which may be partially allocated. */ static void blkg_free(struct blkcg_gq *blkg) { if (!blkg) return; /* * Both ->pd_free_fn() and request queue's release handler may * sleep, so free us by scheduling one work func */ INIT_WORK(&blkg->free_work, blkg_free_workfn); schedule_work(&blkg->free_work); } static void __blkg_release(struct rcu_head *rcu) { struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head); struct blkcg *blkcg = blkg->blkcg; int cpu; #ifdef CONFIG_BLK_CGROUP_PUNT_BIO WARN_ON(!bio_list_empty(&blkg->async_bios)); #endif /* * Flush all the non-empty percpu lockless lists before releasing * us, given these stat belongs to us. * * blkg_stat_lock is for serializing blkg stat update */ for_each_possible_cpu(cpu) __blkcg_rstat_flush(blkcg, cpu); /* release the blkcg and parent blkg refs this blkg has been holding */ css_put(&blkg->blkcg->css); blkg_free(blkg); } /* * A group is RCU protected, but having an rcu lock does not mean that one * can access all the fields of blkg and assume these are valid. For * example, don't try to follow throtl_data and request queue links. * * Having a reference to blkg under an rcu allows accesses to only values * local to groups like group stats and group rate limits. */ static void blkg_release(struct percpu_ref *ref) { struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt); call_rcu(&blkg->rcu_head, __blkg_release); } #ifdef CONFIG_BLK_CGROUP_PUNT_BIO static struct workqueue_struct *blkcg_punt_bio_wq; static void blkg_async_bio_workfn(struct work_struct *work) { struct blkcg_gq *blkg = container_of(work, struct blkcg_gq, async_bio_work); struct bio_list bios = BIO_EMPTY_LIST; struct bio *bio; struct blk_plug plug; bool need_plug = false; /* as long as there are pending bios, @blkg can't go away */ spin_lock(&blkg->async_bio_lock); bio_list_merge_init(&bios, &blkg->async_bios); spin_unlock(&blkg->async_bio_lock); /* start plug only when bio_list contains at least 2 bios */ if (bios.head && bios.head->bi_next) { need_plug = true; blk_start_plug(&plug); } while ((bio = bio_list_pop(&bios))) submit_bio(bio); if (need_plug) blk_finish_plug(&plug); } /* * When a shared kthread issues a bio for a cgroup, doing so synchronously can * lead to priority inversions as the kthread can be trapped waiting for that * cgroup. Use this helper instead of submit_bio to punt the actual issuing to * a dedicated per-blkcg work item to avoid such priority inversions. */ void blkcg_punt_bio_submit(struct bio *bio) { struct blkcg_gq *blkg = bio->bi_blkg; if (blkg->parent) { spin_lock(&blkg->async_bio_lock); bio_list_add(&blkg->async_bios, bio); spin_unlock(&blkg->async_bio_lock); queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work); } else { /* never bounce for the root cgroup */ submit_bio(bio); } } EXPORT_SYMBOL_GPL(blkcg_punt_bio_submit); static int __init blkcg_punt_bio_init(void) { blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio", WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND | WQ_SYSFS, 0); if (!blkcg_punt_bio_wq) return -ENOMEM; return 0; } subsys_initcall(blkcg_punt_bio_init); #endif /* CONFIG_BLK_CGROUP_PUNT_BIO */ /** * bio_blkcg_css - return the blkcg CSS associated with a bio * @bio: target bio * * This returns the CSS for the blkcg associated with a bio, or %NULL if not * associated. Callers are expected to either handle %NULL or know association * has been done prior to calling this. */ struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio) { if (!bio || !bio->bi_blkg) return NULL; return &bio->bi_blkg->blkcg->css; } EXPORT_SYMBOL_GPL(bio_blkcg_css); /** * blkcg_parent - get the parent of a blkcg * @blkcg: blkcg of interest * * Return the parent blkcg of @blkcg. Can be called anytime. */ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) { return css_to_blkcg(blkcg->css.parent); } /** * blkg_alloc - allocate a blkg * @blkcg: block cgroup the new blkg is associated with * @disk: gendisk the new blkg is associated with * @gfp_mask: allocation mask to use * * Allocate a new blkg associating @blkcg and @disk. */ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk, gfp_t gfp_mask) { struct blkcg_gq *blkg; int i, cpu; /* alloc and init base part */ blkg = kzalloc_node(sizeof(*blkg), gfp_mask, disk->queue->node); if (!blkg) return NULL; if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask)) goto out_free_blkg; blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask); if (!blkg->iostat_cpu) goto out_exit_refcnt; if (!blk_get_queue(disk->queue)) goto out_free_iostat; blkg->q = disk->queue; INIT_LIST_HEAD(&blkg->q_node); blkg->blkcg = blkcg; blkg->iostat.blkg = blkg; #ifdef CONFIG_BLK_CGROUP_PUNT_BIO spin_lock_init(&blkg->async_bio_lock); bio_list_init(&blkg->async_bios); INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn); #endif u64_stats_init(&blkg->iostat.sync); for_each_possible_cpu(cpu) { u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync); per_cpu_ptr(blkg->iostat_cpu, cpu)->blkg = blkg; } for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; struct blkg_policy_data *pd; if (!blkcg_policy_enabled(disk->queue, pol)) continue; /* alloc per-policy data and attach it to blkg */ pd = pol->pd_alloc_fn(disk, blkcg, gfp_mask); if (!pd) goto out_free_pds; blkg->pd[i] = pd; pd->blkg = blkg; pd->plid = i; pd->online = false; } return blkg; out_free_pds: while (--i >= 0) if (blkg->pd[i]) blkcg_policy[i]->pd_free_fn(blkg->pd[i]); blk_put_queue(disk->queue); out_free_iostat: free_percpu(blkg->iostat_cpu); out_exit_refcnt: percpu_ref_exit(&blkg->refcnt); out_free_blkg: kfree(blkg); return NULL; } /* * If @new_blkg is %NULL, this function tries to allocate a new one as * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return. */ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk, struct blkcg_gq *new_blkg) { struct blkcg_gq *blkg; int i, ret; lockdep_assert_held(&disk->queue->queue_lock); /* request_queue is dying, do not create/recreate a blkg */ if (blk_queue_dying(disk->queue)) { ret = -ENODEV; goto err_free_blkg; } /* blkg holds a reference to blkcg */ if (!css_tryget_online(&blkcg->css)) { ret = -ENODEV; goto err_free_blkg; } /* allocate */ if (!new_blkg) { new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT); if (unlikely(!new_blkg)) { ret = -ENOMEM; goto err_put_css; } } blkg = new_blkg; /* link parent */ if (blkcg_parent(blkcg)) { blkg->parent = blkg_lookup(blkcg_parent(blkcg), disk->queue); if (WARN_ON_ONCE(!blkg->parent)) { ret = -ENODEV; goto err_put_css; } blkg_get(blkg->parent); } /* invoke per-policy init */ for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_init_fn) pol->pd_init_fn(blkg->pd[i]); } /* insert */ spin_lock(&blkcg->lock); ret = radix_tree_insert(&blkcg->blkg_tree, disk->queue->id, blkg); if (likely(!ret)) { hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); list_add(&blkg->q_node, &disk->queue->blkg_list); for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i]) { if (pol->pd_online_fn) pol->pd_online_fn(blkg->pd[i]); blkg->pd[i]->online = true; } } } blkg->online = true; spin_unlock(&blkcg->lock); if (!ret) return blkg; /* @blkg failed fully initialized, use the usual release path */ blkg_put(blkg); return ERR_PTR(ret); err_put_css: css_put(&blkcg->css); err_free_blkg: if (new_blkg) blkg_free(new_blkg); return ERR_PTR(ret); } /** * blkg_lookup_create - lookup blkg, try to create one if not there * @blkcg: blkcg of interest * @disk: gendisk of interest * * Lookup blkg for the @blkcg - @disk pair. If it doesn't exist, try to * create one. blkg creation is performed recursively from blkcg_root such * that all non-root blkg's have access to the parent blkg. This function * should be called under RCU read lock and takes @disk->queue->queue_lock. * * Returns the blkg or the closest blkg if blkg_create() fails as it walks * down from root. */ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, struct gendisk *disk) { struct request_queue *q = disk->queue; struct blkcg_gq *blkg; unsigned long flags; WARN_ON_ONCE(!rcu_read_lock_held()); blkg = blkg_lookup(blkcg, q); if (blkg) return blkg; spin_lock_irqsave(&q->queue_lock, flags); blkg = blkg_lookup(blkcg, q); if (blkg) { if (blkcg != &blkcg_root && blkg != rcu_dereference(blkcg->blkg_hint)) rcu_assign_pointer(blkcg->blkg_hint, blkg); goto found; } /* * Create blkgs walking down from blkcg_root to @blkcg, so that all * non-root blkgs have access to their parents. Returns the closest * blkg to the intended blkg should blkg_create() fail. */ while (true) { struct blkcg *pos = blkcg; struct blkcg *parent = blkcg_parent(blkcg); struct blkcg_gq *ret_blkg = q->root_blkg; while (parent) { blkg = blkg_lookup(parent, q); if (blkg) { /* remember closest blkg */ ret_blkg = blkg; break; } pos = parent; parent = blkcg_parent(parent); } blkg = blkg_create(pos, disk, NULL); if (IS_ERR(blkg)) { blkg = ret_blkg; break; } if (pos == blkcg) break; } found: spin_unlock_irqrestore(&q->queue_lock, flags); return blkg; } static void blkg_destroy(struct blkcg_gq *blkg) { struct blkcg *blkcg = blkg->blkcg; int i; lockdep_assert_held(&blkg->q->queue_lock); lockdep_assert_held(&blkcg->lock); /* * blkg stays on the queue list until blkg_free_workfn(), see details in * blkg_free_workfn(), hence this function can be called from * blkcg_destroy_blkgs() first and again from blkg_destroy_all() before * blkg_free_workfn(). */ if (hlist_unhashed(&blkg->blkcg_node)) return; for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && blkg->pd[i]->online) { blkg->pd[i]->online = false; if (pol->pd_offline_fn) pol->pd_offline_fn(blkg->pd[i]); } } blkg->online = false; radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); hlist_del_init_rcu(&blkg->blkcg_node); /* * Both setting lookup hint to and clearing it from @blkg are done * under queue_lock. If it's not pointing to @blkg now, it never * will. Hint assignment itself can race safely. */ if (rcu_access_pointer(blkcg->blkg_hint) == blkg) rcu_assign_pointer(blkcg->blkg_hint, NULL); /* * Put the reference taken at the time of creation so that when all * queues are gone, group can be destroyed. */ percpu_ref_kill(&blkg->refcnt); } static void blkg_destroy_all(struct gendisk *disk) { struct request_queue *q = disk->queue; struct blkcg_gq *blkg; int count = BLKG_DESTROY_BATCH_SIZE; int i; restart: spin_lock_irq(&q->queue_lock); list_for_each_entry(blkg, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; if (hlist_unhashed(&blkg->blkcg_node)) continue; spin_lock(&blkcg->lock); blkg_destroy(blkg); spin_unlock(&blkcg->lock); /* * in order to avoid holding the spin lock for too long, release * it when a batch of blkgs are destroyed. */ if (!(--count)) { count = BLKG_DESTROY_BATCH_SIZE; spin_unlock_irq(&q->queue_lock); cond_resched(); goto restart; } } /* * Mark policy deactivated since policy offline has been done, and * the free is scheduled, so future blkcg_deactivate_policy() can * be bypassed */ for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (pol) __clear_bit(pol->plid, q->blkcg_pols); } q->root_blkg = NULL; spin_unlock_irq(&q->queue_lock); } static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) { int i; for (i = 0; i < BLKG_IOSTAT_NR; i++) { dst->bytes[i] = src->bytes[i]; dst->ios[i] = src->ios[i]; } } static void __blkg_clear_stat(struct blkg_iostat_set *bis) { struct blkg_iostat cur = {0}; unsigned long flags; flags = u64_stats_update_begin_irqsave(&bis->sync); blkg_iostat_set(&bis->cur, &cur); blkg_iostat_set(&bis->last, &cur); u64_stats_update_end_irqrestore(&bis->sync, flags); } static void blkg_clear_stat(struct blkcg_gq *blkg) { int cpu; for_each_possible_cpu(cpu) { struct blkg_iostat_set *s = per_cpu_ptr(blkg->iostat_cpu, cpu); __blkg_clear_stat(s); } __blkg_clear_stat(&blkg->iostat); } static int blkcg_reset_stats(struct cgroup_subsys_state *css, struct cftype *cftype, u64 val) { struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; int i; pr_info_once("blkio.%s is deprecated\n", cftype->name); mutex_lock(&blkcg_pol_mutex); spin_lock_irq(&blkcg->lock); /* * Note that stat reset is racy - it doesn't synchronize against * stat updates. This is a debug feature which shouldn't exist * anyway. If you get hit by a race, retry. */ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { blkg_clear_stat(blkg); for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_reset_stats_fn) pol->pd_reset_stats_fn(blkg->pd[i]); } } spin_unlock_irq(&blkcg->lock); mutex_unlock(&blkcg_pol_mutex); return 0; } const char *blkg_dev_name(struct blkcg_gq *blkg) { if (!blkg->q->disk) return NULL; return bdi_dev_name(blkg->q->disk->bdi); } /** * blkcg_print_blkgs - helper for printing per-blkg data * @sf: seq_file to print to * @blkcg: blkcg of interest * @prfill: fill function to print out a blkg * @pol: policy in question * @data: data to be passed to @prfill * @show_total: to print out sum of prfill return values or not * * This function invokes @prfill on each blkg of @blkcg if pd for the * policy specified by @pol exists. @prfill is invoked with @sf, the * policy data and @data and the matching queue lock held. If @show_total * is %true, the sum of the return values from @prfill is printed with * "Total" label at the end. * * This is to be used to construct print functions for * cftype->read_seq_string method. */ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, u64 (*prfill)(struct seq_file *, struct blkg_policy_data *, int), const struct blkcg_policy *pol, int data, bool show_total) { struct blkcg_gq *blkg; u64 total = 0; rcu_read_lock(); hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { spin_lock_irq(&blkg->q->queue_lock); if (blkcg_policy_enabled(blkg->q, pol)) total += prfill(sf, blkg->pd[pol->plid], data); spin_unlock_irq(&blkg->q->queue_lock); } rcu_read_unlock(); if (show_total) seq_printf(sf, "Total %llu\n", (unsigned long long)total); } EXPORT_SYMBOL_GPL(blkcg_print_blkgs); /** * __blkg_prfill_u64 - prfill helper for a single u64 value * @sf: seq_file to print to * @pd: policy private data of interest * @v: value to print * * Print @v to @sf for the device associated with @pd. */ u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v) { const char *dname = blkg_dev_name(pd->blkg); if (!dname) return 0; seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v); return v; } EXPORT_SYMBOL_GPL(__blkg_prfill_u64); /** * blkg_conf_init - initialize a blkg_conf_ctx * @ctx: blkg_conf_ctx to initialize * @input: input string * * Initialize @ctx which can be used to parse blkg config input string @input. * Once initialized, @ctx can be used with blkg_conf_open_bdev() and * blkg_conf_prep(), and must be cleaned up with blkg_conf_exit(). */ void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input) { *ctx = (struct blkg_conf_ctx){ .input = input }; } EXPORT_SYMBOL_GPL(blkg_conf_init); /** * blkg_conf_open_bdev - parse and open bdev for per-blkg config update * @ctx: blkg_conf_ctx initialized with blkg_conf_init() * * Parse the device node prefix part, MAJ:MIN, of per-blkg config update from * @ctx->input and get and store the matching bdev in @ctx->bdev. @ctx->body is * set to point past the device node prefix. * * This function may be called multiple times on @ctx and the extra calls become * NOOPs. blkg_conf_prep() implicitly calls this function. Use this function * explicitly if bdev access is needed without resolving the blkcg / policy part * of @ctx->input. Returns -errno on error. */ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx) { char *input = ctx->input; unsigned int major, minor; struct block_device *bdev; int key_len; if (ctx->bdev) return 0; if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) return -EINVAL; input += key_len; if (!isspace(*input)) return -EINVAL; input = skip_spaces(input); bdev = blkdev_get_no_open(MKDEV(major, minor), false); if (!bdev) return -ENODEV; if (bdev_is_partition(bdev)) { blkdev_put_no_open(bdev); return -ENODEV; } mutex_lock(&bdev->bd_queue->rq_qos_mutex); if (!disk_live(bdev->bd_disk)) { blkdev_put_no_open(bdev); mutex_unlock(&bdev->bd_queue->rq_qos_mutex); return -ENODEV; } ctx->body = input; ctx->bdev = bdev; return 0; } /* * Similar to blkg_conf_open_bdev, but additionally freezes the queue, * ensures the correct locking order between freeze queue and q->rq_qos_mutex. * * This function returns negative error on failure. On success it returns * memflags which must be saved and later passed to blkg_conf_exit_frozen * for restoring the memalloc scope. */ unsigned long __must_check blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx) { int ret; unsigned long memflags; if (ctx->bdev) return -EINVAL; ret = blkg_conf_open_bdev(ctx); if (ret < 0) return ret; /* * At this point, we haven’t started protecting anything related to QoS, * so we release q->rq_qos_mutex here, which was first acquired in blkg_ * conf_open_bdev. Later, we re-acquire q->rq_qos_mutex after freezing * the queue to maintain the correct locking order. */ mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex); memflags = blk_mq_freeze_queue(ctx->bdev->bd_queue); mutex_lock(&ctx->bdev->bd_queue->rq_qos_mutex); return memflags; } /** * blkg_conf_prep - parse and prepare for per-blkg config update * @blkcg: target block cgroup * @pol: target policy * @ctx: blkg_conf_ctx initialized with blkg_conf_init() * * Parse per-blkg config update from @ctx->input and initialize @ctx * accordingly. On success, @ctx->body points to the part of @ctx->input * following MAJ:MIN, @ctx->bdev points to the target block device and * @ctx->blkg to the blkg being configured. * * blkg_conf_open_bdev() may be called on @ctx beforehand. On success, this * function returns with queue lock held and must be followed by * blkg_conf_exit(). */ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, struct blkg_conf_ctx *ctx) __acquires(&bdev->bd_queue->queue_lock) { struct gendisk *disk; struct request_queue *q; struct blkcg_gq *blkg; int ret; ret = blkg_conf_open_bdev(ctx); if (ret) return ret; disk = ctx->bdev->bd_disk; q = disk->queue; /* Prevent concurrent with blkcg_deactivate_policy() */ mutex_lock(&q->blkcg_mutex); spin_lock_irq(&q->queue_lock); if (!blkcg_policy_enabled(q, pol)) { ret = -EOPNOTSUPP; goto fail_unlock; } blkg = blkg_lookup(blkcg, q); if (blkg) goto success; /* * Create blkgs walking down from blkcg_root to @blkcg, so that all * non-root blkgs have access to their parents. */ while (true) { struct blkcg *pos = blkcg; struct blkcg *parent; struct blkcg_gq *new_blkg; parent = blkcg_parent(blkcg); while (parent && !blkg_lookup(parent, q)) { pos = parent; parent = blkcg_parent(parent); } /* Drop locks to do new blkg allocation with GFP_KERNEL. */ spin_unlock_irq(&q->queue_lock); new_blkg = blkg_alloc(pos, disk, GFP_NOIO); if (unlikely(!new_blkg)) { ret = -ENOMEM; goto fail_exit; } if (radix_tree_preload(GFP_KERNEL)) { blkg_free(new_blkg); ret = -ENOMEM; goto fail_exit; } spin_lock_irq(&q->queue_lock); if (!blkcg_policy_enabled(q, pol)) { blkg_free(new_blkg); ret = -EOPNOTSUPP; goto fail_preloaded; } blkg = blkg_lookup(pos, q); if (blkg) { blkg_free(new_blkg); } else { blkg = blkg_create(pos, disk, new_blkg); if (IS_ERR(blkg)) { ret = PTR_ERR(blkg); goto fail_preloaded; } } radix_tree_preload_end(); if (pos == blkcg) goto success; } success: mutex_unlock(&q->blkcg_mutex); ctx->blkg = blkg; return 0; fail_preloaded: radix_tree_preload_end(); fail_unlock: spin_unlock_irq(&q->queue_lock); fail_exit: mutex_unlock(&q->blkcg_mutex); /* * If queue was bypassing, we should retry. Do so after a * short msleep(). It isn't strictly necessary but queue * can be bypassing for some time and it's always nice to * avoid busy looping. */ if (ret == -EBUSY) { msleep(10); ret = restart_syscall(); } return ret; } EXPORT_SYMBOL_GPL(blkg_conf_prep); /** * blkg_conf_exit - clean up per-blkg config update * @ctx: blkg_conf_ctx initialized with blkg_conf_init() * * Clean up after per-blkg config update. This function must be called on all * blkg_conf_ctx's initialized with blkg_conf_init(). */ void blkg_conf_exit(struct blkg_conf_ctx *ctx) __releases(&ctx->bdev->bd_queue->queue_lock) __releases(&ctx->bdev->bd_queue->rq_qos_mutex) { if (ctx->blkg) { spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock); ctx->blkg = NULL; } if (ctx->bdev) { mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex); blkdev_put_no_open(ctx->bdev); ctx->body = NULL; ctx->bdev = NULL; } } EXPORT_SYMBOL_GPL(blkg_conf_exit); /* * Similar to blkg_conf_exit, but also unfreezes the queue. Should be used * when blkg_conf_open_bdev_frozen is used to open the bdev. */ void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags) { if (ctx->bdev) { struct request_queue *q = ctx->bdev->bd_queue; blkg_conf_exit(ctx); blk_mq_unfreeze_queue(q, memflags); } } static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src) { int i; for (i = 0; i < BLKG_IOSTAT_NR; i++) { dst->bytes[i] += src->bytes[i]; dst->ios[i] += src->ios[i]; } } static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src) { int i; for (i = 0; i < BLKG_IOSTAT_NR; i++) { dst->bytes[i] -= src->bytes[i]; dst->ios[i] -= src->ios[i]; } } static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur, struct blkg_iostat *last) { struct blkg_iostat delta; unsigned long flags; /* propagate percpu delta to global */ flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync); blkg_iostat_set(&delta, cur); blkg_iostat_sub(&delta, last); blkg_iostat_add(&blkg->iostat.cur, &delta); blkg_iostat_add(last, &delta); u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); } static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu) { struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu); struct llist_node *lnode; struct blkg_iostat_set *bisc, *next_bisc; unsigned long flags; rcu_read_lock(); lnode = llist_del_all(lhead); if (!lnode) goto out; /* * For covering concurrent parent blkg update from blkg_release(). * * When flushing from cgroup, the subsystem rstat lock is always held, * so this lock won't cause contention most of time. */ raw_spin_lock_irqsave(&blkg_stat_lock, flags); /* * Iterate only the iostat_cpu's queued in the lockless list. */ llist_for_each_entry_safe(bisc, next_bisc, lnode, lnode) { struct blkcg_gq *blkg = bisc->blkg; struct blkcg_gq *parent = blkg->parent; struct blkg_iostat cur; unsigned int seq; /* * Order assignment of `next_bisc` from `bisc->lnode.next` in * llist_for_each_entry_safe and clearing `bisc->lqueued` for * avoiding to assign `next_bisc` with new next pointer added * in blk_cgroup_bio_start() in case of re-ordering. * * The pair barrier is implied in llist_add() in blk_cgroup_bio_start(). */ smp_mb(); WRITE_ONCE(bisc->lqueued, false); if (bisc == &blkg->iostat) goto propagate_up; /* propagate up to parent only */ /* fetch the current per-cpu values */ do { seq = u64_stats_fetch_begin(&bisc->sync); blkg_iostat_set(&cur, &bisc->cur); } while (u64_stats_fetch_retry(&bisc->sync, seq)); blkcg_iostat_update(blkg, &cur, &bisc->last); propagate_up: /* propagate global delta to parent (unless that's root) */ if (parent && parent->parent) { blkcg_iostat_update(parent, &blkg->iostat.cur, &blkg->iostat.last); /* * Queue parent->iostat to its blkcg's lockless * list to propagate up to the grandparent if the * iostat hasn't been queued yet. */ if (!parent->iostat.lqueued) { struct llist_head *plhead; plhead = per_cpu_ptr(parent->blkcg->lhead, cpu); llist_add(&parent->iostat.lnode, plhead); parent->iostat.lqueued = true; } } } raw_spin_unlock_irqrestore(&blkg_stat_lock, flags); out: rcu_read_unlock(); } static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) { /* Root-level stats are sourced from system-wide IO stats */ if (cgroup_parent(css->cgroup)) __blkcg_rstat_flush(css_to_blkcg(css), cpu); } /* * We source root cgroup stats from the system-wide stats to avoid * tracking the same information twice and incurring overhead when no * cgroups are defined. For that reason, css_rstat_flush in * blkcg_print_stat does not actually fill out the iostat in the root * cgroup's blkcg_gq. * * However, we would like to re-use the printing code between the root and * non-root cgroups to the extent possible. For that reason, we simulate * flushing the root cgroup's stats by explicitly filling in the iostat * with disk level statistics. */ static void blkcg_fill_root_iostats(void) { struct class_dev_iter iter; struct device *dev; class_dev_iter_init(&iter, &block_class, NULL, &disk_type); while ((dev = class_dev_iter_next(&iter))) { struct block_device *bdev = dev_to_bdev(dev); struct blkcg_gq *blkg = bdev->bd_disk->queue->root_blkg; struct blkg_iostat tmp; int cpu; unsigned long flags; memset(&tmp, 0, sizeof(tmp)); for_each_possible_cpu(cpu) { struct disk_stats *cpu_dkstats; cpu_dkstats = per_cpu_ptr(bdev->bd_stats, cpu); tmp.ios[BLKG_IOSTAT_READ] += cpu_dkstats->ios[STAT_READ]; tmp.ios[BLKG_IOSTAT_WRITE] += cpu_dkstats->ios[STAT_WRITE]; tmp.ios[BLKG_IOSTAT_DISCARD] += cpu_dkstats->ios[STAT_DISCARD]; // convert sectors to bytes tmp.bytes[BLKG_IOSTAT_READ] += cpu_dkstats->sectors[STAT_READ] << 9; tmp.bytes[BLKG_IOSTAT_WRITE] += cpu_dkstats->sectors[STAT_WRITE] << 9; tmp.bytes[BLKG_IOSTAT_DISCARD] += cpu_dkstats->sectors[STAT_DISCARD] << 9; } flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync); blkg_iostat_set(&blkg->iostat.cur, &tmp); u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); } class_dev_iter_exit(&iter); } static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) { struct blkg_iostat_set *bis = &blkg->iostat; u64 rbytes, wbytes, rios, wios, dbytes, dios; const char *dname; unsigned seq; int i; if (!blkg->online) return; dname = blkg_dev_name(blkg); if (!dname) return; seq_printf(s, "%s ", dname); do { seq = u64_stats_fetch_begin(&bis->sync); rbytes = bis->cur.bytes[BLKG_IOSTAT_READ]; wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE]; dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD]; rios = bis->cur.ios[BLKG_IOSTAT_READ]; wios = bis->cur.ios[BLKG_IOSTAT_WRITE]; dios = bis->cur.ios[BLKG_IOSTAT_DISCARD]; } while (u64_stats_fetch_retry(&bis->sync, seq)); if (rbytes || wbytes || rios || wios) { seq_printf(s, "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu", rbytes, wbytes, rios, wios, dbytes, dios); } if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) { seq_printf(s, " use_delay=%d delay_nsec=%llu", atomic_read(&blkg->use_delay), atomic64_read(&blkg->delay_nsec)); } for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (!blkg->pd[i] || !pol->pd_stat_fn) continue; pol->pd_stat_fn(blkg->pd[i], s); } seq_puts(s, "\n"); } static int blkcg_print_stat(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct blkcg_gq *blkg; if (!seq_css(sf)->parent) blkcg_fill_root_iostats(); else css_rstat_flush(&blkcg->css); rcu_read_lock(); hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { spin_lock_irq(&blkg->q->queue_lock); blkcg_print_one_stat(blkg, sf); spin_unlock_irq(&blkg->q->queue_lock); } rcu_read_unlock(); return 0; } static struct cftype blkcg_files[] = { { .name = "stat", .seq_show = blkcg_print_stat, }, { } /* terminate */ }; static struct cftype blkcg_legacy_files[] = { { .name = "reset_stats", .write_u64 = blkcg_reset_stats, }, { } /* terminate */ }; #ifdef CONFIG_CGROUP_WRITEBACK struct list_head *blkcg_get_cgwb_list(struct cgroup_subsys_state *css) { return &css_to_blkcg(css)->cgwb_list; } #endif /* * blkcg destruction is a three-stage process. * * 1. Destruction starts. The blkcg_css_offline() callback is invoked * which offlines writeback. Here we tie the next stage of blkg destruction * to the completion of writeback associated with the blkcg. This lets us * avoid punting potentially large amounts of outstanding writeback to root * while maintaining any ongoing policies. The next stage is triggered when * the nr_cgwbs count goes to zero. * * 2. When the nr_cgwbs count goes to zero, blkcg_destroy_blkgs() is called * and handles the destruction of blkgs. Here the css reference held by * the blkg is put back eventually allowing blkcg_css_free() to be called. * This work may occur in cgwb_release_workfn() on the cgwb_release * workqueue. Any submitted ios that fail to get the blkg ref will be * punted to the root_blkg. * * 3. Once the blkcg ref count goes to zero, blkcg_css_free() is called. * This finally frees the blkcg. */ /** * blkcg_destroy_blkgs - responsible for shooting down blkgs * @blkcg: blkcg of interest * * blkgs should be removed while holding both q and blkcg locks. As blkcg lock * is nested inside q lock, this function performs reverse double lock dancing. * Destroying the blkgs releases the reference held on the blkcg's css allowing * blkcg_css_free to eventually be called. * * This is the blkcg counterpart of ioc_release_fn(). */ static void blkcg_destroy_blkgs(struct blkcg *blkcg) { might_sleep(); spin_lock_irq(&blkcg->lock); while (!hlist_empty(&blkcg->blkg_list)) { struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first, struct blkcg_gq, blkcg_node); struct request_queue *q = blkg->q; if (need_resched() || !spin_trylock(&q->queue_lock)) { /* * Given that the system can accumulate a huge number * of blkgs in pathological cases, check to see if we * need to rescheduling to avoid softlockup. */ spin_unlock_irq(&blkcg->lock); cond_resched(); spin_lock_irq(&blkcg->lock); continue; } blkg_destroy(blkg); spin_unlock(&q->queue_lock); } spin_unlock_irq(&blkcg->lock); } /** * blkcg_pin_online - pin online state * @blkcg_css: blkcg of interest * * While pinned, a blkcg is kept online. This is primarily used to * impedance-match blkg and cgwb lifetimes so that blkg doesn't go offline * while an associated cgwb is still active. */ void blkcg_pin_online(struct cgroup_subsys_state *blkcg_css) { refcount_inc(&css_to_blkcg(blkcg_css)->online_pin); } /** * blkcg_unpin_online - unpin online state * @blkcg_css: blkcg of interest * * This is primarily used to impedance-match blkg and cgwb lifetimes so * that blkg doesn't go offline while an associated cgwb is still active. * When this count goes to zero, all active cgwbs have finished so the * blkcg can continue destruction by calling blkcg_destroy_blkgs(). */ void blkcg_unpin_online(struct cgroup_subsys_state *blkcg_css) { struct blkcg *blkcg = css_to_blkcg(blkcg_css); do { struct blkcg *parent; if (!refcount_dec_and_test(&blkcg->online_pin)) break; parent = blkcg_parent(blkcg); blkcg_destroy_blkgs(blkcg); blkcg = parent; } while (blkcg); } /** * blkcg_css_offline - cgroup css_offline callback * @css: css of interest * * This function is called when @css is about to go away. Here the cgwbs are * offlined first and only once writeback associated with the blkcg has * finished do we start step 2 (see above). */ static void blkcg_css_offline(struct cgroup_subsys_state *css) { /* this prevents anyone from attaching or migrating to this blkcg */ wb_blkcg_offline(css); /* put the base online pin allowing step 2 to be triggered */ blkcg_unpin_online(css); } static void blkcg_css_free(struct cgroup_subsys_state *css) { struct blkcg *blkcg = css_to_blkcg(css); int i; mutex_lock(&blkcg_pol_mutex); list_del(&blkcg->all_blkcgs_node); for (i = 0; i < BLKCG_MAX_POLS; i++) if (blkcg->cpd[i]) blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); mutex_unlock(&blkcg_pol_mutex); free_percpu(blkcg->lhead); kfree(blkcg); } static struct cgroup_subsys_state * blkcg_css_alloc(struct cgroup_subsys_state *parent_css) { struct blkcg *blkcg; int i; mutex_lock(&blkcg_pol_mutex); if (!parent_css) { blkcg = &blkcg_root; } else { blkcg = kzalloc_obj(*blkcg); if (!blkcg) goto unlock; } if (init_blkcg_llists(blkcg)) goto free_blkcg; for (i = 0; i < BLKCG_MAX_POLS ; i++) { struct blkcg_policy *pol = blkcg_policy[i]; struct blkcg_policy_data *cpd; /* * If the policy hasn't been attached yet, wait for it * to be attached before doing anything else. Otherwise, * check if the policy requires any specific per-cgroup * data: if it does, allocate and initialize it. */ if (!pol || !pol->cpd_alloc_fn) continue; cpd = pol->cpd_alloc_fn(GFP_KERNEL); if (!cpd) goto free_pd_blkcg; blkcg->cpd[i] = cpd; cpd->blkcg = blkcg; cpd->plid = i; } spin_lock_init(&blkcg->lock); refcount_set(&blkcg->online_pin, 1); INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT); INIT_HLIST_HEAD(&blkcg->blkg_list); #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&blkcg->cgwb_list); #endif list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs); mutex_unlock(&blkcg_pol_mutex); return &blkcg->css; free_pd_blkcg: for (i--; i >= 0; i--) if (blkcg->cpd[i]) blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); free_percpu(blkcg->lhead); free_blkcg: if (blkcg != &blkcg_root) kfree(blkcg); unlock: mutex_unlock(&blkcg_pol_mutex); return ERR_PTR(-ENOMEM); } static int blkcg_css_online(struct cgroup_subsys_state *css) { struct blkcg *parent = blkcg_parent(css_to_blkcg(css)); /* * blkcg_pin_online() is used to delay blkcg offline so that blkgs * don't go offline while cgwbs are still active on them. Pin the * parent so that offline always happens towards the root. */ if (parent) blkcg_pin_online(&parent->css); return 0; } void blkg_init_queue(struct request_queue *q) { INIT_LIST_HEAD(&q->blkg_list); mutex_init(&q->blkcg_mutex); } int blkcg_init_disk(struct gendisk *disk) { struct request_queue *q = disk->queue; struct blkcg_gq *new_blkg, *blkg; bool preloaded; new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL); if (!new_blkg) return -ENOMEM; preloaded = !radix_tree_preload(GFP_KERNEL); /* Make sure the root blkg exists. */ /* spin_lock_irq can serve as RCU read-side critical section. */ spin_lock_irq(&q->queue_lock); blkg = blkg_create(&blkcg_root, disk, new_blkg); if (IS_ERR(blkg)) goto err_unlock; q->root_blkg = blkg; spin_unlock_irq(&q->queue_lock); if (preloaded) radix_tree_preload_end(); return 0; err_unlock: spin_unlock_irq(&q->queue_lock); if (preloaded) radix_tree_preload_end(); return PTR_ERR(blkg); } void blkcg_exit_disk(struct gendisk *disk) { blkg_destroy_all(disk); blk_throtl_exit(disk); } static void blkcg_exit(struct task_struct *tsk) { if (tsk->throttle_disk) put_disk(tsk->throttle_disk); tsk->throttle_disk = NULL; } struct cgroup_subsys io_cgrp_subsys = { .css_alloc = blkcg_css_alloc, .css_online = blkcg_css_online, .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .css_rstat_flush = blkcg_rstat_flush, .dfl_cftypes = blkcg_files, .legacy_cftypes = blkcg_legacy_files, .legacy_name = "blkio", .exit = blkcg_exit, #ifdef CONFIG_MEMCG /* * This ensures that, if available, memcg is automatically enabled * together on the default hierarchy so that the owner cgroup can * be retrieved from writeback pages. */ .depends_on = 1 << memory_cgrp_id, #endif }; EXPORT_SYMBOL_GPL(io_cgrp_subsys); /** * blkcg_activate_policy - activate a blkcg policy on a gendisk * @disk: gendisk of interest * @pol: blkcg policy to activate * * Activate @pol on @disk. Requires %GFP_KERNEL context. @disk goes through * bypass mode to populate its blkgs with policy_data for @pol. * * Activation happens with @disk bypassed, so nobody would be accessing blkgs * from IO path. Update of each blkg is protected by both queue and blkcg * locks so that holding either lock and testing blkcg_policy_enabled() is * always enough for dereferencing policy data. * * The caller is responsible for synchronizing [de]activations and policy * [un]registerations. Returns 0 on success, -errno on failure. */ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { struct request_queue *q = disk->queue; struct blkg_policy_data *pd_prealloc = NULL; struct blkcg_gq *blkg, *pinned_blkg = NULL; unsigned int memflags; int ret; if (blkcg_policy_enabled(q, pol)) return 0; /* * Policy is allowed to be registered without pd_alloc_fn/pd_free_fn, * for example, ioprio. Such policy will work on blkcg level, not disk * level, and don't need to be activated. */ if (WARN_ON_ONCE(!pol->pd_alloc_fn || !pol->pd_free_fn)) return -EINVAL; if (queue_is_mq(q)) memflags = blk_mq_freeze_queue(q); retry: spin_lock_irq(&q->queue_lock); /* blkg_list is pushed at the head, reverse walk to initialize parents first */ list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) { struct blkg_policy_data *pd; if (blkg->pd[pol->plid]) continue; /* If prealloc matches, use it; otherwise try GFP_NOWAIT */ if (blkg == pinned_blkg) { pd = pd_prealloc; pd_prealloc = NULL; } else { pd = pol->pd_alloc_fn(disk, blkg->blkcg, GFP_NOWAIT); } if (!pd) { /* * GFP_NOWAIT failed. Free the existing one and * prealloc for @blkg w/ GFP_KERNEL. */ if (pinned_blkg) blkg_put(pinned_blkg); blkg_get(blkg); pinned_blkg = blkg; spin_unlock_irq(&q->queue_lock); if (pd_prealloc) pol->pd_free_fn(pd_prealloc); pd_prealloc = pol->pd_alloc_fn(disk, blkg->blkcg, GFP_KERNEL); if (pd_prealloc) goto retry; else goto enomem; } spin_lock(&blkg->blkcg->lock); pd->blkg = blkg; pd->plid = pol->plid; blkg->pd[pol->plid] = pd; if (pol->pd_init_fn) pol->pd_init_fn(pd); if (pol->pd_online_fn) pol->pd_online_fn(pd); pd->online = true; spin_unlock(&blkg->blkcg->lock); } __set_bit(pol->plid, q->blkcg_pols); ret = 0; spin_unlock_irq(&q->queue_lock); out: if (queue_is_mq(q)) blk_mq_unfreeze_queue(q, memflags); if (pinned_blkg) blkg_put(pinned_blkg); if (pd_prealloc) pol->pd_free_fn(pd_prealloc); return ret; enomem: /* alloc failed, take down everything */ spin_lock_irq(&q->queue_lock); list_for_each_entry(blkg, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; struct blkg_policy_data *pd; spin_lock(&blkcg->lock); pd = blkg->pd[pol->plid]; if (pd) { if (pd->online && pol->pd_offline_fn) pol->pd_offline_fn(pd); pd->online = false; pol->pd_free_fn(pd); blkg->pd[pol->plid] = NULL; } spin_unlock(&blkcg->lock); } spin_unlock_irq(&q->queue_lock); ret = -ENOMEM; goto out; } EXPORT_SYMBOL_GPL(blkcg_activate_policy); /** * blkcg_deactivate_policy - deactivate a blkcg policy on a gendisk * @disk: gendisk of interest * @pol: blkcg policy to deactivate * * Deactivate @pol on @disk. Follows the same synchronization rules as * blkcg_activate_policy(). */ void blkcg_deactivate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { struct request_queue *q = disk->queue; struct blkcg_gq *blkg; unsigned int memflags; if (!blkcg_policy_enabled(q, pol)) return; if (queue_is_mq(q)) memflags = blk_mq_freeze_queue(q); mutex_lock(&q->blkcg_mutex); spin_lock_irq(&q->queue_lock); __clear_bit(pol->plid, q->blkcg_pols); list_for_each_entry(blkg, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; spin_lock(&blkcg->lock); if (blkg->pd[pol->plid]) { if (blkg->pd[pol->plid]->online && pol->pd_offline_fn) pol->pd_offline_fn(blkg->pd[pol->plid]); pol->pd_free_fn(blkg->pd[pol->plid]); blkg->pd[pol->plid] = NULL; } spin_unlock(&blkcg->lock); } spin_unlock_irq(&q->queue_lock); mutex_unlock(&q->blkcg_mutex); if (queue_is_mq(q)) blk_mq_unfreeze_queue(q, memflags); } EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); static void blkcg_free_all_cpd(struct blkcg_policy *pol) { struct blkcg *blkcg; list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { if (blkcg->cpd[pol->plid]) { pol->cpd_free_fn(blkcg->cpd[pol->plid]); blkcg->cpd[pol->plid] = NULL; } } } /** * blkcg_policy_register - register a blkcg policy * @pol: blkcg policy to register * * Register @pol with blkcg core. Might sleep and @pol may be modified on * successful registration. Returns 0 on success and -errno on failure. */ int blkcg_policy_register(struct blkcg_policy *pol) { struct blkcg *blkcg; int i, ret; /* * Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs, and policy * without pd_alloc_fn/pd_free_fn can't be activated. */ if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) || (!pol->pd_alloc_fn ^ !pol->pd_free_fn)) return -EINVAL; mutex_lock(&blkcg_pol_register_mutex); mutex_lock(&blkcg_pol_mutex); /* find an empty slot */ for (i = 0; i < BLKCG_MAX_POLS; i++) if (!blkcg_policy[i]) break; if (i >= BLKCG_MAX_POLS) { pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n"); ret = -ENOSPC; goto err_unlock; } /* register @pol */ pol->plid = i; blkcg_policy[pol->plid] = pol; /* allocate and install cpd's */ if (pol->cpd_alloc_fn) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { struct blkcg_policy_data *cpd; cpd = pol->cpd_alloc_fn(GFP_KERNEL); if (!cpd) { ret = -ENOMEM; goto err_free_cpds; } blkcg->cpd[pol->plid] = cpd; cpd->blkcg = blkcg; cpd->plid = pol->plid; } } mutex_unlock(&blkcg_pol_mutex); /* everything is in place, add intf files for the new policy */ if (pol->dfl_cftypes == pol->legacy_cftypes) { WARN_ON(cgroup_add_cftypes(&io_cgrp_subsys, pol->dfl_cftypes)); } else { WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys, pol->dfl_cftypes)); WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys, pol->legacy_cftypes)); } mutex_unlock(&blkcg_pol_register_mutex); return 0; err_free_cpds: if (pol->cpd_free_fn) blkcg_free_all_cpd(pol); blkcg_policy[pol->plid] = NULL; err_unlock: mutex_unlock(&blkcg_pol_mutex); mutex_unlock(&blkcg_pol_register_mutex); return ret; } EXPORT_SYMBOL_GPL(blkcg_policy_register); /** * blkcg_policy_unregister - unregister a blkcg policy * @pol: blkcg policy to unregister * * Undo blkcg_policy_register(@pol). Might sleep. */ void blkcg_policy_unregister(struct blkcg_policy *pol) { mutex_lock(&blkcg_pol_register_mutex); if (WARN_ON(blkcg_policy[pol->plid] != pol)) goto out_unlock; /* kill the intf files first */ if (pol->dfl_cftypes) cgroup_rm_cftypes(pol->dfl_cftypes); if (pol->legacy_cftypes) cgroup_rm_cftypes(pol->legacy_cftypes); /* remove cpds and unregister */ mutex_lock(&blkcg_pol_mutex); if (pol->cpd_free_fn) blkcg_free_all_cpd(pol); blkcg_policy[pol->plid] = NULL; mutex_unlock(&blkcg_pol_mutex); out_unlock: mutex_unlock(&blkcg_pol_register_mutex); } EXPORT_SYMBOL_GPL(blkcg_policy_unregister); /* * Scale the accumulated delay based on how long it has been since we updated * the delay. We only call this when we are adding delay, in case it's been a * while since we added delay, and when we are checking to see if we need to * delay a task, to account for any delays that may have occurred. */ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now) { u64 old = atomic64_read(&blkg->delay_start); /* negative use_delay means no scaling, see blkcg_set_delay() */ if (atomic_read(&blkg->use_delay) < 0) return; /* * We only want to scale down every second. The idea here is that we * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain * time window. We only want to throttle tasks for recent delay that * has occurred, in 1 second time windows since that's the maximum * things can be throttled. We save the current delay window in * blkg->last_delay so we know what amount is still left to be charged * to the blkg from this point onward. blkg->last_use keeps track of * the use_delay counter. The idea is if we're unthrottling the blkg we * are ok with whatever is happening now, and we can take away more of * the accumulated delay as we've already throttled enough that * everybody is happy with their IO latencies. */ if (time_before64(old + NSEC_PER_SEC, now) && atomic64_try_cmpxchg(&blkg->delay_start, &old, now)) { u64 cur = atomic64_read(&blkg->delay_nsec); u64 sub = min_t(u64, blkg->last_delay, now - old); int cur_use = atomic_read(&blkg->use_delay); /* * We've been unthrottled, subtract a larger chunk of our * accumulated delay. */ if (cur_use < blkg->last_use) sub = max_t(u64, sub, blkg->last_delay >> 1); /* * This shouldn't happen, but handle it anyway. Our delay_nsec * should only ever be growing except here where we subtract out * min(last_delay, 1 second), but lord knows bugs happen and I'd * rather not end up with negative numbers. */ if (unlikely(cur < sub)) { atomic64_set(&blkg->delay_nsec, 0); blkg->last_delay = 0; } else { atomic64_sub(sub, &blkg->delay_nsec); blkg->last_delay = cur - sub; } blkg->last_use = cur_use; } } /* * This is called when we want to actually walk up the hierarchy and check to * see if we need to throttle, and then actually throttle if there is some * accumulated delay. This should only be called upon return to user space so * we're not holding some lock that would induce a priority inversion. */ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) { unsigned long pflags; bool clamp; u64 now = blk_time_get_ns(); u64 exp; u64 delay_nsec = 0; int tok; while (blkg->parent) { int use_delay = atomic_read(&blkg->use_delay); if (use_delay) { u64 this_delay; blkcg_scale_delay(blkg, now); this_delay = atomic64_read(&blkg->delay_nsec); if (this_delay > delay_nsec) { delay_nsec = this_delay; clamp = use_delay > 0; } } blkg = blkg->parent; } if (!delay_nsec) return; /* * Let's not sleep for all eternity if we've amassed a huge delay. * Swapping or metadata IO can accumulate 10's of seconds worth of * delay, and we want userspace to be able to do _something_ so cap the * delays at 0.25s. If there's 10's of seconds worth of delay then the * tasks will be delayed for 0.25 second for every syscall. If * blkcg_set_delay() was used as indicated by negative use_delay, the * caller is responsible for regulating the range. */ if (clamp) delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC); if (use_memdelay) psi_memstall_enter(&pflags); exp = ktime_add_ns(now, delay_nsec); tok = io_schedule_prepare(); do { __set_current_state(TASK_KILLABLE); if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS)) break; } while (!fatal_signal_pending(current)); io_schedule_finish(tok); if (use_memdelay) psi_memstall_leave(&pflags); } /** * blkcg_maybe_throttle_current - throttle the current task if it has been marked * * This is only called if we've been marked with set_notify_resume(). Obviously * we can be set_notify_resume() for reasons other than blkcg throttling, so we * check to see if current->throttle_disk is set and if not this doesn't do * anything. This should only ever be called by the resume code, it's not meant * to be called by people willy-nilly as it will actually do the work to * throttle the task if it is setup for throttling. */ void blkcg_maybe_throttle_current(void) { struct gendisk *disk = current->throttle_disk; struct blkcg *blkcg; struct blkcg_gq *blkg; bool use_memdelay = current->use_memdelay; if (!disk) return; current->throttle_disk = NULL; current->use_memdelay = false; rcu_read_lock(); blkcg = css_to_blkcg(blkcg_css()); if (!blkcg) goto out; blkg = blkg_lookup(blkcg, disk->queue); if (!blkg) goto out; if (!blkg_tryget(blkg)) goto out; rcu_read_unlock(); blkcg_maybe_throttle_blkg(blkg, use_memdelay); blkg_put(blkg); put_disk(disk); return; out: rcu_read_unlock(); } /** * blkcg_schedule_throttle - this task needs to check for throttling * @disk: disk to throttle * @use_memdelay: do we charge this to memory delay for PSI * * This is called by the IO controller when we know there's delay accumulated * for the blkg for this task. We do not pass the blkg because there are places * we call this that may not have that information, the swapping code for * instance will only have a block_device at that point. This set's the * notify_resume for the task to check and see if it requires throttling before * returning to user space. * * We will only schedule once per syscall. You can call this over and over * again and it will only do the check once upon return to user space, and only * throttle once. If the task needs to be throttled again it'll need to be * re-set at the next time we see the task. */ void blkcg_schedule_throttle(struct gendisk *disk, bool use_memdelay) { if (unlikely(current->flags & PF_KTHREAD)) return; if (current->throttle_disk != disk) { if (test_bit(GD_DEAD, &disk->state)) return; get_device(disk_to_dev(disk)); if (current->throttle_disk) put_disk(current->throttle_disk); current->throttle_disk = disk; } if (use_memdelay) current->use_memdelay = use_memdelay; set_notify_resume(current); } /** * blkcg_add_delay - add delay to this blkg * @blkg: blkg of interest * @now: the current time in nanoseconds * @delta: how many nanoseconds of delay to add * * Charge @delta to the blkg's current delay accumulation. This is used to * throttle tasks if an IO controller thinks we need more throttling. */ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta) { if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) return; blkcg_scale_delay(blkg, now); atomic64_add(delta, &blkg->delay_nsec); } /** * blkg_tryget_closest - try and get a blkg ref on the closet blkg * @bio: target bio * @css: target css * * As the failure mode here is to walk up the blkg tree, this ensure that the * blkg->parent pointers are always valid. This returns the blkg that it ended * up taking a reference on or %NULL if no reference was taken. */ static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio, struct cgroup_subsys_state *css) { struct blkcg_gq *blkg, *ret_blkg = NULL; rcu_read_lock(); blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_bdev->bd_disk); while (blkg) { if (blkg_tryget(blkg)) { ret_blkg = blkg; break; } blkg = blkg->parent; } rcu_read_unlock(); return ret_blkg; } /** * bio_associate_blkg_from_css - associate a bio with a specified css * @bio: target bio * @css: target css * * Associate @bio with the blkg found by combining the css's blkg and the * request_queue of the @bio. An association failure is handled by walking up * the blkg tree. Therefore, the blkg associated can be anything between @blkg * and q->root_blkg. This situation only happens when a cgroup is dying and * then the remaining bios will spill to the closest alive blkg. * * A reference will be taken on the blkg and will be released when @bio is * freed. */ void bio_associate_blkg_from_css(struct bio *bio, struct cgroup_subsys_state *css) { if (bio->bi_blkg) blkg_put(bio->bi_blkg); if (css && css->parent) { bio->bi_blkg = blkg_tryget_closest(bio, css); } else { blkg_get(bdev_get_queue(bio->bi_bdev)->root_blkg); bio->bi_blkg = bdev_get_queue(bio->bi_bdev)->root_blkg; } } EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); /** * bio_associate_blkg - associate a bio with a blkg * @bio: target bio * * Associate @bio with the blkg found from the bio's css and request_queue. * If one is not found, bio_lookup_blkg() creates the blkg. If a blkg is * already associated, the css is reused and association redone as the * request_queue may have changed. */ void bio_associate_blkg(struct bio *bio) { struct cgroup_subsys_state *css; if (blk_op_is_passthrough(bio->bi_opf)) return; rcu_read_lock(); if (bio->bi_blkg) css = bio_blkcg_css(bio); else css = blkcg_css(); bio_associate_blkg_from_css(bio, css); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(bio_associate_blkg); /** * bio_clone_blkg_association - clone blkg association from src to dst bio * @dst: destination bio * @src: source bio */ void bio_clone_blkg_association(struct bio *dst, struct bio *src) { if (src->bi_blkg) bio_associate_blkg_from_css(dst, bio_blkcg_css(src)); } EXPORT_SYMBOL_GPL(bio_clone_blkg_association); static int blk_cgroup_io_type(struct bio *bio) { if (op_is_discard(bio->bi_opf)) return BLKG_IOSTAT_DISCARD; if (op_is_write(bio->bi_opf)) return BLKG_IOSTAT_WRITE; return BLKG_IOSTAT_READ; } void blk_cgroup_bio_start(struct bio *bio) { struct blkcg *blkcg = bio->bi_blkg->blkcg; int rwd = blk_cgroup_io_type(bio), cpu; struct blkg_iostat_set *bis; unsigned long flags; if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) return; /* Root-level stats are sourced from system-wide IO stats */ if (!cgroup_parent(blkcg->css.cgroup)) return; cpu = get_cpu(); bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu); flags = u64_stats_update_begin_irqsave(&bis->sync); /* * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split * bio and we would have already accounted for the size of the bio. */ if (!bio_flagged(bio, BIO_CGROUP_ACCT)) { bio_set_flag(bio, BIO_CGROUP_ACCT); bis->cur.bytes[rwd] += bio->bi_iter.bi_size; } bis->cur.ios[rwd]++; /* * If the iostat_cpu isn't in a lockless list, put it into the * list to indicate that a stat update is pending. */ if (!READ_ONCE(bis->lqueued)) { struct llist_head *lhead = this_cpu_ptr(blkcg->lhead); llist_add(&bis->lnode, lhead); WRITE_ONCE(bis->lqueued, true); } u64_stats_update_end_irqrestore(&bis->sync, flags); css_rstat_updated(&blkcg->css, cpu); put_cpu(); } bool blk_cgroup_congested(void) { struct blkcg *blkcg; bool ret = false; rcu_read_lock(); for (blkcg = css_to_blkcg(blkcg_css()); blkcg; blkcg = blkcg_parent(blkcg)) { if (atomic_read(&blkcg->congestion_count)) { ret = true; break; } } rcu_read_unlock(); return ret; } module_param(blkcg_debug_stats, bool, 0644); MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
4 21 9 9 3 58 4 4 1 4 17 17 17 8 57 24 24 2 22 10 72 72 72 77 77 6 1 1 35 4 31 72 72 72 63 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 /* SPDX-License-Identifier: GPL-2.0 */ /* * Portions of this file * Copyright(c) 2016-2017 Intel Deutschland GmbH * Copyright (C) 2018, 2021-2025 Intel Corporation */ #ifndef __CFG80211_RDEV_OPS #define __CFG80211_RDEV_OPS #include <linux/rtnetlink.h> #include <net/cfg80211.h> #include "core.h" #include "trace.h" static inline int rdev_suspend(struct cfg80211_registered_device *rdev, struct cfg80211_wowlan *wowlan) { int ret; trace_rdev_suspend(&rdev->wiphy, wowlan); ret = rdev->ops->suspend(&rdev->wiphy, wowlan); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_resume(struct cfg80211_registered_device *rdev) { int ret; trace_rdev_resume(&rdev->wiphy); ret = rdev->ops->resume(&rdev->wiphy); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_set_wakeup(struct cfg80211_registered_device *rdev, bool enabled) { trace_rdev_set_wakeup(&rdev->wiphy, enabled); rdev->ops->set_wakeup(&rdev->wiphy, enabled); trace_rdev_return_void(&rdev->wiphy); } static inline struct wireless_dev *rdev_add_virtual_intf(struct cfg80211_registered_device *rdev, char *name, unsigned char name_assign_type, enum nl80211_iftype type, struct vif_params *params) { struct wireless_dev *ret; trace_rdev_add_virtual_intf(&rdev->wiphy, name, type); ret = rdev->ops->add_virtual_intf(&rdev->wiphy, name, name_assign_type, type, params); trace_rdev_return_wdev(&rdev->wiphy, ret); return ret; } static inline int rdev_del_virtual_intf(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { int ret; trace_rdev_del_virtual_intf(&rdev->wiphy, wdev); ret = rdev->ops->del_virtual_intf(&rdev->wiphy, wdev); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_change_virtual_intf(struct cfg80211_registered_device *rdev, struct net_device *dev, enum nl80211_iftype type, struct vif_params *params) { int ret; trace_rdev_change_virtual_intf(&rdev->wiphy, dev, type); ret = rdev->ops->change_virtual_intf(&rdev->wiphy, dev, type, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_add_key(struct cfg80211_registered_device *rdev, struct net_device *netdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr, struct key_params *params) { int ret; trace_rdev_add_key(&rdev->wiphy, netdev, link_id, key_index, pairwise, mac_addr, params->mode); ret = rdev->ops->add_key(&rdev->wiphy, netdev, link_id, key_index, pairwise, mac_addr, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_key(struct cfg80211_registered_device *rdev, struct net_device *netdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr, void *cookie, void (*callback)(void *cookie, struct key_params*)) { int ret; trace_rdev_get_key(&rdev->wiphy, netdev, link_id, key_index, pairwise, mac_addr); ret = rdev->ops->get_key(&rdev->wiphy, netdev, link_id, key_index, pairwise, mac_addr, cookie, callback); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_key(struct cfg80211_registered_device *rdev, struct net_device *netdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr) { int ret; trace_rdev_del_key(&rdev->wiphy, netdev, link_id, key_index, pairwise, mac_addr); ret = rdev->ops->del_key(&rdev->wiphy, netdev, link_id, key_index, pairwise, mac_addr); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_default_key(struct cfg80211_registered_device *rdev, struct net_device *netdev, int link_id, u8 key_index, bool unicast, bool multicast) { int ret; trace_rdev_set_default_key(&rdev->wiphy, netdev, link_id, key_index, unicast, multicast); ret = rdev->ops->set_default_key(&rdev->wiphy, netdev, link_id, key_index, unicast, multicast); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_default_mgmt_key(struct cfg80211_registered_device *rdev, struct net_device *netdev, int link_id, u8 key_index) { int ret; trace_rdev_set_default_mgmt_key(&rdev->wiphy, netdev, link_id, key_index); ret = rdev->ops->set_default_mgmt_key(&rdev->wiphy, netdev, link_id, key_index); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_default_beacon_key(struct cfg80211_registered_device *rdev, struct net_device *netdev, int link_id, u8 key_index) { int ret; trace_rdev_set_default_beacon_key(&rdev->wiphy, netdev, link_id, key_index); ret = rdev->ops->set_default_beacon_key(&rdev->wiphy, netdev, link_id, key_index); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_start_ap(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_ap_settings *settings) { int ret; trace_rdev_start_ap(&rdev->wiphy, dev, settings); ret = rdev->ops->start_ap(&rdev->wiphy, dev, settings); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_change_beacon(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_ap_update *info) { int ret; trace_rdev_change_beacon(&rdev->wiphy, dev, info); ret = rdev->ops->change_beacon(&rdev->wiphy, dev, info); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_stop_ap(struct cfg80211_registered_device *rdev, struct net_device *dev, unsigned int link_id) { int ret; trace_rdev_stop_ap(&rdev->wiphy, dev, link_id); ret = rdev->ops->stop_ap(&rdev->wiphy, dev, link_id); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_add_station(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *mac, struct station_parameters *params) { int ret; trace_rdev_add_station(&rdev->wiphy, dev, mac, params); ret = rdev->ops->add_station(&rdev->wiphy, dev, mac, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_station(struct cfg80211_registered_device *rdev, struct net_device *dev, struct station_del_parameters *params) { int ret; trace_rdev_del_station(&rdev->wiphy, dev, params); ret = rdev->ops->del_station(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_change_station(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *mac, struct station_parameters *params) { int ret; trace_rdev_change_station(&rdev->wiphy, dev, mac, params); ret = rdev->ops->change_station(&rdev->wiphy, dev, mac, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_station(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *mac, struct station_info *sinfo) { int ret; trace_rdev_get_station(&rdev->wiphy, dev, mac); ret = rdev->ops->get_station(&rdev->wiphy, dev, mac, sinfo); trace_rdev_return_int_station_info(&rdev->wiphy, ret, sinfo); return ret; } static inline int rdev_dump_station(struct cfg80211_registered_device *rdev, struct net_device *dev, int idx, u8 *mac, struct station_info *sinfo) { int ret; trace_rdev_dump_station(&rdev->wiphy, dev, idx, mac); ret = rdev->ops->dump_station(&rdev->wiphy, dev, idx, mac, sinfo); trace_rdev_return_int_station_info(&rdev->wiphy, ret, sinfo); return ret; } static inline int rdev_add_mpath(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *dst, u8 *next_hop) { int ret; trace_rdev_add_mpath(&rdev->wiphy, dev, dst, next_hop); ret = rdev->ops->add_mpath(&rdev->wiphy, dev, dst, next_hop); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_mpath(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *dst) { int ret; trace_rdev_del_mpath(&rdev->wiphy, dev, dst); ret = rdev->ops->del_mpath(&rdev->wiphy, dev, dst); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_change_mpath(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *dst, u8 *next_hop) { int ret; trace_rdev_change_mpath(&rdev->wiphy, dev, dst, next_hop); ret = rdev->ops->change_mpath(&rdev->wiphy, dev, dst, next_hop); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_mpath(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *dst, u8 *next_hop, struct mpath_info *pinfo) { int ret; trace_rdev_get_mpath(&rdev->wiphy, dev, dst, next_hop); ret = rdev->ops->get_mpath(&rdev->wiphy, dev, dst, next_hop, pinfo); trace_rdev_return_int_mpath_info(&rdev->wiphy, ret, pinfo); return ret; } static inline int rdev_get_mpp(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *dst, u8 *mpp, struct mpath_info *pinfo) { int ret; trace_rdev_get_mpp(&rdev->wiphy, dev, dst, mpp); ret = rdev->ops->get_mpp(&rdev->wiphy, dev, dst, mpp, pinfo); trace_rdev_return_int_mpath_info(&rdev->wiphy, ret, pinfo); return ret; } static inline int rdev_dump_mpath(struct cfg80211_registered_device *rdev, struct net_device *dev, int idx, u8 *dst, u8 *next_hop, struct mpath_info *pinfo) { int ret; trace_rdev_dump_mpath(&rdev->wiphy, dev, idx, dst, next_hop); ret = rdev->ops->dump_mpath(&rdev->wiphy, dev, idx, dst, next_hop, pinfo); trace_rdev_return_int_mpath_info(&rdev->wiphy, ret, pinfo); return ret; } static inline int rdev_dump_mpp(struct cfg80211_registered_device *rdev, struct net_device *dev, int idx, u8 *dst, u8 *mpp, struct mpath_info *pinfo) { int ret; trace_rdev_dump_mpp(&rdev->wiphy, dev, idx, dst, mpp); ret = rdev->ops->dump_mpp(&rdev->wiphy, dev, idx, dst, mpp, pinfo); trace_rdev_return_int_mpath_info(&rdev->wiphy, ret, pinfo); return ret; } static inline int rdev_get_mesh_config(struct cfg80211_registered_device *rdev, struct net_device *dev, struct mesh_config *conf) { int ret; trace_rdev_get_mesh_config(&rdev->wiphy, dev); ret = rdev->ops->get_mesh_config(&rdev->wiphy, dev, conf); trace_rdev_return_int_mesh_config(&rdev->wiphy, ret, conf); return ret; } static inline int rdev_update_mesh_config(struct cfg80211_registered_device *rdev, struct net_device *dev, u32 mask, const struct mesh_config *nconf) { int ret; trace_rdev_update_mesh_config(&rdev->wiphy, dev, mask, nconf); ret = rdev->ops->update_mesh_config(&rdev->wiphy, dev, mask, nconf); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_join_mesh(struct cfg80211_registered_device *rdev, struct net_device *dev, const struct mesh_config *conf, const struct mesh_setup *setup) { int ret; trace_rdev_join_mesh(&rdev->wiphy, dev, conf, setup); ret = rdev->ops->join_mesh(&rdev->wiphy, dev, conf, setup); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_leave_mesh(struct cfg80211_registered_device *rdev, struct net_device *dev) { int ret; trace_rdev_leave_mesh(&rdev->wiphy, dev); ret = rdev->ops->leave_mesh(&rdev->wiphy, dev); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_join_ocb(struct cfg80211_registered_device *rdev, struct net_device *dev, struct ocb_setup *setup) { int ret; trace_rdev_join_ocb(&rdev->wiphy, dev, setup); ret = rdev->ops->join_ocb(&rdev->wiphy, dev, setup); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_leave_ocb(struct cfg80211_registered_device *rdev, struct net_device *dev) { int ret; trace_rdev_leave_ocb(&rdev->wiphy, dev); ret = rdev->ops->leave_ocb(&rdev->wiphy, dev); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_change_bss(struct cfg80211_registered_device *rdev, struct net_device *dev, struct bss_parameters *params) { int ret; trace_rdev_change_bss(&rdev->wiphy, dev, params); ret = rdev->ops->change_bss(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_inform_bss(struct cfg80211_registered_device *rdev, struct cfg80211_bss *bss, const struct cfg80211_bss_ies *ies, void *drv_data) { trace_rdev_inform_bss(&rdev->wiphy, bss); if (rdev->ops->inform_bss) rdev->ops->inform_bss(&rdev->wiphy, bss, ies, drv_data); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_set_txq_params(struct cfg80211_registered_device *rdev, struct net_device *dev, struct ieee80211_txq_params *params) { int ret; trace_rdev_set_txq_params(&rdev->wiphy, dev, params); ret = rdev->ops->set_txq_params(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_libertas_set_mesh_channel(struct cfg80211_registered_device *rdev, struct net_device *dev, struct ieee80211_channel *chan) { int ret; trace_rdev_libertas_set_mesh_channel(&rdev->wiphy, dev, chan); ret = rdev->ops->libertas_set_mesh_channel(&rdev->wiphy, dev, chan); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_monitor_channel(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_chan_def *chandef) { int ret; trace_rdev_set_monitor_channel(&rdev->wiphy, dev, chandef); ret = rdev->ops->set_monitor_channel(&rdev->wiphy, dev, chandef); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_scan(struct cfg80211_registered_device *rdev, struct cfg80211_scan_request_int *request) { int ret; if (WARN_ON_ONCE(!request->req.n_ssids && request->req.ssids)) return -EINVAL; trace_rdev_scan(&rdev->wiphy, request); ret = rdev->ops->scan(&rdev->wiphy, &request->req); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_abort_scan(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { trace_rdev_abort_scan(&rdev->wiphy, wdev); rdev->ops->abort_scan(&rdev->wiphy, wdev); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_auth(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_auth_request *req) { int ret; trace_rdev_auth(&rdev->wiphy, dev, req); ret = rdev->ops->auth(&rdev->wiphy, dev, req); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_assoc(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_assoc_request *req) { int ret; trace_rdev_assoc(&rdev->wiphy, dev, req); ret = rdev->ops->assoc(&rdev->wiphy, dev, req); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_deauth(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_deauth_request *req) { int ret; trace_rdev_deauth(&rdev->wiphy, dev, req); ret = rdev->ops->deauth(&rdev->wiphy, dev, req); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_disassoc(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_disassoc_request *req) { int ret; trace_rdev_disassoc(&rdev->wiphy, dev, req); ret = rdev->ops->disassoc(&rdev->wiphy, dev, req); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_connect(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_connect_params *sme) { int ret; trace_rdev_connect(&rdev->wiphy, dev, sme); ret = rdev->ops->connect(&rdev->wiphy, dev, sme); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_update_connect_params(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_connect_params *sme, u32 changed) { int ret; trace_rdev_update_connect_params(&rdev->wiphy, dev, sme, changed); ret = rdev->ops->update_connect_params(&rdev->wiphy, dev, sme, changed); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_disconnect(struct cfg80211_registered_device *rdev, struct net_device *dev, u16 reason_code) { int ret; trace_rdev_disconnect(&rdev->wiphy, dev, reason_code); ret = rdev->ops->disconnect(&rdev->wiphy, dev, reason_code); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_join_ibss(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_ibss_params *params) { int ret; trace_rdev_join_ibss(&rdev->wiphy, dev, params); ret = rdev->ops->join_ibss(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_leave_ibss(struct cfg80211_registered_device *rdev, struct net_device *dev) { int ret; trace_rdev_leave_ibss(&rdev->wiphy, dev); ret = rdev->ops->leave_ibss(&rdev->wiphy, dev); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_wiphy_params(struct cfg80211_registered_device *rdev, int radio_idx, u32 changed) { int ret = -EOPNOTSUPP; trace_rdev_set_wiphy_params(&rdev->wiphy, radio_idx, changed); if (rdev->ops->set_wiphy_params) ret = rdev->ops->set_wiphy_params(&rdev->wiphy, radio_idx, changed); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_tx_power(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, int radio_idx, enum nl80211_tx_power_setting type, int mbm) { int ret; trace_rdev_set_tx_power(&rdev->wiphy, wdev, radio_idx, type, mbm); ret = rdev->ops->set_tx_power(&rdev->wiphy, wdev, radio_idx, type, mbm); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_tx_power(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, int radio_idx, unsigned int link_id, int *dbm) { int ret; trace_rdev_get_tx_power(&rdev->wiphy, wdev, radio_idx, link_id); ret = rdev->ops->get_tx_power(&rdev->wiphy, wdev, radio_idx, link_id, dbm); trace_rdev_return_int_int(&rdev->wiphy, ret, *dbm); return ret; } static inline int rdev_set_multicast_to_unicast(struct cfg80211_registered_device *rdev, struct net_device *dev, const bool enabled) { int ret; trace_rdev_set_multicast_to_unicast(&rdev->wiphy, dev, enabled); ret = rdev->ops->set_multicast_to_unicast(&rdev->wiphy, dev, enabled); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_txq_stats(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_txq_stats *txqstats) { int ret; trace_rdev_get_txq_stats(&rdev->wiphy, wdev); ret = rdev->ops->get_txq_stats(&rdev->wiphy, wdev, txqstats); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_rfkill_poll(struct cfg80211_registered_device *rdev) { trace_rdev_rfkill_poll(&rdev->wiphy); rdev->ops->rfkill_poll(&rdev->wiphy); trace_rdev_return_void(&rdev->wiphy); } #ifdef CONFIG_NL80211_TESTMODE static inline int rdev_testmode_cmd(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, void *data, int len) { int ret; trace_rdev_testmode_cmd(&rdev->wiphy, wdev); ret = rdev->ops->testmode_cmd(&rdev->wiphy, wdev, data, len); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_testmode_dump(struct cfg80211_registered_device *rdev, struct sk_buff *skb, struct netlink_callback *cb, void *data, int len) { int ret; trace_rdev_testmode_dump(&rdev->wiphy); ret = rdev->ops->testmode_dump(&rdev->wiphy, skb, cb, data, len); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } #endif static inline int rdev_set_bitrate_mask(struct cfg80211_registered_device *rdev, struct net_device *dev, unsigned int link_id, const u8 *peer, const struct cfg80211_bitrate_mask *mask) { int ret; trace_rdev_set_bitrate_mask(&rdev->wiphy, dev, link_id, peer, mask); ret = rdev->ops->set_bitrate_mask(&rdev->wiphy, dev, link_id, peer, mask); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_dump_survey(struct cfg80211_registered_device *rdev, struct net_device *netdev, int idx, struct survey_info *info) { int ret; trace_rdev_dump_survey(&rdev->wiphy, netdev, idx); ret = rdev->ops->dump_survey(&rdev->wiphy, netdev, idx, info); if (ret < 0) trace_rdev_return_int(&rdev->wiphy, ret); else trace_rdev_return_int_survey_info(&rdev->wiphy, ret, info); return ret; } static inline int rdev_set_pmksa(struct cfg80211_registered_device *rdev, struct net_device *netdev, struct cfg80211_pmksa *pmksa) { int ret; trace_rdev_set_pmksa(&rdev->wiphy, netdev, pmksa); ret = rdev->ops->set_pmksa(&rdev->wiphy, netdev, pmksa); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_pmksa(struct cfg80211_registered_device *rdev, struct net_device *netdev, struct cfg80211_pmksa *pmksa) { int ret; trace_rdev_del_pmksa(&rdev->wiphy, netdev, pmksa); ret = rdev->ops->del_pmksa(&rdev->wiphy, netdev, pmksa); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_flush_pmksa(struct cfg80211_registered_device *rdev, struct net_device *netdev) { int ret; trace_rdev_flush_pmksa(&rdev->wiphy, netdev); ret = rdev->ops->flush_pmksa(&rdev->wiphy, netdev); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_remain_on_channel(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct ieee80211_channel *chan, unsigned int duration, u64 *cookie) { int ret; trace_rdev_remain_on_channel(&rdev->wiphy, wdev, chan, duration); ret = rdev->ops->remain_on_channel(&rdev->wiphy, wdev, chan, duration, cookie); trace_rdev_return_int_cookie(&rdev->wiphy, ret, *cookie); return ret; } static inline int rdev_cancel_remain_on_channel(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, u64 cookie) { int ret; trace_rdev_cancel_remain_on_channel(&rdev->wiphy, wdev, cookie); ret = rdev->ops->cancel_remain_on_channel(&rdev->wiphy, wdev, cookie); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_mgmt_tx(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_mgmt_tx_params *params, u64 *cookie) { int ret; trace_rdev_mgmt_tx(&rdev->wiphy, wdev, params); ret = rdev->ops->mgmt_tx(&rdev->wiphy, wdev, params, cookie); trace_rdev_return_int_cookie(&rdev->wiphy, ret, *cookie); return ret; } static inline int rdev_tx_control_port(struct cfg80211_registered_device *rdev, struct net_device *dev, const void *buf, size_t len, const u8 *dest, __be16 proto, const bool noencrypt, int link, u64 *cookie) { int ret; trace_rdev_tx_control_port(&rdev->wiphy, dev, buf, len, dest, proto, noencrypt, link); ret = rdev->ops->tx_control_port(&rdev->wiphy, dev, buf, len, dest, proto, noencrypt, link, cookie); if (cookie) trace_rdev_return_int_cookie(&rdev->wiphy, ret, *cookie); else trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_mgmt_tx_cancel_wait(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, u64 cookie) { int ret; trace_rdev_mgmt_tx_cancel_wait(&rdev->wiphy, wdev, cookie); ret = rdev->ops->mgmt_tx_cancel_wait(&rdev->wiphy, wdev, cookie); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_power_mgmt(struct cfg80211_registered_device *rdev, struct net_device *dev, bool enabled, int timeout) { int ret; trace_rdev_set_power_mgmt(&rdev->wiphy, dev, enabled, timeout); ret = rdev->ops->set_power_mgmt(&rdev->wiphy, dev, enabled, timeout); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_cqm_rssi_config(struct cfg80211_registered_device *rdev, struct net_device *dev, s32 rssi_thold, u32 rssi_hyst) { int ret; trace_rdev_set_cqm_rssi_config(&rdev->wiphy, dev, rssi_thold, rssi_hyst); ret = rdev->ops->set_cqm_rssi_config(&rdev->wiphy, dev, rssi_thold, rssi_hyst); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_cqm_rssi_range_config(struct cfg80211_registered_device *rdev, struct net_device *dev, s32 low, s32 high) { int ret; trace_rdev_set_cqm_rssi_range_config(&rdev->wiphy, dev, low, high); ret = rdev->ops->set_cqm_rssi_range_config(&rdev->wiphy, dev, low, high); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_cqm_txe_config(struct cfg80211_registered_device *rdev, struct net_device *dev, u32 rate, u32 pkts, u32 intvl) { int ret; trace_rdev_set_cqm_txe_config(&rdev->wiphy, dev, rate, pkts, intvl); ret = rdev->ops->set_cqm_txe_config(&rdev->wiphy, dev, rate, pkts, intvl); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_update_mgmt_frame_registrations(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct mgmt_frame_regs *upd) { might_sleep(); trace_rdev_update_mgmt_frame_registrations(&rdev->wiphy, wdev, upd); if (rdev->ops->update_mgmt_frame_registrations) rdev->ops->update_mgmt_frame_registrations(&rdev->wiphy, wdev, upd); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_set_antenna(struct cfg80211_registered_device *rdev, int radio_idx, u32 tx_ant, u32 rx_ant) { int ret; trace_rdev_set_antenna(&rdev->wiphy, radio_idx, tx_ant, rx_ant); ret = rdev->ops->set_antenna(&rdev->wiphy, -1, tx_ant, rx_ant); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_antenna(struct cfg80211_registered_device *rdev, int radio_idx, u32 *tx_ant, u32 *rx_ant) { int ret; trace_rdev_get_antenna(&rdev->wiphy, radio_idx); ret = rdev->ops->get_antenna(&rdev->wiphy, radio_idx, tx_ant, rx_ant); if (ret) trace_rdev_return_int(&rdev->wiphy, ret); else trace_rdev_return_int_tx_rx(&rdev->wiphy, ret, *tx_ant, *rx_ant); return ret; } static inline int rdev_sched_scan_start(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_sched_scan_request *request) { int ret; trace_rdev_sched_scan_start(&rdev->wiphy, dev, request->reqid); ret = rdev->ops->sched_scan_start(&rdev->wiphy, dev, request); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_sched_scan_stop(struct cfg80211_registered_device *rdev, struct net_device *dev, u64 reqid) { int ret; trace_rdev_sched_scan_stop(&rdev->wiphy, dev, reqid); ret = rdev->ops->sched_scan_stop(&rdev->wiphy, dev, reqid); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_rekey_data(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_gtk_rekey_data *data) { int ret; trace_rdev_set_rekey_data(&rdev->wiphy, dev); ret = rdev->ops->set_rekey_data(&rdev->wiphy, dev, data); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_tdls_mgmt(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *peer, int link_id, u8 action_code, u8 dialog_token, u16 status_code, u32 peer_capability, bool initiator, const u8 *buf, size_t len) { int ret; trace_rdev_tdls_mgmt(&rdev->wiphy, dev, peer, link_id, action_code, dialog_token, status_code, peer_capability, initiator, buf, len); ret = rdev->ops->tdls_mgmt(&rdev->wiphy, dev, peer, link_id, action_code, dialog_token, status_code, peer_capability, initiator, buf, len); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_tdls_oper(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *peer, enum nl80211_tdls_operation oper) { int ret; trace_rdev_tdls_oper(&rdev->wiphy, dev, peer, oper); ret = rdev->ops->tdls_oper(&rdev->wiphy, dev, peer, oper); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_probe_client(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *peer, u64 *cookie) { int ret; trace_rdev_probe_client(&rdev->wiphy, dev, peer); ret = rdev->ops->probe_client(&rdev->wiphy, dev, peer, cookie); trace_rdev_return_int_cookie(&rdev->wiphy, ret, *cookie); return ret; } static inline int rdev_set_noack_map(struct cfg80211_registered_device *rdev, struct net_device *dev, u16 noack_map) { int ret; trace_rdev_set_noack_map(&rdev->wiphy, dev, noack_map); ret = rdev->ops->set_noack_map(&rdev->wiphy, dev, noack_map); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_channel(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, unsigned int link_id, struct cfg80211_chan_def *chandef) { int ret; trace_rdev_get_channel(&rdev->wiphy, wdev, link_id); ret = rdev->ops->get_channel(&rdev->wiphy, wdev, link_id, chandef); trace_rdev_return_chandef(&rdev->wiphy, ret, chandef); return ret; } static inline int rdev_start_p2p_device(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { int ret; trace_rdev_start_p2p_device(&rdev->wiphy, wdev); ret = rdev->ops->start_p2p_device(&rdev->wiphy, wdev); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_stop_p2p_device(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { trace_rdev_stop_p2p_device(&rdev->wiphy, wdev); rdev->ops->stop_p2p_device(&rdev->wiphy, wdev); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_start_nan(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_nan_conf *conf) { int ret; trace_rdev_start_nan(&rdev->wiphy, wdev, conf); ret = rdev->ops->start_nan(&rdev->wiphy, wdev, conf); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_stop_nan(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { trace_rdev_stop_nan(&rdev->wiphy, wdev); rdev->ops->stop_nan(&rdev->wiphy, wdev); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_add_nan_func(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_nan_func *nan_func) { int ret; trace_rdev_add_nan_func(&rdev->wiphy, wdev, nan_func); ret = rdev->ops->add_nan_func(&rdev->wiphy, wdev, nan_func); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_del_nan_func(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, u64 cookie) { trace_rdev_del_nan_func(&rdev->wiphy, wdev, cookie); rdev->ops->del_nan_func(&rdev->wiphy, wdev, cookie); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_nan_change_conf(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_nan_conf *conf, u32 changes) { int ret; trace_rdev_nan_change_conf(&rdev->wiphy, wdev, conf, changes); if (rdev->ops->nan_change_conf) ret = rdev->ops->nan_change_conf(&rdev->wiphy, wdev, conf, changes); else ret = -EOPNOTSUPP; trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_mac_acl(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_acl_data *params) { int ret; trace_rdev_set_mac_acl(&rdev->wiphy, dev, params); ret = rdev->ops->set_mac_acl(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_update_ft_ies(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_update_ft_ies_params *ftie) { int ret; trace_rdev_update_ft_ies(&rdev->wiphy, dev, ftie); ret = rdev->ops->update_ft_ies(&rdev->wiphy, dev, ftie); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_crit_proto_start(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, enum nl80211_crit_proto_id protocol, u16 duration) { int ret; trace_rdev_crit_proto_start(&rdev->wiphy, wdev, protocol, duration); ret = rdev->ops->crit_proto_start(&rdev->wiphy, wdev, protocol, duration); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_crit_proto_stop(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { trace_rdev_crit_proto_stop(&rdev->wiphy, wdev); rdev->ops->crit_proto_stop(&rdev->wiphy, wdev); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_channel_switch(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_csa_settings *params) { int ret; trace_rdev_channel_switch(&rdev->wiphy, dev, params); ret = rdev->ops->channel_switch(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_qos_map(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_qos_map *qos_map) { int ret = -EOPNOTSUPP; if (rdev->ops->set_qos_map) { trace_rdev_set_qos_map(&rdev->wiphy, dev, qos_map); ret = rdev->ops->set_qos_map(&rdev->wiphy, dev, qos_map); trace_rdev_return_int(&rdev->wiphy, ret); } return ret; } static inline int rdev_set_ap_chanwidth(struct cfg80211_registered_device *rdev, struct net_device *dev, unsigned int link_id, struct cfg80211_chan_def *chandef) { int ret; trace_rdev_set_ap_chanwidth(&rdev->wiphy, dev, link_id, chandef); ret = rdev->ops->set_ap_chanwidth(&rdev->wiphy, dev, link_id, chandef); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_add_tx_ts(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 tsid, const u8 *peer, u8 user_prio, u16 admitted_time) { int ret = -EOPNOTSUPP; trace_rdev_add_tx_ts(&rdev->wiphy, dev, tsid, peer, user_prio, admitted_time); if (rdev->ops->add_tx_ts) ret = rdev->ops->add_tx_ts(&rdev->wiphy, dev, tsid, peer, user_prio, admitted_time); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_tx_ts(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 tsid, const u8 *peer) { int ret = -EOPNOTSUPP; trace_rdev_del_tx_ts(&rdev->wiphy, dev, tsid, peer); if (rdev->ops->del_tx_ts) ret = rdev->ops->del_tx_ts(&rdev->wiphy, dev, tsid, peer); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_tdls_channel_switch(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *addr, u8 oper_class, struct cfg80211_chan_def *chandef) { int ret; trace_rdev_tdls_channel_switch(&rdev->wiphy, dev, addr, oper_class, chandef); ret = rdev->ops->tdls_channel_switch(&rdev->wiphy, dev, addr, oper_class, chandef); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_tdls_cancel_channel_switch(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *addr) { trace_rdev_tdls_cancel_channel_switch(&rdev->wiphy, dev, addr); rdev->ops->tdls_cancel_channel_switch(&rdev->wiphy, dev, addr); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_start_radar_detection(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_chan_def *chandef, u32 cac_time_ms, int link_id) { int ret = -EOPNOTSUPP; trace_rdev_start_radar_detection(&rdev->wiphy, dev, chandef, cac_time_ms, link_id); if (rdev->ops->start_radar_detection) ret = rdev->ops->start_radar_detection(&rdev->wiphy, dev, chandef, cac_time_ms, link_id); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_end_cac(struct cfg80211_registered_device *rdev, struct net_device *dev, unsigned int link_id) { trace_rdev_end_cac(&rdev->wiphy, dev, link_id); if (rdev->ops->end_cac) rdev->ops->end_cac(&rdev->wiphy, dev, link_id); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_set_mcast_rate(struct cfg80211_registered_device *rdev, struct net_device *dev, int mcast_rate[NUM_NL80211_BANDS]) { int ret = -EOPNOTSUPP; trace_rdev_set_mcast_rate(&rdev->wiphy, dev, mcast_rate); if (rdev->ops->set_mcast_rate) ret = rdev->ops->set_mcast_rate(&rdev->wiphy, dev, mcast_rate); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_coalesce(struct cfg80211_registered_device *rdev, struct cfg80211_coalesce *coalesce) { int ret = -EOPNOTSUPP; trace_rdev_set_coalesce(&rdev->wiphy, coalesce); if (rdev->ops->set_coalesce) ret = rdev->ops->set_coalesce(&rdev->wiphy, coalesce); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_pmk(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_pmk_conf *pmk_conf) { int ret = -EOPNOTSUPP; trace_rdev_set_pmk(&rdev->wiphy, dev, pmk_conf); if (rdev->ops->set_pmk) ret = rdev->ops->set_pmk(&rdev->wiphy, dev, pmk_conf); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_pmk(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *aa) { int ret = -EOPNOTSUPP; trace_rdev_del_pmk(&rdev->wiphy, dev, aa); if (rdev->ops->del_pmk) ret = rdev->ops->del_pmk(&rdev->wiphy, dev, aa); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_external_auth(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_external_auth_params *params) { int ret = -EOPNOTSUPP; trace_rdev_external_auth(&rdev->wiphy, dev, params); if (rdev->ops->external_auth) ret = rdev->ops->external_auth(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_ftm_responder_stats(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_ftm_responder_stats *ftm_stats) { int ret = -EOPNOTSUPP; trace_rdev_get_ftm_responder_stats(&rdev->wiphy, dev, ftm_stats); if (rdev->ops->get_ftm_responder_stats) ret = rdev->ops->get_ftm_responder_stats(&rdev->wiphy, dev, ftm_stats); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_start_pmsr(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_pmsr_request *request) { int ret = -EOPNOTSUPP; trace_rdev_start_pmsr(&rdev->wiphy, wdev, request->cookie); if (rdev->ops->start_pmsr) ret = rdev->ops->start_pmsr(&rdev->wiphy, wdev, request); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_abort_pmsr(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_pmsr_request *request) { trace_rdev_abort_pmsr(&rdev->wiphy, wdev, request->cookie); if (rdev->ops->abort_pmsr) rdev->ops->abort_pmsr(&rdev->wiphy, wdev, request); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_update_owe_info(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_update_owe_info *oweinfo) { int ret = -EOPNOTSUPP; trace_rdev_update_owe_info(&rdev->wiphy, dev, oweinfo); if (rdev->ops->update_owe_info) ret = rdev->ops->update_owe_info(&rdev->wiphy, dev, oweinfo); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_probe_mesh_link(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *dest, const void *buf, size_t len) { int ret; trace_rdev_probe_mesh_link(&rdev->wiphy, dev, dest, buf, len); ret = rdev->ops->probe_mesh_link(&rdev->wiphy, dev, buf, len); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_tid_config(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_tid_config *tid_conf) { int ret; trace_rdev_set_tid_config(&rdev->wiphy, dev, tid_conf); ret = rdev->ops->set_tid_config(&rdev->wiphy, dev, tid_conf); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_reset_tid_config(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *peer, u8 tids) { int ret; trace_rdev_reset_tid_config(&rdev->wiphy, dev, peer, tids); ret = rdev->ops->reset_tid_config(&rdev->wiphy, dev, peer, tids); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_sar_specs(struct cfg80211_registered_device *rdev, struct cfg80211_sar_specs *sar) { int ret; trace_rdev_set_sar_specs(&rdev->wiphy, sar); ret = rdev->ops->set_sar_specs(&rdev->wiphy, sar); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_color_change(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_color_change_settings *params) { int ret; trace_rdev_color_change(&rdev->wiphy, dev, params); ret = rdev->ops->color_change(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_fils_aad(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_fils_aad *fils_aad) { int ret = -EOPNOTSUPP; trace_rdev_set_fils_aad(&rdev->wiphy, dev, fils_aad); if (rdev->ops->set_fils_aad) ret = rdev->ops->set_fils_aad(&rdev->wiphy, dev, fils_aad); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_radar_background(struct cfg80211_registered_device *rdev, struct cfg80211_chan_def *chandef) { struct wiphy *wiphy = &rdev->wiphy; int ret = -EOPNOTSUPP; trace_rdev_set_radar_background(wiphy, chandef); if (rdev->ops->set_radar_background) ret = rdev->ops->set_radar_background(wiphy, chandef); trace_rdev_return_int(wiphy, ret); return ret; } static inline int rdev_add_intf_link(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, unsigned int link_id) { int ret = 0; trace_rdev_add_intf_link(&rdev->wiphy, wdev, link_id); if (rdev->ops->add_intf_link) ret = rdev->ops->add_intf_link(&rdev->wiphy, wdev, link_id); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_del_intf_link(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, unsigned int link_id) { trace_rdev_del_intf_link(&rdev->wiphy, wdev, link_id); if (rdev->ops->del_intf_link) rdev->ops->del_intf_link(&rdev->wiphy, wdev, link_id); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_add_link_station(struct cfg80211_registered_device *rdev, struct net_device *dev, struct link_station_parameters *params) { int ret = -EOPNOTSUPP; trace_rdev_add_link_station(&rdev->wiphy, dev, params); if (rdev->ops->add_link_station) ret = rdev->ops->add_link_station(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_mod_link_station(struct cfg80211_registered_device *rdev, struct net_device *dev, struct link_station_parameters *params) { int ret = -EOPNOTSUPP; trace_rdev_mod_link_station(&rdev->wiphy, dev, params); if (rdev->ops->mod_link_station) ret = rdev->ops->mod_link_station(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_link_station(struct cfg80211_registered_device *rdev, struct net_device *dev, struct link_station_del_parameters *params) { int ret = -EOPNOTSUPP; trace_rdev_del_link_station(&rdev->wiphy, dev, params); if (rdev->ops->del_link_station) ret = rdev->ops->del_link_station(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_hw_timestamp(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_set_hw_timestamp *hwts) { struct wiphy *wiphy = &rdev->wiphy; int ret = -EOPNOTSUPP; trace_rdev_set_hw_timestamp(wiphy, dev, hwts); if (rdev->ops->set_hw_timestamp) ret = rdev->ops->set_hw_timestamp(wiphy, dev, hwts); trace_rdev_return_int(wiphy, ret); return ret; } static inline int rdev_set_ttlm(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_ttlm_params *params) { struct wiphy *wiphy = &rdev->wiphy; int ret = -EOPNOTSUPP; trace_rdev_set_ttlm(wiphy, dev, params); if (rdev->ops->set_ttlm) ret = rdev->ops->set_ttlm(wiphy, dev, params); trace_rdev_return_int(wiphy, ret); return ret; } static inline u32 rdev_get_radio_mask(struct cfg80211_registered_device *rdev, struct net_device *dev) { struct wiphy *wiphy = &rdev->wiphy; if (!rdev->ops->get_radio_mask) return 0; return rdev->ops->get_radio_mask(wiphy, dev); } static inline int rdev_assoc_ml_reconf(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_ml_reconf_req *req) { struct wiphy *wiphy = &rdev->wiphy; int ret = -EOPNOTSUPP; trace_rdev_assoc_ml_reconf(wiphy, dev, req); if (rdev->ops->assoc_ml_reconf) ret = rdev->ops->assoc_ml_reconf(wiphy, dev, req); trace_rdev_return_int(wiphy, ret); return ret; } static inline int rdev_set_epcs(struct cfg80211_registered_device *rdev, struct net_device *dev, bool val) { struct wiphy *wiphy = &rdev->wiphy; int ret = -EOPNOTSUPP; trace_rdev_set_epcs(wiphy, dev, val); if (rdev->ops->set_epcs) ret = rdev->ops->set_epcs(wiphy, dev, val); trace_rdev_return_int(wiphy, ret); return ret; } #endif /* __CFG80211_RDEV_OPS */
16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/ip.h> #include <linux/sctp.h> #include <net/ip.h> #include <net/ip6_checksum.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <net/sctp/checksum.h> #include <net/ip_vs.h> static int sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, unsigned int sctphoff); static int sctp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph) { struct ip_vs_service *svc; struct sctp_chunkhdr _schunkh, *sch; struct sctphdr *sh, _sctph; __be16 _ports[2], *ports = NULL; if (likely(!ip_vs_iph_icmp(iph))) { sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph); if (sh) { sch = skb_header_pointer(skb, iph->len + sizeof(_sctph), sizeof(_schunkh), &_schunkh); if (sch) { if (sch->type == SCTP_CID_ABORT || !(sysctl_sloppy_sctp(ipvs) || sch->type == SCTP_CID_INIT)) return 1; ports = &sh->source; } } } else { ports = skb_header_pointer( skb, iph->len, sizeof(_ports), &_ports); } if (!ports) { *verdict = NF_DROP; return 0; } if (likely(!ip_vs_iph_inverse(iph))) svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, &iph->daddr, ports[1]); else svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, &iph->saddr, ports[0]); if (svc) { int ignored; if (ip_vs_todrop(ipvs)) { /* * It seems that we are very loaded. * We have to drop this packet :( */ *verdict = NF_DROP; return 0; } /* * Let the virtual server select a real server for the * incoming connection, and create a connection entry. */ *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph); if (!*cpp && ignored <= 0) { if (!ignored) *verdict = ip_vs_leave(svc, skb, pd, iph); else *verdict = NF_DROP; return 0; } } /* NF_ACCEPT */ return 1; } static void sctp_nat_csum(struct sk_buff *skb, struct sctphdr *sctph, unsigned int sctphoff) { sctph->checksum = sctp_compute_cksum(skb, sctphoff); skb->ip_summed = CHECKSUM_UNNECESSARY; } static int sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { struct sctphdr *sctph; unsigned int sctphoff = iph->len; bool payload_csum = false; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6 && iph->fragoffs) return 1; #endif /* csum_check requires unshared skb */ if (skb_ensure_writable(skb, sctphoff + sizeof(*sctph))) return 0; if (unlikely(cp->app != NULL)) { int ret; /* Some checks before mangling */ if (!sctp_csum_check(cp->af, skb, pp, sctphoff)) return 0; /* Call application helper if needed */ ret = ip_vs_app_pkt_out(cp, skb, iph); if (ret == 0) return 0; /* ret=2: csum update is needed after payload mangling */ if (ret == 2) payload_csum = true; } sctph = (void *) skb_network_header(skb) + sctphoff; /* Only update csum if we really have to */ if (sctph->source != cp->vport || payload_csum || skb->ip_summed == CHECKSUM_PARTIAL) { sctph->source = cp->vport; if (!skb_is_gso(skb)) sctp_nat_csum(skb, sctph, sctphoff); } else { skb->ip_summed = CHECKSUM_UNNECESSARY; } return 1; } static int sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { struct sctphdr *sctph; unsigned int sctphoff = iph->len; bool payload_csum = false; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6 && iph->fragoffs) return 1; #endif /* csum_check requires unshared skb */ if (skb_ensure_writable(skb, sctphoff + sizeof(*sctph))) return 0; if (unlikely(cp->app != NULL)) { int ret; /* Some checks before mangling */ if (!sctp_csum_check(cp->af, skb, pp, sctphoff)) return 0; /* Call application helper if needed */ ret = ip_vs_app_pkt_in(cp, skb, iph); if (ret == 0) return 0; /* ret=2: csum update is needed after payload mangling */ if (ret == 2) payload_csum = true; } sctph = (void *) skb_network_header(skb) + sctphoff; /* Only update csum if we really have to */ if (sctph->dest != cp->dport || payload_csum || (skb->ip_summed == CHECKSUM_PARTIAL && !(skb_dst(skb)->dev->features & NETIF_F_SCTP_CRC))) { sctph->dest = cp->dport; if (!skb_is_gso(skb)) sctp_nat_csum(skb, sctph, sctphoff); } else if (skb->ip_summed != CHECKSUM_PARTIAL) { skb->ip_summed = CHECKSUM_UNNECESSARY; } return 1; } static int sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, unsigned int sctphoff) { struct sctphdr *sh; __le32 cmp, val; sh = (struct sctphdr *)(skb->data + sctphoff); cmp = sh->checksum; val = sctp_compute_cksum(skb, sctphoff); if (val != cmp) { /* CRC failure, dump it. */ IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); return 0; } return 1; } enum ipvs_sctp_event_t { IP_VS_SCTP_DATA = 0, /* DATA, SACK, HEARTBEATs */ IP_VS_SCTP_INIT, IP_VS_SCTP_INIT_ACK, IP_VS_SCTP_COOKIE_ECHO, IP_VS_SCTP_COOKIE_ACK, IP_VS_SCTP_SHUTDOWN, IP_VS_SCTP_SHUTDOWN_ACK, IP_VS_SCTP_SHUTDOWN_COMPLETE, IP_VS_SCTP_ERROR, IP_VS_SCTP_ABORT, IP_VS_SCTP_EVENT_LAST }; /* RFC 2960, 3.2 Chunk Field Descriptions */ static __u8 sctp_events[] = { [SCTP_CID_DATA] = IP_VS_SCTP_DATA, [SCTP_CID_INIT] = IP_VS_SCTP_INIT, [SCTP_CID_INIT_ACK] = IP_VS_SCTP_INIT_ACK, [SCTP_CID_SACK] = IP_VS_SCTP_DATA, [SCTP_CID_HEARTBEAT] = IP_VS_SCTP_DATA, [SCTP_CID_HEARTBEAT_ACK] = IP_VS_SCTP_DATA, [SCTP_CID_ABORT] = IP_VS_SCTP_ABORT, [SCTP_CID_SHUTDOWN] = IP_VS_SCTP_SHUTDOWN, [SCTP_CID_SHUTDOWN_ACK] = IP_VS_SCTP_SHUTDOWN_ACK, [SCTP_CID_ERROR] = IP_VS_SCTP_ERROR, [SCTP_CID_COOKIE_ECHO] = IP_VS_SCTP_COOKIE_ECHO, [SCTP_CID_COOKIE_ACK] = IP_VS_SCTP_COOKIE_ACK, [SCTP_CID_ECN_ECNE] = IP_VS_SCTP_DATA, [SCTP_CID_ECN_CWR] = IP_VS_SCTP_DATA, [SCTP_CID_SHUTDOWN_COMPLETE] = IP_VS_SCTP_SHUTDOWN_COMPLETE, }; /* SCTP States: * See RFC 2960, 4. SCTP Association State Diagram * * New states (not in diagram): * - INIT1 state: use shorter timeout for dropped INIT packets * - REJECTED state: use shorter timeout if INIT is rejected with ABORT * - INIT, COOKIE_SENT, COOKIE_REPLIED, COOKIE states: for better debugging * * The states are as seen in real server. In the diagram, INIT1, INIT, * COOKIE_SENT and COOKIE_REPLIED processing happens in CLOSED state. * * States as per packets from client (C) and server (S): * * Setup of client connection: * IP_VS_SCTP_S_INIT1: First C:INIT sent, wait for S:INIT-ACK * IP_VS_SCTP_S_INIT: Next C:INIT sent, wait for S:INIT-ACK * IP_VS_SCTP_S_COOKIE_SENT: S:INIT-ACK sent, wait for C:COOKIE-ECHO * IP_VS_SCTP_S_COOKIE_REPLIED: C:COOKIE-ECHO sent, wait for S:COOKIE-ACK * * Setup of server connection: * IP_VS_SCTP_S_COOKIE_WAIT: S:INIT sent, wait for C:INIT-ACK * IP_VS_SCTP_S_COOKIE: C:INIT-ACK sent, wait for S:COOKIE-ECHO * IP_VS_SCTP_S_COOKIE_ECHOED: S:COOKIE-ECHO sent, wait for C:COOKIE-ACK */ #define sNO IP_VS_SCTP_S_NONE #define sI1 IP_VS_SCTP_S_INIT1 #define sIN IP_VS_SCTP_S_INIT #define sCS IP_VS_SCTP_S_COOKIE_SENT #define sCR IP_VS_SCTP_S_COOKIE_REPLIED #define sCW IP_VS_SCTP_S_COOKIE_WAIT #define sCO IP_VS_SCTP_S_COOKIE #define sCE IP_VS_SCTP_S_COOKIE_ECHOED #define sES IP_VS_SCTP_S_ESTABLISHED #define sSS IP_VS_SCTP_S_SHUTDOWN_SENT #define sSR IP_VS_SCTP_S_SHUTDOWN_RECEIVED #define sSA IP_VS_SCTP_S_SHUTDOWN_ACK_SENT #define sRJ IP_VS_SCTP_S_REJECTED #define sCL IP_VS_SCTP_S_CLOSED static const __u8 sctp_states [IP_VS_DIR_LAST][IP_VS_SCTP_EVENT_LAST][IP_VS_SCTP_S_LAST] = { { /* INPUT */ /* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/ /* d */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, /* i */{sI1, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sIN, sIN}, /* i_a */{sCW, sCW, sCW, sCS, sCR, sCO, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, /* c_e */{sCR, sIN, sIN, sCR, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, /* c_a */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sES, sES, sSS, sSR, sSA, sRJ, sCL}, /* s */{sSR, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sSR, sSS, sSR, sSA, sRJ, sCL}, /* s_a */{sCL, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sES, sCL, sSR, sCL, sRJ, sCL}, /* s_c */{sCL, sCL, sCL, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sCL, sRJ, sCL}, /* err */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCL, sES, sSS, sSR, sSA, sRJ, sCL}, /* ab */{sCL, sCL, sCL, sCL, sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, }, { /* OUTPUT */ /* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/ /* d */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, /* i */{sCW, sCW, sCW, sCW, sCW, sCW, sCW, sCW, sES, sCW, sCW, sCW, sCW, sCW}, /* i_a */{sCS, sCS, sCS, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, /* c_e */{sCE, sCE, sCE, sCE, sCE, sCE, sCE, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, /* c_a */{sES, sES, sES, sES, sES, sES, sES, sES, sES, sSS, sSR, sSA, sRJ, sCL}, /* s */{sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSR, sSA, sRJ, sCL}, /* s_a */{sSA, sSA, sSA, sSA, sSA, sCW, sCO, sCE, sES, sSA, sSA, sSA, sRJ, sCL}, /* s_c */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, /* err */{sCL, sCL, sCL, sCL, sCL, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, /* ab */{sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, }, { /* INPUT-ONLY */ /* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/ /* d */{sES, sI1, sIN, sCS, sCR, sES, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, /* i */{sI1, sIN, sIN, sIN, sIN, sIN, sCO, sCE, sES, sSS, sSR, sSA, sIN, sIN}, /* i_a */{sCE, sCE, sCE, sCE, sCE, sCE, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, /* c_e */{sES, sES, sES, sES, sES, sES, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, /* c_a */{sES, sI1, sIN, sES, sES, sCW, sES, sES, sES, sSS, sSR, sSA, sRJ, sCL}, /* s */{sSR, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sSR, sSS, sSR, sSA, sRJ, sCL}, /* s_a */{sCL, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sCL, sCL, sSR, sCL, sRJ, sCL}, /* s_c */{sCL, sCL, sCL, sCL, sCL, sCW, sCO, sCE, sES, sSS, sCL, sCL, sRJ, sCL}, /* err */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, /* ab */{sCL, sCL, sCL, sCL, sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, }, }; #define IP_VS_SCTP_MAX_RTO ((60 + 1) * HZ) /* Timeout table[state] */ static const int sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = { [IP_VS_SCTP_S_NONE] = 2 * HZ, [IP_VS_SCTP_S_INIT1] = (0 + 3 + 1) * HZ, [IP_VS_SCTP_S_INIT] = IP_VS_SCTP_MAX_RTO, [IP_VS_SCTP_S_COOKIE_SENT] = IP_VS_SCTP_MAX_RTO, [IP_VS_SCTP_S_COOKIE_REPLIED] = IP_VS_SCTP_MAX_RTO, [IP_VS_SCTP_S_COOKIE_WAIT] = IP_VS_SCTP_MAX_RTO, [IP_VS_SCTP_S_COOKIE] = IP_VS_SCTP_MAX_RTO, [IP_VS_SCTP_S_COOKIE_ECHOED] = IP_VS_SCTP_MAX_RTO, [IP_VS_SCTP_S_ESTABLISHED] = 15 * 60 * HZ, [IP_VS_SCTP_S_SHUTDOWN_SENT] = IP_VS_SCTP_MAX_RTO, [IP_VS_SCTP_S_SHUTDOWN_RECEIVED] = IP_VS_SCTP_MAX_RTO, [IP_VS_SCTP_S_SHUTDOWN_ACK_SENT] = IP_VS_SCTP_MAX_RTO, [IP_VS_SCTP_S_REJECTED] = (0 + 3 + 1) * HZ, [IP_VS_SCTP_S_CLOSED] = IP_VS_SCTP_MAX_RTO, [IP_VS_SCTP_S_LAST] = 2 * HZ, }; static const char *sctp_state_name_table[IP_VS_SCTP_S_LAST + 1] = { [IP_VS_SCTP_S_NONE] = "NONE", [IP_VS_SCTP_S_INIT1] = "INIT1", [IP_VS_SCTP_S_INIT] = "INIT", [IP_VS_SCTP_S_COOKIE_SENT] = "C-SENT", [IP_VS_SCTP_S_COOKIE_REPLIED] = "C-REPLIED", [IP_VS_SCTP_S_COOKIE_WAIT] = "C-WAIT", [IP_VS_SCTP_S_COOKIE] = "COOKIE", [IP_VS_SCTP_S_COOKIE_ECHOED] = "C-ECHOED", [IP_VS_SCTP_S_ESTABLISHED] = "ESTABLISHED", [IP_VS_SCTP_S_SHUTDOWN_SENT] = "S-SENT", [IP_VS_SCTP_S_SHUTDOWN_RECEIVED] = "S-RECEIVED", [IP_VS_SCTP_S_SHUTDOWN_ACK_SENT] = "S-ACK-SENT", [IP_VS_SCTP_S_REJECTED] = "REJECTED", [IP_VS_SCTP_S_CLOSED] = "CLOSED", [IP_VS_SCTP_S_LAST] = "BUG!", }; static const char *sctp_state_name(int state) { if (state >= IP_VS_SCTP_S_LAST) return "ERR!"; if (sctp_state_name_table[state]) return sctp_state_name_table[state]; return "?"; } static inline void set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, int direction, const struct sk_buff *skb) { struct sctp_chunkhdr _sctpch, *sch; unsigned char chunk_type; int event, next_state; int ihl, cofs; #ifdef CONFIG_IP_VS_IPV6 ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr); #else ihl = ip_hdrlen(skb); #endif cofs = ihl + sizeof(struct sctphdr); sch = skb_header_pointer(skb, cofs, sizeof(_sctpch), &_sctpch); if (sch == NULL) return; chunk_type = sch->type; /* * Section 3: Multiple chunks can be bundled into one SCTP packet * up to the MTU size, except for the INIT, INIT ACK, and * SHUTDOWN COMPLETE chunks. These chunks MUST NOT be bundled with * any other chunk in a packet. * * Section 3.3.7: DATA chunks MUST NOT be bundled with ABORT. Control * chunks (except for INIT, INIT ACK, and SHUTDOWN COMPLETE) MAY be * bundled with an ABORT, but they MUST be placed before the ABORT * in the SCTP packet or they will be ignored by the receiver. */ if ((sch->type == SCTP_CID_COOKIE_ECHO) || (sch->type == SCTP_CID_COOKIE_ACK)) { int clen = ntohs(sch->length); if (clen >= sizeof(_sctpch)) { sch = skb_header_pointer(skb, cofs + ALIGN(clen, 4), sizeof(_sctpch), &_sctpch); if (sch && sch->type == SCTP_CID_ABORT) chunk_type = sch->type; } } event = (chunk_type < sizeof(sctp_events)) ? sctp_events[chunk_type] : IP_VS_SCTP_DATA; /* Update direction to INPUT_ONLY if necessary * or delete NO_OUTPUT flag if output packet detected */ if (cp->flags & IP_VS_CONN_F_NOOUTPUT) { if (direction == IP_VS_DIR_OUTPUT) cp->flags &= ~IP_VS_CONN_F_NOOUTPUT; else direction = IP_VS_DIR_INPUT_ONLY; } next_state = sctp_states[direction][event][cp->state]; if (next_state != cp->state) { struct ip_vs_dest *dest = cp->dest; IP_VS_DBG_BUF(8, "%s %s %s:%d->" "%s:%d state: %s->%s conn->refcnt:%d\n", pd->pp->name, ((direction == IP_VS_DIR_OUTPUT) ? "output " : "input "), IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), sctp_state_name(cp->state), sctp_state_name(next_state), refcount_read(&cp->refcnt)); if (dest) { if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && (next_state != IP_VS_SCTP_S_ESTABLISHED)) { atomic_dec(&dest->activeconns); atomic_inc(&dest->inactconns); cp->flags |= IP_VS_CONN_F_INACTIVE; } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && (next_state == IP_VS_SCTP_S_ESTABLISHED)) { atomic_inc(&dest->activeconns); atomic_dec(&dest->inactconns); cp->flags &= ~IP_VS_CONN_F_INACTIVE; } } if (next_state == IP_VS_SCTP_S_ESTABLISHED) ip_vs_control_assure_ct(cp); } if (likely(pd)) cp->timeout = pd->timeout_table[cp->state = next_state]; else /* What to do ? */ cp->timeout = sctp_timeouts[cp->state = next_state]; } static void sctp_state_transition(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, struct ip_vs_proto_data *pd) { spin_lock_bh(&cp->lock); set_sctp_state(pd, cp, direction, skb); spin_unlock_bh(&cp->lock); } static inline __u16 sctp_app_hashkey(__be16 port) { return (((__force u16)port >> SCTP_APP_TAB_BITS) ^ (__force u16)port) & SCTP_APP_TAB_MASK; } static int sctp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { struct ip_vs_app *i; __u16 hash; __be16 port = inc->port; int ret = 0; struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_SCTP); hash = sctp_app_hashkey(port); list_for_each_entry(i, &ipvs->sctp_apps[hash], p_list) { if (i->port == port) { ret = -EEXIST; goto out; } } list_add_rcu(&inc->p_list, &ipvs->sctp_apps[hash]); atomic_inc(&pd->appcnt); out: return ret; } static void sctp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_SCTP); atomic_dec(&pd->appcnt); list_del_rcu(&inc->p_list); } static int sctp_app_conn_bind(struct ip_vs_conn *cp) { struct netns_ipvs *ipvs = cp->ipvs; int hash; struct ip_vs_app *inc; int result = 0; /* Default binding: bind app only for NAT */ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) return 0; /* Lookup application incarnations and bind the right one */ hash = sctp_app_hashkey(cp->vport); list_for_each_entry_rcu(inc, &ipvs->sctp_apps[hash], p_list) { if (inc->port == cp->vport) { if (unlikely(!ip_vs_app_inc_get(inc))) break; IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->" "%s:%u to app %s on port %u\n", __func__, IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), inc->name, ntohs(inc->port)); cp->app = inc; if (inc->init_conn) result = inc->init_conn(inc, cp); break; } } return result; } /* --------------------------------------------- * timeouts is netns related now. * --------------------------------------------- */ static int __ip_vs_sctp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE); pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts, sizeof(sctp_timeouts)); if (!pd->timeout_table) return -ENOMEM; return 0; } static void __ip_vs_sctp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { kfree(pd->timeout_table); } struct ip_vs_protocol ip_vs_protocol_sctp = { .name = "SCTP", .protocol = IPPROTO_SCTP, .num_states = IP_VS_SCTP_S_LAST, .dont_defrag = 0, .init = NULL, .exit = NULL, .init_netns = __ip_vs_sctp_init, .exit_netns = __ip_vs_sctp_exit, .register_app = sctp_register_app, .unregister_app = sctp_unregister_app, .conn_schedule = sctp_conn_schedule, .conn_in_get = ip_vs_conn_in_get_proto, .conn_out_get = ip_vs_conn_out_get_proto, .snat_handler = sctp_snat_handler, .dnat_handler = sctp_dnat_handler, .state_name = sctp_state_name, .state_transition = sctp_state_transition, .app_conn_bind = sctp_app_conn_bind, .debug_packet = ip_vs_tcpudp_debug_packet, .timeout_change = NULL, };
8 3 3 3 8 3 8 3 7 8 8 8 8 3 7 8 35 6 35 1 1 8 8 8 26 27 27 2 25 25 5 5 25 12 1 8 3 9 2 1 10 9 1 6 4 5 5 7 3 8 2 10 10 14 2 12 12 37 37 37 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 // SPDX-License-Identifier: GPL-2.0-only /* net/sched/sch_hhf.c Heavy-Hitter Filter (HHF) * * Copyright (C) 2013 Terry Lam <vtlam@google.com> * Copyright (C) 2013 Nandita Dukkipati <nanditad@google.com> */ #include <linux/jiffies.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> #include <linux/siphash.h> #include <net/pkt_sched.h> #include <net/sock.h> /* Heavy-Hitter Filter (HHF) * * Principles : * Flows are classified into two buckets: non-heavy-hitter and heavy-hitter * buckets. Initially, a new flow starts as non-heavy-hitter. Once classified * as heavy-hitter, it is immediately switched to the heavy-hitter bucket. * The buckets are dequeued by a Weighted Deficit Round Robin (WDRR) scheduler, * in which the heavy-hitter bucket is served with less weight. * In other words, non-heavy-hitters (e.g., short bursts of critical traffic) * are isolated from heavy-hitters (e.g., persistent bulk traffic) and also have * higher share of bandwidth. * * To capture heavy-hitters, we use the "multi-stage filter" algorithm in the * following paper: * [EV02] C. Estan and G. Varghese, "New Directions in Traffic Measurement and * Accounting", in ACM SIGCOMM, 2002. * * Conceptually, a multi-stage filter comprises k independent hash functions * and k counter arrays. Packets are indexed into k counter arrays by k hash * functions, respectively. The counters are then increased by the packet sizes. * Therefore, * - For a heavy-hitter flow: *all* of its k array counters must be large. * - For a non-heavy-hitter flow: some of its k array counters can be large * due to hash collision with other small flows; however, with high * probability, not *all* k counters are large. * * By the design of the multi-stage filter algorithm, the false negative rate * (heavy-hitters getting away uncaptured) is zero. However, the algorithm is * susceptible to false positives (non-heavy-hitters mistakenly classified as * heavy-hitters). * Therefore, we also implement the following optimizations to reduce false * positives by avoiding unnecessary increment of the counter values: * - Optimization O1: once a heavy-hitter is identified, its bytes are not * accounted in the array counters. This technique is called "shielding" * in Section 3.3.1 of [EV02]. * - Optimization O2: conservative update of counters * (Section 3.3.2 of [EV02]), * New counter value = max {old counter value, * smallest counter value + packet bytes} * * Finally, we refresh the counters periodically since otherwise the counter * values will keep accumulating. * * Once a flow is classified as heavy-hitter, we also save its per-flow state * in an exact-matching flow table so that its subsequent packets can be * dispatched to the heavy-hitter bucket accordingly. * * * At a high level, this qdisc works as follows: * Given a packet p: * - If the flow-id of p (e.g., TCP 5-tuple) is already in the exact-matching * heavy-hitter flow table, denoted table T, then send p to the heavy-hitter * bucket. * - Otherwise, forward p to the multi-stage filter, denoted filter F * + If F decides that p belongs to a non-heavy-hitter flow, then send p * to the non-heavy-hitter bucket. * + Otherwise, if F decides that p belongs to a new heavy-hitter flow, * then set up a new flow entry for the flow-id of p in the table T and * send p to the heavy-hitter bucket. * * In this implementation: * - T is a fixed-size hash-table with 1024 entries. Hash collision is * resolved by linked-list chaining. * - F has four counter arrays, each array containing 1024 32-bit counters. * That means 4 * 1024 * 32 bits = 16KB of memory. * - Since each array in F contains 1024 counters, 10 bits are sufficient to * index into each array. * Hence, instead of having four hash functions, we chop the 32-bit * skb-hash into three 10-bit chunks, and the remaining 10-bit chunk is * computed as XOR sum of those three chunks. * - We need to clear the counter arrays periodically; however, directly * memsetting 16KB of memory can lead to cache eviction and unwanted delay. * So by representing each counter by a valid bit, we only need to reset * 4K of 1 bit (i.e. 512 bytes) instead of 16KB of memory. * - The Deficit Round Robin engine is taken from fq_codel implementation * (net/sched/sch_fq_codel.c). Note that wdrr_bucket corresponds to * fq_codel_flow in fq_codel implementation. * */ /* Non-configurable parameters */ #define HH_FLOWS_CNT 1024 /* number of entries in exact-matching table T */ #define HHF_ARRAYS_CNT 4 /* number of arrays in multi-stage filter F */ #define HHF_ARRAYS_LEN 1024 /* number of counters in each array of F */ #define HHF_BIT_MASK_LEN 10 /* masking 10 bits */ #define HHF_BIT_MASK 0x3FF /* bitmask of 10 bits */ #define WDRR_BUCKET_CNT 2 /* two buckets for Weighted DRR */ enum wdrr_bucket_idx { WDRR_BUCKET_FOR_HH = 0, /* bucket id for heavy-hitters */ WDRR_BUCKET_FOR_NON_HH = 1 /* bucket id for non-heavy-hitters */ }; #define hhf_time_before(a, b) \ (typecheck(u32, a) && typecheck(u32, b) && ((s32)((a) - (b)) < 0)) /* Heavy-hitter per-flow state */ struct hh_flow_state { u32 hash_id; /* hash of flow-id (e.g. TCP 5-tuple) */ u32 hit_timestamp; /* last time heavy-hitter was seen */ struct list_head flowchain; /* chaining under hash collision */ }; /* Weighted Deficit Round Robin (WDRR) scheduler */ struct wdrr_bucket { struct sk_buff *head; struct sk_buff *tail; struct list_head bucketchain; int deficit; }; struct hhf_sched_data { struct wdrr_bucket buckets[WDRR_BUCKET_CNT]; siphash_key_t perturbation; /* hash perturbation */ u32 quantum; /* psched_mtu(qdisc_dev(sch)); */ u32 drop_overlimit; /* number of times max qdisc packet * limit was hit */ struct list_head *hh_flows; /* table T (currently active HHs) */ u32 hh_flows_limit; /* max active HH allocs */ u32 hh_flows_overlimit; /* num of disallowed HH allocs */ u32 hh_flows_total_cnt; /* total admitted HHs */ u32 hh_flows_current_cnt; /* total current HHs */ u32 *hhf_arrays[HHF_ARRAYS_CNT]; /* HH filter F */ u32 hhf_arrays_reset_timestamp; /* last time hhf_arrays * was reset */ unsigned long *hhf_valid_bits[HHF_ARRAYS_CNT]; /* shadow valid bits * of hhf_arrays */ /* Similar to the "new_flows" vs. "old_flows" concept in fq_codel DRR */ struct list_head new_buckets; /* list of new buckets */ struct list_head old_buckets; /* list of old buckets */ /* Configurable HHF parameters */ u32 hhf_reset_timeout; /* interval to reset counter * arrays in filter F * (default 40ms) */ u32 hhf_admit_bytes; /* counter thresh to classify as * HH (default 128KB). * With these default values, * 128KB / 40ms = 25 Mbps * i.e., we expect to capture HHs * sending > 25 Mbps. */ u32 hhf_evict_timeout; /* aging threshold to evict idle * HHs out of table T. This should * be large enough to avoid * reordering during HH eviction. * (default 1s) */ u32 hhf_non_hh_weight; /* WDRR weight for non-HHs * (default 2, * i.e., non-HH : HH = 2 : 1) */ }; static u32 hhf_time_stamp(void) { return jiffies; } /* Looks up a heavy-hitter flow in a chaining list of table T. */ static struct hh_flow_state *seek_list(const u32 hash, struct list_head *head, struct hhf_sched_data *q) { struct hh_flow_state *flow, *next; u32 now = hhf_time_stamp(); if (list_empty(head)) return NULL; list_for_each_entry_safe(flow, next, head, flowchain) { u32 prev = flow->hit_timestamp + q->hhf_evict_timeout; if (hhf_time_before(prev, now)) { /* Delete expired heavy-hitters, but preserve one entry * to avoid kzalloc() when next time this slot is hit. */ if (list_is_last(&flow->flowchain, head)) return NULL; list_del(&flow->flowchain); kfree(flow); q->hh_flows_current_cnt--; } else if (flow->hash_id == hash) { return flow; } } return NULL; } /* Returns a flow state entry for a new heavy-hitter. Either reuses an expired * entry or dynamically alloc a new entry. */ static struct hh_flow_state *alloc_new_hh(struct list_head *head, struct hhf_sched_data *q) { struct hh_flow_state *flow; u32 now = hhf_time_stamp(); if (!list_empty(head)) { /* Find an expired heavy-hitter flow entry. */ list_for_each_entry(flow, head, flowchain) { u32 prev = flow->hit_timestamp + q->hhf_evict_timeout; if (hhf_time_before(prev, now)) return flow; } } if (q->hh_flows_current_cnt >= q->hh_flows_limit) { q->hh_flows_overlimit++; return NULL; } /* Create new entry. */ flow = kzalloc_obj(struct hh_flow_state, GFP_ATOMIC); if (!flow) return NULL; q->hh_flows_current_cnt++; INIT_LIST_HEAD(&flow->flowchain); list_add_tail(&flow->flowchain, head); return flow; } /* Assigns packets to WDRR buckets. Implements a multi-stage filter to * classify heavy-hitters. */ static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch) { struct hhf_sched_data *q = qdisc_priv(sch); u32 tmp_hash, hash; u32 xorsum, filter_pos[HHF_ARRAYS_CNT], flow_pos; struct hh_flow_state *flow; u32 pkt_len, min_hhf_val; int i; u32 prev; u32 now = hhf_time_stamp(); /* Reset the HHF counter arrays if this is the right time. */ prev = q->hhf_arrays_reset_timestamp + q->hhf_reset_timeout; if (hhf_time_before(prev, now)) { for (i = 0; i < HHF_ARRAYS_CNT; i++) bitmap_zero(q->hhf_valid_bits[i], HHF_ARRAYS_LEN); q->hhf_arrays_reset_timestamp = now; } /* Get hashed flow-id of the skb. */ hash = skb_get_hash_perturb(skb, &q->perturbation); /* Check if this packet belongs to an already established HH flow. */ flow_pos = hash & HHF_BIT_MASK; flow = seek_list(hash, &q->hh_flows[flow_pos], q); if (flow) { /* found its HH flow */ flow->hit_timestamp = now; return WDRR_BUCKET_FOR_HH; } /* Now pass the packet through the multi-stage filter. */ tmp_hash = hash; xorsum = 0; for (i = 0; i < HHF_ARRAYS_CNT - 1; i++) { /* Split the skb_hash into three 10-bit chunks. */ filter_pos[i] = tmp_hash & HHF_BIT_MASK; xorsum ^= filter_pos[i]; tmp_hash >>= HHF_BIT_MASK_LEN; } /* The last chunk is computed as XOR sum of other chunks. */ filter_pos[HHF_ARRAYS_CNT - 1] = xorsum ^ tmp_hash; pkt_len = qdisc_pkt_len(skb); min_hhf_val = ~0U; for (i = 0; i < HHF_ARRAYS_CNT; i++) { u32 val; if (!test_bit(filter_pos[i], q->hhf_valid_bits[i])) { q->hhf_arrays[i][filter_pos[i]] = 0; __set_bit(filter_pos[i], q->hhf_valid_bits[i]); } val = q->hhf_arrays[i][filter_pos[i]] + pkt_len; if (min_hhf_val > val) min_hhf_val = val; } /* Found a new HH iff all counter values > HH admit threshold. */ if (min_hhf_val > q->hhf_admit_bytes) { /* Just captured a new heavy-hitter. */ flow = alloc_new_hh(&q->hh_flows[flow_pos], q); if (!flow) /* memory alloc problem */ return WDRR_BUCKET_FOR_NON_HH; flow->hash_id = hash; flow->hit_timestamp = now; q->hh_flows_total_cnt++; /* By returning without updating counters in q->hhf_arrays, * we implicitly implement "shielding" (see Optimization O1). */ return WDRR_BUCKET_FOR_HH; } /* Conservative update of HHF arrays (see Optimization O2). */ for (i = 0; i < HHF_ARRAYS_CNT; i++) { if (q->hhf_arrays[i][filter_pos[i]] < min_hhf_val) q->hhf_arrays[i][filter_pos[i]] = min_hhf_val; } return WDRR_BUCKET_FOR_NON_HH; } /* Removes one skb from head of bucket. */ static struct sk_buff *dequeue_head(struct wdrr_bucket *bucket) { struct sk_buff *skb = bucket->head; bucket->head = skb->next; skb_mark_not_on_list(skb); return skb; } /* Tail-adds skb to bucket. */ static void bucket_add(struct wdrr_bucket *bucket, struct sk_buff *skb) { if (bucket->head == NULL) bucket->head = skb; else bucket->tail->next = skb; bucket->tail = skb; skb->next = NULL; } static unsigned int hhf_drop(struct Qdisc *sch, struct sk_buff **to_free) { struct hhf_sched_data *q = qdisc_priv(sch); struct wdrr_bucket *bucket; /* Always try to drop from heavy-hitters first. */ bucket = &q->buckets[WDRR_BUCKET_FOR_HH]; if (!bucket->head) bucket = &q->buckets[WDRR_BUCKET_FOR_NON_HH]; if (bucket->head) { struct sk_buff *skb = dequeue_head(bucket); sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); qdisc_drop(skb, sch, to_free); } /* Return id of the bucket from which the packet was dropped. */ return bucket - q->buckets; } static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct hhf_sched_data *q = qdisc_priv(sch); enum wdrr_bucket_idx idx; struct wdrr_bucket *bucket; unsigned int prev_backlog; idx = hhf_classify(skb, sch); bucket = &q->buckets[idx]; bucket_add(bucket, skb); qdisc_qstats_backlog_inc(sch, skb); if (list_empty(&bucket->bucketchain)) { unsigned int weight; /* The logic of new_buckets vs. old_buckets is the same as * new_flows vs. old_flows in the implementation of fq_codel, * i.e., short bursts of non-HHs should have strict priority. */ if (idx == WDRR_BUCKET_FOR_HH) { /* Always move heavy-hitters to old bucket. */ weight = 1; list_add_tail(&bucket->bucketchain, &q->old_buckets); } else { weight = q->hhf_non_hh_weight; list_add_tail(&bucket->bucketchain, &q->new_buckets); } bucket->deficit = weight * q->quantum; } if (++sch->q.qlen <= sch->limit) return NET_XMIT_SUCCESS; prev_backlog = sch->qstats.backlog; q->drop_overlimit++; /* Return Congestion Notification only if we dropped a packet from this * bucket. */ if (hhf_drop(sch, to_free) == idx) return NET_XMIT_CN; /* As we dropped a packet, better let upper stack know this. */ qdisc_tree_reduce_backlog(sch, 1, prev_backlog - sch->qstats.backlog); return NET_XMIT_SUCCESS; } static struct sk_buff *hhf_dequeue(struct Qdisc *sch) { struct hhf_sched_data *q = qdisc_priv(sch); struct sk_buff *skb = NULL; struct wdrr_bucket *bucket; struct list_head *head; begin: head = &q->new_buckets; if (list_empty(head)) { head = &q->old_buckets; if (list_empty(head)) return NULL; } bucket = list_first_entry(head, struct wdrr_bucket, bucketchain); if (bucket->deficit <= 0) { int weight = (bucket - q->buckets == WDRR_BUCKET_FOR_HH) ? 1 : q->hhf_non_hh_weight; bucket->deficit += weight * q->quantum; list_move_tail(&bucket->bucketchain, &q->old_buckets); goto begin; } if (bucket->head) { skb = dequeue_head(bucket); sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); } if (!skb) { /* Force a pass through old_buckets to prevent starvation. */ if ((head == &q->new_buckets) && !list_empty(&q->old_buckets)) list_move_tail(&bucket->bucketchain, &q->old_buckets); else list_del_init(&bucket->bucketchain); goto begin; } qdisc_bstats_update(sch, skb); bucket->deficit -= qdisc_pkt_len(skb); return skb; } static void hhf_reset(struct Qdisc *sch) { struct sk_buff *skb; while ((skb = hhf_dequeue(sch)) != NULL) rtnl_kfree_skbs(skb, skb); } static void hhf_destroy(struct Qdisc *sch) { int i; struct hhf_sched_data *q = qdisc_priv(sch); for (i = 0; i < HHF_ARRAYS_CNT; i++) { kvfree(q->hhf_arrays[i]); kvfree(q->hhf_valid_bits[i]); } if (!q->hh_flows) return; for (i = 0; i < HH_FLOWS_CNT; i++) { struct hh_flow_state *flow, *next; struct list_head *head = &q->hh_flows[i]; if (list_empty(head)) continue; list_for_each_entry_safe(flow, next, head, flowchain) { list_del(&flow->flowchain); kfree(flow); } } kvfree(q->hh_flows); } static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = { [TCA_HHF_BACKLOG_LIMIT] = { .type = NLA_U32 }, [TCA_HHF_QUANTUM] = { .type = NLA_U32 }, [TCA_HHF_HH_FLOWS_LIMIT] = { .type = NLA_U32 }, [TCA_HHF_RESET_TIMEOUT] = { .type = NLA_U32 }, [TCA_HHF_ADMIT_BYTES] = { .type = NLA_U32 }, [TCA_HHF_EVICT_TIMEOUT] = { .type = NLA_U32 }, [TCA_HHF_NON_HH_WEIGHT] = { .type = NLA_U32 }, }; static int hhf_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { unsigned int dropped_pkts = 0, dropped_bytes = 0; struct hhf_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_HHF_MAX + 1]; int err; u64 non_hh_quantum; u32 new_quantum = q->quantum; u32 new_hhf_non_hh_weight = q->hhf_non_hh_weight; err = nla_parse_nested_deprecated(tb, TCA_HHF_MAX, opt, hhf_policy, NULL); if (err < 0) return err; if (tb[TCA_HHF_QUANTUM]) new_quantum = nla_get_u32(tb[TCA_HHF_QUANTUM]); if (tb[TCA_HHF_NON_HH_WEIGHT]) new_hhf_non_hh_weight = nla_get_u32(tb[TCA_HHF_NON_HH_WEIGHT]); non_hh_quantum = (u64)new_quantum * new_hhf_non_hh_weight; if (non_hh_quantum == 0 || non_hh_quantum > INT_MAX) return -EINVAL; sch_tree_lock(sch); if (tb[TCA_HHF_BACKLOG_LIMIT]) WRITE_ONCE(sch->limit, nla_get_u32(tb[TCA_HHF_BACKLOG_LIMIT])); WRITE_ONCE(q->quantum, new_quantum); WRITE_ONCE(q->hhf_non_hh_weight, new_hhf_non_hh_weight); if (tb[TCA_HHF_HH_FLOWS_LIMIT]) WRITE_ONCE(q->hh_flows_limit, nla_get_u32(tb[TCA_HHF_HH_FLOWS_LIMIT])); if (tb[TCA_HHF_RESET_TIMEOUT]) { u32 us = nla_get_u32(tb[TCA_HHF_RESET_TIMEOUT]); WRITE_ONCE(q->hhf_reset_timeout, usecs_to_jiffies(us)); } if (tb[TCA_HHF_ADMIT_BYTES]) WRITE_ONCE(q->hhf_admit_bytes, nla_get_u32(tb[TCA_HHF_ADMIT_BYTES])); if (tb[TCA_HHF_EVICT_TIMEOUT]) { u32 us = nla_get_u32(tb[TCA_HHF_EVICT_TIMEOUT]); WRITE_ONCE(q->hhf_evict_timeout, usecs_to_jiffies(us)); } while (sch->q.qlen > sch->limit) { struct sk_buff *skb = qdisc_dequeue_internal(sch, false); if (!skb) break; dropped_pkts++; dropped_bytes += qdisc_pkt_len(skb); rtnl_kfree_skbs(skb, skb); } qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); sch_tree_unlock(sch); return 0; } static int hhf_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct hhf_sched_data *q = qdisc_priv(sch); int i; sch->limit = 1000; q->quantum = psched_mtu(qdisc_dev(sch)); get_random_bytes(&q->perturbation, sizeof(q->perturbation)); INIT_LIST_HEAD(&q->new_buckets); INIT_LIST_HEAD(&q->old_buckets); /* Configurable HHF parameters */ q->hhf_reset_timeout = HZ / 25; /* 40 ms */ q->hhf_admit_bytes = 131072; /* 128 KB */ q->hhf_evict_timeout = HZ; /* 1 sec */ q->hhf_non_hh_weight = 2; if (opt) { int err = hhf_change(sch, opt, extack); if (err) return err; } if (!q->hh_flows) { /* Initialize heavy-hitter flow table. */ q->hh_flows = kvzalloc_objs(struct list_head, HH_FLOWS_CNT); if (!q->hh_flows) return -ENOMEM; for (i = 0; i < HH_FLOWS_CNT; i++) INIT_LIST_HEAD(&q->hh_flows[i]); /* Cap max active HHs at twice len of hh_flows table. */ q->hh_flows_limit = 2 * HH_FLOWS_CNT; q->hh_flows_overlimit = 0; q->hh_flows_total_cnt = 0; q->hh_flows_current_cnt = 0; /* Initialize heavy-hitter filter arrays. */ for (i = 0; i < HHF_ARRAYS_CNT; i++) { q->hhf_arrays[i] = kvcalloc(HHF_ARRAYS_LEN, sizeof(u32), GFP_KERNEL); if (!q->hhf_arrays[i]) { /* Note: hhf_destroy() will be called * by our caller. */ return -ENOMEM; } } q->hhf_arrays_reset_timestamp = hhf_time_stamp(); /* Initialize valid bits of heavy-hitter filter arrays. */ for (i = 0; i < HHF_ARRAYS_CNT; i++) { q->hhf_valid_bits[i] = kvzalloc(HHF_ARRAYS_LEN / BITS_PER_BYTE, GFP_KERNEL); if (!q->hhf_valid_bits[i]) { /* Note: hhf_destroy() will be called * by our caller. */ return -ENOMEM; } } /* Initialize Weighted DRR buckets. */ for (i = 0; i < WDRR_BUCKET_CNT; i++) { struct wdrr_bucket *bucket = q->buckets + i; INIT_LIST_HEAD(&bucket->bucketchain); } } return 0; } static int hhf_dump(struct Qdisc *sch, struct sk_buff *skb) { struct hhf_sched_data *q = qdisc_priv(sch); struct nlattr *opts; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; if (nla_put_u32(skb, TCA_HHF_BACKLOG_LIMIT, READ_ONCE(sch->limit)) || nla_put_u32(skb, TCA_HHF_QUANTUM, READ_ONCE(q->quantum)) || nla_put_u32(skb, TCA_HHF_HH_FLOWS_LIMIT, READ_ONCE(q->hh_flows_limit)) || nla_put_u32(skb, TCA_HHF_RESET_TIMEOUT, jiffies_to_usecs(READ_ONCE(q->hhf_reset_timeout))) || nla_put_u32(skb, TCA_HHF_ADMIT_BYTES, READ_ONCE(q->hhf_admit_bytes)) || nla_put_u32(skb, TCA_HHF_EVICT_TIMEOUT, jiffies_to_usecs(READ_ONCE(q->hhf_evict_timeout))) || nla_put_u32(skb, TCA_HHF_NON_HH_WEIGHT, READ_ONCE(q->hhf_non_hh_weight))) goto nla_put_failure; return nla_nest_end(skb, opts); nla_put_failure: return -1; } static int hhf_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct hhf_sched_data *q = qdisc_priv(sch); struct tc_hhf_xstats st = { .drop_overlimit = q->drop_overlimit, .hh_overlimit = q->hh_flows_overlimit, .hh_tot_count = q->hh_flows_total_cnt, .hh_cur_count = q->hh_flows_current_cnt, }; return gnet_stats_copy_app(d, &st, sizeof(st)); } static struct Qdisc_ops hhf_qdisc_ops __read_mostly = { .id = "hhf", .priv_size = sizeof(struct hhf_sched_data), .enqueue = hhf_enqueue, .dequeue = hhf_dequeue, .peek = qdisc_peek_dequeued, .init = hhf_init, .reset = hhf_reset, .destroy = hhf_destroy, .change = hhf_change, .dump = hhf_dump, .dump_stats = hhf_dump_stats, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("hhf"); static int __init hhf_module_init(void) { return register_qdisc(&hhf_qdisc_ops); } static void __exit hhf_module_exit(void) { unregister_qdisc(&hhf_qdisc_ops); } module_init(hhf_module_init) module_exit(hhf_module_exit) MODULE_AUTHOR("Terry Lam"); MODULE_AUTHOR("Nandita Dukkipati"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Heavy-Hitter Filter (HHF)");
16 16 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 // SPDX-License-Identifier: GPL-2.0-only /* * linux/lib/cmdline.c * Helper functions generally used for parsing kernel command line * and module options. * * Code and copyrights come from init/main.c and arch/i386/kernel/setup.c. * * GNU Indent formatting options for this file: -kr -i8 -npsl -pcs */ #include <linux/export.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/ctype.h> /* * If a hyphen was found in get_option, this will handle the * range of numbers, M-N. This will expand the range and insert * the values[M, M+1, ..., N] into the ints array in get_options. */ static int get_range(char **str, int *pint, int n) { int x, inc_counter, upper_range; (*str)++; upper_range = simple_strtol((*str), NULL, 0); inc_counter = upper_range - *pint; for (x = *pint; n && x < upper_range; x++, n--) *pint++ = x; return inc_counter; } /** * get_option - Parse integer from an option string * @str: option string * @pint: (optional output) integer value parsed from @str * * Read an int from an option string; if available accept a subsequent * comma as well. * * When @pint is NULL the function can be used as a validator of * the current option in the string. * * Return values: * 0 - no int in string * 1 - int found, no subsequent comma * 2 - int found including a subsequent comma * 3 - hyphen found to denote a range * * Leading hyphen without integer is no integer case, but we consume it * for the sake of simplification. */ int get_option(char **str, int *pint) { char *cur = *str; int value; if (!cur || !(*cur)) return 0; if (*cur == '-') value = -simple_strtoull(++cur, str, 0); else value = simple_strtoull(cur, str, 0); if (pint) *pint = value; if (cur == *str) return 0; if (**str == ',') { (*str)++; return 2; } if (**str == '-') return 3; return 1; } EXPORT_SYMBOL(get_option); /** * get_options - Parse a string into a list of integers * @str: String to be parsed * @nints: size of integer array * @ints: integer array (must have room for at least one element) * * This function parses a string containing a comma-separated * list of integers, a hyphen-separated range of _positive_ integers, * or a combination of both. The parse halts when the array is * full, or when no more numbers can be retrieved from the * string. * * When @nints is 0, the function just validates the given @str and * returns the amount of parseable integers as described below. * * Returns: * * The first element is filled by the number of collected integers * in the range. The rest is what was parsed from the @str. * * Return value is the character in the string which caused * the parse to end (typically a null terminator, if @str is * completely parseable). */ char *get_options(const char *str, int nints, int *ints) { bool validate = (nints == 0); int res, i = 1; while (i < nints || validate) { int *pint = validate ? ints : ints + i; res = get_option((char **)&str, pint); if (res == 0) break; if (res == 3) { int n = validate ? 0 : nints - i; int range_nums; range_nums = get_range((char **)&str, pint, n); if (range_nums < 0) break; /* * Decrement the result by one to leave out the * last number in the range. The next iteration * will handle the upper number in the range */ i += (range_nums - 1); } i++; if (res == 1) break; } ints[0] = i - 1; return (char *)str; } EXPORT_SYMBOL(get_options); /** * memparse - parse a string with mem suffixes into a number * @ptr: Where parse begins * @retptr: (output) Optional pointer to next char after parse completes * * Parses a string into a number. The number stored at @ptr is * potentially suffixed with K, M, G, T, P, E. */ unsigned long long memparse(const char *ptr, char **retptr) { char *endptr; /* local pointer to end of parsed string */ unsigned long long ret = simple_strtoull(ptr, &endptr, 0); switch (*endptr) { case 'E': case 'e': ret <<= 10; fallthrough; case 'P': case 'p': ret <<= 10; fallthrough; case 'T': case 't': ret <<= 10; fallthrough; case 'G': case 'g': ret <<= 10; fallthrough; case 'M': case 'm': ret <<= 10; fallthrough; case 'K': case 'k': ret <<= 10; endptr++; fallthrough; default: break; } if (retptr) *retptr = endptr; return ret; } EXPORT_SYMBOL(memparse); /** * parse_option_str - Parse a string and check an option is set or not * @str: String to be parsed * @option: option name * * This function parses a string containing a comma-separated list of * strings like a=b,c. * * Return true if there's such option in the string, or return false. */ bool parse_option_str(const char *str, const char *option) { while (*str) { if (!strncmp(str, option, strlen(option))) { str += strlen(option); if (!*str || *str == ',') return true; } while (*str && *str != ',') str++; if (*str == ',') str++; } return false; } /* * Parse a string to get a param value pair. * You can use " around spaces, but can't escape ". * Hyphens and underscores equivalent in parameter names. */ char *next_arg(char *args, char **param, char **val) { unsigned int i, equals = 0; int in_quote = 0, quoted = 0; if (*args == '"') { args++; in_quote = 1; quoted = 1; } for (i = 0; args[i]; i++) { if (isspace(args[i]) && !in_quote) break; if (equals == 0) { if (args[i] == '=') equals = i; } if (args[i] == '"') in_quote = !in_quote; } *param = args; if (!equals) *val = NULL; else { args[equals] = '\0'; *val = args + equals + 1; /* Don't include quotes in value. */ if (**val == '"') { (*val)++; if (args[i-1] == '"') args[i-1] = '\0'; } } if (quoted && i > 0 && args[i-1] == '"') args[i-1] = '\0'; if (args[i]) { args[i] = '\0'; args += i + 1; } else args += i; /* Chew up trailing spaces. */ return skip_spaces(args); } EXPORT_SYMBOL(next_arg);
4 4 4 4 3 3 14 14 3 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the bitmap:port type */ #include <linux/module.h> #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/netlink.h> #include <linux/jiffies.h> #include <linux/timer.h> #include <net/netlink.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_bitmap.h> #include <linux/netfilter/ipset/ip_set_getport.h> #define IPSET_TYPE_REV_MIN 0 /* 1 Counter support added */ /* 2 Comment support added */ #define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("bitmap:port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_bitmap:port"); #define MTYPE bitmap_port /* Type structure */ struct bitmap_port { unsigned long *members; /* the set members */ u16 first_port; /* host byte order, included in range */ u16 last_port; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ size_t memsize; /* members size */ struct timer_list gc; /* garbage collection */ struct ip_set *set; /* attached to this ip_set */ unsigned char extensions[] /* data extensions */ __aligned(__alignof__(u64)); }; /* ADT structure for generic function args */ struct bitmap_port_adt_elem { u16 id; }; static u16 port_to_id(const struct bitmap_port *m, u16 port) { return port - m->first_port; } /* Common functions */ static int bitmap_port_do_test(const struct bitmap_port_adt_elem *e, const struct bitmap_port *map, size_t dsize) { return !!test_bit(e->id, map->members); } static int bitmap_port_gc_test(u16 id, const struct bitmap_port *map, size_t dsize) { return !!test_bit(id, map->members); } static int bitmap_port_do_add(const struct bitmap_port_adt_elem *e, struct bitmap_port *map, u32 flags, size_t dsize) { return !!test_bit(e->id, map->members); } static int bitmap_port_do_del(const struct bitmap_port_adt_elem *e, struct bitmap_port *map) { return !test_and_clear_bit(e->id, map->members); } static int bitmap_port_do_list(struct sk_buff *skb, const struct bitmap_port *map, u32 id, size_t dsize) { return nla_put_net16(skb, IPSET_ATTR_PORT, htons(map->first_port + id)); } static int bitmap_port_do_head(struct sk_buff *skb, const struct bitmap_port *map) { return nla_put_net16(skb, IPSET_ATTR_PORT, htons(map->first_port)) || nla_put_net16(skb, IPSET_ATTR_PORT_TO, htons(map->last_port)); } static bool ip_set_get_ip_port(const struct sk_buff *skb, u8 pf, bool src, __be16 *port) { bool ret; u8 proto; switch (pf) { case NFPROTO_IPV4: ret = ip_set_get_ip4_port(skb, src, port, &proto); break; case NFPROTO_IPV6: ret = ip_set_get_ip6_port(skb, src, port, &proto); break; default: return false; } if (!ret) return ret; switch (proto) { case IPPROTO_TCP: case IPPROTO_UDP: return true; default: return false; } } static int bitmap_port_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { struct bitmap_port *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct bitmap_port_adt_elem e = { .id = 0 }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); __be16 __port; u16 port = 0; if (!ip_set_get_ip_port(skb, opt->family, opt->flags & IPSET_DIM_ONE_SRC, &__port)) return -EINVAL; port = ntohs(__port); if (port < map->first_port || port > map->last_port) return -IPSET_ERR_BITMAP_RANGE; e.id = port_to_id(map, port); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { struct bitmap_port *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct bitmap_port_adt_elem e = { .id = 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 port; /* wraparound */ u16 port_to; int ret = 0; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; port = ip_set_get_h16(tb[IPSET_ATTR_PORT]); if (port < map->first_port || port > map->last_port) return -IPSET_ERR_BITMAP_RANGE; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (adt == IPSET_TEST) { e.id = port_to_id(map, port); return adtfn(set, &e, &ext, &ext, flags); } if (tb[IPSET_ATTR_PORT_TO]) { port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) { swap(port, port_to); if (port < map->first_port) return -IPSET_ERR_BITMAP_RANGE; } } else { port_to = port; } if (port_to > map->last_port) return -IPSET_ERR_BITMAP_RANGE; for (; port <= port_to; port++) { e.id = port_to_id(map, port); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } return ret; } static bool bitmap_port_same_set(const struct ip_set *a, const struct ip_set *b) { const struct bitmap_port *x = a->data; const struct bitmap_port *y = b->data; return x->first_port == y->first_port && x->last_port == y->last_port && a->timeout == b->timeout && a->extensions == b->extensions; } /* Plain variant */ struct bitmap_port_elem { }; #include "ip_set_bitmap_gen.h" /* Create bitmap:ip type of sets */ static bool init_map_port(struct ip_set *set, struct bitmap_port *map, u16 first_port, u16 last_port) { map->members = bitmap_zalloc(map->elements, GFP_KERNEL | __GFP_NOWARN); if (!map->members) return false; map->first_port = first_port; map->last_port = last_port; set->timeout = IPSET_NO_TIMEOUT; map->set = set; set->data = map; set->family = NFPROTO_UNSPEC; return true; } static int bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[], u32 flags) { struct bitmap_port *map; u16 first_port, last_port; u32 elements; if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; first_port = ip_set_get_h16(tb[IPSET_ATTR_PORT]); last_port = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (first_port > last_port) swap(first_port, last_port); elements = last_port - first_port + 1; set->dsize = ip_set_elem_len(set, tb, 0, 0); map = ip_set_alloc(sizeof(*map) + elements * set->dsize); if (!map) return -ENOMEM; map->elements = elements; map->memsize = BITS_TO_LONGS(elements) * sizeof(unsigned long); set->variant = &bitmap_port; if (!init_map_port(set, map, first_port, last_port)) { ip_set_free(map); return -ENOMEM; } if (tb[IPSET_ATTR_TIMEOUT]) { set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); bitmap_port_gc_init(set, bitmap_port_gc); } return 0; } static struct ip_set_type bitmap_port_type = { .name = "bitmap:port", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_PORT, .dimension = IPSET_DIM_ONE, .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, .create = bitmap_port_create, .create_policy = { [IPSET_ATTR_PORT] = { .type = NLA_U16 }, [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_PORT] = { .type = NLA_U16 }, [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; static int __init bitmap_port_init(void) { return ip_set_type_register(&bitmap_port_type); } static void __exit bitmap_port_fini(void) { rcu_barrier(); ip_set_type_unregister(&bitmap_port_type); } module_init(bitmap_port_init); module_exit(bitmap_port_fini);
18 17 18 16 18 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 // SPDX-License-Identifier: GPL-2.0-or-later #include <linux/syscalls.h> #include <linux/time_namespace.h> #include "futex.h" /* * Support for robust futexes: the kernel cleans up held futexes at * thread exit time. * * Implementation: user-space maintains a per-thread list of locks it * is holding. Upon do_exit(), the kernel carefully walks this list, * and marks all locks that are owned by this thread with the * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is * always manipulated with the lock held, so the list is private and * per-thread. Userspace also maintains a per-thread 'list_op_pending' * field, to allow the kernel to clean up if the thread dies after * acquiring the lock, but just before it could have added itself to * the list. There can only be one such pending lock. */ /** * sys_set_robust_list() - Set the robust-futex list head of a task * @head: pointer to the list-head * @len: length of the list-head, as userspace expects */ SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, size_t, len) { /* * The kernel knows only one size for now: */ if (unlikely(len != sizeof(*head))) return -EINVAL; current->robust_list = head; return 0; } static inline void __user *futex_task_robust_list(struct task_struct *p, bool compat) { #ifdef CONFIG_COMPAT if (compat) return p->compat_robust_list; #endif return p->robust_list; } static void __user *futex_get_robust_list_common(int pid, bool compat) { struct task_struct *p = current; void __user *head; int ret; scoped_guard(rcu) { if (pid) { p = find_task_by_vpid(pid); if (!p) return (void __user *)ERR_PTR(-ESRCH); } get_task_struct(p); } /* * Hold exec_update_lock to serialize with concurrent exec() * so ptrace_may_access() is checked against stable credentials */ ret = down_read_killable(&p->signal->exec_update_lock); if (ret) goto err_put; ret = -EPERM; if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) goto err_unlock; head = futex_task_robust_list(p, compat); up_read(&p->signal->exec_update_lock); put_task_struct(p); return head; err_unlock: up_read(&p->signal->exec_update_lock); err_put: put_task_struct(p); return (void __user *)ERR_PTR(ret); } /** * sys_get_robust_list() - Get the robust-futex list head of a task * @pid: pid of the process [zero for current task] * @head_ptr: pointer to a list-head pointer, the kernel fills it in * @len_ptr: pointer to a length field, the kernel fills in the header size */ SYSCALL_DEFINE3(get_robust_list, int, pid, struct robust_list_head __user * __user *, head_ptr, size_t __user *, len_ptr) { struct robust_list_head __user *head = futex_get_robust_list_common(pid, false); if (IS_ERR(head)) return PTR_ERR(head); if (put_user(sizeof(*head), len_ptr)) return -EFAULT; return put_user(head, head_ptr); } long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, u32 __user *uaddr2, u32 val2, u32 val3) { unsigned int flags = futex_to_flags(op); int cmd = op & FUTEX_CMD_MASK; if (flags & FLAGS_CLOCKRT) { if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI && cmd != FUTEX_LOCK_PI2) return -ENOSYS; } switch (cmd) { case FUTEX_WAIT: val3 = FUTEX_BITSET_MATCH_ANY; fallthrough; case FUTEX_WAIT_BITSET: return futex_wait(uaddr, flags, val, timeout, val3); case FUTEX_WAKE: val3 = FUTEX_BITSET_MATCH_ANY; fallthrough; case FUTEX_WAKE_BITSET: return futex_wake(uaddr, flags, val, val3); case FUTEX_REQUEUE: return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, NULL, 0); case FUTEX_CMP_REQUEUE: return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 0); case FUTEX_WAKE_OP: return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); case FUTEX_LOCK_PI: flags |= FLAGS_CLOCKRT; fallthrough; case FUTEX_LOCK_PI2: return futex_lock_pi(uaddr, flags, timeout, 0); case FUTEX_UNLOCK_PI: return futex_unlock_pi(uaddr, flags); case FUTEX_TRYLOCK_PI: return futex_lock_pi(uaddr, flags, NULL, 1); case FUTEX_WAIT_REQUEUE_PI: val3 = FUTEX_BITSET_MATCH_ANY; return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, uaddr2); case FUTEX_CMP_REQUEUE_PI: return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 1); } return -ENOSYS; } static __always_inline bool futex_cmd_has_timeout(u32 cmd) { switch (cmd) { case FUTEX_WAIT: case FUTEX_LOCK_PI: case FUTEX_LOCK_PI2: case FUTEX_WAIT_BITSET: case FUTEX_WAIT_REQUEUE_PI: return true; } return false; } static __always_inline int futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t) { if (!timespec64_valid(ts)) return -EINVAL; *t = timespec64_to_ktime(*ts); if (cmd == FUTEX_WAIT) *t = ktime_add_safe(ktime_get(), *t); else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME)) *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t); return 0; } SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, const struct __kernel_timespec __user *, utime, u32 __user *, uaddr2, u32, val3) { int ret, cmd = op & FUTEX_CMD_MASK; ktime_t t, *tp = NULL; struct timespec64 ts; if (utime && futex_cmd_has_timeout(cmd)) { if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) return -EFAULT; if (get_timespec64(&ts, utime)) return -EFAULT; ret = futex_init_timeout(cmd, op, &ts, &t); if (ret) return ret; tp = &t; } return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); } /** * futex_parse_waitv - Parse a waitv array from userspace * @futexv: Kernel side list of waiters to be filled * @uwaitv: Userspace list to be parsed * @nr_futexes: Length of futexv * @wake: Wake to call when futex is woken * @wake_data: Data for the wake handler * * Return: Error code on failure, 0 on success */ int futex_parse_waitv(struct futex_vector *futexv, struct futex_waitv __user *uwaitv, unsigned int nr_futexes, futex_wake_fn *wake, void *wake_data) { struct futex_waitv aux; unsigned int i; for (i = 0; i < nr_futexes; i++) { unsigned int flags; if (copy_from_user(&aux, &uwaitv[i], sizeof(aux))) return -EFAULT; if ((aux.flags & ~FUTEX2_VALID_MASK) || aux.__reserved) return -EINVAL; flags = futex2_to_flags(aux.flags); if (!futex_flags_valid(flags)) return -EINVAL; if (!futex_validate_input(flags, aux.val)) return -EINVAL; futexv[i].w.flags = flags; futexv[i].w.val = aux.val; futexv[i].w.uaddr = aux.uaddr; futexv[i].q = futex_q_init; futexv[i].q.wake = wake; futexv[i].q.wake_data = wake_data; } return 0; } static int futex2_setup_timeout(struct __kernel_timespec __user *timeout, clockid_t clockid, struct hrtimer_sleeper *to) { int flag_clkid = 0, flag_init = 0; struct timespec64 ts; ktime_t time; int ret; if (!timeout) return 0; if (clockid == CLOCK_REALTIME) { flag_clkid = FLAGS_CLOCKRT; flag_init = FUTEX_CLOCK_REALTIME; } if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC) return -EINVAL; if (get_timespec64(&ts, timeout)) return -EFAULT; /* * Since there's no opcode for futex_waitv, use * FUTEX_WAIT_BITSET that uses absolute timeout as well */ ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time); if (ret) return ret; futex_setup_timer(&time, to, flag_clkid, 0); return 0; } static inline void futex2_destroy_timeout(struct hrtimer_sleeper *to) { hrtimer_cancel(&to->timer); destroy_hrtimer_on_stack(&to->timer); } /** * sys_futex_waitv - Wait on a list of futexes * @waiters: List of futexes to wait on * @nr_futexes: Length of futexv * @flags: Flag for timeout (monotonic/realtime) * @timeout: Optional absolute timeout. * @clockid: Clock to be used for the timeout, realtime or monotonic. * * Given an array of `struct futex_waitv`, wait on each uaddr. The thread wakes * if a futex_wake() is performed at any uaddr. The syscall returns immediately * if any waiter has *uaddr != val. *timeout is an optional timeout value for * the operation. Each waiter has individual flags. The `flags` argument for * the syscall should be used solely for specifying the timeout as realtime, if * needed. Flags for private futexes, sizes, etc. should be used on the * individual flags of each waiter. * * Returns the array index of one of the woken futexes. No further information * is provided: any number of other futexes may also have been woken by the * same event, and if more than one futex was woken, the retrned index may * refer to any one of them. (It is not necessaryily the futex with the * smallest index, nor the one most recently woken, nor...) */ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters, unsigned int, nr_futexes, unsigned int, flags, struct __kernel_timespec __user *, timeout, clockid_t, clockid) { struct hrtimer_sleeper to; struct futex_vector *futexv; int ret; /* This syscall supports no flags for now */ if (flags) return -EINVAL; if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters) return -EINVAL; if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to))) return ret; futexv = kzalloc_objs(*futexv, nr_futexes); if (!futexv) { ret = -ENOMEM; goto destroy_timer; } ret = futex_parse_waitv(futexv, waiters, nr_futexes, futex_wake_mark, NULL); if (!ret) ret = futex_wait_multiple(futexv, nr_futexes, timeout ? &to : NULL); kfree(futexv); destroy_timer: if (timeout) futex2_destroy_timeout(&to); return ret; } /* * sys_futex_wake - Wake a number of futexes * @uaddr: Address of the futex(es) to wake * @mask: bitmask * @nr: Number of the futexes to wake * @flags: FUTEX2 flags * * Identical to the traditional FUTEX_WAKE_BITSET op, except it is part of the * futex2 family of calls. */ SYSCALL_DEFINE4(futex_wake, void __user *, uaddr, unsigned long, mask, int, nr, unsigned int, flags) { if (flags & ~FUTEX2_VALID_MASK) return -EINVAL; flags = futex2_to_flags(flags); if (!futex_flags_valid(flags)) return -EINVAL; if (!futex_validate_input(flags, mask)) return -EINVAL; return futex_wake(uaddr, FLAGS_STRICT | flags, nr, mask); } /* * sys_futex_wait - Wait on a futex * @uaddr: Address of the futex to wait on * @val: Value of @uaddr * @mask: bitmask * @flags: FUTEX2 flags * @timeout: Optional absolute timeout * @clockid: Clock to be used for the timeout, realtime or monotonic * * Identical to the traditional FUTEX_WAIT_BITSET op, except it is part of the * futex2 familiy of calls. */ SYSCALL_DEFINE6(futex_wait, void __user *, uaddr, unsigned long, val, unsigned long, mask, unsigned int, flags, struct __kernel_timespec __user *, timeout, clockid_t, clockid) { struct hrtimer_sleeper to; int ret; if (flags & ~FUTEX2_VALID_MASK) return -EINVAL; flags = futex2_to_flags(flags); if (!futex_flags_valid(flags)) return -EINVAL; if (!futex_validate_input(flags, val) || !futex_validate_input(flags, mask)) return -EINVAL; if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to))) return ret; ret = __futex_wait(uaddr, flags, val, timeout ? &to : NULL, mask); if (timeout) futex2_destroy_timeout(&to); return ret; } /* * sys_futex_requeue - Requeue a waiter from one futex to another * @waiters: array describing the source and destination futex * @flags: unused * @nr_wake: number of futexes to wake * @nr_requeue: number of futexes to requeue * * Identical to the traditional FUTEX_CMP_REQUEUE op, except it is part of the * futex2 family of calls. */ SYSCALL_DEFINE4(futex_requeue, struct futex_waitv __user *, waiters, unsigned int, flags, int, nr_wake, int, nr_requeue) { struct futex_vector futexes[2]; u32 cmpval; int ret; if (flags) return -EINVAL; if (!waiters) return -EINVAL; ret = futex_parse_waitv(futexes, waiters, 2, futex_wake_mark, NULL); if (ret) return ret; /* * For now mandate both flags are identical, like the sys_futex() * interface has. If/when we merge the variable sized futex support, * that patch can modify this test to allow a difference in size. */ if (futexes[0].w.flags != futexes[1].w.flags) return -EINVAL; cmpval = futexes[0].w.val; return futex_requeue(u64_to_user_ptr(futexes[0].w.uaddr), futexes[0].w.flags, u64_to_user_ptr(futexes[1].w.uaddr), futexes[1].w.flags, nr_wake, nr_requeue, &cmpval, 0); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(set_robust_list, struct compat_robust_list_head __user *, head, compat_size_t, len) { if (unlikely(len != sizeof(*head))) return -EINVAL; current->compat_robust_list = head; return 0; } COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, compat_uptr_t __user *, head_ptr, compat_size_t __user *, len_ptr) { struct compat_robust_list_head __user *head = futex_get_robust_list_common(pid, true); if (IS_ERR(head)) return PTR_ERR(head); if (put_user(sizeof(*head), len_ptr)) return -EFAULT; return put_user(ptr_to_compat(head), head_ptr); } #endif /* CONFIG_COMPAT */ #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, const struct old_timespec32 __user *, utime, u32 __user *, uaddr2, u32, val3) { int ret, cmd = op & FUTEX_CMD_MASK; ktime_t t, *tp = NULL; struct timespec64 ts; if (utime && futex_cmd_has_timeout(cmd)) { if (get_old_timespec32(&ts, utime)) return -EFAULT; ret = futex_init_timeout(cmd, op, &ts, &t); if (ret) return ret; tp = &t; } return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); } #endif /* CONFIG_COMPAT_32BIT_TIME */
1 1 1 10 1 2 10 1 1 1 7 3 4 1 3 1 1 1 1 1 1 10 10 10 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 // SPDX-License-Identifier: GPL-2.0 /* net/sched/sch_etf.c Earliest TxTime First queueing discipline. * * Authors: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com> * Vinicius Costa Gomes <vinicius.gomes@intel.com> */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/errqueue.h> #include <linux/rbtree.h> #include <linux/skbuff.h> #include <linux/posix-timers.h> #include <net/netlink.h> #include <net/sch_generic.h> #include <net/pkt_sched.h> #include <net/sock.h> #define DEADLINE_MODE_IS_ON(x) ((x)->flags & TC_ETF_DEADLINE_MODE_ON) #define OFFLOAD_IS_ON(x) ((x)->flags & TC_ETF_OFFLOAD_ON) #define SKIP_SOCK_CHECK_IS_SET(x) ((x)->flags & TC_ETF_SKIP_SOCK_CHECK) struct etf_sched_data { bool offload; bool deadline_mode; bool skip_sock_check; int clockid; int queue; s32 delta; /* in ns */ ktime_t last; /* The txtime of the last skb sent to the netdevice. */ struct rb_root_cached head; struct qdisc_watchdog watchdog; ktime_t (*get_time)(void); }; static const struct nla_policy etf_policy[TCA_ETF_MAX + 1] = { [TCA_ETF_PARMS] = { .len = sizeof(struct tc_etf_qopt) }, }; static inline int validate_input_params(struct tc_etf_qopt *qopt, struct netlink_ext_ack *extack) { /* Check if params comply to the following rules: * * Clockid and delta must be valid. * * * Dynamic clockids are not supported. * * * Delta must be a positive integer. * * Also note that for the HW offload case, we must * expect that system clocks have been synchronized to PHC. */ if (qopt->clockid < 0) { NL_SET_ERR_MSG(extack, "Dynamic clockids are not supported"); return -ENOTSUPP; } if (qopt->clockid != CLOCK_TAI) { NL_SET_ERR_MSG(extack, "Invalid clockid. CLOCK_TAI must be used"); return -EINVAL; } if (qopt->delta < 0) { NL_SET_ERR_MSG(extack, "Delta must be positive"); return -EINVAL; } return 0; } static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb) { struct etf_sched_data *q = qdisc_priv(sch); ktime_t txtime = nskb->tstamp; struct sock *sk = nskb->sk; ktime_t now; if (q->skip_sock_check) goto skip; if (!sk || !sk_fullsock(sk)) return false; if (!sock_flag(sk, SOCK_TXTIME)) return false; /* We don't perform crosstimestamping. * Drop if packet's clockid differs from qdisc's. */ if (sk->sk_clockid != q->clockid) return false; if (sk->sk_txtime_deadline_mode != q->deadline_mode) return false; skip: now = q->get_time(); if (ktime_before(txtime, now) || ktime_before(txtime, q->last)) return false; return true; } static struct sk_buff *etf_peek_timesortedlist(struct Qdisc *sch) { struct etf_sched_data *q = qdisc_priv(sch); struct rb_node *p; p = rb_first_cached(&q->head); if (!p) return NULL; return rb_to_skb(p); } static void reset_watchdog(struct Qdisc *sch) { struct etf_sched_data *q = qdisc_priv(sch); struct sk_buff *skb = etf_peek_timesortedlist(sch); ktime_t next; if (!skb) { qdisc_watchdog_cancel(&q->watchdog); return; } next = ktime_sub_ns(skb->tstamp, q->delta); qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next)); } static void report_sock_error(struct sk_buff *skb, u32 err, u8 code) { struct sock_exterr_skb *serr; struct sk_buff *clone; ktime_t txtime = skb->tstamp; struct sock *sk = skb->sk; if (!sk || !sk_fullsock(sk) || !(sk->sk_txtime_report_errors)) return; clone = skb_clone(skb, GFP_ATOMIC); if (!clone) return; serr = SKB_EXT_ERR(clone); serr->ee.ee_errno = err; serr->ee.ee_origin = SO_EE_ORIGIN_TXTIME; serr->ee.ee_type = 0; serr->ee.ee_code = code; serr->ee.ee_pad = 0; serr->ee.ee_data = (txtime >> 32); /* high part of tstamp */ serr->ee.ee_info = txtime; /* low part of tstamp */ if (sock_queue_err_skb(sk, clone)) kfree_skb(clone); } static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch, struct sk_buff **to_free) { struct etf_sched_data *q = qdisc_priv(sch); struct rb_node **p = &q->head.rb_root.rb_node, *parent = NULL; ktime_t txtime = nskb->tstamp; bool leftmost = true; if (!is_packet_valid(sch, nskb)) { report_sock_error(nskb, EINVAL, SO_EE_CODE_TXTIME_INVALID_PARAM); return qdisc_drop(nskb, sch, to_free); } while (*p) { struct sk_buff *skb; parent = *p; skb = rb_to_skb(parent); if (ktime_compare(txtime, skb->tstamp) >= 0) { p = &parent->rb_right; leftmost = false; } else { p = &parent->rb_left; } } rb_link_node(&nskb->rbnode, parent, p); rb_insert_color_cached(&nskb->rbnode, &q->head, leftmost); qdisc_qstats_backlog_inc(sch, nskb); sch->q.qlen++; /* Now we may need to re-arm the qdisc watchdog for the next packet. */ reset_watchdog(sch); return NET_XMIT_SUCCESS; } static void timesortedlist_drop(struct Qdisc *sch, struct sk_buff *skb, ktime_t now) { struct etf_sched_data *q = qdisc_priv(sch); struct sk_buff *to_free = NULL; struct sk_buff *tmp = NULL; skb_rbtree_walk_from_safe(skb, tmp) { if (ktime_after(skb->tstamp, now)) break; rb_erase_cached(&skb->rbnode, &q->head); /* The rbnode field in the skb re-uses these fields, now that * we are done with the rbnode, reset them. */ skb->next = NULL; skb->prev = NULL; skb->dev = qdisc_dev(sch); report_sock_error(skb, ECANCELED, SO_EE_CODE_TXTIME_MISSED); qdisc_qstats_backlog_dec(sch, skb); qdisc_drop(skb, sch, &to_free); qdisc_qstats_overlimit(sch); sch->q.qlen--; } kfree_skb_list(to_free); } static void timesortedlist_remove(struct Qdisc *sch, struct sk_buff *skb) { struct etf_sched_data *q = qdisc_priv(sch); rb_erase_cached(&skb->rbnode, &q->head); /* The rbnode field in the skb re-uses these fields, now that * we are done with the rbnode, reset them. */ skb->next = NULL; skb->prev = NULL; skb->dev = qdisc_dev(sch); qdisc_qstats_backlog_dec(sch, skb); qdisc_bstats_update(sch, skb); q->last = skb->tstamp; sch->q.qlen--; } static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch) { struct etf_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; ktime_t now, next; skb = etf_peek_timesortedlist(sch); if (!skb) return NULL; now = q->get_time(); /* Drop if packet has expired while in queue. */ if (ktime_before(skb->tstamp, now)) { timesortedlist_drop(sch, skb, now); skb = NULL; goto out; } /* When in deadline mode, dequeue as soon as possible and change the * txtime from deadline to (now + delta). */ if (q->deadline_mode) { timesortedlist_remove(sch, skb); skb->tstamp = now; goto out; } next = ktime_sub_ns(skb->tstamp, q->delta); /* Dequeue only if now is within the [txtime - delta, txtime] range. */ if (ktime_after(now, next)) timesortedlist_remove(sch, skb); else skb = NULL; out: /* Now we may need to re-arm the qdisc watchdog for the next packet. */ reset_watchdog(sch); return skb; } static void etf_disable_offload(struct net_device *dev, struct etf_sched_data *q) { struct tc_etf_qopt_offload etf = { }; const struct net_device_ops *ops; int err; if (!q->offload) return; ops = dev->netdev_ops; if (!ops->ndo_setup_tc) return; etf.queue = q->queue; etf.enable = 0; err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETF, &etf); if (err < 0) pr_warn("Couldn't disable ETF offload for queue %d\n", etf.queue); } static int etf_enable_offload(struct net_device *dev, struct etf_sched_data *q, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; struct tc_etf_qopt_offload etf = { }; int err; if (!ops->ndo_setup_tc) { NL_SET_ERR_MSG(extack, "Specified device does not support ETF offload"); return -EOPNOTSUPP; } etf.queue = q->queue; etf.enable = 1; err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETF, &etf); if (err < 0) { NL_SET_ERR_MSG(extack, "Specified device failed to setup ETF hardware offload"); return err; } return 0; } static int etf_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct etf_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct nlattr *tb[TCA_ETF_MAX + 1]; struct tc_etf_qopt *qopt; int err; if (!opt) { NL_SET_ERR_MSG(extack, "Missing ETF qdisc options which are mandatory"); return -EINVAL; } err = nla_parse_nested_deprecated(tb, TCA_ETF_MAX, opt, etf_policy, extack); if (err < 0) return err; if (!tb[TCA_ETF_PARMS]) { NL_SET_ERR_MSG(extack, "Missing mandatory ETF parameters"); return -EINVAL; } qopt = nla_data(tb[TCA_ETF_PARMS]); pr_debug("delta %d clockid %d offload %s deadline %s\n", qopt->delta, qopt->clockid, OFFLOAD_IS_ON(qopt) ? "on" : "off", DEADLINE_MODE_IS_ON(qopt) ? "on" : "off"); err = validate_input_params(qopt, extack); if (err < 0) return err; q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0); if (OFFLOAD_IS_ON(qopt)) { err = etf_enable_offload(dev, q, extack); if (err < 0) return err; } /* Everything went OK, save the parameters used. */ q->delta = qopt->delta; q->clockid = qopt->clockid; q->offload = OFFLOAD_IS_ON(qopt); q->deadline_mode = DEADLINE_MODE_IS_ON(qopt); q->skip_sock_check = SKIP_SOCK_CHECK_IS_SET(qopt); switch (q->clockid) { case CLOCK_REALTIME: q->get_time = ktime_get_real; break; case CLOCK_MONOTONIC: q->get_time = ktime_get; break; case CLOCK_BOOTTIME: q->get_time = ktime_get_boottime; break; case CLOCK_TAI: q->get_time = ktime_get_clocktai; break; default: NL_SET_ERR_MSG(extack, "Clockid is not supported"); return -ENOTSUPP; } qdisc_watchdog_init_clockid(&q->watchdog, sch, q->clockid); return 0; } static void timesortedlist_clear(struct Qdisc *sch) { struct etf_sched_data *q = qdisc_priv(sch); struct rb_node *p = rb_first_cached(&q->head); while (p) { struct sk_buff *skb = rb_to_skb(p); p = rb_next(p); rb_erase_cached(&skb->rbnode, &q->head); rtnl_kfree_skbs(skb, skb); sch->q.qlen--; } } static void etf_reset(struct Qdisc *sch) { struct etf_sched_data *q = qdisc_priv(sch); /* Only cancel watchdog if it's been initialized. */ if (q->watchdog.qdisc == sch) qdisc_watchdog_cancel(&q->watchdog); /* No matter which mode we are on, it's safe to clear both lists. */ timesortedlist_clear(sch); __qdisc_reset_queue(&sch->q); q->last = 0; } static void etf_destroy(struct Qdisc *sch) { struct etf_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); /* Only cancel watchdog if it's been initialized. */ if (q->watchdog.qdisc == sch) qdisc_watchdog_cancel(&q->watchdog); etf_disable_offload(dev, q); } static int etf_dump(struct Qdisc *sch, struct sk_buff *skb) { struct etf_sched_data *q = qdisc_priv(sch); struct tc_etf_qopt opt = { }; struct nlattr *nest; nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (!nest) goto nla_put_failure; opt.delta = READ_ONCE(q->delta); opt.clockid = READ_ONCE(q->clockid); if (READ_ONCE(q->offload)) opt.flags |= TC_ETF_OFFLOAD_ON; if (READ_ONCE(q->deadline_mode)) opt.flags |= TC_ETF_DEADLINE_MODE_ON; if (READ_ONCE(q->skip_sock_check)) opt.flags |= TC_ETF_SKIP_SOCK_CHECK; if (nla_put(skb, TCA_ETF_PARMS, sizeof(opt), &opt)) goto nla_put_failure; return nla_nest_end(skb, nest); nla_put_failure: nla_nest_cancel(skb, nest); return -1; } static struct Qdisc_ops etf_qdisc_ops __read_mostly = { .id = "etf", .priv_size = sizeof(struct etf_sched_data), .enqueue = etf_enqueue_timesortedlist, .dequeue = etf_dequeue_timesortedlist, .peek = etf_peek_timesortedlist, .init = etf_init, .reset = etf_reset, .destroy = etf_destroy, .dump = etf_dump, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("etf"); static int __init etf_module_init(void) { return register_qdisc(&etf_qdisc_ops); } static void __exit etf_module_exit(void) { unregister_qdisc(&etf_qdisc_ops); } module_init(etf_module_init) module_exit(etf_module_exit) MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Earliest TxTime First (ETF) qdisc");
1585 96 633 776 74 3 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 /* SPDX-License-Identifier: GPL-2.0 */ /* * This file provides wrappers with sanitizer instrumentation for atomic bit * operations. * * To use this functionality, an arch's bitops.h file needs to define each of * the below bit operations with an arch_ prefix (e.g. arch_set_bit(), * arch___set_bit(), etc.). */ #ifndef _ASM_GENERIC_BITOPS_INSTRUMENTED_ATOMIC_H #define _ASM_GENERIC_BITOPS_INSTRUMENTED_ATOMIC_H #include <linux/instrumented.h> /** * set_bit - Atomically set a bit in memory * @nr: the bit to set * @addr: the address to start counting from * * This is a relaxed atomic operation (no implied memory barriers). * * Note that @nr may be almost arbitrarily large; this function is not * restricted to acting on a single-word quantity. */ static __always_inline void set_bit(long nr, volatile unsigned long *addr) { instrument_atomic_write(addr + BIT_WORD(nr), sizeof(long)); arch_set_bit(nr, addr); } /** * clear_bit - Clears a bit in memory * @nr: Bit to clear * @addr: Address to start counting from * * This is a relaxed atomic operation (no implied memory barriers). */ static __always_inline void clear_bit(long nr, volatile unsigned long *addr) { instrument_atomic_write(addr + BIT_WORD(nr), sizeof(long)); arch_clear_bit(nr, addr); } /** * change_bit - Toggle a bit in memory * @nr: Bit to change * @addr: Address to start counting from * * This is a relaxed atomic operation (no implied memory barriers). * * Note that @nr may be almost arbitrarily large; this function is not * restricted to acting on a single-word quantity. */ static __always_inline void change_bit(long nr, volatile unsigned long *addr) { instrument_atomic_write(addr + BIT_WORD(nr), sizeof(long)); arch_change_bit(nr, addr); } /** * test_and_set_bit - Set a bit and return its old value * @nr: Bit to set * @addr: Address to count from * * This is an atomic fully-ordered operation (implied full memory barrier). */ static __always_inline bool test_and_set_bit(long nr, volatile unsigned long *addr) { kcsan_mb(); instrument_atomic_read_write(addr + BIT_WORD(nr), sizeof(long)); return arch_test_and_set_bit(nr, addr); } /** * test_and_clear_bit - Clear a bit and return its old value * @nr: Bit to clear * @addr: Address to count from * * This is an atomic fully-ordered operation (implied full memory barrier). */ static __always_inline bool test_and_clear_bit(long nr, volatile unsigned long *addr) { kcsan_mb(); instrument_atomic_read_write(addr + BIT_WORD(nr), sizeof(long)); return arch_test_and_clear_bit(nr, addr); } /** * test_and_change_bit - Change a bit and return its old value * @nr: Bit to change * @addr: Address to count from * * This is an atomic fully-ordered operation (implied full memory barrier). */ static __always_inline bool test_and_change_bit(long nr, volatile unsigned long *addr) { kcsan_mb(); instrument_atomic_read_write(addr + BIT_WORD(nr), sizeof(long)); return arch_test_and_change_bit(nr, addr); } #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */
2 17 16 15 15 15 3 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 // SPDX-License-Identifier: GPL-2.0 /* Bluetooth HCI driver model support. */ #include <linux/module.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> static const struct class bt_class = { .name = "bluetooth", }; static void bt_link_release(struct device *dev) { struct hci_conn *conn = to_hci_conn(dev); kfree(conn); } static const struct device_type bt_link = { .name = "link", .release = bt_link_release, }; void hci_conn_init_sysfs(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; bt_dev_dbg(hdev, "conn %p", conn); conn->dev.type = &bt_link; conn->dev.class = &bt_class; conn->dev.parent = &hdev->dev; device_initialize(&conn->dev); } void hci_conn_add_sysfs(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; bt_dev_dbg(hdev, "conn %p", conn); if (device_is_registered(&conn->dev)) return; dev_set_name(&conn->dev, "%s:%d", hdev->name, conn->handle); if (device_add(&conn->dev) < 0) bt_dev_err(hdev, "failed to register connection device"); } void hci_conn_del_sysfs(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; bt_dev_dbg(hdev, "conn %p", conn); if (!device_is_registered(&conn->dev)) { /* If device_add() has *not* succeeded, use *only* put_device() * to drop the reference count. */ put_device(&conn->dev); return; } /* If there are devices using the connection as parent reset it to NULL * before unregistering the device. */ while (1) { struct device *dev; dev = device_find_any_child(&conn->dev); if (!dev) break; device_move(dev, NULL, DPM_ORDER_DEV_LAST); put_device(dev); } device_unregister(&conn->dev); } static void bt_host_release(struct device *dev) { struct hci_dev *hdev = to_hci_dev(dev); if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) hci_release_dev(hdev); else kfree(hdev); module_put(THIS_MODULE); } static ssize_t reset_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct hci_dev *hdev = to_hci_dev(dev); if (hdev->reset) hdev->reset(hdev); return count; } static DEVICE_ATTR_WO(reset); static struct attribute *bt_host_attrs[] = { &dev_attr_reset.attr, NULL, }; ATTRIBUTE_GROUPS(bt_host); static const struct device_type bt_host = { .name = "host", .release = bt_host_release, .groups = bt_host_groups, }; void hci_init_sysfs(struct hci_dev *hdev) { struct device *dev = &hdev->dev; dev->type = &bt_host; dev->class = &bt_class; __module_get(THIS_MODULE); device_initialize(dev); } int __init bt_sysfs_init(void) { return class_register(&bt_class); } void bt_sysfs_cleanup(void) { class_unregister(&bt_class); }
9 9 9 14 2 2 3 7 2 2 2 3 1 3 1 1 1 23 1 5 2 2 3 2 2 4 4 2 9 9 7 8 1 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 /* * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/module.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/gfp.h> #include <linux/in.h> #include <linux/ipv6.h> #include <linux/poll.h> #include <net/sock.h> #include "rds.h" /* this is just used for stats gathering :/ */ static DEFINE_SPINLOCK(rds_sock_lock); static unsigned long rds_sock_count; static LIST_HEAD(rds_sock_list); DECLARE_WAIT_QUEUE_HEAD(rds_poll_waitq); /* * This is called as the final descriptor referencing this socket is closed. * We have to unbind the socket so that another socket can be bound to the * address it was using. * * We have to be careful about racing with the incoming path. sock_orphan() * sets SOCK_DEAD and we use that as an indicator to the rx path that new * messages shouldn't be queued. */ static int rds_release(struct socket *sock) { struct sock *sk = sock->sk; struct rds_sock *rs; if (!sk) goto out; rs = rds_sk_to_rs(sk); sock_orphan(sk); /* Note - rds_clear_recv_queue grabs rs_recv_lock, so * that ensures the recv path has completed messing * with the socket. */ rds_clear_recv_queue(rs); rds_cong_remove_socket(rs); rds_remove_bound(rs); rds_send_drop_to(rs, NULL); rds_rdma_drop_keys(rs); rds_notify_queue_get(rs, NULL); rds_notify_msg_zcopy_purge(&rs->rs_zcookie_queue); spin_lock_bh(&rds_sock_lock); list_del_init(&rs->rs_item); rds_sock_count--; spin_unlock_bh(&rds_sock_lock); rds_trans_put(rs->rs_transport); sock->sk = NULL; sock_put(sk); out: return 0; } /* * Careful not to race with rds_release -> sock_orphan which clears sk_sleep. * _bh() isn't OK here, we're called from interrupt handlers. It's probably OK * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but * this seems more conservative. * NB - normally, one would use sk_callback_lock for this, but we can * get here from interrupts, whereas the network code grabs sk_callback_lock * with _lock_bh only - so relying on sk_callback_lock introduces livelocks. */ void rds_wake_sk_sleep(struct rds_sock *rs) { unsigned long flags; read_lock_irqsave(&rs->rs_recv_lock, flags); __rds_wake_sk_sleep(rds_rs_to_sk(rs)); read_unlock_irqrestore(&rs->rs_recv_lock, flags); } static int rds_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct rds_sock *rs = rds_sk_to_rs(sock->sk); struct sockaddr_in6 *sin6; struct sockaddr_in *sin; int uaddr_len; /* racey, don't care */ if (peer) { if (ipv6_addr_any(&rs->rs_conn_addr)) return -ENOTCONN; if (ipv6_addr_v4mapped(&rs->rs_conn_addr)) { sin = (struct sockaddr_in *)uaddr; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); sin->sin_family = AF_INET; sin->sin_port = rs->rs_conn_port; sin->sin_addr.s_addr = rs->rs_conn_addr_v4; uaddr_len = sizeof(*sin); } else { sin6 = (struct sockaddr_in6 *)uaddr; sin6->sin6_family = AF_INET6; sin6->sin6_port = rs->rs_conn_port; sin6->sin6_addr = rs->rs_conn_addr; sin6->sin6_flowinfo = 0; /* scope_id is the same as in the bound address. */ sin6->sin6_scope_id = rs->rs_bound_scope_id; uaddr_len = sizeof(*sin6); } } else { /* If socket is not yet bound and the socket is connected, * set the return address family to be the same as the * connected address, but with 0 address value. If it is not * connected, set the family to be AF_UNSPEC (value 0) and * the address size to be that of an IPv4 address. */ if (ipv6_addr_any(&rs->rs_bound_addr)) { if (ipv6_addr_any(&rs->rs_conn_addr)) { sin = (struct sockaddr_in *)uaddr; memset(sin, 0, sizeof(*sin)); sin->sin_family = AF_UNSPEC; return sizeof(*sin); } #if IS_ENABLED(CONFIG_IPV6) if (!(ipv6_addr_type(&rs->rs_conn_addr) & IPV6_ADDR_MAPPED)) { sin6 = (struct sockaddr_in6 *)uaddr; memset(sin6, 0, sizeof(*sin6)); sin6->sin6_family = AF_INET6; return sizeof(*sin6); } #endif sin = (struct sockaddr_in *)uaddr; memset(sin, 0, sizeof(*sin)); sin->sin_family = AF_INET; return sizeof(*sin); } if (ipv6_addr_v4mapped(&rs->rs_bound_addr)) { sin = (struct sockaddr_in *)uaddr; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); sin->sin_family = AF_INET; sin->sin_port = rs->rs_bound_port; sin->sin_addr.s_addr = rs->rs_bound_addr_v4; uaddr_len = sizeof(*sin); } else { sin6 = (struct sockaddr_in6 *)uaddr; sin6->sin6_family = AF_INET6; sin6->sin6_port = rs->rs_bound_port; sin6->sin6_addr = rs->rs_bound_addr; sin6->sin6_flowinfo = 0; sin6->sin6_scope_id = rs->rs_bound_scope_id; uaddr_len = sizeof(*sin6); } } return uaddr_len; } /* * RDS' poll is without a doubt the least intuitive part of the interface, * as EPOLLIN and EPOLLOUT do not behave entirely as you would expect from * a network protocol. * * EPOLLIN is asserted if * - there is data on the receive queue. * - to signal that a previously congested destination may have become * uncongested * - A notification has been queued to the socket (this can be a congestion * update, or a RDMA completion, or a MSG_ZEROCOPY completion). * * EPOLLOUT is asserted if there is room on the send queue. This does not mean * however, that the next sendmsg() call will succeed. If the application tries * to send to a congested destination, the system call may still fail (and * return ENOBUFS). */ static __poll_t rds_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct rds_sock *rs = rds_sk_to_rs(sk); __poll_t mask = 0; unsigned long flags; poll_wait(file, sk_sleep(sk), wait); if (rs->rs_seen_congestion) poll_wait(file, &rds_poll_waitq, wait); read_lock_irqsave(&rs->rs_recv_lock, flags); if (!rs->rs_cong_monitor) { /* When a congestion map was updated, we signal EPOLLIN for * "historical" reasons. Applications can also poll for * WRBAND instead. */ if (rds_cong_updated_since(&rs->rs_cong_track)) mask |= (EPOLLIN | EPOLLRDNORM | EPOLLWRBAND); } else { spin_lock(&rs->rs_lock); if (rs->rs_cong_notify) mask |= (EPOLLIN | EPOLLRDNORM); spin_unlock(&rs->rs_lock); } if (!list_empty(&rs->rs_recv_queue) || !list_empty(&rs->rs_notify_queue) || !list_empty(&rs->rs_zcookie_queue.zcookie_head)) mask |= (EPOLLIN | EPOLLRDNORM); if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) mask |= (EPOLLOUT | EPOLLWRNORM); if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) mask |= EPOLLERR; read_unlock_irqrestore(&rs->rs_recv_lock, flags); /* clear state any time we wake a seen-congested socket */ if (mask) rs->rs_seen_congestion = 0; return mask; } static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct rds_sock *rs = rds_sk_to_rs(sock->sk); rds_tos_t utos, tos = 0; switch (cmd) { case SIOCRDSSETTOS: if (get_user(utos, (rds_tos_t __user *)arg)) return -EFAULT; if (rs->rs_transport && rs->rs_transport->get_tos_map) tos = rs->rs_transport->get_tos_map(utos); else return -ENOIOCTLCMD; spin_lock_bh(&rds_sock_lock); if (rs->rs_tos || rs->rs_conn) { spin_unlock_bh(&rds_sock_lock); return -EINVAL; } rs->rs_tos = tos; spin_unlock_bh(&rds_sock_lock); break; case SIOCRDSGETTOS: spin_lock_bh(&rds_sock_lock); tos = rs->rs_tos; spin_unlock_bh(&rds_sock_lock); if (put_user(tos, (rds_tos_t __user *)arg)) return -EFAULT; break; default: return -ENOIOCTLCMD; } return 0; } static int rds_cancel_sent_to(struct rds_sock *rs, sockptr_t optval, int len) { struct sockaddr_in6 sin6; struct sockaddr_in sin; int ret = 0; /* racing with another thread binding seems ok here */ if (ipv6_addr_any(&rs->rs_bound_addr)) { ret = -ENOTCONN; /* XXX not a great errno */ goto out; } if (len < sizeof(struct sockaddr_in)) { ret = -EINVAL; goto out; } else if (len < sizeof(struct sockaddr_in6)) { /* Assume IPv4 */ if (copy_from_sockptr(&sin, optval, sizeof(struct sockaddr_in))) { ret = -EFAULT; goto out; } ipv6_addr_set_v4mapped(sin.sin_addr.s_addr, &sin6.sin6_addr); sin6.sin6_port = sin.sin_port; } else { if (copy_from_sockptr(&sin6, optval, sizeof(struct sockaddr_in6))) { ret = -EFAULT; goto out; } } rds_send_drop_to(rs, &sin6); out: return ret; } static int rds_set_bool_option(unsigned char *optvar, sockptr_t optval, int optlen) { int value; if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(&value, optval, sizeof(int))) return -EFAULT; *optvar = !!value; return 0; } static int rds_cong_monitor(struct rds_sock *rs, sockptr_t optval, int optlen) { int ret; ret = rds_set_bool_option(&rs->rs_cong_monitor, optval, optlen); if (ret == 0) { if (rs->rs_cong_monitor) { rds_cong_add_socket(rs); } else { rds_cong_remove_socket(rs); rs->rs_cong_mask = 0; rs->rs_cong_notify = 0; } } return ret; } static int rds_set_transport(struct rds_sock *rs, sockptr_t optval, int optlen) { int t_type; if (rs->rs_transport) return -EOPNOTSUPP; /* previously attached to transport */ if (optlen != sizeof(int)) return -EINVAL; if (copy_from_sockptr(&t_type, optval, sizeof(t_type))) return -EFAULT; if (t_type < 0 || t_type >= RDS_TRANS_COUNT) return -EINVAL; rs->rs_transport = rds_trans_get(t_type); return rs->rs_transport ? 0 : -ENOPROTOOPT; } static int rds_enable_recvtstamp(struct sock *sk, sockptr_t optval, int optlen, int optname) { int val, valbool; if (optlen != sizeof(int)) return -EFAULT; if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; valbool = val ? 1 : 0; if (optname == SO_TIMESTAMP_NEW) sock_set_flag(sk, SOCK_TSTAMP_NEW); if (valbool) sock_set_flag(sk, SOCK_RCVTSTAMP); else sock_reset_flag(sk, SOCK_RCVTSTAMP); return 0; } static int rds_recv_track_latency(struct rds_sock *rs, sockptr_t optval, int optlen) { struct rds_rx_trace_so trace; int i; if (optlen != sizeof(struct rds_rx_trace_so)) return -EFAULT; if (copy_from_sockptr(&trace, optval, sizeof(trace))) return -EFAULT; if (trace.rx_traces > RDS_MSG_RX_DGRAM_TRACE_MAX) return -EFAULT; rs->rs_rx_traces = trace.rx_traces; for (i = 0; i < rs->rs_rx_traces; i++) { if (trace.rx_trace_pos[i] >= RDS_MSG_RX_DGRAM_TRACE_MAX) { rs->rs_rx_traces = 0; return -EFAULT; } rs->rs_rx_trace[i] = trace.rx_trace_pos[i]; } return 0; } static int rds_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct rds_sock *rs = rds_sk_to_rs(sock->sk); int ret; if (level != SOL_RDS) { ret = -ENOPROTOOPT; goto out; } switch (optname) { case RDS_CANCEL_SENT_TO: ret = rds_cancel_sent_to(rs, optval, optlen); break; case RDS_GET_MR: ret = rds_get_mr(rs, optval, optlen); break; case RDS_GET_MR_FOR_DEST: ret = rds_get_mr_for_dest(rs, optval, optlen); break; case RDS_FREE_MR: ret = rds_free_mr(rs, optval, optlen); break; case RDS_RECVERR: ret = rds_set_bool_option(&rs->rs_recverr, optval, optlen); break; case RDS_CONG_MONITOR: ret = rds_cong_monitor(rs, optval, optlen); break; case SO_RDS_TRANSPORT: lock_sock(sock->sk); ret = rds_set_transport(rs, optval, optlen); release_sock(sock->sk); break; case SO_TIMESTAMP_OLD: case SO_TIMESTAMP_NEW: lock_sock(sock->sk); ret = rds_enable_recvtstamp(sock->sk, optval, optlen, optname); release_sock(sock->sk); break; case SO_RDS_MSG_RXPATH_LATENCY: ret = rds_recv_track_latency(rs, optval, optlen); break; default: ret = -ENOPROTOOPT; } out: return ret; } static int rds_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct rds_sock *rs = rds_sk_to_rs(sock->sk); int ret = -ENOPROTOOPT, len; int trans; if (level != SOL_RDS) goto out; if (get_user(len, optlen)) { ret = -EFAULT; goto out; } switch (optname) { case RDS_INFO_FIRST ... RDS_INFO_LAST: ret = rds_info_getsockopt(sock, optname, optval, optlen); break; case RDS_RECVERR: if (len < sizeof(int)) ret = -EINVAL; else if (put_user(rs->rs_recverr, (int __user *) optval) || put_user(sizeof(int), optlen)) ret = -EFAULT; else ret = 0; break; case SO_RDS_TRANSPORT: if (len < sizeof(int)) { ret = -EINVAL; break; } trans = (rs->rs_transport ? rs->rs_transport->t_type : RDS_TRANS_NONE); /* unbound */ if (put_user(trans, (int __user *)optval) || put_user(sizeof(int), optlen)) ret = -EFAULT; else ret = 0; break; default: break; } out: return ret; } static int rds_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; struct sockaddr_in *sin; struct rds_sock *rs = rds_sk_to_rs(sk); int ret = 0; if (addr_len < offsetofend(struct sockaddr, sa_family)) return -EINVAL; lock_sock(sk); switch (uaddr->sa_family) { case AF_INET: sin = (struct sockaddr_in *)uaddr; if (addr_len < sizeof(struct sockaddr_in)) { ret = -EINVAL; break; } if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) { ret = -EDESTADDRREQ; break; } if (ipv4_is_multicast(sin->sin_addr.s_addr) || sin->sin_addr.s_addr == htonl(INADDR_BROADCAST)) { ret = -EINVAL; break; } ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &rs->rs_conn_addr); rs->rs_conn_port = sin->sin_port; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: { struct sockaddr_in6 *sin6; int addr_type; sin6 = (struct sockaddr_in6 *)uaddr; if (addr_len < sizeof(struct sockaddr_in6)) { ret = -EINVAL; break; } addr_type = ipv6_addr_type(&sin6->sin6_addr); if (!(addr_type & IPV6_ADDR_UNICAST)) { __be32 addr4; if (!(addr_type & IPV6_ADDR_MAPPED)) { ret = -EPROTOTYPE; break; } /* It is a mapped address. Need to do some sanity * checks. */ addr4 = sin6->sin6_addr.s6_addr32[3]; if (addr4 == htonl(INADDR_ANY) || addr4 == htonl(INADDR_BROADCAST) || ipv4_is_multicast(addr4)) { ret = -EPROTOTYPE; break; } } if (addr_type & IPV6_ADDR_LINKLOCAL) { /* If socket is already bound to a link local address, * the peer address must be on the same link. */ if (sin6->sin6_scope_id == 0 || (!ipv6_addr_any(&rs->rs_bound_addr) && rs->rs_bound_scope_id && sin6->sin6_scope_id != rs->rs_bound_scope_id)) { ret = -EINVAL; break; } /* Remember the connected address scope ID. It will * be checked against the binding local address when * the socket is bound. */ rs->rs_bound_scope_id = sin6->sin6_scope_id; } rs->rs_conn_addr = sin6->sin6_addr; rs->rs_conn_port = sin6->sin6_port; break; } #endif default: ret = -EAFNOSUPPORT; break; } release_sock(sk); return ret; } static struct proto rds_proto = { .name = "RDS", .owner = THIS_MODULE, .obj_size = sizeof(struct rds_sock), }; static const struct proto_ops rds_proto_ops = { .family = AF_RDS, .owner = THIS_MODULE, .release = rds_release, .bind = rds_bind, .connect = rds_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = rds_getname, .poll = rds_poll, .ioctl = rds_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = rds_setsockopt, .getsockopt = rds_getsockopt, .sendmsg = rds_sendmsg, .recvmsg = rds_recvmsg, .mmap = sock_no_mmap, }; static void rds_sock_destruct(struct sock *sk) { struct rds_sock *rs = rds_sk_to_rs(sk); WARN_ON((&rs->rs_item != rs->rs_item.next || &rs->rs_item != rs->rs_item.prev)); } static int __rds_create(struct socket *sock, struct sock *sk, int protocol) { struct rds_sock *rs; sock_init_data(sock, sk); sock->ops = &rds_proto_ops; sk->sk_protocol = protocol; sk->sk_destruct = rds_sock_destruct; rs = rds_sk_to_rs(sk); spin_lock_init(&rs->rs_lock); rwlock_init(&rs->rs_recv_lock); INIT_LIST_HEAD(&rs->rs_send_queue); INIT_LIST_HEAD(&rs->rs_recv_queue); INIT_LIST_HEAD(&rs->rs_notify_queue); INIT_LIST_HEAD(&rs->rs_cong_list); rds_message_zcopy_queue_init(&rs->rs_zcookie_queue); spin_lock_init(&rs->rs_rdma_lock); rs->rs_rdma_keys = RB_ROOT; rs->rs_rx_traces = 0; rs->rs_tos = 0; rs->rs_conn = NULL; spin_lock_bh(&rds_sock_lock); list_add_tail(&rs->rs_item, &rds_sock_list); rds_sock_count++; spin_unlock_bh(&rds_sock_lock); return 0; } static int rds_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; if (sock->type != SOCK_SEQPACKET || protocol) return -ESOCKTNOSUPPORT; sk = sk_alloc(net, AF_RDS, GFP_KERNEL, &rds_proto, kern); if (!sk) return -ENOMEM; return __rds_create(sock, sk, protocol); } void rds_sock_addref(struct rds_sock *rs) { sock_hold(rds_rs_to_sk(rs)); } void rds_sock_put(struct rds_sock *rs) { sock_put(rds_rs_to_sk(rs)); } static const struct net_proto_family rds_family_ops = { .family = AF_RDS, .create = rds_create, .owner = THIS_MODULE, }; static void rds_sock_inc_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { struct rds_sock *rs; struct rds_incoming *inc; unsigned int total = 0; len /= sizeof(struct rds_info_message); spin_lock_bh(&rds_sock_lock); list_for_each_entry(rs, &rds_sock_list, rs_item) { /* This option only supports IPv4 sockets. */ if (!ipv6_addr_v4mapped(&rs->rs_bound_addr)) continue; read_lock(&rs->rs_recv_lock); /* XXX too lazy to maintain counts.. */ list_for_each_entry(inc, &rs->rs_recv_queue, i_item) { total++; if (total <= len) rds_inc_info_copy(inc, iter, inc->i_saddr.s6_addr32[3], rs->rs_bound_addr_v4, 1); } read_unlock(&rs->rs_recv_lock); } spin_unlock_bh(&rds_sock_lock); lens->nr = total; lens->each = sizeof(struct rds_info_message); } #if IS_ENABLED(CONFIG_IPV6) static void rds6_sock_inc_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { struct rds_incoming *inc; unsigned int total = 0; struct rds_sock *rs; len /= sizeof(struct rds6_info_message); spin_lock_bh(&rds_sock_lock); list_for_each_entry(rs, &rds_sock_list, rs_item) { read_lock(&rs->rs_recv_lock); list_for_each_entry(inc, &rs->rs_recv_queue, i_item) { total++; if (total <= len) rds6_inc_info_copy(inc, iter, &inc->i_saddr, &rs->rs_bound_addr, 1); } read_unlock(&rs->rs_recv_lock); } spin_unlock_bh(&rds_sock_lock); lens->nr = total; lens->each = sizeof(struct rds6_info_message); } #endif static void rds_sock_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { struct rds_info_socket sinfo; unsigned int cnt = 0; struct rds_sock *rs; len /= sizeof(struct rds_info_socket); spin_lock_bh(&rds_sock_lock); if (len < rds_sock_count) { cnt = rds_sock_count; goto out; } list_for_each_entry(rs, &rds_sock_list, rs_item) { /* This option only supports IPv4 sockets. */ if (!ipv6_addr_v4mapped(&rs->rs_bound_addr)) continue; sinfo.sndbuf = rds_sk_sndbuf(rs); sinfo.rcvbuf = rds_sk_rcvbuf(rs); sinfo.bound_addr = rs->rs_bound_addr_v4; sinfo.connected_addr = rs->rs_conn_addr_v4; sinfo.bound_port = rs->rs_bound_port; sinfo.connected_port = rs->rs_conn_port; sinfo.inum = sock_i_ino(rds_rs_to_sk(rs)); rds_info_copy(iter, &sinfo, sizeof(sinfo)); cnt++; } out: lens->nr = cnt; lens->each = sizeof(struct rds_info_socket); spin_unlock_bh(&rds_sock_lock); } #if IS_ENABLED(CONFIG_IPV6) static void rds6_sock_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { struct rds6_info_socket sinfo6; struct rds_sock *rs; len /= sizeof(struct rds6_info_socket); spin_lock_bh(&rds_sock_lock); if (len < rds_sock_count) goto out; list_for_each_entry(rs, &rds_sock_list, rs_item) { sinfo6.sndbuf = rds_sk_sndbuf(rs); sinfo6.rcvbuf = rds_sk_rcvbuf(rs); sinfo6.bound_addr = rs->rs_bound_addr; sinfo6.connected_addr = rs->rs_conn_addr; sinfo6.bound_port = rs->rs_bound_port; sinfo6.connected_port = rs->rs_conn_port; sinfo6.inum = sock_i_ino(rds_rs_to_sk(rs)); rds_info_copy(iter, &sinfo6, sizeof(sinfo6)); } out: lens->nr = rds_sock_count; lens->each = sizeof(struct rds6_info_socket); spin_unlock_bh(&rds_sock_lock); } #endif static void rds_exit(void) { sock_unregister(rds_family_ops.family); proto_unregister(&rds_proto); rds_conn_exit(); rds_cong_exit(); rds_sysctl_exit(); rds_threads_exit(); rds_stats_exit(); rds_page_exit(); rds_bind_lock_destroy(); rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info); rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info); #if IS_ENABLED(CONFIG_IPV6) rds_info_deregister_func(RDS6_INFO_SOCKETS, rds6_sock_info); rds_info_deregister_func(RDS6_INFO_RECV_MESSAGES, rds6_sock_inc_info); #endif } module_exit(rds_exit); u32 rds_gen_num; static int __init rds_init(void) { int ret; net_get_random_once(&rds_gen_num, sizeof(rds_gen_num)); ret = rds_bind_lock_init(); if (ret) goto out; ret = rds_conn_init(); if (ret) goto out_bind; ret = rds_threads_init(); if (ret) goto out_conn; ret = rds_sysctl_init(); if (ret) goto out_threads; ret = rds_stats_init(); if (ret) goto out_sysctl; ret = proto_register(&rds_proto, 1); if (ret) goto out_stats; ret = sock_register(&rds_family_ops); if (ret) goto out_proto; rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info); rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info); #if IS_ENABLED(CONFIG_IPV6) rds_info_register_func(RDS6_INFO_SOCKETS, rds6_sock_info); rds_info_register_func(RDS6_INFO_RECV_MESSAGES, rds6_sock_inc_info); #endif goto out; out_proto: proto_unregister(&rds_proto); out_stats: rds_stats_exit(); out_sysctl: rds_sysctl_exit(); out_threads: rds_threads_exit(); out_conn: rds_conn_exit(); rds_cong_exit(); rds_page_exit(); out_bind: rds_bind_lock_destroy(); out: return ret; } module_init(rds_init); #define DRV_VERSION "4.0" #define DRV_RELDATE "Feb 12, 2009" MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>"); MODULE_DESCRIPTION("RDS: Reliable Datagram Sockets" " v" DRV_VERSION " (" DRV_RELDATE ")"); MODULE_VERSION(DRV_VERSION); MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS_NETPROTO(PF_RDS);
25 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 // SPDX-License-Identifier: GPL-2.0-only /* * VMware vSockets Driver * * Copyright (C) 2007-2012 VMware, Inc. All rights reserved. */ #include <linux/types.h> #include <linux/socket.h> #include <linux/stddef.h> #include <net/sock.h> #include <net/vsock_addr.h> void vsock_addr_init(struct sockaddr_vm *addr, u32 cid, u32 port) { memset(addr, 0, sizeof(*addr)); addr->svm_family = AF_VSOCK; addr->svm_cid = cid; addr->svm_port = port; } EXPORT_SYMBOL_GPL(vsock_addr_init); int vsock_addr_validate(const struct sockaddr_vm *addr) { __u8 svm_valid_flags = VMADDR_FLAG_TO_HOST; if (!addr) return -EFAULT; if (addr->svm_family != AF_VSOCK) return -EAFNOSUPPORT; if (addr->svm_flags & ~svm_valid_flags) return -EINVAL; return 0; } EXPORT_SYMBOL_GPL(vsock_addr_validate); bool vsock_addr_bound(const struct sockaddr_vm *addr) { return addr->svm_port != VMADDR_PORT_ANY; } EXPORT_SYMBOL_GPL(vsock_addr_bound); void vsock_addr_unbind(struct sockaddr_vm *addr) { vsock_addr_init(addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); } EXPORT_SYMBOL_GPL(vsock_addr_unbind); bool vsock_addr_equals_addr(const struct sockaddr_vm *addr, const struct sockaddr_vm *other) { return addr->svm_cid == other->svm_cid && addr->svm_port == other->svm_port; } EXPORT_SYMBOL_GPL(vsock_addr_equals_addr); int vsock_addr_cast(const struct sockaddr_unsized *addr, size_t len, struct sockaddr_vm **out_addr) { if (len < sizeof(**out_addr)) return -EFAULT; *out_addr = (struct sockaddr_vm *)addr; return vsock_addr_validate(*out_addr); } EXPORT_SYMBOL_GPL(vsock_addr_cast);
3486 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2005-2010 IBM Corporation * * Authors: * Mimi Zohar <zohar@us.ibm.com> * Kylene Hall <kjhall@us.ibm.com> * * File: evm.h */ #ifndef __INTEGRITY_EVM_H #define __INTEGRITY_EVM_H #include <linux/xattr.h> #include <linux/security.h> #include "../integrity.h" #define EVM_INIT_HMAC 0x0001 #define EVM_INIT_X509 0x0002 #define EVM_ALLOW_METADATA_WRITES 0x0004 #define EVM_SETUP_COMPLETE 0x80000000 /* userland has signaled key load */ #define EVM_KEY_MASK (EVM_INIT_HMAC | EVM_INIT_X509) #define EVM_INIT_MASK (EVM_INIT_HMAC | EVM_INIT_X509 | EVM_SETUP_COMPLETE | \ EVM_ALLOW_METADATA_WRITES) struct xattr_list { struct list_head list; char *name; bool enabled; }; #define EVM_NEW_FILE 0x00000001 #define EVM_IMMUTABLE_DIGSIG 0x00000002 /* EVM integrity metadata associated with an inode */ struct evm_iint_cache { unsigned long flags; enum integrity_status evm_status:4; struct integrity_inode_attributes metadata_inode; }; extern struct lsm_blob_sizes evm_blob_sizes; static inline struct evm_iint_cache *evm_iint_inode(const struct inode *inode) { if (unlikely(!inode->i_security)) return NULL; return inode->i_security + evm_blob_sizes.lbs_inode; } extern int evm_initialized; #define EVM_ATTR_FSUUID 0x0001 extern int evm_hmac_attrs; /* List of EVM protected security xattrs */ extern struct list_head evm_config_xattrnames; struct evm_digest { struct ima_digest_data_hdr hdr; char digest[IMA_MAX_DIGEST_SIZE]; } __packed; int evm_protected_xattr(const char *req_xattr_name); int evm_init_key(void); int evm_update_evmxattr(struct dentry *dentry, const char *req_xattr_name, const char *req_xattr_value, size_t req_xattr_value_len); int evm_calc_hmac(struct dentry *dentry, const char *req_xattr_name, const char *req_xattr_value, size_t req_xattr_value_len, struct evm_digest *data, struct evm_iint_cache *iint); int evm_calc_hash(struct dentry *dentry, const char *req_xattr_name, const char *req_xattr_value, size_t req_xattr_value_len, char type, struct evm_digest *data, struct evm_iint_cache *iint); int evm_init_hmac(struct inode *inode, const struct xattr *xattrs, char *hmac_val); int evm_init_secfs(void); #endif
4 5 3 1125 1125 8 8 8 8 8 8 77 2 69 6 44 3 2 3 2 2 3 3 2 42 4 3 4 3 4 3 5 2 34 13 2 5 10 3 12 13 2 37 8 4 5 7 7 5 6 6 16 4 2 3 3 5 1 18 2 2 2 2 2 2 17 3 2 3 2 3 2 17 3 2 3 2 3 2 23 35 7 3 2 1 31 5 3 1 1 36 58 1 47 4 43 1 37 5 1 2 1 1 1 1 1 37 1 1 33 1 1 1 1 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 // SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) /* gw.c - CAN frame Gateway/Router/Bridge with netlink interface * * Copyright (c) 2019 Volkswagen Group Electronic Research * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Volkswagen nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * Alternatively, provided that this notice is retained in full, this * software may be distributed under the terms of the GNU General * Public License ("GPL") version 2, in which case the provisions of the * GPL apply INSTEAD OF those given above. * * The provided data structures and external interfaces from this code * are not restricted to be used by modules with a GPL compatible license. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * */ #include <linux/module.h> #include <linux/init.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/rculist.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/can.h> #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/can/gw.h> #include <net/can.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/sock.h> #define CAN_GW_NAME "can-gw" MODULE_DESCRIPTION("PF_CAN netlink gateway"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Oliver Hartkopp <oliver.hartkopp@volkswagen.de>"); MODULE_ALIAS(CAN_GW_NAME); #define CGW_MIN_HOPS 1 #define CGW_MAX_HOPS 6 #define CGW_DEFAULT_HOPS 1 static unsigned char max_hops __read_mostly = CGW_DEFAULT_HOPS; module_param(max_hops, byte, 0444); MODULE_PARM_DESC(max_hops, "maximum " CAN_GW_NAME " routing hops for CAN frames " "(valid values: " __stringify(CGW_MIN_HOPS) "-" __stringify(CGW_MAX_HOPS) " hops, " "default: " __stringify(CGW_DEFAULT_HOPS) ")"); static struct notifier_block notifier; static struct kmem_cache *cgw_cache __read_mostly; /* structure that contains the (on-the-fly) CAN frame modifications */ struct cf_mod { struct { struct canfd_frame and; struct canfd_frame or; struct canfd_frame xor; struct canfd_frame set; } modframe; struct { u8 and; u8 or; u8 xor; u8 set; } modtype; void (*modfunc[MAX_MODFUNCTIONS])(struct canfd_frame *cf, struct cf_mod *mod); /* CAN frame checksum calculation after CAN frame modifications */ struct { struct cgw_csum_xor xor; struct cgw_csum_crc8 crc8; } csum; struct { void (*xor)(struct canfd_frame *cf, struct cgw_csum_xor *xor); void (*crc8)(struct canfd_frame *cf, struct cgw_csum_crc8 *crc8); } csumfunc; u32 uid; }; /* So far we just support CAN -> CAN routing and frame modifications. * * The internal can_can_gw structure contains data and attributes for * a CAN -> CAN gateway job. */ struct can_can_gw { struct can_filter filter; int src_idx; int dst_idx; }; /* list entry for CAN gateways jobs */ struct cgw_job { struct hlist_node list; struct rcu_head rcu; u32 handled_frames; u32 dropped_frames; u32 deleted_frames; struct cf_mod __rcu *cf_mod; union { /* CAN frame data source */ struct net_device *dev; } src; union { /* CAN frame data destination */ struct net_device *dev; } dst; union { struct can_can_gw ccgw; /* tbc */ }; u8 gwtype; u8 limit_hops; u16 flags; }; /* modification functions that are invoked in the hot path in can_can_gw_rcv */ #define MODFUNC(func, op) static void func(struct canfd_frame *cf, \ struct cf_mod *mod) { op ; } MODFUNC(mod_and_id, cf->can_id &= mod->modframe.and.can_id) MODFUNC(mod_and_len, cf->len &= mod->modframe.and.len) MODFUNC(mod_and_flags, cf->flags &= mod->modframe.and.flags) MODFUNC(mod_and_data, *(u64 *)cf->data &= *(u64 *)mod->modframe.and.data) MODFUNC(mod_or_id, cf->can_id |= mod->modframe.or.can_id) MODFUNC(mod_or_len, cf->len |= mod->modframe.or.len) MODFUNC(mod_or_flags, cf->flags |= mod->modframe.or.flags) MODFUNC(mod_or_data, *(u64 *)cf->data |= *(u64 *)mod->modframe.or.data) MODFUNC(mod_xor_id, cf->can_id ^= mod->modframe.xor.can_id) MODFUNC(mod_xor_len, cf->len ^= mod->modframe.xor.len) MODFUNC(mod_xor_flags, cf->flags ^= mod->modframe.xor.flags) MODFUNC(mod_xor_data, *(u64 *)cf->data ^= *(u64 *)mod->modframe.xor.data) MODFUNC(mod_set_id, cf->can_id = mod->modframe.set.can_id) MODFUNC(mod_set_len, cf->len = mod->modframe.set.len) MODFUNC(mod_set_flags, cf->flags = mod->modframe.set.flags) MODFUNC(mod_set_data, *(u64 *)cf->data = *(u64 *)mod->modframe.set.data) static void mod_and_fddata(struct canfd_frame *cf, struct cf_mod *mod) { int i; for (i = 0; i < CANFD_MAX_DLEN; i += 8) *(u64 *)(cf->data + i) &= *(u64 *)(mod->modframe.and.data + i); } static void mod_or_fddata(struct canfd_frame *cf, struct cf_mod *mod) { int i; for (i = 0; i < CANFD_MAX_DLEN; i += 8) *(u64 *)(cf->data + i) |= *(u64 *)(mod->modframe.or.data + i); } static void mod_xor_fddata(struct canfd_frame *cf, struct cf_mod *mod) { int i; for (i = 0; i < CANFD_MAX_DLEN; i += 8) *(u64 *)(cf->data + i) ^= *(u64 *)(mod->modframe.xor.data + i); } static void mod_set_fddata(struct canfd_frame *cf, struct cf_mod *mod) { memcpy(cf->data, mod->modframe.set.data, CANFD_MAX_DLEN); } /* retrieve valid CC DLC value and store it into 'len' */ static void mod_retrieve_ccdlc(struct canfd_frame *cf) { struct can_frame *ccf = (struct can_frame *)cf; /* len8_dlc is only valid if len == CAN_MAX_DLEN */ if (ccf->len != CAN_MAX_DLEN) return; /* do we have a valid len8_dlc value from 9 .. 15 ? */ if (ccf->len8_dlc > CAN_MAX_DLEN && ccf->len8_dlc <= CAN_MAX_RAW_DLC) ccf->len = ccf->len8_dlc; } /* convert valid CC DLC value in 'len' into struct can_frame elements */ static void mod_store_ccdlc(struct canfd_frame *cf) { struct can_frame *ccf = (struct can_frame *)cf; /* clear potential leftovers */ ccf->len8_dlc = 0; /* plain data length 0 .. 8 - that was easy */ if (ccf->len <= CAN_MAX_DLEN) return; /* potentially broken values are caught in can_can_gw_rcv() */ if (ccf->len > CAN_MAX_RAW_DLC) return; /* we have a valid dlc value from 9 .. 15 in ccf->len */ ccf->len8_dlc = ccf->len; ccf->len = CAN_MAX_DLEN; } static void mod_and_ccdlc(struct canfd_frame *cf, struct cf_mod *mod) { mod_retrieve_ccdlc(cf); mod_and_len(cf, mod); mod_store_ccdlc(cf); } static void mod_or_ccdlc(struct canfd_frame *cf, struct cf_mod *mod) { mod_retrieve_ccdlc(cf); mod_or_len(cf, mod); mod_store_ccdlc(cf); } static void mod_xor_ccdlc(struct canfd_frame *cf, struct cf_mod *mod) { mod_retrieve_ccdlc(cf); mod_xor_len(cf, mod); mod_store_ccdlc(cf); } static void mod_set_ccdlc(struct canfd_frame *cf, struct cf_mod *mod) { mod_set_len(cf, mod); mod_store_ccdlc(cf); } static void canframecpy(struct canfd_frame *dst, struct can_frame *src) { /* Copy the struct members separately to ensure that no uninitialized * data are copied in the 3 bytes hole of the struct. This is needed * to make easy compares of the data in the struct cf_mod. */ dst->can_id = src->can_id; dst->len = src->len; *(u64 *)dst->data = *(u64 *)src->data; } static void canfdframecpy(struct canfd_frame *dst, struct canfd_frame *src) { /* Copy the struct members separately to ensure that no uninitialized * data are copied in the 2 bytes hole of the struct. This is needed * to make easy compares of the data in the struct cf_mod. */ dst->can_id = src->can_id; dst->flags = src->flags; dst->len = src->len; memcpy(dst->data, src->data, CANFD_MAX_DLEN); } static int cgw_chk_csum_parms(s8 fr, s8 to, s8 re, struct rtcanmsg *r) { s8 dlen = CAN_MAX_DLEN; if (r->flags & CGW_FLAGS_CAN_FD) dlen = CANFD_MAX_DLEN; /* absolute dlc values 0 .. 7 => 0 .. 7, e.g. data [0] * relative to received dlc -1 .. -8 : * e.g. for received dlc = 8 * -1 => index = 7 (data[7]) * -3 => index = 5 (data[5]) * -8 => index = 0 (data[0]) */ if (fr >= -dlen && fr < dlen && to >= -dlen && to < dlen && re >= -dlen && re < dlen) return 0; else return -EINVAL; } static inline int calc_idx(int idx, int rx_len) { if (idx < 0) return rx_len + idx; else return idx; } static void cgw_csum_xor_rel(struct canfd_frame *cf, struct cgw_csum_xor *xor) { int from = calc_idx(xor->from_idx, cf->len); int to = calc_idx(xor->to_idx, cf->len); int res = calc_idx(xor->result_idx, cf->len); u8 val = xor->init_xor_val; int i; if (from < 0 || to < 0 || res < 0) return; if (from <= to) { for (i = from; i <= to; i++) val ^= cf->data[i]; } else { for (i = from; i >= to; i--) val ^= cf->data[i]; } cf->data[res] = val; } static void cgw_csum_xor_pos(struct canfd_frame *cf, struct cgw_csum_xor *xor) { u8 val = xor->init_xor_val; int i; for (i = xor->from_idx; i <= xor->to_idx; i++) val ^= cf->data[i]; cf->data[xor->result_idx] = val; } static void cgw_csum_xor_neg(struct canfd_frame *cf, struct cgw_csum_xor *xor) { u8 val = xor->init_xor_val; int i; for (i = xor->from_idx; i >= xor->to_idx; i--) val ^= cf->data[i]; cf->data[xor->result_idx] = val; } static void cgw_csum_crc8_rel(struct canfd_frame *cf, struct cgw_csum_crc8 *crc8) { int from = calc_idx(crc8->from_idx, cf->len); int to = calc_idx(crc8->to_idx, cf->len); int res = calc_idx(crc8->result_idx, cf->len); u8 crc = crc8->init_crc_val; int i; if (from < 0 || to < 0 || res < 0) return; if (from <= to) { for (i = from; i <= to; i++) crc = crc8->crctab[crc ^ cf->data[i]]; } else { for (i = from; i >= to; i--) crc = crc8->crctab[crc ^ cf->data[i]]; } switch (crc8->profile) { case CGW_CRC8PRF_1U8: crc = crc8->crctab[crc ^ crc8->profile_data[0]]; break; case CGW_CRC8PRF_16U8: crc = crc8->crctab[crc ^ crc8->profile_data[cf->data[1] & 0xF]]; break; case CGW_CRC8PRF_SFFID_XOR: crc = crc8->crctab[crc ^ (cf->can_id & 0xFF) ^ (cf->can_id >> 8 & 0xFF)]; break; } cf->data[res] = crc ^ crc8->final_xor_val; } static void cgw_csum_crc8_pos(struct canfd_frame *cf, struct cgw_csum_crc8 *crc8) { u8 crc = crc8->init_crc_val; int i; for (i = crc8->from_idx; i <= crc8->to_idx; i++) crc = crc8->crctab[crc ^ cf->data[i]]; switch (crc8->profile) { case CGW_CRC8PRF_1U8: crc = crc8->crctab[crc ^ crc8->profile_data[0]]; break; case CGW_CRC8PRF_16U8: crc = crc8->crctab[crc ^ crc8->profile_data[cf->data[1] & 0xF]]; break; case CGW_CRC8PRF_SFFID_XOR: crc = crc8->crctab[crc ^ (cf->can_id & 0xFF) ^ (cf->can_id >> 8 & 0xFF)]; break; } cf->data[crc8->result_idx] = crc ^ crc8->final_xor_val; } static void cgw_csum_crc8_neg(struct canfd_frame *cf, struct cgw_csum_crc8 *crc8) { u8 crc = crc8->init_crc_val; int i; for (i = crc8->from_idx; i >= crc8->to_idx; i--) crc = crc8->crctab[crc ^ cf->data[i]]; switch (crc8->profile) { case CGW_CRC8PRF_1U8: crc = crc8->crctab[crc ^ crc8->profile_data[0]]; break; case CGW_CRC8PRF_16U8: crc = crc8->crctab[crc ^ crc8->profile_data[cf->data[1] & 0xF]]; break; case CGW_CRC8PRF_SFFID_XOR: crc = crc8->crctab[crc ^ (cf->can_id & 0xFF) ^ (cf->can_id >> 8 & 0xFF)]; break; } cf->data[crc8->result_idx] = crc ^ crc8->final_xor_val; } /* the receive & process & send function */ static void can_can_gw_rcv(struct sk_buff *skb, void *data) { struct cgw_job *gwj = (struct cgw_job *)data; struct canfd_frame *cf; struct sk_buff *nskb; struct can_skb_ext *csx, *ncsx; struct cf_mod *mod; int modidx = 0; /* process strictly Classic CAN or CAN FD frames */ if (gwj->flags & CGW_FLAGS_CAN_FD) { if (!can_is_canfd_skb(skb)) return; } else { if (!can_is_can_skb(skb)) return; } csx = can_skb_ext_find(skb); if (!csx) return; /* Do not handle CAN frames routed more than 'max_hops' times. * In general we should never catch this delimiter which is intended * to cover a misconfiguration protection (e.g. circular CAN routes). */ if (csx->can_gw_hops >= max_hops) { /* indicate deleted frames due to misconfiguration */ gwj->deleted_frames++; return; } if (!(gwj->dst.dev->flags & IFF_UP)) { gwj->dropped_frames++; return; } /* is sending the skb back to the incoming interface not allowed? */ if (!(gwj->flags & CGW_FLAGS_CAN_IIF_TX_OK) && csx->can_iif == gwj->dst.dev->ifindex) return; /* clone the given skb, which has not been done in can_rcv() * * When there is at least one modification function activated, * we need to copy the skb as we want to modify skb->data. */ mod = rcu_dereference(gwj->cf_mod); if (mod->modfunc[0]) nskb = skb_copy(skb, GFP_ATOMIC); else nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) { gwj->dropped_frames++; return; } /* the cloned/copied nskb points to the skb extension of the original * skb with an increased refcount. skb_ext_add() creates a copy to * separate the skb extension data to modify the can_gw_hops. */ ncsx = skb_ext_add(nskb, SKB_EXT_CAN); if (!ncsx) { kfree_skb(nskb); gwj->dropped_frames++; return; } /* put the incremented hop counter in the cloned skb */ ncsx->can_gw_hops = csx->can_gw_hops + 1; /* first processing of this CAN frame -> adjust to private hop limit */ if (gwj->limit_hops && ncsx->can_gw_hops == 1) ncsx->can_gw_hops = max_hops - gwj->limit_hops + 1; nskb->dev = gwj->dst.dev; /* pointer to modifiable CAN frame */ cf = (struct canfd_frame *)nskb->data; /* perform preprocessed modification functions if there are any */ while (modidx < MAX_MODFUNCTIONS && mod->modfunc[modidx]) (*mod->modfunc[modidx++])(cf, mod); /* Has the CAN frame been modified? */ if (modidx) { /* get available space for the processed CAN frame type */ int max_len = nskb->len - offsetof(struct canfd_frame, data); /* dlc may have changed, make sure it fits to the CAN frame */ if (cf->len > max_len) { /* delete frame due to misconfiguration */ gwj->deleted_frames++; kfree_skb(nskb); return; } /* check for checksum updates */ if (mod->csumfunc.crc8) (*mod->csumfunc.crc8)(cf, &mod->csum.crc8); if (mod->csumfunc.xor) (*mod->csumfunc.xor)(cf, &mod->csum.xor); } /* clear the skb timestamp if not configured the other way */ if (!(gwj->flags & CGW_FLAGS_CAN_SRC_TSTAMP)) nskb->tstamp = 0; /* send to netdevice */ if (can_send(nskb, gwj->flags & CGW_FLAGS_CAN_ECHO)) gwj->dropped_frames++; else gwj->handled_frames++; } static inline int cgw_register_filter(struct net *net, struct cgw_job *gwj) { return can_rx_register(net, gwj->src.dev, gwj->ccgw.filter.can_id, gwj->ccgw.filter.can_mask, can_can_gw_rcv, gwj, "gw", NULL); } static inline void cgw_unregister_filter(struct net *net, struct cgw_job *gwj) { can_rx_unregister(net, gwj->src.dev, gwj->ccgw.filter.can_id, gwj->ccgw.filter.can_mask, can_can_gw_rcv, gwj); } static void cgw_job_free_rcu(struct rcu_head *rcu_head) { struct cgw_job *gwj = container_of(rcu_head, struct cgw_job, rcu); /* cgw_job::cf_mod is always accessed from the same cgw_job object within * the same RCU read section. Once cgw_job is scheduled for removal, * cf_mod can also be removed without mandating an additional grace period. */ kfree(rcu_access_pointer(gwj->cf_mod)); kmem_cache_free(cgw_cache, gwj); } /* Return cgw_job::cf_mod with RTNL protected section */ static struct cf_mod *cgw_job_cf_mod(struct cgw_job *gwj) { return rcu_dereference_protected(gwj->cf_mod, rtnl_is_locked()); } static int cgw_notifier(struct notifier_block *nb, unsigned long msg, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); if (dev->type != ARPHRD_CAN) return NOTIFY_DONE; if (msg == NETDEV_UNREGISTER) { struct cgw_job *gwj = NULL; struct hlist_node *nx; ASSERT_RTNL(); hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) { if (gwj->src.dev == dev || gwj->dst.dev == dev) { hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); call_rcu(&gwj->rcu, cgw_job_free_rcu); } } } return NOTIFY_DONE; } static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type, u32 pid, u32 seq, int flags) { struct rtcanmsg *rtcan; struct nlmsghdr *nlh; struct cf_mod *mod; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtcan), flags); if (!nlh) return -EMSGSIZE; rtcan = nlmsg_data(nlh); rtcan->can_family = AF_CAN; rtcan->gwtype = gwj->gwtype; rtcan->flags = gwj->flags; /* add statistics if available */ if (gwj->handled_frames) { if (nla_put_u32(skb, CGW_HANDLED, gwj->handled_frames) < 0) goto cancel; } if (gwj->dropped_frames) { if (nla_put_u32(skb, CGW_DROPPED, gwj->dropped_frames) < 0) goto cancel; } if (gwj->deleted_frames) { if (nla_put_u32(skb, CGW_DELETED, gwj->deleted_frames) < 0) goto cancel; } /* check non default settings of attributes */ if (gwj->limit_hops) { if (nla_put_u8(skb, CGW_LIM_HOPS, gwj->limit_hops) < 0) goto cancel; } mod = cgw_job_cf_mod(gwj); if (gwj->flags & CGW_FLAGS_CAN_FD) { struct cgw_fdframe_mod mb; if (mod->modtype.and) { memcpy(&mb.cf, &mod->modframe.and, sizeof(mb.cf)); mb.modtype = mod->modtype.and; if (nla_put(skb, CGW_FDMOD_AND, sizeof(mb), &mb) < 0) goto cancel; } if (mod->modtype.or) { memcpy(&mb.cf, &mod->modframe.or, sizeof(mb.cf)); mb.modtype = mod->modtype.or; if (nla_put(skb, CGW_FDMOD_OR, sizeof(mb), &mb) < 0) goto cancel; } if (mod->modtype.xor) { memcpy(&mb.cf, &mod->modframe.xor, sizeof(mb.cf)); mb.modtype = mod->modtype.xor; if (nla_put(skb, CGW_FDMOD_XOR, sizeof(mb), &mb) < 0) goto cancel; } if (mod->modtype.set) { memcpy(&mb.cf, &mod->modframe.set, sizeof(mb.cf)); mb.modtype = mod->modtype.set; if (nla_put(skb, CGW_FDMOD_SET, sizeof(mb), &mb) < 0) goto cancel; } } else { struct cgw_frame_mod mb; if (mod->modtype.and) { memcpy(&mb.cf, &mod->modframe.and, sizeof(mb.cf)); mb.modtype = mod->modtype.and; if (nla_put(skb, CGW_MOD_AND, sizeof(mb), &mb) < 0) goto cancel; } if (mod->modtype.or) { memcpy(&mb.cf, &mod->modframe.or, sizeof(mb.cf)); mb.modtype = mod->modtype.or; if (nla_put(skb, CGW_MOD_OR, sizeof(mb), &mb) < 0) goto cancel; } if (mod->modtype.xor) { memcpy(&mb.cf, &mod->modframe.xor, sizeof(mb.cf)); mb.modtype = mod->modtype.xor; if (nla_put(skb, CGW_MOD_XOR, sizeof(mb), &mb) < 0) goto cancel; } if (mod->modtype.set) { memcpy(&mb.cf, &mod->modframe.set, sizeof(mb.cf)); mb.modtype = mod->modtype.set; if (nla_put(skb, CGW_MOD_SET, sizeof(mb), &mb) < 0) goto cancel; } } if (mod->uid) { if (nla_put_u32(skb, CGW_MOD_UID, mod->uid) < 0) goto cancel; } if (mod->csumfunc.crc8) { if (nla_put(skb, CGW_CS_CRC8, CGW_CS_CRC8_LEN, &mod->csum.crc8) < 0) goto cancel; } if (mod->csumfunc.xor) { if (nla_put(skb, CGW_CS_XOR, CGW_CS_XOR_LEN, &mod->csum.xor) < 0) goto cancel; } if (gwj->gwtype == CGW_TYPE_CAN_CAN) { if (gwj->ccgw.filter.can_id || gwj->ccgw.filter.can_mask) { if (nla_put(skb, CGW_FILTER, sizeof(struct can_filter), &gwj->ccgw.filter) < 0) goto cancel; } if (nla_put_u32(skb, CGW_SRC_IF, gwj->ccgw.src_idx) < 0) goto cancel; if (nla_put_u32(skb, CGW_DST_IF, gwj->ccgw.dst_idx) < 0) goto cancel; } nlmsg_end(skb, nlh); return 0; cancel: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } /* Dump information about all CAN gateway jobs, in response to RTM_GETROUTE */ static int cgw_dump_jobs(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct cgw_job *gwj = NULL; int idx = 0; int s_idx = cb->args[0]; rcu_read_lock(); hlist_for_each_entry_rcu(gwj, &net->can.cgw_list, list) { if (idx < s_idx) goto cont; if (cgw_put_job(skb, gwj, RTM_NEWROUTE, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI) < 0) break; cont: idx++; } rcu_read_unlock(); cb->args[0] = idx; return skb->len; } static const struct nla_policy cgw_policy[CGW_MAX + 1] = { [CGW_MOD_AND] = { .len = sizeof(struct cgw_frame_mod) }, [CGW_MOD_OR] = { .len = sizeof(struct cgw_frame_mod) }, [CGW_MOD_XOR] = { .len = sizeof(struct cgw_frame_mod) }, [CGW_MOD_SET] = { .len = sizeof(struct cgw_frame_mod) }, [CGW_CS_XOR] = { .len = sizeof(struct cgw_csum_xor) }, [CGW_CS_CRC8] = { .len = sizeof(struct cgw_csum_crc8) }, [CGW_SRC_IF] = { .type = NLA_U32 }, [CGW_DST_IF] = { .type = NLA_U32 }, [CGW_FILTER] = { .len = sizeof(struct can_filter) }, [CGW_LIM_HOPS] = { .type = NLA_U8 }, [CGW_MOD_UID] = { .type = NLA_U32 }, [CGW_FDMOD_AND] = { .len = sizeof(struct cgw_fdframe_mod) }, [CGW_FDMOD_OR] = { .len = sizeof(struct cgw_fdframe_mod) }, [CGW_FDMOD_XOR] = { .len = sizeof(struct cgw_fdframe_mod) }, [CGW_FDMOD_SET] = { .len = sizeof(struct cgw_fdframe_mod) }, }; /* check for common and gwtype specific attributes */ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod, u8 gwtype, void *gwtypeattr, u8 *limhops) { struct nlattr *tb[CGW_MAX + 1]; struct rtcanmsg *r = nlmsg_data(nlh); int modidx = 0; int err = 0; /* initialize modification & checksum data space */ memset(mod, 0, sizeof(*mod)); err = nlmsg_parse_deprecated(nlh, sizeof(struct rtcanmsg), tb, CGW_MAX, cgw_policy, NULL); if (err < 0) return err; if (tb[CGW_LIM_HOPS]) { *limhops = nla_get_u8(tb[CGW_LIM_HOPS]); if (*limhops < 1 || *limhops > max_hops) return -EINVAL; } /* check for AND/OR/XOR/SET modifications */ if (r->flags & CGW_FLAGS_CAN_FD) { struct cgw_fdframe_mod mb; if (tb[CGW_FDMOD_AND]) { nla_memcpy(&mb, tb[CGW_FDMOD_AND], CGW_FDMODATTR_LEN); canfdframecpy(&mod->modframe.and, &mb.cf); mod->modtype.and = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_and_id; if (mb.modtype & CGW_MOD_LEN) mod->modfunc[modidx++] = mod_and_len; if (mb.modtype & CGW_MOD_FLAGS) mod->modfunc[modidx++] = mod_and_flags; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_and_fddata; } if (tb[CGW_FDMOD_OR]) { nla_memcpy(&mb, tb[CGW_FDMOD_OR], CGW_FDMODATTR_LEN); canfdframecpy(&mod->modframe.or, &mb.cf); mod->modtype.or = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_or_id; if (mb.modtype & CGW_MOD_LEN) mod->modfunc[modidx++] = mod_or_len; if (mb.modtype & CGW_MOD_FLAGS) mod->modfunc[modidx++] = mod_or_flags; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_or_fddata; } if (tb[CGW_FDMOD_XOR]) { nla_memcpy(&mb, tb[CGW_FDMOD_XOR], CGW_FDMODATTR_LEN); canfdframecpy(&mod->modframe.xor, &mb.cf); mod->modtype.xor = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_xor_id; if (mb.modtype & CGW_MOD_LEN) mod->modfunc[modidx++] = mod_xor_len; if (mb.modtype & CGW_MOD_FLAGS) mod->modfunc[modidx++] = mod_xor_flags; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_xor_fddata; } if (tb[CGW_FDMOD_SET]) { nla_memcpy(&mb, tb[CGW_FDMOD_SET], CGW_FDMODATTR_LEN); canfdframecpy(&mod->modframe.set, &mb.cf); mod->modtype.set = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_set_id; if (mb.modtype & CGW_MOD_LEN) mod->modfunc[modidx++] = mod_set_len; if (mb.modtype & CGW_MOD_FLAGS) mod->modfunc[modidx++] = mod_set_flags; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_set_fddata; } } else { struct cgw_frame_mod mb; if (tb[CGW_MOD_AND]) { nla_memcpy(&mb, tb[CGW_MOD_AND], CGW_MODATTR_LEN); canframecpy(&mod->modframe.and, &mb.cf); mod->modtype.and = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_and_id; if (mb.modtype & CGW_MOD_DLC) mod->modfunc[modidx++] = mod_and_ccdlc; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_and_data; } if (tb[CGW_MOD_OR]) { nla_memcpy(&mb, tb[CGW_MOD_OR], CGW_MODATTR_LEN); canframecpy(&mod->modframe.or, &mb.cf); mod->modtype.or = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_or_id; if (mb.modtype & CGW_MOD_DLC) mod->modfunc[modidx++] = mod_or_ccdlc; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_or_data; } if (tb[CGW_MOD_XOR]) { nla_memcpy(&mb, tb[CGW_MOD_XOR], CGW_MODATTR_LEN); canframecpy(&mod->modframe.xor, &mb.cf); mod->modtype.xor = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_xor_id; if (mb.modtype & CGW_MOD_DLC) mod->modfunc[modidx++] = mod_xor_ccdlc; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_xor_data; } if (tb[CGW_MOD_SET]) { nla_memcpy(&mb, tb[CGW_MOD_SET], CGW_MODATTR_LEN); canframecpy(&mod->modframe.set, &mb.cf); mod->modtype.set = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_set_id; if (mb.modtype & CGW_MOD_DLC) mod->modfunc[modidx++] = mod_set_ccdlc; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_set_data; } } /* check for checksum operations after CAN frame modifications */ if (modidx) { if (tb[CGW_CS_CRC8]) { struct cgw_csum_crc8 *c = nla_data(tb[CGW_CS_CRC8]); err = cgw_chk_csum_parms(c->from_idx, c->to_idx, c->result_idx, r); if (err) return err; nla_memcpy(&mod->csum.crc8, tb[CGW_CS_CRC8], CGW_CS_CRC8_LEN); /* select dedicated processing function to reduce * runtime operations in receive hot path. */ if (c->from_idx < 0 || c->to_idx < 0 || c->result_idx < 0) mod->csumfunc.crc8 = cgw_csum_crc8_rel; else if (c->from_idx <= c->to_idx) mod->csumfunc.crc8 = cgw_csum_crc8_pos; else mod->csumfunc.crc8 = cgw_csum_crc8_neg; } if (tb[CGW_CS_XOR]) { struct cgw_csum_xor *c = nla_data(tb[CGW_CS_XOR]); err = cgw_chk_csum_parms(c->from_idx, c->to_idx, c->result_idx, r); if (err) return err; nla_memcpy(&mod->csum.xor, tb[CGW_CS_XOR], CGW_CS_XOR_LEN); /* select dedicated processing function to reduce * runtime operations in receive hot path. */ if (c->from_idx < 0 || c->to_idx < 0 || c->result_idx < 0) mod->csumfunc.xor = cgw_csum_xor_rel; else if (c->from_idx <= c->to_idx) mod->csumfunc.xor = cgw_csum_xor_pos; else mod->csumfunc.xor = cgw_csum_xor_neg; } if (tb[CGW_MOD_UID]) nla_memcpy(&mod->uid, tb[CGW_MOD_UID], sizeof(u32)); } if (gwtype == CGW_TYPE_CAN_CAN) { /* check CGW_TYPE_CAN_CAN specific attributes */ struct can_can_gw *ccgw = (struct can_can_gw *)gwtypeattr; memset(ccgw, 0, sizeof(*ccgw)); /* check for can_filter in attributes */ if (tb[CGW_FILTER]) nla_memcpy(&ccgw->filter, tb[CGW_FILTER], sizeof(struct can_filter)); err = -ENODEV; /* specifying two interfaces is mandatory */ if (!tb[CGW_SRC_IF] || !tb[CGW_DST_IF]) return err; ccgw->src_idx = nla_get_u32(tb[CGW_SRC_IF]); ccgw->dst_idx = nla_get_u32(tb[CGW_DST_IF]); /* both indices set to 0 for flushing all routing entries */ if (!ccgw->src_idx && !ccgw->dst_idx) return 0; /* only one index set to 0 is an error */ if (!ccgw->src_idx || !ccgw->dst_idx) return err; } /* add the checks for other gwtypes here */ return 0; } static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct rtcanmsg *r; struct cgw_job *gwj; struct cf_mod *mod; struct can_can_gw ccgw; u8 limhops = 0; int err = 0; if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (nlmsg_len(nlh) < sizeof(*r)) return -EINVAL; r = nlmsg_data(nlh); if (r->can_family != AF_CAN) return -EPFNOSUPPORT; /* so far we only support CAN -> CAN routings */ if (r->gwtype != CGW_TYPE_CAN_CAN) return -EINVAL; mod = kmalloc_obj(*mod); if (!mod) return -ENOMEM; err = cgw_parse_attr(nlh, mod, CGW_TYPE_CAN_CAN, &ccgw, &limhops); if (err < 0) goto out_free_cf; if (mod->uid) { ASSERT_RTNL(); /* check for updating an existing job with identical uid */ hlist_for_each_entry(gwj, &net->can.cgw_list, list) { struct cf_mod *old_cf; old_cf = cgw_job_cf_mod(gwj); if (old_cf->uid != mod->uid) continue; /* interfaces & filters must be identical */ if (memcmp(&gwj->ccgw, &ccgw, sizeof(ccgw))) { err = -EINVAL; goto out_free_cf; } rcu_assign_pointer(gwj->cf_mod, mod); kfree_rcu_mightsleep(old_cf); return 0; } } /* ifindex == 0 is not allowed for job creation */ if (!ccgw.src_idx || !ccgw.dst_idx) { err = -ENODEV; goto out_free_cf; } gwj = kmem_cache_alloc(cgw_cache, GFP_KERNEL); if (!gwj) { err = -ENOMEM; goto out_free_cf; } gwj->handled_frames = 0; gwj->dropped_frames = 0; gwj->deleted_frames = 0; gwj->flags = r->flags; gwj->gwtype = r->gwtype; gwj->limit_hops = limhops; /* insert already parsed information */ RCU_INIT_POINTER(gwj->cf_mod, mod); memcpy(&gwj->ccgw, &ccgw, sizeof(ccgw)); err = -ENODEV; gwj->src.dev = __dev_get_by_index(net, gwj->ccgw.src_idx); if (!gwj->src.dev) goto out; if (gwj->src.dev->type != ARPHRD_CAN) goto out; gwj->dst.dev = __dev_get_by_index(net, gwj->ccgw.dst_idx); if (!gwj->dst.dev) goto out; if (gwj->dst.dev->type != ARPHRD_CAN) goto out; /* is sending the skb back to the incoming interface intended? */ if (gwj->src.dev == gwj->dst.dev && !(gwj->flags & CGW_FLAGS_CAN_IIF_TX_OK)) { err = -EINVAL; goto out; } ASSERT_RTNL(); err = cgw_register_filter(net, gwj); if (!err) hlist_add_head_rcu(&gwj->list, &net->can.cgw_list); out: if (err) { kmem_cache_free(cgw_cache, gwj); out_free_cf: kfree(mod); } return err; } static void cgw_remove_all_jobs(struct net *net) { struct cgw_job *gwj = NULL; struct hlist_node *nx; ASSERT_RTNL(); hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) { hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); call_rcu(&gwj->rcu, cgw_job_free_rcu); } } static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct cgw_job *gwj = NULL; struct hlist_node *nx; struct rtcanmsg *r; struct cf_mod mod; struct can_can_gw ccgw; u8 limhops = 0; int err = 0; if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (nlmsg_len(nlh) < sizeof(*r)) return -EINVAL; r = nlmsg_data(nlh); if (r->can_family != AF_CAN) return -EPFNOSUPPORT; /* so far we only support CAN -> CAN routings */ if (r->gwtype != CGW_TYPE_CAN_CAN) return -EINVAL; err = cgw_parse_attr(nlh, &mod, CGW_TYPE_CAN_CAN, &ccgw, &limhops); if (err < 0) return err; /* two interface indices both set to 0 => remove all entries */ if (!ccgw.src_idx && !ccgw.dst_idx) { cgw_remove_all_jobs(net); return 0; } err = -EINVAL; ASSERT_RTNL(); /* remove only the first matching entry */ hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) { struct cf_mod *cf_mod; if (gwj->flags != r->flags) continue; if (gwj->limit_hops != limhops) continue; cf_mod = cgw_job_cf_mod(gwj); /* we have a match when uid is enabled and identical */ if (cf_mod->uid || mod.uid) { if (cf_mod->uid != mod.uid) continue; } else { /* no uid => check for identical modifications */ if (memcmp(cf_mod, &mod, sizeof(mod))) continue; } /* if (r->gwtype == CGW_TYPE_CAN_CAN) - is made sure here */ if (memcmp(&gwj->ccgw, &ccgw, sizeof(ccgw))) continue; hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); call_rcu(&gwj->rcu, cgw_job_free_rcu); err = 0; break; } return err; } static int __net_init cangw_pernet_init(struct net *net) { INIT_HLIST_HEAD(&net->can.cgw_list); return 0; } static void __net_exit cangw_pernet_exit_batch(struct list_head *net_list) { struct net *net; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) cgw_remove_all_jobs(net); rtnl_unlock(); } static struct pernet_operations cangw_pernet_ops = { .init = cangw_pernet_init, .exit_batch = cangw_pernet_exit_batch, }; static const struct rtnl_msg_handler cgw_rtnl_msg_handlers[] __initconst_or_module = { {.owner = THIS_MODULE, .protocol = PF_CAN, .msgtype = RTM_NEWROUTE, .doit = cgw_create_job}, {.owner = THIS_MODULE, .protocol = PF_CAN, .msgtype = RTM_DELROUTE, .doit = cgw_remove_job}, {.owner = THIS_MODULE, .protocol = PF_CAN, .msgtype = RTM_GETROUTE, .dumpit = cgw_dump_jobs}, }; static __init int cgw_module_init(void) { int ret; /* sanitize given module parameter */ max_hops = clamp_t(unsigned int, max_hops, CGW_MIN_HOPS, CGW_MAX_HOPS); pr_info("can: netlink gateway - max_hops=%d\n", max_hops); ret = register_pernet_subsys(&cangw_pernet_ops); if (ret) return ret; ret = -ENOMEM; cgw_cache = kmem_cache_create("can_gw", sizeof(struct cgw_job), 0, 0, NULL); if (!cgw_cache) goto out_cache_create; /* set notifier */ notifier.notifier_call = cgw_notifier; ret = register_netdevice_notifier(&notifier); if (ret) goto out_register_notifier; ret = rtnl_register_many(cgw_rtnl_msg_handlers); if (ret) goto out_rtnl_register; return 0; out_rtnl_register: unregister_netdevice_notifier(&notifier); out_register_notifier: kmem_cache_destroy(cgw_cache); out_cache_create: unregister_pernet_subsys(&cangw_pernet_ops); return ret; } static __exit void cgw_module_exit(void) { rtnl_unregister_all(PF_CAN); unregister_netdevice_notifier(&notifier); unregister_pernet_subsys(&cangw_pernet_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ kmem_cache_destroy(cgw_cache); } module_init(cgw_module_init); module_exit(cgw_module_exit);
7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM scsi #if !defined(_TRACE_SCSI_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SCSI_H #include <scsi/scsi_cmnd.h> #include <scsi/scsi_host.h> #include <linux/tracepoint.h> #include <linux/trace_seq.h> #define scsi_opcode_name(opcode) { opcode, #opcode } #define show_opcode_name(val) \ __print_symbolic(val, \ scsi_opcode_name(TEST_UNIT_READY), \ scsi_opcode_name(REZERO_UNIT), \ scsi_opcode_name(REQUEST_SENSE), \ scsi_opcode_name(FORMAT_UNIT), \ scsi_opcode_name(READ_BLOCK_LIMITS), \ scsi_opcode_name(REASSIGN_BLOCKS), \ scsi_opcode_name(INITIALIZE_ELEMENT_STATUS), \ scsi_opcode_name(READ_6), \ scsi_opcode_name(WRITE_6), \ scsi_opcode_name(SEEK_6), \ scsi_opcode_name(READ_REVERSE), \ scsi_opcode_name(WRITE_FILEMARKS), \ scsi_opcode_name(SPACE), \ scsi_opcode_name(INQUIRY), \ scsi_opcode_name(RECOVER_BUFFERED_DATA), \ scsi_opcode_name(MODE_SELECT), \ scsi_opcode_name(RESERVE_6), \ scsi_opcode_name(RELEASE_6), \ scsi_opcode_name(COPY), \ scsi_opcode_name(ERASE), \ scsi_opcode_name(MODE_SENSE), \ scsi_opcode_name(START_STOP), \ scsi_opcode_name(RECEIVE_DIAGNOSTIC), \ scsi_opcode_name(SEND_DIAGNOSTIC), \ scsi_opcode_name(ALLOW_MEDIUM_REMOVAL), \ scsi_opcode_name(SET_WINDOW), \ scsi_opcode_name(READ_CAPACITY), \ scsi_opcode_name(READ_10), \ scsi_opcode_name(WRITE_10), \ scsi_opcode_name(SEEK_10), \ scsi_opcode_name(POSITION_TO_ELEMENT), \ scsi_opcode_name(WRITE_VERIFY), \ scsi_opcode_name(VERIFY), \ scsi_opcode_name(SEARCH_HIGH), \ scsi_opcode_name(SEARCH_EQUAL), \ scsi_opcode_name(SEARCH_LOW), \ scsi_opcode_name(SET_LIMITS), \ scsi_opcode_name(PRE_FETCH), \ scsi_opcode_name(READ_POSITION), \ scsi_opcode_name(SYNCHRONIZE_CACHE), \ scsi_opcode_name(LOCK_UNLOCK_CACHE), \ scsi_opcode_name(READ_DEFECT_DATA), \ scsi_opcode_name(MEDIUM_SCAN), \ scsi_opcode_name(COMPARE), \ scsi_opcode_name(COPY_VERIFY), \ scsi_opcode_name(WRITE_BUFFER), \ scsi_opcode_name(READ_BUFFER), \ scsi_opcode_name(UPDATE_BLOCK), \ scsi_opcode_name(READ_LONG), \ scsi_opcode_name(WRITE_LONG), \ scsi_opcode_name(CHANGE_DEFINITION), \ scsi_opcode_name(WRITE_SAME), \ scsi_opcode_name(UNMAP), \ scsi_opcode_name(READ_TOC), \ scsi_opcode_name(LOG_SELECT), \ scsi_opcode_name(LOG_SENSE), \ scsi_opcode_name(XDWRITEREAD_10), \ scsi_opcode_name(MODE_SELECT_10), \ scsi_opcode_name(RESERVE_10), \ scsi_opcode_name(RELEASE_10), \ scsi_opcode_name(MODE_SENSE_10), \ scsi_opcode_name(PERSISTENT_RESERVE_IN), \ scsi_opcode_name(PERSISTENT_RESERVE_OUT), \ scsi_opcode_name(VARIABLE_LENGTH_CMD), \ scsi_opcode_name(REPORT_LUNS), \ scsi_opcode_name(MAINTENANCE_IN), \ scsi_opcode_name(MAINTENANCE_OUT), \ scsi_opcode_name(MOVE_MEDIUM), \ scsi_opcode_name(EXCHANGE_MEDIUM), \ scsi_opcode_name(READ_12), \ scsi_opcode_name(WRITE_12), \ scsi_opcode_name(WRITE_VERIFY_12), \ scsi_opcode_name(SEARCH_HIGH_12), \ scsi_opcode_name(SEARCH_EQUAL_12), \ scsi_opcode_name(SEARCH_LOW_12), \ scsi_opcode_name(READ_ELEMENT_STATUS), \ scsi_opcode_name(SEND_VOLUME_TAG), \ scsi_opcode_name(WRITE_LONG_2), \ scsi_opcode_name(READ_16), \ scsi_opcode_name(WRITE_16), \ scsi_opcode_name(VERIFY_16), \ scsi_opcode_name(WRITE_SAME_16), \ scsi_opcode_name(ZBC_OUT), \ scsi_opcode_name(ZBC_IN), \ scsi_opcode_name(SERVICE_ACTION_IN_16), \ scsi_opcode_name(READ_32), \ scsi_opcode_name(WRITE_32), \ scsi_opcode_name(WRITE_SAME_32), \ scsi_opcode_name(ATA_16), \ scsi_opcode_name(WRITE_ATOMIC_16), \ scsi_opcode_name(ATA_12)) #define scsi_hostbyte_name(result) { result, #result } #define show_hostbyte_name(val) \ __print_symbolic(val, \ scsi_hostbyte_name(DID_OK), \ scsi_hostbyte_name(DID_NO_CONNECT), \ scsi_hostbyte_name(DID_BUS_BUSY), \ scsi_hostbyte_name(DID_TIME_OUT), \ scsi_hostbyte_name(DID_BAD_TARGET), \ scsi_hostbyte_name(DID_ABORT), \ scsi_hostbyte_name(DID_PARITY), \ scsi_hostbyte_name(DID_ERROR), \ scsi_hostbyte_name(DID_RESET), \ scsi_hostbyte_name(DID_BAD_INTR), \ scsi_hostbyte_name(DID_PASSTHROUGH), \ scsi_hostbyte_name(DID_SOFT_ERROR), \ scsi_hostbyte_name(DID_IMM_RETRY), \ scsi_hostbyte_name(DID_REQUEUE), \ scsi_hostbyte_name(DID_TRANSPORT_DISRUPTED), \ scsi_hostbyte_name(DID_TRANSPORT_FAILFAST)) #define scsi_statusbyte_name(result) { result, #result } #define show_statusbyte_name(val) \ __print_symbolic(val, \ scsi_statusbyte_name(SAM_STAT_GOOD), \ scsi_statusbyte_name(SAM_STAT_CHECK_CONDITION), \ scsi_statusbyte_name(SAM_STAT_CONDITION_MET), \ scsi_statusbyte_name(SAM_STAT_BUSY), \ scsi_statusbyte_name(SAM_STAT_INTERMEDIATE), \ scsi_statusbyte_name(SAM_STAT_INTERMEDIATE_CONDITION_MET), \ scsi_statusbyte_name(SAM_STAT_RESERVATION_CONFLICT), \ scsi_statusbyte_name(SAM_STAT_COMMAND_TERMINATED), \ scsi_statusbyte_name(SAM_STAT_TASK_SET_FULL), \ scsi_statusbyte_name(SAM_STAT_ACA_ACTIVE), \ scsi_statusbyte_name(SAM_STAT_TASK_ABORTED)) #define scsi_prot_op_name(result) { result, #result } #define show_prot_op_name(val) \ __print_symbolic(val, \ scsi_prot_op_name(SCSI_PROT_NORMAL), \ scsi_prot_op_name(SCSI_PROT_READ_INSERT), \ scsi_prot_op_name(SCSI_PROT_WRITE_STRIP), \ scsi_prot_op_name(SCSI_PROT_READ_STRIP), \ scsi_prot_op_name(SCSI_PROT_WRITE_INSERT), \ scsi_prot_op_name(SCSI_PROT_READ_PASS), \ scsi_prot_op_name(SCSI_PROT_WRITE_PASS)) const char *scsi_trace_parse_cdb(struct trace_seq*, unsigned char*, int); #define __parse_cdb(cdb, len) scsi_trace_parse_cdb(p, cdb, len) TRACE_EVENT(scsi_dispatch_cmd_start, TP_PROTO(struct scsi_cmnd *cmd), TP_ARGS(cmd), TP_STRUCT__entry( __field( unsigned int, host_no ) __field( unsigned int, channel ) __field( unsigned int, id ) __field( unsigned int, lun ) __field( unsigned int, opcode ) __field( unsigned int, cmd_len ) __field( int, driver_tag) __field( int, scheduler_tag) __field( unsigned int, data_sglen ) __field( unsigned int, prot_sglen ) __field( unsigned char, prot_op ) __dynamic_array(unsigned char, cmnd, cmd->cmd_len) ), TP_fast_assign( __entry->host_no = cmd->device->host->host_no; __entry->channel = cmd->device->channel; __entry->id = cmd->device->id; __entry->lun = cmd->device->lun; __entry->opcode = cmd->cmnd[0]; __entry->cmd_len = cmd->cmd_len; __entry->driver_tag = scsi_cmd_to_rq(cmd)->tag; __entry->scheduler_tag = scsi_cmd_to_rq(cmd)->internal_tag; __entry->data_sglen = scsi_sg_count(cmd); __entry->prot_sglen = scsi_prot_sg_count(cmd); __entry->prot_op = scsi_get_prot_op(cmd); memcpy(__get_dynamic_array(cmnd), cmd->cmnd, cmd->cmd_len); ), TP_printk("host_no=%u channel=%u id=%u lun=%u data_sgl=%u prot_sgl=%u" \ " prot_op=%s driver_tag=%d scheduler_tag=%d cmnd=(%s %s raw=%s)", __entry->host_no, __entry->channel, __entry->id, __entry->lun, __entry->data_sglen, __entry->prot_sglen, show_prot_op_name(__entry->prot_op), __entry->driver_tag, __entry->scheduler_tag, show_opcode_name(__entry->opcode), __parse_cdb(__get_dynamic_array(cmnd), __entry->cmd_len), __print_hex(__get_dynamic_array(cmnd), __entry->cmd_len)) ); #define scsi_rtn_name(result) { result, #result } #define show_rtn_name(val) \ __print_symbolic(val, \ scsi_rtn_name(SCSI_MLQUEUE_HOST_BUSY), \ scsi_rtn_name(SCSI_MLQUEUE_DEVICE_BUSY), \ scsi_rtn_name(SCSI_MLQUEUE_EH_RETRY), \ scsi_rtn_name(SCSI_MLQUEUE_TARGET_BUSY)) TRACE_EVENT(scsi_dispatch_cmd_error, TP_PROTO(struct scsi_cmnd *cmd, int rtn), TP_ARGS(cmd, rtn), TP_STRUCT__entry( __field( unsigned int, host_no ) __field( unsigned int, channel ) __field( unsigned int, id ) __field( unsigned int, lun ) __field( int, rtn ) __field( unsigned int, opcode ) __field( unsigned int, cmd_len ) __field( int, driver_tag) __field( int, scheduler_tag) __field( unsigned int, data_sglen ) __field( unsigned int, prot_sglen ) __field( unsigned char, prot_op ) __dynamic_array(unsigned char, cmnd, cmd->cmd_len) ), TP_fast_assign( __entry->host_no = cmd->device->host->host_no; __entry->channel = cmd->device->channel; __entry->id = cmd->device->id; __entry->lun = cmd->device->lun; __entry->rtn = rtn; __entry->opcode = cmd->cmnd[0]; __entry->cmd_len = cmd->cmd_len; __entry->driver_tag = scsi_cmd_to_rq(cmd)->tag; __entry->scheduler_tag = scsi_cmd_to_rq(cmd)->internal_tag; __entry->data_sglen = scsi_sg_count(cmd); __entry->prot_sglen = scsi_prot_sg_count(cmd); __entry->prot_op = scsi_get_prot_op(cmd); memcpy(__get_dynamic_array(cmnd), cmd->cmnd, cmd->cmd_len); ), TP_printk("host_no=%u channel=%u id=%u lun=%u data_sgl=%u prot_sgl=%u" \ " prot_op=%s driver_tag=%d scheduler_tag=%d cmnd=(%s %s raw=%s)" \ " rtn=%s", __entry->host_no, __entry->channel, __entry->id, __entry->lun, __entry->data_sglen, __entry->prot_sglen, show_prot_op_name(__entry->prot_op), __entry->driver_tag, __entry->scheduler_tag, show_opcode_name(__entry->opcode), __parse_cdb(__get_dynamic_array(cmnd), __entry->cmd_len), __print_hex(__get_dynamic_array(cmnd), __entry->cmd_len), show_rtn_name(__entry->rtn) ) ); DECLARE_EVENT_CLASS(scsi_cmd_done_timeout_template, TP_PROTO(struct scsi_cmnd *cmd), TP_ARGS(cmd), TP_STRUCT__entry( __field( unsigned int, host_no ) __field( unsigned int, channel ) __field( unsigned int, id ) __field( unsigned int, lun ) __field( int, result ) __field( unsigned int, opcode ) __field( unsigned int, cmd_len ) __field( int, driver_tag) __field( int, scheduler_tag) __field( unsigned int, data_sglen ) __field( unsigned int, prot_sglen ) __field( unsigned char, prot_op ) __dynamic_array(unsigned char, cmnd, cmd->cmd_len) __field( u8, sense_key ) __field( u8, asc ) __field( u8, ascq ) ), TP_fast_assign( struct scsi_sense_hdr sshdr; __entry->host_no = cmd->device->host->host_no; __entry->channel = cmd->device->channel; __entry->id = cmd->device->id; __entry->lun = cmd->device->lun; __entry->result = cmd->result; __entry->opcode = cmd->cmnd[0]; __entry->cmd_len = cmd->cmd_len; __entry->driver_tag = scsi_cmd_to_rq(cmd)->tag; __entry->scheduler_tag = scsi_cmd_to_rq(cmd)->internal_tag; __entry->data_sglen = scsi_sg_count(cmd); __entry->prot_sglen = scsi_prot_sg_count(cmd); __entry->prot_op = scsi_get_prot_op(cmd); memcpy(__get_dynamic_array(cmnd), cmd->cmnd, cmd->cmd_len); if (cmd->sense_buffer && SCSI_SENSE_VALID(cmd) && scsi_command_normalize_sense(cmd, &sshdr)) { __entry->sense_key = sshdr.sense_key; __entry->asc = sshdr.asc; __entry->ascq = sshdr.ascq; } else { __entry->sense_key = 0; __entry->asc = 0; __entry->ascq = 0; } ), TP_printk("host_no=%u channel=%u id=%u lun=%u data_sgl=%u prot_sgl=%u " \ "prot_op=%s driver_tag=%d scheduler_tag=%d cmnd=(%s %s raw=%s) " \ "result=(driver=%s host=%s message=%s status=%s) " "sense=(key=%#x asc=%#x ascq=%#x)", __entry->host_no, __entry->channel, __entry->id, __entry->lun, __entry->data_sglen, __entry->prot_sglen, show_prot_op_name(__entry->prot_op), __entry->driver_tag, __entry->scheduler_tag, show_opcode_name(__entry->opcode), __parse_cdb(__get_dynamic_array(cmnd), __entry->cmd_len), __print_hex(__get_dynamic_array(cmnd), __entry->cmd_len), "DRIVER_OK", show_hostbyte_name(((__entry->result) >> 16) & 0xff), "COMMAND_COMPLETE", show_statusbyte_name(__entry->result & 0xff), __entry->sense_key, __entry->asc, __entry->ascq) ); DEFINE_EVENT(scsi_cmd_done_timeout_template, scsi_dispatch_cmd_done, TP_PROTO(struct scsi_cmnd *cmd), TP_ARGS(cmd)); DEFINE_EVENT(scsi_cmd_done_timeout_template, scsi_dispatch_cmd_timeout, TP_PROTO(struct scsi_cmnd *cmd), TP_ARGS(cmd)); TRACE_EVENT(scsi_eh_wakeup, TP_PROTO(struct Scsi_Host *shost), TP_ARGS(shost), TP_STRUCT__entry( __field( unsigned int, host_no ) ), TP_fast_assign( __entry->host_no = shost->host_no; ), TP_printk("host_no=%u", __entry->host_no) ); #endif /* _TRACE_SCSI_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
6 4 2 4 2 4 2 1 8 8 4 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 // SPDX-License-Identifier: GPL-2.0-or-later /* * Glue Code for assembler optimized version of Camellia * * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> * * Camellia parts based on code by: * Copyright (C) 2006 NTT (Nippon Telegraph and Telephone Corporation) */ #include <linux/unaligned.h> #include <linux/crypto.h> #include <linux/export.h> #include <linux/init.h> #include <linux/module.h> #include <linux/types.h> #include <crypto/algapi.h> #include "camellia.h" #include "ecb_cbc_helpers.h" /* regular block cipher functions */ asmlinkage void __camellia_enc_blk(const void *ctx, u8 *dst, const u8 *src, bool xor); EXPORT_SYMBOL_GPL(__camellia_enc_blk); asmlinkage void camellia_dec_blk(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(camellia_dec_blk); /* 2-way parallel cipher functions */ asmlinkage void __camellia_enc_blk_2way(const void *ctx, u8 *dst, const u8 *src, bool xor); EXPORT_SYMBOL_GPL(__camellia_enc_blk_2way); asmlinkage void camellia_dec_blk_2way(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(camellia_dec_blk_2way); static void camellia_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { camellia_enc_blk(crypto_tfm_ctx(tfm), dst, src); } static void camellia_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { camellia_dec_blk(crypto_tfm_ctx(tfm), dst, src); } /* camellia sboxes */ __visible const u64 camellia_sp10011110[256] = { 0x7000007070707000ULL, 0x8200008282828200ULL, 0x2c00002c2c2c2c00ULL, 0xec0000ecececec00ULL, 0xb30000b3b3b3b300ULL, 0x2700002727272700ULL, 0xc00000c0c0c0c000ULL, 0xe50000e5e5e5e500ULL, 0xe40000e4e4e4e400ULL, 0x8500008585858500ULL, 0x5700005757575700ULL, 0x3500003535353500ULL, 0xea0000eaeaeaea00ULL, 0x0c00000c0c0c0c00ULL, 0xae0000aeaeaeae00ULL, 0x4100004141414100ULL, 0x2300002323232300ULL, 0xef0000efefefef00ULL, 0x6b00006b6b6b6b00ULL, 0x9300009393939300ULL, 0x4500004545454500ULL, 0x1900001919191900ULL, 0xa50000a5a5a5a500ULL, 0x2100002121212100ULL, 0xed0000edededed00ULL, 0x0e00000e0e0e0e00ULL, 0x4f00004f4f4f4f00ULL, 0x4e00004e4e4e4e00ULL, 0x1d00001d1d1d1d00ULL, 0x6500006565656500ULL, 0x9200009292929200ULL, 0xbd0000bdbdbdbd00ULL, 0x8600008686868600ULL, 0xb80000b8b8b8b800ULL, 0xaf0000afafafaf00ULL, 0x8f00008f8f8f8f00ULL, 0x7c00007c7c7c7c00ULL, 0xeb0000ebebebeb00ULL, 0x1f00001f1f1f1f00ULL, 0xce0000cececece00ULL, 0x3e00003e3e3e3e00ULL, 0x3000003030303000ULL, 0xdc0000dcdcdcdc00ULL, 0x5f00005f5f5f5f00ULL, 0x5e00005e5e5e5e00ULL, 0xc50000c5c5c5c500ULL, 0x0b00000b0b0b0b00ULL, 0x1a00001a1a1a1a00ULL, 0xa60000a6a6a6a600ULL, 0xe10000e1e1e1e100ULL, 0x3900003939393900ULL, 0xca0000cacacaca00ULL, 0xd50000d5d5d5d500ULL, 0x4700004747474700ULL, 0x5d00005d5d5d5d00ULL, 0x3d00003d3d3d3d00ULL, 0xd90000d9d9d9d900ULL, 0x0100000101010100ULL, 0x5a00005a5a5a5a00ULL, 0xd60000d6d6d6d600ULL, 0x5100005151515100ULL, 0x5600005656565600ULL, 0x6c00006c6c6c6c00ULL, 0x4d00004d4d4d4d00ULL, 0x8b00008b8b8b8b00ULL, 0x0d00000d0d0d0d00ULL, 0x9a00009a9a9a9a00ULL, 0x6600006666666600ULL, 0xfb0000fbfbfbfb00ULL, 0xcc0000cccccccc00ULL, 0xb00000b0b0b0b000ULL, 0x2d00002d2d2d2d00ULL, 0x7400007474747400ULL, 0x1200001212121200ULL, 0x2b00002b2b2b2b00ULL, 0x2000002020202000ULL, 0xf00000f0f0f0f000ULL, 0xb10000b1b1b1b100ULL, 0x8400008484848400ULL, 0x9900009999999900ULL, 0xdf0000dfdfdfdf00ULL, 0x4c00004c4c4c4c00ULL, 0xcb0000cbcbcbcb00ULL, 0xc20000c2c2c2c200ULL, 0x3400003434343400ULL, 0x7e00007e7e7e7e00ULL, 0x7600007676767600ULL, 0x0500000505050500ULL, 0x6d00006d6d6d6d00ULL, 0xb70000b7b7b7b700ULL, 0xa90000a9a9a9a900ULL, 0x3100003131313100ULL, 0xd10000d1d1d1d100ULL, 0x1700001717171700ULL, 0x0400000404040400ULL, 0xd70000d7d7d7d700ULL, 0x1400001414141400ULL, 0x5800005858585800ULL, 0x3a00003a3a3a3a00ULL, 0x6100006161616100ULL, 0xde0000dededede00ULL, 0x1b00001b1b1b1b00ULL, 0x1100001111111100ULL, 0x1c00001c1c1c1c00ULL, 0x3200003232323200ULL, 0x0f00000f0f0f0f00ULL, 0x9c00009c9c9c9c00ULL, 0x1600001616161600ULL, 0x5300005353535300ULL, 0x1800001818181800ULL, 0xf20000f2f2f2f200ULL, 0x2200002222222200ULL, 0xfe0000fefefefe00ULL, 0x4400004444444400ULL, 0xcf0000cfcfcfcf00ULL, 0xb20000b2b2b2b200ULL, 0xc30000c3c3c3c300ULL, 0xb50000b5b5b5b500ULL, 0x7a00007a7a7a7a00ULL, 0x9100009191919100ULL, 0x2400002424242400ULL, 0x0800000808080800ULL, 0xe80000e8e8e8e800ULL, 0xa80000a8a8a8a800ULL, 0x6000006060606000ULL, 0xfc0000fcfcfcfc00ULL, 0x6900006969696900ULL, 0x5000005050505000ULL, 0xaa0000aaaaaaaa00ULL, 0xd00000d0d0d0d000ULL, 0xa00000a0a0a0a000ULL, 0x7d00007d7d7d7d00ULL, 0xa10000a1a1a1a100ULL, 0x8900008989898900ULL, 0x6200006262626200ULL, 0x9700009797979700ULL, 0x5400005454545400ULL, 0x5b00005b5b5b5b00ULL, 0x1e00001e1e1e1e00ULL, 0x9500009595959500ULL, 0xe00000e0e0e0e000ULL, 0xff0000ffffffff00ULL, 0x6400006464646400ULL, 0xd20000d2d2d2d200ULL, 0x1000001010101000ULL, 0xc40000c4c4c4c400ULL, 0x0000000000000000ULL, 0x4800004848484800ULL, 0xa30000a3a3a3a300ULL, 0xf70000f7f7f7f700ULL, 0x7500007575757500ULL, 0xdb0000dbdbdbdb00ULL, 0x8a00008a8a8a8a00ULL, 0x0300000303030300ULL, 0xe60000e6e6e6e600ULL, 0xda0000dadadada00ULL, 0x0900000909090900ULL, 0x3f00003f3f3f3f00ULL, 0xdd0000dddddddd00ULL, 0x9400009494949400ULL, 0x8700008787878700ULL, 0x5c00005c5c5c5c00ULL, 0x8300008383838300ULL, 0x0200000202020200ULL, 0xcd0000cdcdcdcd00ULL, 0x4a00004a4a4a4a00ULL, 0x9000009090909000ULL, 0x3300003333333300ULL, 0x7300007373737300ULL, 0x6700006767676700ULL, 0xf60000f6f6f6f600ULL, 0xf30000f3f3f3f300ULL, 0x9d00009d9d9d9d00ULL, 0x7f00007f7f7f7f00ULL, 0xbf0000bfbfbfbf00ULL, 0xe20000e2e2e2e200ULL, 0x5200005252525200ULL, 0x9b00009b9b9b9b00ULL, 0xd80000d8d8d8d800ULL, 0x2600002626262600ULL, 0xc80000c8c8c8c800ULL, 0x3700003737373700ULL, 0xc60000c6c6c6c600ULL, 0x3b00003b3b3b3b00ULL, 0x8100008181818100ULL, 0x9600009696969600ULL, 0x6f00006f6f6f6f00ULL, 0x4b00004b4b4b4b00ULL, 0x1300001313131300ULL, 0xbe0000bebebebe00ULL, 0x6300006363636300ULL, 0x2e00002e2e2e2e00ULL, 0xe90000e9e9e9e900ULL, 0x7900007979797900ULL, 0xa70000a7a7a7a700ULL, 0x8c00008c8c8c8c00ULL, 0x9f00009f9f9f9f00ULL, 0x6e00006e6e6e6e00ULL, 0xbc0000bcbcbcbc00ULL, 0x8e00008e8e8e8e00ULL, 0x2900002929292900ULL, 0xf50000f5f5f5f500ULL, 0xf90000f9f9f9f900ULL, 0xb60000b6b6b6b600ULL, 0x2f00002f2f2f2f00ULL, 0xfd0000fdfdfdfd00ULL, 0xb40000b4b4b4b400ULL, 0x5900005959595900ULL, 0x7800007878787800ULL, 0x9800009898989800ULL, 0x0600000606060600ULL, 0x6a00006a6a6a6a00ULL, 0xe70000e7e7e7e700ULL, 0x4600004646464600ULL, 0x7100007171717100ULL, 0xba0000babababa00ULL, 0xd40000d4d4d4d400ULL, 0x2500002525252500ULL, 0xab0000abababab00ULL, 0x4200004242424200ULL, 0x8800008888888800ULL, 0xa20000a2a2a2a200ULL, 0x8d00008d8d8d8d00ULL, 0xfa0000fafafafa00ULL, 0x7200007272727200ULL, 0x0700000707070700ULL, 0xb90000b9b9b9b900ULL, 0x5500005555555500ULL, 0xf80000f8f8f8f800ULL, 0xee0000eeeeeeee00ULL, 0xac0000acacacac00ULL, 0x0a00000a0a0a0a00ULL, 0x3600003636363600ULL, 0x4900004949494900ULL, 0x2a00002a2a2a2a00ULL, 0x6800006868686800ULL, 0x3c00003c3c3c3c00ULL, 0x3800003838383800ULL, 0xf10000f1f1f1f100ULL, 0xa40000a4a4a4a400ULL, 0x4000004040404000ULL, 0x2800002828282800ULL, 0xd30000d3d3d3d300ULL, 0x7b00007b7b7b7b00ULL, 0xbb0000bbbbbbbb00ULL, 0xc90000c9c9c9c900ULL, 0x4300004343434300ULL, 0xc10000c1c1c1c100ULL, 0x1500001515151500ULL, 0xe30000e3e3e3e300ULL, 0xad0000adadadad00ULL, 0xf40000f4f4f4f400ULL, 0x7700007777777700ULL, 0xc70000c7c7c7c700ULL, 0x8000008080808000ULL, 0x9e00009e9e9e9e00ULL, }; __visible const u64 camellia_sp22000222[256] = { 0xe0e0000000e0e0e0ULL, 0x0505000000050505ULL, 0x5858000000585858ULL, 0xd9d9000000d9d9d9ULL, 0x6767000000676767ULL, 0x4e4e0000004e4e4eULL, 0x8181000000818181ULL, 0xcbcb000000cbcbcbULL, 0xc9c9000000c9c9c9ULL, 0x0b0b0000000b0b0bULL, 0xaeae000000aeaeaeULL, 0x6a6a0000006a6a6aULL, 0xd5d5000000d5d5d5ULL, 0x1818000000181818ULL, 0x5d5d0000005d5d5dULL, 0x8282000000828282ULL, 0x4646000000464646ULL, 0xdfdf000000dfdfdfULL, 0xd6d6000000d6d6d6ULL, 0x2727000000272727ULL, 0x8a8a0000008a8a8aULL, 0x3232000000323232ULL, 0x4b4b0000004b4b4bULL, 0x4242000000424242ULL, 0xdbdb000000dbdbdbULL, 0x1c1c0000001c1c1cULL, 0x9e9e0000009e9e9eULL, 0x9c9c0000009c9c9cULL, 0x3a3a0000003a3a3aULL, 0xcaca000000cacacaULL, 0x2525000000252525ULL, 0x7b7b0000007b7b7bULL, 0x0d0d0000000d0d0dULL, 0x7171000000717171ULL, 0x5f5f0000005f5f5fULL, 0x1f1f0000001f1f1fULL, 0xf8f8000000f8f8f8ULL, 0xd7d7000000d7d7d7ULL, 0x3e3e0000003e3e3eULL, 0x9d9d0000009d9d9dULL, 0x7c7c0000007c7c7cULL, 0x6060000000606060ULL, 0xb9b9000000b9b9b9ULL, 0xbebe000000bebebeULL, 0xbcbc000000bcbcbcULL, 0x8b8b0000008b8b8bULL, 0x1616000000161616ULL, 0x3434000000343434ULL, 0x4d4d0000004d4d4dULL, 0xc3c3000000c3c3c3ULL, 0x7272000000727272ULL, 0x9595000000959595ULL, 0xabab000000abababULL, 0x8e8e0000008e8e8eULL, 0xbaba000000bababaULL, 0x7a7a0000007a7a7aULL, 0xb3b3000000b3b3b3ULL, 0x0202000000020202ULL, 0xb4b4000000b4b4b4ULL, 0xadad000000adadadULL, 0xa2a2000000a2a2a2ULL, 0xacac000000acacacULL, 0xd8d8000000d8d8d8ULL, 0x9a9a0000009a9a9aULL, 0x1717000000171717ULL, 0x1a1a0000001a1a1aULL, 0x3535000000353535ULL, 0xcccc000000ccccccULL, 0xf7f7000000f7f7f7ULL, 0x9999000000999999ULL, 0x6161000000616161ULL, 0x5a5a0000005a5a5aULL, 0xe8e8000000e8e8e8ULL, 0x2424000000242424ULL, 0x5656000000565656ULL, 0x4040000000404040ULL, 0xe1e1000000e1e1e1ULL, 0x6363000000636363ULL, 0x0909000000090909ULL, 0x3333000000333333ULL, 0xbfbf000000bfbfbfULL, 0x9898000000989898ULL, 0x9797000000979797ULL, 0x8585000000858585ULL, 0x6868000000686868ULL, 0xfcfc000000fcfcfcULL, 0xecec000000ecececULL, 0x0a0a0000000a0a0aULL, 0xdada000000dadadaULL, 0x6f6f0000006f6f6fULL, 0x5353000000535353ULL, 0x6262000000626262ULL, 0xa3a3000000a3a3a3ULL, 0x2e2e0000002e2e2eULL, 0x0808000000080808ULL, 0xafaf000000afafafULL, 0x2828000000282828ULL, 0xb0b0000000b0b0b0ULL, 0x7474000000747474ULL, 0xc2c2000000c2c2c2ULL, 0xbdbd000000bdbdbdULL, 0x3636000000363636ULL, 0x2222000000222222ULL, 0x3838000000383838ULL, 0x6464000000646464ULL, 0x1e1e0000001e1e1eULL, 0x3939000000393939ULL, 0x2c2c0000002c2c2cULL, 0xa6a6000000a6a6a6ULL, 0x3030000000303030ULL, 0xe5e5000000e5e5e5ULL, 0x4444000000444444ULL, 0xfdfd000000fdfdfdULL, 0x8888000000888888ULL, 0x9f9f0000009f9f9fULL, 0x6565000000656565ULL, 0x8787000000878787ULL, 0x6b6b0000006b6b6bULL, 0xf4f4000000f4f4f4ULL, 0x2323000000232323ULL, 0x4848000000484848ULL, 0x1010000000101010ULL, 0xd1d1000000d1d1d1ULL, 0x5151000000515151ULL, 0xc0c0000000c0c0c0ULL, 0xf9f9000000f9f9f9ULL, 0xd2d2000000d2d2d2ULL, 0xa0a0000000a0a0a0ULL, 0x5555000000555555ULL, 0xa1a1000000a1a1a1ULL, 0x4141000000414141ULL, 0xfafa000000fafafaULL, 0x4343000000434343ULL, 0x1313000000131313ULL, 0xc4c4000000c4c4c4ULL, 0x2f2f0000002f2f2fULL, 0xa8a8000000a8a8a8ULL, 0xb6b6000000b6b6b6ULL, 0x3c3c0000003c3c3cULL, 0x2b2b0000002b2b2bULL, 0xc1c1000000c1c1c1ULL, 0xffff000000ffffffULL, 0xc8c8000000c8c8c8ULL, 0xa5a5000000a5a5a5ULL, 0x2020000000202020ULL, 0x8989000000898989ULL, 0x0000000000000000ULL, 0x9090000000909090ULL, 0x4747000000474747ULL, 0xefef000000efefefULL, 0xeaea000000eaeaeaULL, 0xb7b7000000b7b7b7ULL, 0x1515000000151515ULL, 0x0606000000060606ULL, 0xcdcd000000cdcdcdULL, 0xb5b5000000b5b5b5ULL, 0x1212000000121212ULL, 0x7e7e0000007e7e7eULL, 0xbbbb000000bbbbbbULL, 0x2929000000292929ULL, 0x0f0f0000000f0f0fULL, 0xb8b8000000b8b8b8ULL, 0x0707000000070707ULL, 0x0404000000040404ULL, 0x9b9b0000009b9b9bULL, 0x9494000000949494ULL, 0x2121000000212121ULL, 0x6666000000666666ULL, 0xe6e6000000e6e6e6ULL, 0xcece000000cececeULL, 0xeded000000edededULL, 0xe7e7000000e7e7e7ULL, 0x3b3b0000003b3b3bULL, 0xfefe000000fefefeULL, 0x7f7f0000007f7f7fULL, 0xc5c5000000c5c5c5ULL, 0xa4a4000000a4a4a4ULL, 0x3737000000373737ULL, 0xb1b1000000b1b1b1ULL, 0x4c4c0000004c4c4cULL, 0x9191000000919191ULL, 0x6e6e0000006e6e6eULL, 0x8d8d0000008d8d8dULL, 0x7676000000767676ULL, 0x0303000000030303ULL, 0x2d2d0000002d2d2dULL, 0xdede000000dededeULL, 0x9696000000969696ULL, 0x2626000000262626ULL, 0x7d7d0000007d7d7dULL, 0xc6c6000000c6c6c6ULL, 0x5c5c0000005c5c5cULL, 0xd3d3000000d3d3d3ULL, 0xf2f2000000f2f2f2ULL, 0x4f4f0000004f4f4fULL, 0x1919000000191919ULL, 0x3f3f0000003f3f3fULL, 0xdcdc000000dcdcdcULL, 0x7979000000797979ULL, 0x1d1d0000001d1d1dULL, 0x5252000000525252ULL, 0xebeb000000ebebebULL, 0xf3f3000000f3f3f3ULL, 0x6d6d0000006d6d6dULL, 0x5e5e0000005e5e5eULL, 0xfbfb000000fbfbfbULL, 0x6969000000696969ULL, 0xb2b2000000b2b2b2ULL, 0xf0f0000000f0f0f0ULL, 0x3131000000313131ULL, 0x0c0c0000000c0c0cULL, 0xd4d4000000d4d4d4ULL, 0xcfcf000000cfcfcfULL, 0x8c8c0000008c8c8cULL, 0xe2e2000000e2e2e2ULL, 0x7575000000757575ULL, 0xa9a9000000a9a9a9ULL, 0x4a4a0000004a4a4aULL, 0x5757000000575757ULL, 0x8484000000848484ULL, 0x1111000000111111ULL, 0x4545000000454545ULL, 0x1b1b0000001b1b1bULL, 0xf5f5000000f5f5f5ULL, 0xe4e4000000e4e4e4ULL, 0x0e0e0000000e0e0eULL, 0x7373000000737373ULL, 0xaaaa000000aaaaaaULL, 0xf1f1000000f1f1f1ULL, 0xdddd000000ddddddULL, 0x5959000000595959ULL, 0x1414000000141414ULL, 0x6c6c0000006c6c6cULL, 0x9292000000929292ULL, 0x5454000000545454ULL, 0xd0d0000000d0d0d0ULL, 0x7878000000787878ULL, 0x7070000000707070ULL, 0xe3e3000000e3e3e3ULL, 0x4949000000494949ULL, 0x8080000000808080ULL, 0x5050000000505050ULL, 0xa7a7000000a7a7a7ULL, 0xf6f6000000f6f6f6ULL, 0x7777000000777777ULL, 0x9393000000939393ULL, 0x8686000000868686ULL, 0x8383000000838383ULL, 0x2a2a0000002a2a2aULL, 0xc7c7000000c7c7c7ULL, 0x5b5b0000005b5b5bULL, 0xe9e9000000e9e9e9ULL, 0xeeee000000eeeeeeULL, 0x8f8f0000008f8f8fULL, 0x0101000000010101ULL, 0x3d3d0000003d3d3dULL, }; __visible const u64 camellia_sp03303033[256] = { 0x0038380038003838ULL, 0x0041410041004141ULL, 0x0016160016001616ULL, 0x0076760076007676ULL, 0x00d9d900d900d9d9ULL, 0x0093930093009393ULL, 0x0060600060006060ULL, 0x00f2f200f200f2f2ULL, 0x0072720072007272ULL, 0x00c2c200c200c2c2ULL, 0x00abab00ab00ababULL, 0x009a9a009a009a9aULL, 0x0075750075007575ULL, 0x0006060006000606ULL, 0x0057570057005757ULL, 0x00a0a000a000a0a0ULL, 0x0091910091009191ULL, 0x00f7f700f700f7f7ULL, 0x00b5b500b500b5b5ULL, 0x00c9c900c900c9c9ULL, 0x00a2a200a200a2a2ULL, 0x008c8c008c008c8cULL, 0x00d2d200d200d2d2ULL, 0x0090900090009090ULL, 0x00f6f600f600f6f6ULL, 0x0007070007000707ULL, 0x00a7a700a700a7a7ULL, 0x0027270027002727ULL, 0x008e8e008e008e8eULL, 0x00b2b200b200b2b2ULL, 0x0049490049004949ULL, 0x00dede00de00dedeULL, 0x0043430043004343ULL, 0x005c5c005c005c5cULL, 0x00d7d700d700d7d7ULL, 0x00c7c700c700c7c7ULL, 0x003e3e003e003e3eULL, 0x00f5f500f500f5f5ULL, 0x008f8f008f008f8fULL, 0x0067670067006767ULL, 0x001f1f001f001f1fULL, 0x0018180018001818ULL, 0x006e6e006e006e6eULL, 0x00afaf00af00afafULL, 0x002f2f002f002f2fULL, 0x00e2e200e200e2e2ULL, 0x0085850085008585ULL, 0x000d0d000d000d0dULL, 0x0053530053005353ULL, 0x00f0f000f000f0f0ULL, 0x009c9c009c009c9cULL, 0x0065650065006565ULL, 0x00eaea00ea00eaeaULL, 0x00a3a300a300a3a3ULL, 0x00aeae00ae00aeaeULL, 0x009e9e009e009e9eULL, 0x00ecec00ec00ececULL, 0x0080800080008080ULL, 0x002d2d002d002d2dULL, 0x006b6b006b006b6bULL, 0x00a8a800a800a8a8ULL, 0x002b2b002b002b2bULL, 0x0036360036003636ULL, 0x00a6a600a600a6a6ULL, 0x00c5c500c500c5c5ULL, 0x0086860086008686ULL, 0x004d4d004d004d4dULL, 0x0033330033003333ULL, 0x00fdfd00fd00fdfdULL, 0x0066660066006666ULL, 0x0058580058005858ULL, 0x0096960096009696ULL, 0x003a3a003a003a3aULL, 0x0009090009000909ULL, 0x0095950095009595ULL, 0x0010100010001010ULL, 0x0078780078007878ULL, 0x00d8d800d800d8d8ULL, 0x0042420042004242ULL, 0x00cccc00cc00ccccULL, 0x00efef00ef00efefULL, 0x0026260026002626ULL, 0x00e5e500e500e5e5ULL, 0x0061610061006161ULL, 0x001a1a001a001a1aULL, 0x003f3f003f003f3fULL, 0x003b3b003b003b3bULL, 0x0082820082008282ULL, 0x00b6b600b600b6b6ULL, 0x00dbdb00db00dbdbULL, 0x00d4d400d400d4d4ULL, 0x0098980098009898ULL, 0x00e8e800e800e8e8ULL, 0x008b8b008b008b8bULL, 0x0002020002000202ULL, 0x00ebeb00eb00ebebULL, 0x000a0a000a000a0aULL, 0x002c2c002c002c2cULL, 0x001d1d001d001d1dULL, 0x00b0b000b000b0b0ULL, 0x006f6f006f006f6fULL, 0x008d8d008d008d8dULL, 0x0088880088008888ULL, 0x000e0e000e000e0eULL, 0x0019190019001919ULL, 0x0087870087008787ULL, 0x004e4e004e004e4eULL, 0x000b0b000b000b0bULL, 0x00a9a900a900a9a9ULL, 0x000c0c000c000c0cULL, 0x0079790079007979ULL, 0x0011110011001111ULL, 0x007f7f007f007f7fULL, 0x0022220022002222ULL, 0x00e7e700e700e7e7ULL, 0x0059590059005959ULL, 0x00e1e100e100e1e1ULL, 0x00dada00da00dadaULL, 0x003d3d003d003d3dULL, 0x00c8c800c800c8c8ULL, 0x0012120012001212ULL, 0x0004040004000404ULL, 0x0074740074007474ULL, 0x0054540054005454ULL, 0x0030300030003030ULL, 0x007e7e007e007e7eULL, 0x00b4b400b400b4b4ULL, 0x0028280028002828ULL, 0x0055550055005555ULL, 0x0068680068006868ULL, 0x0050500050005050ULL, 0x00bebe00be00bebeULL, 0x00d0d000d000d0d0ULL, 0x00c4c400c400c4c4ULL, 0x0031310031003131ULL, 0x00cbcb00cb00cbcbULL, 0x002a2a002a002a2aULL, 0x00adad00ad00adadULL, 0x000f0f000f000f0fULL, 0x00caca00ca00cacaULL, 0x0070700070007070ULL, 0x00ffff00ff00ffffULL, 0x0032320032003232ULL, 0x0069690069006969ULL,